+
+ 

+
+
+
+
diff --git a/frame-interpolation-pytorch/export.py b/frame-interpolation-pytorch/export.py
new file mode 100644
index 0000000000000000000000000000000000000000..1414f768348f0412afa0f94a57ca5e68693c89e3
--- /dev/null
+++ b/frame-interpolation-pytorch/export.py
@@ -0,0 +1,155 @@
+import warnings
+
+import numpy as np
+import tensorflow as tf
+import torch
+
+from interpolator import Interpolator
+
+
+def translate_state_dict(var_dict, state_dict):
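+ # Relies on the TF variable dict and the PyTorch state_dict enumerating their
+ # parameters in the same order; the shape assert below is the sanity check for
+ # that assumption.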
+ for name, (prev_name, weight) in zip(state_dict, var_dict.items()):
+ print('Mapping', prev_name, '->', name)
+ weight = torch.from_numpy(weight)
+ if 'kernel' in prev_name:
+ # Transpose the conv2d kernel weights, since TF uses (H, W, C, K) and PyTorch uses (K, C, H, W)
+ weight = weight.permute(3, 2, 0, 1)
+
+ assert state_dict[name].shape == weight.shape, f'Shape mismatch {state_dict[name].shape} != {weight.shape}'
+
+ state_dict[name] = weight
+
+
+def import_state_dict(interpolator: Interpolator, saved_model):
+ variables = saved_model.keras_api.variables
+
+ extract_dict = interpolator.extract.state_dict()
+ flow_dict = interpolator.predict_flow.state_dict()
+ fuse_dict = interpolator.fuse.state_dict()
+
+ extract_vars = {}
+ _flow_vars = {}
+ _fuse_vars = {}
+
+ for var in variables:
+ name = var.name
+ if name.startswith('feat_net'):
+ extract_vars[name[9:]] = var.numpy()
+ elif name.startswith('predict_flow'):
+ _flow_vars[name[13:]] = var.numpy()
+ elif name.startswith('fusion'):
+ _fuse_vars[name[7:]] = var.numpy()
+
+ # reverse order of modules to allow jit export
+ # TODO: improve this hack
+ flow_vars = dict(sorted(_flow_vars.items(), key=lambda x: x[0].split('/')[0], reverse=True))
+ fuse_vars = dict(sorted(_fuse_vars.items(), key=lambda x: int((x[0].split('/')[0].split('_')[1:] or [0])[0]) // 3, reverse=True))
+
+ assert len(extract_vars) == len(extract_dict), f'{len(extract_vars)} != {len(extract_dict)}'
+ assert len(flow_vars) == len(flow_dict), f'{len(flow_vars)} != {len(flow_dict)}'
+ assert len(fuse_vars) == len(fuse_dict), f'{len(fuse_vars)} != {len(fuse_dict)}'
+
+ for state_dict, var_dict in ((extract_dict, extract_vars), (flow_dict, flow_vars), (fuse_dict, fuse_vars)):
+ translate_state_dict(var_dict, state_dict)
+
+ interpolator.extract.load_state_dict(extract_dict)
+ interpolator.predict_flow.load_state_dict(flow_dict)
+ interpolator.fuse.load_state_dict(fuse_dict)
+
+
+def verify_debug_outputs(pt_outputs, tf_outputs):
+ max_error = 0
+ for name, predicted in pt_outputs.items():
+ if name == 'image':
+ continue
+ pred_frfp = [f.permute(0, 2, 3, 1).detach().cpu().numpy() for f in predicted]
+ true_frfp = [f.numpy() for f in tf_outputs[name]]
+
+ for i, (pred, true) in enumerate(zip(pred_frfp, true_frfp)):
+ assert pred.shape == true.shape, f'{name} {i} shape mismatch {pred.shape} != {true.shape}'
+ error = np.max(np.abs(pred - true))
+ max_error = max(max_error, error)
+ assert error < 1, f'{name} {i} max error: {error}'
+ print('Max intermediate error:', max_error)
+
+
+def test_model(interpolator, model, half=False, gpu=False):
+ torch.manual_seed(0)
+ time = torch.full((1, 1), .5)
+ x0 = torch.rand(1, 3, 256, 256)
+ x1 = torch.rand(1, 3, 256, 256)
+
+ x0_ = tf.convert_to_tensor(x0.permute(0, 2, 3, 1).numpy(), dtype=tf.float32)
+ x1_ = tf.convert_to_tensor(x1.permute(0, 2, 3, 1).numpy(), dtype=tf.float32)
+ time_ = tf.convert_to_tensor(time.numpy(), dtype=tf.float32)
+ tf_outputs = model({'x0': x0_, 'x1': x1_, 'time': time_}, training=False)
+
+ if half:
+ x0 = x0.half()
+ x1 = x1.half()
+ time = time.half()
+
+ if gpu and torch.cuda.is_available():
+ x0 = x0.cuda()
+ x1 = x1.cuda()
+ time = time.cuda()
+
+ with torch.no_grad():
+ pt_outputs = interpolator.debug_forward(x0, x1, time)
+
+ verify_debug_outputs(pt_outputs, tf_outputs)
+
+ with torch.no_grad():
+ prediction = interpolator(x0, x1, time)
+ output_color = prediction.permute(0, 2, 3, 1).detach().cpu().numpy()
+ true_color = tf_outputs['image'].numpy()
+ error = np.abs(output_color - true_color).max()
+
+ print('Color max error:', error)
+
+
+def main(model_path, save_path, export_to_torchscript=True, use_gpu=False, fp16=True, skiptest=False):
+ print(f'Exporting model to FP{["32", "16"][fp16]} {["state_dict", "torchscript"][export_to_torchscript]} '
+ f'using {"CG"[use_gpu]}PU')
+ model = tf.compat.v2.saved_model.load(model_path)
+ interpolator = Interpolator()
+ interpolator.eval()
+ import_state_dict(interpolator, model)
+
+ if use_gpu and torch.cuda.is_available():
+ interpolator = interpolator.cuda()
+ else:
+ use_gpu = False
+
+ if fp16:
+ interpolator = interpolator.half()
+ if export_to_torchscript:
+ interpolator = torch.jit.script(interpolator)
+ if export_to_torchscript:
+ interpolator.save(save_path)
+ else:
+ torch.save(interpolator.state_dict(), save_path)
+
+ if not skiptest:
+ if not use_gpu and fp16:
+ warnings.warn('Testing an FP16 model on CPU is not supported; casting it back to FP32 for the test')
+ interpolator = interpolator.float()
+ fp16 = False
+ test_model(interpolator, model, fp16, use_gpu)
+
+
+if __name__ == '__main__':
+ import argparse
+
+ parser = argparse.ArgumentParser(description='Export the frame interpolation TF SavedModel to PyTorch (TorchScript or state dict)')
+
+ parser.add_argument('model_path', type=str, help='Path to the TF SavedModel')
+ parser.add_argument('save_path', type=str, help='Path to save the exported model (TorchScript module or state dict)')
+ parser.add_argument('--statedict', action='store_true', help='Export to state dict instead of TorchScript')
+ parser.add_argument('--fp32', action='store_true', help='Save at full precision')
+ parser.add_argument('--skiptest', action='store_true', help='Skip verifying the exported model against the TF model')
+ parser.add_argument('--gpu', action='store_true', help='Use GPU')
+
+ args = parser.parse_args()
+
+ main(args.model_path, args.save_path, not args.statedict, args.gpu, not args.fp32, args.skiptest)
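+
+# Example invocations (the SavedModel path is a placeholder for wherever the original
+# TF FILM SavedModel was downloaded; the output names match the checkpoints shipped
+# in this folder):
+#
+#   python export.py /path/to/film_saved_model film_net_fp16.pt --gpu
+#   python export.py /path/to/film_saved_model film_net_fp32.pt --fp32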
diff --git a/frame-interpolation-pytorch/feature_extractor.py b/frame-interpolation-pytorch/feature_extractor.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b41975291c64173e4d98619d0ea5f2ca67f3240
--- /dev/null
+++ b/frame-interpolation-pytorch/feature_extractor.py
@@ -0,0 +1,156 @@
+"""PyTorch layer for extracting image features for the film_net interpolator.
+
+The feature extractor implemented here converts an image pyramid into a pyramid
+of deep features. The feature pyramid serves a similar purpose as U-Net
+architecture's encoder, but we use a special cascaded architecture described in
+Multi-view Image Fusion [1].
+
+For comprehensiveness, below is a short description of the idea. While the
+description is a bit involved, the cascaded feature pyramid can be used just
+like any image feature pyramid.
+
+Why cascaded architecture?
+==========================
+To understand the concept it is worth reviewing a traditional feature pyramid
+first: *A traditional feature pyramid* as in U-net or in many optical flow
+networks is built by alternating between convolutions and pooling, starting
+from the input image.
+
+It is well known that early features of such architecture correspond to low
+level concepts such as edges in the image whereas later layers extract
+semantically higher level concepts such as object classes etc. In other words,
+the meaning of the filters in each resolution level is different. For problems
+such as semantic segmentation and many others this is a desirable property.
+
+However, the asymmetric features preclude sharing weights across resolution
+levels in the feature extractor itself and in any subsequent neural networks
+that follow. This can be a downside, since optical flow prediction, for
+instance is symmetric across resolution levels. The cascaded feature
+architecture addresses this shortcoming.
+
+How is it built?
+================
+The *cascaded* feature pyramid contains feature vectors that have constant
+length and meaning on each resolution level, except a few of the finest ones. The
+advantage of this is that the subsequent optical flow layer can learn
+synergistically from many resolutions. This means that coarse-level prediction can
+benefit from finer resolution training examples, which can be useful with
+moderately sized datasets to avoid overfitting.
+
+The cascaded feature pyramid is built by extracting shallower subtree pyramids,
+each one of them similar to the traditional architecture. Each subtree
+pyramid S_i is extracted starting from each resolution level:
+
+image resolution 0 -> S_0
+image resolution 1 -> S_1
+image resolution 2 -> S_2
+...
+
+If we denote the features at level j of subtree i as S_i_j, the cascaded pyramid
+is constructed by concatenating features as follows (assuming subtree depth=3):
+
+lvl
+feat_0 = concat( S_0_0 )
+feat_1 = concat( S_1_0 S_0_1 )
+feat_2 = concat( S_2_0 S_1_1 S_0_2 )
+feat_3 = concat( S_3_0 S_2_1 S_1_2 )
+feat_4 = concat( S_4_0 S_3_1 S_2_2 )
+feat_5 = concat( S_5_0 S_4_1 S_3_2 )
+ ....
+
+In the above, all levels except feat_0 and feat_1 have the same number of features
+with similar semantic meaning. This enables training a single optical flow
+predictor module shared by levels 2, 3, 4, 5 and so on. For more details and
+evaluation see [1].
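+
+As a worked example: with k = 64 filters at the finest sub-tree level and a subtree
+depth of 3 as drawn above, feat_2 and every coarser level concatenate
+k + 2k + 4k = 448 channels. With the default sub_levels = 4 used in this port, the
+shared size becomes k + 2k + 4k + 8k = 960 channels.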
+
+[1] Multi-view Image Fusion, Trinidad et al. 2019
+"""
+from typing import List
+
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from util import Conv2d
+
+
+class SubTreeExtractor(nn.Module):
+ """Extracts a hierarchical set of features from an image.
+
+ This is a conventional hierarchical image feature extractor that extracts
+ [k, 2*k, 4*k, ...] filters for the image pyramid, where k = channels and the
+ number of levels is n_layers. Each level is followed by average pooling.
+ """
+
+ def __init__(self, in_channels=3, channels=64, n_layers=4):
+ super().__init__()
+ convs = []
+ for i in range(n_layers):
+ convs.append(nn.Sequential(
+ Conv2d(in_channels, (channels << i), 3),
+ Conv2d((channels << i), (channels << i), 3)
+ ))
+ in_channels = channels << i
+ self.convs = nn.ModuleList(convs)
+
+ def forward(self, image: torch.Tensor, n: int) -> List[torch.Tensor]:
+ """Extracts a pyramid of features from the image.
+
+ Args:
+ image: torch.Tensor with shape B x C x H x W.
+ n: number of pyramid levels to extract. This can be less than or equal to
+ the n_layers given in __init__.
+ Returns:
+ The pyramid of features, starting from the finest level. Each element
+ contains the output after the last convolution on the corresponding
+ pyramid level.
+ """
+ head = image
+ pyramid = []
+ for i, layer in enumerate(self.convs):
+ head = layer(head)
+ pyramid.append(head)
+ if i < n - 1:
+ head = F.avg_pool2d(head, kernel_size=2, stride=2)
+ return pyramid
+
+
+class FeatureExtractor(nn.Module):
+ """Extracts features from an image pyramid using a cascaded architecture.
+ """
+
+ def __init__(self, in_channels=3, channels=64, sub_levels=4):
+ super().__init__()
+ self.extract_sublevels = SubTreeExtractor(in_channels, channels, sub_levels)
+ self.sub_levels = sub_levels
+
+ def forward(self, image_pyramid: List[torch.Tensor]) -> List[torch.Tensor]:
+ """Extracts a cascaded feature pyramid.
+
+ Args:
+ image_pyramid: Image pyramid as a list, starting from the finest level.
+ Returns:
+ A pyramid of cascaded features.
+ """
+ sub_pyramids: List[List[torch.Tensor]] = []
+ for i in range(len(image_pyramid)):
+ # At each level of the image pyramid, creates a sub_pyramid of features
+ # with 'sub_levels' pyramid levels, re-using the same SubTreeExtractor.
+ # We use the same instance since we want to share the weights.
+ #
+ # However, we cap the depth of the sub_pyramid so we don't create features
+ # that are beyond the coarsest level of the cascaded feature pyramid we
+ # want to generate.
+ capped_sub_levels = min(len(image_pyramid) - i, self.sub_levels)
+ sub_pyramids.append(self.extract_sublevels(image_pyramid[i], capped_sub_levels))
+ # Below we generate the cascades of features on each level of the feature
+ # pyramid. Assuming sub_levels=3, the layout of the features will be
+ # as shown in the example on file documentation above.
+ feature_pyramid: List[torch.Tensor] = []
+ for i in range(len(image_pyramid)):
+ features = sub_pyramids[i][0]
+ for j in range(1, self.sub_levels):
+ if j <= i:
+ features = torch.cat([features, sub_pyramids[i - j][j]], dim=1)
+ feature_pyramid.append(features)
+ return feature_pyramid
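+
+
+# Rough shape sketch under the defaults of this file (channels=64, sub_levels=4),
+# using the image-pyramid helper from util.py in this port; exact spatial sizes
+# depend on the input resolution:
+#
+#   import util
+#   extractor = FeatureExtractor(3, 64, 4)
+#   pyramid = util.build_image_pyramid(torch.rand(1, 3, 256, 256), pyramid_levels=7)
+#   feats = extractor(pyramid)
+#   # channel counts: feats[0]: 64, feats[1]: 192, feats[2]: 448, feats[3:]: 960 each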
diff --git a/frame-interpolation-pytorch/film_net_fp16.pt b/frame-interpolation-pytorch/film_net_fp16.pt
new file mode 100644
index 0000000000000000000000000000000000000000..e2695211566846c6137de304743e5e4b5dd56739
--- /dev/null
+++ b/frame-interpolation-pytorch/film_net_fp16.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5d48a9c8f1032f046d7dfcbed40299d51e615b4bd8bbfbb36a83c9a49c76aca9
+size 69048401
diff --git a/frame-interpolation-pytorch/film_net_fp32.pt b/frame-interpolation-pytorch/film_net_fp32.pt
new file mode 100644
index 0000000000000000000000000000000000000000..2691162477f27fe5e3cd4c69890fa2c28be27713
--- /dev/null
+++ b/frame-interpolation-pytorch/film_net_fp32.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f810cada26d0c288e50a27eac43af74446eb84b857ccbc77a22bb006f4d27240
+size 137922129
diff --git a/frame-interpolation-pytorch/fusion.py b/frame-interpolation-pytorch/fusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..ca79661fdea7435a118e783cb74436f477faab2c
--- /dev/null
+++ b/frame-interpolation-pytorch/fusion.py
@@ -0,0 +1,120 @@
+"""The final fusion stage for the film_net frame interpolator.
+
+The inputs to this module are the warped input images, image features and
+flow fields, all aligned to the target frame (often midway point between the
+two original inputs). The output is the final image. FILM has no explicit
+occlusion handling -- instead, using the above information, this module
+automatically decides how best to blend the inputs together to produce content
+in areas where the pixels can only be borrowed from one of the inputs.
+
+Similarly, this module also decides how much to blend in each input in the case
+of a fractional timestep that is not at the halfway point. For example, if the two
+input images are at t=0 and t=1, and we were to synthesize a frame at t=0.1,
+it often makes most sense to favor the first input. However, this is not
+always the case -- in particular in occluded pixels.
+
+The architecture of the Fusion module follows the decoder side of the U-net [1]
+architecture, i.e. each pyramid level consists of concatenation with the upsampled
+coarser level output, followed by two 3x3 convolutions.
+
+The upsampling is implemented as a 'resize convolution', i.e. nearest-neighbor
+upsampling followed by a 2x2 convolution, as explained in [2]. The transposed
+convolutions used by the classic U-net decoder tend to create checkerboard artifacts.
+
+[1] Ronneberger et al. U-Net: Convolutional Networks for Biomedical Image
+ Segmentation, 2015, https://arxiv.org/pdf/1505.04597.pdf
+[2] https://distill.pub/2016/deconv-checkerboard/
+"""
+from typing import List
+
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from util import Conv2d
+
+_NUMBER_OF_COLOR_CHANNELS = 3
+
+
+def get_channels_at_level(level, filters):
+ n_images = 2
+ channels = _NUMBER_OF_COLOR_CHANNELS
+ flows = 2
+
+ return (sum(filters << i for i in range(level)) + channels + flows) * n_images
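+
+# Worked example with the defaults of this port (filters=64):
+#   get_channels_at_level(4, 64) = (64 + 128 + 256 + 512 + 3 + 2) * 2 = 1930,
+# i.e. the cascaded features, RGB image and flow of both warped inputs concatenated
+# together at the coarsest aligned-pyramid level fed into the decoder.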
+
+
+class Fusion(nn.Module):
+ """The decoder."""
+
+ def __init__(self, n_layers=4, specialized_layers=3, filters=64):
+ """
+ Args:
+ m: specialized levels
+ """
+ super().__init__()
+
+ # The final convolution that outputs RGB:
+ self.output_conv = nn.Conv2d(filters, 3, kernel_size=1)
+
+ # Each item 'convs[i]' will contain the list of convolutions to be applied
+ # for pyramid level 'i'.
+ self.convs = nn.ModuleList()
+
+ # Create the convolutions. Roughly following the feature extractor, we
+ # double the number of filters when the resolution halves, but only up to
+ # the specialized_levels, after which we use the same number of filters on
+ # all levels.
+ #
+ # We create the convs in fine-to-coarse order, so that the array index
+ # for the convs will correspond to our normal indexing (0=finest level).
+ # in_channels: tuple = (128, 202, 256, 522, 512, 1162, 1930, 2442)
+
+ in_channels = get_channels_at_level(n_layers, filters)
+ increase = 0
+ for i in range(n_layers)[::-1]:
+ num_filters = (filters << i) if i < specialized_layers else (filters << specialized_layers)
+ convs = nn.ModuleList([
+ Conv2d(in_channels, num_filters, size=2, activation=None),
+ Conv2d(in_channels + (increase or num_filters), num_filters, size=3),
+ Conv2d(num_filters, num_filters, size=3)]
+ )
+ self.convs.append(convs)
+ in_channels = num_filters
+ increase = get_channels_at_level(i, filters) - num_filters // 2
+
+ def forward(self, pyramid: List[torch.Tensor]) -> torch.Tensor:
+ """Runs the fusion module.
+
+ Args:
+ pyramid: The input feature pyramid as a list of tensors. Each tensor is
+ in (B x C x H x W) format, with the finest level tensor first.
+
+ Returns:
+ A batch of RGB images.
+ Note:
+ The pyramid is expected to have one more level than the number of
+ convolution blocks created in the constructor; the coarsest level is only
+ passed through for concatenation and receives no extra convolutions.
+ """
+
+ # As a slight difference to a conventional decoder (e.g. U-net), we don't
+ # apply any extra convolutions to the coarsest level, but just pass it
+ # to finer levels for concatenation. This choice has not been thoroughly
+ # evaluated, but is motivated by the educated guess that the fusion part
+ # probably does not need large spatial context, because at this point the
+ # features are spatially aligned by the preceding warp.
+ net = pyramid[-1]
+
+ # Loop starting from the 2nd coarsest level:
+ # for i in reversed(range(0, len(pyramid) - 1)):
+ for k, layers in enumerate(self.convs):
+ i = len(self.convs) - 1 - k
+ # Resize the tensor from coarser level to match for concatenation.
+ level_size = pyramid[i].shape[2:4]
+ net = F.interpolate(net, size=level_size, mode='nearest')
+ net = layers[0](net)
+ net = torch.cat([pyramid[i], net], dim=1)
+ net = layers[1](net)
+ net = layers[2](net)
+ net = self.output_conv(net)
+ return net
diff --git a/frame-interpolation-pytorch/inference.py b/frame-interpolation-pytorch/inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..344d8bb62d81f02b60846cbf0865b28744047f33
--- /dev/null
+++ b/frame-interpolation-pytorch/inference.py
@@ -0,0 +1,105 @@
+import bisect
+import os
+from tqdm import tqdm
+import torch
+import numpy as np
+import cv2
+
+from util import load_image
+
+
+def inference(model_path, img1, img2, save_path, gpu, inter_frames, fps, half):
+ model = torch.jit.load(model_path, map_location='cpu')
+ model.eval()
+ img_batch_1, crop_region_1 = load_image(img1)
+ img_batch_2, crop_region_2 = load_image(img2)
+
+ img_batch_1 = torch.from_numpy(img_batch_1).permute(0, 3, 1, 2)
+ img_batch_2 = torch.from_numpy(img_batch_2).permute(0, 3, 1, 2)
+
+ if not half:
+ model.float()
+
+ if gpu and torch.cuda.is_available():
+ if half:
+ model = model.half()
+ else:
+ model.float()
+ model = model.cuda()
+
+ if save_path == 'img1 folder':
+ save_path = os.path.join(os.path.split(img1)[0], 'output.mp4')
+
+ results = [
+ img_batch_1,
+ img_batch_2
+ ]
+
+ idxes = [0, inter_frames + 1]
+ remains = list(range(1, inter_frames + 1))
+
+ splits = torch.linspace(0, 1, inter_frames + 2)
+
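+ # Each iteration picks the remaining timestep whose position within its bracketing
+ # pair of already-generated frames is closest to the midpoint, so the model (often
+ # trained only for t=0.5) is always queried as close to t=0.5 as possible; `dt`
+ # below is that relative position.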
+ for _ in tqdm(range(len(remains)), 'Generating in-between frames'):
+ starts = splits[idxes[:-1]]
+ ends = splits[idxes[1:]]
+ distances = ((splits[None, remains] - starts[:, None]) / (ends[:, None] - starts[:, None]) - .5).abs()
+ matrix = torch.argmin(distances).item()
+ start_i, step = np.unravel_index(matrix, distances.shape)
+ end_i = start_i + 1
+
+ x0 = results[start_i]
+ x1 = results[end_i]
+
+ if gpu and torch.cuda.is_available():
+ if half:
+ x0 = x0.half()
+ x1 = x1.half()
+ x0 = x0.cuda()
+ x1 = x1.cuda()
+
+ dt = x0.new_full((1, 1), (splits[remains[step]] - splits[idxes[start_i]])) / (splits[idxes[end_i]] - splits[idxes[start_i]])
+
+ with torch.no_grad():
+ prediction = model(x0, x1, dt)
+ insert_position = bisect.bisect_left(idxes, remains[step])
+ idxes.insert(insert_position, remains[step])
+ results.insert(insert_position, prediction.clamp(0, 1).cpu().float())
+ del remains[step]
+
+ video_folder = os.path.split(save_path)[0]
+ os.makedirs(video_folder, exist_ok=True)
+
+ y1, x1, y2, x2 = crop_region_1
+ frames = [(tensor[0] * 255).byte().flip(0).permute(1, 2, 0).numpy()[y1:y2, x1:x2].copy() for tensor in results]
+
+ w, h = frames[0].shape[1::-1]
+ fourcc = cv2.VideoWriter_fourcc('m', 'p', '4', 'v')
+ writer = cv2.VideoWriter(save_path, fourcc, fps, (w, h))
+ for frame in frames:
+ writer.write(frame)
+
+ for frame in frames[1:][::-1]:
+ writer.write(frame)
+
+ writer.release()
+
+
+if __name__ == '__main__':
+ import argparse
+
+ parser = argparse.ArgumentParser(description='Test frame interpolator model')
+
+ parser.add_argument('model_path', type=str, help='Path to the TorchScript model')
+ parser.add_argument('img1', type=str, help='Path to the first image')
+ parser.add_argument('img2', type=str, help='Path to the second image')
+
+ parser.add_argument('--save_path', type=str, default='img1 folder', help='Path to save the interpolated frames')
+ parser.add_argument('--gpu', action='store_true', help='Use GPU')
+ parser.add_argument('--fp16', action='store_true', help='Use FP16')
+ parser.add_argument('--frames', type=int, default=18, help='Number of frames to interpolate')
+ parser.add_argument('--fps', type=int, default=10, help='FPS of the output video')
+
+ args = parser.parse_args()
+
+ inference(args.model_path, args.img1, args.img2, args.save_path, args.gpu, args.frames, args.fps, args.fp16)
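+
+# Example invocation using the files shipped in this folder (by default the video is
+# written as output.mp4 next to the first image):
+#
+#   python inference.py film_net_fp16.pt photos/one.png photos/two.png --gpu --fp16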
diff --git a/frame-interpolation-pytorch/interpolator.py b/frame-interpolation-pytorch/interpolator.py
new file mode 100644
index 0000000000000000000000000000000000000000..707f8a69af0c6783a75766fed38c1353e96d1c16
--- /dev/null
+++ b/frame-interpolation-pytorch/interpolator.py
@@ -0,0 +1,158 @@
+"""The film_net frame interpolator main model code.
+
+Basics
+======
+The film_net is an end-to-end learned neural frame interpolator implemented as
+a PyTorch model. It has the following inputs and outputs:
+
+Inputs:
+ x0: image A.
+ x1: image B.
+ time: desired sub-frame time.
+
+Outputs:
+ image: the predicted in-between image at the chosen time in range [0, 1].
+
+Additional outputs include forward and backward warped image pyramids, flow
+pyramids, etc., that can be visualized for debugging and analysis.
+
+Note that many training sets only contain triplets with ground truth at
+time=0.5. If a model has been trained with such a training set, it will only work
+well for synthesizing frames at time=0.5. Such models can only generate more
+in-between frames using recursion.
+
+Architecture
+============
+The inference consists of three main stages: 1) feature extraction 2) warping
+3) fusion. At a high level, the architecture has similarities to Context-aware
+Synthesis for Video Frame Interpolation [1], but the exact architecture is
+closer to Multi-view Image Fusion [2] with some modifications for the frame
+interpolation use-case.
+
+The feature extraction stage employs the cascaded multi-scale architecture described
+in [2]. The advantage of this architecture is that coarse level flow prediction
+can be learned from finer resolution image samples. This is especially useful
+to avoid overfitting with moderately sized datasets.
+
+The warping stage uses a residual flow prediction idea that is similar to
+PWC-Net [3], Multi-view Image Fusion [2] and many others.
+
+The fusion stage is similar to U-Net's decoder where the skip connections are
+connected to warped image and feature pyramids. This is described in [2].
+
+Implementation Conventions
+==========================
+Pyramids
+--------
+Throughout the model, all image and feature pyramids are stored as python lists
+with finest level first followed by downscaled versions obtained by successively
+halving the resolution. The depths of all pyramids are determined by the
+pyramid_levels argument. The only exception to this is internal to the feature
+extractor, where smaller feature pyramids are temporarily constructed with depth
+sub_levels.
+
+Color ranges & gamma
+--------------------
+The model code makes no assumptions on whether the images are in gamma or
+linearized space, or what the range of RGB color values is. So a model can be
+trained with different choices. This does not mean that all the choices lead to
+similar results. In practice the model has been proven to work well with RGB
+scale = [0,1] with gamma-space images (i.e. not linearized).
+
+[1] Context-aware Synthesis for Video Frame Interpolation, Niklaus and Liu, 2018
+[2] Multi-view Image Fusion, Trinidad et al, 2019
+[3] PWC-Net: CNNs for Optical Flow Using Pyramid, Warping, and Cost Volume
+"""
+from typing import Dict, List
+
+import torch
+from torch import nn
+
+import util
+from feature_extractor import FeatureExtractor
+from fusion import Fusion
+from pyramid_flow_estimator import PyramidFlowEstimator
+
+
+class Interpolator(nn.Module):
+ def __init__(
+ self,
+ pyramid_levels=7,
+ fusion_pyramid_levels=5,
+ specialized_levels=3,
+ sub_levels=4,
+ filters=64,
+ flow_convs=(3, 3, 3, 3),
+ flow_filters=(32, 64, 128, 256),
+ ):
+ super().__init__()
+ self.pyramid_levels = pyramid_levels
+ self.fusion_pyramid_levels = fusion_pyramid_levels
+
+ self.extract = FeatureExtractor(3, filters, sub_levels)
+ self.predict_flow = PyramidFlowEstimator(filters, flow_convs, flow_filters)
+ self.fuse = Fusion(sub_levels, specialized_levels, filters)
+
+ def shuffle_images(self, x0, x1):
+ return [
+ util.build_image_pyramid(x0, self.pyramid_levels),
+ util.build_image_pyramid(x1, self.pyramid_levels)
+ ]
+
+ def debug_forward(self, x0, x1, batch_dt) -> Dict[str, List[torch.Tensor]]:
+ image_pyramids = self.shuffle_images(x0, x1)
+
+ # Siamese feature pyramids:
+ feature_pyramids = [self.extract(image_pyramids[0]), self.extract(image_pyramids[1])]
+
+ # Predict forward flow.
+ forward_residual_flow_pyramid = self.predict_flow(feature_pyramids[0], feature_pyramids[1])
+
+ # Predict backward flow.
+ backward_residual_flow_pyramid = self.predict_flow(feature_pyramids[1], feature_pyramids[0])
+
+ # Concatenate features and images:
+
+ # Note that we keep up to 'fusion_pyramid_levels' levels as only those
+ # are used by the fusion module.
+
+ forward_flow_pyramid = util.flow_pyramid_synthesis(forward_residual_flow_pyramid)[:self.fusion_pyramid_levels]
+
+ backward_flow_pyramid = util.flow_pyramid_synthesis(backward_residual_flow_pyramid)[:self.fusion_pyramid_levels]
+
+ # We multiply the flows with t and 1-t to warp to the desired fractional time.
+ #
+ # Note: In the original film_net, time is fixed to 0.5 and the interpolator is
+ # invoked recursively for multi-frame interpolation. Here batch_dt is passed in
+ # directly with shape [B, 1] and broadcast over each flow pyramid level.
+ backward_flow = util.multiply_pyramid(backward_flow_pyramid, batch_dt)
+ forward_flow = util.multiply_pyramid(forward_flow_pyramid, 1 - batch_dt)
+
+ pyramids_to_warp = [
+ util.concatenate_pyramids(image_pyramids[0][:self.fusion_pyramid_levels],
+ feature_pyramids[0][:self.fusion_pyramid_levels]),
+ util.concatenate_pyramids(image_pyramids[1][:self.fusion_pyramid_levels],
+ feature_pyramids[1][:self.fusion_pyramid_levels])
+ ]
+
+ # Warp features and images using the flow. Note that we use backward warping
+ # and backward flow is used to read from image 0 and forward flow from
+ # image 1.
+ forward_warped_pyramid = util.pyramid_warp(pyramids_to_warp[0], backward_flow)
+ backward_warped_pyramid = util.pyramid_warp(pyramids_to_warp[1], forward_flow)
+
+ aligned_pyramid = util.concatenate_pyramids(forward_warped_pyramid,
+ backward_warped_pyramid)
+ aligned_pyramid = util.concatenate_pyramids(aligned_pyramid, backward_flow)
+ aligned_pyramid = util.concatenate_pyramids(aligned_pyramid, forward_flow)
+
+ return {
+ 'image': [self.fuse(aligned_pyramid)],
+ 'forward_residual_flow_pyramid': forward_residual_flow_pyramid,
+ 'backward_residual_flow_pyramid': backward_residual_flow_pyramid,
+ 'forward_flow_pyramid': forward_flow_pyramid,
+ 'backward_flow_pyramid': backward_flow_pyramid,
+ }
+
+ def forward(self, x0, x1, batch_dt) -> torch.Tensor:
+ return self.debug_forward(x0, x1, batch_dt)['image'][0]
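+
+
+# Minimal usage sketch (shapes only): the weights come from export.py, which copies
+# them out of the original TF SavedModel. H and W should be divisible by 64 with the
+# default pyramid_levels=7 (util.load_image pads to this alignment).
+#
+#   model = Interpolator().eval()
+#   x0 = torch.rand(1, 3, 256, 256)   # first frame, B x C x H x W in [0, 1]
+#   x1 = torch.rand(1, 3, 256, 256)   # second frame
+#   dt = torch.full((1, 1), 0.5)      # sub-frame time per batch element
+#   with torch.no_grad():
+#       mid = model(x0, x1, dt)       # predicted in-between frame, B x 3 x H x W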
diff --git a/frame-interpolation-pytorch/photos/one.png b/frame-interpolation-pytorch/photos/one.png
new file mode 100644
index 0000000000000000000000000000000000000000..044b61a95f23ff2b140c4deaf94230e10db2f7e2
--- /dev/null
+++ b/frame-interpolation-pytorch/photos/one.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8bad1c97feb31a4bec60a809f808e1b0a26f55219fa991c4caa2e696bce8e81f
+size 3442971
diff --git a/frame-interpolation-pytorch/photos/output.gif b/frame-interpolation-pytorch/photos/output.gif
new file mode 100644
index 0000000000000000000000000000000000000000..423413a343e899ff721db372c58d6c3452eba47d
--- /dev/null
+++ b/frame-interpolation-pytorch/photos/output.gif
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:81ff68882dfca2c22343d1a435de6815b7d1c9747899febf9bb429ec8746cc35
+size 2829322
diff --git a/frame-interpolation-pytorch/photos/two.png b/frame-interpolation-pytorch/photos/two.png
new file mode 100644
index 0000000000000000000000000000000000000000..c6aac8b76c7d8170987b380424facd2c3f30527f
--- /dev/null
+++ b/frame-interpolation-pytorch/photos/two.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d80058cede12e10b9d7fe49ea022d1cc4f9c28bd2a00a1c3d4830d048c55f3fa
+size 3392356
diff --git a/frame-interpolation-pytorch/pyramid_flow_estimator.py b/frame-interpolation-pytorch/pyramid_flow_estimator.py
new file mode 100644
index 0000000000000000000000000000000000000000..3083690a0737a19479a199ed633155cd6ff30163
--- /dev/null
+++ b/frame-interpolation-pytorch/pyramid_flow_estimator.py
@@ -0,0 +1,149 @@
+"""PyTorch layer for estimating optical flow by a residual flow pyramid.
+
+This approach of estimating optical flow between two images can be traced back
+to [1], but is also used by later neural optical flow computation methods such
+as SpyNet [2] and PWC-Net [3].
+
+The basic idea is that the optical flow is first estimated at a coarse
+resolution, then the flow is upsampled to warp the higher-resolution image, and
+then a residual correction is computed and added to the estimated flow. This
+process is repeated in a pyramid in coarse-to-fine order to successively
+increase the resolution of both optical flow and the warped image.
+
+Here, the optical flow predictor is used as an internal component of the
+film_net frame interpolator, to warp the two input images toward the in-between
+target frame.
+
+[1] F. Glazer, Hierarchical motion detection. PhD thesis, 1987.
+[2] A. Ranjan and M. J. Black, Optical Flow Estimation using a Spatial Pyramid
+ Network. 2016
+[3] D. Sun X. Yang, M-Y. Liu and J. Kautz, PWC-Net: CNNs for Optical Flow Using
+ Pyramid, Warping, and Cost Volume, 2017
+"""
+from typing import List
+
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+import util
+
+
+class FlowEstimator(nn.Module):
+ """Small-receptive field predictor for computing the flow between two images.
+
+ This is used to compute the residual flow fields in PyramidFlowEstimator.
+
+ Note that while the number of 3x3 convolutions & filters to apply is
+ configurable, two extra 1x1 convolutions are appended to extract the flow in
+ the end.
+
+ Args:
+ in_channels: Number of input feature channels (both images' features concatenated)
+ num_convs: Number of 3x3 convolutions to apply
+ num_filters: Number of filters in each 3x3 convolution
+ """
+
+ def __init__(self, in_channels: int, num_convs: int, num_filters: int):
+ super(FlowEstimator, self).__init__()
+
+ self._convs = nn.ModuleList()
+ for i in range(num_convs):
+ self._convs.append(util.Conv2d(in_channels=in_channels, out_channels=num_filters, size=3))
+ in_channels = num_filters
+ self._convs.append(util.Conv2d(in_channels, num_filters // 2, size=1))
+ in_channels = num_filters // 2
+ # For the final convolution, we want no activation at all to predict the
+ # optical flow vector values. We have done extensive testing on explicitly
+ # bounding these values using sigmoid, but it turned out that having no
+ # activation gives better results.
+ self._convs.append(util.Conv2d(in_channels, 2, size=1, activation=None))
+
+ def forward(self, features_a: torch.Tensor, features_b: torch.Tensor) -> torch.Tensor:
+ """Estimates optical flow between two images.
+
+ Args:
+ features_a: per-pixel feature vectors for image A (B x C x H x W)
+ features_b: per-pixel feature vectors for image B (B x C x H x W)
+
+ Returns:
+ A tensor with optical flow from A to B
+ """
+ net = torch.cat([features_a, features_b], dim=1)
+ for conv in self._convs:
+ net = conv(net)
+ return net
+
+
+class PyramidFlowEstimator(nn.Module):
+ """Predicts optical flow by coarse-to-fine refinement.
+ """
+
+ def __init__(self, filters: int = 64,
+ flow_convs: tuple = (3, 3, 3, 3),
+ flow_filters: tuple = (32, 64, 128, 256)):
+ super(PyramidFlowEstimator, self).__init__()
+
+ in_channels = filters << 1
+ predictors = []
+ for i in range(len(flow_convs)):
+ predictors.append(
+ FlowEstimator(
+ in_channels=in_channels,
+ num_convs=flow_convs[i],
+ num_filters=flow_filters[i]))
+ in_channels += filters << (i + 2)
+ self._predictor = predictors[-1]
+ self._predictors = nn.ModuleList(predictors[:-1][::-1])
+
+ def forward(self, feature_pyramid_a: List[torch.Tensor],
+ feature_pyramid_b: List[torch.Tensor]) -> List[torch.Tensor]:
+ """Estimates residual flow pyramids between two image pyramids.
+
+ Each image pyramid is represented as a list of tensors in fine-to-coarse
+ order. Each individual image is represented as a tensor where each pixel is
+ a vector of image features.
+
+ util.flow_pyramid_synthesis can be used to convert the residual flow
+ pyramid returned by this method into a flow pyramid, where each level
+ encodes the flow instead of a residual correction.
+
+ Args:
+ feature_pyramid_a: image pyramid as a list in fine-to-coarse order
+ feature_pyramid_b: image pyramid as a list in fine-to-coarse order
+
+ Returns:
+ List of flow tensors, in fine-to-coarse order, each level encoding the
+ difference against the bilinearly upsampled version from the coarser
+ level. The coarsest flow tensor, i.e. the last element in the list, is the
+ 'DC term', i.e. not a residual (alternatively you can think of it as a
+ residual against zero).
+ """
+ levels = len(feature_pyramid_a)
+ v = self._predictor(feature_pyramid_a[-1], feature_pyramid_b[-1])
+ residuals = [v]
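+ # Pyramid levels coarser than the specialized ones all share `self._predictor`
+ # (handled in the loop right below); `self._predictors` holds the level-specific
+ # estimators in coarse-to-fine order, so i = len(self._predictors) - 1 - k maps
+ # each one back to its pyramid level. Keeping two explicit loops, rather than
+ # indexing the ModuleList, appears to be what keeps this scriptable with torch.jit.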
+ for i in range(levels - 2, len(self._predictors) - 1, -1):
+ # Upsamples the flow to match the current pyramid level. Also, scales the
+ # magnitude by two to reflect the new size.
+ level_size = feature_pyramid_a[i].shape[2:4]
+ v = F.interpolate(2 * v, size=level_size, mode='bilinear')
+ # Warp feature_pyramid_b[i] image based on the current flow estimate.
+ warped = util.warp(feature_pyramid_b[i], v)
+ # Estimate the residual flow between pyramid_a[i] and warped image:
+ v_residual = self._predictor(feature_pyramid_a[i], warped)
+ residuals.insert(0, v_residual)
+ v = v_residual + v
+
+ for k, predictor in enumerate(self._predictors):
+ i = len(self._predictors) - 1 - k
+ # Upsamples the flow to match the current pyramid level. Also, scales the
+ # magnitude by two to reflect the new size.
+ level_size = feature_pyramid_a[i].shape[2:4]
+ v = F.interpolate(2 * v, size=level_size, mode='bilinear')
+ # Warp feature_pyramid_b[i] image based on the current flow estimate.
+ warped = util.warp(feature_pyramid_b[i], v)
+ # Estimate the residual flow between pyramid_a[i] and warped image:
+ v_residual = predictor(feature_pyramid_a[i], warped)
+ residuals.insert(0, v_residual)
+ v = v_residual + v
+ return residuals
diff --git a/frame-interpolation-pytorch/requirements.txt b/frame-interpolation-pytorch/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8cf85fc758e71c0f018843b4e238b3198d89da30
--- /dev/null
+++ b/frame-interpolation-pytorch/requirements.txt
@@ -0,0 +1,3 @@
+opencv-python
+torch
+tqdm
\ No newline at end of file
diff --git a/frame-interpolation-pytorch/util.py b/frame-interpolation-pytorch/util.py
new file mode 100644
index 0000000000000000000000000000000000000000..0eec1a11fa9d885a44081f35917783acda14d626
--- /dev/null
+++ b/frame-interpolation-pytorch/util.py
@@ -0,0 +1,166 @@
+"""Various utilities used in the film_net frame interpolator model."""
+from typing import List, Optional
+
+import cv2
+import numpy as np
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+
+def pad_batch(batch, align):
+ height, width = batch.shape[1:3]
+ height_to_pad = (align - height % align) if height % align != 0 else 0
+ width_to_pad = (align - width % align) if width % align != 0 else 0
+
+ crop_region = [height_to_pad >> 1, width_to_pad >> 1, height + (height_to_pad >> 1), width + (width_to_pad >> 1)]
+ batch = np.pad(batch, ((0, 0), (height_to_pad >> 1, height_to_pad - (height_to_pad >> 1)),
+ (width_to_pad >> 1, width_to_pad - (width_to_pad >> 1)), (0, 0)), mode='constant')
+ return batch, crop_region
+
+
+def load_image(path, align=64):
+ image = cv2.cvtColor(cv2.imread(path), cv2.COLOR_BGR2RGB).astype(np.float32) / np.float32(255)
+ image_batch, crop_region = pad_batch(np.expand_dims(image, axis=0), align)
+ return image_batch, crop_region
+
+
+def build_image_pyramid(image: torch.Tensor, pyramid_levels: int = 3) -> List[torch.Tensor]:
+ """Builds an image pyramid from a given image.
+
+ The original image is included in the pyramid and the rest are generated by
+ successively halving the resolution.
+
+ Args:
+ image: the input image.
+ pyramid_levels: number of pyramid levels to build.
+
+ Returns:
+ A list of images starting from the finest, with pyramid_levels items.
+ """
+
+ pyramid = []
+ for i in range(pyramid_levels):
+ pyramid.append(image)
+ if i < pyramid_levels - 1:
+ image = F.avg_pool2d(image, 2, 2)
+ return pyramid
+
+
+def warp(image: torch.Tensor, flow: torch.Tensor) -> torch.Tensor:
+ """Backward warps the image using the given flow.
+
+ Specifically, the output pixel in batch b, at position x, y will be computed
+ as follows:
+ (flowed_y, flowed_x) = (y + flow[b, 1, y, x], x + flow[b, 0, y, x])
+ output[b, :, y, x] = bilinear_lookup(image, b, flowed_y, flowed_x)
+
+ Note that the flow vectors are expected as (dx, dy), i.e. x in channel 0 and
+ y in channel 1.
+
+ Args:
+ image: An image with shape B x C x H x W.
+ flow: A flow with shape B x 2 x H x W, with the two channels denoting the
+ relative offset in order: (dx, dy).
+ Returns:
+ A warped image.
+ """
+ flow = -flow.flip(1)
+
+ dtype = flow.dtype
+ device = flow.device
+
+ # warped = tfa_image.dense_image_warp(image, flow)
+ # Same as above but with pytorch
+ ls1 = 1 - 1 / flow.shape[3]
+ ls2 = 1 - 1 / flow.shape[2]
+
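+ # With align_corners=False, grid_sample places pixel centers at normalized
+ # coordinates spanning [-(1 - 1/N), 1 - 1/N]; that is what ls1 (width) and ls2
+ # (height) encode. The pixel-space flow (already negated above) is divided by
+ # (H/2, W/2) below to express it in the same normalized units before it is
+ # combined with the base sampling grid.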
+ normalized_flow2 = flow.permute(0, 2, 3, 1) / torch.tensor(
+ [flow.shape[2] * .5, flow.shape[3] * .5], dtype=dtype, device=device)[None, None, None]
+ normalized_flow2 = torch.stack([
+ torch.linspace(-ls1, ls1, flow.shape[3], dtype=dtype, device=device)[None, None, :] - normalized_flow2[..., 1],
+ torch.linspace(-ls2, ls2, flow.shape[2], dtype=dtype, device=device)[None, :, None] - normalized_flow2[..., 0],
+ ], dim=3)
+
+ warped = F.grid_sample(image, normalized_flow2,
+ mode='bilinear', padding_mode='border', align_corners=False)
+ return warped.reshape(image.shape)
+
+
+def multiply_pyramid(pyramid: List[torch.Tensor],
+ scalar: torch.Tensor) -> List[torch.Tensor]:
+ """Multiplies all image batches in the pyramid by a batch of scalars.
+
+ Args:
+ pyramid: Pyramid of image batches.
+ scalar: Batch of scalars.
+
+ Returns:
+ An image pyramid with all images multiplied by the scalar.
+ """
+ # To multiply each image by its corresponding scalar, we append two trailing
+ # singleton dimensions to the [B, 1] scalar batch so it broadcasts over the
+ # channel and spatial dimensions of each B x C x H x W image batch.
+ return [image * scalar[..., None, None] for image in pyramid]
+
+
+def flow_pyramid_synthesis(
+ residual_pyramid: List[torch.Tensor]) -> List[torch.Tensor]:
+ """Converts a residual flow pyramid into a flow pyramid."""
+ flow = residual_pyramid[-1]
+ flow_pyramid: List[torch.Tensor] = [flow]
+ for residual_flow in residual_pyramid[:-1][::-1]:
+ level_size = residual_flow.shape[2:4]
+ flow = F.interpolate(2 * flow, size=level_size, mode='bilinear')
+ flow = residual_flow + flow
+ flow_pyramid.insert(0, flow)
+ return flow_pyramid
+
+
+def pyramid_warp(feature_pyramid: List[torch.Tensor],
+ flow_pyramid: List[torch.Tensor]) -> List[torch.Tensor]:
+ """Warps the feature pyramid using the flow pyramid.
+
+ Args:
+ feature_pyramid: feature pyramid starting from the finest level.
+ flow_pyramid: flow fields, starting from the finest level.
+
+ Returns:
+ Reverse warped feature pyramid.
+ """
+ warped_feature_pyramid = []
+ for features, flow in zip(feature_pyramid, flow_pyramid):
+ warped_feature_pyramid.append(warp(features, flow))
+ return warped_feature_pyramid
+
+
+def concatenate_pyramids(pyramid1: List[torch.Tensor],
+ pyramid2: List[torch.Tensor]) -> List[torch.Tensor]:
+ """Concatenates each pyramid level together in the channel dimension."""
+ result = []
+ for features1, features2 in zip(pyramid1, pyramid2):
+ result.append(torch.cat([features1, features2], dim=1))
+ return result
+
+
+class Conv2d(nn.Sequential):
+ def __init__(self, in_channels, out_channels, size, activation: Optional[str] = 'relu'):
+ assert activation in (None, 'relu')
+ super().__init__(
+ nn.Conv2d(
+ in_channels=in_channels,
+ out_channels=out_channels,
+ kernel_size=size,
+ padding='same' if size % 2 else 0)
+ )
+ self.size = size
+ self.activation = nn.LeakyReLU(.2) if activation == 'relu' else None
+
+ def forward(self, x):
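+ # For even kernel sizes, pad one extra pixel on the right and bottom to mirror
+ # TF's SAME padding convention (presumably what the converted TF weights expect);
+ # odd kernel sizes rely on padding='same' in the underlying nn.Conv2d instead.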
+ if not self.size % 2:
+ x = F.pad(x, (0, 1, 0, 1))
+ y = self[0](x)
+ if self.activation is not None:
+ y = self.activation(y)
+ return y
diff --git a/models/.DS_Store b/models/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..5008ddfcf53c02e82d7eee2e57c38e5672ef89f6
Binary files /dev/null and b/models/.DS_Store differ
diff --git a/models/jointembedding_high_env0.py b/models/jointembedding_high_env0.py
new file mode 100644
index 0000000000000000000000000000000000000000..044f144fcf1318ffa4eeee1fb3c25dec3f38767d
--- /dev/null
+++ b/models/jointembedding_high_env0.py
@@ -0,0 +1,483 @@
+import copy
+import math
+import pickle
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import difflib
+from typing import Optional, Tuple, Union
+
+from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, BertTokenizer, BertModel, Wav2Vec2Model, Wav2Vec2Config
+from transformers.models.wav2vec2.modeling_wav2vec2 import Wav2Vec2FeatureEncoder
+from .motion_encoder import VQEncoderV6
+
+
+def audio_to_time_aligned_text_features(inputs, processor, model, tokenizer, bert_model):
+ with torch.no_grad():
+ logits = model(inputs.input_values).logits # shape: (1, time_steps, vocab_size)
+
+ predicted_ids_per_timestep = torch.argmax(logits, dim=-1) # shape: (1, time_steps)
+ predicted_ids_per_timestep = predicted_ids_per_timestep[0].cpu().numpy()
+ vocab = processor.tokenizer.get_vocab()
+ id_to_token = {v: k for k, v in vocab.items()}
+ tokens_per_timestep = [id_to_token[id] for id in predicted_ids_per_timestep]
+
+ predicted_ids = torch.argmax(logits, dim=-1)
+ transcription = processor.decode(predicted_ids[0])
+ inputs_bert = tokenizer(transcription, return_tensors='pt')
+ input_ids = inputs_bert['input_ids'][0]
+ tokens_bert = tokenizer.convert_ids_to_tokens(input_ids)
+
+ with torch.no_grad():
+ outputs_bert = bert_model(**inputs_bert.to(inputs.input_values.device))
+ all_token_embeddings = outputs_bert.last_hidden_state[0]
+ per_timestep_chars = []
+ per_timestep_char_indices = []
+ for idx, t in enumerate(tokens_per_timestep):
+ if t not in ('