Initial upload of MotionStreamer code, excluding large extracted data and output folders.
This view is limited to 50 files because the commit contains too many changes.
- .gitattributes +1 -0
- .ipynb_checkpoints/TRAIN_motionstreamer-checkpoint.sh +15 -0
- .ipynb_checkpoints/demo_t2m-checkpoint.py +204 -0
- .ipynb_checkpoints/environment-checkpoint.yaml +258 -0
- .ipynb_checkpoints/requirements-checkpoint.txt +17 -0
- .ipynb_checkpoints/train_motionstreamer-checkpoint.py +264 -0
- EVAL_causal_TAE.sh +6 -0
- EVAL_t2m.sh +7 -0
- LICENSE +21 -0
- README.md +336 -0
- TRAIN_causal_TAE.sh +22 -0
- TRAIN_evaluator_272.sh +6 -0
- TRAIN_motionstreamer.sh +16 -0
- TRAIN_t2m.sh +15 -0
- assets/teaser.jpg +3 -0
- babel_272/.gitattributes +59 -0
- babel_272/README.md +34 -0
- babel_272/motion_data.zip +3 -0
- babel_272/split/train.txt +0 -0
- babel_272/split/val.txt +0 -0
- babel_272/t2m_babel_mean_std/Mean.npy +3 -0
- babel_272/t2m_babel_mean_std/Std.npy +3 -0
- babel_272/texts.zip +3 -0
- babel_272_stream/.gitattributes +59 -0
- babel_272_stream/README.md +62 -0
- babel_272_stream/train_stream.zip +3 -0
- babel_272_stream/train_stream_text.zip +3 -0
- babel_272_stream/val_stream.zip +3 -0
- babel_272_stream/val_stream_text.zip +3 -0
- body_models/human_model_files/mano/MANO_LEFT.pkl +3 -0
- body_models/human_model_files/mano/MANO_RIGHT.pkl +3 -0
- body_models/human_model_files/smpl/J_regressor_extra.npy +3 -0
- body_models/human_model_files/smpl/SMPL_FEMALE.pkl +3 -0
- body_models/human_model_files/smpl/SMPL_MALE.pkl +3 -0
- body_models/human_model_files/smpl/SMPL_NEUTRAL.pkl +3 -0
- body_models/human_model_files/smpl/VPOSER_CKPT/TR00_004_00_WO_accad.ini +29 -0
- body_models/human_model_files/smpl/VPOSER_CKPT/snapshots/._TR00_E096.pt +3 -0
- body_models/human_model_files/smpl/VPOSER_CKPT/snapshots/TR00_E096.pt +3 -0
- body_models/human_model_files/smpl/VPOSER_CKPT/vposer_smpl.py +164 -0
- body_models/human_model_files/smplx/MANO_SMPLX_vertex_ids.pkl +3 -0
- body_models/human_model_files/smplx/SMPL-X__FLAME_vertex_ids.npy +3 -0
- body_models/human_model_files/smplx/SMPLX_FEMALE.npz +3 -0
- body_models/human_model_files/smplx/SMPLX_FEMALE.pkl +3 -0
- body_models/human_model_files/smplx/SMPLX_MALE.npz +3 -0
- body_models/human_model_files/smplx/SMPLX_MALE.pkl +3 -0
- body_models/human_model_files/smplx/SMPLX_NEUTRAL.npz +3 -0
- body_models/human_model_files/smplx/SMPLX_NEUTRAL.pkl +3 -0
- body_models/human_model_files/smplx/SMPLX_NEUTRAL_NEW.npy +3 -0
- body_models/human_model_files/smplx/SMPLX_NEUTRAL_NEW.npz +3 -0
- body_models/human_model_files/smplx/SMPLX_NEUTRAL_NEW_WiFlame.npy +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+assets/teaser.jpg filter=lfs diff=lfs merge=lfs -text
.ipynb_checkpoints/TRAIN_motionstreamer-checkpoint.sh
ADDED
@@ -0,0 +1,15 @@
+NUM_GPUS=${1:-1} # default: 1 GPU
+
+BATCH_SIZE=$((30 / NUM_GPUS))
+
+echo "Using $NUM_GPUS GPUs, each with a batch size of $BATCH_SIZE"
+
+accelerate launch --num_processes $NUM_GPUS train_motionstreamer.py \
+    --batch-size $BATCH_SIZE \
+    --lr 0.0001 \
+    --total-iter 100000 \
+    --out-dir Experiments \
+    --exp-name motionstreamer_model \
+    --dataname t2m_babel_272 \
+    --latent_dir babel_272_stream/t2m_babel_latents \
+    --num_gpus $NUM_GPUS
.ipynb_checkpoints/demo_t2m-checkpoint.py
ADDED
@@ -0,0 +1,204 @@
+import os
+import torch
+import numpy as np
+from models.llama_model import LLaMAHF, LLaMAHFConfig
+import models.tae as tae
+import options.option_transformer as option_trans
+import warnings
+
+import smplx
+from utils import bvh, quat
+from utils.face_z_align_util import rotation_6d_to_matrix, matrix_to_axis_angle, axis_angle_to_quaternion
+
+
+warnings.filterwarnings('ignore')
+
+comp_device = torch.device('cuda')
+##### ---- Exp dirs ---- #####
+args = option_trans.get_args_parser()
+torch.manual_seed(args.seed)
+
+from sentence_transformers import SentenceTransformer
+t5_model = SentenceTransformer('sentencet5-xxl/')
+t5_model.eval()
+for p in t5_model.parameters():
+    p.requires_grad = False
+
+def save_motion_as_bvh(motion_data, output_path, fps=30):
+    """
+    Saves a motion tensor in the 272-dimensional format to a BVH file.
+    This version is adapted from the official repository script for robustness.
+    """
+    print(f"--- Starting direct conversion to BVH: {os.path.basename(output_path)} ---")
+    try:
+        # --- 1. Ensure data is a 2D NumPy array ---
+        if isinstance(motion_data, torch.Tensor):
+            motion_data = motion_data.detach().cpu().numpy()
+
+        # This is the key fix: Check dimensions before squeezing
+        if motion_data.ndim == 3 and motion_data.shape[0] == 1:
+            motion_data = motion_data.squeeze(0)
+        elif motion_data.ndim != 2:
+            raise ValueError(f"Input motion data must be 2D or 3D with a batch size of 1, but got shape {motion_data.shape}")
+
+        # --- 2. Recover 85-dim SMPL format from 272-dim format ---
+        # This logic is from the official script's `recover_from_local_rotation`
+        njoint = 22
+        nfrm, _ = motion_data.shape
+
+        rotations_matrix = rotation_6d_to_matrix(torch.from_numpy(motion_data[:, 8+6*njoint : 8+12*njoint]).reshape(nfrm, -1, 6)).numpy()
+
+        # Accumulate heading rotations
+        global_heading_diff_rot_6d = torch.from_numpy(motion_data[:, 2:8])
+        global_heading_diff_rot = rotation_6d_to_matrix(global_heading_diff_rot_6d).numpy()
+        global_heading_rot = np.zeros_like(global_heading_diff_rot)
+        global_heading_rot[0] = global_heading_diff_rot[0]
+        for i in range(1, nfrm):
+            global_heading_rot[i] = np.matmul(global_heading_diff_rot[i], global_heading_rot[i-1])
+
+        # Calculate root translation
+        velocities_root_xy = motion_data[:, :2]
+        positions_no_heading = motion_data[:, 8 : 8+3*njoint].reshape(nfrm, -1, 3)
+        height = positions_no_heading[:, 0, 1]
+
+        inv_global_heading_rot = np.transpose(global_heading_rot, (0, 2, 1))
+        rotations_matrix[:, 0, ...] = np.matmul(inv_global_heading_rot, rotations_matrix[:, 0, ...])
+
+        velocities_root_xyz = np.zeros((nfrm, 3))
+        velocities_root_xyz[:, 0] = velocities_root_xy[:, 0]
+        velocities_root_xyz[:, 2] = velocities_root_xy[:, 1]
+        velocities_root_xyz[1:, :] = np.matmul(inv_global_heading_rot[:-1], velocities_root_xyz[1:, :, None]).squeeze(-1)
+        root_translation = np.cumsum(velocities_root_xyz, axis=0)
+        root_translation[:, 1] = height
+
+        # Convert rotation matrices to axis-angle
+        axis_angle = matrix_to_axis_angle(torch.from_numpy(rotations_matrix)).numpy()
+        poses_85dim = np.concatenate([axis_angle.reshape(nfrm, -1), np.zeros((nfrm, 6)), root_translation, np.zeros((nfrm, 10))], axis=-1)
+
+        # --- 3. Convert 85-dim SMPL to BVH data ---
+        # This logic is from the official script's `smpl2bvh`
+        rots = poses_85dim[:, :72].reshape(-1, 24, 3)
+        trans = poses_85dim[:, 72:75]
+
+        # Get skeleton from SMPL model
+        model = smplx.create(model_path="body_models/human_model_files", model_type="smpl", gender="NEUTRAL")
+        parents = model.parents.detach().cpu().numpy()
+        rest_pose = model().joints.detach().cpu().numpy().squeeze()[:24,:]
+        offsets = rest_pose - rest_pose[parents]
+        offsets[0] = np.array([0,0,0])
+
+        rotations_quat = axis_angle_to_quaternion(torch.from_numpy(rots)).numpy()
+        rotations_euler = np.degrees(quat.to_euler(rotations_quat, order="zyx"))
+
+        positions = offsets[None].repeat(len(rots), axis=0)
+        positions[:, 0] = trans
+
+        joint_names = [
+            "Pelvis", "Left_hip", "Right_hip", "Spine1", "Left_knee", "Right_knee", "Spine2",
+            "Left_ankle", "Right_ankle", "Spine3", "Left_foot", "Right_foot", "Neck",
+            "Left_collar", "Right_collar", "Head", "Left_shoulder", "Right_shoulder",
+            "Left_elbow", "Right_elbow", "Left_wrist", "Right_wrist", "Left_hand", "Right_hand"
+        ]
+
+        # --- 4. Save the final BVH file ---
+        bvh.save(output_path, {
+            "rotations": rotations_euler,
+            "positions": positions,
+            "offsets": offsets,
+            "parents": parents,
+            "names": joint_names,
+            "order": "zyx",
+            "frametime": 1.0 / fps,
+        })
+        print(f"✅ BVH file saved successfully to {output_path}")
+
+    except Exception as e:
+        print(f"❌ BVH Conversion Failed. Error: {e}")
+        import traceback
+        traceback.print_exc()
+
+
+##### ---- Network ---- #####
+clip_range = [-30,20]
+
+net = tae.Causal_HumanTAE(
+    hidden_size=args.hidden_size,
+    down_t=args.down_t,
+    stride_t=args.stride_t,
+    depth=args.depth,
+    dilation_growth_rate=args.dilation_growth_rate,
+    activation='relu',
+    latent_dim=args.latent_dim,
+    clip_range=clip_range
+)
+
+
+config = LLaMAHFConfig.from_name('Normal_size')
+config.block_size = 78
+trans_encoder = LLaMAHF(config, args.num_diffusion_head_layers, args.latent_dim, comp_device)
+
+print('loading checkpoint from {}'.format(args.resume_pth))
+ckpt = torch.load(args.resume_pth, map_location='cpu')
+net.load_state_dict(ckpt['net'], strict=True)
+net.eval()
+net.to(comp_device)
+
+
+if args.resume_trans is not None:
+    print('loading transformer checkpoint from {}'.format(args.resume_trans))
+    ckpt = torch.load(args.resume_trans, map_location='cpu')
+    new_ckpt_trans = {}
+    for key in ckpt['trans'].keys():
+        if key.split('.')[0]=='module':
+            new_key = '.'.join(key.split('.')[1:])
+        else:
+            new_key = key
+        new_ckpt_trans[new_key] = ckpt['trans'][key]
+    trans_encoder.load_state_dict(new_ckpt_trans, strict=True)
+trans_encoder.eval()
+trans_encoder.to(comp_device)
+
+
+reference_end_latent = np.load('reference_end_latent_t2m_272.npy')
+reference_end_latent = torch.from_numpy(reference_end_latent).to(comp_device)
+
+mean = np.load('humanml3d_272/mean_std/Mean.npy')
+std = np.load('humanml3d_272/mean_std/Std.npy')
+
+# forward inference
+threshold = 0.1
+cfg_scale = 4.0
+print(f"Generating motion with CFG scale: {cfg_scale}")
+motion_latents = trans_encoder.sample_for_eval_CFG_inference(text=args.text, tokenizer=t5_model, device=comp_device, reference_end_latent=reference_end_latent, threshold=threshold, cfg=cfg_scale)
+
+# forward decode
+motion_seqs = net.forward_decoder(motion_latents)
+from visualization.recover_visualize import recover_from_local_position
+import visualization.plot_3d_global as plot_3d
+
+motion = motion_seqs.squeeze(0)
+motion = motion.detach().cpu().numpy()
+
+if not os.path.exists('demo_output'):
+    os.makedirs('demo_output')
+
+if args.mode == 'pos':
+    # Option1: recover from joint position
+    pred_xyz = recover_from_local_position(motion * std + mean, 22)
+    xyz = pred_xyz.reshape(1, -1, 22, 3)
+    pose_vis = plot_3d.draw_to_batch(xyz, [args.text], [f'demo_output/{args.text}.mp4'], fps=30)
+    print(f"Visualized result is saved in demo_output/{args.text}.mp4")
+
+elif args.mode == 'rot':
+    # De-normalize the motion data to its original scale
+    motion = motion * std + mean
+
+    # Define the output path for the new BVH file
+    output_bvh_path = os.path.join('demo_output', f'{args.text}.bvh')
+
+    # Call the new function to save the BVH file directly
+    save_motion_as_bvh(motion, output_bvh_path, fps=30)
+
+else:
+    raise ValueError(f'Invalid mode: {args.mode}')
+
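For orientation, the slicing in `save_motion_as_bvh` above implies a fixed layout of each 272-dim frame. The sketch below is not part of the upload; it only names the chunks exactly as the demo script indexes them, and the label for the middle 66-dim block (which the demo never reads) is an assumption based on the 272-dim representation repository.

```python
# Minimal sketch (not from the repository): name the 272-dim chunks as demo_t2m indexes them.
import numpy as np

NJOINT = 22  # matches `njoint = 22` in save_motion_as_bvh

def split_272(frame_vec: np.ndarray) -> dict:
    """Split a (..., 272) frame vector into the blocks used by save_motion_as_bvh."""
    assert frame_vec.shape[-1] == 8 + 12 * NJOINT  # 8 + 12*22 = 272
    return {
        "root_xy_velocity": frame_vec[..., 0:2],                                   # 2 dims
        "heading_diff_rot_6d": frame_vec[..., 2:8],                                # 6 dims
        "joint_positions_no_heading": frame_vec[..., 8:8 + 3 * NJOINT],            # 66 dims
        "joint_velocities_assumed": frame_vec[..., 8 + 3 * NJOINT:8 + 6 * NJOINT], # 66 dims (assumption; unused by the demo)
        "joint_rotations_6d": frame_vec[..., 8 + 6 * NJOINT:8 + 12 * NJOINT],      # 132 dims
    }

if __name__ == "__main__":
    for name, chunk in split_272(np.zeros(272)).items():
        print(name, chunk.shape)
```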
.ipynb_checkpoints/environment-checkpoint.yaml
ADDED
@@ -0,0 +1,258 @@
+name: mgpt
+channels:
+  - pytorch
+  - conda-forge
+  - defaults
+  - https://repo.anaconda.com/pkgs/main
+  - https://repo.anaconda.com/pkgs/r
+dependencies:
+  - _libgcc_mutex=0.1=main
+  - _openmp_mutex=4.5=1_gnu
+  - asttokens=3.0.0=pyhd8ed1ab_0
+  - backcall=0.2.0=pyh9f0ad1d_0
+  - blas=1.0=mkl
+  - bzip2=1.0.8=h7b6447c_0
+  - ca-certificates=2025.1.31=hbcca054_0
+  - certifi=2024.8.30=pyhd8ed1ab_0
+  - comm=0.2.2=pyhd8ed1ab_0
+  - cudatoolkit=10.1.243=h6bb024c_0
+  - debugpy=1.4.1=py38h709712a_0
+  - entrypoints=0.4=pyhd8ed1ab_0
+  - executing=2.1.0=pyhd8ed1ab_0
+  - ffmpeg=4.3=hf484d3e_0
+  - freetype=2.10.4=h5ab3b9f_0
+  - gmp=6.2.1=h2531618_2
+  - gnutls=3.6.15=he1e5248_0
+  - intel-openmp=2021.3.0=h06a4308_3350
+  - ipykernel=6.20.2=pyh210e3f2_0
+  - jpeg=9b=h024ee3a_2
+  - jupyter_client=7.1.2=pyhd8ed1ab_0
+  - jupyter_core=5.7.2=pyh31011fe_1
+  - lame=3.100=h7b6447c_0
+  - lcms2=2.12=h3be6417_0
+  - ld_impl_linux-64=2.35.1=h7274673_9
+  - libffi=3.3=he6710b0_2
+  - libgcc-ng=9.3.0=h5101ec6_17
+  - libgomp=9.3.0=h5101ec6_17
+  - libiconv=1.15=h63c8f33_5
+  - libidn2=2.3.2=h7f8727e_0
+  - libpng=1.6.37=hbc83047_0
+  - libsodium=1.0.18=h36c2ea0_1
+  - libstdcxx-ng=13.2.0=hc0a3c3a_7
+  - libtasn1=4.16.0=h27cfd23_0
+  - libtiff=4.2.0=h85742a9_0
+  - libunistring=0.9.10=h27cfd23_0
+  - libuv=1.40.0=h7b6447c_0
+  - libwebp-base=1.2.0=h27cfd23_0
+  - lz4-c=1.9.3=h295c915_1
+  - mkl=2021.3.0=h06a4308_520
+  - mkl-service=2.4.0=py38h7f8727e_0
+  - mkl_fft=1.3.0=py38h42c9631_2
+  - mkl_random=1.2.2=py38h51133e4_0
+  - ncurses=6.2=he6710b0_1
+  - nest-asyncio=1.6.0=pyhd8ed1ab_0
+  - nettle=3.7.3=hbbd107a_1
+  - ninja=1.10.2=hff7bd54_1
+  - olefile=0.46=py_0
+  - openh264=2.1.0=hd408876_0
+  - openjpeg=2.3.0=h05c96fa_1
+  - openssl=1.1.1k=h7f98852_0
+  - packaging=24.2=pyhd8ed1ab_2
+  - pickleshare=0.7.5=py_1003
+  - pillow=8.3.1=py38h2c7a002_0
+  - pip=21.0.1=py38h06a4308_0
+  - platformdirs=4.3.6=pyhd8ed1ab_0
+  - prompt_toolkit=3.0.48=hd8ed1ab_1
+  - ptyprocess=0.7.0=pyhd3deb0d_0
+  - pure_eval=0.2.3=pyhd8ed1ab_0
+  - pygments=2.18.0=pyhd8ed1ab_0
+  - python=3.8.11=h12debd9_0_cpython
+  - python_abi=3.8=5_cp38
+  - pyzmq=22.1.0=py38h2035c66_0
+  - readline=8.1=h27cfd23_0
+  - setuptools=52.0.0=py38h06a4308_0
+  - six=1.16.0=pyhd3eb1b0_0
+  - sqlite=3.36.0=hc218d9a_0
+  - stack_data=0.6.2=pyhd8ed1ab_0
+  - tk=8.6.10=hbc83047_0
+  - torchaudio=0.8.1=py38
+  - torchvision=0.9.1=py38_cu101
+  - tornado=6.1=py38h497a2fe_1
+  - wheel=0.37.0=pyhd3eb1b0_0
+  - xz=5.2.5=h7b6447c_0
+  - zeromq=4.3.4=h9c3ff4c_0
+  - zlib=1.2.11=h7b6447c_3
+  - zstd=1.4.9=haebb681_0
+  - pip:
+    - absl-py==0.13.0
+    - accelerate==1.0.1
+    - aiohappyeyeballs==2.4.3
+    - aiohttp==3.10.11
+    - aiosignal==1.3.1
+    - annotated-types==0.7.0
+    - antlr4-python3-runtime==4.9.3
+    - async-timeout==5.0.1
+    - attrs==24.2.0
+    - beautifulsoup4==4.12.3
+    - blis==0.7.11
+    - cachetools==4.2.2
+    - catalogue==2.0.10
+    - charset-normalizer==2.0.4
+    - chumpy==0.70
+    - click==8.1.7
+    - clip==1.0
+    - cloudpathlib==0.20.0
+    - confection==0.1.5
+    - cycler==0.10.0
+    - cymem==2.0.10
+    - decorator==5.0.9
+    - diffusers==0.31.0
+    - einops==0.8.0
+    - ffmpeg-python==0.2.0
+    - filelock==3.16.1
+    - freetype-py==2.5.1
+    - frozenlist==1.5.0
+    - fsspec==2024.2.0
+    - ftfy==6.1.1
+    - future==1.0.0
+    - fvcore==0.1.5.post20221221
+    - gdown==5.2.0
+    - glfw==2.8.0
+    - google-auth==2.36.0
+    - google-auth-oauthlib==0.4.6
+    - grpcio==1.68.0
+    - h5py==3.11.0
+    - huggingface-hub==0.26.2
+    - human-body-prior==2.2.2.0
+    - idna==3.2
+    - imageio==2.9.0
+    - imageio-ffmpeg==0.5.1
+    - importlib-metadata==8.5.0
+    - iopath==0.1.10
+    - ipdb==0.13.9
+    - ipython==7.26.0
+    - ipython-genutils==0.2.0
+    - jedi==0.18.0
+    - jinja2==3.1.3
+    - joblib==1.0.1
+    - kiwisolver==1.3.1
+    - langcodes==3.4.1
+    - language-data==1.3.0
+    - lightning-utilities==0.11.9
+    - marisa-trie==1.2.1
+    - markdown==3.3.4
+    - markdown-it-py==3.0.0
+    - markupsafe==2.1.5
+    - matplotlib==3.4.3
+    - matplotlib-inline==0.1.2
+    - mdurl==0.1.2
+    - moviepy==0.2.3.1
+    - mpmath==1.3.0
+    - multidict==6.1.0
+    - murmurhash==1.0.11
+    - natsort==8.4.0
+    - networkx==3.0
+    - numpy==1.22.4
+    - nvidia-cublas-cu11==11.11.3.6
+    - nvidia-cublas-cu12==12.1.3.1
+    - nvidia-cuda-cupti-cu11==11.8.87
+    - nvidia-cuda-cupti-cu12==12.1.105
+    - nvidia-cuda-nvrtc-cu11==11.8.89
+    - nvidia-cuda-nvrtc-cu12==12.1.105
+    - nvidia-cuda-runtime-cu11==11.8.89
+    - nvidia-cuda-runtime-cu12==12.1.105
+    - nvidia-cudnn-cu11==9.1.0.70
+    - nvidia-cudnn-cu12==9.1.0.70
+    - nvidia-cufft-cu11==10.9.0.58
+    - nvidia-cufft-cu12==11.0.2.54
+    - nvidia-curand-cu11==10.3.0.86
+    - nvidia-curand-cu12==10.3.2.106
+    - nvidia-cusolver-cu11==11.4.1.48
+    - nvidia-cusolver-cu12==11.4.5.107
+    - nvidia-cusparse-cu11==11.7.5.86
+    - nvidia-cusparse-cu12==12.1.0.106
+    - nvidia-nccl-cu11==2.20.5
+    - nvidia-nccl-cu12==2.20.5
+    - nvidia-nvjitlink-cu12==12.1.105
+    - nvidia-nvtx-cu11==11.8.86
+    - nvidia-nvtx-cu12==12.1.105
+    - oauthlib==3.1.1
+    - omegaconf==2.3.0
+    - orjson==3.10.15
+    - pandas==1.3.2
+    - parso==0.8.2
+    - pexpect==4.8.0
+    - portalocker==3.0.0
+    - preshed==3.0.9
+    - prompt-toolkit==3.0.20
+    - propcache==0.2.0
+    - protobuf==5.28.3
+    - psutil==6.1.0
+    - pyasn1==0.4.8
+    - pyasn1-modules==0.2.8
+    - pydantic==2.10.1
+    - pydantic-core==2.27.1
+    - pydeprecate==0.3.2
+    - pygame==2.6.1
+    - pyglet==2.1.2
+    - pyopengl==3.1.0
+    - pyparsing==2.4.7
+    - pyrender==0.1.45
+    - pysocks==1.7.1
+    - python-dateutil==2.8.2
+    - pytorch-lightning==1.7.0
+    - pytorch3d==0.3.0
+    - pytz==2021.1
+    - pyyaml==5.4.1
+    - regex==2024.11.6
+    - requests==2.26.0
+    - requests-oauthlib==1.3.0
+    - rich==13.9.4
+    - rsa==4.7.2
+    - safetensors==0.4.5
+    - scikit-learn==0.24.2
+    - scipy==1.7.1
+    - sentence-transformers==3.2.1
+    - sentencepiece==0.2.0
+    - shapely==2.0.7
+    - shellingham==1.5.4
+    - sklearn==0.0
+    - smart-open==7.0.5
+    - smplx==0.1.28
+    - soupsieve==2.6
+    - spacy==3.7.5
+    - spacy-legacy==3.0.12
+    - spacy-loggers==1.0.5
+    - srsly==2.4.8
+    - sympy==1.13.1
+    - tabulate==0.9.0
+    - tensorboard==2.12.0
+    - tensorboard-data-server==0.7.2
+    - tensorboard-plugin-wit==1.8.0
+    - termcolor==2.4.0
+    - thinc==8.2.5
+    - threadpoolctl==2.2.0
+    - timm==1.0.12
+    - tokenizers==0.20.3
+    - toml==0.10.2
+    - torch==2.4.1+cu118
+    - torchgeometry==0.1.2
+    - torchmetrics==0.7.0
+    - tqdm==4.62.2
+    - traitlets==5.0.5
+    - transformers==4.46.3
+    - triangle==20250106
+    - trimesh==4.6.2
+    - triton==3.0.0
+    - typer==0.13.1
+    - typing-extensions==4.12.2
+    - urllib3==1.26.6
+    - wasabi==1.1.3
+    - wcwidth==0.2.5
+    - weasel==0.4.1
+    - werkzeug==2.0.1
+    - wrapt==1.17.0
+    - yacs==0.1.8
+    - yarl==1.15.2
+    - zipp==3.20.2
+prefix: /root/miniconda3/envs/mgpt
.ipynb_checkpoints/requirements-checkpoint.txt
ADDED
@@ -0,0 +1,17 @@
+smplx==0.1.28
+transformers==4.56.2
+timm==1.0.12
+sentence-transformers==5.1.0
+clip @ git+https://github.com/openai/CLIP.git@main#egg=clip
+human-body-prior @ git+https://github.com/nghorbani/human_body_prior.git@master#egg=human-body-prior
+gdown
+chumpy==0.70
+scipy==1.7.1
+numpy==1.22.4
+tensorboard
+accelerate
+flash_attn
+matplotlib==3.4.3
+matplotlib-inline==0.1.2
+imageio==2.9.0
+imageio-ffmpeg==0.5.1
.ipynb_checkpoints/train_motionstreamer-checkpoint.py
ADDED
@@ -0,0 +1,264 @@
+"""Train streaming motion generation model (MotionStreamer) with llama blocks, Two-Forward strategy and QK-Norm, using the motion latents encoded by the Causal TAE (trained in the first stage)."""
+
+
+import os
+import torch
+import numpy as np
+import random
+from torch.utils.tensorboard import SummaryWriter
+import json
+from accelerate import Accelerator
+from tqdm import tqdm
+from models.llama_model import LLaMAHF, LLaMAHFConfig
+import options.option_transformer as option_trans
+import utils.utils_model as utils_model
+import warnings
+from torch.optim.lr_scheduler import LambdaLR, CosineAnnealingLR
+warnings.filterwarnings('ignore')
+
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+##### ---- Exp dirs ---- #####
+args = option_trans.get_args_parser()
+torch.manual_seed(args.seed)
+
+# warm-up + cosine decay scheduler
+class WarmupCosineDecayScheduler:
+    def __init__(self, optimizer, warmup_iters, total_iters, min_lr=0):
+        self.optimizer = optimizer
+        self.warmup_iters = warmup_iters
+        self.total_iters = total_iters
+        self.min_lr = min_lr
+
+        self.warmup_scheduler = LambdaLR(optimizer, lr_lambda=self.warmup_lambda)
+
+        self.cosine_scheduler = CosineAnnealingLR(optimizer,
+                                                  T_max=total_iters - warmup_iters,
+                                                  eta_min=min_lr)
+
+    def warmup_lambda(self, current_iter):
+        if current_iter < self.warmup_iters:
+            return float(current_iter) / float(max(1, self.warmup_iters))
+        return 1.0
+
+    def step(self, current_iter):
+        if current_iter < self.warmup_iters:
+            self.warmup_scheduler.step()
+        else:
+            self.cosine_scheduler.step()
+
+    def state_dict(self):
+        return {
+            'warmup_iters': self.warmup_iters,
+            'total_iters': self.total_iters,
+            'min_lr': self.min_lr,
+        }
+
+    def load_state_dict(self, state_dict):
+        self.warmup_iters = state_dict['warmup_iters']
+        self.total_iters = state_dict['total_iters']
+        self.min_lr = state_dict['min_lr']
+
+
+
+args.out_dir = os.path.join(args.out_dir, f'{args.exp_name}')
+os.makedirs(args.out_dir, exist_ok = True)
+
+
+##### ---- Accelerator Setup ---- #####
+accelerator = Accelerator()
+comp_device = accelerator.device
+
+##### ---- Logger ---- #####
+logger = utils_model.get_logger(args.out_dir)
+writer = SummaryWriter(args.out_dir)
+logger.info(json.dumps(vars(args), indent=4, sort_keys=True))
+
+##### ---- Dataloader ---- #####
+from humanml3d_272 import dataset_TM_train_motionstreamer
+train_loader = dataset_TM_train_motionstreamer.DATALoader(args.dataname, args.batch_size, unit_length=2**args.down_t, latent_dir=args.latent_dir)
+
+
+##### ---- Network ---- #####
+from sentence_transformers import SentenceTransformer
+t5_model = SentenceTransformer('sentencet5-xxl/')
+t5_model.eval()
+for p in t5_model.parameters():
+    p.requires_grad = False
+
+
+config = LLaMAHFConfig.from_name('Normal_size')
+config.block_size = 78
+trans_encoder = LLaMAHF(config, args.num_diffusion_head_layers, args.latent_dim, comp_device)
+
+if args.resume_trans is not None:
+    print('loading transformer checkpoint from {}'.format(args.resume_trans))
+    ckpt = torch.load(args.resume_trans, map_location='cpu')
+    new_ckpt_trans = {}
+    for key in ckpt['trans'].keys():
+        if key.split('.')[0]=='module':
+            new_key = '.'.join(key.split('.')[1:])
+        else:
+            new_key = key
+        new_ckpt_trans[new_key] = ckpt['trans'][key]
+    trans_encoder.load_state_dict(new_ckpt_trans, strict=True)
+trans_encoder.train()
+trans_encoder.to(comp_device)
+
+
+##### ---- Optimizer & Scheduler ---- #####
+optimizer = utils_model.initial_optim(args.decay_option, args.lr, args.weight_decay, trans_encoder, args.optimizer)
+scheduler = WarmupCosineDecayScheduler(optimizer, args.total_iter//10, args.total_iter)
+
+t5_model, trans_encoder, optimizer, train_loader = accelerator.prepare(t5_model, trans_encoder, optimizer, train_loader)
+train_loader_iter = dataset_TM_train_motionstreamer.cycle(train_loader)
+
+
+diffmlps_batch_mul = 4
+def lengths_to_mask(lengths, max_len):
+    mask = torch.arange(max_len, device=lengths.device).expand(len(lengths), max_len) < lengths.unsqueeze(1)
+    return mask
+def get_mask_subset_prob(mask, prob):
+    subset_mask = torch.bernoulli(mask, p=prob) & mask
+    return subset_mask
+
+
+def uniform(shape, device=None):
+    return torch.zeros(shape, device=device).float().uniform_(0, 1)
+
+import math
+def cosine_schedule(t):
+    return torch.cos(t * math.pi * 0.5)
+
+
+#--------------2-forward:------------------
+def cosine_decay(step, total_steps, start_value=1.0, end_value=0.0):
+    step = torch.tensor(step, dtype=torch.float32)
+    total_steps = torch.tensor(total_steps, dtype=torch.float32)
+    cosine_factor = 0.5 * (1 + torch.cos(torch.pi * step / total_steps))
+    return start_value + (end_value - start_value) * cosine_factor
+
+def replace_with_pred(latents, pred_xstart, step, total_steps):
+    decay_factor = cosine_decay(step, total_steps).to(latents.device)
+    b, l, d = latents.shape
+    num_replace = int(l * decay_factor)
+
+    replace_indices = torch.randperm(l)[:num_replace]
+
+    replace_mask = torch.zeros(b, l, dtype=torch.bool).to(latents.device)
+    replace_mask[:, replace_indices] = 1
+
+    updated_latents = latents.clone()
+    updated_latents[replace_mask] = pred_xstart[replace_mask]
+
+    return updated_latents
+
+def forward_loss_withmask_2_forward_streaming(latents, trans, m_lens, feat_text, step, total_steps, A_token_length):
+    latents = latents.to(comp_device)
+    feat_text = feat_text.to(comp_device)
+    A_token_length = A_token_length.to(comp_device)
+    conditions = trans(latents, feat_text)
+    conditions = conditions.contiguous()
+    z = conditions[:,:-1,:]
+
+    b, l, d = latents.shape
+    mask = lengths_to_mask(m_lens, l)
+
+    for j in range(b):
+        mask[j, :A_token_length[j].item()] = False # A_motion token: do not compute loss
+
+    mask = mask.reshape(b * l).repeat(diffmlps_batch_mul)
+
+    target = latents.clone().detach()
+    target = target.reshape(b * l, -1)
+    z = z.reshape(b * l, -1)
+
+    with torch.no_grad():
+        loss, pred_xstart = trans.diff_loss(target=target, z=z)
+
+    pred_xstart = pred_xstart.clone().detach()
+    pred_xstart = pred_xstart.reshape(b, l, -1)
+
+    # do not replace A_motion tokens
+    for k in range(b):
+        pred_xstart[k, :A_token_length[k].item(),:] = latents[k, :A_token_length[k].item(),:]
+
+    updated_latents = replace_with_pred(latents, pred_xstart, step, total_steps)
+    updated_conditions = trans(updated_latents, feat_text)
+    updated_conditions = updated_conditions.contiguous()
+    updated_z = updated_conditions[:,:-1,:]
+
+    updated_target = latents.clone().detach()
+
+    updated_target = updated_target.reshape(b * l, -1).repeat(diffmlps_batch_mul, 1)
+    updated_z = updated_z.reshape(b * l, -1).repeat(diffmlps_batch_mul, 1)
+
+    updated_target = updated_target[mask]
+    updated_z = updated_z[mask]
+
+    updated_loss, updated_pred_xstart = trans.diff_loss(target=updated_target, z=updated_z)
+
+    return updated_loss
+
+
+##### ---- Training Loop ---- #####
+avg_loss_cls = 0.
+
+pbar = tqdm(range(1, args.total_iter + 1), desc="Training MotionStreamer")
+for nb_iter in pbar:
+    batch = next(train_loader_iter)
+    caption, m_tokens, m_tokens_len, A_token_length = batch
+    caption = list(caption)
+    m_tokens, m_tokens_len = m_tokens.to(comp_device), m_tokens_len.to(comp_device)
+    A_token_length = A_token_length.to(comp_device)
+
+    bs = len(caption)
+    num_masked = int(bs * 0.1) # 10%
+    mask_indices = random.sample(range(bs), num_masked)
+
+    for idx in mask_indices:
+        caption[idx] = ''
+
+    feat_text = torch.from_numpy(t5_model.encode(caption)).float()
+    feat_text = feat_text.to(comp_device)
+
+    # -------gt--------
+    input_latent = m_tokens[:,:-1,:] # continuous token
+
+    loss_cls = 0.0
+
+    if args.num_gpus > 1:
+        loss_cls = forward_loss_withmask_2_forward_streaming(latents=input_latent, trans=trans_encoder.module, m_lens = m_tokens_len, feat_text=feat_text, step=nb_iter, total_steps=args.total_iter, A_token_length=A_token_length)
+    else:
+        loss_cls = forward_loss_withmask_2_forward_streaming(latents=input_latent, trans=trans_encoder, m_lens = m_tokens_len, feat_text=feat_text, step=nb_iter, total_steps=args.total_iter, A_token_length=A_token_length)
+
+
+    # backward & optimizer step
+    optimizer.zero_grad()
+    accelerator.backward(loss_cls)
+    optimizer.step()
+    scheduler.step(nb_iter)
+
+    avg_loss_cls = avg_loss_cls + loss_cls.item()
+
+    args.print_iter = 100
+    if nb_iter % args.print_iter == 0 :
+        if accelerator.is_main_process:
+            avg_loss_cls = avg_loss_cls / args.print_iter
+            lr = optimizer.param_groups[0]['lr']
+            writer.add_scalar('./Loss/train', avg_loss_cls, nb_iter)
+            writer.add_scalar('./LR/train', optimizer.param_groups[0]['lr'], nb_iter)
+            msg = f"Train. Iter {nb_iter} : Loss. {avg_loss_cls:.5f}"
+            tqdm.write(f"Iter {nb_iter} | Loss: {avg_loss_cls:.5f} | LR: {lr:.6f}")
+            logger.info(msg)
+            avg_loss_cls = 0.
+
+
+    args.save_iter = 10000
+    if nb_iter % args.save_iter == 0:
+        # save checkpoint
+        if accelerator.is_main_process:
+            torch.save({
+                'trans': trans_encoder.state_dict(),
+            }, os.path.join(args.out_dir, f'latest.pth'))
+
+    accelerator.wait_for_everyone()
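The Two-Forward strategy above runs a first, no-grad forward pass to get the diffusion head's prediction, swaps a scheduled fraction of the input latents for that prediction, then computes the training loss on a second forward pass. The standalone sketch below (not part of the upload) re-evaluates the same cosine_decay formula with plain math to show the schedule: as written, the fraction of latent positions that replace_with_pred swaps grows from 0 at iteration 0 to 1 at the final iteration, despite the start_value=1.0 / end_value=0.0 argument names.

```python
# Standalone sketch (not from the repository): the Two-Forward replacement schedule.
import math

def cosine_decay(step: int, total_steps: int,
                 start_value: float = 1.0, end_value: float = 0.0) -> float:
    # Same formula as the training script, with plain floats instead of tensors.
    cosine_factor = 0.5 * (1 + math.cos(math.pi * step / total_steps))
    return start_value + (end_value - start_value) * cosine_factor

total = 100_000  # matches --total-iter in TRAIN_motionstreamer-checkpoint.sh
for step in (0, 25_000, 50_000, 75_000, 100_000):
    frac = cosine_decay(step, total)
    print(f"iter {step:>6}: replace ~{frac:.2f} of latent positions with first-forward predictions")
# iter      0: ~0.00 replaced (pure teacher forcing)
# iter  50000: ~0.50 replaced
# iter 100000: ~1.00 replaced (fully self-conditioned inputs)
```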
EVAL_causal_TAE.sh
ADDED
@@ -0,0 +1,6 @@
+ln -s ../utils ./Evaluator_272/
+ln -s ../humanml3d_272 ./Evaluator_272/
+ln -s ../options ./Evaluator_272/
+ln -s ../models ./Evaluator_272/
+ln -s ../visualization ./Evaluator_272/
+python eval_causal_TAE.py --resume-pth output/causal_TAE/net_last.pth
EVAL_t2m.sh
ADDED
@@ -0,0 +1,7 @@
+ln -s ../utils ./Evaluator_272/
+ln -s ../humanml3d_272 ./Evaluator_272/
+ln -s ../options ./Evaluator_272/
+ln -s ../models ./Evaluator_272/
+ln -s ../visualization ./Evaluator_272/
+ln -s ../Causal_TAE ./Evaluator_272/
+python eval_t2m.py --resume-pth Causal_TAE/net_last.pth --resume-trans /cpfs03/shared/IDC/wangjingbo_group/motionstreamer/Open_source_Train_AR_16_1024_fps_30_111M_9/latest.pth
LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2025 ZJU3DV
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
README.md
ADDED
|
@@ -0,0 +1,336 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
<h2 align="center"<strong>MotionStreamer: Streaming Motion Generation via Diffusion-based Autoregressive Model in Causal Latent Space</strong></h2>
|
| 3 |
+
<p align="center">
|
| 4 |
+
<a href='https://li-xingxiao.github.io/homepage/' target='_blank'>Lixing Xiao</a><sup>1</sup>
|
| 5 |
+
·
|
| 6 |
+
<a href='https://shunlinlu.github.io/' target='_blank'>Shunlin Lu</a> <sup>2</sup>
|
| 7 |
+
·
|
| 8 |
+
<a href='https://phj128.github.io/' target='_blank'>Huaijin Pi</a><sup>3</sup>
|
| 9 |
+
·
|
| 10 |
+
<a href='https://vankouf.github.io/' target='_blank'>Ke Fan</a><sup>4</sup>
|
| 11 |
+
·
|
| 12 |
+
<a href='https://liangpan99.github.io/' target='_blank'>Liang Pan</a><sup>3</sup>
|
| 13 |
+
·
|
| 14 |
+
<a href='https://[email protected]' target='_blank'>Yueer Zhou</a><sup>1</sup>
|
| 15 |
+
·
|
| 16 |
+
<a href='https://dblp.org/pid/120/4362.html/' target='_blank'>Ziyong Feng</a><sup>5</sup>
|
| 17 |
+
·
|
| 18 |
+
<br>
|
| 19 |
+
<a href='https://www.xzhou.me/' target='_blank'>Xiaowei Zhou</a><sup>1</sup>
|
| 20 |
+
·
|
| 21 |
+
<a href='https://pengsida.net/' target='_blank'>Sida Peng</a><sup>1†</sup>
|
| 22 |
+
·
|
| 23 |
+
<a href='https://wangjingbo1219.github.io/' target='_blank'>Jingbo Wang</a><sup>6</sup>
|
| 24 |
+
<br>
|
| 25 |
+
<br>
|
| 26 |
+
<sup>1</sup>Zhejiang University <sup>2</sup>The Chinese University of Hong Kong, Shenzhen <sup>3</sup>The University of Hong Kong <br><sup>4</sup>Shanghai Jiao Tong University <sup>5</sup>DeepGlint <sup>6</sup>Shanghai AI Lab
|
| 27 |
+
<br>
|
| 28 |
+
<strong>ICCV 2025</strong>
|
| 29 |
+
|
| 30 |
+
</p>
|
| 31 |
+
</p>
|
| 32 |
+
<p align="center">
|
| 33 |
+
<a href='https://arxiv.org/abs/2503.15451'>
|
| 34 |
+
<img src='https://img.shields.io/badge/Arxiv-2503.15451-A42C25?style=flat&logo=arXiv&logoColor=A42C25'></a>
|
| 35 |
+
<a href='https://arxiv.org/pdf/2503.15451'>
|
| 36 |
+
<img src='https://img.shields.io/badge/Paper-PDF-blue?style=flat&logo=arXiv&logoColor=blue'></a>
|
| 37 |
+
<a href='https://zju3dv.github.io/MotionStreamer/'>
|
| 38 |
+
<img src='https://img.shields.io/badge/Project-Page-green?style=flat&logo=Google%20chrome&logoColor=green'></a>
|
| 39 |
+
<a href='https://huggingface.co/datasets/lxxiao/272-dim-HumanML3D'>
|
| 40 |
+
<img src='https://img.shields.io/badge/Data-Download-yellow?style=flat&logo=huggingface&logoColor=yellow'></a>
|
| 41 |
+
</p>
|
| 42 |
+
|
| 43 |
+
<img width="1385" alt="image" src="assets/teaser.jpg"/>
|
| 44 |
+
|
| 45 |
+
## 🔥 News
|
| 46 |
+
|
| 47 |
+
- **[2025-06]** MotionStreamer has been accepted to ICCV 2025! 🎉
|
| 48 |
+
|
| 49 |
+
## TODO List
|
| 50 |
+
|
| 51 |
+
- [x] Release the processing script of 272-dim motion representation.
|
| 52 |
+
- [x] Release the processed 272-dim Motion Representation of [HumanML3D](https://github.com/EricGuo5513/HumanML3D) dataset. Only for academic usage.
|
| 53 |
+
- [x] Release the training code and checkpoint of our [TMR](https://github.com/Mathux/TMR)-based motion evaluator trained on the processed 272-dim [HumanML3D](https://github.com/EricGuo5513/HumanML3D) dataset.
|
| 54 |
+
- [x] Release the training and evaluation code as well as checkpoint of Causal TAE.
|
| 55 |
+
- [x] Release the training code of original motion generation model and streaming generation model (MotionStreamer).
|
| 56 |
+
- [x] Release the checkpoint and demo inference code of original motion generation model.
|
| 57 |
+
- [ ] Release complete code for MotionStreamer.
|
| 58 |
+
|
| 59 |
+
## 🏃 Motion Representation
|
| 60 |
+
For more details of how to obtain the 272-dim motion representation, as well as other useful tools (e.g., Visualization and Conversion to BVH format), please refer to our [GitHub repo](https://github.com/Li-xingXiao/272-dim-Motion-Representation).
|
| 61 |
+
|
| 62 |
+
## Installation
|
| 63 |
+
|
| 64 |
+
### 🐍 Python Virtual Environment
|
| 65 |
+
```sh
|
| 66 |
+
conda env create -f environment.yaml
|
| 67 |
+
conda activate mgpt
|
| 68 |
+
```
|
| 69 |
+
|
| 70 |
+
### 🤗 Hugging Face Mirror
|
| 71 |
+
Since all of our models and data are available on Hugging Face, if Hugging Face is not directly accessible, you can use the HF-mirror tools following:
|
| 72 |
+
```sh
|
| 73 |
+
pip install -U huggingface_hub
|
| 74 |
+
export HF_ENDPOINT=https://hf-mirror.com
|
| 75 |
+
```
|
| 76 |
+
|
| 77 |
+
## 📥 Data Preparation
|
| 78 |
+
To facilitate researchers, we provide the processed 272-dim Motion Representation of:
|
| 79 |
+
> HumanML3D dataset at [this link](https://huggingface.co/datasets/lxxiao/272-dim-HumanML3D).
|
| 80 |
+
|
| 81 |
+
> BABEL dataset at [this link](https://huggingface.co/datasets/lxxiao/272-dim-BABEL).
|
| 82 |
+
|
| 83 |
+
❗️❗️❗️ The processed data is solely for academic purposes. Make sure you read through the [AMASS License](https://amass.is.tue.mpg.de/license.html).
|
| 84 |
+
|
| 85 |
+
1. Download the processed 272-dim [HumanML3D](https://github.com/EricGuo5513/HumanML3D) dataset following:
|
| 86 |
+
```bash
|
| 87 |
+
huggingface-cli download --repo-type dataset --resume-download lxxiao/272-dim-HumanML3D --local-dir ./humanml3d_272
|
| 88 |
+
cd ./humanml3d_272
|
| 89 |
+
unzip texts.zip
|
| 90 |
+
unzip motion_data.zip
|
| 91 |
+
```
|
| 92 |
+
The dataset is organized as:
|
| 93 |
+
```
|
| 94 |
+
./humanml3d_272
|
| 95 |
+
├── mean_std
|
| 96 |
+
├── Mean.npy
|
| 97 |
+
├── Std.npy
|
| 98 |
+
├── split
|
| 99 |
+
├── train.txt
|
| 100 |
+
├── val.txt
|
| 101 |
+
├── test.txt
|
| 102 |
+
├── texts
|
| 103 |
+
├── 000000.txt
|
| 104 |
+
...
|
| 105 |
+
├── motion_data
|
| 106 |
+
├── 000000.npy
|
| 107 |
+
...
|
| 108 |
+
```
|
| 109 |
+
|
| 110 |
+
2. Download the processed 272-dim [BABEL](https://babel.is.tue.mpg.de/) dataset following:
|
| 111 |
+
```bash
|
| 112 |
+
huggingface-cli download --repo-type dataset --resume-download lxxiao/272-dim-BABEL --local-dir ./babel_272
|
| 113 |
+
cd ./babel_272
|
| 114 |
+
unzip texts.zip
|
| 115 |
+
unzip motion_data.zip
|
| 116 |
+
```
|
| 117 |
+
The dataset is organized as:
|
| 118 |
+
```
|
| 119 |
+
./babel_272
|
| 120 |
+
├── t2m_babel_mean_std
|
| 121 |
+
├── Mean.npy
|
| 122 |
+
├── Std.npy
|
| 123 |
+
├── split
|
| 124 |
+
├── train.txt
|
| 125 |
+
├── val.txt
|
| 126 |
+
├── texts
|
| 127 |
+
├── 000000.txt
|
| 128 |
+
...
|
| 129 |
+
├── motion_data
|
| 130 |
+
├── 000000.npy
|
| 131 |
+
...
|
| 132 |
+
```
|
| 133 |
+
|
| 134 |
+
3. Download the processed streaming 272-dim [BABEL](https://babel.is.tue.mpg.de/) dataset following:
|
| 135 |
+
```bash
|
| 136 |
+
huggingface-cli download --repo-type dataset --resume-download lxxiao/272-dim-BABEL-stream --local-dir ./babel_272_stream
|
| 137 |
+
cd ./babel_272_stream
|
| 138 |
+
unzip train_stream.zip
|
| 139 |
+
unzip train_stream_text.zip
|
| 140 |
+
unzip val_stream.zip
|
| 141 |
+
unzip val_stream_text.zip
|
| 142 |
+
```
|
| 143 |
+
The dataset is organized as:
|
| 144 |
+
```
|
| 145 |
+
./babel_272_stream
|
| 146 |
+
├── train_stream
|
| 147 |
+
├── seq1.npy
|
| 148 |
+
...
|
| 149 |
+
├── train_stream_text
|
| 150 |
+
├── seq1.txt
|
| 151 |
+
...
|
| 152 |
+
├── val_stream
|
| 153 |
+
├── seq1.npy
|
| 154 |
+
...
|
| 155 |
+
├── val_stream_text
|
| 156 |
+
├── seq1.txt
|
| 157 |
+
...
|
| 158 |
+
```
|
| 159 |
+
> NOTE: We process the original BABEL dataset to support training of streaming motion generation. e.g. If there is a motion sequence A, annotated as (A1, A2, A3, A4) in BABEL dataset, each subsequence has text description: (A1_t, A2_t, A3_t, A4_t).
|
| 160 |
+
|
| 161 |
+
> Then, our BABEL-stream is constructed as:
|
| 162 |
+
|
| 163 |
+
> seq1: (A1, A2) --- seq1_text: (A1_t*A2_t#A1_length)
|
| 164 |
+
|
| 165 |
+
> seq2: (A2, A3) --- seq2_text: (A2_t*A3_t#A2_length)
|
| 166 |
+
|
| 167 |
+
> seq3: (A3, A4) --- seq3_text: (A3_t*A4_t#A3_length)
|
| 168 |
+
|
| 169 |
+
> Here, * and # is separation symbol, A1_length means the number of frames of subsequence A1.
|
| 170 |
+
|
| 171 |
+
## 🚀 Training
|
| 172 |
+
1. Train our [TMR](https://github.com/Mathux/TMR)-based motion evaluator on the processed 272-dim [HumanML3D](https://github.com/EricGuo5513/HumanML3D) dataset:
|
| 173 |
+
```bash
|
| 174 |
+
bash TRAIN_evaluator_272.sh
|
| 175 |
+
```
|
| 176 |
+
>After training for 100 epochs, the checkpoint will be stored at:
|
| 177 |
+
``Evaluator_272/experiments/temos/EXP1/checkpoints/``.
|
| 178 |
+
|
| 179 |
+
⬇️ We provide the evaluator checkpoint on [Hugging Face](https://huggingface.co/lxxiao/MotionStreamer/tree/main/Evaluator_272), download it following:
|
| 180 |
+
```bash
|
| 181 |
+
python humanml3d_272/prepare/download_evaluator_ckpt.py
|
| 182 |
+
```
|
| 183 |
+
>The downloaded checkpoint will be stored at: ``Evaluator_272/``.
|
| 184 |
+
2. Train the Causal TAE:
|
| 185 |
+
```bash
|
| 186 |
+
bash TRAIN_causal_TAE.sh ${NUM_GPUS}
|
| 187 |
+
```
|
| 188 |
+
> e.g., if you have 8 GPUs, run: bash TRAIN_causal_TAE.sh 8
|
| 189 |
+
|
| 190 |
+
> The checkpoint will be stored at:
|
| 191 |
+
``Experiments/causal_TAE_t2m_272/``
|
| 192 |
+
|
| 193 |
+
> Tensorboard visualization:
|
| 194 |
+
```bash
|
| 195 |
+
tensorboard --logdir='Experiments/causal_TAE_t2m_272'
|
| 196 |
+
```
|
| 197 |
+
|
| 198 |
+
⬇️ We provide the Causal TAE checkpoint on [Hugging Face](https://huggingface.co/lxxiao/MotionStreamer/tree/main/Causal_TAE), download it following:
|
| 199 |
+
```bash
|
| 200 |
+
python humanml3d_272/prepare/download_Causal_TAE_t2m_272_ckpt.py
|
| 201 |
+
```
|
| 202 |
+
|
| 203 |
+
3. Train text to motion model:
|
| 204 |
+
> We provide scripts to train the original text to motion generation model with llama blocks, Two-Forward strategy and QK-Norm, using the motion latents encoded by the Causal TAE (trained in the first stage).
|
| 205 |
+
|
| 206 |
+
3.1 Get motion latents:
|
| 207 |
+
```bash
|
| 208 |
+
python get_latent.py --resume-pth Causal_TAE/net_last.pth --latent_dir humanml3d_272/t2m_latents
|
| 209 |
+
```
|
| 210 |
+
3.2 Download [sentence-T5-XXL model](https://huggingface.co/sentence-transformers/sentence-t5-xxl/tree/main) on Hugging Face:
|
| 211 |
+
```bash
|
| 212 |
+
huggingface-cli download --resume-download sentence-transformers/sentence-t5-xxl --local-dir sentencet5-xxl/
|
| 213 |
+
```
|
| 214 |
+
3.3 Train text to motion generation model:
|
| 215 |
+
```bash
|
| 216 |
+
bash TRAIN_t2m.sh ${NUM_GPUS}
|
| 217 |
+
```
|
| 218 |
+
> e.g., if you have 8 GPUs, run: bash TRAIN_t2m.sh 8
|
| 219 |
+
|
| 220 |
+
> The checkpoint will be stored at:
|
| 221 |
+
``Experiments/t2m_model/``
|
| 222 |
+
|
| 223 |
+
> Tensorboard visualization:
|
| 224 |
+
```bash
|
| 225 |
+
tensorboard --logdir='Experiments/t2m_model'
|
| 226 |
+
```
|
| 227 |
+
|
| 228 |
+
⬇️ We provide the text to motion model checkpoint on [Hugging Face](https://huggingface.co/lxxiao/MotionStreamer/tree/main/Experiments/t2m_model), download it following:
|
| 229 |
+
```bash
|
| 230 |
+
python humanml3d_272/prepare/download_t2m_model_ckpt.py
|
| 231 |
+
```
|
| 232 |
+
|
| 233 |
+
4. Train streaming motion generation model (MotionStreamer):
|
| 234 |
+
> We provide scripts to train the streaming motion generation model (MotionStreamer) with llama blocks, Two-Forward strategy and QK-Norm, using the motion latents encoded by the Causal TAE (need to train a new Causal TAE using both HumanML3D-272 and BABEL-272 data).
|
| 235 |
+
|
| 236 |
+
4.1 Train a Causal TAE using both HumanML3D-272 and BABEL-272 data:
|
| 237 |
+
```bash
|
| 238 |
+
bash TRAIN_causal_TAE.sh ${NUM_GPUS} t2m_babel_272
|
| 239 |
+
```
|
| 240 |
+
> e.g., if you have 8 GPUs, run: bash TRAIN_causal_TAE.sh 8 t2m_babel_272
|
| 241 |
+
|
| 242 |
+
> The checkpoint will be stored at:
|
| 243 |
+
``Experiments/causal_TAE_t2m_babel_272/``
|
| 244 |
+
|
| 245 |
+
> Tensorboard visualization:
|
| 246 |
+
```bash
|
| 247 |
+
tensorboard --logdir='Experiments/causal_TAE_t2m_babel_272'
|
| 248 |
+
```
|
| 249 |
+
|
| 250 |
+
⬇️ We provide the Causal TAE checkpoint trained using both HumanML3D-272 and BABEL-272 data on [Hugging Face](https://huggingface.co/lxxiao/MotionStreamer/tree/main/Causal_TAE_t2m_babel), download it following:
|
| 251 |
+
```bash
|
| 252 |
+
python humanml3d_272/prepare/download_Causal_TAE_t2m_babel_272_ckpt.py
|
| 253 |
+
```
|
| 254 |
+
|
| 255 |
+
4.2 Get motion latents of both HumanML3D-272 and the processed BABEL-272-stream dataset:
|
| 256 |
+
```bash
|
| 257 |
+
python get_latent.py --resume-pth Causal_TAE_t2m_babel/net_last.pth --latent_dir babel_272_stream/t2m_babel_latents --dataname t2m_babel_272
|
| 258 |
+
```
|
| 259 |
+
|
| 260 |
+
4.3 Train MotionStreamer model:
|
| 261 |
+
```bash
|
| 262 |
+
bash TRAIN_motionstreamer.sh ${NUM_GPUS}
|
| 263 |
+
```
|
| 264 |
+
> e.g., if you have 8 GPUs, run: bash TRAIN_motionstreamer.sh 8
|
| 265 |
+
|
| 266 |
+
> The checkpoint will be stored at:
|
| 267 |
+
``Experiments/motionstreamer_model/``
|
| 268 |
+
|
| 269 |
+
> Tensorboard visualization:
|
| 270 |
+
```bash
|
| 271 |
+
tensorboard --logdir='Experiments/motionstreamer_model'
|
| 272 |
+
```
|
| 273 |
+
|
| 274 |
+
## 📍 Evaluation
|
| 275 |
+
|
| 276 |
+
1. Evaluate the metrics of the processed 272-dim [HumanML3D](https://github.com/EricGuo5513/HumanML3D) dataset:
|
| 277 |
+
```bash
|
| 278 |
+
bash EVAL_GT.sh
|
| 279 |
+
```
|
| 280 |
+
( FID, R@1, R@2, R@3, Diversity and MM-Dist (Matching Score) are reported. )
|
| 281 |
+
|
| 282 |
+
2. Evaluate the metrics of Causal TAE:
|
| 283 |
+
```bash
|
| 284 |
+
bash EVAL_causal_TAE.sh
|
| 285 |
+
```
|
| 286 |
+
( FID and MPJPE (mm) are reported. )
|
| 287 |
+
|
| 288 |
+
3. Evaluate the metrics of the text-to-motion model:
|
| 289 |
+
```bash
|
| 290 |
+
bash EVAL_t2m.sh
|
| 291 |
+
```
|
| 292 |
+
( FID, R@1, R@2, R@3, Diversity and MM-Dist (Matching Score) are reported. )
|
| 293 |
+
|
| 294 |
+
|
| 295 |
+
## 🎬 Demo Inference
|
| 296 |
+
|
| 297 |
+
1. Inference of the text-to-motion model:
|
| 298 |
+
> [Option 1] Recover from joint positions
|
| 299 |
+
```bash
|
| 300 |
+
python demo_t2m.py --text 'a person is walking like a mummy.' --mode pos --resume-pth Causal_TAE/net_last.pth --resume-trans Experiments/t2m_model/latest.pth
|
| 301 |
+
```
|
| 302 |
+
> [Option 2] Recover from joint rotations
|
| 303 |
+
```bash
|
| 304 |
+
python demo_t2m.py --text 'a person is walking like a mummy.' --mode rot --resume-pth Causal_TAE/net_last.pth --resume-trans Experiments/t2m_model/latest.pth
|
| 305 |
+
```
|
| 306 |
+
> In our 272-dim representation, Inverse Kinematics (IK) is not needed.
|
| 307 |
+
> For further conversion to BVH format, please refer to [this repo](https://github.com/Li-xingXiao/272-dim-Motion-Representation?tab=readme-ov-file#6-representation_272-to-bvh-conversion-optional) (Step 6: Representation_272 to BVH conversion). The BVH motion animation can be visualized and edited in [Blender](https://www.blender.org/features/animation/).
|
| 308 |
+
|
| 309 |
+
|
| 310 |
+
|
| 311 |
+
|
| 312 |
+
## 🌹 Acknowledgement
|
| 313 |
+
This repository builds upon the following awesome datasets and projects:
|
| 314 |
+
- [272-dim-Motion-Representation](https://github.com/Li-xingXiao/272-dim-Motion-Representation)
|
| 315 |
+
- [AMASS](https://amass.is.tue.mpg.de/index.html)
|
| 316 |
+
- [HumanML3D](https://github.com/EricGuo5513/HumanML3D)
|
| 317 |
+
- [T2M-GPT](https://github.com/Mael-zys/T2M-GPT)
|
| 318 |
+
- [TMR](https://github.com/Mathux/TMR)
|
| 319 |
+
- [OpenTMA](https://github.com/LinghaoChan/OpenTMA)
|
| 320 |
+
- [Sigma-VAE](https://github.com/orybkin/sigma-vae-pytorch)
|
| 321 |
+
- [ScaMo](https://github.com/shunlinlu/ScaMo_code)
|
| 322 |
+
|
| 323 |
+
## 🤝🏼 Citation
|
| 324 |
+
If our project is helpful for your research, please consider citing:
|
| 325 |
+
```
|
| 326 |
+
@article{xiao2025motionstreamer,
|
| 327 |
+
title={MotionStreamer: Streaming Motion Generation via Diffusion-based Autoregressive Model in Causal Latent Space},
|
| 328 |
+
author={Xiao, Lixing and Lu, Shunlin and Pi, Huaijin and Fan, Ke and Pan, Liang and Zhou, Yueer and Feng, Ziyong and Zhou, Xiaowei and Peng, Sida and Wang, Jingbo},
|
| 329 |
+
journal={arXiv preprint arXiv:2503.15451},
|
| 330 |
+
year={2025}
|
| 331 |
+
}
|
| 332 |
+
```
|
| 333 |
+
|
| 334 |
+
## Star History
|
| 335 |
+
|
| 336 |
+
[](https://www.star-history.com/#zju3dv/MotionStreamer&Date)
|
TRAIN_causal_TAE.sh
ADDED
|
@@ -0,0 +1,22 @@
|
| 1 |
+
NUM_GPUS=${1:-1} # default: 1 GPU
|
| 2 |
+
dataset_name=${2:-t2m_272} # default: t2m_272, options: t2m_272, t2m_babel_272
|
| 3 |
+
|
| 4 |
+
BATCH_SIZE=$((128 / NUM_GPUS))
|
| 5 |
+
|
| 6 |
+
echo "Using $NUM_GPUS GPUs, each with a batch size of $BATCH_SIZE"
|
| 7 |
+
|
| 8 |
+
accelerate launch --num_processes $NUM_GPUS train_causal_TAE.py \
|
| 9 |
+
--batch-size $BATCH_SIZE \
|
| 10 |
+
--lr 0.00005 \
|
| 11 |
+
--total-iter 2000000 \
|
| 12 |
+
--lr-scheduler 1900000 \
|
| 13 |
+
--down-t 2 \
|
| 14 |
+
--depth 3 \
|
| 15 |
+
--dilation-growth-rate 3 \
|
| 16 |
+
--out-dir Experiments \
|
| 17 |
+
--dataname $dataset_name \
|
| 18 |
+
--exp-name causal_TAE_${dataset_name} \
|
| 19 |
+
--root_loss 7.0 \
|
| 20 |
+
--latent_dim 16 \
|
| 21 |
+
--hidden_size 1024 \
|
| 22 |
+
--num_gpus $NUM_GPUS
|
TRAIN_evaluator_272.sh
ADDED
|
@@ -0,0 +1,6 @@
|
| 1 |
+
export HF_ENDPOINT=https://hf-mirror.com
|
| 2 |
+
cd Evaluator_272
|
| 3 |
+
huggingface-cli download --resume-download distilbert/distilbert-base-uncased --local-dir ./deps/distilbert-base-uncased
|
| 4 |
+
ln -s ../humanml3d_272 ./datasets/humanml3d_272
|
| 5 |
+
python -m train --cfg configs/configs_evaluator_272/H3D-TMR.yaml --cfg_assets configs/assets.yaml --batch_size 256 --nodebug
|
| 6 |
+
cd ..
|
TRAIN_motionstreamer.sh
ADDED
|
@@ -0,0 +1,16 @@
|
| 1 |
+
NUM_GPUS=${1:-1} # default: 1 GPU
|
| 2 |
+
|
| 3 |
+
BATCH_SIZE=$((30 / NUM_GPUS))
|
| 4 |
+
|
| 5 |
+
echo "Using $NUM_GPUS GPUs, each with a batch size of $BATCH_SIZE"
|
| 6 |
+
|
| 7 |
+
accelerate launch --num_processes $NUM_GPUS train_motionstreamer.py \
|
| 8 |
+
--batch-size $BATCH_SIZE \
|
| 9 |
+
--lr 0.0001 \
|
| 10 |
+
--total-iter 200000 \
|
| 11 |
+
--out-dir Experiments \
|
| 12 |
+
--exp-name motionstreamer_model \
|
| 13 |
+
--dataname t2m_babel_272 \
|
| 14 |
+
--latent_dir babel_272_stream/t2m_babel_latents \
|
| 15 |
+
--num_gpus $NUM_GPUS
|
| 16 |
+
# Optional: resume from a checkpoint by appending --resume-trans Experiments/motionstreamer_model/100k.pth to the command above.
|
TRAIN_t2m.sh
ADDED
|
@@ -0,0 +1,15 @@
|
| 1 |
+
NUM_GPUS=${1:-1} # default: 1 GPU
|
| 2 |
+
|
| 3 |
+
BATCH_SIZE=$((256 / NUM_GPUS))
|
| 4 |
+
|
| 5 |
+
echo "Using $NUM_GPUS GPUs, each with a batch size of $BATCH_SIZE"
|
| 6 |
+
|
| 7 |
+
accelerate launch --num_processes $NUM_GPUS train_t2m.py \
|
| 8 |
+
--batch-size $BATCH_SIZE \
|
| 9 |
+
--lr 0.0001 \
|
| 10 |
+
--total-iter 100000 \
|
| 11 |
+
--out-dir Experiments \
|
| 12 |
+
--exp-name t2m_model \
|
| 13 |
+
--dataname t2m_272 \
|
| 14 |
+
--latent_dir humanml3d_272/t2m_latents \
|
| 15 |
+
--num_gpus $NUM_GPUS
|
assets/teaser.jpg
ADDED
|
Git LFS Details
|
babel_272/.gitattributes
ADDED
|
@@ -0,0 +1,59 @@
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.lz4 filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.mds filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
# Audio files - uncompressed
|
| 39 |
+
*.pcm filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
*.sam filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
*.raw filter=lfs diff=lfs merge=lfs -text
|
| 42 |
+
# Audio files - compressed
|
| 43 |
+
*.aac filter=lfs diff=lfs merge=lfs -text
|
| 44 |
+
*.flac filter=lfs diff=lfs merge=lfs -text
|
| 45 |
+
*.mp3 filter=lfs diff=lfs merge=lfs -text
|
| 46 |
+
*.ogg filter=lfs diff=lfs merge=lfs -text
|
| 47 |
+
*.wav filter=lfs diff=lfs merge=lfs -text
|
| 48 |
+
# Image files - uncompressed
|
| 49 |
+
*.bmp filter=lfs diff=lfs merge=lfs -text
|
| 50 |
+
*.gif filter=lfs diff=lfs merge=lfs -text
|
| 51 |
+
*.png filter=lfs diff=lfs merge=lfs -text
|
| 52 |
+
*.tiff filter=lfs diff=lfs merge=lfs -text
|
| 53 |
+
# Image files - compressed
|
| 54 |
+
*.jpg filter=lfs diff=lfs merge=lfs -text
|
| 55 |
+
*.jpeg filter=lfs diff=lfs merge=lfs -text
|
| 56 |
+
*.webp filter=lfs diff=lfs merge=lfs -text
|
| 57 |
+
# Video files - compressed
|
| 58 |
+
*.mp4 filter=lfs diff=lfs merge=lfs -text
|
| 59 |
+
*.webm filter=lfs diff=lfs merge=lfs -text
|
babel_272/README.md
ADDED
|
@@ -0,0 +1,34 @@
|
| 1 |
+
---
|
| 2 |
+
license: apache-2.0
|
| 3 |
+
---
|
| 4 |
+
## 🚀 Dataset Usage
|
| 5 |
+
To facilitate researchers, we provide the processed 272-dim motion representation of the [BABEL](https://babel.is.tue.mpg.de/) dataset in this Hugging Face repo.
|
| 6 |
+
|
| 7 |
+
Motions are resampled to 30 FPS.
|
| 8 |
+
|
| 9 |
+
NOTE: ``t2m_babel_mean_std/`` contains the mean and std statistics of both the HumanML3D and BABEL datasets, used for jointly training the proposed [Causal TAE](https://github.com/zju3dv/MotionStreamer/blob/main/TRAIN_causal_TAE.sh).
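A minimal usage sketch (an assumed convention, not an official loader): z-score normalize a 272-dim sequence with these statistics before encoding, and invert the normalization after decoding. The motion file name below is hypothetical.
```python
import numpy as np

mean = np.load('t2m_babel_mean_std/Mean.npy')  # per-dimension mean of the 272-dim representation
std = np.load('t2m_babel_mean_std/Std.npy')    # per-dimension std of the 272-dim representation

motion = np.load('some_sequence.npy')          # hypothetical file, shape (num_frames, 272)
normalized = (motion - mean) / std             # normalize before encoding with the Causal TAE
recovered = normalized * std + mean            # de-normalize after decoding
```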
|
| 10 |
+
|
| 11 |
+
❗️❗️❗️ The processed data is solely for academic purposes. Make sure you read through the [BABEL License](https://babel.is.tue.mpg.de/license.html).
|
| 12 |
+
|
| 13 |
+
## 📖 Paper & Project Page & Code
|
| 14 |
+
* [Arxiv Paper](https://arxiv.org/abs/2503.15451)
|
| 15 |
+
* [Project Page](https://zju3dv.github.io/MotionStreamer/)
|
| 16 |
+
* [Code](https://github.com/zju3dv/MotionStreamer)
|
| 17 |
+
|
| 18 |
+
## 🏃 Processing script
|
| 19 |
+
For more details on how to obtain the 272-dim motion representation, as well as other useful tools (e.g., visualization and conversion to BVH format), please refer to our [GitHub repo](https://github.com/Li-xingXiao/272-dim-Motion-Representation).
|
| 20 |
+
|
| 21 |
+
## 🌹 Acknowledgement
|
| 22 |
+
This repository builds upon the following awesome datasets and projects:
|
| 23 |
+
- [BABEL](https://babel.is.tue.mpg.de/)
|
| 24 |
+
|
| 25 |
+
## 🤝🏼 Citation
|
| 26 |
+
If our project is helpful for your research, please consider citing:
|
| 27 |
+
```
|
| 28 |
+
@article{xiao2025motionstreamer,
|
| 29 |
+
title={MotionStreamer: Streaming Motion Generation via Diffusion-based Autoregressive Model in Causal Latent Space},
|
| 30 |
+
author={Xiao, Lixing and Lu, Shunlin and Pi, Huaijin and Fan, Ke and Pan, Liang and Zhou, Yueer and Feng, Ziyong and Zhou, Xiaowei and Peng, Sida and Wang, Jingbo},
|
| 31 |
+
journal={arXiv preprint arXiv:2503.15451},
|
| 32 |
+
year={2025}
|
| 33 |
+
}
|
| 34 |
+
```
|
babel_272/motion_data.zip
ADDED
|
@@ -0,0 +1,3 @@
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:03ecf1eefd24f828e0717dd0d7d05ad2ad139d79fd09d59baeab711895311525
|
| 3 |
+
size 8093667470
|
babel_272/split/train.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
babel_272/split/val.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
babel_272/t2m_babel_mean_std/Mean.npy
ADDED
|
@@ -0,0 +1,3 @@
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d0f782aecd1c0479c517aee68959a26f55ddf1f34bb2344b4d9c365c73f3ed80
|
| 3 |
+
size 2304
|
babel_272/t2m_babel_mean_std/Std.npy
ADDED
|
@@ -0,0 +1,3 @@
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:de477d76de0b03b71779dea84964ccf59c1f53ad49ebef7d99202c4ff19a2ff5
|
| 3 |
+
size 2304
|
babel_272/texts.zip
ADDED
|
@@ -0,0 +1,3 @@
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:39b0a560144db9d4a261462d21f0eeedefc3f0bd1bb664cb3ec819c17ebead52
|
| 3 |
+
size 38968869
|
babel_272_stream/.gitattributes
ADDED
|
@@ -0,0 +1,59 @@
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.lz4 filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.mds filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
# Audio files - uncompressed
|
| 39 |
+
*.pcm filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
*.sam filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
*.raw filter=lfs diff=lfs merge=lfs -text
|
| 42 |
+
# Audio files - compressed
|
| 43 |
+
*.aac filter=lfs diff=lfs merge=lfs -text
|
| 44 |
+
*.flac filter=lfs diff=lfs merge=lfs -text
|
| 45 |
+
*.mp3 filter=lfs diff=lfs merge=lfs -text
|
| 46 |
+
*.ogg filter=lfs diff=lfs merge=lfs -text
|
| 47 |
+
*.wav filter=lfs diff=lfs merge=lfs -text
|
| 48 |
+
# Image files - uncompressed
|
| 49 |
+
*.bmp filter=lfs diff=lfs merge=lfs -text
|
| 50 |
+
*.gif filter=lfs diff=lfs merge=lfs -text
|
| 51 |
+
*.png filter=lfs diff=lfs merge=lfs -text
|
| 52 |
+
*.tiff filter=lfs diff=lfs merge=lfs -text
|
| 53 |
+
# Image files - compressed
|
| 54 |
+
*.jpg filter=lfs diff=lfs merge=lfs -text
|
| 55 |
+
*.jpeg filter=lfs diff=lfs merge=lfs -text
|
| 56 |
+
*.webp filter=lfs diff=lfs merge=lfs -text
|
| 57 |
+
# Video files - compressed
|
| 58 |
+
*.mp4 filter=lfs diff=lfs merge=lfs -text
|
| 59 |
+
*.webm filter=lfs diff=lfs merge=lfs -text
|
babel_272_stream/README.md
ADDED
|
@@ -0,0 +1,62 @@
|
| 1 |
+
---
|
| 2 |
+
license: apache-2.0
|
| 3 |
+
---
|
| 4 |
+
## 🚀 Dataset Usage
|
| 5 |
+
To facilitate researchers, we provide the processed streaming 272-dim motion representation of the [BABEL](https://babel.is.tue.mpg.de/) dataset in this Hugging Face repo.
|
| 6 |
+
|
| 7 |
+
NOTE: We process the original BABEL dataset to support training of streaming motion generation.
|
| 8 |
+
e.g., if a motion sequence A is annotated as (A1, A2, A3, A4) in the BABEL dataset, the subsequences have text descriptions (A1_t, A2_t, A3_t, A4_t).
|
| 9 |
+
|
| 10 |
+
Then, our BABEL-stream is constructed as:
|
| 11 |
+
|
| 12 |
+
seq1: (A1, A2) --- seq1_text: (A1_t*A2_t#A1_length)
|
| 13 |
+
|
| 14 |
+
seq2: (A2, A3) --- seq2_text: (A2_t*A3_t#A2_length)
|
| 15 |
+
|
| 16 |
+
seq3: (A3, A4) --- seq3_text: (A3_t*A4_t#A3_length)
|
| 17 |
+
|
| 18 |
+
Here, * and # are separator symbols, and A1_length is the number of frames of subsequence A1.
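A minimal parsing sketch (not the official loader) for one such text annotation, following the format above:
```python
# Parse "A1_t*A2_t#A1_length" into the two sub-sequence descriptions and the first sub-sequence's frame count.
def parse_stream_text(path):
    with open(path) as f:
        line = f.read().strip()
    texts, first_length = line.rsplit('#', 1)   # '#' separates the frame count of the first sub-sequence
    first_text, second_text = texts.split('*')  # '*' separates the two sub-sequence descriptions
    return first_text, second_text, int(first_length)

# e.g., parse_stream_text('train_stream_text/seq1.txt')
```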
|
| 19 |
+
|
| 20 |
+
Motions are resampled to 30 FPS.
|
| 21 |
+
|
| 22 |
+
The dataset is organized as:
|
| 23 |
+
```
|
| 24 |
+
./
|
| 25 |
+
├── train_stream
|
| 26 |
+
├── seq1.npy
|
| 27 |
+
...
|
| 28 |
+
├── train_stream_text
|
| 29 |
+
├── seq1.txt
|
| 30 |
+
...
|
| 31 |
+
├── val_stream
|
| 32 |
+
├── seq1.npy
|
| 33 |
+
...
|
| 34 |
+
├── val_stream_text
|
| 35 |
+
├── seq1.txt
|
| 36 |
+
...
|
| 37 |
+
```
|
| 38 |
+
|
| 39 |
+
❗️❗️❗️ The processed data is solely for academic purposes. Make sure you read through the [BABEL License](https://babel.is.tue.mpg.de/license.html).
|
| 40 |
+
|
| 41 |
+
## 📖 Paper & Project Page & Code
|
| 42 |
+
* [Arxiv Paper](https://arxiv.org/abs/2503.15451)
|
| 43 |
+
* [Project Page](https://zju3dv.github.io/MotionStreamer/)
|
| 44 |
+
* [Code](https://github.com/zju3dv/MotionStreamer)
|
| 45 |
+
|
| 46 |
+
## 🏃 Processing script
|
| 47 |
+
For more details on how to obtain the 272-dim motion representation, as well as other useful tools (e.g., visualization and conversion to BVH format), please refer to our [GitHub repo](https://github.com/Li-xingXiao/272-dim-Motion-Representation).
|
| 48 |
+
|
| 49 |
+
## 🌹 Acknowledgement
|
| 50 |
+
This repository builds upon the following awesome datasets and projects:
|
| 51 |
+
- [BABEL](https://babel.is.tue.mpg.de/)
|
| 52 |
+
|
| 53 |
+
## 🤝🏼 Citation
|
| 54 |
+
If our project is helpful for your research, please consider citing:
|
| 55 |
+
```
|
| 56 |
+
@article{xiao2025motionstreamer,
|
| 57 |
+
title={MotionStreamer: Streaming Motion Generation via Diffusion-based Autoregressive Model in Causal Latent Space},
|
| 58 |
+
author={Xiao, Lixing and Lu, Shunlin and Pi, Huaijin and Fan, Ke and Pan, Liang and Zhou, Yueer and Feng, Ziyong and Zhou, Xiaowei and Peng, Sida and Wang, Jingbo},
|
| 59 |
+
journal={arXiv preprint arXiv:2503.15451},
|
| 60 |
+
year={2025}
|
| 61 |
+
}
|
| 62 |
+
```
|
babel_272_stream/train_stream.zip
ADDED
|
@@ -0,0 +1,3 @@
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:35db924d754e321f673a72c22b80d5d725f55d74151fc34351f554ef6bf33a2e
|
| 3 |
+
size 6901914721
|
babel_272_stream/train_stream_text.zip
ADDED
|
@@ -0,0 +1,3 @@
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d46561fcaf62738b1d08cf54a851ffecb3fb7a154f9663b199dfa83f0d677046
|
| 3 |
+
size 4746908
|
babel_272_stream/val_stream.zip
ADDED
|
@@ -0,0 +1,3 @@
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0564c64ce642330222b3ed83d031f5f3765c6979a82f17a2259e07d80d0ff78a
|
| 3 |
+
size 2580199524
|
babel_272_stream/val_stream_text.zip
ADDED
|
@@ -0,0 +1,3 @@
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ba646f2836f03a7fa1a5470aa8c098d1b0e446872d5bf53b8b42283e5c1f368b
|
| 3 |
+
size 1685986
|
body_models/human_model_files/mano/MANO_LEFT.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c4022f7083f2ca7c78b2b3d595abbab52debd32b09d372b16923a801f0ea6a30
|
| 3 |
+
size 3821391
|
body_models/human_model_files/mano/MANO_RIGHT.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:45d60aa3b27ef9107a7afd4e00808f307fd91111e1cfa35afd5c4a62de264767
|
| 3 |
+
size 3821356
|
body_models/human_model_files/smpl/J_regressor_extra.npy
ADDED
|
@@ -0,0 +1,3 @@
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cc968ea4f9855571e82f90203280836b01f13ee42a8e1b89d8d580b801242a89
|
| 3 |
+
size 496160
|
body_models/human_model_files/smpl/SMPL_FEMALE.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a583c1b98e4afc19042641f1bae5cd8a1f712a6724886291a7627ec07acd408d
|
| 3 |
+
size 39056454
|
body_models/human_model_files/smpl/SMPL_MALE.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0e8c0bbbbc635dcb166ed29c303fb4bef16ea5f623e5a89263495a9e403575bd
|
| 3 |
+
size 39056404
|
body_models/human_model_files/smpl/SMPL_NEUTRAL.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:98e65c74ad9b998783132f00880d1025a8d64b158e040e6ef13a557e5098bc42
|
| 3 |
+
size 39001280
|
body_models/human_model_files/smpl/VPOSER_CKPT/TR00_004_00_WO_accad.ini
ADDED
|
@@ -0,0 +1,29 @@
|
| 1 |
+
[All]
|
| 2 |
+
adam_beta1 : 0.9
|
| 3 |
+
base_lr : 0.005
|
| 4 |
+
batch_size : 512
|
| 5 |
+
best_model_fname : None
|
| 6 |
+
cuda_id : 0
|
| 7 |
+
data_shape : [1, 21, 3]
|
| 8 |
+
dataset_dir : None
|
| 9 |
+
display_model_gender : male
|
| 10 |
+
expr_code : 004_00_WO_accad
|
| 11 |
+
fp_precision : 32
|
| 12 |
+
ip_avoid : False
|
| 13 |
+
kl_coef : 0.005
|
| 14 |
+
latentD : 32
|
| 15 |
+
log_every_epoch : 2
|
| 16 |
+
model_type : smpl
|
| 17 |
+
n_workers : 10
|
| 18 |
+
num_bodies_to_display : 10
|
| 19 |
+
num_epochs : 100
|
| 20 |
+
num_neurons : 512
|
| 21 |
+
reg_coef : 0.0001
|
| 22 |
+
remove_Zrot : True
|
| 23 |
+
seed : 4815
|
| 24 |
+
sm_coef : 0.01
|
| 25 |
+
test_only : False
|
| 26 |
+
try_num : 0
|
| 27 |
+
use_cont_repr : True
|
| 28 |
+
verbosity : 0
|
| 29 |
+
work_dir : None
|
body_models/human_model_files/smpl/VPOSER_CKPT/snapshots/._TR00_E096.pt
ADDED
|
@@ -0,0 +1,3 @@
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4e2615cd1d2e78cdfac7169c6182a7352d02992336dad7329d3d97f6947fb515
|
| 3 |
+
size 4096
|
body_models/human_model_files/smpl/VPOSER_CKPT/snapshots/TR00_E096.pt
ADDED
|
@@ -0,0 +1,3 @@
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0e4ad40f922606989939d3fae6eadf82d1a8e98112dffb6e39d89d6471270d5c
|
| 3 |
+
size 2702962
|
body_models/human_model_files/smpl/VPOSER_CKPT/vposer_smpl.py
ADDED
|
@@ -0,0 +1,164 @@
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2019 Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG),
|
| 4 |
+
# acting on behalf of its Max Planck Institute for Intelligent Systems and the
|
| 5 |
+
# Max Planck Institute for Biological Cybernetics. All rights reserved.
|
| 6 |
+
#
|
| 7 |
+
# Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is holder of all proprietary rights
|
| 8 |
+
# on this computer program. You can only use this computer program if you have closed a license agreement
|
| 9 |
+
# with MPG or you get the right to use the computer program from someone who is authorized to grant you that right.
|
| 10 |
+
# Any use of the computer program without a valid license is prohibited and liable to prosecution.
|
| 11 |
+
# Contact: [email protected]
|
| 12 |
+
#
|
| 13 |
+
#
|
| 14 |
+
# If you use this code in a research publication please consider citing the following:
|
| 15 |
+
#
|
| 16 |
+
# Expressive Body Capture: 3D Hands, Face, and Body from a Single Image <https://arxiv.org/abs/1904.05866>
|
| 17 |
+
# AMASS: Archive of Motion Capture as Surface Shapes <https://arxiv.org/abs/1904.03278>
|
| 18 |
+
#
|
| 19 |
+
#
|
| 20 |
+
# Code Developed by:
|
| 21 |
+
# Nima Ghorbani <https://www.linkedin.com/in/nghorbani/>
|
| 22 |
+
# Vassilis Choutas <https://ps.is.tuebingen.mpg.de/employees/vchoutas> for ContinousRotReprDecoder
|
| 23 |
+
#
|
| 24 |
+
# 2018.01.02
|
| 25 |
+
|
| 26 |
+
'''
|
| 27 |
+
A human body pose prior built with Auto-Encoding Variational Bayes
|
| 28 |
+
'''
|
| 29 |
+
|
| 30 |
+
__all__ = ['VPoser']
|
| 31 |
+
|
| 32 |
+
import os, sys, shutil
|
| 33 |
+
|
| 34 |
+
import torch
|
| 35 |
+
|
| 36 |
+
from torch import nn
|
| 37 |
+
from torch.nn import functional as F
|
| 38 |
+
|
| 39 |
+
import numpy as np
|
| 40 |
+
|
| 41 |
+
import torchgeometry as tgm
|
| 42 |
+
|
| 43 |
+
class ContinousRotReprDecoder(nn.Module):
|
| 44 |
+
def __init__(self):
|
| 45 |
+
super(ContinousRotReprDecoder, self).__init__()
|
| 46 |
+
|
| 47 |
+
def forward(self, module_input):
|
| 48 |
+
reshaped_input = module_input.view(-1, 3, 2)
|
| 49 |
+
|
| 50 |
+
b1 = F.normalize(reshaped_input[:, :, 0], dim=1)
|
| 51 |
+
|
| 52 |
+
dot_prod = torch.sum(b1 * reshaped_input[:, :, 1], dim=1, keepdim=True)
|
| 53 |
+
b2 = F.normalize(reshaped_input[:, :, 1] - dot_prod * b1, dim=-1)
|
| 54 |
+
b3 = torch.cross(b1, b2, dim=1)
|
| 55 |
+
|
| 56 |
+
return torch.stack([b1, b2, b3], dim=-1)
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
class VPoser(nn.Module):
|
| 60 |
+
def __init__(self, num_neurons, latentD, data_shape, use_cont_repr=True):
|
| 61 |
+
super(VPoser, self).__init__()
|
| 62 |
+
|
| 63 |
+
self.latentD = latentD
|
| 64 |
+
self.use_cont_repr = use_cont_repr
|
| 65 |
+
|
| 66 |
+
n_features = np.prod(data_shape)
|
| 67 |
+
self.num_joints = data_shape[1]
|
| 68 |
+
|
| 69 |
+
self.bodyprior_enc_bn1 = nn.BatchNorm1d(n_features)
|
| 70 |
+
self.bodyprior_enc_fc1 = nn.Linear(n_features, num_neurons)
|
| 71 |
+
self.bodyprior_enc_bn2 = nn.BatchNorm1d(num_neurons)
|
| 72 |
+
self.bodyprior_enc_fc2 = nn.Linear(num_neurons, num_neurons)
|
| 73 |
+
self.bodyprior_enc_mu = nn.Linear(num_neurons, latentD)
|
| 74 |
+
self.bodyprior_enc_logvar = nn.Linear(num_neurons, latentD)
|
| 75 |
+
self.dropout = nn.Dropout(p=.1, inplace=False)
|
| 76 |
+
|
| 77 |
+
self.bodyprior_dec_fc1 = nn.Linear(latentD, num_neurons)
|
| 78 |
+
self.bodyprior_dec_fc2 = nn.Linear(num_neurons, num_neurons)
|
| 79 |
+
|
| 80 |
+
if self.use_cont_repr:
|
| 81 |
+
self.rot_decoder = ContinousRotReprDecoder()
|
| 82 |
+
|
| 83 |
+
self.bodyprior_dec_out = nn.Linear(num_neurons, self.num_joints* 6)
|
| 84 |
+
|
| 85 |
+
def encode(self, Pin):
|
| 86 |
+
'''
|
| 87 |
+
|
| 88 |
+
:param Pin: Nx(numjoints*3)
|
| 89 |
+
:param rep_type: 'matrot'/'aa' for matrix rotations or axis-angle
|
| 90 |
+
:return:
|
| 91 |
+
'''
|
| 92 |
+
Xout = Pin.view(Pin.size(0), -1) # flatten input
|
| 93 |
+
Xout = self.bodyprior_enc_bn1(Xout)
|
| 94 |
+
|
| 95 |
+
Xout = F.leaky_relu(self.bodyprior_enc_fc1(Xout), negative_slope=.2)
|
| 96 |
+
Xout = self.bodyprior_enc_bn2(Xout)
|
| 97 |
+
Xout = self.dropout(Xout)
|
| 98 |
+
Xout = F.leaky_relu(self.bodyprior_enc_fc2(Xout), negative_slope=.2)
|
| 99 |
+
return torch.distributions.normal.Normal(self.bodyprior_enc_mu(Xout), F.softplus(self.bodyprior_enc_logvar(Xout)))
|
| 100 |
+
|
| 101 |
+
def decode(self, Zin, output_type='matrot'):
|
| 102 |
+
assert output_type in ['matrot', 'aa']
|
| 103 |
+
|
| 104 |
+
Xout = F.leaky_relu(self.bodyprior_dec_fc1(Zin), negative_slope=.2)
|
| 105 |
+
Xout = self.dropout(Xout)
|
| 106 |
+
Xout = F.leaky_relu(self.bodyprior_dec_fc2(Xout), negative_slope=.2)
|
| 107 |
+
Xout = self.bodyprior_dec_out(Xout)
|
| 108 |
+
if self.use_cont_repr:
|
| 109 |
+
Xout = self.rot_decoder(Xout)
|
| 110 |
+
else:
|
| 111 |
+
Xout = torch.tanh(Xout)
|
| 112 |
+
|
| 113 |
+
Xout = Xout.view([-1, 1, self.num_joints, 9])
|
| 114 |
+
if output_type == 'aa': return VPoser.matrot2aa(Xout)
|
| 115 |
+
return Xout
|
| 116 |
+
|
| 117 |
+
def forward(self, Pin, input_type='matrot', output_type='matrot'):
|
| 118 |
+
'''
|
| 119 |
+
|
| 120 |
+
:param Pin: aa: Nx1xnum_jointsx3 / matrot: Nx1xnum_jointsx9
|
| 121 |
+
:param input_type: matrot / aa for matrix rotations or axis angles
|
| 122 |
+
:param output_type: matrot / aa
|
| 123 |
+
:return:
|
| 124 |
+
'''
|
| 125 |
+
assert output_type in ['matrot', 'aa']
|
| 126 |
+
# if input_type == 'aa': Pin = VPoser.aa2matrot(Pin)
|
| 127 |
+
q_z = self.encode(Pin)
|
| 128 |
+
q_z_sample = q_z.rsample()
|
| 129 |
+
Prec = self.decode(q_z_sample)
|
| 130 |
+
if output_type == 'aa': Prec = VPoser.matrot2aa(Prec)
|
| 131 |
+
|
| 132 |
+
#return Prec, q_z.mean, q_z.sigma
|
| 133 |
+
return {'pose':Prec, 'mean':q_z.mean, 'std':q_z.scale}
|
| 134 |
+
|
| 135 |
+
def sample_poses(self, num_poses, output_type='aa', seed=None):
|
| 136 |
+
np.random.seed(seed)
|
| 137 |
+
dtype = self.bodyprior_dec_fc1.weight.dtype
|
| 138 |
+
device = self.bodyprior_dec_fc1.weight.device
|
| 139 |
+
self.eval()
|
| 140 |
+
with torch.no_grad():
|
| 141 |
+
Zgen = torch.tensor(np.random.normal(0., 1., size=(num_poses, self.latentD)), dtype=dtype).to(device)
|
| 142 |
+
return self.decode(Zgen, output_type=output_type)
|
| 143 |
+
|
| 144 |
+
@staticmethod
|
| 145 |
+
def matrot2aa(pose_matrot):
|
| 146 |
+
'''
|
| 147 |
+
:param pose_matrot: Nx1xnum_jointsx9
|
| 148 |
+
:return: Nx1xnum_jointsx3
|
| 149 |
+
'''
|
| 150 |
+
batch_size = pose_matrot.size(0)
|
| 151 |
+
homogen_matrot = F.pad(pose_matrot.view(-1, 3, 3), [0,1])
|
| 152 |
+
pose = tgm.rotation_matrix_to_angle_axis(homogen_matrot).view(batch_size, 1, -1, 3).contiguous()
|
| 153 |
+
return pose
|
| 154 |
+
|
| 155 |
+
@staticmethod
|
| 156 |
+
def aa2matrot(pose):
|
| 157 |
+
'''
|
| 158 |
+
:param Nx1xnum_jointsx3
|
| 159 |
+
:return: pose_matrot: Nx1xnum_jointsx9
|
| 160 |
+
'''
|
| 161 |
+
batch_size = pose.size(0)
|
| 162 |
+
pose_body_matrot = tgm.angle_axis_to_rotation_matrix(pose.reshape(-1, 3))[:, :3, :3].contiguous().view(batch_size, 1, -1, 9)
|
| 163 |
+
return pose_body_matrot
|
| 164 |
+
|
body_models/human_model_files/smplx/MANO_SMPLX_vertex_ids.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e5abe70b6574de25470475091e8008314a5b90127eb48c3e63bfa0adf8c04dcf
|
| 3 |
+
size 13535
|
body_models/human_model_files/smplx/SMPL-X__FLAME_vertex_ids.npy
ADDED
|
@@ -0,0 +1,3 @@
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7e70cdc3659aae699b9732e8dd4af49106310c69b90dc83d9f73e96dbf871e49
|
| 3 |
+
size 40312
|
body_models/human_model_files/smplx/SMPLX_FEMALE.npz
ADDED
|
@@ -0,0 +1,3 @@
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:05e37bd22dff93362c92cea9c791c62a2d4d7e8d44b234f3e41be0020fa1c256
|
| 3 |
+
size 108532279
|
body_models/human_model_files/smplx/SMPLX_FEMALE.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b870ce1fd05b46dd81e2de6269b2955667c931c8594999eb22eeb489b00e2c1f
|
| 3 |
+
size 146809856
|
body_models/human_model_files/smplx/SMPLX_MALE.npz
ADDED
|
@@ -0,0 +1,3 @@
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:79360d466228bec1b9f9d922ea48df718a0a09bccddace18cfec98b0edd68b73
|
| 3 |
+
size 108491578
|
body_models/human_model_files/smplx/SMPLX_MALE.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d4f94c40261ac4762bb9b09142d11bf47e1cc3d6b49b6bbcc4a2731451bf5632
|
| 3 |
+
size 543102085
|
body_models/human_model_files/smplx/SMPLX_NEUTRAL.npz
ADDED
|
@@ -0,0 +1,3 @@
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:15eb61ac2f91dcd6e340913e281b2b8a0a910ebe0955af9251b9bb99fd11d02b
|
| 3 |
+
size 108490191
|
body_models/human_model_files/smplx/SMPLX_NEUTRAL.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5b0279321ea9bd3cec5541c03b1f1c9ab9d197896943035c3abeef47f699bc5e
|
| 3 |
+
size 542798306
|
body_models/human_model_files/smplx/SMPLX_NEUTRAL_NEW.npy
ADDED
|
@@ -0,0 +1,3 @@
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:248e277858008fea271d1ea3874eed2310dfd57fa160ea07c467cf6a061e0ecd
|
| 3 |
+
size 167260951
|
body_models/human_model_files/smplx/SMPLX_NEUTRAL_NEW.npz
ADDED
|
@@ -0,0 +1,3 @@
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ecb628fadd2b40f42cd39378d1e429cd30acc0bab6104676898d4374b804163d
|
| 3 |
+
size 167261087
|
body_models/human_model_files/smplx/SMPLX_NEUTRAL_NEW_WiFlame.npy
ADDED
|
@@ -0,0 +1,3 @@
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9047e853fc08caa5cef648aa691bf80cf423ca5f0693d825c029a6a7b0bedc51
|
| 3 |
+
size 215482118
|