zirobtc committed
Commit 0e267a7 · verified · 1 Parent(s): 2c7cf25

Initial upload of MotionStreamer code, excluding large extracted data and output folders.

This view is limited to 50 files because the commit contains too many changes.

Files changed (50):
  1. .gitattributes +1 -0
  2. .ipynb_checkpoints/TRAIN_motionstreamer-checkpoint.sh +15 -0
  3. .ipynb_checkpoints/demo_t2m-checkpoint.py +204 -0
  4. .ipynb_checkpoints/environment-checkpoint.yaml +258 -0
  5. .ipynb_checkpoints/requirements-checkpoint.txt +17 -0
  6. .ipynb_checkpoints/train_motionstreamer-checkpoint.py +264 -0
  7. EVAL_causal_TAE.sh +6 -0
  8. EVAL_t2m.sh +7 -0
  9. LICENSE +21 -0
  10. README.md +336 -0
  11. TRAIN_causal_TAE.sh +22 -0
  12. TRAIN_evaluator_272.sh +6 -0
  13. TRAIN_motionstreamer.sh +16 -0
  14. TRAIN_t2m.sh +15 -0
  15. assets/teaser.jpg +3 -0
  16. babel_272/.gitattributes +59 -0
  17. babel_272/README.md +34 -0
  18. babel_272/motion_data.zip +3 -0
  19. babel_272/split/train.txt +0 -0
  20. babel_272/split/val.txt +0 -0
  21. babel_272/t2m_babel_mean_std/Mean.npy +3 -0
  22. babel_272/t2m_babel_mean_std/Std.npy +3 -0
  23. babel_272/texts.zip +3 -0
  24. babel_272_stream/.gitattributes +59 -0
  25. babel_272_stream/README.md +62 -0
  26. babel_272_stream/train_stream.zip +3 -0
  27. babel_272_stream/train_stream_text.zip +3 -0
  28. babel_272_stream/val_stream.zip +3 -0
  29. babel_272_stream/val_stream_text.zip +3 -0
  30. body_models/human_model_files/mano/MANO_LEFT.pkl +3 -0
  31. body_models/human_model_files/mano/MANO_RIGHT.pkl +3 -0
  32. body_models/human_model_files/smpl/J_regressor_extra.npy +3 -0
  33. body_models/human_model_files/smpl/SMPL_FEMALE.pkl +3 -0
  34. body_models/human_model_files/smpl/SMPL_MALE.pkl +3 -0
  35. body_models/human_model_files/smpl/SMPL_NEUTRAL.pkl +3 -0
  36. body_models/human_model_files/smpl/VPOSER_CKPT/TR00_004_00_WO_accad.ini +29 -0
  37. body_models/human_model_files/smpl/VPOSER_CKPT/snapshots/._TR00_E096.pt +3 -0
  38. body_models/human_model_files/smpl/VPOSER_CKPT/snapshots/TR00_E096.pt +3 -0
  39. body_models/human_model_files/smpl/VPOSER_CKPT/vposer_smpl.py +164 -0
  40. body_models/human_model_files/smplx/MANO_SMPLX_vertex_ids.pkl +3 -0
  41. body_models/human_model_files/smplx/SMPL-X__FLAME_vertex_ids.npy +3 -0
  42. body_models/human_model_files/smplx/SMPLX_FEMALE.npz +3 -0
  43. body_models/human_model_files/smplx/SMPLX_FEMALE.pkl +3 -0
  44. body_models/human_model_files/smplx/SMPLX_MALE.npz +3 -0
  45. body_models/human_model_files/smplx/SMPLX_MALE.pkl +3 -0
  46. body_models/human_model_files/smplx/SMPLX_NEUTRAL.npz +3 -0
  47. body_models/human_model_files/smplx/SMPLX_NEUTRAL.pkl +3 -0
  48. body_models/human_model_files/smplx/SMPLX_NEUTRAL_NEW.npy +3 -0
  49. body_models/human_model_files/smplx/SMPLX_NEUTRAL_NEW.npz +3 -0
  50. body_models/human_model_files/smplx/SMPLX_NEUTRAL_NEW_WiFlame.npy +3 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ assets/teaser.jpg filter=lfs diff=lfs merge=lfs -text
.ipynb_checkpoints/TRAIN_motionstreamer-checkpoint.sh ADDED
@@ -0,0 +1,15 @@
+ NUM_GPUS=${1:-1} # default: 1 GPU
+
+ BATCH_SIZE=$((30 / NUM_GPUS))
+
+ echo "Using $NUM_GPUS GPUs, each with a batch size of $BATCH_SIZE"
+
+ accelerate launch --num_processes $NUM_GPUS train_motionstreamer.py \
+ --batch-size $BATCH_SIZE \
+ --lr 0.0001 \
+ --total-iter 100000 \
+ --out-dir Experiments \
+ --exp-name motionstreamer_model \
+ --dataname t2m_babel_272 \
+ --latent_dir babel_272_stream/t2m_babel_latents \
+ --num_gpus $NUM_GPUS
.ipynb_checkpoints/demo_t2m-checkpoint.py ADDED
@@ -0,0 +1,204 @@
+ import os
+ import torch
+ import numpy as np
+ from models.llama_model import LLaMAHF, LLaMAHFConfig
+ import models.tae as tae
+ import options.option_transformer as option_trans
+ import warnings
+
+ import smplx
+ from utils import bvh, quat
+ from utils.face_z_align_util import rotation_6d_to_matrix, matrix_to_axis_angle, axis_angle_to_quaternion
+
+
+ warnings.filterwarnings('ignore')
+
+ comp_device = torch.device('cuda')
+ ##### ---- Exp dirs ---- #####
+ args = option_trans.get_args_parser()
+ torch.manual_seed(args.seed)
+
+ from sentence_transformers import SentenceTransformer
+ t5_model = SentenceTransformer('sentencet5-xxl/')
+ t5_model.eval()
+ for p in t5_model.parameters():
+     p.requires_grad = False
+
+ def save_motion_as_bvh(motion_data, output_path, fps=30):
+     """
+     Saves a motion tensor in the 272-dimensional format to a BVH file.
+     This version is adapted from the official repository script for robustness.
+     """
+     print(f"--- Starting direct conversion to BVH: {os.path.basename(output_path)} ---")
+     try:
+         # --- 1. Ensure data is a 2D NumPy array ---
+         if isinstance(motion_data, torch.Tensor):
+             motion_data = motion_data.detach().cpu().numpy()
+
+         # This is the key fix: check dimensions before squeezing
+         if motion_data.ndim == 3 and motion_data.shape[0] == 1:
+             motion_data = motion_data.squeeze(0)
+         elif motion_data.ndim != 2:
+             raise ValueError(f"Input motion data must be 2D or 3D with a batch size of 1, but got shape {motion_data.shape}")
+
+         # --- 2. Recover 85-dim SMPL format from 272-dim format ---
+         # This logic is from the official script's `recover_from_local_rotation`
+         njoint = 22
+         nfrm, _ = motion_data.shape
+
+         rotations_matrix = rotation_6d_to_matrix(torch.from_numpy(motion_data[:, 8+6*njoint : 8+12*njoint]).reshape(nfrm, -1, 6)).numpy()
+
+         # Accumulate per-frame heading differences into absolute heading rotations
+         global_heading_diff_rot_6d = torch.from_numpy(motion_data[:, 2:8])
+         global_heading_diff_rot = rotation_6d_to_matrix(global_heading_diff_rot_6d).numpy()
+         global_heading_rot = np.zeros_like(global_heading_diff_rot)
+         global_heading_rot[0] = global_heading_diff_rot[0]
+         for i in range(1, nfrm):
+             global_heading_rot[i] = np.matmul(global_heading_diff_rot[i], global_heading_rot[i-1])
+
+         # Calculate root translation
+         velocities_root_xy = motion_data[:, :2]
+         positions_no_heading = motion_data[:, 8 : 8+3*njoint].reshape(nfrm, -1, 3)
+         height = positions_no_heading[:, 0, 1]
+
+         inv_global_heading_rot = np.transpose(global_heading_rot, (0, 2, 1))
+         rotations_matrix[:, 0, ...] = np.matmul(inv_global_heading_rot, rotations_matrix[:, 0, ...])
+
+         velocities_root_xyz = np.zeros((nfrm, 3))
+         velocities_root_xyz[:, 0] = velocities_root_xy[:, 0]
+         velocities_root_xyz[:, 2] = velocities_root_xy[:, 1]
+         velocities_root_xyz[1:, :] = np.matmul(inv_global_heading_rot[:-1], velocities_root_xyz[1:, :, None]).squeeze(-1)
+         root_translation = np.cumsum(velocities_root_xyz, axis=0)
+         root_translation[:, 1] = height
+
+         # Convert rotation matrices to axis-angle
+         axis_angle = matrix_to_axis_angle(torch.from_numpy(rotations_matrix)).numpy()
+         poses_85dim = np.concatenate([axis_angle.reshape(nfrm, -1), np.zeros((nfrm, 6)), root_translation, np.zeros((nfrm, 10))], axis=-1)
+
+         # --- 3. Convert 85-dim SMPL to BVH data ---
+         # This logic is from the official script's `smpl2bvh`
+         rots = poses_85dim[:, :72].reshape(-1, 24, 3)
+         trans = poses_85dim[:, 72:75]
+
+         # Get skeleton from SMPL model
+         model = smplx.create(model_path="body_models/human_model_files", model_type="smpl", gender="NEUTRAL")
+         parents = model.parents.detach().cpu().numpy()
+         rest_pose = model().joints.detach().cpu().numpy().squeeze()[:24, :]
+         offsets = rest_pose - rest_pose[parents]
+         offsets[0] = np.array([0, 0, 0])
+
+         rotations_quat = axis_angle_to_quaternion(torch.from_numpy(rots)).numpy()
+         rotations_euler = np.degrees(quat.to_euler(rotations_quat, order="zyx"))
+
+         positions = offsets[None].repeat(len(rots), axis=0)
+         positions[:, 0] = trans
+
+         joint_names = [
+             "Pelvis", "Left_hip", "Right_hip", "Spine1", "Left_knee", "Right_knee", "Spine2",
+             "Left_ankle", "Right_ankle", "Spine3", "Left_foot", "Right_foot", "Neck",
+             "Left_collar", "Right_collar", "Head", "Left_shoulder", "Right_shoulder",
+             "Left_elbow", "Right_elbow", "Left_wrist", "Right_wrist", "Left_hand", "Right_hand"
+         ]
+
+         # --- 4. Save the final BVH file ---
+         bvh.save(output_path, {
+             "rotations": rotations_euler,
+             "positions": positions,
+             "offsets": offsets,
+             "parents": parents,
+             "names": joint_names,
+             "order": "zyx",
+             "frametime": 1.0 / fps,
+         })
+         print(f"✅ BVH file saved successfully to {output_path}")
+
+     except Exception as e:
+         print(f"❌ BVH Conversion Failed. Error: {e}")
+         import traceback
+         traceback.print_exc()
+
+
+ ##### ---- Network ---- #####
+ clip_range = [-30, 20]
+
+ net = tae.Causal_HumanTAE(
+     hidden_size=args.hidden_size,
+     down_t=args.down_t,
+     stride_t=args.stride_t,
+     depth=args.depth,
+     dilation_growth_rate=args.dilation_growth_rate,
+     activation='relu',
+     latent_dim=args.latent_dim,
+     clip_range=clip_range
+ )
+
+
+ config = LLaMAHFConfig.from_name('Normal_size')
+ config.block_size = 78
+ trans_encoder = LLaMAHF(config, args.num_diffusion_head_layers, args.latent_dim, comp_device)
+
+ print('loading checkpoint from {}'.format(args.resume_pth))
+ ckpt = torch.load(args.resume_pth, map_location='cpu')
+ net.load_state_dict(ckpt['net'], strict=True)
+ net.eval()
+ net.to(comp_device)
+
+
+ if args.resume_trans is not None:
+     print('loading transformer checkpoint from {}'.format(args.resume_trans))
+     ckpt = torch.load(args.resume_trans, map_location='cpu')
+     new_ckpt_trans = {}
+     for key in ckpt['trans'].keys():
+         if key.split('.')[0] == 'module':
+             new_key = '.'.join(key.split('.')[1:])
+         else:
+             new_key = key
+         new_ckpt_trans[new_key] = ckpt['trans'][key]
+     trans_encoder.load_state_dict(new_ckpt_trans, strict=True)
+ trans_encoder.eval()
+ trans_encoder.to(comp_device)
+
+
+ reference_end_latent = np.load('reference_end_latent_t2m_272.npy')
+ reference_end_latent = torch.from_numpy(reference_end_latent).to(comp_device)
+
+ mean = np.load('humanml3d_272/mean_std/Mean.npy')
+ std = np.load('humanml3d_272/mean_std/Std.npy')
+
+ # forward inference
+ threshold = 0.1
+ cfg_scale = 4.0
+ print(f"Generating motion with CFG scale: {cfg_scale}")
+ motion_latents = trans_encoder.sample_for_eval_CFG_inference(text=args.text, tokenizer=t5_model, device=comp_device, reference_end_latent=reference_end_latent, threshold=threshold, cfg=cfg_scale)
+
+ # forward decode
+ motion_seqs = net.forward_decoder(motion_latents)
+ from visualization.recover_visualize import recover_from_local_position
+ import visualization.plot_3d_global as plot_3d
+
+ motion = motion_seqs.squeeze(0)
+ motion = motion.detach().cpu().numpy()
+
+ if not os.path.exists('demo_output'):
+     os.makedirs('demo_output')
+
+ if args.mode == 'pos':
+     # Option 1: recover from joint position
+     pred_xyz = recover_from_local_position(motion * std + mean, 22)
+     xyz = pred_xyz.reshape(1, -1, 22, 3)
+     pose_vis = plot_3d.draw_to_batch(xyz, [args.text], [f'demo_output/{args.text}.mp4'], fps=30)
+     print(f"Visualized result is saved in demo_output/{args.text}.mp4")
+
+ elif args.mode == 'rot':
+     # De-normalize the motion data to its original scale
+     motion = motion * std + mean
+
+     # Define the output path for the new BVH file
+     output_bvh_path = os.path.join('demo_output', f'{args.text}.bvh')
+
+     # Call the new function to save the BVH file directly
+     save_motion_as_bvh(motion, output_bvh_path, fps=30)
+
+ else:
+     raise ValueError(f'Invalid mode: {args.mode}')
.ipynb_checkpoints/environment-checkpoint.yaml ADDED
@@ -0,0 +1,258 @@
+ name: mgpt
+ channels:
+   - pytorch
+   - conda-forge
+   - defaults
+   - https://repo.anaconda.com/pkgs/main
+   - https://repo.anaconda.com/pkgs/r
+ dependencies:
+   - _libgcc_mutex=0.1=main
+   - _openmp_mutex=4.5=1_gnu
+   - asttokens=3.0.0=pyhd8ed1ab_0
+   - backcall=0.2.0=pyh9f0ad1d_0
+   - blas=1.0=mkl
+   - bzip2=1.0.8=h7b6447c_0
+   - ca-certificates=2025.1.31=hbcca054_0
+   - certifi=2024.8.30=pyhd8ed1ab_0
+   - comm=0.2.2=pyhd8ed1ab_0
+   - cudatoolkit=10.1.243=h6bb024c_0
+   - debugpy=1.4.1=py38h709712a_0
+   - entrypoints=0.4=pyhd8ed1ab_0
+   - executing=2.1.0=pyhd8ed1ab_0
+   - ffmpeg=4.3=hf484d3e_0
+   - freetype=2.10.4=h5ab3b9f_0
+   - gmp=6.2.1=h2531618_2
+   - gnutls=3.6.15=he1e5248_0
+   - intel-openmp=2021.3.0=h06a4308_3350
+   - ipykernel=6.20.2=pyh210e3f2_0
+   - jpeg=9b=h024ee3a_2
+   - jupyter_client=7.1.2=pyhd8ed1ab_0
+   - jupyter_core=5.7.2=pyh31011fe_1
+   - lame=3.100=h7b6447c_0
+   - lcms2=2.12=h3be6417_0
+   - ld_impl_linux-64=2.35.1=h7274673_9
+   - libffi=3.3=he6710b0_2
+   - libgcc-ng=9.3.0=h5101ec6_17
+   - libgomp=9.3.0=h5101ec6_17
+   - libiconv=1.15=h63c8f33_5
+   - libidn2=2.3.2=h7f8727e_0
+   - libpng=1.6.37=hbc83047_0
+   - libsodium=1.0.18=h36c2ea0_1
+   - libstdcxx-ng=13.2.0=hc0a3c3a_7
+   - libtasn1=4.16.0=h27cfd23_0
+   - libtiff=4.2.0=h85742a9_0
+   - libunistring=0.9.10=h27cfd23_0
+   - libuv=1.40.0=h7b6447c_0
+   - libwebp-base=1.2.0=h27cfd23_0
+   - lz4-c=1.9.3=h295c915_1
+   - mkl=2021.3.0=h06a4308_520
+   - mkl-service=2.4.0=py38h7f8727e_0
+   - mkl_fft=1.3.0=py38h42c9631_2
+   - mkl_random=1.2.2=py38h51133e4_0
+   - ncurses=6.2=he6710b0_1
+   - nest-asyncio=1.6.0=pyhd8ed1ab_0
+   - nettle=3.7.3=hbbd107a_1
+   - ninja=1.10.2=hff7bd54_1
+   - olefile=0.46=py_0
+   - openh264=2.1.0=hd408876_0
+   - openjpeg=2.3.0=h05c96fa_1
+   - openssl=1.1.1k=h7f98852_0
+   - packaging=24.2=pyhd8ed1ab_2
+   - pickleshare=0.7.5=py_1003
+   - pillow=8.3.1=py38h2c7a002_0
+   - pip=21.0.1=py38h06a4308_0
+   - platformdirs=4.3.6=pyhd8ed1ab_0
+   - prompt_toolkit=3.0.48=hd8ed1ab_1
+   - ptyprocess=0.7.0=pyhd3deb0d_0
+   - pure_eval=0.2.3=pyhd8ed1ab_0
+   - pygments=2.18.0=pyhd8ed1ab_0
+   - python=3.8.11=h12debd9_0_cpython
+   - python_abi=3.8=5_cp38
+   - pyzmq=22.1.0=py38h2035c66_0
+   - readline=8.1=h27cfd23_0
+   - setuptools=52.0.0=py38h06a4308_0
+   - six=1.16.0=pyhd3eb1b0_0
+   - sqlite=3.36.0=hc218d9a_0
+   - stack_data=0.6.2=pyhd8ed1ab_0
+   - tk=8.6.10=hbc83047_0
+   - torchaudio=0.8.1=py38
+   - torchvision=0.9.1=py38_cu101
+   - tornado=6.1=py38h497a2fe_1
+   - wheel=0.37.0=pyhd3eb1b0_0
+   - xz=5.2.5=h7b6447c_0
+   - zeromq=4.3.4=h9c3ff4c_0
+   - zlib=1.2.11=h7b6447c_3
+   - zstd=1.4.9=haebb681_0
+   - pip:
+     - absl-py==0.13.0
+     - accelerate==1.0.1
+     - aiohappyeyeballs==2.4.3
+     - aiohttp==3.10.11
+     - aiosignal==1.3.1
+     - annotated-types==0.7.0
+     - antlr4-python3-runtime==4.9.3
+     - async-timeout==5.0.1
+     - attrs==24.2.0
+     - beautifulsoup4==4.12.3
+     - blis==0.7.11
+     - cachetools==4.2.2
+     - catalogue==2.0.10
+     - charset-normalizer==2.0.4
+     - chumpy==0.70
+     - click==8.1.7
+     - clip==1.0
+     - cloudpathlib==0.20.0
+     - confection==0.1.5
+     - cycler==0.10.0
+     - cymem==2.0.10
+     - decorator==5.0.9
+     - diffusers==0.31.0
+     - einops==0.8.0
+     - ffmpeg-python==0.2.0
+     - filelock==3.16.1
+     - freetype-py==2.5.1
+     - frozenlist==1.5.0
+     - fsspec==2024.2.0
+     - ftfy==6.1.1
+     - future==1.0.0
+     - fvcore==0.1.5.post20221221
+     - gdown==5.2.0
+     - glfw==2.8.0
+     - google-auth==2.36.0
+     - google-auth-oauthlib==0.4.6
+     - grpcio==1.68.0
+     - h5py==3.11.0
+     - huggingface-hub==0.26.2
+     - human-body-prior==2.2.2.0
+     - idna==3.2
+     - imageio==2.9.0
+     - imageio-ffmpeg==0.5.1
+     - importlib-metadata==8.5.0
+     - iopath==0.1.10
+     - ipdb==0.13.9
+     - ipython==7.26.0
+     - ipython-genutils==0.2.0
+     - jedi==0.18.0
+     - jinja2==3.1.3
+     - joblib==1.0.1
+     - kiwisolver==1.3.1
+     - langcodes==3.4.1
+     - language-data==1.3.0
+     - lightning-utilities==0.11.9
+     - marisa-trie==1.2.1
+     - markdown==3.3.4
+     - markdown-it-py==3.0.0
+     - markupsafe==2.1.5
+     - matplotlib==3.4.3
+     - matplotlib-inline==0.1.2
+     - mdurl==0.1.2
+     - moviepy==0.2.3.1
+     - mpmath==1.3.0
+     - multidict==6.1.0
+     - murmurhash==1.0.11
+     - natsort==8.4.0
+     - networkx==3.0
+     - numpy==1.22.4
+     - nvidia-cublas-cu11==11.11.3.6
+     - nvidia-cublas-cu12==12.1.3.1
+     - nvidia-cuda-cupti-cu11==11.8.87
+     - nvidia-cuda-cupti-cu12==12.1.105
+     - nvidia-cuda-nvrtc-cu11==11.8.89
+     - nvidia-cuda-nvrtc-cu12==12.1.105
+     - nvidia-cuda-runtime-cu11==11.8.89
+     - nvidia-cuda-runtime-cu12==12.1.105
+     - nvidia-cudnn-cu11==9.1.0.70
+     - nvidia-cudnn-cu12==9.1.0.70
+     - nvidia-cufft-cu11==10.9.0.58
+     - nvidia-cufft-cu12==11.0.2.54
+     - nvidia-curand-cu11==10.3.0.86
+     - nvidia-curand-cu12==10.3.2.106
+     - nvidia-cusolver-cu11==11.4.1.48
+     - nvidia-cusolver-cu12==11.4.5.107
+     - nvidia-cusparse-cu11==11.7.5.86
+     - nvidia-cusparse-cu12==12.1.0.106
+     - nvidia-nccl-cu11==2.20.5
+     - nvidia-nccl-cu12==2.20.5
+     - nvidia-nvjitlink-cu12==12.1.105
+     - nvidia-nvtx-cu11==11.8.86
+     - nvidia-nvtx-cu12==12.1.105
+     - oauthlib==3.1.1
+     - omegaconf==2.3.0
+     - orjson==3.10.15
+     - pandas==1.3.2
+     - parso==0.8.2
+     - pexpect==4.8.0
+     - portalocker==3.0.0
+     - preshed==3.0.9
+     - prompt-toolkit==3.0.20
+     - propcache==0.2.0
+     - protobuf==5.28.3
+     - psutil==6.1.0
+     - pyasn1==0.4.8
+     - pyasn1-modules==0.2.8
+     - pydantic==2.10.1
+     - pydantic-core==2.27.1
+     - pydeprecate==0.3.2
+     - pygame==2.6.1
+     - pyglet==2.1.2
+     - pyopengl==3.1.0
+     - pyparsing==2.4.7
+     - pyrender==0.1.45
+     - pysocks==1.7.1
+     - python-dateutil==2.8.2
+     - pytorch-lightning==1.7.0
+     - pytorch3d==0.3.0
+     - pytz==2021.1
+     - pyyaml==5.4.1
+     - regex==2024.11.6
+     - requests==2.26.0
+     - requests-oauthlib==1.3.0
+     - rich==13.9.4
+     - rsa==4.7.2
+     - safetensors==0.4.5
+     - scikit-learn==0.24.2
+     - scipy==1.7.1
+     - sentence-transformers==3.2.1
+     - sentencepiece==0.2.0
+     - shapely==2.0.7
+     - shellingham==1.5.4
+     - sklearn==0.0
+     - smart-open==7.0.5
+     - smplx==0.1.28
+     - soupsieve==2.6
+     - spacy==3.7.5
+     - spacy-legacy==3.0.12
+     - spacy-loggers==1.0.5
+     - srsly==2.4.8
+     - sympy==1.13.1
+     - tabulate==0.9.0
+     - tensorboard==2.12.0
+     - tensorboard-data-server==0.7.2
+     - tensorboard-plugin-wit==1.8.0
+     - termcolor==2.4.0
+     - thinc==8.2.5
+     - threadpoolctl==2.2.0
+     - timm==1.0.12
+     - tokenizers==0.20.3
+     - toml==0.10.2
+     - torch==2.4.1+cu118
+     - torchgeometry==0.1.2
+     - torchmetrics==0.7.0
+     - tqdm==4.62.2
+     - traitlets==5.0.5
+     - transformers==4.46.3
+     - triangle==20250106
+     - trimesh==4.6.2
+     - triton==3.0.0
+     - typer==0.13.1
+     - typing-extensions==4.12.2
+     - urllib3==1.26.6
+     - wasabi==1.1.3
+     - wcwidth==0.2.5
+     - weasel==0.4.1
+     - werkzeug==2.0.1
+     - wrapt==1.17.0
+     - yacs==0.1.8
+     - yarl==1.15.2
+     - zipp==3.20.2
+ prefix: /root/miniconda3/envs/mgpt
.ipynb_checkpoints/requirements-checkpoint.txt ADDED
@@ -0,0 +1,17 @@
+ smplx==0.1.28
+ transformers==4.56.2
+ timm==1.0.12
+ sentence-transformers==5.1.0
+ clip @ git+https://github.com/openai/CLIP.git@main#egg=clip
+ human-body-prior @ git+https://github.com/nghorbani/human_body_prior.git@master#egg=human-body-prior
+ gdown
+ chumpy==0.70
+ scipy==1.7.1
+ numpy==1.22.4
+ tensorboard
+ accelerate
+ flash_attn
+ matplotlib==3.4.3
+ matplotlib-inline==0.1.2
+ imageio==2.9.0
+ imageio-ffmpeg==0.5.1
.ipynb_checkpoints/train_motionstreamer-checkpoint.py ADDED
@@ -0,0 +1,264 @@
+ """Train streaming motion generation model (MotionStreamer) with llama blocks, Two-Forward strategy and QK-Norm, using the motion latents encoded by the Causal TAE (trained in the first stage)."""
+
+
+ import os
+ import torch
+ import numpy as np
+ import random
+ from torch.utils.tensorboard import SummaryWriter
+ import json
+ from accelerate import Accelerator
+ from tqdm import tqdm
+ from models.llama_model import LLaMAHF, LLaMAHFConfig
+ import options.option_transformer as option_trans
+ import utils.utils_model as utils_model
+ import warnings
+ from torch.optim.lr_scheduler import LambdaLR, CosineAnnealingLR
+ warnings.filterwarnings('ignore')
+
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
+ ##### ---- Exp dirs ---- #####
+ args = option_trans.get_args_parser()
+ torch.manual_seed(args.seed)
+
+ # warm-up + cosine decay scheduler
+ class WarmupCosineDecayScheduler:
+     def __init__(self, optimizer, warmup_iters, total_iters, min_lr=0):
+         self.optimizer = optimizer
+         self.warmup_iters = warmup_iters
+         self.total_iters = total_iters
+         self.min_lr = min_lr
+
+         self.warmup_scheduler = LambdaLR(optimizer, lr_lambda=self.warmup_lambda)
+
+         self.cosine_scheduler = CosineAnnealingLR(optimizer,
+                                                   T_max=total_iters - warmup_iters,
+                                                   eta_min=min_lr)
+
+     def warmup_lambda(self, current_iter):
+         if current_iter < self.warmup_iters:
+             return float(current_iter) / float(max(1, self.warmup_iters))
+         return 1.0
+
+     def step(self, current_iter):
+         if current_iter < self.warmup_iters:
+             self.warmup_scheduler.step()
+         else:
+             self.cosine_scheduler.step()
+
+     def state_dict(self):
+         return {
+             'warmup_iters': self.warmup_iters,
+             'total_iters': self.total_iters,
+             'min_lr': self.min_lr,
+         }
+
+     def load_state_dict(self, state_dict):
+         self.warmup_iters = state_dict['warmup_iters']
+         self.total_iters = state_dict['total_iters']
+         self.min_lr = state_dict['min_lr']
+
+
+
+ args.out_dir = os.path.join(args.out_dir, f'{args.exp_name}')
+ os.makedirs(args.out_dir, exist_ok=True)
+
+
+ ##### ---- Accelerator Setup ---- #####
+ accelerator = Accelerator()
+ comp_device = accelerator.device
+
+ ##### ---- Logger ---- #####
+ logger = utils_model.get_logger(args.out_dir)
+ writer = SummaryWriter(args.out_dir)
+ logger.info(json.dumps(vars(args), indent=4, sort_keys=True))
+
+ ##### ---- Dataloader ---- #####
+ from humanml3d_272 import dataset_TM_train_motionstreamer
+ train_loader = dataset_TM_train_motionstreamer.DATALoader(args.dataname, args.batch_size, unit_length=2**args.down_t, latent_dir=args.latent_dir)
+
+
+ ##### ---- Network ---- #####
+ from sentence_transformers import SentenceTransformer
+ t5_model = SentenceTransformer('sentencet5-xxl/')
+ t5_model.eval()
+ for p in t5_model.parameters():
+     p.requires_grad = False
+
+
+ config = LLaMAHFConfig.from_name('Normal_size')
+ config.block_size = 78
+ trans_encoder = LLaMAHF(config, args.num_diffusion_head_layers, args.latent_dim, comp_device)
+
+ if args.resume_trans is not None:
+     print('loading transformer checkpoint from {}'.format(args.resume_trans))
+     ckpt = torch.load(args.resume_trans, map_location='cpu')
+     new_ckpt_trans = {}
+     for key in ckpt['trans'].keys():
+         if key.split('.')[0] == 'module':
+             new_key = '.'.join(key.split('.')[1:])
+         else:
+             new_key = key
+         new_ckpt_trans[new_key] = ckpt['trans'][key]
+     trans_encoder.load_state_dict(new_ckpt_trans, strict=True)
+ trans_encoder.train()
+ trans_encoder.to(comp_device)
+
+
+ ##### ---- Optimizer & Scheduler ---- #####
+ optimizer = utils_model.initial_optim(args.decay_option, args.lr, args.weight_decay, trans_encoder, args.optimizer)
+ scheduler = WarmupCosineDecayScheduler(optimizer, args.total_iter//10, args.total_iter)
+
+ t5_model, trans_encoder, optimizer, train_loader = accelerator.prepare(t5_model, trans_encoder, optimizer, train_loader)
+ train_loader_iter = dataset_TM_train_motionstreamer.cycle(train_loader)
+
+
+ diffmlps_batch_mul = 4
+ def lengths_to_mask(lengths, max_len):
+     mask = torch.arange(max_len, device=lengths.device).expand(len(lengths), max_len) < lengths.unsqueeze(1)
+     return mask
+ def get_mask_subset_prob(mask, prob):
+     subset_mask = torch.bernoulli(mask, p=prob) & mask
+     return subset_mask
+
+
+ def uniform(shape, device=None):
+     return torch.zeros(shape, device=device).float().uniform_(0, 1)
+
+ import math
+ def cosine_schedule(t):
+     return torch.cos(t * math.pi * 0.5)
+
+
+ #--------------2-forward:------------------
+ def cosine_decay(step, total_steps, start_value=1.0, end_value=0.0):
+     step = torch.tensor(step, dtype=torch.float32)
+     total_steps = torch.tensor(total_steps, dtype=torch.float32)
+     cosine_factor = 0.5 * (1 + torch.cos(torch.pi * step / total_steps))
+     return start_value + (end_value - start_value) * cosine_factor
+
+ def replace_with_pred(latents, pred_xstart, step, total_steps):
+     decay_factor = cosine_decay(step, total_steps).to(latents.device)
+     b, l, d = latents.shape
+     num_replace = int(l * decay_factor)
+
+     replace_indices = torch.randperm(l)[:num_replace]
+
+     replace_mask = torch.zeros(b, l, dtype=torch.bool).to(latents.device)
+     replace_mask[:, replace_indices] = 1
+
+     updated_latents = latents.clone()
+     updated_latents[replace_mask] = pred_xstart[replace_mask]
+
+     return updated_latents
+
+ def forward_loss_withmask_2_forward_streaming(latents, trans, m_lens, feat_text, step, total_steps, A_token_length):
+     latents = latents.to(comp_device)
+     feat_text = feat_text.to(comp_device)
+     A_token_length = A_token_length.to(comp_device)
+     conditions = trans(latents, feat_text)
+     conditions = conditions.contiguous()
+     z = conditions[:, :-1, :]
+
+     b, l, d = latents.shape
+     mask = lengths_to_mask(m_lens, l)
+
+     for j in range(b):
+         mask[j, :A_token_length[j].item()] = False  # A_motion token: do not compute loss
+
+     mask = mask.reshape(b * l).repeat(diffmlps_batch_mul)
+
+     target = latents.clone().detach()
+     target = target.reshape(b * l, -1)
+     z = z.reshape(b * l, -1)
+
+     with torch.no_grad():
+         loss, pred_xstart = trans.diff_loss(target=target, z=z)
+
+     pred_xstart = pred_xstart.clone().detach()
+     pred_xstart = pred_xstart.reshape(b, l, -1)
+
+     # do not replace A_motion tokens
+     for k in range(b):
+         pred_xstart[k, :A_token_length[k].item(), :] = latents[k, :A_token_length[k].item(), :]
+
+     updated_latents = replace_with_pred(latents, pred_xstart, step, total_steps)
+     updated_conditions = trans(updated_latents, feat_text)
+     updated_conditions = updated_conditions.contiguous()
+     updated_z = updated_conditions[:, :-1, :]
+
+     updated_target = latents.clone().detach()
+
+     updated_target = updated_target.reshape(b * l, -1).repeat(diffmlps_batch_mul, 1)
+     updated_z = updated_z.reshape(b * l, -1).repeat(diffmlps_batch_mul, 1)
+
+     updated_target = updated_target[mask]
+     updated_z = updated_z[mask]
+
+     updated_loss, updated_pred_xstart = trans.diff_loss(target=updated_target, z=updated_z)
+
+     return updated_loss
+
+
+ ##### ---- Training Loop ---- #####
+ avg_loss_cls = 0.
+
+ pbar = tqdm(range(1, args.total_iter + 1), desc="Training MotionStreamer")
+ for nb_iter in pbar:
+     batch = next(train_loader_iter)
+     caption, m_tokens, m_tokens_len, A_token_length = batch
+     caption = list(caption)
+     m_tokens, m_tokens_len = m_tokens.to(comp_device), m_tokens_len.to(comp_device)
+     A_token_length = A_token_length.to(comp_device)
+
+     bs = len(caption)
+     num_masked = int(bs * 0.1)  # 10%
+     mask_indices = random.sample(range(bs), num_masked)
+
+     for idx in mask_indices:
+         caption[idx] = ''
+
+     feat_text = torch.from_numpy(t5_model.encode(caption)).float()
+     feat_text = feat_text.to(comp_device)
+
+     # -------gt--------
+     input_latent = m_tokens[:, :-1, :]  # continuous token
+
+     loss_cls = 0.0
+
+     if args.num_gpus > 1:
+         loss_cls = forward_loss_withmask_2_forward_streaming(latents=input_latent, trans=trans_encoder.module, m_lens=m_tokens_len, feat_text=feat_text, step=nb_iter, total_steps=args.total_iter, A_token_length=A_token_length)
+     else:
+         loss_cls = forward_loss_withmask_2_forward_streaming(latents=input_latent, trans=trans_encoder, m_lens=m_tokens_len, feat_text=feat_text, step=nb_iter, total_steps=args.total_iter, A_token_length=A_token_length)
+
+
+     # backward & optimizer step
+     optimizer.zero_grad()
+     accelerator.backward(loss_cls)
+     optimizer.step()
+     scheduler.step(nb_iter)
+
+     avg_loss_cls = avg_loss_cls + loss_cls.item()
+
+     args.print_iter = 100
+     if nb_iter % args.print_iter == 0:
+         if accelerator.is_main_process:
+             avg_loss_cls = avg_loss_cls / args.print_iter
+             lr = optimizer.param_groups[0]['lr']
+             writer.add_scalar('./Loss/train', avg_loss_cls, nb_iter)
+             writer.add_scalar('./LR/train', optimizer.param_groups[0]['lr'], nb_iter)
+             msg = f"Train. Iter {nb_iter} : Loss. {avg_loss_cls:.5f}"
+             tqdm.write(f"Iter {nb_iter} | Loss: {avg_loss_cls:.5f} | LR: {lr:.6f}")
+             logger.info(msg)
+             avg_loss_cls = 0.
+
+
+     args.save_iter = 10000
+     if nb_iter % args.save_iter == 0:
+         # save checkpoint
+         if accelerator.is_main_process:
+             torch.save({
+                 'trans': trans_encoder.state_dict(),
+             }, os.path.join(args.out_dir, f'latest.pth'))
+
+         accelerator.wait_for_everyone()
EVAL_causal_TAE.sh ADDED
@@ -0,0 +1,6 @@
+ ln -s ../utils ./Evaluator_272/
+ ln -s ../humanml3d_272 ./Evaluator_272/
+ ln -s ../options ./Evaluator_272/
+ ln -s ../models ./Evaluator_272/
+ ln -s ../visualization ./Evaluator_272/
+ python eval_causal_TAE.py --resume-pth output/causal_TAE/net_last.pth
EVAL_t2m.sh ADDED
@@ -0,0 +1,7 @@
+ ln -s ../utils ./Evaluator_272/
+ ln -s ../humanml3d_272 ./Evaluator_272/
+ ln -s ../options ./Evaluator_272/
+ ln -s ../models ./Evaluator_272/
+ ln -s ../visualization ./Evaluator_272/
+ ln -s ../Causal_TAE ./Evaluator_272/
+ python eval_t2m.py --resume-pth Causal_TAE/net_last.pth --resume-trans /cpfs03/shared/IDC/wangjingbo_group/motionstreamer/Open_source_Train_AR_16_1024_fps_30_111M_9/latest.pth
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2025 ZJU3DV
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,336 @@
+
+ <h2 align="center"><strong>MotionStreamer: Streaming Motion Generation via Diffusion-based Autoregressive Model in Causal Latent Space</strong></h2>
+ <p align="center">
+ <a href='https://li-xingxiao.github.io/homepage/' target='_blank'>Lixing Xiao</a><sup>1</sup>
+ ·
+ <a href='https://shunlinlu.github.io/' target='_blank'>Shunlin Lu</a> <sup>2</sup>
+ ·
+ <a href='https://phj128.github.io/' target='_blank'>Huaijin Pi</a><sup>3</sup>
+ ·
+ <a href='https://vankouf.github.io/' target='_blank'>Ke Fan</a><sup>4</sup>
+ ·
+ <a href='https://liangpan99.github.io/' target='_blank'>Liang Pan</a><sup>3</sup>
+ ·
+ <a href='https://[email protected]' target='_blank'>Yueer Zhou</a><sup>1</sup>
+ ·
+ <a href='https://dblp.org/pid/120/4362.html/' target='_blank'>Ziyong Feng</a><sup>5</sup>
+ ·
+ <br>
+ <a href='https://www.xzhou.me/' target='_blank'>Xiaowei Zhou</a><sup>1</sup>
+ ·
+ <a href='https://pengsida.net/' target='_blank'>Sida Peng</a><sup>1†</sup>
+ ·
+ <a href='https://wangjingbo1219.github.io/' target='_blank'>Jingbo Wang</a><sup>6</sup>
+ <br>
+ <br>
+ <sup>1</sup>Zhejiang University <sup>2</sup>The Chinese University of Hong Kong, Shenzhen <sup>3</sup>The University of Hong Kong <br><sup>4</sup>Shanghai Jiao Tong University <sup>5</sup>DeepGlint <sup>6</sup>Shanghai AI Lab
+ <br>
+ <strong>ICCV 2025</strong>
+
+ </p>
+ </p>
+ <p align="center">
+ <a href='https://arxiv.org/abs/2503.15451'>
+ <img src='https://img.shields.io/badge/Arxiv-2503.15451-A42C25?style=flat&logo=arXiv&logoColor=A42C25'></a>
+ <a href='https://arxiv.org/pdf/2503.15451'>
+ <img src='https://img.shields.io/badge/Paper-PDF-blue?style=flat&logo=arXiv&logoColor=blue'></a>
+ <a href='https://zju3dv.github.io/MotionStreamer/'>
+ <img src='https://img.shields.io/badge/Project-Page-green?style=flat&logo=Google%20chrome&logoColor=green'></a>
+ <a href='https://huggingface.co/datasets/lxxiao/272-dim-HumanML3D'>
+ <img src='https://img.shields.io/badge/Data-Download-yellow?style=flat&logo=huggingface&logoColor=yellow'></a>
+ </p>
+
+ <img width="1385" alt="image" src="assets/teaser.jpg"/>
+
+ ## 🔥 News
+
+ - **[2025-06]** MotionStreamer has been accepted to ICCV 2025! 🎉
+
+ ## TODO List
+
+ - [x] Release the processing script of 272-dim motion representation.
+ - [x] Release the processed 272-dim Motion Representation of [HumanML3D](https://github.com/EricGuo5513/HumanML3D) dataset. Only for academic usage.
+ - [x] Release the training code and checkpoint of our [TMR](https://github.com/Mathux/TMR)-based motion evaluator trained on the processed 272-dim [HumanML3D](https://github.com/EricGuo5513/HumanML3D) dataset.
+ - [x] Release the training and evaluation code as well as checkpoint of Causal TAE.
+ - [x] Release the training code of original motion generation model and streaming generation model (MotionStreamer).
+ - [x] Release the checkpoint and demo inference code of original motion generation model.
+ - [ ] Release complete code for MotionStreamer.
+
+ ## 🏃 Motion Representation
+ For more details of how to obtain the 272-dim motion representation, as well as other useful tools (e.g., Visualization and Conversion to BVH format), please refer to our [GitHub repo](https://github.com/Li-xingXiao/272-dim-Motion-Representation).
+
+ ## Installation
+
+ ### 🐍 Python Virtual Environment
+ ```sh
+ conda env create -f environment.yaml
+ conda activate mgpt
+ ```
+
+ ### 🤗 Hugging Face Mirror
+ Since all of our models and data are available on Hugging Face, if Hugging Face is not directly accessible, you can use the HF-mirror endpoint as follows:
+ ```sh
+ pip install -U huggingface_hub
+ export HF_ENDPOINT=https://hf-mirror.com
+ ```
+
+ ## 📥 Data Preparation
+ To facilitate researchers, we provide the processed 272-dim Motion Representation of:
+ > HumanML3D dataset at [this link](https://huggingface.co/datasets/lxxiao/272-dim-HumanML3D).
+
+ > BABEL dataset at [this link](https://huggingface.co/datasets/lxxiao/272-dim-BABEL).
+
+ ❗️❗️❗️ The processed data is solely for academic purposes. Make sure you read through the [AMASS License](https://amass.is.tue.mpg.de/license.html).
+
+ 1. Download the processed 272-dim [HumanML3D](https://github.com/EricGuo5513/HumanML3D) dataset following:
+ ```bash
+ huggingface-cli download --repo-type dataset --resume-download lxxiao/272-dim-HumanML3D --local-dir ./humanml3d_272
+ cd ./humanml3d_272
+ unzip texts.zip
+ unzip motion_data.zip
+ ```
+ The dataset is organized as:
+ ```
+ ./humanml3d_272
+ ├── mean_std
+ ├── Mean.npy
+ ├── Std.npy
+ ├── split
+ ├── train.txt
+ ├── val.txt
+ ├── test.txt
+ ├── texts
+ ├── 000000.txt
+ ...
+ ├── motion_data
+ ├── 000000.npy
+ ...
+ ```
+
+ 2. Download the processed 272-dim [BABEL](https://babel.is.tue.mpg.de/) dataset following:
+ ```bash
+ huggingface-cli download --repo-type dataset --resume-download lxxiao/272-dim-BABEL --local-dir ./babel_272
+ cd ./babel_272
+ unzip texts.zip
+ unzip motion_data.zip
+ ```
+ The dataset is organized as:
+ ```
+ ./babel_272
+ ├── t2m_babel_mean_std
+ ├── Mean.npy
+ ├── Std.npy
+ ├── split
+ ├── train.txt
+ ├── val.txt
+ ├── texts
+ ├── 000000.txt
+ ...
+ ├── motion_data
+ ├── 000000.npy
+ ...
+ ```
+
+ 3. Download the processed streaming 272-dim [BABEL](https://babel.is.tue.mpg.de/) dataset following:
+ ```bash
+ huggingface-cli download --repo-type dataset --resume-download lxxiao/272-dim-BABEL-stream --local-dir ./babel_272_stream
+ cd ./babel_272_stream
+ unzip train_stream.zip
+ unzip train_stream_text.zip
+ unzip val_stream.zip
+ unzip val_stream_text.zip
+ ```
+ The dataset is organized as:
+ ```
+ ./babel_272_stream
+ ├── train_stream
+ ├── seq1.npy
+ ...
+ ├── train_stream_text
+ ├── seq1.txt
+ ...
+ ├── val_stream
+ ├── seq1.npy
+ ...
+ ├── val_stream_text
+ ├── seq1.txt
+ ...
+ ```
+ > NOTE: We process the original BABEL dataset to support training of streaming motion generation. For example, if a motion sequence A is annotated as (A1, A2, A3, A4) in the BABEL dataset, each subsequence has a text description: (A1_t, A2_t, A3_t, A4_t).
+
+ > Then, our BABEL-stream is constructed as:
+
+ > seq1: (A1, A2) --- seq1_text: (A1_t*A2_t#A1_length)
+
+ > seq2: (A2, A3) --- seq2_text: (A2_t*A3_t#A2_length)
+
+ > seq3: (A3, A4) --- seq3_text: (A3_t*A4_t#A3_length)
+
+ > Here, * and # are separator symbols, and A1_length is the number of frames of subsequence A1.
+
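Editor's note: below is a minimal, hypothetical sketch (not part of the repository) of how one BABEL-stream training pair could be read back, assuming the `A1_t*A2_t#A1_length` annotation layout described above and the `train_stream/` / `train_stream_text/` file naming shown in the tree; the actual dataloader in the repo may differ.

```python
import os
import numpy as np

def parse_stream_annotation(text_path):
    """Split 'A1_t*A2_t#A1_length' into (caption_A1, caption_A2, number of frames of A1)."""
    with open(text_path, 'r') as f:
        raw = f.read().strip()
    captions, a1_length = raw.rsplit('#', 1)         # '#' separates the captions from the A1 frame count
    caption_a1, caption_a2 = captions.split('*', 1)  # '*' separates the two sub-captions
    return caption_a1, caption_a2, int(a1_length)

# Example usage on one (motion, text) pair, following the directory tree above.
text_file = 'babel_272_stream/train_stream_text/seq1.txt'
motion_file = 'babel_272_stream/train_stream/seq1.npy'
if os.path.exists(text_file) and os.path.exists(motion_file):
    cap_a1, cap_a2, a1_frames = parse_stream_annotation(text_file)
    motion = np.load(motion_file)  # (num_frames, 272) motion features at 30 FPS
    print(cap_a1, '|', cap_a2, '| A1 frames:', a1_frames, '| total frames:', motion.shape[0])
```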
+ ## 🚀 Training
+ 1. Train our [TMR](https://github.com/Mathux/TMR)-based motion evaluator on the processed 272-dim [HumanML3D](https://github.com/EricGuo5513/HumanML3D) dataset:
+ ```bash
+ bash TRAIN_evaluator_272.sh
+ ```
+ > After training for 100 epochs, the checkpoint will be stored at:
+ ``Evaluator_272/experiments/temos/EXP1/checkpoints/``.
+
+ ⬇️ We provide the evaluator checkpoint on [Hugging Face](https://huggingface.co/lxxiao/MotionStreamer/tree/main/Evaluator_272), download it following:
+ ```bash
+ python humanml3d_272/prepare/download_evaluator_ckpt.py
+ ```
+ > The downloaded checkpoint will be stored at: ``Evaluator_272/``.
+ 2. Train the Causal TAE:
+ ```bash
+ bash TRAIN_causal_TAE.sh ${NUM_GPUS}
+ ```
+ > e.g., if you have 8 GPUs, run: bash TRAIN_causal_TAE.sh 8
+
+ > The checkpoint will be stored at:
+ ``Experiments/causal_TAE_t2m_272/``
+
+ > Tensorboard visualization:
+ ```bash
+ tensorboard --logdir='Experiments/causal_TAE_t2m_272'
+ ```
+
+ ⬇️ We provide the Causal TAE checkpoint on [Hugging Face](https://huggingface.co/lxxiao/MotionStreamer/tree/main/Causal_TAE), download it following:
+ ```bash
+ python humanml3d_272/prepare/download_Causal_TAE_t2m_272_ckpt.py
+ ```
+
+ 3. Train the text to motion model:
+ > We provide scripts to train the original text to motion generation model with llama blocks, Two-Forward strategy and QK-Norm, using the motion latents encoded by the Causal TAE (trained in the first stage).
+
+ 3.1 Get motion latents:
+ ```bash
+ python get_latent.py --resume-pth Causal_TAE/net_last.pth --latent_dir humanml3d_272/t2m_latents
+ ```
+ 3.2 Download the [sentence-T5-XXL model](https://huggingface.co/sentence-transformers/sentence-t5-xxl/tree/main) from Hugging Face:
+ ```bash
+ huggingface-cli download --resume-download sentence-transformers/sentence-t5-xxl --local-dir sentencet5-xxl/
+ ```
+ 3.3 Train the text to motion generation model:
+ ```bash
+ bash TRAIN_t2m.sh ${NUM_GPUS}
+ ```
+ > e.g., if you have 8 GPUs, run: bash TRAIN_t2m.sh 8
+
+ > The checkpoint will be stored at:
+ ``Experiments/t2m_model/``
+
+ > Tensorboard visualization:
+ ```bash
+ tensorboard --logdir='Experiments/t2m_model'
+ ```
+
+ ⬇️ We provide the text to motion model checkpoint on [Hugging Face](https://huggingface.co/lxxiao/MotionStreamer/tree/main/Experiments/t2m_model), download it following:
+ ```bash
+ python humanml3d_272/prepare/download_t2m_model_ckpt.py
+ ```
+
+ 4. Train the streaming motion generation model (MotionStreamer):
+ > We provide scripts to train the streaming motion generation model (MotionStreamer) with llama blocks, Two-Forward strategy and QK-Norm, using the motion latents encoded by the Causal TAE (a new Causal TAE needs to be trained on both HumanML3D-272 and BABEL-272 data).
+
+ 4.1 Train a Causal TAE using both HumanML3D-272 and BABEL-272 data:
+ ```bash
+ bash TRAIN_causal_TAE.sh ${NUM_GPUS} t2m_babel_272
+ ```
+ > e.g., if you have 8 GPUs, run: bash TRAIN_causal_TAE.sh 8 t2m_babel_272
+
+ > The checkpoint will be stored at:
+ ``Experiments/causal_TAE_t2m_babel_272/``
+
+ > Tensorboard visualization:
+ ```bash
+ tensorboard --logdir='Experiments/causal_TAE_t2m_babel_272'
+ ```
+
+ ⬇️ We provide the Causal TAE checkpoint trained using both HumanML3D-272 and BABEL-272 data on [Hugging Face](https://huggingface.co/lxxiao/MotionStreamer/tree/main/Causal_TAE_t2m_babel), download it following:
+ ```bash
+ python humanml3d_272/prepare/download_Causal_TAE_t2m_babel_272_ckpt.py
+ ```
+
+ 4.2 Get motion latents of both HumanML3D-272 and the processed BABEL-272-stream dataset:
+ ```bash
+ python get_latent.py --resume-pth Causal_TAE_t2m_babel/net_last.pth --latent_dir babel_272_stream/t2m_babel_latents --dataname t2m_babel_272
+ ```
+
+ 4.3 Train the MotionStreamer model:
+ ```bash
+ bash TRAIN_motionstreamer.sh ${NUM_GPUS}
+ ```
+ > e.g., if you have 8 GPUs, run: bash TRAIN_motionstreamer.sh 8
+
+ > The checkpoint will be stored at:
+ ``Experiments/motionstreamer_model/``
+
+ > Tensorboard visualization:
+ ```bash
+ tensorboard --logdir='Experiments/motionstreamer_model'
+ ```
+
+ ## 📍 Evaluation
+
+ 1. Evaluate the metrics of the processed 272-dim [HumanML3D](https://github.com/EricGuo5513/HumanML3D) dataset:
+ ```bash
+ bash EVAL_GT.sh
+ ```
+ (FID, R@1, R@2, R@3, Diversity and MM-Dist (Matching Score) are reported.)
+
+ 2. Evaluate the metrics of the Causal TAE:
+ ```bash
+ bash EVAL_causal_TAE.sh
+ ```
+ (FID and MPJPE (mm) are reported.)
+
+ 3. Evaluate the metrics of the text to motion model:
+ ```bash
+ bash EVAL_t2m.sh
+ ```
+ (FID, R@1, R@2, R@3, Diversity and MM-Dist (Matching Score) are reported.)
+
+
+ ## 🎬 Demo Inference
+
+ 1. Inference of the text to motion model:
+ > [Option 1] Recover from joint position
+ ```bash
+ python demo_t2m.py --text 'a person is walking like a mummy.' --mode pos --resume-pth Causal_TAE/net_last.pth --resume-trans Experiments/t2m_model/latest.pth
+ ```
+ > [Option 2] Recover from joint rotation
+ ```bash
+ python demo_t2m.py --text 'a person is walking like a mummy.' --mode rot --resume-pth Causal_TAE/net_last.pth --resume-trans Experiments/t2m_model/latest.pth
+ ```
+ > In our 272-dim representation, Inverse Kinematics (IK) is not needed.
+ > For further conversion to BVH format, please refer to [this repo](https://github.com/Li-xingXiao/272-dim-Motion-Representation?tab=readme-ov-file#6-representation_272-to-bvh-conversion-optional) (Step 6: Representation_272 to BVH conversion). The BVH format of motion animation can be visualized and edited in [Blender](https://www.blender.org/features/animation/).
+
+
+
+ ## 🌹 Acknowledgement
+ This repository builds upon the following awesome datasets and projects:
+ - [272-dim-Motion-Representation](https://github.com/Li-xingXiao/272-dim-Motion-Representation)
+ - [AMASS](https://amass.is.tue.mpg.de/index.html)
+ - [HumanML3D](https://github.com/EricGuo5513/HumanML3D)
+ - [T2M-GPT](https://github.com/Mael-zys/T2M-GPT)
+ - [TMR](https://github.com/Mathux/TMR)
+ - [OpenTMA](https://github.com/LinghaoChan/OpenTMA)
+ - [Sigma-VAE](https://github.com/orybkin/sigma-vae-pytorch)
+ - [Scamo](https://github.com/shunlinlu/ScaMo_code)
+
+ ## 🤝🏼 Citation
+ If our project is helpful for your research, please consider citing:
+ ```
+ @article{xiao2025motionstreamer,
+   title={MotionStreamer: Streaming Motion Generation via Diffusion-based Autoregressive Model in Causal Latent Space},
+   author={Xiao, Lixing and Lu, Shunlin and Pi, Huaijin and Fan, Ke and Pan, Liang and Zhou, Yueer and Feng, Ziyong and Zhou, Xiaowei and Peng, Sida and Wang, Jingbo},
+   journal={arXiv preprint arXiv:2503.15451},
+   year={2025}
+ }
+ ```
+
+ ## Star History
+
+ [![Star History Chart](https://api.star-history.com/svg?repos=zju3dv/MotionStreamer&type=Date)](https://www.star-history.com/#zju3dv/MotionStreamer&Date)
TRAIN_causal_TAE.sh ADDED
@@ -0,0 +1,22 @@
+ NUM_GPUS=${1:-1} # default: 1 GPU
+ dataset_name=${2:-t2m_272} # default: t2m_272, options: t2m_272, t2m_babel_272
+
+ BATCH_SIZE=$((128 / NUM_GPUS))
+
+ echo "Using $NUM_GPUS GPUs, each with a batch size of $BATCH_SIZE"
+
+ accelerate launch --num_processes $NUM_GPUS train_causal_TAE.py \
+ --batch-size $BATCH_SIZE \
+ --lr 0.00005 \
+ --total-iter 2000000 \
+ --lr-scheduler 1900000 \
+ --down-t 2 \
+ --depth 3 \
+ --dilation-growth-rate 3 \
+ --out-dir Experiments \
+ --dataname $dataset_name \
+ --exp-name causal_TAE_${dataset_name} \
+ --root_loss 7.0 \
+ --latent_dim 16 \
+ --hidden_size 1024 \
+ --num_gpus $NUM_GPUS
TRAIN_evaluator_272.sh ADDED
@@ -0,0 +1,6 @@
+ export HF_ENDPOINT=https://hf-mirror.com
+ cd Evaluator_272
+ huggingface-cli download --resume-download distilbert/distilbert-base-uncased --local-dir ./deps/distilbert-base-uncased
+ ln -s ../humanml3d_272 ./datasets/humanml3d_272
+ python -m train --cfg configs/configs_evaluator_272/H3D-TMR.yaml --cfg_assets configs/assets.yaml --batch_size 256 --nodebug
+ cd ..
TRAIN_motionstreamer.sh ADDED
@@ -0,0 +1,16 @@
+ NUM_GPUS=${1:-1} # default: 1 GPU
+
+ BATCH_SIZE=$((30 / NUM_GPUS))
+
+ echo "Using $NUM_GPUS GPUs, each with a batch size of $BATCH_SIZE"
+
+ accelerate launch --num_processes $NUM_GPUS train_motionstreamer.py \
+ --batch-size $BATCH_SIZE \
+ --lr 0.0001 \
+ --total-iter 200000 \
+ --out-dir Experiments \
+ --exp-name motionstreamer_model \
+ --dataname t2m_babel_272 \
+ --latent_dir babel_272_stream/t2m_babel_latents \
+ --num_gpus $NUM_GPUS \
+ --resume-trans Experiments/motionstreamer_model/100k.pth
TRAIN_t2m.sh ADDED
@@ -0,0 +1,15 @@
+ NUM_GPUS=${1:-1} # default: 1 GPU
+
+ BATCH_SIZE=$((256 / NUM_GPUS))
+
+ echo "Using $NUM_GPUS GPUs, each with a batch size of $BATCH_SIZE"
+
+ accelerate launch --num_processes $NUM_GPUS train_t2m.py \
+ --batch-size $BATCH_SIZE \
+ --lr 0.0001 \
+ --total-iter 100000 \
+ --out-dir Experiments \
+ --exp-name t2m_model \
+ --dataname t2m_272 \
+ --latent_dir humanml3d_272/t2m_latents \
+ --num_gpus $NUM_GPUS
assets/teaser.jpg ADDED

Git LFS Details

  • SHA256: 7958c8564ae20e48165890a08d21d1b63d2a6ce94fed017fb7b5504286f0b5da
  • Pointer size: 131 Bytes
  • Size of remote file: 751 kB
babel_272/.gitattributes ADDED
@@ -0,0 +1,59 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.lz4 filter=lfs diff=lfs merge=lfs -text
+ *.mds filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ # Audio files - uncompressed
+ *.pcm filter=lfs diff=lfs merge=lfs -text
+ *.sam filter=lfs diff=lfs merge=lfs -text
+ *.raw filter=lfs diff=lfs merge=lfs -text
+ # Audio files - compressed
+ *.aac filter=lfs diff=lfs merge=lfs -text
+ *.flac filter=lfs diff=lfs merge=lfs -text
+ *.mp3 filter=lfs diff=lfs merge=lfs -text
+ *.ogg filter=lfs diff=lfs merge=lfs -text
+ *.wav filter=lfs diff=lfs merge=lfs -text
+ # Image files - uncompressed
+ *.bmp filter=lfs diff=lfs merge=lfs -text
+ *.gif filter=lfs diff=lfs merge=lfs -text
+ *.png filter=lfs diff=lfs merge=lfs -text
+ *.tiff filter=lfs diff=lfs merge=lfs -text
+ # Image files - compressed
+ *.jpg filter=lfs diff=lfs merge=lfs -text
+ *.jpeg filter=lfs diff=lfs merge=lfs -text
+ *.webp filter=lfs diff=lfs merge=lfs -text
+ # Video files - compressed
+ *.mp4 filter=lfs diff=lfs merge=lfs -text
+ *.webm filter=lfs diff=lfs merge=lfs -text
babel_272/README.md ADDED
@@ -0,0 +1,34 @@
+ ---
+ license: apache-2.0
+ ---
+ ## 🚀 Dataset Usage
+ To facilitate researchers, we provide the processed 272-dim Motion Representation of the [BABEL](https://babel.is.tue.mpg.de/) dataset in this Hugging Face repo.
+
+ Motions are resampled to 30 FPS.
+
+ NOTE: ``t2m_babel_mean_std/`` contains the joint mean and std of both the HumanML3D and BABEL datasets for joint training of the proposed [Causal TAE](https://github.com/zju3dv/MotionStreamer/blob/main/TRAIN_causal_TAE.sh).
+
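Editor's note: as a brief illustration (not part of this dataset card), here is a minimal sketch of how these statistics would typically be applied; the `motion_data/000000.npy` file name follows the 272-dim dataset layout shown in the main README, and the exact normalization used in the training code is an assumption here:

```python
import numpy as np

# Joint HumanML3D-272 + BABEL-272 statistics shipped with this dataset.
mean = np.load('t2m_babel_mean_std/Mean.npy')  # shape (272,)
std = np.load('t2m_babel_mean_std/Std.npy')    # shape (272,)

motion = np.load('motion_data/000000.npy')     # (num_frames, 272) motion features at 30 FPS

normalized = (motion - mean) / std   # features the Causal TAE would consume (assumed)
recovered = normalized * std + mean  # de-normalization, as done in the demo script
```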
+ ❗️❗️❗️ The processed data is solely for academic purposes. Make sure you read through the [BABEL License](https://babel.is.tue.mpg.de/license.html).
+
+ ## 📖 Paper & Project Page & Code
+ * [Arxiv Paper](https://arxiv.org/abs/2503.15451)
+ * [Project Page](https://zju3dv.github.io/MotionStreamer/)
+ * [Code](https://github.com/zju3dv/MotionStreamer)
+
+ ## 🏃 Processing script
+ For more details of how to obtain the 272-dim motion representation, as well as other useful tools (e.g., Visualization and Conversion to BVH format), please refer to our [GitHub repo](https://github.com/Li-xingXiao/272-dim-Motion-Representation).
+
+ ## 🌹 Acknowledgement
+ This repository builds upon the following awesome datasets and projects:
+ - [BABEL](https://babel.is.tue.mpg.de/)
+
+ ## 🤝🏼 Citation
+ If our project is helpful for your research, please consider citing:
+ ```
+ @article{xiao2025motionstreamer,
+   title={MotionStreamer: Streaming Motion Generation via Diffusion-based Autoregressive Model in Causal Latent Space},
+   author={Xiao, Lixing and Lu, Shunlin and Pi, Huaijin and Fan, Ke and Pan, Liang and Zhou, Yueer and Feng, Ziyong and Zhou, Xiaowei and Peng, Sida and Wang, Jingbo},
+   journal={arXiv preprint arXiv:2503.15451},
+   year={2025}
+ }
+ ```
babel_272/motion_data.zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:03ecf1eefd24f828e0717dd0d7d05ad2ad139d79fd09d59baeab711895311525
+ size 8093667470
babel_272/split/train.txt ADDED
The diff for this file is too large to render. See raw diff
 
babel_272/split/val.txt ADDED
The diff for this file is too large to render. See raw diff
 
babel_272/t2m_babel_mean_std/Mean.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d0f782aecd1c0479c517aee68959a26f55ddf1f34bb2344b4d9c365c73f3ed80
3
+ size 2304
babel_272/t2m_babel_mean_std/Std.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:de477d76de0b03b71779dea84964ccf59c1f53ad49ebef7d99202c4ff19a2ff5
3
+ size 2304
babel_272/texts.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:39b0a560144db9d4a261462d21f0eeedefc3f0bd1bb664cb3ec819c17ebead52
3
+ size 38968869
babel_272_stream/.gitattributes ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.lz4 filter=lfs diff=lfs merge=lfs -text
12
+ *.mds filter=lfs diff=lfs merge=lfs -text
13
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
14
+ *.model filter=lfs diff=lfs merge=lfs -text
15
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
16
+ *.npy filter=lfs diff=lfs merge=lfs -text
17
+ *.npz filter=lfs diff=lfs merge=lfs -text
18
+ *.onnx filter=lfs diff=lfs merge=lfs -text
19
+ *.ot filter=lfs diff=lfs merge=lfs -text
20
+ *.parquet filter=lfs diff=lfs merge=lfs -text
21
+ *.pb filter=lfs diff=lfs merge=lfs -text
22
+ *.pickle filter=lfs diff=lfs merge=lfs -text
23
+ *.pkl filter=lfs diff=lfs merge=lfs -text
24
+ *.pt filter=lfs diff=lfs merge=lfs -text
25
+ *.pth filter=lfs diff=lfs merge=lfs -text
26
+ *.rar filter=lfs diff=lfs merge=lfs -text
27
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
28
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
29
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
30
+ *.tar filter=lfs diff=lfs merge=lfs -text
31
+ *.tflite filter=lfs diff=lfs merge=lfs -text
32
+ *.tgz filter=lfs diff=lfs merge=lfs -text
33
+ *.wasm filter=lfs diff=lfs merge=lfs -text
34
+ *.xz filter=lfs diff=lfs merge=lfs -text
35
+ *.zip filter=lfs diff=lfs merge=lfs -text
36
+ *.zst filter=lfs diff=lfs merge=lfs -text
37
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
38
+ # Audio files - uncompressed
39
+ *.pcm filter=lfs diff=lfs merge=lfs -text
40
+ *.sam filter=lfs diff=lfs merge=lfs -text
41
+ *.raw filter=lfs diff=lfs merge=lfs -text
42
+ # Audio files - compressed
43
+ *.aac filter=lfs diff=lfs merge=lfs -text
44
+ *.flac filter=lfs diff=lfs merge=lfs -text
45
+ *.mp3 filter=lfs diff=lfs merge=lfs -text
46
+ *.ogg filter=lfs diff=lfs merge=lfs -text
47
+ *.wav filter=lfs diff=lfs merge=lfs -text
48
+ # Image files - uncompressed
49
+ *.bmp filter=lfs diff=lfs merge=lfs -text
50
+ *.gif filter=lfs diff=lfs merge=lfs -text
51
+ *.png filter=lfs diff=lfs merge=lfs -text
52
+ *.tiff filter=lfs diff=lfs merge=lfs -text
53
+ # Image files - compressed
54
+ *.jpg filter=lfs diff=lfs merge=lfs -text
55
+ *.jpeg filter=lfs diff=lfs merge=lfs -text
56
+ *.webp filter=lfs diff=lfs merge=lfs -text
57
+ # Video files - compressed
58
+ *.mp4 filter=lfs diff=lfs merge=lfs -text
59
+ *.webm filter=lfs diff=lfs merge=lfs -text
babel_272_stream/README.md ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ ---
4
+ ## 🚀 Dataset Usage
5
+ To facilitate research, we provide the processed streaming 272-dim motion representation of the [BABEL](https://babel.is.tue.mpg.de/) dataset in this Hugging Face repo.
6
+
7
+ NOTE: We process the original BABEL dataset to support training of streaming motion generation.
8
+ e.g., if a motion sequence A is annotated as (A1, A2, A3, A4) in the BABEL dataset, each subsequence has a text description: (A1_t, A2_t, A3_t, A4_t).
9
+
10
+ Then, our BABEL-stream is constructed as:
11
+
12
+ seq1: (A1, A2) --- seq1_text: (A1_t*A2_t#A1_length)
13
+
14
+ seq2: (A2, A3) --- seq2_text: (A2_t*A3_t#A2_length)
15
+
16
+ seq3: (A3, A4) --- seq3_text: (A3_t*A4_t#A3_length)
17
+
18
+ Here, * and # are separator symbols, and A1_length is the number of frames of subsequence A1.
19
+
20
+ Motions are resampled to 30 FPS.
21
+
22
+ The dataset is organized as:
23
+ ```
24
+ ./
25
+ ├── train_stream
26
+ ├── seq1.npy
27
+ ...
28
+ ├── train_stream_text
29
+ ├── seq1.txt
30
+ ...
31
+ ├── val_stream
32
+ ├── seq1.npy
33
+ ...
34
+ ├── val_stream_text
35
+ ├── seq1.txt
36
+ ...
37
+ ```
38
+
39
+ ❗️❗️❗️ The processed data is solely for academic purposes. Make sure you read through the [BABEL License](https://babel.is.tue.mpg.de/license.html).
40
+
41
+ ## 📖 Paper & Project Page & Code
42
+ * [arXiv Paper](https://arxiv.org/abs/2503.15451)
43
+ * [Project Page](https://zju3dv.github.io/MotionStreamer/)
44
+ * [Code](https://github.com/zju3dv/MotionStreamer)
45
+
46
+ ## 🏃 Processing script
47
+ For details on how to obtain the 272-dim motion representation, as well as other useful tools (e.g., visualization and conversion to BVH format), please refer to our [GitHub repo](https://github.com/Li-xingXiao/272-dim-Motion-Representation).
48
+
49
+ ## 🌹 Acknowledgement
50
+ This repository builds upon the following awesome datasets and projects:
51
+ - [BABEL](https://babel.is.tue.mpg.de/)
52
+
53
+ ## 🤝🏼 Citation
54
+ If our project is helpful for your research, please consider citing:
55
+ ```
56
+ @article{xiao2025motionstreamer,
57
+ title={MotionStreamer: Streaming Motion Generation via Diffusion-based Autoregressive Model in Causal Latent Space},
58
+ author={Xiao, Lixing and Lu, Shunlin and Pi, Huaijin and Fan, Ke and Pan, Liang and Zhou, Yueer and Feng, Ziyong and Zhou, Xiaowei and Peng, Sida and Wang, Jingbo},
59
+ journal={arXiv preprint arXiv:2503.15451},
60
+ year={2025}
61
+ }
62
+ ```
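
Given the `A1_t*A2_t#A1_length` annotation format and the directory layout described in this README, here is a minimal, hedged sketch of how one stream annotation could be parsed; the function name and file path are illustrative, not part of the released code.

```python
# Minimal sketch, assuming each *_stream_text/*.txt file contains one line of the
# form "A1_t*A2_t#A1_length" as described in the README above.
def parse_stream_annotation(path: str):
    with open(path) as f:
        line = f.read().strip()
    texts, a1_length = line.rsplit("#", 1)          # "#" separates off the first subsequence's frame count
    first_text, second_text = texts.split("*", 1)   # "*" separates the two subsequence descriptions
    return first_text, second_text, int(a1_length)

# Illustrative usage on a hypothetical file extracted from val_stream_text.zip:
first_text, second_text, a1_frames = parse_stream_annotation(
    "babel_272_stream/val_stream_text/seq1.txt"
)
```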
babel_272_stream/train_stream.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:35db924d754e321f673a72c22b80d5d725f55d74151fc34351f554ef6bf33a2e
3
+ size 6901914721
babel_272_stream/train_stream_text.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d46561fcaf62738b1d08cf54a851ffecb3fb7a154f9663b199dfa83f0d677046
3
+ size 4746908
babel_272_stream/val_stream.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0564c64ce642330222b3ed83d031f5f3765c6979a82f17a2259e07d80d0ff78a
3
+ size 2580199524
babel_272_stream/val_stream_text.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ba646f2836f03a7fa1a5470aa8c098d1b0e446872d5bf53b8b42283e5c1f368b
3
+ size 1685986
body_models/human_model_files/mano/MANO_LEFT.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c4022f7083f2ca7c78b2b3d595abbab52debd32b09d372b16923a801f0ea6a30
3
+ size 3821391
body_models/human_model_files/mano/MANO_RIGHT.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:45d60aa3b27ef9107a7afd4e00808f307fd91111e1cfa35afd5c4a62de264767
3
+ size 3821356
body_models/human_model_files/smpl/J_regressor_extra.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc968ea4f9855571e82f90203280836b01f13ee42a8e1b89d8d580b801242a89
3
+ size 496160
body_models/human_model_files/smpl/SMPL_FEMALE.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a583c1b98e4afc19042641f1bae5cd8a1f712a6724886291a7627ec07acd408d
3
+ size 39056454
body_models/human_model_files/smpl/SMPL_MALE.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e8c0bbbbc635dcb166ed29c303fb4bef16ea5f623e5a89263495a9e403575bd
3
+ size 39056404
body_models/human_model_files/smpl/SMPL_NEUTRAL.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:98e65c74ad9b998783132f00880d1025a8d64b158e040e6ef13a557e5098bc42
3
+ size 39001280
body_models/human_model_files/smpl/VPOSER_CKPT/TR00_004_00_WO_accad.ini ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [All]
2
+ adam_beta1 : 0.9
3
+ base_lr : 0.005
4
+ batch_size : 512
5
+ best_model_fname : None
6
+ cuda_id : 0
7
+ data_shape : [1, 21, 3]
8
+ dataset_dir : None
9
+ display_model_gender : male
10
+ expr_code : 004_00_WO_accad
11
+ fp_precision : 32
12
+ ip_avoid : False
13
+ kl_coef : 0.005
14
+ latentD : 32
15
+ log_every_epoch : 2
16
+ model_type : smpl
17
+ n_workers : 10
18
+ num_bodies_to_display : 10
19
+ num_epochs : 100
20
+ num_neurons : 512
21
+ reg_coef : 0.0001
22
+ remove_Zrot : True
23
+ seed : 4815
24
+ sm_coef : 0.01
25
+ test_only : False
26
+ try_num : 0
27
+ use_cont_repr : True
28
+ verbosity : 0
29
+ work_dir : None
body_models/human_model_files/smpl/VPOSER_CKPT/snapshots/._TR00_E096.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4e2615cd1d2e78cdfac7169c6182a7352d02992336dad7329d3d97f6947fb515
3
+ size 4096
body_models/human_model_files/smpl/VPOSER_CKPT/snapshots/TR00_E096.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e4ad40f922606989939d3fae6eadf82d1a8e98112dffb6e39d89d6471270d5c
3
+ size 2702962
body_models/human_model_files/smpl/VPOSER_CKPT/vposer_smpl.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ #
3
+ # Copyright (C) 2019 Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG),
4
+ # acting on behalf of its Max Planck Institute for Intelligent Systems and the
5
+ # Max Planck Institute for Biological Cybernetics. All rights reserved.
6
+ #
7
+ # Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is holder of all proprietary rights
8
+ # on this computer program. You can only use this computer program if you have closed a license agreement
9
+ # with MPG or you get the right to use the computer program from someone who is authorized to grant you that right.
10
+ # Any use of the computer program without a valid license is prohibited and liable to prosecution.
11
+ # Contact: [email protected]
12
+ #
13
+ #
14
+ # If you use this code in a research publication please consider citing the following:
15
+ #
16
+ # Expressive Body Capture: 3D Hands, Face, and Body from a Single Image <https://arxiv.org/abs/1904.05866>
17
+ # AMASS: Archive of Motion Capture as Surface Shapes <https://arxiv.org/abs/1904.03278>
18
+ #
19
+ #
20
+ # Code Developed by:
21
+ # Nima Ghorbani <https://www.linkedin.com/in/nghorbani/>
22
+ # Vassilis Choutas <https://ps.is.tuebingen.mpg.de/employees/vchoutas> for ContinousRotReprDecoder
23
+ #
24
+ # 2018.01.02
25
+
26
+ '''
27
+ A human body pose prior built with Auto-Encoding Variational Bayes
28
+ '''
29
+
30
+ __all__ = ['VPoser']
31
+
32
+ import os, sys, shutil
33
+
34
+ import torch
35
+
36
+ from torch import nn
37
+ from torch.nn import functional as F
38
+
39
+ import numpy as np
40
+
41
+ import torchgeometry as tgm
42
+
43
+ class ContinousRotReprDecoder(nn.Module):
44
+ def __init__(self):
45
+ super(ContinousRotReprDecoder, self).__init__()
46
+
47
+ def forward(self, module_input):
48
+ reshaped_input = module_input.view(-1, 3, 2)
49
+
50
+ b1 = F.normalize(reshaped_input[:, :, 0], dim=1)
51
+
52
+ dot_prod = torch.sum(b1 * reshaped_input[:, :, 1], dim=1, keepdim=True)
53
+ b2 = F.normalize(reshaped_input[:, :, 1] - dot_prod * b1, dim=-1)
54
+ b3 = torch.cross(b1, b2, dim=1)
55
+
56
+ return torch.stack([b1, b2, b3], dim=-1)
57
+
58
+
59
+ class VPoser(nn.Module):
60
+ def __init__(self, num_neurons, latentD, data_shape, use_cont_repr=True):
61
+ super(VPoser, self).__init__()
62
+
63
+ self.latentD = latentD
64
+ self.use_cont_repr = use_cont_repr
65
+
66
+ n_features = np.prod(data_shape)
67
+ self.num_joints = data_shape[1]
68
+
69
+ self.bodyprior_enc_bn1 = nn.BatchNorm1d(n_features)
70
+ self.bodyprior_enc_fc1 = nn.Linear(n_features, num_neurons)
71
+ self.bodyprior_enc_bn2 = nn.BatchNorm1d(num_neurons)
72
+ self.bodyprior_enc_fc2 = nn.Linear(num_neurons, num_neurons)
73
+ self.bodyprior_enc_mu = nn.Linear(num_neurons, latentD)
74
+ self.bodyprior_enc_logvar = nn.Linear(num_neurons, latentD)
75
+ self.dropout = nn.Dropout(p=.1, inplace=False)
76
+
77
+ self.bodyprior_dec_fc1 = nn.Linear(latentD, num_neurons)
78
+ self.bodyprior_dec_fc2 = nn.Linear(num_neurons, num_neurons)
79
+
80
+ if self.use_cont_repr:
81
+ self.rot_decoder = ContinousRotReprDecoder()
82
+
83
+ self.bodyprior_dec_out = nn.Linear(num_neurons, self.num_joints* 6)
84
+
85
+ def encode(self, Pin):
86
+ '''
87
+
88
+ :param Pin: Nx(numjoints*3)
89
+ :param rep_type: 'matrot'/'aa' for matrix rotations or axis-angle
90
+ :return:
91
+ '''
92
+ Xout = Pin.view(Pin.size(0), -1) # flatten input
93
+ Xout = self.bodyprior_enc_bn1(Xout)
94
+
95
+ Xout = F.leaky_relu(self.bodyprior_enc_fc1(Xout), negative_slope=.2)
96
+ Xout = self.bodyprior_enc_bn2(Xout)
97
+ Xout = self.dropout(Xout)
98
+ Xout = F.leaky_relu(self.bodyprior_enc_fc2(Xout), negative_slope=.2)
99
+ return torch.distributions.normal.Normal(self.bodyprior_enc_mu(Xout), F.softplus(self.bodyprior_enc_logvar(Xout)))
100
+
101
+ def decode(self, Zin, output_type='matrot'):
102
+ assert output_type in ['matrot', 'aa']
103
+
104
+ Xout = F.leaky_relu(self.bodyprior_dec_fc1(Zin), negative_slope=.2)
105
+ Xout = self.dropout(Xout)
106
+ Xout = F.leaky_relu(self.bodyprior_dec_fc2(Xout), negative_slope=.2)
107
+ Xout = self.bodyprior_dec_out(Xout)
108
+ if self.use_cont_repr:
109
+ Xout = self.rot_decoder(Xout)
110
+ else:
111
+ Xout = torch.tanh(Xout)
112
+
113
+ Xout = Xout.view([-1, 1, self.num_joints, 9])
114
+ if output_type == 'aa': return VPoser.matrot2aa(Xout)
115
+ return Xout
116
+
117
+ def forward(self, Pin, input_type='matrot', output_type='matrot'):
118
+ '''
119
+
120
+ :param Pin: aa: Nx1xnum_jointsx3 / matrot: Nx1xnum_jointsx9
121
+ :param input_type: matrot / aa for matrix rotations or axis angles
122
+ :param output_type: matrot / aa
123
+ :return:
124
+ '''
125
+ assert output_type in ['matrot', 'aa']
126
+ # if input_type == 'aa': Pin = VPoser.aa2matrot(Pin)
127
+ q_z = self.encode(Pin)
128
+ q_z_sample = q_z.rsample()
129
+ Prec = self.decode(q_z_sample)
130
+ if output_type == 'aa': Prec = VPoser.matrot2aa(Prec)
131
+
132
+ #return Prec, q_z.mean, q_z.sigma
133
+ return {'pose':Prec, 'mean':q_z.mean, 'std':q_z.scale}
134
+
135
+ def sample_poses(self, num_poses, output_type='aa', seed=None):
136
+ np.random.seed(seed)
137
+ dtype = self.bodyprior_dec_fc1.weight.dtype
138
+ device = self.bodyprior_dec_fc1.weight.device
139
+ self.eval()
140
+ with torch.no_grad():
141
+ Zgen = torch.tensor(np.random.normal(0., 1., size=(num_poses, self.latentD)), dtype=dtype).to(device)
142
+ return self.decode(Zgen, output_type=output_type)
143
+
144
+ @staticmethod
145
+ def matrot2aa(pose_matrot):
146
+ '''
147
+ :param pose_matrot: Nx1xnum_jointsx9
148
+ :return: Nx1xnum_jointsx3
149
+ '''
150
+ batch_size = pose_matrot.size(0)
151
+ homogen_matrot = F.pad(pose_matrot.view(-1, 3, 3), [0,1])
152
+ pose = tgm.rotation_matrix_to_angle_axis(homogen_matrot).view(batch_size, 1, -1, 3).contiguous()
153
+ return pose
154
+
155
+ @staticmethod
156
+ def aa2matrot(pose):
157
+ '''
158
+ :param Nx1xnum_jointsx3
159
+ :return: pose_matrot: Nx1xnum_jointsx9
160
+ '''
161
+ batch_size = pose.size(0)
162
+ pose_body_matrot = tgm.angle_axis_to_rotation_matrix(pose.reshape(-1, 3))[:, :3, :3].contiguous().view(batch_size, 1, -1, 9)
163
+ return pose_body_matrot
164
+
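
The `VPoser` module added above exposes `sample_poses` for drawing random body poses from the learned latent prior. The following is a hedged usage sketch, not the official loading utility: the hyperparameters are taken from `TR00_004_00_WO_accad.ini` in the same checkpoint folder, while the checkpoint key layout and loading call are assumptions that may need adaptation.

```python
import torch
from vposer_smpl import VPoser  # the module defined above

# Hyperparameters from TR00_004_00_WO_accad.ini: num_neurons=512, latentD=32, data_shape=[1, 21, 3].
vposer = VPoser(num_neurons=512, latentD=32, data_shape=(1, 21, 3))

# Hypothetical checkpoint loading; the actual key layout of TR00_E096.pt may differ.
state = torch.load(
    "body_models/human_model_files/smpl/VPOSER_CKPT/snapshots/TR00_E096.pt",
    map_location="cpu",
)
vposer.load_state_dict(state, strict=False)
vposer.eval()

# Sample 4 random body poses in axis-angle form; expected shape: (4, 1, 21, 3).
poses = vposer.sample_poses(num_poses=4, output_type="aa", seed=0)
```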
body_models/human_model_files/smplx/MANO_SMPLX_vertex_ids.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e5abe70b6574de25470475091e8008314a5b90127eb48c3e63bfa0adf8c04dcf
3
+ size 13535
body_models/human_model_files/smplx/SMPL-X__FLAME_vertex_ids.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e70cdc3659aae699b9732e8dd4af49106310c69b90dc83d9f73e96dbf871e49
3
+ size 40312
body_models/human_model_files/smplx/SMPLX_FEMALE.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:05e37bd22dff93362c92cea9c791c62a2d4d7e8d44b234f3e41be0020fa1c256
3
+ size 108532279
body_models/human_model_files/smplx/SMPLX_FEMALE.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b870ce1fd05b46dd81e2de6269b2955667c931c8594999eb22eeb489b00e2c1f
3
+ size 146809856
body_models/human_model_files/smplx/SMPLX_MALE.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:79360d466228bec1b9f9d922ea48df718a0a09bccddace18cfec98b0edd68b73
3
+ size 108491578
body_models/human_model_files/smplx/SMPLX_MALE.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d4f94c40261ac4762bb9b09142d11bf47e1cc3d6b49b6bbcc4a2731451bf5632
3
+ size 543102085
body_models/human_model_files/smplx/SMPLX_NEUTRAL.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:15eb61ac2f91dcd6e340913e281b2b8a0a910ebe0955af9251b9bb99fd11d02b
3
+ size 108490191
body_models/human_model_files/smplx/SMPLX_NEUTRAL.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b0279321ea9bd3cec5541c03b1f1c9ab9d197896943035c3abeef47f699bc5e
3
+ size 542798306
body_models/human_model_files/smplx/SMPLX_NEUTRAL_NEW.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:248e277858008fea271d1ea3874eed2310dfd57fa160ea07c467cf6a061e0ecd
3
+ size 167260951
body_models/human_model_files/smplx/SMPLX_NEUTRAL_NEW.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ecb628fadd2b40f42cd39378d1e429cd30acc0bab6104676898d4374b804163d
3
+ size 167261087
body_models/human_model_files/smplx/SMPLX_NEUTRAL_NEW_WiFlame.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9047e853fc08caa5cef648aa691bf80cf423ca5f0693d825c029a6a7b0bedc51
3
+ size 215482118