Upload folder using huggingface_hub
Browse files- .gitattributes +1 -0
- README.md +89 -0
- config.json +135 -0
- model.safetensors +3 -0
- sample/diving.mp4 +3 -0
- video_preprocessor_config.json +79 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
sample/diving.mp4 filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: mit
|
| 3 |
+
pipeline_tag: video-classification
|
| 4 |
+
tags:
|
| 5 |
+
- video
|
| 6 |
+
library_name: transformers
|
| 7 |
+
---
|
| 8 |
+
|
| 9 |
+
# V-JEPA 2
|
| 10 |
+
|
| 11 |
+
A frontier video understanding model developed by FAIR, Meta, which extends the pretraining objectives of [VJEPA](https://ai.meta.com/blog/v-jepa-yann-lecun-ai-model-video-joint-embedding-predictive-architecture/), resulting in state-of-the-art video understanding capabilities, leveraging data and model sizes at scale.
|
| 12 |
+
The code is released [in this repository](https://github.com/facebookresearch/vjepa2).
|
| 13 |
+
|
| 14 |
+
<div style="background-color: rgba(251, 255, 120, 0.4); padding: 10px; color: black; border-radius: 5px; box-shadow: 0 4px 8px rgba(0,0,0,0.1);">
|
| 15 |
+
💡 This is a V-JEPA 2 model with a video classification head pretrained on the <a href="http://www.svcl.ucsd.edu/projects/resound/dataset.html" style="color: black;">Diving 48</a> dataset.
|
| 16 |
+
</div>
|
| 17 |
+
<br>
|
| 18 |
+
|
| 19 |
+
<img src="https://dl.fbaipublicfiles.com/vjepa2/vjepa2-pretrain.gif">
|
| 20 |
+
|
| 21 |
+
## Installation
|
| 22 |
+
|
| 23 |
+
To run the V-JEPA 2 model, ensure you have installed the latest version of transformers:
|
| 24 |
+
|
| 25 |
+
```bash
|
| 26 |
+
pip install -U git+https://github.com/huggingface/transformers
|
| 27 |
+
```
|
| 28 |
+
|
| 29 |
+
## Video classification code snippet
|
| 30 |
+
|
| 31 |
+
```python
|
| 32 |
+
import torch
|
| 33 |
+
import numpy as np
|
| 34 |
+
|
| 35 |
+
from torchcodec.decoders import VideoDecoder
|
| 36 |
+
from transformers import AutoVideoProcessor, AutoModelForVideoClassification
|
| 37 |
+
|
| 38 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 39 |
+
|
| 40 |
+
# Load model and video preprocessor
|
| 41 |
+
hf_repo = "facebook/vjepa2-vitl-fpc32-256-diving48"
|
| 42 |
+
|
| 43 |
+
model = AutoModelForVideoClassification.from_pretrained(hf_repo).to(device)
|
| 44 |
+
processor = AutoVideoProcessor.from_pretrained(hf_repo)
|
| 45 |
+
|
| 46 |
+
# To load a video, sample the number of frames according to the model.
|
| 47 |
+
video_url = "https://huggingface.co/facebook/vjepa2-vitl-fpc32-256-diving48/resolve/main/sample/diving.mp4"
|
| 48 |
+
vr = VideoDecoder(video_url)
|
| 49 |
+
frame_idx = np.arange(0, model.config.frames_per_clip, 8) # you can define a more complex sampling strategy
|
| 50 |
+
video = vr.get_frames_at(indices=frame_idx).data # frames x channels x height x width
|
| 51 |
+
|
| 52 |
+
# Preprocess and run inference
|
| 53 |
+
inputs = processor(video, return_tensors="pt").to(model.device)
|
| 54 |
+
with torch.no_grad():
|
| 55 |
+
    outputs = model(**inputs)
|
| 56 |
+
logits = outputs.logits
|
| 57 |
+
|
| 58 |
+
print("Top 5 predicted class names:")
|
| 59 |
+
top5_indices = logits.topk(5).indices[0]
|
| 60 |
+
top5_probs = torch.softmax(logits, dim=-1).topk(5).values[0]
|
| 61 |
+
for idx, prob in zip(top5_indices, top5_probs):
|
| 62 |
+
    text_label = model.config.id2label[idx.item()]
|
| 63 |
+
    print(f" - {text_label}: {prob:.2f}")
|
| 64 |
+
```
|
| 65 |
+
Output:
|
| 66 |
+
```
|
| 67 |
+
Top 5 predicted class names:
|
| 68 |
+
- ['Reverse', 'Dive', 'NoTwis', 'PIKE']: 0.52
|
| 69 |
+
- ['Inward', '25som', 'NoTwis', 'PIKE']: 0.12
|
| 70 |
+
- ['Forward', '35som', 'NoTwis', 'PIKE']: 0.07
|
| 71 |
+
- ['Reverse', '25som', 'NoTwis', 'PIKE']: 0.05
|
| 72 |
+
- ['Forward', '25som', '1Twis', 'PIKE']: 0.03
|
| 73 |
+
```
|
| 74 |
+
|
| 75 |
+
## Citation
|
| 76 |
+
|
| 77 |
+
```
|
| 78 |
+
@techreport{assran2025vjepa2,
|
| 79 |
+
title={V-JEPA~2: Self-Supervised Video Models Enable Understanding, Prediction and Planning},
|
| 80 |
+
author={Assran, Mahmoud and Bardes, Adrien and Fan, David and Garrido, Quentin and Howes, Russell and
|
| 81 |
+
Komeili, Mojtaba and Muckley, Matthew and Rizvi, Ammar and Roberts, Claire and Sinha, Koustuv and Zholus, Artem and
|
| 82 |
+
Arnaud, Sergio and Gejji, Abha and Martin, Ada and Robert Hogan, Francois and Dugas, Daniel and
|
| 83 |
+
Bojanowski, Piotr and Khalidov, Vasil and Labatut, Patrick and Massa, Francisco and Szafraniec, Marc and
|
| 84 |
+
Krishnakumar, Kapil and Li, Yong and Ma, Xiaodong and Chandar, Sarath and Meier, Franziska and LeCun, Yann and
|
| 85 |
+
Rabbat, Michael and Ballas, Nicolas},
|
| 86 |
+
institution={FAIR at Meta},
|
| 87 |
+
year={2025}
|
| 88 |
+
}
|
| 89 |
+
```
|
config.json
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"VJEPA2ForVideoClassification"
|
| 4 |
+
],
|
| 5 |
+
"attention_dropout": 0.0,
|
| 6 |
+
"attention_probs_dropout_prob": 0.0,
|
| 7 |
+
"crop_size": 256,
|
| 8 |
+
"drop_path_rate": 0.0,
|
| 9 |
+
"frames_per_clip": 32,
|
| 10 |
+
"hidden_act": "gelu",
|
| 11 |
+
"hidden_dropout_prob": 0.0,
|
| 12 |
+
"hidden_size": 1024,
|
| 13 |
+
"id2label": {
|
| 14 |
+
"0": "['Back', '15som', '05Twis', 'FREE']",
|
| 15 |
+
"1": "['Back', '15som', '15Twis', 'FREE']",
|
| 16 |
+
"2": "['Back', '15som', '25Twis', 'FREE']",
|
| 17 |
+
"3": "['Back', '15som', 'NoTwis', 'PIKE']",
|
| 18 |
+
"4": "['Back', '15som', 'NoTwis', 'TUCK']",
|
| 19 |
+
"5": "['Back', '25som', '15Twis', 'PIKE']",
|
| 20 |
+
"6": "['Back', '25som', '25Twis', 'PIKE']",
|
| 21 |
+
"7": "['Back', '25som', 'NoTwis', 'PIKE']",
|
| 22 |
+
"8": "['Back', '25som', 'NoTwis', 'TUCK']",
|
| 23 |
+
"9": "['Back', '2som', '15Twis', 'FREE']",
|
| 24 |
+
"10": "['Back', '2som', '25Twis', 'FREE']",
|
| 25 |
+
"11": "['Back', '35som', 'NoTwis', 'PIKE']",
|
| 26 |
+
"12": "['Back', '35som', 'NoTwis', 'TUCK']",
|
| 27 |
+
"13": "['Back', '3som', 'NoTwis', 'PIKE']",
|
| 28 |
+
"14": "['Back', '3som', 'NoTwis', 'TUCK']",
|
| 29 |
+
"15": "['Back', 'Dive', 'NoTwis', 'PIKE']",
|
| 30 |
+
"16": "['Back', 'Dive', 'NoTwis', 'TUCK']",
|
| 31 |
+
"17": "['Forward', '15som', '1Twis', 'FREE']",
|
| 32 |
+
"18": "['Forward', '15som', '2Twis', 'FREE']",
|
| 33 |
+
"19": "['Forward', '15som', 'NoTwis', 'PIKE']",
|
| 34 |
+
"20": "['Forward', '1som', 'NoTwis', 'PIKE']",
|
| 35 |
+
"21": "['Forward', '25som', '1Twis', 'PIKE']",
|
| 36 |
+
"22": "['Forward', '25som', '2Twis', 'PIKE']",
|
| 37 |
+
"23": "['Forward', '25som', '3Twis', 'PIKE']",
|
| 38 |
+
"24": "['Forward', '25som', 'NoTwis', 'PIKE']",
|
| 39 |
+
"25": "['Forward', '25som', 'NoTwis', 'TUCK']",
|
| 40 |
+
"26": "['Forward', '35som', 'NoTwis', 'PIKE']",
|
| 41 |
+
"27": "['Forward', '35som', 'NoTwis', 'TUCK']",
|
| 42 |
+
"28": "['Forward', '45som', 'NoTwis', 'TUCK']",
|
| 43 |
+
"29": "['Forward', 'Dive', 'NoTwis', 'PIKE']",
|
| 44 |
+
"30": "['Forward', 'Dive', 'NoTwis', 'STR']",
|
| 45 |
+
"31": "['Inward', '15som', 'NoTwis', 'PIKE']",
|
| 46 |
+
"32": "['Inward', '15som', 'NoTwis', 'TUCK']",
|
| 47 |
+
"33": "['Inward', '25som', 'NoTwis', 'PIKE']",
|
| 48 |
+
"34": "['Inward', '25som', 'NoTwis', 'TUCK']",
|
| 49 |
+
"35": "['Inward', '35som', 'NoTwis', 'TUCK']",
|
| 50 |
+
"36": "['Inward', 'Dive', 'NoTwis', 'PIKE']",
|
| 51 |
+
"37": "['Reverse', '15som', '05Twis', 'FREE']",
|
| 52 |
+
"38": "['Reverse', '15som', '15Twis', 'FREE']",
|
| 53 |
+
"39": "['Reverse', '15som', '25Twis', 'FREE']",
|
| 54 |
+
"40": "['Reverse', '15som', '35Twis', 'FREE']",
|
| 55 |
+
"41": "['Reverse', '15som', 'NoTwis', 'PIKE']",
|
| 56 |
+
"42": "['Reverse', '25som', '15Twis', 'PIKE']",
|
| 57 |
+
"43": "['Reverse', '25som', 'NoTwis', 'PIKE']",
|
| 58 |
+
"44": "['Reverse', '25som', 'NoTwis', 'TUCK']",
|
| 59 |
+
"45": "['Reverse', '35som', 'NoTwis', 'TUCK']",
|
| 60 |
+
"46": "['Reverse', 'Dive', 'NoTwis', 'PIKE']",
|
| 61 |
+
"47": "['Reverse', 'Dive', 'NoTwis', 'TUCK']"
|
| 62 |
+
},
|
| 63 |
+
"image_size": 256,
|
| 64 |
+
"in_chans": 3,
|
| 65 |
+
"initializer_range": 0.02,
|
| 66 |
+
"label2id": {
|
| 67 |
+
"LABEL_0": 0,
|
| 68 |
+
"LABEL_1": 1,
|
| 69 |
+
"LABEL_10": 10,
|
| 70 |
+
"LABEL_11": 11,
|
| 71 |
+
"LABEL_12": 12,
|
| 72 |
+
"LABEL_13": 13,
|
| 73 |
+
"LABEL_14": 14,
|
| 74 |
+
"LABEL_15": 15,
|
| 75 |
+
"LABEL_16": 16,
|
| 76 |
+
"LABEL_17": 17,
|
| 77 |
+
"LABEL_18": 18,
|
| 78 |
+
"LABEL_19": 19,
|
| 79 |
+
"LABEL_2": 2,
|
| 80 |
+
"LABEL_20": 20,
|
| 81 |
+
"LABEL_21": 21,
|
| 82 |
+
"LABEL_22": 22,
|
| 83 |
+
"LABEL_23": 23,
|
| 84 |
+
"LABEL_24": 24,
|
| 85 |
+
"LABEL_25": 25,
|
| 86 |
+
"LABEL_26": 26,
|
| 87 |
+
"LABEL_27": 27,
|
| 88 |
+
"LABEL_28": 28,
|
| 89 |
+
"LABEL_29": 29,
|
| 90 |
+
"LABEL_3": 3,
|
| 91 |
+
"LABEL_30": 30,
|
| 92 |
+
"LABEL_31": 31,
|
| 93 |
+
"LABEL_32": 32,
|
| 94 |
+
"LABEL_33": 33,
|
| 95 |
+
"LABEL_34": 34,
|
| 96 |
+
"LABEL_35": 35,
|
| 97 |
+
"LABEL_36": 36,
|
| 98 |
+
"LABEL_37": 37,
|
| 99 |
+
"LABEL_38": 38,
|
| 100 |
+
"LABEL_39": 39,
|
| 101 |
+
"LABEL_4": 4,
|
| 102 |
+
"LABEL_40": 40,
|
| 103 |
+
"LABEL_41": 41,
|
| 104 |
+
"LABEL_42": 42,
|
| 105 |
+
"LABEL_43": 43,
|
| 106 |
+
"LABEL_44": 44,
|
| 107 |
+
"LABEL_45": 45,
|
| 108 |
+
"LABEL_46": 46,
|
| 109 |
+
"LABEL_47": 47,
|
| 110 |
+
"LABEL_5": 5,
|
| 111 |
+
"LABEL_6": 6,
|
| 112 |
+
"LABEL_7": 7,
|
| 113 |
+
"LABEL_8": 8,
|
| 114 |
+
"LABEL_9": 9
|
| 115 |
+
},
|
| 116 |
+
"layer_norm_eps": 1e-06,
|
| 117 |
+
"mlp_ratio": 4,
|
| 118 |
+
"model_type": "vjepa2",
|
| 119 |
+
"num_attention_heads": 16,
|
| 120 |
+
"num_hidden_layers": 24,
|
| 121 |
+
"num_pooler_layers": 3,
|
| 122 |
+
"patch_size": 16,
|
| 123 |
+
"pred_hidden_size": 384,
|
| 124 |
+
"pred_mlp_ratio": 4.0,
|
| 125 |
+
"pred_num_attention_heads": 12,
|
| 126 |
+
"pred_num_hidden_layers": 12,
|
| 127 |
+
"pred_num_mask_tokens": 10,
|
| 128 |
+
"pred_zero_init_mask_tokens": true,
|
| 129 |
+
"qkv_bias": true,
|
| 130 |
+
"torch_dtype": "float32",
|
| 131 |
+
"transformers_version": "4.53.0.dev0",
|
| 132 |
+
"tubelet_size": 2,
|
| 133 |
+
"use_SiLU": false,
|
| 134 |
+
"wide_SiLU": true
|
| 135 |
+
}
|
model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b0dec8132070562db7bd5ee0ba650d51f76f9b31157c5e87e4c593f98e81155c
|
| 3 |
+
size 1501501984
|
sample/diving.mp4
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7652d0c08ebe634ec200d758746926edbe3b4972d88b83093f3719d67fca8490
|
| 3 |
+
size 324132
|
video_preprocessor_config.json
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_valid_kwargs_names": [
|
| 3 |
+
"do_convert_rgb",
|
| 4 |
+
"do_resize",
|
| 5 |
+
"size",
|
| 6 |
+
"size_divisor",
|
| 7 |
+
"default_to_square",
|
| 8 |
+
"resample",
|
| 9 |
+
"do_rescale",
|
| 10 |
+
"rescale_factor",
|
| 11 |
+
"do_normalize",
|
| 12 |
+
"image_mean",
|
| 13 |
+
"image_std",
|
| 14 |
+
"do_pad",
|
| 15 |
+
"do_center_crop",
|
| 16 |
+
"crop_size",
|
| 17 |
+
"data_format",
|
| 18 |
+
"input_data_format",
|
| 19 |
+
"device"
|
| 20 |
+
],
|
| 21 |
+
"crop_size": {
|
| 22 |
+
"height": 256,
|
| 23 |
+
"width": 256
|
| 24 |
+
},
|
| 25 |
+
"data_format": "channels_first",
|
| 26 |
+
"default_to_square": true,
|
| 27 |
+
"device": null,
|
| 28 |
+
"do_center_crop": true,
|
| 29 |
+
"do_convert_rgb": null,
|
| 30 |
+
"do_normalize": true,
|
| 31 |
+
"do_pad": null,
|
| 32 |
+
"do_rescale": true,
|
| 33 |
+
"do_resize": true,
|
| 34 |
+
"do_sample_frames": null,
|
| 35 |
+
"fps": null,
|
| 36 |
+
"image_mean": [
|
| 37 |
+
0.485,
|
| 38 |
+
0.456,
|
| 39 |
+
0.406
|
| 40 |
+
],
|
| 41 |
+
"image_std": [
|
| 42 |
+
0.229,
|
| 43 |
+
0.224,
|
| 44 |
+
0.225
|
| 45 |
+
],
|
| 46 |
+
"input_data_format": null,
|
| 47 |
+
"model_valid_processing_keys": [
|
| 48 |
+
"do_convert_rgb",
|
| 49 |
+
"do_resize",
|
| 50 |
+
"size",
|
| 51 |
+
"size_divisor",
|
| 52 |
+
"default_to_square",
|
| 53 |
+
"resample",
|
| 54 |
+
"do_rescale",
|
| 55 |
+
"rescale_factor",
|
| 56 |
+
"do_normalize",
|
| 57 |
+
"image_mean",
|
| 58 |
+
"image_std",
|
| 59 |
+
"do_pad",
|
| 60 |
+
"do_center_crop",
|
| 61 |
+
"crop_size",
|
| 62 |
+
"data_format",
|
| 63 |
+
"input_data_format",
|
| 64 |
+
"device",
|
| 65 |
+
"do_sample_frames",
|
| 66 |
+
"video_metadata",
|
| 67 |
+
"fps",
|
| 68 |
+
"num_frames"
|
| 69 |
+
],
|
| 70 |
+
"num_frames": null,
|
| 71 |
+
"resample": 2,
|
| 72 |
+
"rescale_factor": 0.00392156862745098,
|
| 73 |
+
"size": {
|
| 74 |
+
"shortest_edge": 292
|
| 75 |
+
},
|
| 76 |
+
"size_divisor": null,
|
| 77 |
+
"video_metadata": null,
|
| 78 |
+
"video_processor_type": "VJEPA2VideoProcessor"
|
| 79 |
+
}
|