Upload folder using huggingface_hub

Browse files

Files changed (6) hide show

.gitattributes +1 -0
README.md +89 -0
config.json +135 -0
model.safetensors +3 -0
sample/diving.mp4 +3 -0
video_preprocessor_config.json +79 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+sample/diving.mp4 filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,89 @@

+---
+license: mit
+pipeline_tag: video-classification
+tags:
+- video
+library_name: transformers
+---
+# V-JEPA 2
+V-JEPA 2 is a frontier video understanding model developed by FAIR at Meta. It extends the pretraining objectives of [V-JEPA](https://ai.meta.com/blog/v-jepa-yann-lecun-ai-model-video-joint-embedding-predictive-architecture/) and leverages data and model sizes at scale, resulting in state-of-the-art video understanding capabilities.
+The code is released [in this repository](https://github.com/facebookresearch/vjepa2).
+<div style="background-color: rgba(251, 255, 120, 0.4); padding: 10px; color: black; border-radius: 5px; box-shadow: 0 4px 8px rgba(0,0,0,0.1);">
+    💡 This is a V-JEPA 2 model with a video classification head pretrained on the <a href="http://www.svcl.ucsd.edu/projects/resound/dataset.html" style="color: black;">Diving 48</a> dataset.
+</div>
+<br>
+<img src="https://dl.fbaipublicfiles.com/vjepa2/vjepa2-pretrain.gif">&nbsp;
+## Installation
+To run the V-JEPA 2 model, ensure you have installed the latest version of transformers:
+```bash
+pip install -U git+https://github.com/huggingface/transformers
+```
+## Video classification code snippet
+```python
+import torch
+import numpy as np
+from torchcodec.decoders import VideoDecoder
+from transformers import AutoVideoProcessor, AutoModelForVideoClassification
+device = "cuda" if torch.cuda.is_available() else "cpu"
+# Load model and video preprocessor
+hf_repo = "facebook/vjepa2-vitl-fpc32-256-diving48"
+model = AutoModelForVideoClassification.from_pretrained(hf_repo).to(device)
+processor = AutoVideoProcessor.from_pretrained(hf_repo)
+# To load a video, sample the number of frames according to the model.
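+video_url = "https://huggingface.co/facebook/vjepa2-vitl-fpc32-256-diving48/resolve/main/sample/diving.mp4"  # sample clip shipped with this repo; any other video path or URL works too, e.g. "https://huggingface.co/datasets/nateraw/kinetics-mini/resolve/main/val/bowling/-WH-lxmGJVY_000005_000015.mp4"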
+vr = VideoDecoder(video_url)
+frame_idx = np.arange(0, model.config.frames_per_clip, 8) # you can define a more complex sampling strategy
+video = vr.get_frames_at(indices=frame_idx).data  # frames x channels x height x width
+# Preprocess and run inference
+inputs = processor(video, return_tensors="pt").to(model.device)
+with torch.no_grad():
+    outputs = model(**inputs)
+logits = outputs.logits
+print("Top 5 predicted class names:")
+top5_indices = logits.topk(5).indices[0]
+top5_probs = torch.softmax(logits, dim=-1).topk(5).values[0]
+for idx, prob in zip(top5_indices, top5_probs):
+    text_label = model.config.id2label[idx.item()]
+    print(f" - {text_label}: {prob:.2f}")
+```
+Output:
+```
+Top 5 predicted class names:
+ - ['Reverse', 'Dive', 'NoTwis', 'PIKE']: 0.52
+ - ['Inward', '25som', 'NoTwis', 'PIKE']: 0.12
+ - ['Forward', '35som', 'NoTwis', 'PIKE']: 0.07
+ - ['Reverse', '25som', 'NoTwis', 'PIKE']: 0.05
+ - ['Forward', '25som', '1Twis', 'PIKE']: 0.03
+```
+## Citation
+```
+@techreport{assran2025vjepa2,
+  title={V-JEPA~2: Self-Supervised Video Models Enable Understanding, Prediction and Planning},
+  author={Assran, Mahmoud and Bardes, Adrien and Fan, David and Garrido, Quentin and Howes, Russell and
+  Komeili, Mojtaba and Muckley, Matthew and Rizvi, Ammar and Roberts, Claire and Sinha, Koustuv and Zholus, Artem and
+  Arnaud, Sergio and Gejji, Abha and Martin, Ada and Robert Hogan, Francois and Dugas, Daniel and
+  Bojanowski, Piotr and Khalidov, Vasil and Labatut, Patrick and Massa, Francisco and Szafraniec, Marc and
+  Krishnakumar, Kapil and Li, Yong and Ma, Xiaodong and Chandar, Sarath and Meier, Franziska and LeCun, Yann and
+  Rabbat, Michael and Ballas, Nicolas},
+  institution={FAIR at Meta},
+  year={2025}
+}
+```

config.json ADDED Viewed

	@@ -0,0 +1,135 @@

+{
+  "architectures": [
+    "VJEPA2ForVideoClassification"
+  ],
+  "attention_dropout": 0.0,
+  "attention_probs_dropout_prob": 0.0,
+  "crop_size": 256,
+  "drop_path_rate": 0.0,
+  "frames_per_clip": 32,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.0,
+  "hidden_size": 1024,
+  "id2label": {
+    "0": "['Back', '15som', '05Twis', 'FREE']",
+    "1": "['Back', '15som', '15Twis', 'FREE']",
+    "2": "['Back', '15som', '25Twis', 'FREE']",
+    "3": "['Back', '15som', 'NoTwis', 'PIKE']",
+    "4": "['Back', '15som', 'NoTwis', 'TUCK']",
+    "5": "['Back', '25som', '15Twis', 'PIKE']",
+    "6": "['Back', '25som', '25Twis', 'PIKE']",
+    "7": "['Back', '25som', 'NoTwis', 'PIKE']",
+    "8": "['Back', '25som', 'NoTwis', 'TUCK']",
+    "9": "['Back', '2som', '15Twis', 'FREE']",
+    "10": "['Back', '2som', '25Twis', 'FREE']",
+    "11": "['Back', '35som', 'NoTwis', 'PIKE']",
+    "12": "['Back', '35som', 'NoTwis', 'TUCK']",
+    "13": "['Back', '3som', 'NoTwis', 'PIKE']",
+    "14": "['Back', '3som', 'NoTwis', 'TUCK']",
+    "15": "['Back', 'Dive', 'NoTwis', 'PIKE']",
+    "16": "['Back', 'Dive', 'NoTwis', 'TUCK']",
+    "17": "['Forward', '15som', '1Twis', 'FREE']",
+    "18": "['Forward', '15som', '2Twis', 'FREE']",
+    "19": "['Forward', '15som', 'NoTwis', 'PIKE']",
+    "20": "['Forward', '1som', 'NoTwis', 'PIKE']",
+    "21": "['Forward', '25som', '1Twis', 'PIKE']",
+    "22": "['Forward', '25som', '2Twis', 'PIKE']",
+    "23": "['Forward', '25som', '3Twis', 'PIKE']",
+    "24": "['Forward', '25som', 'NoTwis', 'PIKE']",
+    "25": "['Forward', '25som', 'NoTwis', 'TUCK']",
+    "26": "['Forward', '35som', 'NoTwis', 'PIKE']",
+    "27": "['Forward', '35som', 'NoTwis', 'TUCK']",
+    "28": "['Forward', '45som', 'NoTwis', 'TUCK']",
+    "29": "['Forward', 'Dive', 'NoTwis', 'PIKE']",
+    "30": "['Forward', 'Dive', 'NoTwis', 'STR']",
+    "31": "['Inward', '15som', 'NoTwis', 'PIKE']",
+    "32": "['Inward', '15som', 'NoTwis', 'TUCK']",
+    "33": "['Inward', '25som', 'NoTwis', 'PIKE']",
+    "34": "['Inward', '25som', 'NoTwis', 'TUCK']",
+    "35": "['Inward', '35som', 'NoTwis', 'TUCK']",
+    "36": "['Inward', 'Dive', 'NoTwis', 'PIKE']",
+    "37": "['Reverse', '15som', '05Twis', 'FREE']",
+    "38": "['Reverse', '15som', '15Twis', 'FREE']",
+    "39": "['Reverse', '15som', '25Twis', 'FREE']",
+    "40": "['Reverse', '15som', '35Twis', 'FREE']",
+    "41": "['Reverse', '15som', 'NoTwis', 'PIKE']",
+    "42": "['Reverse', '25som', '15Twis', 'PIKE']",
+    "43": "['Reverse', '25som', 'NoTwis', 'PIKE']",
+    "44": "['Reverse', '25som', 'NoTwis', 'TUCK']",
+    "45": "['Reverse', '35som', 'NoTwis', 'TUCK']",
+    "46": "['Reverse', 'Dive', 'NoTwis', 'PIKE']",
+    "47": "['Reverse', 'Dive', 'NoTwis', 'TUCK']"
+  },
+  "image_size": 256,
+  "in_chans": 3,
+  "initializer_range": 0.02,
+  "label2id": {
+    "LABEL_0": 0,
+    "LABEL_1": 1,
+    "LABEL_10": 10,
+    "LABEL_11": 11,
+    "LABEL_12": 12,
+    "LABEL_13": 13,
+    "LABEL_14": 14,
+    "LABEL_15": 15,
+    "LABEL_16": 16,
+    "LABEL_17": 17,
+    "LABEL_18": 18,
+    "LABEL_19": 19,
+    "LABEL_2": 2,
+    "LABEL_20": 20,
+    "LABEL_21": 21,
+    "LABEL_22": 22,
+    "LABEL_23": 23,
+    "LABEL_24": 24,
+    "LABEL_25": 25,
+    "LABEL_26": 26,
+    "LABEL_27": 27,
+    "LABEL_28": 28,
+    "LABEL_29": 29,
+    "LABEL_3": 3,
+    "LABEL_30": 30,
+    "LABEL_31": 31,
+    "LABEL_32": 32,
+    "LABEL_33": 33,
+    "LABEL_34": 34,
+    "LABEL_35": 35,
+    "LABEL_36": 36,
+    "LABEL_37": 37,
+    "LABEL_38": 38,
+    "LABEL_39": 39,
+    "LABEL_4": 4,
+    "LABEL_40": 40,
+    "LABEL_41": 41,
+    "LABEL_42": 42,
+    "LABEL_43": 43,
+    "LABEL_44": 44,
+    "LABEL_45": 45,
+    "LABEL_46": 46,
+    "LABEL_47": 47,
+    "LABEL_5": 5,
+    "LABEL_6": 6,
+    "LABEL_7": 7,
+    "LABEL_8": 8,
+    "LABEL_9": 9
+  },
+  "layer_norm_eps": 1e-06,
+  "mlp_ratio": 4,
+  "model_type": "vjepa2",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 24,
+  "num_pooler_layers": 3,
+  "patch_size": 16,
+  "pred_hidden_size": 384,
+  "pred_mlp_ratio": 4.0,
+  "pred_num_attention_heads": 12,
+  "pred_num_hidden_layers": 12,
+  "pred_num_mask_tokens": 10,
+  "pred_zero_init_mask_tokens": true,
+  "qkv_bias": true,
+  "torch_dtype": "float32",
+  "transformers_version": "4.53.0.dev0",
+  "tubelet_size": 2,
+  "use_SiLU": false,
+  "wide_SiLU": true
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b0dec8132070562db7bd5ee0ba650d51f76f9b31157c5e87e4c593f98e81155c
+size 1501501984

sample/diving.mp4 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7652d0c08ebe634ec200d758746926edbe3b4972d88b83093f3719d67fca8490
+size 324132

video_preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,79 @@

+{
+  "_valid_kwargs_names": [
+    "do_convert_rgb",
+    "do_resize",
+    "size",
+    "size_divisor",
+    "default_to_square",
+    "resample",
+    "do_rescale",
+    "rescale_factor",
+    "do_normalize",
+    "image_mean",
+    "image_std",
+    "do_pad",
+    "do_center_crop",
+    "crop_size",
+    "data_format",
+    "input_data_format",
+    "device"
+  ],
+  "crop_size": {
+    "height": 256,
+    "width": 256
+  },
+  "data_format": "channels_first",
+  "default_to_square": true,
+  "device": null,
+  "do_center_crop": true,
+  "do_convert_rgb": null,
+  "do_normalize": true,
+  "do_pad": null,
+  "do_rescale": true,
+  "do_resize": true,
+  "do_sample_frames": null,
+  "fps": null,
+  "image_mean": [
+    0.485,
+    0.456,
+    0.406
+  ],
+  "image_std": [
+    0.229,
+    0.224,
+    0.225
+  ],
+  "input_data_format": null,
+  "model_valid_processing_keys": [
+    "do_convert_rgb",
+    "do_resize",
+    "size",
+    "size_divisor",
+    "default_to_square",
+    "resample",
+    "do_rescale",
+    "rescale_factor",
+    "do_normalize",
+    "image_mean",
+    "image_std",
+    "do_pad",
+    "do_center_crop",
+    "crop_size",
+    "data_format",
+    "input_data_format",
+    "device",
+    "do_sample_frames",
+    "video_metadata",
+    "fps",
+    "num_frames"
+  ],
+  "num_frames": null,
+  "resample": 2,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "shortest_edge": 292
+  },
+  "size_divisor": null,
+  "video_metadata": null,
+  "video_processor_type": "VJEPA2VideoProcessor"
+}