# causvid/distillation_data/download_mixkit.py
# lyttt's picture
# Add files using upload-large-folder tool
# 5f5f46e verified
# raw
# history blame
# 1.92 kB
import argparse
import os
import tarfile
from huggingface_hub import snapshot_download
from concurrent.futures import ThreadPoolExecutor
def extract_tar(tar_path, dest_dir):
    """
    Extracts a .tar file to the specified destination directory.

    Archive members are not trusted: a member whose path would resolve
    outside ``dest_dir`` (for example via ``..`` components) aborts the
    extraction instead of being written.

    Args:
        tar_path: Path of the tar archive to read.
        dest_dir: Directory the archive members are extracted into.

    Raises:
        tarfile.TarError: If the archive cannot be read as a tar file or a
            member would be extracted outside ``dest_dir``.
    """
    with tarfile.open(tar_path, 'r') as tar:
        if hasattr(tarfile, "data_filter"):
            # Interpreters that ship extraction filters: let the stdlib
            # "data" filter reject unsafe members (path traversal, links
            # escaping the destination, device files).
            tar.extractall(path=dest_dir, filter="data")
            return

        # Older interpreters have no extraction filters, so check member
        # paths by hand before writing anything. This is a best-effort
        # check on member names only; it does not inspect link targets.
        dest_root = os.path.realpath(dest_dir)
        for member in tar.getmembers():
            target = os.path.realpath(os.path.join(dest_root, member.name))
            if os.path.commonpath([dest_root, target]) != dest_root:
                raise tarfile.TarError(
                    f"{member.name!r} would be extracted outside {dest_dir!r}"
                )
        tar.extractall(path=dest_dir)
def main():
    """
    Download the .tar files of a Hugging Face dataset folder and extract them.

    Command-line arguments:
        --local_dir: Directory the repository files are downloaded into.
        --repo_id: Hugging Face dataset repository to download from.
        --folder_name: Folder inside the repository whose ``*.tar`` files
            are fetched.

    Every ``.tar`` file found under ``--local_dir`` is extracted into
    ``<local_dir>/videos`` using a thread pool.

    Raises:
        Exception: Any error raised while extracting an archive is
            re-raised here, so a failed extraction is not reported as a
            success.
    """
    parser = argparse.ArgumentParser(
        description="Download and extract dataset.")
    parser.add_argument("--local_dir", type=str, default="/mnt/localssd/",
                        help="Local directory to save the dataset.")
    parser.add_argument("--repo_id", type=str,
                        default="Languagebind/Open-Sora-Plan-v1.1.0", help="Hugging Face repository ID.")
    parser.add_argument("--folder_name", type=str, default="all_mixkit",
                        help="Folder name of the huggingface repo.")
    args = parser.parse_args()

    # Only fetch the tar archives inside the requested repository folder.
    allow_patterns = [f"{args.folder_name}/*.tar"]
    snapshot_download(
        repo_id=args.repo_id,
        local_dir=args.local_dir,
        revision="main",  # or the branch/tag/commit you want
        allow_patterns=allow_patterns,
        repo_type="dataset"
    )

    # Collect all .tar files recursively from the download directory.
    tar_files = [
        os.path.join(root, name)
        for root, _dirs, names in os.walk(args.local_dir)
        for name in names
        if name.endswith(".tar")
    ]

    # Destination folder for extracted files.
    output_dir = os.path.join(args.local_dir, "videos")
    os.makedirs(output_dir, exist_ok=True)

    # Extract each tar file in parallel.
    with ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(extract_tar, tar_path, output_dir)
            for tar_path in tar_files
        ]
        # A worker's exception is stored on its future; result() re-raises
        # it so a failed extraction stops the script before the success
        # message below.
        for future in futures:
            future.result()

    print("All .tar files have been downloaded and extracted to:", output_dir)
# Run main() only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()