|
|
import argparse |
|
|
import os |
|
|
import tarfile |
|
|
from huggingface_hub import snapshot_download |
|
|
from concurrent.futures import ThreadPoolExecutor |
|
|
|
|
|
|
|
|
def extract_tar(tar_path, dest_dir): |
|
|
""" |
|
|
Extracts a .tar file to the specified destination directory. |
|
|
""" |
|
|
with tarfile.open(tar_path, 'r') as tar: |
|
|
tar.extractall(path=dest_dir) |
|
|
|
|
|
|
|
|
def main(): |
|
|
parser = argparse.ArgumentParser( |
|
|
description="Download and extract dataset.") |
|
|
parser.add_argument("--local_dir", type=str, default="/mnt/localssd/", |
|
|
help="Local directory to save the dataset.") |
|
|
parser.add_argument("--repo_id", type=str, |
|
|
default="Languagebind/Open-Sora-Plan-v1.1.0", help="Hugging Face repository ID.") |
|
|
parser.add_argument("--folder_name", type=str, default="all_mixkit", |
|
|
help="Folder name of the huggingface repo.") |
|
|
|
|
|
args = parser.parse_args() |
|
|
|
|
|
allow_patterns = [f"{args.folder_name}/*.tar"] |
|
|
|
|
|
snapshot_download( |
|
|
repo_id=args.repo_id, |
|
|
local_dir=args.local_dir, |
|
|
revision="main", |
|
|
allow_patterns=allow_patterns, |
|
|
repo_type="dataset" |
|
|
) |
|
|
|
|
|
|
|
|
tar_files = [] |
|
|
for root, dirs, files in os.walk(args.local_dir): |
|
|
for file in files: |
|
|
if file.endswith(".tar"): |
|
|
tar_files.append(os.path.join(root, file)) |
|
|
|
|
|
|
|
|
output_dir = os.path.join(args.local_dir, "videos") |
|
|
os.makedirs(output_dir, exist_ok=True) |
|
|
|
|
|
|
|
|
with ThreadPoolExecutor() as executor: |
|
|
for tar_path in tar_files: |
|
|
executor.submit(extract_tar, tar_path, output_dir) |
|
|
|
|
|
print("All .tar files have been downloaded and extracted to:", output_dir) |
|
|
|
|
|
|
|
|
if __name__ == "__main__": |
|
|
main() |
|
|
|