import argparse
import os
import tarfile
from huggingface_hub import snapshot_download
from concurrent.futures import ThreadPoolExecutor


def extract_tar(tar_path, dest_dir):
    """
    Extracts a .tar file to the specified destination directory.

    The archive is treated as untrusted input: members that would be written
    outside ``dest_dir`` (for example names containing ``..``) are refused
    instead of being extracted.

    Args:
        tar_path: Path of the .tar archive to read.
        dest_dir: Directory the archive members are extracted into.

    Raises:
        tarfile.TarError: If the archive cannot be read, or a member would be
            written outside ``dest_dir``.
        OSError: If reading the archive or writing the extracted files fails.
    """
    with tarfile.open(tar_path, 'r') as tar:
        # Interpreters that ship extraction filters expose
        # ``tarfile.data_filter``; the "data" filter rejects members that
        # escape the destination as well as other unsafe member types.
        if hasattr(tarfile, "data_filter"):
            tar.extractall(path=dest_dir, filter="data")
            return

        # Fallback for interpreters without extraction filters: refuse any
        # member whose name resolves outside the destination directory.
        # NOTE(review): this only validates member names, not link targets.
        dest_root = os.path.realpath(dest_dir)
        for member in tar.getmembers():
            target = os.path.realpath(os.path.join(dest_root, member.name))
            if os.path.commonpath([dest_root, target]) != dest_root:
                raise tarfile.ExtractError(
                    f"refusing to extract {member.name!r} from {tar_path!r}: "
                    "path escapes the destination directory")
        tar.extractall(path=dest_dir)


def main():
    """
    Download the .tar shards of a Hugging Face dataset folder and extract them.

    Command-line options:
        --local_dir: Directory the snapshot is downloaded into.
        --repo_id: Hugging Face dataset repository to download from.
        --folder_name: Folder inside the repository whose ``*.tar`` files are
            fetched.

    Every .tar file found under ``local_dir`` is extracted into
    ``<local_dir>/videos`` using a thread pool.

    Raises:
        SystemExit: If one or more archives fail to extract; the message lists
            each failing archive and its error.
    """
    parser = argparse.ArgumentParser(
        description="Download and extract dataset.")
    parser.add_argument("--local_dir", type=str, default="/mnt/localssd/",
                        help="Local directory to save the dataset.")
    parser.add_argument("--repo_id", type=str,
                        default="Languagebind/Open-Sora-Plan-v1.1.0", help="Hugging Face repository ID.")
    parser.add_argument("--folder_name", type=str, default="all_mixkit",
                        help="Folder name of the huggingface repo.")

    args = parser.parse_args()

    # Only the .tar files directly inside the requested folder are downloaded.
    allow_patterns = [f"{args.folder_name}/*.tar"]

    snapshot_download(
        repo_id=args.repo_id,
        local_dir=args.local_dir,
        revision="main",          # or the branch/tag/commit you want
        allow_patterns=allow_patterns,
        repo_type="dataset"
    )

    # Collect all .tar files recursively under the download directory.
    # NOTE(review): this walks the whole of local_dir, so .tar files that were
    # already there (including any under "videos") are extracted as well.
    tar_files = []
    for root, _dirs, files in os.walk(args.local_dir):
        for file in files:
            if file.endswith(".tar"):
                tar_files.append(os.path.join(root, file))

    # Destination folder for extracted files
    output_dir = os.path.join(args.local_dir, "videos")
    os.makedirs(output_dir, exist_ok=True)

    # Extract each tar file in parallel. The futures are kept so that an
    # exception raised inside a worker is surfaced instead of being dropped.
    failures = []
    with ThreadPoolExecutor() as executor:
        pending = [(tar_path, executor.submit(extract_tar, tar_path, output_dir))
                   for tar_path in tar_files]
        for tar_path, future in pending:
            try:
                future.result()
            except Exception as error:  # top-level boundary: report, then exit
                failures.append((tar_path, error))

    if failures:
        details = "\n".join(
            f"  {tar_path}: {error!r}" for tar_path, error in failures)
        raise SystemExit(
            f"Failed to extract {len(failures)} of {len(tar_files)} "
            f".tar file(s):\n{details}")

    print("All .tar files have been downloaded and extracted to:", output_dir)


# Run the download-and-extract workflow only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()