{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tool\n",
    "import json\n",
    "import os\n",
    "import random\n",
    "import shutil\n",
    "from tqdm import tqdm\n",
    "import re\n",
    "import pandas as pd\n",
    "\n",
    "\n",
    "def load_jsonl(path):\n",
    "    datas = []\n",
    "    with open(path, 'r') as file:\n",
    "        for line in file:\n",
    "            data = json.loads(line)\n",
    "            datas.append(data)\n",
    "    return datas\n",
    "\n",
    "def load_jsonl_fromdir(res_dir):\n",
    "    res_name = sorted(os.listdir(res_dir))\n",
    "    res_paths = [os.path.join(res_dir, name) for name in res_name]\n",
    "\n",
    "    datas = []\n",
    "    for path in res_paths:\n",
    "        datas.extend(load_jsonl(path))\n",
    "    return datas\n",
    "\n",
    "def load_json(path):\n",
    "    with open(path, 'r') as file:\n",
    "        datas = json.load(file)\n",
    "    return datas\n",
    "\n",
    "def save_json(datas, path, indent=4):\n",
    "    with open(path, 'w') as file:\n",
    "        json.dump(datas, file, indent=indent)\n",
    "\n",
    "def parse(generated_text):\n",
    "    generated_text = generated_text.strip()\n",
    "    if \"```json\" in generated_text:\n",
    "        generated_text = re.sub(r\"^```json\\s*|\\s*```$\", \"\", generated_text.strip())\n",
    "    try:\n",
    "        data = eval(generated_text)\n",
    "    except:\n",
    "        generated_text = generated_text.replace('\\'Q\\': \\'', \"\\\"Q\\\": \\\"\").replace('\\', \\'A\\': \\'', \"\\\", \\\"A\\\": \\\"\").replace('\\'}', \"\\\"}\")\n",
    "        data = eval(generated_text)\n",
    "\n",
    "    return data\n",
    "\n",
    "def formating_conversations(data):\n",
    "    \n",
    "    question = data['Q']\n",
    "    options = data['Options']\n",
    "    answer = data['Answer']\n",
    "\n",
    "    question_inp = question + '\\n' + '\\n'.join(options)\n",
    "    answer_inp = answer\n",
    "\n",
    "    conversations = [\n",
    "        {\n",
    "            \"from\": \"human\",\n",
    "            \"value\": '<image>\\n' + question_inp\n",
    "        },\n",
    "        {\n",
    "            \"from\": \"gpt\",\n",
    "            \"value\": answer_inp\n",
    "        }\n",
    "    ]\n",
    "\n",
    "    return conversations\n",
    "\n",
    "def time_to_seconds(time_str):\n",
    "    # Split the string by the dot to separate seconds and milliseconds\n",
    "    time_parts = time_str.split('.')\n",
    "    seconds = 0\n",
    "    \n",
    "    # If there are milliseconds, process them\n",
    "    if len(time_parts) == 2:\n",
    "        time_str = time_parts[0]\n",
    "        milliseconds = int(time_parts[1])\n",
    "    else:\n",
    "        time_str = time_parts[0]\n",
    "        milliseconds = 0\n",
    "\n",
    "    # Split the time string by colon to get hours, minutes, and seconds\n",
    "    time_parts = time_str.split(':')\n",
    "    hours = int(time_parts[0])\n",
    "    minutes = int(time_parts[1])\n",
    "    seconds += float(time_parts[2])\n",
    "\n",
    "    # Convert everything to seconds\n",
    "    total_seconds = hours * 3600 + minutes * 60 + seconds + milliseconds / 1000\n",
    "    return total_seconds\n",
    "\n",
    "def get_datas_from_df(df_path):\n",
    "    df = pd.read_csv(df_path)\n",
    "    datas = df.to_dict('records')\n",
    "    return datas\n",
    "\n",
    "def list_2_dict(datas, key='video_id'):\n",
    "    datas_dict = {}\n",
    "    for data in tqdm(datas, desc='list_2_dict'):\n",
    "        video_id = data['video_id']\n",
    "        if video_id not in datas_dict:\n",
    "            datas_dict[video_id] = [data]\n",
    "        else:\n",
    "            datas_dict[video_id].append(data)\n",
    "            \n",
    "    return datas_dict"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# miradata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "视频来源分布：\n",
    "source: youtube, size: 158525\n",
    "source: pexels, size: 57075\n",
    "source: videvo, size: 9934\n",
    "source: pixabay, size: 10"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 构造候选数据\n",
    "包括原视频的 video id, url, duration, clips"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_9328/1679904496.py:95: DtypeWarning: Columns (3) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  df = pd.read_csv(df_path)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "orig datas size: 330313\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "list_2_dict: 100%|██████████| 330313/330313 [00:00<00:00, 1178135.49it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "datas size: 158525\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "# 构造候选数据，来自youtube的，下载了metadict的，把duration存起来\n",
    "\n",
    "orig_datas = get_datas_from_df('/share/minghao/VideoProjects/Sythesis2/Candidates/miradata_v1_330k.csv')\n",
    "print(f'orig datas size: {len(orig_datas)}')\n",
    "orig_datas_dict = list_2_dict(orig_datas, 'video_id')\n",
    "\n",
    "datas = get_datas_from_df('/share/minghao/VideoProjects/Sythesis2/Candidates/miradata_youtube.csv')\n",
    "print(f'datas size: {len(datas)}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 158525/158525 [09:17<00:00, 284.11it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total size: 139565\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "from concurrent.futures import ThreadPoolExecutor, as_completed\n",
    "from tqdm import tqdm\n",
    "\n",
    "# 假设 load_json 和其他相关函数已经定义\n",
    "\n",
    "meta_dir = '/share_2/minghao/Datasets/MiraInfos'\n",
    "new_datas = []\n",
    "\n",
    "# 定义一个用于处理每个数据项的函数\n",
    "def process_data(data):\n",
    "    video_id = data['video_id']\n",
    "    clips_info = orig_datas_dict[video_id]\n",
    "    meta_path = os.path.join(meta_dir, video_id + '.json')\n",
    "    \n",
    "    # 如果文件不存在，跳过该条数据\n",
    "    if not os.path.exists(meta_path):\n",
    "        return None\n",
    "    \n",
    "    # 读取meta数据\n",
    "    meta_data = load_json(meta_path)\n",
    "    assert meta_data['video_id'] == video_id\n",
    "    \n",
    "    # 获取视频时长并附加到数据中\n",
    "    duration = meta_data['yt_meta_dict']['info']['duration']\n",
    "    data['duration'] = duration\n",
    "    data['clips'] = clips_info\n",
    "    return data\n",
    "\n",
    "# 使用线程池并行处理数据\n",
    "with ThreadPoolExecutor() as executor:\n",
    "    # 使用 tqdm 来显示进度条\n",
    "    futures = {executor.submit(process_data, data): data for data in datas}\n",
    "    \n",
    "    # 等待每个任务完成，并将结果添加到 new_datas\n",
    "    for future in tqdm(as_completed(futures), total=len(futures)):\n",
    "        result = future.result()\n",
    "        if result is not None:\n",
    "            new_datas.append(result)\n",
    "\n",
    "# 打印结果\n",
    "print(f'Total size: {len(new_datas)}')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "save_json(new_datas, '/share/minghao/VideoProjects/Sythesis2/Candidates/miradata_youtube_140k.json')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 筛选合适的数据\n",
    "- clip 的时长，clip 的位置\n",
    "- 视频的时长\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "new_datas = load_json('/share/minghao/VideoProjects/Sythesis2/Candidates/miradata_youtube_140k.json')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 139565/139565 [00:03<00:00, 42940.60it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0-1min: 1369 videos\n",
      "1-2min: 9997 videos\n",
      "2-3min: 17451 videos\n",
      "3-4min: 13403 videos\n",
      "4-5min: 10488 videos\n",
      "5-6min: 8879 videos\n",
      "6-7min: 7236 videos\n",
      "7-8min: 6566 videos\n",
      "8-9min: 5100 videos\n",
      "9-10min: 3655 videos\n",
      "10-11min: 5620 videos\n",
      "11-12min: 3901 videos\n",
      "12-13min: 3236 videos\n",
      "13-14min: 2666 videos\n",
      "14-15min: 2329 videos\n",
      "15-16min: 1994 videos\n",
      "16-17min: 1524 videos\n",
      "17-18min: 1328 videos\n",
      "18-19min: 1081 videos\n",
      "19-20min: 906 videos\n",
      "20min+: 6856 videos\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 5/5 [00:00<00:00, 221.48it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "size: 31436\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 5/5 [00:00<00:00, 406.48it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "size: 17752\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "def check_clip(clip_info, video_duration):\n",
    "    # 需要一个规则限制一下\n",
    "    clip_duration = clip_info['seconds']\n",
    "    timestamp = eval(clip_info['timestamp'])\n",
    "    timestamp = eval(clip_info['timestamp'])\n",
    "\n",
    "    start_time = time_to_seconds(timestamp[0])\n",
    "    end_time = time_to_seconds(timestamp[-1])\n",
    "\n",
    "    be_window_len = video_duration / 6\n",
    "    begin_window_end_time = 0 + be_window_len\n",
    "    end_window_end_time = video_duration - be_window_len\n",
    "    \n",
    "    if  10 <= clip_duration <= 120:\n",
    "        duration_flag =  True\n",
    "    else:\n",
    "        duration_flag =  False\n",
    "\n",
    "    if start_time >= end_window_end_time and clip_duration >= be_window_len//2:\n",
    "        clip_position = 'be'\n",
    "    elif end_time <= begin_window_end_time and clip_duration >= be_window_len//2:\n",
    "        clip_position = 'be'\n",
    "    else:\n",
    "        clip_position = 'mid'\n",
    "\n",
    "    return (duration_flag, clip_position)\n",
    "\n",
    "\n",
    "# 定义区间类别字典\n",
    "duration_categories = {\n",
    "    '0-1min': [],\n",
    "    '1-2min': [],\n",
    "    '2-3min': [],\n",
    "    '3-4min': [],\n",
    "    '4-5min': [],\n",
    "    '5-6min': [],\n",
    "    '6-7min': [],\n",
    "    '7-8min': [],\n",
    "    '8-9min': [],\n",
    "    '9-10min': [],\n",
    "    '10-11min': [],\n",
    "    '11-12min': [],\n",
    "    '12-13min': [],\n",
    "    '13-14min': [],\n",
    "    '14-15min': [],\n",
    "    '15-16min': [],\n",
    "    '16-17min': [],\n",
    "    '17-18min': [],\n",
    "    '18-19min': [],\n",
    "    '19-20min': [],\n",
    "    '20min+': [],\n",
    "}\n",
    "\n",
    "# 假设 video_in_panda 是一个视频ID列表，panad_70M_datas_mapping 是一个字典\n",
    "for data in tqdm(new_datas):\n",
    "    video_id = data['video_id']\n",
    "    duration = data['duration']\n",
    "    \n",
    "    clips_info = data['clips']\n",
    "\n",
    "    flag = False\n",
    "    new_clips_info = []\n",
    "    for clip in clips_info:\n",
    "        duration_flag, clip_position = check_clip(clip, duration)\n",
    "        if duration_flag:\n",
    "            clip['clip_position'] = clip_position\n",
    "            new_clips_info.append(clip)\n",
    "    \n",
    "    if len(new_clips_info) == 0:\n",
    "        continue\n",
    "\n",
    "    data['clips'] = new_clips_info\n",
    "    \n",
    "    # 根据 duration 将 video_id 分到不同的类别\n",
    "    if duration <= 60:\n",
    "        duration_categories['0-1min'].append(data)\n",
    "    elif 60 < duration <= 120:\n",
    "        duration_categories['1-2min'].append(data)\n",
    "    elif 120 < duration <= 180:\n",
    "        duration_categories['2-3min'].append(data)\n",
    "    elif 180 < duration <= 240:\n",
    "        duration_categories['3-4min'].append(data)\n",
    "    elif 240 < duration <= 300:\n",
    "        duration_categories['4-5min'].append(data)\n",
    "    elif 300 < duration <= 360:\n",
    "        duration_categories['5-6min'].append(data)\n",
    "    elif 360 < duration <= 420:\n",
    "        duration_categories['6-7min'].append(data)\n",
    "    elif 420 < duration <= 480:\n",
    "        duration_categories['7-8min'].append(data)\n",
    "    elif 480 < duration <= 540:\n",
    "        duration_categories['8-9min'].append(data)\n",
    "    elif 540 < duration <= 600:\n",
    "        duration_categories['9-10min'].append(data)\n",
    "    elif 600 < duration <= 660:\n",
    "        duration_categories['10-11min'].append(data)\n",
    "    elif 660 < duration <= 720:\n",
    "        duration_categories['11-12min'].append(data)\n",
    "    elif 720 < duration <= 780:\n",
    "        duration_categories['12-13min'].append(data)\n",
    "    elif 780 < duration <= 840:\n",
    "        duration_categories['13-14min'].append(data)\n",
    "    elif 840 < duration <= 900:\n",
    "        duration_categories['14-15min'].append(data)\n",
    "    elif 900 < duration <= 960:\n",
    "        duration_categories['15-16min'].append(data)\n",
    "    elif 960 < duration <= 1020:\n",
    "        duration_categories['16-17min'].append(data)\n",
    "    elif 1020 < duration <= 1080:\n",
    "        duration_categories['17-18min'].append(data)\n",
    "    elif 1080 < duration <= 1140:\n",
    "        duration_categories['18-19min'].append(data)\n",
    "    elif 1140 < duration <= 1200:\n",
    "        duration_categories['19-20min'].append(data)\n",
    "    else:\n",
    "        duration_categories['20min+'].append(data)\n",
    "\n",
    "# 输出分类结果\n",
    "for category, videos in duration_categories.items():\n",
    "    print(f\"{category}: {len(videos)} videos\")\n",
    "\n",
    "# 保存结果\n",
    "save_json(new_datas, '/share/minghao/VideoProjects/Sythesis2/Candidates/miradata_youtube_140k_filter_clips.json')\n",
    "\n",
    "target_duration_group_1 = ['5-6min', '6-7min', '7-8min', '8-9min', '9-10min']\n",
    "target_duration_group_2 = ['10-11min', '11-12min', '12-13min', '13-14min', '14-15min']\n",
    "\n",
    "def get_target_video_id(duration_category, target_size=10000):\n",
    "    all_datas = []\n",
    "    for category in tqdm(duration_category):\n",
    "        this_duration_all_video = duration_categories[category]\n",
    "        random.shuffle(this_duration_all_video)\n",
    "        for data in this_duration_all_video[:target_size]:\n",
    "            data['duration_category'] = category\n",
    "            all_datas.append(data)\n",
    "\n",
    "    return all_datas\n",
    "\n",
    "group_1_video_ids = get_target_video_id(target_duration_group_1)\n",
    "print(f'size: {len(group_1_video_ids)}')\n",
    "group_2_video_ids = get_target_video_id(target_duration_group_2)\n",
    "print(f'size: {len(group_2_video_ids)}')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 保存这两个\n",
    "save_json(group_1_video_ids, '/share/minghao/VideoProjects/Sythesis2/Candidates/miradata_youtube_31k_5_10min_filter_clips.json')\n",
    "save_json(group_2_video_ids, '/share/minghao/VideoProjects/Sythesis2/Candidates/miradata_youtube_17k_10_15min_filter_clips.json')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# openvid"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 首先基于 panda70M 对openvid进行初步筛选"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from pandas import DataFrame\n",
    "\n",
    "panad_70M = '/share_2/minghao/Datasets/Panda70M/panda70m_training_full.csv'\n",
    "df = pd.read_csv(panad_70M)\n",
    "panad_70M_datas = df.to_dict('records')\n",
    "\n",
    "panad_70M_datas_mapping = {}\n",
    "for data in tqdm(panad_70M_datas):\n",
    "    video_id = data['videoID']\n",
    "    panad_70M_datas_mapping[video_id] = data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 1019957/1019957 [00:03<00:00, 326183.42it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ytb size: 630697\n",
      "ytb size: 332120\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 332120/332120 [00:00<00:00, 1078858.39it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "video_in_panda size: 332087\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "openvid_1M = '/share_2/mm_data_dir/data_1/OpenVid-1M/data/train/OpenVid-1M.csv'\n",
    "df = pd.read_csv(openvid_1M)\n",
    "openvid_1M_datas = df.to_dict('records')\n",
    "\n",
    "ytb_datas_video_ids = []\n",
    "openvid_ytb_datas_mapping = {}\n",
    "\n",
    "for data in tqdm(openvid_1M_datas):\n",
    "    video_id = data['video']\n",
    "    if 'to' in video_id:\n",
    "        pos1 = video_id.rfind('_')\n",
    "        video_id = video_id[:pos1]\n",
    "        pos2 = video_id.rfind('_')\n",
    "        orig_video_id = video_id[:pos2]\n",
    "        ytb_datas_video_ids.append(orig_video_id)\n",
    "        if orig_video_id not in openvid_ytb_datas_mapping:\n",
    "            openvid_ytb_datas_mapping[orig_video_id] = {'clips':[data]}\n",
    "        else:\n",
    "            openvid_ytb_datas_mapping[orig_video_id]['clips'].append(data)\n",
    "\n",
    "print(f'ytb size: {len(ytb_datas_video_ids)}')\n",
    "ytb_datas_video_ids = set(ytb_datas_video_ids)\n",
    "print(f'ytb size: {len(ytb_datas_video_ids)}')\n",
    "\n",
    "video_in_panda = []\n",
    "for video_id in tqdm(ytb_datas_video_ids):\n",
    "    if video_id in panad_70M_datas_mapping:\n",
    "        video_in_panda.append(data)\n",
    "\n",
    "print(f'video_in_panda size: {len(video_in_panda)}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 332087/332087 [00:44<00:00, 7533.26it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0-1min: 4014 videos\n",
      "1-2min: 7115 videos\n",
      "2-3min: 9791 videos\n",
      "3-4min: 9214 videos\n",
      "4-5min: 9103 videos\n",
      "5-6min: 8616 videos\n",
      "6-7min: 7882 videos\n",
      "7-8min: 7385 videos\n",
      "8-9min: 6902 videos\n",
      "9-10min: 7111 videos\n",
      "10-11min: 8128 videos\n",
      "11-12min: 6859 videos\n",
      "12-13min: 6321 videos\n",
      "13-14min: 5288 videos\n",
      "14-15min: 4805 videos\n",
      "15-16min: 4072 videos\n",
      "16-17min: 3510 videos\n",
      "17-18min: 3021 videos\n",
      "18-19min: 2613 videos\n",
      "19-20min: 2090 videos\n",
      "20min+: 5972 videos\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "def check_clip(clip_info):\n",
    "    # 需要一个规则限制一下\n",
    "    clip_duration = clip_info['seconds']\n",
    "    if  10 <= clip_duration <= 120:\n",
    "        return True\n",
    "    else:\n",
    "        return False\n",
    "\n",
    "# 定义区间类别字典\n",
    "duration_categories = {\n",
    "    '0-1min': [],\n",
    "    '1-2min': [],\n",
    "    '2-3min': [],\n",
    "    '3-4min': [],\n",
    "    '4-5min': [],\n",
    "    '5-6min': [],\n",
    "    '6-7min': [],\n",
    "    '7-8min': [],\n",
    "    '8-9min': [],\n",
    "    '9-10min': [],\n",
    "    '10-11min': [],\n",
    "    '11-12min': [],\n",
    "    '12-13min': [],\n",
    "    '13-14min': [],\n",
    "    '14-15min': [],\n",
    "    '15-16min': [],\n",
    "    '16-17min': [],\n",
    "    '17-18min': [],\n",
    "    '18-19min': [],\n",
    "    '19-20min': [],\n",
    "    '20min+': [],\n",
    "}\n",
    "\n",
    "# 假设 video_in_panda 是一个视频ID列表，panad_70M_datas_mapping 是一个字典\n",
    "for video_id in tqdm(video_in_panda):\n",
    "    video_info = panad_70M_datas_mapping[video_id]\n",
    "    \n",
    "    # 获取视频的结束时间并转换为秒\n",
    "    end_time = eval(video_info['timestamp'])[-1][-1]\n",
    "    duration = time_to_seconds(end_time)\n",
    "\n",
    "    clips_info_openvid = openvid_ytb_datas_mapping[video_id]\n",
    "\n",
    "    flag = False\n",
    "    for clip_info in clips_info_openvid['clips']:\n",
    "        flag = check_clip(clip_info)\n",
    "        if flag:\n",
    "            break\n",
    "        \n",
    "    if not flag:\n",
    "        continue\n",
    "    \n",
    "    # 根据 duration 将 video_id 分到不同的类别\n",
    "    if duration <= 60:\n",
    "        duration_categories['0-1min'].append(data)\n",
    "    elif 60 < duration <= 120:\n",
    "        duration_categories['1-2min'].append(data)\n",
    "    elif 120 < duration <= 180:\n",
    "        duration_categories['2-3min'].append(data)\n",
    "    elif 180 < duration <= 240:\n",
    "        duration_categories['3-4min'].append(data)\n",
    "    elif 240 < duration <= 300:\n",
    "        duration_categories['4-5min'].append(data)\n",
    "    elif 300 < duration <= 360:\n",
    "        duration_categories['5-6min'].append(data)\n",
    "    elif 360 < duration <= 420:\n",
    "        duration_categories['6-7min'].append(data)\n",
    "    elif 420 < duration <= 480:\n",
    "        duration_categories['7-8min'].append(data)\n",
    "    elif 480 < duration <= 540:\n",
    "        duration_categories['8-9min'].append(data)\n",
    "    elif 540 < duration <= 600:\n",
    "        duration_categories['9-10min'].append(data)\n",
    "    elif 600 < duration <= 660:\n",
    "        duration_categories['10-11min'].append(data)\n",
    "    elif 660 < duration <= 720:\n",
    "        duration_categories['11-12min'].append(data)\n",
    "    elif 720 < duration <= 780:\n",
    "        duration_categories['12-13min'].append(data)\n",
    "    elif 780 < duration <= 840:\n",
    "        duration_categories['13-14min'].append(data)\n",
    "    elif 840 < duration <= 900:\n",
    "        duration_categories['14-15min'].append(data)\n",
    "    elif 900 < duration <= 960:\n",
    "        duration_categories['15-16min'].append(data)\n",
    "    elif 960 < duration <= 1020:\n",
    "        duration_categories['16-17min'].append(data)\n",
    "    elif 1020 < duration <= 1080:\n",
    "        duration_categories['17-18min'].append(data)\n",
    "    elif 1080 < duration <= 1140:\n",
    "        duration_categories['18-19min'].append(data)\n",
    "    elif 1140 < duration <= 1200:\n",
    "        duration_categories['19-20min'].append(data)\n",
    "    else:\n",
    "        duration_categories['20min+'].append(data)\n",
    "\n",
    "# 输出分类结果\n",
    "for category, videos in duration_categories.items():\n",
    "    print(f\"{category}: {len(videos)} videos\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 5/5 [00:00<00:00, 63.58it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "size: 37896\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 5/5 [00:00<00:00, 73.74it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "size: 31401\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "target_duration_group_1 = ['5-6min', '6-7min', '7-8min', '8-9min', '9-10min']\n",
    "target_duration_group_2 = ['10-11min', '11-12min', '12-13min', '13-14min', '14-15min']\n",
    "\n",
    "def get_target_video_id(duration_category, target_size=10000):\n",
    "    all_datas = []\n",
    "    for category in tqdm(duration_category):\n",
    "        this_duration_all_videoid = duration_categories[category]\n",
    "        random.shuffle(this_duration_all_videoid)\n",
    "        for video_id in this_duration_all_videoid[:target_size]:\n",
    "            datas_info_openvid = openvid_ytb_datas_mapping[video_id]\n",
    "            url = panad_70M_datas_mapping[video_id]['url']\n",
    "            datas_info_openvid['video_id'] = video_id\n",
    "            datas_info_openvid['duration_category'] = category\n",
    "            datas_info_openvid['url'] = url\n",
    "            all_datas.append(datas_info_openvid)\n",
    "\n",
    "    return all_datas\n",
    "\n",
    "group_1_video_ids = get_target_video_id(target_duration_group_1)\n",
    "print(f'size: {len(group_1_video_ids)}')\n",
    "group_2_video_ids = get_target_video_id(target_duration_group_2)\n",
    "print(f'size: {len(group_2_video_ids)}')\n",
    "\n",
    "from pandas import DataFrame\n",
    "\n",
    "save_dir = '/share/minghao/VideoProjects/Sythesis2/Candidates'\n",
    "save_name = 'openvid_5_10min_clip10_120_38k.csv'\n",
    "save_path = os.path.join(save_dir, save_name)\n",
    "group_1_df = DataFrame(group_1_video_ids)\n",
    "group_1_df.to_csv(save_path, index=False)\n",
    "\n",
    "save_name = 'openvid_10_15min_clip10_120_31k.csv'\n",
    "save_path = os.path.join(save_dir, save_name)\n",
    "group_2_df = DataFrame(group_2_video_ids)\n",
    "group_2_df.to_csv(save_path, index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 修正duration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 31401/31401 [01:49<00:00, 286.69it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "new size:30857\n"
     ]
    }
   ],
   "source": [
    "# 修正duration\n",
    "\n",
    "from pandas import DataFrame\n",
    "import pandas as pd\n",
    "\n",
    "meta_dir = '/share_2/minghao/Datasets/OpenvidInfos'\n",
    "\n",
    "save_dir = '/share/minghao/VideoProjects/Sythesis2/Candidates'\n",
    "save_name = 'openvid_10_15min_clip10_120_31k.csv' # openvid_10_15min_clip10_120_31k.csv\n",
    "save_path = os.path.join(save_dir, save_name)\n",
    "df = pd.read_csv(save_path)\n",
    "datas = df.to_dict('records')\n",
    "\n",
    "new_datas = []\n",
    "for data in tqdm(datas):\n",
    "    video_id = data['video_id']\n",
    "    meta_path = os.path.join(meta_dir, video_id + '.json')\n",
    "    if os.path.exists(meta_path) == False:\n",
    "        continue\n",
    "    meta_data = load_json(meta_path)\n",
    "    duration = meta_data['yt_meta_dict']['info']['duration']\n",
    "    data['duration'] = duration\n",
    "    new_datas.append(data)\n",
    "\n",
    "print(f'new size:{len(new_datas)}')\n",
    "\n",
    "save_name = 'openvid_10_15min_clip10_120_31k.json'\n",
    "save_path = os.path.join(save_dir, save_name)\n",
    "save_json(new_datas, save_path)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 在修正 duration 后，重新调整 duration 分类和 clips"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "new_datas_path = './Candidates/openvid_revised_duration_68k.json'\n",
    "new_datas = load_json(new_datas_path)\n",
    "\n",
    "def add_timestamp(clip):\n",
    "    timestamp_str = clip['video'].split('.mp4')[0].split('_')[-1]\n",
    "    start_frame_idx = int(timestamp_str.split('to')[0])\n",
    "    fps = clip['fps']\n",
    "    start_time = start_frame_idx / fps\n",
    "    duration = clip['seconds']\n",
    "    return [start_time, start_time + duration]\n",
    "    \n",
    "\n",
    "def check_clip(clip_info, video_duration):\n",
    "    # 需要一个规则限制一下\n",
    "    clip_duration = clip_info['seconds']\n",
    "    timestamp = clip_info['timestamp']\n",
    "\n",
    "    start_time = time_to_seconds(timestamp[0])\n",
    "    end_time = time_to_seconds(timestamp[-1])\n",
    "\n",
    "    be_window_len = video_duration / 6\n",
    "    begin_window_end_time = 0 + be_window_len\n",
    "    end_window_end_time = video_duration - be_window_len\n",
    "    \n",
    "    if  10 <= clip_duration <= 120:\n",
    "        duration_flag =  True\n",
    "    else:\n",
    "        duration_flag =  False\n",
    "\n",
    "    if start_time >= end_window_end_time and clip_duration >= be_window_len//2:\n",
    "        clip_position = 'be'\n",
    "    elif end_time <= begin_window_end_time and clip_duration >= be_window_len//2:\n",
    "        clip_position = 'be'\n",
    "    else:\n",
    "        clip_position = 'mid'\n",
    "\n",
    "    return (duration_flag, clip_position)\n",
    "\n",
    "\n",
    "# 定义区间类别字典\n",
    "duration_categories = {\n",
    "    '0-1min': [],\n",
    "    '1-2min': [],\n",
    "    '2-3min': [],\n",
    "    '3-4min': [],\n",
    "    '4-5min': [],\n",
    "    '5-6min': [],\n",
    "    '6-7min': [],\n",
    "    '7-8min': [],\n",
    "    '8-9min': [],\n",
    "    '9-10min': [],\n",
    "    '10-11min': [],\n",
    "    '11-12min': [],\n",
    "    '12-13min': [],\n",
    "    '13-14min': [],\n",
    "    '14-15min': [],\n",
    "    '15-16min': [],\n",
    "    '16-17min': [],\n",
    "    '17-18min': [],\n",
    "    '18-19min': [],\n",
    "    '19-20min': [],\n",
    "    '20min+': [],\n",
    "}\n",
    "\n",
    "# 假设 video_in_panda 是一个视频ID列表，panad_70M_datas_mapping 是一个字典\n",
    "for data in tqdm(new_datas):\n",
    "    video_id = data['video_id']\n",
    "    \n",
    "    duration = data['duration']\n",
    "    clips_info = eval(data['clips'])\n",
    "    \n",
    "    flag = False\n",
    "    new_clips_info = []\n",
    "    for clip in clips_info:\n",
    "        # clip = eval(clip)\n",
    "        clip_name = clip['video']\n",
    "\n",
    "        pos1 = clip_name.rfind('_')\n",
    "        sub_clip_name = clip_name[:pos1]\n",
    "        pos2 = sub_clip_name.rfind('_')\n",
    "        clip_id = int(sub_clip_name[pos2+1:])\n",
    "        timestamp = eval(panad_70M_datas_mapping[video_id]['timestamp'])[clip_id]\n",
    "        clip['timestamp'] = timestamp\n",
    "\n",
    "        duration_flag, clip_position = check_clip(clip, duration)\n",
    "        if duration_flag:\n",
    "            clip['clip_position'] = clip_position\n",
    "            new_clips_info.append(clip)\n",
    "    \n",
    "    if len(new_clips_info) == 0:\n",
    "        continue\n",
    "\n",
    "    data['clips'] = new_clips_info\n",
    "    \n",
    "    # 根据 duration 将 video_id 分到不同的类别\n",
    "    if duration <= 60:\n",
    "        duration_categories['0-1min'].append(data)\n",
    "    elif 60 < duration <= 120:\n",
    "        duration_categories['1-2min'].append(data)\n",
    "    elif 120 < duration <= 180:\n",
    "        duration_categories['2-3min'].append(data)\n",
    "    elif 180 < duration <= 240:\n",
    "        duration_categories['3-4min'].append(data)\n",
    "    elif 240 < duration <= 300:\n",
    "        duration_categories['4-5min'].append(data)\n",
    "    elif 300 < duration <= 360:\n",
    "        duration_categories['5-6min'].append(data)\n",
    "    elif 360 < duration <= 420:\n",
    "        duration_categories['6-7min'].append(data)\n",
    "    elif 420 < duration <= 480:\n",
    "        duration_categories['7-8min'].append(data)\n",
    "    elif 480 < duration <= 540:\n",
    "        duration_categories['8-9min'].append(data)\n",
    "    elif 540 < duration <= 600:\n",
    "        duration_categories['9-10min'].append(data)\n",
    "    elif 600 < duration <= 660:\n",
    "        duration_categories['10-11min'].append(data)\n",
    "    elif 660 < duration <= 720:\n",
    "        duration_categories['11-12min'].append(data)\n",
    "    elif 720 < duration <= 780:\n",
    "        duration_categories['12-13min'].append(data)\n",
    "    elif 780 < duration <= 840:\n",
    "        duration_categories['13-14min'].append(data)\n",
    "    elif 840 < duration <= 900:\n",
    "        duration_categories['14-15min'].append(data)\n",
    "    elif 900 < duration <= 960:\n",
    "        duration_categories['15-16min'].append(data)\n",
    "    elif 960 < duration <= 1020:\n",
    "        duration_categories['16-17min'].append(data)\n",
    "    elif 1020 < duration <= 1080:\n",
    "        duration_categories['17-18min'].append(data)\n",
    "    elif 1080 < duration <= 1140:\n",
    "        duration_categories['18-19min'].append(data)\n",
    "    elif 1140 < duration <= 1200:\n",
    "        duration_categories['19-20min'].append(data)\n",
    "    else:\n",
    "        duration_categories['20min+'].append(data)\n",
    "\n",
    "# 输出分类结果\n",
    "for category, videos in duration_categories.items():\n",
    "    print(f\"{category}: {len(videos)} videos\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 5/5 [00:00<00:00, 182.87it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "size: 33946\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 5/5 [00:00<00:00, 184.37it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "size: 32316\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "\n",
    "# 保存结果\n",
    "target_duration_group_1 = ['5-6min', '6-7min', '7-8min', '8-9min', '9-10min']\n",
    "target_duration_group_2 = ['10-11min', '11-12min', '12-13min', '13-14min', '14-15min']\n",
    "\n",
    "def get_target_video_id(duration_category, target_size=10000):\n",
    "    all_datas = []\n",
    "    for category in tqdm(duration_category):\n",
    "        this_duration_all_video = duration_categories[category]\n",
    "        random.shuffle(this_duration_all_video)\n",
    "        for data in this_duration_all_video[:target_size]:\n",
    "            data['duration_category'] = category\n",
    "            all_datas.append(data)\n",
    "\n",
    "    return all_datas\n",
    "\n",
    "group_1_video_ids = get_target_video_id(target_duration_group_1)\n",
    "print(f'size: {len(group_1_video_ids)}')\n",
    "group_2_video_ids = get_target_video_id(target_duration_group_2)\n",
    "print(f'size: {len(group_2_video_ids)}')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 保存结果\n",
    "save_json(group_1_video_ids, './Candidates/openvid_ytb_34k_5_10min_filter_clips.json')\n",
    "save_json(group_2_video_ids, './Candidates/openvid_ytb_32k_10_15min_filter_clips.json')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# panda70M \n",
    "筛选视频用于合成caption"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from pandas import DataFrame\n",
    "\n",
    "panad_70M = '/share_2/minghao/Datasets/Panda70M/panda70m_training_full.csv'\n",
    "df = pd.read_csv(panad_70M)\n",
    "panad_70M_datas = df.to_dict('records')\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "random.shuffle(panad_70M_datas)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  0%|          | 0/800000 [00:00<?, ?it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 800000/800000 [00:50<00:00, 15985.70it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0-1min: 90574 videos\n",
      "1-2min: 124476 videos\n",
      "2-3min: 115673 videos\n",
      "3-4min: 85111 videos\n",
      "4-5min: 67676 videos\n",
      "5-6min: 56215 videos\n",
      "6-7min: 44637 videos\n",
      "7-8min: 37226 videos\n",
      "8-9min: 27362 videos\n",
      "9-10min: 24169 videos\n",
      "10-11min: 24558 videos\n",
      "11-12min: 18850 videos\n",
      "12-13min: 15558 videos\n",
      "13-14min: 12831 videos\n",
      "14-15min: 11276 videos\n",
      "15-16min: 8808 videos\n",
      "16-17min: 7424 videos\n",
      "17-18min: 6045 videos\n",
      "18-19min: 5012 videos\n",
      "19-20min: 3956 videos\n",
      "20min+: 12563 videos\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "import random\n",
    "\n",
    "# 定义区间类别字典\n",
    "duration_categories = {\n",
    "    '0-1min': [],\n",
    "    '1-2min': [],\n",
    "    '2-3min': [],\n",
    "    '3-4min': [],\n",
    "    '4-5min': [],\n",
    "    '5-6min': [],\n",
    "    '6-7min': [],\n",
    "    '7-8min': [],\n",
    "    '8-9min': [],\n",
    "    '9-10min': [],\n",
    "    '10-11min': [],\n",
    "    '11-12min': [],\n",
    "    '12-13min': [],\n",
    "    '13-14min': [],\n",
    "    '14-15min': [],\n",
    "    '15-16min': [],\n",
    "    '16-17min': [],\n",
    "    '17-18min': [],\n",
    "    '18-19min': [],\n",
    "    '19-20min': [],\n",
    "    '20min+': [],\n",
    "}\n",
    "\n",
    "check_limit = 800000\n",
    "\n",
    "# 假设 video_in_panda 是一个视频ID列表，panad_70M_datas_mapping 是一个字典\n",
    "for data in tqdm(panad_70M_datas[:check_limit]):\n",
    "    \n",
    "    # 获取视频的结束时间并转换为秒\n",
    "    end_time = eval(data['timestamp'])[-1][-1]\n",
    "    duration = time_to_seconds(end_time)\n",
    "    \n",
    "    # 根据 duration 将 video_id 分到不同的类别\n",
    "    if duration <= 60:\n",
    "        duration_categories['0-1min'].append(data)\n",
    "    elif 60 < duration <= 120:\n",
    "        duration_categories['1-2min'].append(data)\n",
    "    elif 120 < duration <= 180:\n",
    "        duration_categories['2-3min'].append(data)\n",
    "    elif 180 < duration <= 240:\n",
    "        duration_categories['3-4min'].append(data)\n",
    "    elif 240 < duration <= 300:\n",
    "        duration_categories['4-5min'].append(data)\n",
    "    elif 300 < duration <= 360:\n",
    "        duration_categories['5-6min'].append(data)\n",
    "    elif 360 < duration <= 420:\n",
    "        duration_categories['6-7min'].append(data)\n",
    "    elif 420 < duration <= 480:\n",
    "        duration_categories['7-8min'].append(data)\n",
    "    elif 480 < duration <= 540:\n",
    "        duration_categories['8-9min'].append(data)\n",
    "    elif 540 < duration <= 600:\n",
    "        duration_categories['9-10min'].append(data)\n",
    "    elif 600 < duration <= 660:\n",
    "        duration_categories['10-11min'].append(data)\n",
    "    elif 660 < duration <= 720:\n",
    "        duration_categories['11-12min'].append(data)\n",
    "    elif 720 < duration <= 780:\n",
    "        duration_categories['12-13min'].append(data)\n",
    "    elif 780 < duration <= 840:\n",
    "        duration_categories['13-14min'].append(data)\n",
    "    elif 840 < duration <= 900:\n",
    "        duration_categories['14-15min'].append(data)\n",
    "    elif 900 < duration <= 960:\n",
    "        duration_categories['15-16min'].append(data)\n",
    "    elif 960 < duration <= 1020:\n",
    "        duration_categories['16-17min'].append(data)\n",
    "    elif 1020 < duration <= 1080:\n",
    "        duration_categories['17-18min'].append(data)\n",
    "    elif 1080 < duration <= 1140:\n",
    "        duration_categories['18-19min'].append(data)\n",
    "    elif 1140 < duration <= 1200:\n",
    "        duration_categories['19-20min'].append(data)\n",
    "    else:\n",
    "        duration_categories['20min+'].append(data)\n",
    "\n",
    "# 输出分类结果\n",
    "for category, videos in duration_categories.items():\n",
    "    print(f\"{category}: {len(videos)} videos\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 5/5 [00:00<00:00, 1460.82it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "size: 50000\n"
     ]
    }
   ],
   "source": [
    "target_duration_group_1 = ['5-6min', '6-7min', '7-8min', '8-9min', '9-10min']\n",
    "target_duration_group_2 = ['10-11min', '11-12min', '12-13min', '13-14min', '14-15min']\n",
    "\n",
    "def get_target_video_id(duration_category, target_size=10000):\n",
    "    all_datas = []\n",
    "    for category in tqdm(duration_category):\n",
    "        this_duration_all_datas = duration_categories[category]\n",
    "        all_datas.extend(this_duration_all_datas[:target_size])\n",
    "\n",
    "    return all_datas\n",
    "\n",
    "group_1_video_ids = get_target_video_id(target_duration_group_2)\n",
    "print(f'size: {len(group_1_video_ids)}')\n",
    "\n",
    "from pandas import DataFrame\n",
    "\n",
    "save_dir = '/share/minghao/VideoProjects/Sythesis2/LongCaption/Candidates'\n",
    "save_name = 'panda70M_10_15min_50k.csv'\n",
    "save_path = os.path.join(save_dir, save_name)\n",
    "group_1_df = DataFrame(group_1_video_ids)\n",
    "group_1_df.to_csv(save_path, index=False)\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.15"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}