# --- cell: shared tool helpers ---
import json
import os
import random
import shutil
import re
import ast
import pandas as pd

try:
    from tqdm import tqdm
except ImportError:
    # Fall back to a pass-through wrapper so the helpers also run without tqdm.
    def tqdm(iterable, **kwargs):
        return iterable


def load_jsonl(path):
    """Read a JSON-Lines file into a list of parsed objects."""
    with open(path, 'r') as file:
        return [json.loads(line) for line in file]


def load_jsonl_fromdir(res_dir):
    """Read and concatenate every JSONL file in `res_dir` (sorted by name)."""
    datas = []
    for name in sorted(os.listdir(res_dir)):
        datas.extend(load_jsonl(os.path.join(res_dir, name)))
    return datas


def load_json(path):
    """Read a single JSON file."""
    with open(path, 'r') as file:
        return json.load(file)


def save_json(datas, path, indent=4):
    """Write `datas` to `path` as indented JSON."""
    with open(path, 'w') as file:
        json.dump(datas, file, indent=indent)


def parse(generated_text):
    """Parse a model-generated dict, optionally wrapped in ```json fences.

    Security fix: uses ast.literal_eval instead of eval — it accepts the
    same Python/JSON dict literals but cannot execute arbitrary code that
    may appear in model output. Falls back to repairing single-quoted
    {'Q': ..., 'A': ...} payloads, exactly as before.
    """
    text = generated_text.strip()
    if "```json" in text:
        text = re.sub(r"^```json\s*|\s*```$", "", text)
    try:
        return ast.literal_eval(text)
    except Exception:
        # Repair payloads whose quotes were broken by apostrophes in the text.
        repaired = (text
                    .replace("'Q': '", '"Q": "')
                    .replace("', 'A': '", '", "A": "')
                    .replace("'}", '"}'))
        return ast.literal_eval(repaired)


def formating_conversations(data):
    """Build a two-turn ShareGPT-style conversation from a QA record.

    `data` needs keys 'Q' (str), 'Options' (list of option strings) and
    'Answer' (str).
    """
    question_inp = data['Q'] + '\n' + '\n'.join(data['Options'])
    return [
        {"from": "human", "value": '\n' + question_inp},
        {"from": "gpt", "value": data['Answer']},
    ]


def time_to_seconds(time_str):
    """Convert an 'H:MM:SS[.frac]' timestamp string to seconds (float).

    Fix: the fractional part is treated as a decimal fraction instead of
    an integer millisecond count, so '0:00:01.5' -> 1.5 (the old code read
    '.5' as 5 milliseconds). Three-digit fractions ('.500') behave exactly
    as before.
    """
    main_part, _, frac_part = time_str.partition('.')
    frac_seconds = float('0.' + frac_part) if frac_part else 0.0

    fields = main_part.split(':')
    hours = int(fields[0])
    minutes = int(fields[1])
    seconds = float(fields[2])

    return hours * 3600 + minutes * 60 + seconds + frac_seconds


def get_datas_from_df(df_path):
    """Load a CSV into a list of row dicts."""
    return pd.read_csv(df_path).to_dict('records')


def list_2_dict(datas, key='video_id'):
    """Group records into {record[key]: [records...]}, preserving order.

    Fix: the `key` argument is now honoured; previously it was ignored and
    'video_id' was hard-coded (behaviour is identical for the default).
    """
    datas_dict = {}
    for data in tqdm(datas, desc='list_2_dict'):
        datas_dict.setdefault(data[key], []).append(data)
    return datas_dict


# --- cell (markdown): # miradata ---
# Source distribution of the MiraData videos (reference note; in the
# original notebook this was a non-runnable code cell):
#   source: youtube, size: 158525
#   source: pexels,  size: 57075
#   source: videvo,  size: 9934
#   source: pixabay, size: 10

# --- cell (markdown): ## Build the candidate set ---
# Includes the original video's video id, url, duration and clips.
# --- cell: build the candidate list (YouTube subset of MiraData) ---
import pandas as pd

# Candidates: YouTube-sourced videos whose metadata dict was downloaded;
# the duration will be stored alongside each record.
orig_datas = get_datas_from_df('/share/minghao/VideoProjects/Sythesis2/Candidates/miradata_v1_330k.csv')
print(f'orig datas size: {len(orig_datas)}')
orig_datas_dict = list_2_dict(orig_datas, 'video_id')

datas = get_datas_from_df('/share/minghao/VideoProjects/Sythesis2/Candidates/miradata_youtube.csv')
print(f'datas size: {len(datas)}')

# --- cell: attach duration + clip list from the per-video metadata ---
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

meta_dir = '/share_2/minghao/Datasets/MiraInfos'
new_datas = []

def process_data(data):
    """Attach `duration` (from the yt metadata file) and `clips` to one record.

    Returns the enriched record, or None when no metadata file exists for
    the video.
    """
    video_id = data['video_id']
    clips_info = orig_datas_dict[video_id]
    meta_path = os.path.join(meta_dir, video_id + '.json')

    # Skip records whose metadata was never downloaded.
    if not os.path.exists(meta_path):
        return None

    meta_data = load_json(meta_path)
    assert meta_data['video_id'] == video_id

    data['duration'] = meta_data['yt_meta_dict']['info']['duration']
    data['clips'] = clips_info
    return data

# Enrich all records in parallel — the per-video metadata reads are I/O bound.
with ThreadPoolExecutor() as executor:
    pending = {executor.submit(process_data, record): record for record in datas}
    for finished in tqdm(as_completed(pending), total=len(pending)):
        enriched = finished.result()
        if enriched is not None:
            new_datas.append(enriched)

print(f'Total size: {len(new_datas)}')

# --- cell: persist the enriched candidates ---
save_json(new_datas, '/share/minghao/VideoProjects/Sythesis2/Candidates/miradata_youtube_140k.json')

# --- cell (markdown): ## Filter suitable data ---
# - clip duration and clip position
# - video duration

# --- cell: reload the enriched candidates ---
new_datas = load_json('/share/minghao/VideoProjects/Sythesis2/Candidates/miradata_youtube_140k.json')
# --- cell: filter clips, bucket miradata videos by duration, sample groups ---
import math

def check_clip(clip_info, video_duration):
    """Return (duration_ok, position) for one clip.

    duration_ok: the clip is 10-120 s long.
    position: 'be' when the clip lies entirely in the first or last sixth
    of the video AND is at least half that window long; otherwise 'mid'.
    Fix: removed a duplicated `timestamp = eval(...)` line.
    """
    clip_duration = clip_info['seconds']
    # NOTE(review): `timestamp` is stored as a Python-literal string;
    # eval() works but ast.literal_eval would be safer.
    timestamp = eval(clip_info['timestamp'])

    start_time = time_to_seconds(timestamp[0])
    end_time = time_to_seconds(timestamp[-1])

    be_window_len = video_duration / 6
    begin_window_end_time = be_window_len
    end_window_start_time = video_duration - be_window_len

    duration_flag = 10 <= clip_duration <= 120

    long_enough = clip_duration >= be_window_len // 2
    if (start_time >= end_window_start_time or end_time <= begin_window_end_time) and long_enough:
        clip_position = 'be'
    else:
        clip_position = 'mid'

    return (duration_flag, clip_position)


# One bucket per minute from 0-1min through 19-20min, plus 20min+.
duration_categories = {f'{i}-{i + 1}min': [] for i in range(20)}
duration_categories['20min+'] = []

def duration_bucket(duration):
    """Label of the minute bucket (k-1, k] containing `duration` seconds.

    Replaces the original 21-branch elif ladder with identical boundaries:
    durations <= 60 s map to '0-1min', anything above 1200 s to '20min+'.
    """
    minutes = math.ceil(duration / 60)
    if minutes <= 1:
        return '0-1min'
    return f'{minutes - 1}-{minutes}min' if minutes <= 20 else '20min+'

for data in tqdm(new_datas):
    duration = data['duration']

    # Keep only clips of acceptable length, tagging each with its position.
    new_clips_info = []
    for clip in data['clips']:
        duration_flag, clip_position = check_clip(clip, duration)
        if duration_flag:
            clip['clip_position'] = clip_position
            new_clips_info.append(clip)

    if len(new_clips_info) == 0:
        continue  # drop videos left with no usable clip

    data['clips'] = new_clips_info
    duration_categories[duration_bucket(duration)].append(data)

# Report the bucket sizes.
for category, videos in duration_categories.items():
    print(f"{category}: {len(videos)} videos")

# Persist the filtered candidates.
save_json(new_datas, '/share/minghao/VideoProjects/Sythesis2/Candidates/miradata_youtube_140k_filter_clips.json')

target_duration_group_1 = ['5-6min', '6-7min', '7-8min', '8-9min', '9-10min']
target_duration_group_2 = ['10-11min', '11-12min', '12-13min', '13-14min', '14-15min']

def get_target_video_id(duration_category, target_size=10000):
    """Sample up to `target_size` videos from each listed bucket."""
    all_datas = []
    for category in tqdm(duration_category):
        bucket = duration_categories[category]
        random.shuffle(bucket)  # in-place shuffle, as in the original
        for data in bucket[:target_size]:
            data['duration_category'] = category
            all_datas.append(data)
    return all_datas

group_1_video_ids = get_target_video_id(target_duration_group_1)
print(f'size: {len(group_1_video_ids)}')
group_2_video_ids = get_target_video_id(target_duration_group_2)
print(f'size: {len(group_2_video_ids)}')

# --- cell: save both sampled groups ---
save_json(group_1_video_ids, '/share/minghao/VideoProjects/Sythesis2/Candidates/miradata_youtube_31k_5_10min_filter_clips.json')
save_json(group_2_video_ids, '/share/minghao/VideoProjects/Sythesis2/Candidates/miradata_youtube_17k_10_15min_filter_clips.json')

# --- cell (markdown): # openvid / ## pre-filter OpenVid via Panda-70M ---

# --- cell: load Panda-70M metadata (this cell continues on later lines) ---
import pandas as pd
from pandas import DataFrame

panad_70M = '/share_2/minghao/Datasets/Panda70M/panda70m_training_full.csv'
df = pd.read_csv(panad_70M)
panad_70M_datas = df.to_dict('records')
# --- cell (continued): index Panda-70M records by video id ---
panad_70M_datas_mapping = {}
for data in tqdm(panad_70M_datas):
    panad_70M_datas_mapping[data['videoID']] = data

# --- cell: recover OpenVid YouTube video ids and intersect with Panda-70M ---
openvid_1M = '/share_2/mm_data_dir/data_1/OpenVid-1M/data/train/OpenVid-1M.csv'
df = pd.read_csv(openvid_1M)
openvid_1M_datas = df.to_dict('records')

ytb_datas_video_ids = []
openvid_ytb_datas_mapping = {}

for data in tqdm(openvid_1M_datas):
    clip_name = data['video']
    # YouTube-derived clips are named '<video_id>_<clip_id>_<start>to<end>...';
    # strip the last two '_' suffixes to recover the original video id.
    if 'to' in clip_name:
        trimmed = clip_name[:clip_name.rfind('_')]
        orig_video_id = trimmed[:trimmed.rfind('_')]
        ytb_datas_video_ids.append(orig_video_id)
        openvid_ytb_datas_mapping.setdefault(orig_video_id, {'clips': []})['clips'].append(data)

print(f'ytb size: {len(ytb_datas_video_ids)}')
ytb_datas_video_ids = set(ytb_datas_video_ids)
print(f'ytb size: {len(ytb_datas_video_ids)}')

video_in_panda = []
for video_id in tqdm(ytb_datas_video_ids):
    if video_id in panad_70M_datas_mapping:
        # BUG FIX: the original appended the loop-leaked `data` record here;
        # the cells below treat these elements as video ids and use them to
        # index panad_70M_datas_mapping.
        video_in_panda.append(video_id)

print(f'video_in_panda size: {len(video_in_panda)}')

# --- cell: bucket the intersected videos by duration ---
import math

def check_clip(clip_info):
    """A clip qualifies when it is 10-120 seconds long."""
    return 10 <= clip_info['seconds'] <= 120

# One bucket per minute from 0-1min through 19-20min, plus 20min+.
duration_categories = {f'{i}-{i + 1}min': [] for i in range(20)}
duration_categories['20min+'] = []

def duration_bucket(duration):
    """Label of the minute bucket (k-1, k] containing `duration` seconds;
    replaces the original 21-branch elif ladder with identical boundaries."""
    minutes = math.ceil(duration / 60)
    if minutes <= 1:
        return '0-1min'
    return f'{minutes - 1}-{minutes}min' if minutes <= 20 else '20min+'

for video_id in tqdm(video_in_panda):
    video_info = panad_70M_datas_mapping[video_id]

    # Video duration = end timestamp of the last Panda-70M clip.
    # NOTE(review): `timestamp` is a Python-literal string; eval() works
    # but ast.literal_eval would be safer.
    end_time = eval(video_info['timestamp'])[-1][-1]
    duration = time_to_seconds(end_time)

    clips_info_openvid = openvid_ytb_datas_mapping[video_id]
    # Keep only videos that have at least one acceptable clip.
    if not any(check_clip(clip) for clip in clips_info_openvid['clips']):
        continue

    # BUG FIX: the original appended the stale `data` variable leaked from a
    # previous cell; the sampling cell below iterates these as video ids.
    duration_categories[duration_bucket(duration)].append(video_id)

for category, videos in duration_categories.items():
    print(f"{category}: {len(videos)} videos")
# --- cell: sample two duration groups and export them as CSV ---
target_duration_group_1 = ['5-6min', '6-7min', '7-8min', '8-9min', '9-10min']
target_duration_group_2 = ['10-11min', '11-12min', '12-13min', '13-14min', '14-15min']

def get_target_video_id(duration_category, target_size=10000):
    """Sample up to `target_size` video ids per bucket; attach id/bucket/url."""
    all_datas = []
    for category in tqdm(duration_category):
        bucket_video_ids = duration_categories[category]
        random.shuffle(bucket_video_ids)  # in-place, as in the original
        for video_id in bucket_video_ids[:target_size]:
            record = openvid_ytb_datas_mapping[video_id]
            record['video_id'] = video_id
            record['duration_category'] = category
            record['url'] = panad_70M_datas_mapping[video_id]['url']
            all_datas.append(record)
    return all_datas

group_1_video_ids = get_target_video_id(target_duration_group_1)
print(f'size: {len(group_1_video_ids)}')
group_2_video_ids = get_target_video_id(target_duration_group_2)
print(f'size: {len(group_2_video_ids)}')

from pandas import DataFrame

save_dir = '/share/minghao/VideoProjects/Sythesis2/Candidates'
save_name = 'openvid_5_10min_clip10_120_38k.csv'
save_path = os.path.join(save_dir, save_name)
group_1_df = DataFrame(group_1_video_ids)
group_1_df.to_csv(save_path, index=False)

save_name = 'openvid_10_15min_clip10_120_31k.csv'
save_path = os.path.join(save_dir, save_name)
group_2_df = DataFrame(group_2_video_ids)
group_2_df.to_csv(save_path, index=False)

# --- cell (markdown): ## Fix durations ---

# --- cell: fix durations using the downloaded per-video metadata ---
from pandas import DataFrame
import pandas as pd

meta_dir = '/share_2/minghao/Datasets/OpenvidInfos'

save_dir = '/share/minghao/VideoProjects/Sythesis2/Candidates'
save_name = 'openvid_10_15min_clip10_120_31k.csv'  # or openvid_5_10min_clip10_120_38k.csv
save_path = os.path.join(save_dir, save_name)
df = pd.read_csv(save_path)
datas = df.to_dict('records')

new_datas = []
for data in tqdm(datas):
    meta_path = os.path.join(meta_dir, data['video_id'] + '.json')
    if not os.path.exists(meta_path):
        continue  # metadata was never downloaded for this video
    meta_data = load_json(meta_path)
    data['duration'] = meta_data['yt_meta_dict']['info']['duration']
    new_datas.append(data)

print(f'new size:{len(new_datas)}')

save_name = 'openvid_10_15min_clip10_120_31k.json'
save_path = os.path.join(save_dir, save_name)
save_json(new_datas, save_path)

# --- cell (markdown): ## Re-bucket by corrected duration and re-filter clips ---

# --- cell: re-bucket and re-filter ---
new_datas_path = './Candidates/openvid_revised_duration_68k.json'
new_datas = load_json(new_datas_path)

def add_timestamp(clip):
    """Derive [start, end] seconds from the clip filename.

    NOTE(review): defined but never called below — kept for reference.
    """
    timestamp_str = clip['video'].split('.mp4')[0].split('_')[-1]
    start_time = int(timestamp_str.split('to')[0]) / clip['fps']
    return [start_time, start_time + clip['seconds']]

def check_clip(clip_info, video_duration):
    """Return (duration_ok, position) for one clip.

    Same rules as the miradata version, except `timestamp` is already a
    [start, end] list here (attached in the loop below), so no eval.
    """
    clip_duration = clip_info['seconds']
    timestamp = clip_info['timestamp']

    start_time = time_to_seconds(timestamp[0])
    end_time = time_to_seconds(timestamp[-1])

    be_window_len = video_duration / 6
    begin_window_end_time = be_window_len
    end_window_start_time = video_duration - be_window_len

    duration_flag = 10 <= clip_duration <= 120

    long_enough = clip_duration >= be_window_len // 2
    if (start_time >= end_window_start_time or end_time <= begin_window_end_time) and long_enough:
        clip_position = 'be'
    else:
        clip_position = 'mid'

    return (duration_flag, clip_position)


# One bucket per minute from 0-1min through 19-20min, plus 20min+.
duration_categories = {f'{i}-{i + 1}min': [] for i in range(20)}
duration_categories['20min+'] = []

import math

def duration_bucket(duration):
    """Label of the minute bucket (k-1, k] containing `duration` seconds;
    replaces the original 21-branch elif ladder with identical boundaries."""
    minutes = math.ceil(duration / 60)
    if minutes <= 1:
        return '0-1min'
    return f'{minutes - 1}-{minutes}min' if minutes <= 20 else '20min+'

for data in tqdm(new_datas):
    video_id = data['video_id']
    duration = data['duration']
    # 'clips' was serialized to a Python-literal string when stored via CSV.
    # NOTE(review): eval on stored data — consider ast.literal_eval.
    clips_info = eval(data['clips'])

    new_clips_info = []
    for clip in clips_info:
        clip_name = clip['video']
        # '<video_id>_<clip_id>_<start>to<end>.mp4' -> clip_id indexes the
        # Panda-70M timestamp list for this video.
        trimmed = clip_name[:clip_name.rfind('_')]
        clip_id = int(trimmed[trimmed.rfind('_') + 1:])
        clip['timestamp'] = eval(panad_70M_datas_mapping[video_id]['timestamp'])[clip_id]

        duration_flag, clip_position = check_clip(clip, duration)
        if duration_flag:
            clip['clip_position'] = clip_position
            new_clips_info.append(clip)

    if len(new_clips_info) == 0:
        continue  # drop videos left with no usable clip

    data['clips'] = new_clips_info
    duration_categories[duration_bucket(duration)].append(data)

for category, videos in duration_categories.items():
    print(f"{category}: {len(videos)} videos")

# --- cell: sample the two duration groups from the re-bucketed data ---
target_duration_group_1 = ['5-6min', '6-7min', '7-8min', '8-9min', '9-10min']
target_duration_group_2 = ['10-11min', '11-12min', '12-13min', '13-14min', '14-15min']

def get_target_video_id(duration_category, target_size=10000):
    """Sample up to `target_size` records per bucket, tagging the bucket."""
    all_datas = []
    for category in tqdm(duration_category):
        bucket = duration_categories[category]
        random.shuffle(bucket)
        for data in bucket[:target_size]:
            data['duration_category'] = category
            all_datas.append(data)
    return all_datas

group_1_video_ids = get_target_video_id(target_duration_group_1)
print(f'size: {len(group_1_video_ids)}')
group_2_video_ids = get_target_video_id(target_duration_group_2)
print(f'size: {len(group_2_video_ids)}')

# --- cell: persist the sampled groups ---
save_json(group_1_video_ids, './Candidates/openvid_ytb_34k_5_10min_filter_clips.json')
save_json(group_2_video_ids, './Candidates/openvid_ytb_32k_10_15min_filter_clips.json')

# --- cell (markdown): # panda70M ---
# Select videos for caption synthesis.
"cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from pandas import DataFrame\n", "\n", "panad_70M = '/share_2/minghao/Datasets/Panda70M/panda70m_training_full.csv'\n", "df = pd.read_csv(panad_70M)\n", "panad_70M_datas = df.to_dict('records')\n", "\n" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "random.shuffle(panad_70M_datas)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ " 0%| | 0/800000 [00:00