In [1]:
# Tool
import json
import os
import random
import shutil
from tqdm import tqdm
import re
import pandas as pd


def load_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        path: Path of the file; every non-blank line must hold one JSON document.

    Returns:
        list: The decoded objects, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    datas = []
    # Decode explicitly as UTF-8 instead of relying on the platform default.
    with open(path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            # Blank / whitespace-only lines carry no record; json.loads('') would raise.
            if not line:
                continue
            datas.append(json.loads(line))
    return datas

def load_jsonl_fromdir(res_dir):
    """Load every entry of a directory as JSONL and concatenate the records.

    Args:
        res_dir: Directory whose entries are all passed to ``load_jsonl``.

    Returns:
        list: Records of all files, files visited in sorted-name order.
    """
    # Resolve all paths up front, in sorted name order, before reading anything.
    jsonl_paths = [os.path.join(res_dir, file_name) for file_name in sorted(os.listdir(res_dir))]

    merged = []
    for jsonl_path in jsonl_paths:
        merged += load_jsonl(jsonl_path)
    return merged

def load_json(path):
    """Parse the JSON document stored at ``path`` and return the decoded object."""
    with open(path, 'r') as json_file:
        return json.load(json_file)

def save_json(datas, path, indent=4):
    """Serialize ``datas`` as JSON into ``path``, overwriting any existing file.

    Args:
        datas: JSON-serializable object.
        path: Destination file path.
        indent: Indentation passed to ``json.dump`` (default 4).
    """
    with open(path, 'w') as out_file:
        json.dump(datas, fp=out_file, indent=indent)

def parse(generated_text):
    """Decode a model-generated JSON / Python-literal payload into a Python object.

    Steps:
      1. Strip whitespace and, when present, a surrounding ```json fence.
      2. Try strict JSON first, so ``true`` / ``false`` / ``null`` decode correctly.
      3. Fall back to ``eval`` for Python-literal style text (single-quoted dicts).
      4. If that fails, patch the quoting around the ``Q`` / ``A`` fields and ``eval`` again.

    Args:
        generated_text: Raw text holding the payload.

    Returns:
        The decoded object.

    Raises:
        Exception: Whatever the final ``eval`` raises when the text is still not parseable.
    """
    generated_text = generated_text.strip()
    if "```json" in generated_text:
        generated_text = re.sub(r"^```json\s*|\s*```$", "", generated_text)

    try:
        return json.loads(generated_text)
    except ValueError:
        # Not strict JSON (json.JSONDecodeError subclasses ValueError); try the literal path.
        pass

    # SECURITY NOTE(review): eval executes arbitrary code. It is kept so payloads the
    # original accepted still parse; only feed trusted text, or switch to
    # ast.literal_eval once every payload is known to be a pure literal.
    try:
        data = eval(generated_text)
    except Exception:
        # Apostrophes inside single-quoted values break the literal; re-quote the
        # known 'Q' / 'A' field delimiters with double quotes and retry.
        repaired = (
            generated_text
            .replace('\'Q\': \'', "\"Q\": \"")
            .replace('\', \'A\': \'', "\", \"A\": \"")
            .replace('\'}', "\"}")
        )
        data = eval(repaired)

    return data

def formating_conversations(data):
    """Turn one multiple-choice QA record into a two-turn conversation.

    Args:
        data: Mapping with ``'Q'`` (question text), ``'Options'`` (iterable of
            option strings) and ``'Answer'`` (answer text).

    Returns:
        list: ``[human_turn, gpt_turn]`` dicts with ``"from"`` / ``"value"`` keys.
        The human value is ``'<image>\\n'`` + question + newline + the options
        joined by newlines; the gpt value is the answer.
    """
    prompt = data['Q'] + '\n' + '\n'.join(data['Options'])
    reply = data['Answer']

    turns = (("human", '<image>\n' + prompt), ("gpt", reply))
    return [{"from": speaker, "value": text} for speaker, text in turns]

def time_to_seconds(time_str):
    """Convert an ``H:MM:SS[.fraction]`` timestamp string to seconds.

    The digits after the dot are a decimal fraction of a second of any length
    (``.5`` is 0.5 s, ``.500`` is 0.5 s). The previous code always divided them
    by 1000, so only exactly three digits were read correctly.

    Args:
        time_str: Timestamp such as ``'0:01:02.500'`` or ``'1:00:00'``.

    Returns:
        float: Total number of seconds.

    Raises:
        IndexError: If fewer than three colon-separated fields are present.
        ValueError: If a field is not numeric.
    """
    # Separate the clock part from the optional fractional-second digits.
    time_parts = time_str.split('.')
    fraction_digits = time_parts[1] if len(time_parts) == 2 else ''

    if fraction_digits:
        # Scale by the digit count; for three digits this equals int(digits) / 1000.
        fraction = int(fraction_digits) / 10 ** len(fraction_digits)
    else:
        # No dot, or a trailing dot with no digits.
        fraction = 0.0

    # Split the clock part by colon to get hours, minutes, and seconds.
    clock_parts = time_parts[0].split(':')
    hours = int(clock_parts[0])
    minutes = int(clock_parts[1])
    seconds = float(clock_parts[2])

    # Convert everything to seconds.
    return hours * 3600 + minutes * 60 + seconds + fraction

def get_datas_from_df(df_path):
    """Read a CSV file and return its rows as a list of ``{column: value}`` dicts."""
    return pd.read_csv(df_path).to_dict(orient='records')

def list_2_dict(datas, key='video_id'):
    """Group a list of records by the value stored under ``key``.

    The previous code ignored ``key`` and always grouped by ``'video_id'``; the
    default keeps that behaviour for existing callers.

    Args:
        datas: Iterable of dict-like records.
        key: Field whose value selects the group (default ``'video_id'``).

    Returns:
        dict: ``{group_value: [records...]}``, records kept in input order.

    Raises:
        KeyError: If a record has no ``key`` field.
    """
    datas_dict = {}
    for data in tqdm(datas, desc='list_2_dict'):
        datas_dict.setdefault(data[key], []).append(data)

    return datas_dict

# miradata

In [None]:
视频来源分布：
source: youtube, size: 158525
source: pexels, size: 57075
source: videvo, size: 9934
source: pixabay, size: 10

## 构造候选数据
包括原视频的 video id, url, duration, clips

In [None]:
import pandas as pd
# Build the candidate set: YouTube-sourced videos whose metadata dict has been
# downloaded; the duration is attached to each record in a later cell.

# Full MiraData clip table; grouped by source video id so each video maps to its clip rows.
orig_datas = get_datas_from_df('/share/minghao/VideoProjects/Sythesis2/Candidates/miradata_v1_330k.csv')
print(f'orig datas size: {len(orig_datas)}')
orig_datas_dict = list_2_dict(orig_datas, 'video_id')

# YouTube subset table (schema not visible here — presumably one row per clip; TODO confirm).
datas = get_datas_from_df('/share/minghao/VideoProjects/Sythesis2/Candidates/miradata_youtube.csv')
print(f'datas size: {len(datas)}')

  df = pd.read_csv(df_path)


orig datas size: 330313


list_2_dict: 100%|██████████| 330313/330313 [00:00<00:00, 1178135.49it/s]


datas size: 158525


In [None]:
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

# Assumes load_json and the other helpers are already defined in the kernel.

meta_dir = '/share_2/minghao/Datasets/MiraInfos'
new_datas = []

def process_data(data):
    """Attach the downloaded duration and the clip rows to one candidate record.

    Reads the kernel globals ``orig_datas_dict`` and ``meta_dir``.

    Args:
        data: Record with a ``'video_id'`` field; mutated in place.

    Returns:
        The same ``data`` with ``'duration'`` and ``'clips'`` set, or ``None``
        when ``<meta_dir>/<video_id>.json`` does not exist.

    Raises:
        KeyError: If the video id is missing from ``orig_datas_dict``.
        AssertionError: If the metadata file belongs to another video id.
    """
    vid = data['video_id']
    # Looked up before the file check, so an unknown id raises even if no metadata exists.
    video_clips = orig_datas_dict[vid]
    info_path = os.path.join(meta_dir, vid + '.json')

    # No downloaded metadata: the caller drops this record.
    if not os.path.exists(info_path):
        return None

    info = load_json(info_path)
    assert info['video_id'] == vid

    # Store the video duration from the metadata together with the clip rows.
    data.update(duration=info['yt_meta_dict']['info']['duration'], clips=video_clips)
    return data

# Process the records in parallel with a thread pool.
with ThreadPoolExecutor() as executor:
    # Submit one task per record; the dict maps each future back to its input record.
    futures = {executor.submit(process_data, data): data for data in datas}
    
    # Collect results as tasks finish (completion order, not input order); tqdm shows progress.
    for future in tqdm(as_completed(futures), total=len(futures)):
        # .result() re-raises any exception raised inside process_data.
        result = future.result()
        # None means the metadata file was missing; drop the record.
        if result is not None:
            new_datas.append(result)

# Print how many records were kept.
print(f'Total size: {len(new_datas)}')


100%|██████████| 158525/158525 [09:17<00:00, 284.11it/s]

Total size: 139565





In [None]:

# Persist the enriched candidate records so the thread-pool cell need not be re-run.
save_json(new_datas, '/share/minghao/VideoProjects/Sythesis2/Candidates/miradata_youtube_140k.json')

## 筛选合适的数据
- clip 的时长，clip 的位置
- 视频的时长


In [None]:
# Reload the saved candidate records; this rebinds the kernel-global new_datas.
new_datas = load_json('/share/minghao/VideoProjects/Sythesis2/Candidates/miradata_youtube_140k.json')

In [None]:
def check_clip(clip_info, video_duration):
    """Rate one clip by its length and by where it sits inside the source video.

    The video is cut into sixths. A clip is labelled ``'be'`` when it starts at
    or after the last sixth, or ends at or before the first sixth, and lasts at
    least ``floor(sixth / 2)`` seconds. Every other clip is ``'mid'``.

    Args:
        clip_info: Mapping with ``'seconds'`` (clip length) and ``'timestamp'``
            (string form of a sequence whose first and last items are
            ``H:MM:SS[.fraction]`` strings).
        video_duration: Length of the source video in seconds.

    Returns:
        tuple: ``(duration_flag, clip_position)``. ``duration_flag`` is ``True``
        when ``10 <= seconds <= 120``; ``clip_position`` is ``'be'`` or ``'mid'``.
    """
    clip_duration = clip_info['seconds']
    # The timestamp was previously evaluated twice on two identical lines; once is enough.
    # SECURITY NOTE(review): eval runs arbitrary code. It is acceptable only while the
    # column comes from the trusted dataset CSV; consider ast.literal_eval.
    timestamp = eval(clip_info['timestamp'])

    start_time = time_to_seconds(timestamp[0])
    end_time = time_to_seconds(timestamp[-1])

    # Width of the "begin" and "end" windows: one sixth of the video.
    be_window_len = video_duration / 6
    begin_window_end_time = 0 + be_window_len
    end_window_start_time = video_duration - be_window_len

    duration_flag = bool(10 <= clip_duration <= 120)

    # The clip must fill at least half a window (floored) to count as begin/end.
    fills_window = clip_duration >= be_window_len // 2
    in_end_window = start_time >= end_window_start_time
    in_begin_window = end_time <= begin_window_end_time

    if fills_window and (in_end_window or in_begin_window):
        clip_position = 'be'
    else:
        clip_position = 'mid'

    return (duration_flag, clip_position)


# Duration buckets: twenty one-minute bins ('0-1min' ... '19-20min') plus an open-ended '20min+'.
duration_categories = {f'{minute}-{minute + 1}min': [] for minute in range(20)}
duration_categories['20min+'] = []

# Keep only clips of acceptable length, tag them with their position, then bucket each video.
for data in tqdm(new_datas):
    # video_id / flag are unused in this cell; retained because they are notebook globals.
    video_id = data['video_id']
    duration = data['duration']

    clips_info = data['clips']

    flag = False
    new_clips_info = []
    for clip in clips_info:
        duration_flag, clip_position = check_clip(clip, duration)
        if not duration_flag:
            continue
        clip['clip_position'] = clip_position
        new_clips_info.append(clip)

    # Videos with no acceptable clip are not bucketed (their 'clips' entry stays unfiltered).
    if not new_clips_info:
        continue

    data['clips'] = new_clips_info

    # Bucket by duration: first one-minute bin whose upper bound (inclusive) covers it.
    category_name = '20min+'
    for minute in range(20):
        if duration <= 60 * (minute + 1):
            category_name = f'{minute}-{minute + 1}min'
            break
    duration_categories[category_name].append(data)

# Print the size of every bucket.
for category, videos in duration_categories.items():
    print(f"{category}: {len(videos)} videos")

# Save the result.
# NOTE(review): this writes all of new_datas, including videos skipped above whose
# clips were not filtered — confirm that is intended.
save_json(new_datas, '/share/minghao/VideoProjects/Sythesis2/Candidates/miradata_youtube_140k_filter_clips.json')

target_duration_group_1 = [f'{minute}-{minute + 1}min' for minute in range(5, 10)]
target_duration_group_2 = [f'{minute}-{minute + 1}min' for minute in range(10, 15)]

def get_target_video_id(duration_category, target_size=10000):
    """Draw up to ``target_size`` random records from each named duration bucket.

    Reads the kernel global ``duration_categories``. Each bucket list is shuffled
    in place with the unseeded ``random`` module, so results differ between runs.

    Args:
        duration_category: Iterable of bucket names.
        target_size: Maximum number of records taken per bucket.

    Returns:
        list: Selected records, each tagged in place with ``'duration_category'``.
    """
    selected = []
    for bucket_name in tqdm(duration_category):
        bucket = duration_categories[bucket_name]
        random.shuffle(bucket)
        picked = bucket[:target_size]
        for record in picked:
            record['duration_category'] = bucket_name
        selected.extend(picked)

    return selected

# Sample the 5-10 min buckets, then the 10-15 min buckets (lists of records, despite the *_ids names).
group_1_video_ids = get_target_video_id(target_duration_group_1)
print(f'size: {len(group_1_video_ids)}')
group_2_video_ids = get_target_video_id(target_duration_group_2)
print(f'size: {len(group_2_video_ids)}')


100%|██████████| 139565/139565 [00:03<00:00, 42940.60it/s]


0-1min: 1369 videos
1-2min: 9997 videos
2-3min: 17451 videos
3-4min: 13403 videos
4-5min: 10488 videos
5-6min: 8879 videos
6-7min: 7236 videos
7-8min: 6566 videos
8-9min: 5100 videos
9-10min: 3655 videos
10-11min: 5620 videos
11-12min: 3901 videos
12-13min: 3236 videos
13-14min: 2666 videos
14-15min: 2329 videos
15-16min: 1994 videos
16-17min: 1524 videos
17-18min: 1328 videos
18-19min: 1081 videos
19-20min: 906 videos
20min+: 6856 videos


100%|██████████| 5/5 [00:00<00:00, 221.48it/s]


size: 31436


100%|██████████| 5/5 [00:00<00:00, 406.48it/s]

size: 17752





In [None]:
# Save both sampled groups.
save_json(group_1_video_ids, '/share/minghao/VideoProjects/Sythesis2/Candidates/miradata_youtube_31k_5_10min_filter_clips.json')
save_json(group_2_video_ids, '/share/minghao/VideoProjects/Sythesis2/Candidates/miradata_youtube_17k_10_15min_filter_clips.json')

# openvid

## 首先基于 panda70M 对openvid进行初步筛选

In [35]:
import pandas as pd
from pandas import DataFrame

# Load the Panda-70M training CSV as a list of row dicts.
panad_70M = '/share_2/minghao/Datasets/Panda70M/panda70m_training_full.csv'
df = pd.read_csv(panad_70M)
panad_70M_datas = df.to_dict('records')

# Index the rows by their 'videoID' column; a later row with the same id overwrites an earlier one.
panad_70M_datas_mapping = {}
for data in tqdm(panad_70M_datas):
    video_id = data['videoID']
    panad_70M_datas_mapping[video_id] = data

In [None]:
# Load the OpenVid-1M clip table as a list of row dicts.
openvid_1M = '/share_2/mm_data_dir/data_1/OpenVid-1M/data/train/OpenVid-1M.csv'
df = pd.read_csv(openvid_1M)
openvid_1M_datas = df.to_dict('records')

ytb_datas_video_ids = []
openvid_ytb_datas_mapping = {}

# Group clip rows by their source video id.
for data in tqdm(openvid_1M_datas):
    video_id = data['video']
    # NOTE(review): the substring test also matches any file name that merely
    # contains "to"; such rows are only dropped later by the Panda-70M lookup.
    if 'to' in video_id:
        # Presumably names look like "<source_id>_<clip_idx>_<start>to<end>.mp4"
        # (TODO confirm); drop the last two underscore-separated fields.
        pos1 = video_id.rfind('_')
        video_id = video_id[:pos1]
        pos2 = video_id.rfind('_')
        orig_video_id = video_id[:pos2]
        ytb_datas_video_ids.append(orig_video_id)
        if orig_video_id not in openvid_ytb_datas_mapping:
            openvid_ytb_datas_mapping[orig_video_id] = {'clips':[data]}
        else:
            openvid_ytb_datas_mapping[orig_video_id]['clips'].append(data)

# Clip count first, then the number of distinct source videos.
print(f'ytb size: {len(ytb_datas_video_ids)}')
ytb_datas_video_ids = set(ytb_datas_video_ids)
print(f'ytb size: {len(ytb_datas_video_ids)}')

# Keep only source videos that are indexed in Panda-70M.
video_in_panda = []
for video_id in tqdm(ytb_datas_video_ids):
    if video_id in panad_70M_datas_mapping:
        # Fix: store the id itself. The old code appended `data`, the stale last
        # row of the loop above, although later cells use the entries as ids.
        video_in_panda.append(video_id)

print(f'video_in_panda size: {len(video_in_panda)}')

100%|██████████| 1019957/1019957 [00:03<00:00, 326183.42it/s]


ytb size: 630697
ytb size: 332120


100%|██████████| 332120/332120 [00:00<00:00, 1078858.39it/s]

video_in_panda size: 332087





In [None]:
def check_clip(clip_info):
    """Return ``True`` when the clip's ``'seconds'`` value lies in ``[10, 120]``, else ``False``."""
    # Length rule for usable clips: at least 10 s and at most 120 s.
    clip_duration = clip_info['seconds']
    return bool(10 <= clip_duration <= 120)

# Duration buckets: twenty one-minute bins ('0-1min' ... '19-20min') plus an open-ended '20min+'.
duration_categories = {f'{minute}-{minute + 1}min': [] for minute in range(20)}
duration_categories['20min+'] = []

# video_in_panda is expected to be a list of video ids and panad_70M_datas_mapping a dict keyed by id.
for video_id in tqdm(video_in_panda):
    video_info = panad_70M_datas_mapping[video_id]

    # Use the end of the last Panda-70M segment, converted to seconds, as the duration estimate.
    # SECURITY NOTE(review): eval runs arbitrary code; fine only for the trusted dataset CSV.
    end_time = eval(video_info['timestamp'])[-1][-1]
    duration = time_to_seconds(end_time)

    clips_info_openvid = openvid_ytb_datas_mapping[video_id]

    # Keep the video only if at least one of its clips passes the length rule.
    flag = False
    for clip_info in clips_info_openvid['clips']:
        flag = check_clip(clip_info)
        if flag:
            break

    if not flag:
        continue

    # Bucket by duration: first one-minute bin whose upper bound (inclusive) covers it.
    category_name = '20min+'
    for minute in range(20):
        if duration <= 60 * (minute + 1):
            category_name = f'{minute}-{minute + 1}min'
            break
    # Fix: store the video id. The old code appended `data`, a stale variable from
    # an earlier cell, although the buckets are later read as lists of video ids.
    duration_categories[category_name].append(video_id)

# Print the size of every bucket.
for category, videos in duration_categories.items():
    print(f"{category}: {len(videos)} videos")


100%|██████████| 332087/332087 [00:44<00:00, 7533.26it/s]

0-1min: 4014 videos
1-2min: 7115 videos
2-3min: 9791 videos
3-4min: 9214 videos
4-5min: 9103 videos
5-6min: 8616 videos
6-7min: 7882 videos
7-8min: 7385 videos
8-9min: 6902 videos
9-10min: 7111 videos
10-11min: 8128 videos
11-12min: 6859 videos
12-13min: 6321 videos
13-14min: 5288 videos
14-15min: 4805 videos
15-16min: 4072 videos
16-17min: 3510 videos
17-18min: 3021 videos
18-19min: 2613 videos
19-20min: 2090 videos
20min+: 5972 videos





In [None]:
target_duration_group_1 = [f'{minute}-{minute + 1}min' for minute in range(5, 10)]
target_duration_group_2 = [f'{minute}-{minute + 1}min' for minute in range(10, 15)]

def get_target_video_id(duration_category, target_size=10000):
    """Sample up to ``target_size`` video ids per bucket and return their OpenVid entries.

    Reads the kernel globals ``duration_categories`` (bucket name -> video ids),
    ``openvid_ytb_datas_mapping`` and ``panad_70M_datas_mapping``. Bucket lists
    are shuffled in place with the unseeded ``random`` module.

    Args:
        duration_category: Iterable of bucket names.
        target_size: Maximum number of videos taken per bucket.

    Returns:
        list: The ``openvid_ytb_datas_mapping`` entries of the sampled videos,
        each extended in place with ``'video_id'``, ``'duration_category'`` and ``'url'``.
    """
    selected = []
    for bucket_name in tqdm(duration_category):
        bucket_ids = duration_categories[bucket_name]
        random.shuffle(bucket_ids)
        for vid in bucket_ids[:target_size]:
            entry = openvid_ytb_datas_mapping[vid]
            source_url = panad_70M_datas_mapping[vid]['url']
            # Annotates the shared mapping entry itself, not a copy.
            entry.update(video_id=vid, duration_category=bucket_name, url=source_url)
            selected.append(entry)

    return selected

# Sample the 5-10 min buckets, then the 10-15 min buckets.
group_1_video_ids = get_target_video_id(target_duration_group_1)
print(f'size: {len(group_1_video_ids)}')
group_2_video_ids = get_target_video_id(target_duration_group_2)
print(f'size: {len(group_2_video_ids)}')

from pandas import DataFrame

# Write each group to CSV. The 'clips' lists are stored as their string form, so
# readers must parse that column back into Python objects.
save_dir = '/share/minghao/VideoProjects/Sythesis2/Candidates'
save_name = 'openvid_5_10min_clip10_120_38k.csv'
save_path = os.path.join(save_dir, save_name)
group_1_df = DataFrame(group_1_video_ids)
group_1_df.to_csv(save_path, index=False)

save_name = 'openvid_10_15min_clip10_120_31k.csv'
save_path = os.path.join(save_dir, save_name)
group_2_df = DataFrame(group_2_video_ids)
group_2_df.to_csv(save_path, index=False)

100%|██████████| 5/5 [00:00<00:00, 63.58it/s]


size: 37896


100%|██████████| 5/5 [00:00<00:00, 73.74it/s]

size: 31401





## 修正duration

In [11]:
# 修正duration

from pandas import DataFrame
import pandas as pd

# Directory holding one downloaded '<video_id>.json' metadata file per video.
meta_dir = '/share_2/minghao/Datasets/OpenvidInfos'

# Input: the 10-15 min candidate CSV written earlier.
save_dir = '/share/minghao/VideoProjects/Sythesis2/Candidates'
save_name = 'openvid_10_15min_clip10_120_31k.csv' # openvid_10_15min_clip10_120_31k.csv
save_path = os.path.join(save_dir, save_name)
df = pd.read_csv(save_path)
datas = df.to_dict('records')

# Attach the duration from the downloaded metadata; rows without a metadata file are dropped.
new_datas = []
for data in tqdm(datas):
    video_id = data['video_id']
    meta_path = os.path.join(meta_dir, video_id + '.json')
    if os.path.exists(meta_path) == False:
        continue
    meta_data = load_json(meta_path)
    duration = meta_data['yt_meta_dict']['info']['duration']
    data['duration'] = duration
    new_datas.append(data)

print(f'new size:{len(new_datas)}')

# Output: same base name with a .json extension.
save_name = 'openvid_10_15min_clip10_120_31k.json'
save_path = os.path.join(save_dir, save_name)
save_json(new_datas, save_path)

100%|██████████| 31401/31401 [01:49<00:00, 286.69it/s]


new size:30857


## 在修正 duration 后，重新调整 duration 分类和 clips

In [None]:
# Load the duration-corrected OpenVid candidates.
# NOTE(review): no visible cell writes this file — presumably a merge of the
# per-group revised files; confirm its provenance.
new_datas_path = './Candidates/openvid_revised_duration_68k.json'
new_datas = load_json(new_datas_path)

def add_timestamp(clip):
    """Derive ``[start, end]`` in seconds from a clip's file name, fps and length.

    The text after the last underscore of the file name (``.mp4`` suffix removed)
    is split on ``'to'``; its first part is taken as the start frame index.

    Args:
        clip: Mapping with ``'video'`` (file name), ``'fps'`` and ``'seconds'``.

    Returns:
        list: ``[start_frame / fps, start_frame / fps + seconds]``.
    """
    stem = clip['video'].split('.mp4')[0]
    frame_span = stem.split('_')[-1]
    first_frame = int(frame_span.split('to')[0])
    begin = first_frame / clip['fps']
    return [begin, begin + clip['seconds']]
    

def check_clip(clip_info, video_duration):
    """Rate one clip by its length and by where it sits inside the source video.

    The video is cut into sixths. A clip is labelled ``'be'`` when it starts at
    or after the last sixth, or ends at or before the first sixth, and lasts at
    least ``floor(sixth / 2)`` seconds. Every other clip is ``'mid'``.

    Args:
        clip_info: Mapping with ``'seconds'`` (clip length) and ``'timestamp'``
            (sequence whose first and last items are ``H:MM:SS[.fraction]`` strings).
        video_duration: Length of the source video in seconds.

    Returns:
        tuple: ``(duration_flag, clip_position)``. ``duration_flag`` is ``True``
        when ``10 <= seconds <= 120``; ``clip_position`` is ``'be'`` or ``'mid'``.
    """
    length = clip_info['seconds']
    span = clip_info['timestamp']

    clip_start = time_to_seconds(span[0])
    clip_end = time_to_seconds(span[-1])

    # Width of the "begin" and "end" windows: one sixth of the video.
    window = video_duration / 6
    head_limit = 0 + window
    tail_limit = video_duration - window

    length_ok = bool(10 <= length <= 120)

    # The clip must fill at least half a window (floored) to count as begin/end.
    if clip_start >= tail_limit and length >= window // 2:
        position = 'be'
    elif clip_end <= head_limit and length >= window // 2:
        position = 'be'
    else:
        position = 'mid'

    return (length_ok, position)


# Duration buckets: twenty one-minute bins ('0-1min' ... '19-20min') plus an open-ended '20min+'.
duration_categories = {f'{minute}-{minute + 1}min': [] for minute in range(20)}
duration_categories['20min+'] = []

# Re-filter the clips and re-bucket every video using the corrected duration.
for data in tqdm(new_datas):
    video_id = data['video_id']

    duration = data['duration']
    # 'clips' is stored as the string form of a list of dicts.
    # SECURITY NOTE(review): eval runs arbitrary code; fine only for trusted files.
    clips_info = eval(data['clips'])

    # flag is unused in this cell; retained because it is a notebook global.
    flag = False
    new_clips_info = []
    for clip in clips_info:
        clip_name = clip['video']

        # The field before the last underscore-separated field is used as the
        # index into the video's Panda-70M timestamp list.
        pos1 = clip_name.rfind('_')
        sub_clip_name = clip_name[:pos1]
        pos2 = sub_clip_name.rfind('_')
        clip_id = int(sub_clip_name[pos2+1:])
        timestamp = eval(panad_70M_datas_mapping[video_id]['timestamp'])[clip_id]
        clip['timestamp'] = timestamp

        duration_flag, clip_position = check_clip(clip, duration)
        if not duration_flag:
            continue
        clip['clip_position'] = clip_position
        new_clips_info.append(clip)

    # Videos with no acceptable clip are not bucketed.
    if not new_clips_info:
        continue

    data['clips'] = new_clips_info

    # Bucket by duration: first one-minute bin whose upper bound (inclusive) covers it.
    category_name = '20min+'
    for minute in range(20):
        if duration <= 60 * (minute + 1):
            category_name = f'{minute}-{minute + 1}min'
            break
    duration_categories[category_name].append(data)

# Print the size of every bucket.
for category, videos in duration_categories.items():
    print(f"{category}: {len(videos)} videos")


In [59]:

# Sample the target duration groups.
target_duration_group_1 = [f'{minute}-{minute + 1}min' for minute in range(5, 10)]
target_duration_group_2 = [f'{minute}-{minute + 1}min' for minute in range(10, 15)]

def get_target_video_id(duration_category, target_size=10000):
    """Draw up to ``target_size`` random records from each named duration bucket.

    Reads the kernel global ``duration_categories``. Each bucket list is shuffled
    in place with the unseeded ``random`` module, so results differ between runs.

    Args:
        duration_category: Iterable of bucket names.
        target_size: Maximum number of records taken per bucket.

    Returns:
        list: Selected records, each tagged in place with ``'duration_category'``.
    """
    selected = []
    for bucket_name in tqdm(duration_category):
        bucket = duration_categories[bucket_name]
        random.shuffle(bucket)
        picked = bucket[:target_size]
        for record in picked:
            record['duration_category'] = bucket_name
        selected.extend(picked)

    return selected

# Sample the 5-10 min buckets, then the 10-15 min buckets (lists of records, despite the *_ids names).
group_1_video_ids = get_target_video_id(target_duration_group_1)
print(f'size: {len(group_1_video_ids)}')
group_2_video_ids = get_target_video_id(target_duration_group_2)
print(f'size: {len(group_2_video_ids)}')


100%|██████████| 5/5 [00:00<00:00, 182.87it/s]


size: 33946


100%|██████████| 5/5 [00:00<00:00, 184.37it/s]

size: 32316





In [61]:
# Save the results (paths are relative to the notebook's working directory).
save_json(group_1_video_ids, './Candidates/openvid_ytb_34k_5_10min_filter_clips.json')
save_json(group_2_video_ids, './Candidates/openvid_ytb_32k_10_15min_filter_clips.json')

# panda70M 
筛选视频用于合成caption

In [2]:
import pandas as pd
from pandas import DataFrame

# Load the Panda-70M training CSV as a list of row dicts.
panad_70M = '/share_2/minghao/Datasets/Panda70M/panda70m_training_full.csv'
df = pd.read_csv(panad_70M)
panad_70M_datas = df.to_dict('records')



In [10]:
# Shuffle in place so that a leading slice of the rows is a random subset.
# Unseeded: the subset differs between runs.
random.shuffle(panad_70M_datas)

In [11]:
import random

# Duration buckets: twenty one-minute bins ('0-1min' ... '19-20min') plus an open-ended '20min+'.
duration_categories = {f'{minute}-{minute + 1}min': [] for minute in range(20)}
duration_categories['20min+'] = []

# Only the first check_limit rows of panad_70M_datas are scanned.
check_limit = 800000

for data in tqdm(panad_70M_datas[:check_limit]):

    # Use the end of the last segment, converted to seconds, as the duration estimate.
    # SECURITY NOTE(review): eval runs arbitrary code; fine only for the trusted dataset CSV.
    end_time = eval(data['timestamp'])[-1][-1]
    duration = time_to_seconds(end_time)

    # Bucket by duration: first one-minute bin whose upper bound (inclusive) covers it.
    category_name = '20min+'
    for minute in range(20):
        if duration <= 60 * (minute + 1):
            category_name = f'{minute}-{minute + 1}min'
            break
    duration_categories[category_name].append(data)

# Print the size of every bucket.
for category, videos in duration_categories.items():
    print(f"{category}: {len(videos)} videos")


  0%|          | 0/800000 [00:00<?, ?it/s]

100%|██████████| 800000/800000 [00:50<00:00, 15985.70it/s]

0-1min: 90574 videos
1-2min: 124476 videos
2-3min: 115673 videos
3-4min: 85111 videos
4-5min: 67676 videos
5-6min: 56215 videos
6-7min: 44637 videos
7-8min: 37226 videos
8-9min: 27362 videos
9-10min: 24169 videos
10-11min: 24558 videos
11-12min: 18850 videos
12-13min: 15558 videos
13-14min: 12831 videos
14-15min: 11276 videos
15-16min: 8808 videos
16-17min: 7424 videos
17-18min: 6045 videos
18-19min: 5012 videos
19-20min: 3956 videos
20min+: 12563 videos





In [12]:
target_duration_group_1 = [f'{minute}-{minute + 1}min' for minute in range(5, 10)]
target_duration_group_2 = [f'{minute}-{minute + 1}min' for minute in range(10, 15)]

def get_target_video_id(duration_category, target_size=10000):
    """Take the first ``target_size`` rows of each named bucket, concatenated.

    Reads the kernel global ``duration_categories``; no shuffling happens here.

    Args:
        duration_category: Iterable of bucket names.
        target_size: Maximum number of rows taken per bucket.

    Returns:
        list: Selected rows, in bucket order.
    """
    return [
        row
        for bucket_name in tqdm(duration_category)
        for row in duration_categories[bucket_name][:target_size]
    ]

# NOTE: despite the group_1_* names, this selects target_duration_group_2
# (the 10-15 min buckets), which matches the output file name below.
group_1_video_ids = get_target_video_id(target_duration_group_2)
print(f'size: {len(group_1_video_ids)}')

from pandas import DataFrame

# Write the selected rows to CSV without the index column.
save_dir = '/share/minghao/VideoProjects/Sythesis2/LongCaption/Candidates'
save_name = 'panda70M_10_15min_50k.csv'
save_path = os.path.join(save_dir, save_name)
group_1_df = DataFrame(group_1_video_ids)
group_1_df.to_csv(save_path, index=False)


100%|██████████| 5/5 [00:00<00:00, 1460.82it/s]


size: 50000
