# Tool
import ast
import json
import os
import random
import re
import shutil

import pandas as pd

try:
    from tqdm import tqdm
except ImportError:  # keep the notebook runnable without tqdm: no-op wrapper
    def tqdm(iterable, **kwargs):
        return iterable


def load_jsonl(path):
    """Read a JSON-Lines file and return its records as a list of objects."""
    datas = []
    with open(path, 'r') as file:
        for line in file:
            datas.append(json.loads(line))
    return datas


def load_jsonl_fromdir(res_dir):
    """Load every JSONL file in `res_dir` (sorted by filename) into one flat list."""
    res_name = sorted(os.listdir(res_dir))
    res_paths = [os.path.join(res_dir, name) for name in res_name]

    datas = []
    for path in res_paths:
        datas.extend(load_jsonl(path))
    return datas


def load_json(path):
    """Read a single JSON document from `path`."""
    with open(path, 'r') as file:
        datas = json.load(file)
    return datas


def save_json(datas, path, indent=4):
    """Write `datas` to `path` as pretty-printed JSON."""
    with open(path, 'w') as file:
        json.dump(datas, file, indent=indent)


def parse(generated_text):
    """Parse an LLM response expected to contain a JSON/dict literal.

    Strips an optional ```json fence, then tries json.loads (double-quoted
    JSON), then ast.literal_eval (single-quoted Python-dict style), and
    finally a quote-repair pass for the common {'Q': '...', 'A': '...'}
    malformation. Raises ValueError/SyntaxError if nothing parses.

    Security fix: the original used eval() on model-generated text, which
    can execute arbitrary code; ast.literal_eval accepts the same dict/list
    literals but cannot run code.
    """
    generated_text = generated_text.strip()
    if "```json" in generated_text:
        generated_text = re.sub(r"^```json\s*|\s*```$", "", generated_text.strip())
    try:
        data = json.loads(generated_text)
    except (ValueError, SyntaxError):
        try:
            data = ast.literal_eval(generated_text)
        except (ValueError, SyntaxError):
            # Repair the recurring single-quote malformation, then retry.
            repaired = (generated_text
                        .replace('\'Q\': \'', "\"Q\": \"")
                        .replace('\', \'A\': \'', "\", \"A\": \"")
                        .replace('\'}', "\"}"))
            data = ast.literal_eval(repaired)

    return data


def formating_conversations(data):
    """Build a two-turn ShareGPT-style conversation from a QA record.

    `data` must provide 'Q' (question string), 'Options' (list of option
    strings) and 'Answer' keys. Returns the [human, gpt] message list.
    (Name kept as-is — 'formating' — for compatibility with existing callers.)
    """
    question = data['Q']
    options = data['Options']
    answer = data['Answer']

    question_inp = question + '\n' + '\n'.join(options)
    answer_inp = answer

    conversations = [
        {
            "from": "human",
            "value": '\n' + question_inp
        },
        {
            "from": "gpt",
            "value": answer_inp
        }
    ]

    return conversations


def time_to_seconds(time_str):
    """Convert 'HH:MM:SS' or 'HH:MM:SS.frac' to total seconds (float).

    Bug fix: the fractional part is now treated as a decimal fraction.
    The old code computed int(frac) / 1000, so '.5' became 0.005 s instead
    of 0.5 s and '.05' became 0.005 s instead of 0.05 s — only exactly
    three fractional digits were handled correctly.
    """
    time_parts = time_str.split('.')
    # '0.' + digits scales any number of fractional digits correctly;
    # mirror the original fallback (fraction ignored) for malformed input.
    fractional = float('0.' + time_parts[1]) if len(time_parts) == 2 else 0.0

    hours, minutes, seconds = time_parts[0].split(':')

    total_seconds = int(hours) * 3600 + int(minutes) * 60 + float(seconds) + fractional
    return total_seconds


def get_datas_from_df(df_path):
    """Read a CSV with pandas and return its rows as a list of dicts."""
    df = pd.read_csv(df_path)
    datas = df.to_dict('records')
    return datas


def list_2_dict(datas, key='video_id'):
    """Group records by the field named `key` into {value: [records...]}.

    Bug fix: `key` was accepted but ignored (the field name 'video_id' was
    hardcoded in the lookup). It is now actually used; the default is
    unchanged, so existing callers behave exactly as before.
    """
    datas_dict = {}
    for data in tqdm(datas, desc='list_2_dict'):
        datas_dict.setdefault(data[key], []).append(data)

    return datas_dict
# Load the Panda-70M training metadata CSV as a list of per-row dicts.
# NOTE(review): the saved output of this cell is a NameError — it was
# executed on a fresh kernel before the tool cell that defines
# get_datas_from_df. Run the tool cell first so Restart Kernel -> Run All
# succeeds.
# NOTE(review): `import pandas as pd` is redundant here (the tool cell
# already imports it), and the absolute path ties this notebook to one
# machine — consider a configurable DATA_DIR.
import pandas as pd

path = '/share_2/minghao/Datasets/Panda70M/panda70m_training_full.csv'
panda_70M_datas = get_datas_from_df(path)