hoho / generate_pcloud_dataset.py
jskvrna's picture
Final submission code
9518589
# This script exports the 'train' split of the 'usm3d/hoho25k' dataset as
# per-sample pickle files.
# For each sample it performs the following steps:
# 1. Reads the COLMAP reconstruction data.
# 2. Extracts the 3D point coordinates and their corresponding colors.
# 3. Retrieves the ground truth wireframe vertices and edges.
# 4. Saves the point cloud, colors, ground truth data, and sample ID to
#    'sample_<order_id>.pkl' in the specified output directory.
# A sample is skipped if its output file already exists or if its
# reconstruction contains no 3D points.
# The dataset is shuffled before processing, and the script reports the
# number of samples that were successfully processed and saved.
from datasets import load_dataset
from hoho2025.viz3d import *
import os
import numpy as np
import pickle
from utils import read_colmap_rec
from tqdm import tqdm
# Load the dataset. shuffle() is called without a seed, so the order in which
# samples are visited is not fixed by this script.
ds = load_dataset("usm3d/hoho25k", cache_dir="<CACHE_DIR_PLACEHOLDER>", trust_remote_code=True)
#ds = load_dataset("usm3d/hoho25k", cache_dir="<ALTERNATIVE_CACHE_DIR_PLACEHOLDER>", trust_remote_code=True)
ds = ds.shuffle()

# Create output directory
output_dir = "<OUTPUT_DIR_PLACEHOLDER>"
os.makedirs(output_dir, exist_ok=True)

# Number of samples written during this run (skipped samples are not counted).
counter = 0
for a in tqdm(ds['train'], desc="Processing dataset"):
    order_id = a['order_id']
    output_file = os.path.join(output_dir, f'sample_{order_id}.pkl')

    # Check for an existing output *before* parsing the COLMAP reconstruction,
    # so re-runs do not pay the parsing cost for already-exported samples.
    if os.path.exists(output_file):
        continue

    # Extract point cloud from COLMAP
    colmap = read_colmap_rec(a['colmap_binary'])
    points3d = colmap.points3D
    if len(points3d) == 0:
        continue

    # Convert to numpy arrays; both are built from the same values() iteration
    # order, so row i of the colors belongs to row i of the coordinates.
    point_coords = np.array([point.xyz for point in points3d.values()])
    point_colors = np.array([point.color for point in points3d.values()])

    # Get ground truth data
    gt_vertices = np.array(a['wf_vertices'])
    gt_connections = np.array(a['wf_edges'])

    sample_data = {
        'point_cloud': point_coords,
        'point_colors': point_colors,
        'gt_vertices': gt_vertices,
        'gt_connections': gt_connections,
        'sample_id': order_id
    }

    # Save as pickle file. Write to a per-process temporary file and rename it
    # into place: an interrupted run must not leave a truncated
    # 'sample_<id>.pkl' behind, because the existence check above would then
    # skip that sample forever.
    tmp_file = f'{output_file}.{os.getpid()}.tmp'
    try:
        with open(tmp_file, 'wb') as f:
            pickle.dump(sample_data, f)
        os.replace(tmp_file, output_file)
    finally:
        # After a successful replace the temp file no longer exists; this only
        # cleans up when dumping or renaming failed.
        if os.path.exists(tmp_file):
            os.remove(tmp_file)

    counter += 1

print(f"Generated {counter} samples in {output_dir}")