In [1]:
# ═══════════════════════════════════════════════════════════════════
# 🚀 HRHUB V2.1 - PRODUCTION NOTEBOOK
# Cell 1: Setup & Imports
# ═══════════════════════════════════════════════════════════════════

import warnings
# Install a blanket "ignore" filter: with no category/module arguments this
# silences every Python warning emitted after this point.
# NOTE(review): this also hides deprecation warnings from the libraries
# imported below — confirm that is intended before relying on this notebook.
warnings.filterwarnings('ignore')

# Core
import pandas as pd
import numpy as np
from pathlib import Path

# Embeddings
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Viz
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from pyvis.network import Network

# Dimensionality reduction
from sklearn.manifold import TSNE

# Utils
from tqdm import tqdm
import pickle
from typing import List, Dict, Tuple
import time

# Config: plotting style and pandas display limits for the rest of the notebook.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
pd.set_option('display.max_columns', None)  # None lifts the column limit for wide frames
pd.set_option('display.max_rows', 100)

# Sanity banner: confirms the import cell ran and records the core library
# versions. The emoji literals were stored as mis-decoded UTF-8 (mojibake) and
# printed as garbage; they are restored to the intended characters here.
print("✅ All imports successful!")
print(f"📦 Pandas: {pd.__version__}")
print(f"📦 Numpy: {np.__version__}")

āœ… All imports successful!
šŸ“¦ Pandas: 2.1.4
šŸ“¦ Numpy: 1.26.3


In [5]:
# ═══════════════════════════════════════════════════════════════════
# Cell 2: Paths & Configuration
# ═══════════════════════════════════════════════════════════════════

# 🟢 Local VS Code run - direct path.
# The path is relative, so it is resolved against the kernel's current working
# directory when the files are opened.
BASE_PATH = Path("data")

# Input paths: one CSV per dataset, keyed by dataset name. Every file is named
# "<key>.csv" directly under BASE_PATH, so the mapping is derived from the key
# list instead of repeating the path expression for each entry.
DATA_PATHS = {
    name: BASE_PATH / f"{name}.csv"
    for name in (
        'benefits',
        'companies',
        'company_industries',
        'company_specialties',
        'employee_counts',
        'industries',
        'job_industries',
        'job_skills',
        'postings',
        'resume_data',
        'salaries',
        'skills',
    )
}

# Output files (saved directly as .npy / .pkl)
OUTPUT_FILES = {
    'candidate_embeddings': 'candidate_embeddings.npy',
    'company_embeddings': 'company_embeddings.npy',
    'candidate_metadata': 'candidate_metadata.pkl',
    'company_metadata': 'company_metadata.pkl',
}

# Model config
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
# NOTE(review): presumably the embedding width produced by MODEL_NAME — confirm
# against the model card if the model is ever swapped.
EMBEDDING_DIM = 384

# The emoji literals below were stored as mis-decoded UTF-8 (mojibake) and
# printed as garbage; they are restored to the intended characters.
print("✅ Paths configured!")
print(f"📂 Base path: {BASE_PATH}")
print(f"🤖 Model: {MODEL_NAME}")

āœ… Paths configured!
šŸ“‚ Base path: data
šŸ¤– Model: sentence-transformers/all-MiniLM-L6-v2


In [6]:
# ═══════════════════════════════════════════════════════════════════
# Cell 3: Load Raw Data
# ═══════════════════════════════════════════════════════════════════

def _preview_dataset(title: str, frame) -> None:
    """Print a short preview of one loaded dataset.

    Args:
        title: Heading printed above the preview.
        frame: The loaded DataFrame, or None when loading failed. For None only
            the heading is printed.
    """
    print(f"\n{title}:")
    if frame is None:
        return
    print(f"Shape: {frame.shape}")
    print(f"Columns: {list(frame.columns)}")
    print(frame.head(2))


# NOTE: the emoji/symbol literals in this cell were stored as mis-decoded UTF-8
# (mojibake) and printed as garbage; they are restored to the intended characters.
print("📥 Loading data...")
start_time = time.time()

# Load every CSV listed in DATA_PATHS. Loading is deliberately best-effort: a
# dataset that cannot be read is stored as None so the remaining files still
# load, and its name is recorded so the cell can report the failure instead of
# announcing success.
data = {}
failed_datasets = []
for name, path in DATA_PATHS.items():
    try:
        df = pd.read_csv(path)
    except Exception as e:
        print(f"❌ {name}: ERROR - {e}")
        data[name] = None
        failed_datasets.append(name)
    else:
        data[name] = df
        print(f"✅ {name}: {df.shape[0]:,} rows × {df.shape[1]} cols")

load_time = time.time() - start_time
print(f"\n⏱️ Loaded in {load_time:.2f}s")

# Quick peek at key datasets
print("\n" + "="*70)
print("🔍 KEY DATASETS PREVIEW")
print("="*70)

_preview_dataset("📋 CANDIDATES (resume_data)", data['resume_data'])
_preview_dataset("🏢 COMPANIES", data['companies'])
_preview_dataset("📄 JOB POSTINGS", data['postings'])

# Only claim success when every dataset was actually read. Otherwise list what
# is missing and show the working directory, since DATA_PATHS may hold relative
# paths that depend on where the kernel was started.
if failed_datasets:
    print(
        f"\n⚠️ {len(failed_datasets)} of {len(DATA_PATHS)} datasets failed to load: "
        f"{', '.join(failed_datasets)}"
    )
    print(f"   Current working directory: {Path.cwd()}")
else:
    print("\n✅ Data loaded! Ready to inspect and clean.")

šŸ“„ Loading data...
āŒ benefits: ERROR - [Errno 2] No such file or directory: 'data/benefits.csv'
āŒ companies: ERROR - [Errno 2] No such file or directory: 'data/companies.csv'
āŒ company_industries: ERROR - [Errno 2] No such file or directory: 'data/company_industries.csv'
āŒ company_specialties: ERROR - [Errno 2] No such file or directory: 'data/company_specialties.csv'
āŒ employee_counts: ERROR - [Errno 2] No such file or directory: 'data/employee_counts.csv'
āŒ industries: ERROR - [Errno 2] No such file or directory: 'data/industries.csv'
āŒ job_industries: ERROR - [Errno 2] No such file or directory: 'data/job_industries.csv'
āŒ job_skills: ERROR - [Errno 2] No such file or directory: 'data/job_skills.csv'
āŒ postings: ERROR - [Errno 2] No such file or directory: 'data/postings.csv'
āŒ resume_data: ERROR - [Errno 2] No such file or directory: 'data/resume_data.csv'
āŒ salaries: ERROR - [Errno 2] No such file or directory: 'data/salaries.csv'
āŒ skills: ERROR - [Errno