{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b2dd5b02",
   "metadata": {},
   "outputs": [],
   "source": [
    "# ═══════════════════════════════════════════════════════════════════\n",
    "# 🚀 HRHUB V2.1 - PRODUCTION NOTEBOOK\n",
    "# Cell 1: Setup & Imports\n",
    "# ═══════════════════════════════════════════════════════════════════\n",
    "\n",
    "# NOTE: this silences every warning for the whole kernel session,\n",
    "# including deprecation warnings raised by the libraries imported below.\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "\n",
    "# Core\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from pathlib import Path\n",
    "\n",
    "# Embeddings\n",
    "from sentence_transformers import SentenceTransformer\n",
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "\n",
    "# Viz\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "import plotly.express as px\n",
    "import plotly.graph_objects as go\n",
    "from pyvis.network import Network\n",
    "\n",
    "# Dimensionality reduction\n",
    "from sklearn.manifold import TSNE\n",
    "\n",
    "# Utils\n",
    "from tqdm import tqdm\n",
    "import pickle\n",
    "from typing import List, Dict, Tuple\n",
    "import time\n",
    "\n",
    "# Config: plotting style and pandas display limits used by every later cell\n",
    "plt.style.use('seaborn-v0_8-darkgrid')\n",
    "sns.set_palette(\"husl\")\n",
    "pd.set_option('display.max_columns', None)\n",
    "pd.set_option('display.max_rows', 100)\n",
    "\n",
    "print(\"✅ All imports successful!\")\n",
    "print(f\"📦 Pandas: {pd.__version__}\")\n",
    "print(f\"📦 Numpy: {np.__version__}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b8696a11",
   "metadata": {},
   "outputs": [],
   "source": [
    "# ═══════════════════════════════════════════════════════════════════\n",
    "# Cell 2: Paths & Configuration\n",
    "# ═══════════════════════════════════════════════════════════════════\n",
    "\n",
    "# 🟢 Local VSCode run - direct relative path.\n",
    "# It is resolved against the kernel's working directory, so the notebook\n",
    "# must be started from the folder that contains `data/`.\n",
    "BASE_PATH = Path(\"data\")\n",
    "\n",
    "# Input paths: dataset name -> CSV file under BASE_PATH\n",
    "DATA_PATHS = {\n",
    "    'benefits': BASE_PATH / \"benefits.csv\",\n",
    "    'companies': BASE_PATH / \"companies.csv\",\n",
    "    'company_industries': BASE_PATH / \"company_industries.csv\",\n",
    "    'company_specialties': BASE_PATH / \"company_specialties.csv\",\n",
    "    'employee_counts': BASE_PATH / \"employee_counts.csv\",\n",
    "    'industries': BASE_PATH / \"industries.csv\",\n",
    "    'job_industries': BASE_PATH / \"job_industries.csv\",\n",
    "    'job_skills': BASE_PATH / \"job_skills.csv\",\n",
    "    'postings': BASE_PATH / \"postings.csv\",\n",
    "    'resume_data': BASE_PATH / \"resume_data.csv\",\n",
    "    'salaries': BASE_PATH / \"salaries.csv\",\n",
    "    'skills': BASE_PATH / \"skills.csv\"\n",
    "}\n",
    "\n",
    "# Output files (saved directly as .npy / .pkl)\n",
    "OUTPUT_FILES = {\n",
    "    'candidate_embeddings': 'candidate_embeddings.npy',\n",
    "    'company_embeddings': 'company_embeddings.npy',\n",
    "    'candidate_metadata': 'candidate_metadata.pkl',\n",
    "    'company_metadata': 'company_metadata.pkl'\n",
    "}\n",
    "\n",
    "# Model config\n",
    "MODEL_NAME = \"sentence-transformers/all-MiniLM-L6-v2\"\n",
    "EMBEDDING_DIM = 384\n",
    "\n",
    "print(\"✅ Paths configured!\")\n",
    "print(f\"📂 Base path: {BASE_PATH}\")\n",
    "print(f\"🤖 Model: {MODEL_NAME}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "657220e4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# ═══════════════════════════════════════════════════════════════════\n",
    "# Cell 3: Load Raw Data\n",
    "# ═══════════════════════════════════════════════════════════════════\n",
    "\n",
    "print(\"📥 Loading data...\")\n",
    "start_time = time.time()\n",
    "\n",
    "# Load all CSVs into `data` (dataset name -> DataFrame, or None on failure).\n",
    "# Loading is best-effort: a file that cannot be read is reported and stored\n",
    "# as None so the remaining files are still attempted.\n",
    "data = {}\n",
    "for name, path in DATA_PATHS.items():\n",
    "    try:\n",
    "        table = pd.read_csv(path)\n",
    "    except Exception as e:\n",
    "        print(f\"❌ {name}: ERROR - {e}\")\n",
    "        data[name] = None\n",
    "    else:\n",
    "        data[name] = table\n",
    "        print(f\"✅ {name}: {table.shape[0]:,} rows × {table.shape[1]} cols\")\n",
    "\n",
    "load_time = time.time() - start_time\n",
    "print(f\"\\n⏱️ Loaded in {load_time:.2f}s\")\n",
    "\n",
    "# Quick peek at key datasets\n",
    "print(\"\\n\" + \"=\"*70)\n",
    "print(\"🔍 KEY DATASETS PREVIEW\")\n",
    "print(\"=\"*70)\n",
    "\n",
    "# Dataset name -> heading printed above its preview (dict order = print order)\n",
    "preview_headings = {\n",
    "    'resume_data': \"📋 CANDIDATES (resume_data):\",\n",
    "    'companies': \"🏢 COMPANIES:\",\n",
    "    'postings': \"📄 JOB POSTINGS:\",\n",
    "}\n",
    "for name, heading in preview_headings.items():\n",
    "    print(f\"\\n{heading}\")\n",
    "    table = data[name]\n",
    "    # A dataset that failed to load gets its heading only, no preview.\n",
    "    if table is not None:\n",
    "        print(f\"Shape: {table.shape}\")\n",
    "        print(f\"Columns: {list(table.columns)}\")\n",
    "        print(table.head(2))\n",
    "\n",
    "# Only claim success when every file actually loaded; otherwise say exactly\n",
    "# what is missing and where the files were expected.\n",
    "failed = [name for name, table in data.items() if table is None]\n",
    "if failed:\n",
    "    print(f\"\\n⚠️ {len(failed)}/{len(data)} datasets failed to load: {', '.join(failed)}\")\n",
    "    print(f\"   Expected the CSV files under: {BASE_PATH.resolve()}\")\n",
    "else:\n",
    "    print(\"\\n✅ Data loaded! Ready to inspect and clean.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "52833afd",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}