kristiangnordby committed on
Commit
8a616c1
·
verified ·
1 Parent(s): 68c9e75

Upload CyberLABSE.ipynb

Browse files
Files changed (1) hide show
  1. CyberLABSE.ipynb +600 -0
CyberLABSE.ipynb ADDED
@@ -0,0 +1,600 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "9112d5ff-60e3-41f4-b407-2b7a209354a2",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import os\n",
11
+ "import gzip\n",
12
+ "import json\n",
13
+ "import random\n",
14
+ "import torch\n",
15
+ "import torch.nn as nn\n",
16
+ "import torch.optim as optim\n",
17
+ "from torch.utils.data import DataLoader, TensorDataset\n",
18
+ "from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score\n",
19
+ "import numpy as np"
20
+ ]
21
+ },
22
+ {
23
+ "cell_type": "code",
24
+ "execution_count": 2,
25
+ "id": "76e80b80-604b-4a5a-a3a1-6e8196d7aa10",
26
+ "metadata": {},
27
+ "outputs": [
28
+ {
29
+ "name": "stdout",
30
+ "output_type": "stream",
31
+ "text": [
32
+ "\n",
33
+ "πŸ“ Models will be saved to: /home/knordby/Documents/labeling/models\n"
34
+ ]
35
+ }
36
+ ],
37
+ "source": [
38
+ "random.seed(42)\n",
39
+ "np.random.seed(42)\n",
40
+ "\n",
41
+ "models_dir = \"/home/knordby/Documents/labeling/models\"\n",
42
+ "os.makedirs(models_dir, exist_ok=True)\n",
43
+ "print(f\"\\nπŸ“ Models will be saved to: {models_dir}\")"
44
+ ]
45
+ },
46
+ {
47
+ "cell_type": "markdown",
48
+ "id": "502e273f-b249-4b55-9680-8b68ce8539bd",
49
+ "metadata": {},
50
+ "source": [
51
+ "### Load the data\n",
52
+ "Here we load our embeddings as well as our presaved labels for each article."
53
+ ]
54
+ },
55
+ {
56
+ "cell_type": "code",
57
+ "execution_count": 11,
58
+ "id": "0b963ac1-3ffa-4079-9a0d-fd87f0cb2267",
59
+ "metadata": {},
60
+ "outputs": [
61
+ {
62
+ "name": "stdout",
63
+ "output_type": "stream",
64
+ "text": [
65
+ "\n",
66
+ "[1/4] Loading embeddings...\n",
67
+ " Loading general_sample_200K embeddings...\n",
68
+ " Loaded 199793 embeddings from 200K dataset\n",
69
+ " Loading cyber_biased_sample_70K embeddings...\n",
70
+ " Loaded 62605 embeddings from 70K dataset\n",
71
+ " Total embeddings after merge: 262398\n"
72
+ ]
73
+ }
74
+ ],
75
+ "source": [
76
+ "print(\"\\n[1/4] Loading embeddings...\")\n",
77
+ "\n",
78
+ "# Load 200K general embeddings\n",
79
+ "print(\" Loading general_sample_200K embeddings...\")\n",
80
+ "with gzip.open('general_sample_200K_embedding_labse.jsonl.gz', 'rt') as f:\n",
81
+ " _200k_embeddings = json.load(f)\n",
82
+ "_200k_embeddings = {k.replace('.json', ''): v for k, v in _200k_embeddings.items()}\n",
83
+ "print(f\" Loaded {len(_200k_embeddings)} embeddings from 200K dataset\")\n",
84
+ "\n",
85
+ "# Load 70K cyber-biased embeddings\n",
86
+ "print(\" Loading cyber_biased_sample_70K embeddings...\")\n",
87
+ "with gzip.open('cyber_biased_sample_70K_labse_embedding.jsonl.gz', 'rt') as f:\n",
88
+ " _70k_embeddings = json.load(f)\n",
89
+ "_70k_embeddings = {k.replace('.json', ''): v for k, v in _70k_embeddings.items()}\n",
90
+ "print(f\" Loaded {len(_70k_embeddings)} embeddings from 70K dataset\")\n",
91
+ "\n",
92
+ "# Merge embeddings\n",
93
+ "labse_embeddings = _70k_embeddings | _200k_embeddings\n",
94
+ "print(f\" Total embeddings after merge: {len(labse_embeddings)}\")"
95
+ ]
96
+ },
97
+ {
98
+ "cell_type": "code",
99
+ "execution_count": 16,
100
+ "id": "90245f20-97e0-42ba-9cbc-04c78f7bcc01",
101
+ "metadata": {},
102
+ "outputs": [
103
+ {
104
+ "name": "stdout",
105
+ "output_type": "stream",
106
+ "text": [
107
+ "CPU times: user 5.69 s, sys: 519 ms, total: 6.21 s\n",
108
+ "Wall time: 6.21 s\n"
109
+ ]
110
+ }
111
+ ],
112
+ "source": [
113
+ "%%time\n",
114
+ "data = np.load('cyber_gemma_embeddings_with_ids.npz')\n",
115
+ "gemma_embeddings = data['embeddings'] # Shape: (N, embedding_dim)\n",
116
+ "ids = data['ids'] # Shape: (N,)\n",
117
+ "labels = data['labels'] "
118
+ ]
119
+ },
120
+ {
121
+ "cell_type": "code",
122
+ "execution_count": 22,
123
+ "id": "9d248341-e1c9-4418-8035-1ed4215e9b65",
124
+ "metadata": {
125
+ "scrolled": true
126
+ },
127
+ "outputs": [],
128
+ "source": [
129
+ "embeddings = [labse_embeddings[idx] for idx in ids]\n",
130
+ "embeddings = np.array(embeddings)"
131
+ ]
132
+ },
133
+ {
134
+ "cell_type": "code",
135
+ "execution_count": 23,
136
+ "id": "9c881f9e-7d07-45ad-9edb-473829e36791",
137
+ "metadata": {},
138
+ "outputs": [
139
+ {
140
+ "data": {
141
+ "text/plain": [
142
+ "(207990, 207990, 207990)"
143
+ ]
144
+ },
145
+ "execution_count": 23,
146
+ "metadata": {},
147
+ "output_type": "execute_result"
148
+ }
149
+ ],
150
+ "source": [
151
+ "len(embeddings), len(ids), len(labels)"
152
+ ]
153
+ },
154
+ {
155
+ "cell_type": "markdown",
156
+ "id": "f61ed063-23c2-4919-8a3b-1a296f067290",
157
+ "metadata": {},
158
+ "source": [
159
+ "### Prepare Data"
160
+ ]
161
+ },
162
+ {
163
+ "cell_type": "code",
164
+ "execution_count": 24,
165
+ "id": "85b8a065-adc1-4acd-ab7a-9976172f4512",
166
+ "metadata": {},
167
+ "outputs": [
168
+ {
169
+ "name": "stdout",
170
+ "output_type": "stream",
171
+ "text": [
172
+ "\n",
173
+ "[3/4] Preparing train/test split...\n",
174
+ "x_train: 0.8\n",
175
+ "test size: 0.2\n"
176
+ ]
177
+ }
178
+ ],
179
+ "source": [
180
+ "from sklearn.model_selection import train_test_split\n",
181
+ "print(\"\\n[3/4] Preparing train/test split...\")\n",
182
+ "\n",
183
+ "x_train,x_test, y_train,y_test = train_test_split(embeddings, labels, train_size = 0.8, stratify = labels)\n",
184
+ "print(\"x_train: \", len(x_train)/(len(x_train)+len(x_test)))\n",
185
+ "print(\"test size: \", len(x_test)/(len(x_train)+len(x_test)))"
186
+ ]
187
+ },
188
+ {
189
+ "cell_type": "markdown",
190
+ "id": "030e8ac8-e22c-4144-a79f-f74d461d88ed",
191
+ "metadata": {},
192
+ "source": [
193
+ "#### Dataset Stats"
194
+ ]
195
+ },
196
+ {
197
+ "cell_type": "code",
198
+ "execution_count": 25,
199
+ "id": "7888c8cd-df43-4378-8599-56c031dcb9c4",
200
+ "metadata": {},
201
+ "outputs": [
202
+ {
203
+ "name": "stdout",
204
+ "output_type": "stream",
205
+ "text": [
206
+ "\n",
207
+ "πŸ“Š Dataset Statistics:\n",
208
+ " Training set shape: (166392, 768)\n",
209
+ " Test set shape: (41598, 768)\n",
210
+ " Embedding dimension: 768\n",
211
+ "\n",
212
+ " Label Distribution:\n",
213
+ " β€’ Training - Cyber: 29698 (17.8%)\n",
214
+ " β€’ Training - Non-cyber: 136694 (82.2%)\n",
215
+ " β€’ Test - Cyber: 7424 (17.8%)\n",
216
+ " β€’ Test - Non-cyber: 34174 (82.2%)\n"
217
+ ]
218
+ }
219
+ ],
220
+ "source": [
221
+ "print(f\"\\nπŸ“Š Dataset Statistics:\")\n",
222
+ "print(f\" Training set shape: {x_train.shape}\")\n",
223
+ "print(f\" Test set shape: {x_test.shape}\")\n",
224
+ "print(f\" Embedding dimension: {x_train.shape[1]}\")\n",
225
+ "print(f\"\\n Label Distribution:\")\n",
226
+ "print(f\" β€’ Training - Cyber: {sum(y_train)} ({sum(y_train)/len(y_train)*100:.1f}%)\")\n",
227
+ "print(f\" β€’ Training - Non-cyber: {len(y_train)-sum(y_train)} ({(len(y_train)-sum(y_train))/len(y_train)*100:.1f}%)\")\n",
228
+ "print(f\" β€’ Test - Cyber: {sum(y_test)} ({sum(y_test)/len(y_test)*100:.1f}%)\")\n",
229
+ "print(f\" β€’ Test - Non-cyber: {len(y_test)-sum(y_test)} ({(len(y_test)-sum(y_test))/len(y_test)*100:.1f}%)\")"
230
+ ]
231
+ },
232
+ {
233
+ "cell_type": "markdown",
234
+ "id": "a6a6ba0a-274b-4de3-af75-66332a9ad399",
235
+ "metadata": {},
236
+ "source": [
237
+ "### Build the Model"
238
+ ]
239
+ },
240
+ {
241
+ "cell_type": "code",
242
+ "execution_count": 26,
243
+ "id": "7020d7af-30dd-4f35-8028-a3eccfd9fa71",
244
+ "metadata": {},
245
+ "outputs": [
246
+ {
247
+ "name": "stdout",
248
+ "output_type": "stream",
249
+ "text": [
250
+ "Using device: cuda\n",
251
+ "======================================================================\n",
252
+ "MODEL BUILT\n",
253
+ "======================================================================\n",
254
+ "Architecture: CyberClassifier\n",
255
+ "Input dimension: 768\n",
256
+ "Hidden layers: 512 -> 256 -> 128\n",
257
+ "Output: 1 (binary classification)\n",
258
+ "Total parameters: 561,409\n",
259
+ "Trainable parameters: 561,409\n",
260
+ "Device: cuda\n",
261
+ "======================================================================\n",
262
+ "\n"
263
+ ]
264
+ }
265
+ ],
266
+ "source": [
267
+ "from torch_models import *\n",
268
+ "\n",
269
+ "# Check GPU\n",
270
+ "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
271
+ "print(f\"Using device: {device}\")\n",
272
+ "\n",
273
+ "# Build model\n",
274
+ "model, optimizer, criterion = build_model(\n",
275
+ " input_dim=x_train.shape[1], # Auto-detect from your data\n",
276
+ " device=device\n",
277
+ ")"
278
+ ]
279
+ },
280
+ {
281
+ "cell_type": "code",
282
+ "execution_count": 27,
283
+ "id": "5ddf9bdd-4c58-4be8-a07c-dfdbabb9ff84",
284
+ "metadata": {},
285
+ "outputs": [
286
+ {
287
+ "name": "stdout",
288
+ "output_type": "stream",
289
+ "text": [
290
+ "======================================================================\n",
291
+ "TRAINING\n",
292
+ "======================================================================\n",
293
+ "Epochs: 80\n",
294
+ "Batch size: 512\n",
295
+ "Training samples: 141433\n",
296
+ "Validation samples: 24959\n",
297
+ "Early stopping patience: 15\n",
298
+ "======================================================================\n",
299
+ "\n",
300
+ "Epoch 1/80 - Time: 2.98s\n",
301
+ " Train - Loss: 0.1856, Acc: 0.9277, AUC: 0.9563\n",
302
+ " Val - Loss: 0.1602, Acc: 0.9367, AUC: 0.9684, Precision: 0.8594, Recall: 0.7713\n",
303
+ " βœ“ Best model saved (AUC: 0.9684)\n",
304
+ "\n",
305
+ "Epoch 2/80 - Time: 6.42s\n",
306
+ " Train - Loss: 0.1465, Acc: 0.9416, AUC: 0.9742\n",
307
+ " Val - Loss: 0.1597, Acc: 0.9363, AUC: 0.9684, Precision: 0.8559, Recall: 0.7735\n",
308
+ " No improvement (patience: 1/15)\n",
309
+ "\n",
310
+ "Epoch 3/80 - Time: 2.61s\n",
311
+ " Train - Loss: 0.1283, Acc: 0.9485, AUC: 0.9809\n",
312
+ " Val - Loss: 0.1628, Acc: 0.9356, AUC: 0.9675, Precision: 0.8600, Recall: 0.7636\n",
313
+ " No improvement (patience: 2/15)\n",
314
+ "\n",
315
+ "Epoch 4/80 - Time: 2.60s\n",
316
+ " Train - Loss: 0.1106, Acc: 0.9560, AUC: 0.9861\n",
317
+ " Val - Loss: 0.1670, Acc: 0.9350, AUC: 0.9673, Precision: 0.8419, Recall: 0.7827\n",
318
+ " No improvement (patience: 3/15)\n",
319
+ "\n",
320
+ "Epoch 5/80 - Time: 2.58s\n",
321
+ " Train - Loss: 0.0928, Acc: 0.9635, AUC: 0.9905\n",
322
+ " Val - Loss: 0.1775, Acc: 0.9353, AUC: 0.9666, Precision: 0.8465, Recall: 0.7785\n",
323
+ " No improvement (patience: 4/15)\n",
324
+ "\n",
325
+ "Epoch 6/80 - Time: 5.84s\n",
326
+ " Train - Loss: 0.0751, Acc: 0.9705, AUC: 0.9940\n",
327
+ " Val - Loss: 0.1964, Acc: 0.9311, AUC: 0.9616, Precision: 0.8285, Recall: 0.7744\n",
328
+ " No improvement (patience: 5/15)\n",
329
+ "\n",
330
+ "Epoch 7/80 - Time: 2.63s\n",
331
+ " Train - Loss: 0.0588, Acc: 0.9778, AUC: 0.9964\n",
332
+ " Val - Loss: 0.2131, Acc: 0.9334, AUC: 0.9642, Precision: 0.8321, Recall: 0.7854\n",
333
+ " No improvement (patience: 6/15)\n",
334
+ "\n",
335
+ "Epoch 8/80 - Time: 2.61s\n",
336
+ " Train - Loss: 0.0305, Acc: 0.9903, AUC: 0.9993\n",
337
+ " Val - Loss: 0.2407, Acc: 0.9333, AUC: 0.9630, Precision: 0.8456, Recall: 0.7661\n",
338
+ " No improvement (patience: 7/15)\n",
339
+ "\n",
340
+ "Epoch 9/80 - Time: 5.48s\n",
341
+ " Train - Loss: 0.0157, Acc: 0.9963, AUC: 0.9998\n",
342
+ " Val - Loss: 0.2672, Acc: 0.9336, AUC: 0.9619, Precision: 0.8380, Recall: 0.7782\n",
343
+ " No improvement (patience: 8/15)\n",
344
+ "\n",
345
+ "Epoch 10/80 - Time: 2.65s\n",
346
+ " Train - Loss: 0.0112, Acc: 0.9976, AUC: 0.9999\n",
347
+ " Val - Loss: 0.2903, Acc: 0.9336, AUC: 0.9607, Precision: 0.8379, Recall: 0.7785\n",
348
+ " No improvement (patience: 9/15)\n",
349
+ "\n",
350
+ "Epoch 11/80 - Time: 2.66s\n",
351
+ " Train - Loss: 0.0110, Acc: 0.9976, AUC: 0.9998\n",
352
+ " Val - Loss: 0.3117, Acc: 0.9331, AUC: 0.9601, Precision: 0.8398, Recall: 0.7722\n",
353
+ " No improvement (patience: 10/15)\n",
354
+ "\n",
355
+ "Epoch 12/80 - Time: 5.76s\n",
356
+ " Train - Loss: 0.0093, Acc: 0.9979, AUC: 0.9999\n",
357
+ " Val - Loss: 0.3281, Acc: 0.9320, AUC: 0.9602, Precision: 0.8372, Recall: 0.7688\n",
358
+ " No improvement (patience: 11/15)\n",
359
+ "\n",
360
+ "Epoch 13/80 - Time: 2.61s\n",
361
+ " Train - Loss: 0.0089, Acc: 0.9979, AUC: 0.9999\n",
362
+ " Val - Loss: 0.3364, Acc: 0.9310, AUC: 0.9587, Precision: 0.8312, Recall: 0.7695\n",
363
+ " No improvement (patience: 12/15)\n",
364
+ "\n",
365
+ "Epoch 14/80 - Time: 2.61s\n",
366
+ " Train - Loss: 0.0047, Acc: 0.9991, AUC: 1.0000\n",
367
+ " Val - Loss: 0.3367, Acc: 0.9324, AUC: 0.9605, Precision: 0.8286, Recall: 0.7834\n",
368
+ " No improvement (patience: 13/15)\n",
369
+ "\n",
370
+ "Epoch 15/80 - Time: 5.86s\n",
371
+ " Train - Loss: 0.0028, Acc: 0.9995, AUC: 1.0000\n",
372
+ " Val - Loss: 0.3522, Acc: 0.9332, AUC: 0.9601, Precision: 0.8400, Recall: 0.7731\n",
373
+ " No improvement (patience: 14/15)\n",
374
+ "\n",
375
+ "Epoch 16/80 - Time: 2.61s\n",
376
+ " Train - Loss: 0.0025, Acc: 0.9995, AUC: 1.0000\n",
377
+ " Val - Loss: 0.3577, Acc: 0.9325, AUC: 0.9589, Precision: 0.8336, Recall: 0.7771\n",
378
+ " No improvement (patience: 15/15)\n",
379
+ "\n",
380
+ "⚠️ Early stopping triggered after 16 epochs\n",
381
+ "\n",
382
+ "======================================================================\n",
383
+ "Loading best model...\n",
384
+ "βœ… Best model loaded (AUC: 0.9684)\n",
385
+ "πŸ’Ύ Model saved to: /home/knordby/Documents/labeling/models/cyber_labseEmbeddings.pt\n",
386
+ "⏱️ Total training time: 59.08s (0.98m)\n",
387
+ "======================================================================\n",
388
+ "\n"
389
+ ]
390
+ }
391
+ ],
392
+ "source": [
393
+ "# Set save path\n",
394
+ "model_path = '/home/knordby/Documents/labeling/models/cyber_labseEmbeddings.pt'\n",
395
+ "\n",
396
+ "# Train\n",
397
+ "model, history = train_model(\n",
398
+ " model, optimizer, criterion,\n",
399
+ " x_train, y_train, x_test, y_test,\n",
400
+ " device=device,\n",
401
+ " epochs=80,\n",
402
+ " batch_size=512,\n",
403
+ " model_path=model_path\n",
404
+ ")"
405
+ ]
406
+ },
407
+ {
408
+ "cell_type": "markdown",
409
+ "id": "736256f5-1fa1-4b37-b4da-5f38e6a9e9d6",
410
+ "metadata": {},
411
+ "source": [
412
+ "### Evaluate the Model's Performance Against the Test Set"
413
+ ]
414
+ },
415
+ {
416
+ "cell_type": "code",
417
+ "execution_count": 29,
418
+ "id": "a1d5e970-c4b7-4218-bf64-c23414e4bc96",
419
+ "metadata": {},
420
+ "outputs": [
421
+ {
422
+ "name": "stdout",
423
+ "output_type": "stream",
424
+ "text": [
425
+ "======================================================================\n",
426
+ "πŸ“ˆ CYBERSECURITY CLASSIFIER - FINAL TEST RESULTS\n",
427
+ "======================================================================\n",
428
+ " Loss: 0.1673\n",
429
+ " Accuracy: 0.9344 (93.44%)\n",
430
+ " Precision: 0.8494\n",
431
+ " Recall: 0.7689\n",
432
+ " AUC: 0.9650\n",
433
+ " F1 Score: 0.8071\n",
434
+ "\n",
435
+ "Confusion Matrix:\n",
436
+ " Predicted\n",
437
+ " Negative Positive\n",
438
+ "Actual Negative 33162 1012\n",
439
+ " Positive 1716 5708\n",
440
+ "\n",
441
+ "Detailed Metrics:\n",
442
+ " True Positives: 5708\n",
443
+ " True Negatives: 33162\n",
444
+ " False Positives: 1012\n",
445
+ " False Negatives: 1716\n",
446
+ " Specificity: 0.9704\n",
447
+ " NPV: 0.9508\n",
448
+ "\n",
449
+ "Classification Report:\n",
450
+ " precision recall f1-score support\n",
451
+ "\n",
452
+ " Non-Cyber 0.9508 0.9704 0.9605 34174\n",
453
+ " Cyber 0.8494 0.7689 0.8071 7424\n",
454
+ "\n",
455
+ " accuracy 0.9344 41598\n",
456
+ " macro avg 0.9001 0.8696 0.8838 41598\n",
457
+ "weighted avg 0.9327 0.9344 0.9331 41598\n",
458
+ "\n",
459
+ "======================================================================\n",
460
+ "\n",
461
+ "Test AUC: 0.9650\n"
462
+ ]
463
+ }
464
+ ],
465
+ "source": [
466
+ "# Evaluate with detailed metrics\n",
467
+ "y_pred_probs, metrics = evaluate_model(\n",
468
+ " model, x_test, y_test,\n",
469
+ " device=device\n",
470
+ ")\n",
471
+ "\n",
472
+ "# Access individual metrics if needed\n",
473
+ "print(f\"Test AUC: {metrics['auc']:.4f}\")"
474
+ ]
475
+ },
476
+ {
477
+ "cell_type": "markdown",
478
+ "id": "af1ddbc6-372a-4d2c-9f04-e4f3987165db",
479
+ "metadata": {},
480
+ "source": [
481
+ "### Push the Model"
482
+ ]
483
+ },
484
+ {
485
+ "cell_type": "code",
486
+ "execution_count": 30,
487
+ "id": "7ef72e71-49af-4d6b-9b44-3f1dcb03bcf9",
488
+ "metadata": {},
489
+ "outputs": [
490
+ {
491
+ "name": "stdout",
492
+ "output_type": "stream",
493
+ "text": [
494
+ "\n",
495
+ "======================================================================\n",
496
+ "PUSHING MODEL TO HUGGINGFACE\n",
497
+ "======================================================================\n",
498
+ "Repository: kristiangnordby/cyberLabse\n",
499
+ "Private: False\n",
500
+ "======================================================================\n",
501
+ "\n",
502
+ "βœ… Repository created/verified: kristiangnordby/cyberLabse\n",
503
+ "\n",
504
+ "πŸ“ Creating model card...\n",
505
+ "βš™οΈ Saving configuration...\n",
506
+ "πŸ—οΈ Saving model architecture...\n",
507
+ "πŸ’Ύ Preparing model checkpoint...\n",
508
+ "\n",
509
+ "πŸ“€ Uploading files to HuggingFace...\n",
510
+ " βœ“ Uploaded: README.md\n",
511
+ " βœ“ Uploaded: config.json\n",
512
+ " βœ“ Uploaded: model_architecture.py\n"
513
+ ]
514
+ },
515
+ {
516
+ "data": {
517
+ "application/vnd.jupyter.widget-view+json": {
518
+ "model_id": "268511164a864b7ca53f82ebf30f1599",
519
+ "version_major": 2,
520
+ "version_minor": 0
521
+ },
522
+ "text/plain": [
523
+ "Processing Files (0 / 0): | | 0.00B / 0.00B "
524
+ ]
525
+ },
526
+ "metadata": {},
527
+ "output_type": "display_data"
528
+ },
529
+ {
530
+ "data": {
531
+ "application/vnd.jupyter.widget-view+json": {
532
+ "model_id": "317b3bbba97846fb9547da11b76b5693",
533
+ "version_major": 2,
534
+ "version_minor": 0
535
+ },
536
+ "text/plain": [
537
+ "New Data Upload: | | 0.00B / 0.00B "
538
+ ]
539
+ },
540
+ "metadata": {},
541
+ "output_type": "display_data"
542
+ },
543
+ {
544
+ "name": "stdout",
545
+ "output_type": "stream",
546
+ "text": [
547
+ " βœ“ Uploaded: model.pt\n",
548
+ "\n",
549
+ "======================================================================\n",
550
+ "βœ… MODEL SUCCESSFULLY PUSHED TO HUGGINGFACE!\n",
551
+ "======================================================================\n",
552
+ "πŸ”— View your model at: https://huggingface.co/kristiangnordby/cyberLabse\n",
553
+ "======================================================================\n",
554
+ "\n",
555
+ "Model available at: https://huggingface.co/kristiangnordby/cyberLabse\n"
556
+ ]
557
+ }
558
+ ],
559
+ "source": [
560
+ "from push_to_huggingface import push_to_huggingface\n",
561
+ "\n",
562
+ "with open(\"hf_token.txt\",'r') as f:\n",
563
+ " token = f.read()\n",
564
+ "\n",
565
+ "# Push your model (after training and evaluation)\n",
566
+ "repo_url = push_to_huggingface(\n",
567
+ " model_path='/home/knordby/Documents/labeling/models/cyber_labseEmbeddings.pt',\n",
568
+ " repo_name='cyberLabse', # Choose your repo name\n",
569
+ " metrics=metrics, # From evaluate_model()\n",
570
+ " input_dim=x_train.shape[1], # Your embedding dimension\n",
571
+ " hf_token=token, # Your token\n",
572
+ " private=False # Set True if you want private repo\n",
573
+ ")\n",
574
+ "\n",
575
+ "print(f\"Model available at: {repo_url}\")"
576
+ ]
577
+ }
578
+ ],
579
+ "metadata": {
580
+ "kernelspec": {
581
+ "display_name": "vanilla",
582
+ "language": "python",
583
+ "name": "vanilla"
584
+ },
585
+ "language_info": {
586
+ "codemirror_mode": {
587
+ "name": "ipython",
588
+ "version": 3
589
+ },
590
+ "file_extension": ".py",
591
+ "mimetype": "text/x-python",
592
+ "name": "python",
593
+ "nbconvert_exporter": "python",
594
+ "pygments_lexer": "ipython3",
595
+ "version": "3.10.19"
596
+ }
597
+ },
598
+ "nbformat": 4,
599
+ "nbformat_minor": 5
600
+ }