kristiangnordby committed on
Commit
8a616c1
·
verified ·
1 Parent(s): 68c9e75

Upload CyberLABSE.ipynb

Browse files
Files changed (1) hide show
  1. CyberLABSE.ipynb +600 -0
CyberLABSE.ipynb ADDED
@@ -0,0 +1,600 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "9112d5ff-60e3-41f4-b407-2b7a209354a2",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import os\n",
11
+ "import gzip\n",
12
+ "import json\n",
13
+ "import random\n",
14
+ "import torch\n",
15
+ "import torch.nn as nn\n",
16
+ "import torch.optim as optim\n",
17
+ "from torch.utils.data import DataLoader, TensorDataset\n",
18
+ "from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score\n",
19
+ "import numpy as np"
20
+ ]
21
+ },
22
+ {
23
+ "cell_type": "code",
24
+ "execution_count": 2,
25
+ "id": "76e80b80-604b-4a5a-a3a1-6e8196d7aa10",
26
+ "metadata": {},
27
+ "outputs": [
28
+ {
29
+ "name": "stdout",
30
+ "output_type": "stream",
31
+ "text": [
32
+ "\n",
33
+ "πŸ“ Models will be saved to: /home/knordby/Documents/labeling/models\n"
34
+ ]
35
+ }
36
+ ],
37
+ "source": [
38
+ "random.seed(42)\n",
39
+ "np.random.seed(42)\n",
40
+ "\n",
41
+ "models_dir = \"/home/knordby/Documents/labeling/models\"\n",
42
+ "os.makedirs(models_dir, exist_ok=True)\n",
43
+ "print(f\"\\nπŸ“ Models will be saved to: {models_dir}\")"
44
+ ]
45
+ },
46
+ {
47
+ "cell_type": "markdown",
48
+ "id": "502e273f-b249-4b55-9680-8b68ce8539bd",
49
+ "metadata": {},
50
+ "source": [
51
+ "### Load the data\n",
52
+ "Here we load our embeddings as well as our presaved labels for each article."
53
+ ]
54
+ },
55
+ {
56
+ "cell_type": "code",
57
+ "execution_count": 11,
58
+ "id": "0b963ac1-3ffa-4079-9a0d-fd87f0cb2267",
59
+ "metadata": {},
60
+ "outputs": [
61
+ {
62
+ "name": "stdout",
63
+ "output_type": "stream",
64
+ "text": [
65
+ "\n",
66
+ "[1/4] Loading embeddings...\n",
67
+ " Loading general_sample_200K embeddings...\n",
68
+ " Loaded 199793 embeddings from 200K dataset\n",
69
+ " Loading cyber_biased_sample_70K embeddings...\n",
70
+ " Loaded 62605 embeddings from 70K dataset\n",
71
+ " Total embeddings after merge: 262398\n"
72
+ ]
73
+ }
74
+ ],
75
+ "source": [
76
+ "print(\"\\n[1/4] Loading embeddings...\")\n",
77
+ "\n",
78
+ "# Load 200K general embeddings\n",
79
+ "print(\" Loading general_sample_200K embeddings...\")\n",
80
+ "with gzip.open('general_sample_200K_embedding_labse.jsonl.gz', 'rt') as f:\n",
81
+ " _200k_embeddings = json.load(f)\n",
82
+ "_200k_embeddings = {k.replace('.json', ''): v for k, v in _200k_embeddings.items()}\n",
83
+ "print(f\" Loaded {len(_200k_embeddings)} embeddings from 200K dataset\")\n",
84
+ "\n",
85
+ "# Load 70K cyber-biased embeddings\n",
86
+ "print(\" Loading cyber_biased_sample_70K embeddings...\")\n",
87
+ "with gzip.open('cyber_biased_sample_70K_labse_embedding.jsonl.gz', 'rt') as f:\n",
88
+ " _70k_embeddings = json.load(f)\n",
89
+ "_70k_embeddings = {k.replace('.json', ''): v for k, v in _70k_embeddings.items()}\n",
90
+ "print(f\" Loaded {len(_70k_embeddings)} embeddings from 70K dataset\")\n",
91
+ "\n",
92
+ "# Merge embeddings\n",
93
+ "labse_embeddings = _70k_embeddings | _200k_embeddings\n",
94
+ "print(f\" Total embeddings after merge: {len(labse_embeddings)}\")"
95
+ ]
96
+ },
97
+ {
98
+ "cell_type": "code",
99
+ "execution_count": 16,
100
+ "id": "90245f20-97e0-42ba-9cbc-04c78f7bcc01",
101
+ "metadata": {},
102
+ "outputs": [
103
+ {
104
+ "name": "stdout",
105
+ "output_type": "stream",
106
+ "text": [
107
+ "CPU times: user 5.69 s, sys: 519 ms, total: 6.21 s\n",
108
+ "Wall time: 6.21 s\n"
109
+ ]
110
+ }
111
+ ],
112
+ "source": [
113
+ "%%time\n",
114
+ "data = np.load('cyber_gemma_embeddings_with_ids.npz')\n",
115
+ "gemma_embeddings = data['embeddings'] # Shape: (N, embedding_dim)\n",
116
+ "ids = data['ids'] # Shape: (N,)\n",
117
+ "labels = data['labels'] "
118
+ ]
119
+ },
120
+ {
121
+ "cell_type": "code",
122
+ "execution_count": 22,
123
+ "id": "9d248341-e1c9-4418-8035-1ed4215e9b65",
124
+ "metadata": {
125
+ "scrolled": true
126
+ },
127
+ "outputs": [],
128
+ "source": [
129
+ "embeddings = [labse_embeddings[idx] for idx in ids]\n",
130
+ "embeddings = np.array(embeddings)"
131
+ ]
132
+ },
133
+ {
134
+ "cell_type": "code",
135
+ "execution_count": 23,
136
+ "id": "9c881f9e-7d07-45ad-9edb-473829e36791",
137
+ "metadata": {},
138
+ "outputs": [
139
+ {
140
+ "data": {
141
+ "text/plain": [
142
+ "(207990, 207990, 207990)"
143
+ ]
144
+ },
145
+ "execution_count": 23,
146
+ "metadata": {},
147
+ "output_type": "execute_result"
148
+ }
149
+ ],
150
+ "source": [
151
+ "len(embeddings), len(ids), len(labels)"
152
+ ]
153
+ },
154
+ {
155
+ "cell_type": "markdown",
156
+ "id": "f61ed063-23c2-4919-8a3b-1a296f067290",
157
+ "metadata": {},
158
+ "source": [
159
+ "### Prepare Data"
160
+ ]
161
+ },
162
+ {
163
+ "cell_type": "code",
164
+ "execution_count": 24,
165
+ "id": "85b8a065-adc1-4acd-ab7a-9976172f4512",
166
+ "metadata": {},
167
+ "outputs": [
168
+ {
169
+ "name": "stdout",
170
+ "output_type": "stream",
171
+ "text": [
172
+ "\n",
173
+ "[3/4] Preparing train/test split...\n",
174
+ "x_train: 0.8\n",
175
+ "test size: 0.2\n"
176
+ ]
177
+ }
178
+ ],
179
+ "source": [
180
+ "from sklearn.model_selection import train_test_split\n",
181
+ "print(\"\\n[3/4] Preparing train/test split...\")\n",
182
+ "\n",
183
+ "x_train,x_test, y_train,y_test = train_test_split(embeddings, labels, train_size = 0.8, stratify = labels)\n",
184
+ "print(\"x_train: \", len(x_train)/(len(x_train)+len(x_test)))\n",
185
+ "print(\"test size: \", len(x_test)/(len(x_train)+len(x_test)))"
186
+ ]
187
+ },
188
+ {
189
+ "cell_type": "markdown",
190
+ "id": "030e8ac8-e22c-4144-a79f-f74d461d88ed",
191
+ "metadata": {},
192
+ "source": [
193
+ "#### Dataset Stats"
194
+ ]
195
+ },
196
+ {
197
+ "cell_type": "code",
198
+ "execution_count": 25,
199
+ "id": "7888c8cd-df43-4378-8599-56c031dcb9c4",
200
+ "metadata": {},
201
+ "outputs": [
202
+ {
203
+ "name": "stdout",
204
+ "output_type": "stream",
205
+ "text": [
206
+ "\n",
207
+ "πŸ“Š Dataset Statistics:\n",
208
+ " Training set shape: (166392, 768)\n",
209
+ " Test set shape: (41598, 768)\n",
210
+ " Embedding dimension: 768\n",
211
+ "\n",
212
+ " Label Distribution:\n",
213
+ " β€’ Training - Cyber: 29698 (17.8%)\n",
214
+ " β€’ Training - Non-cyber: 136694 (82.2%)\n",
215
+ " β€’ Test - Cyber: 7424 (17.8%)\n",
216
+ " β€’ Test - Non-cyber: 34174 (82.2%)\n"
217
+ ]
218
+ }
219
+ ],
220
+ "source": [
221
+ "print(f\"\\nπŸ“Š Dataset Statistics:\")\n",
222
+ "print(f\" Training set shape: {x_train.shape}\")\n",
223
+ "print(f\" Test set shape: {x_test.shape}\")\n",
224
+ "print(f\" Embedding dimension: {x_train.shape[1]}\")\n",
225
+ "print(f\"\\n Label Distribution:\")\n",
226
+ "print(f\" β€’ Training - Cyber: {sum(y_train)} ({sum(y_train)/len(y_train)*100:.1f}%)\")\n",
227
+ "print(f\" β€’ Training - Non-cyber: {len(y_train)-sum(y_train)} ({(len(y_train)-sum(y_train))/len(y_train)*100:.1f}%)\")\n",
228
+ "print(f\" β€’ Test - Cyber: {sum(y_test)} ({sum(y_test)/len(y_test)*100:.1f}%)\")\n",
229
+ "print(f\" β€’ Test - Non-cyber: {len(y_test)-sum(y_test)} ({(len(y_test)-sum(y_test))/len(y_test)*100:.1f}%)\")"
230
+ ]
231
+ },
232
+ {
233
+ "cell_type": "markdown",
234
+ "id": "a6a6ba0a-274b-4de3-af75-66332a9ad399",
235
+ "metadata": {},
236
+ "source": [
237
+ "### Build the Model"
238
+ ]
239
+ },
240
+ {
241
+ "cell_type": "code",
242
+ "execution_count": 26,
243
+ "id": "7020d7af-30dd-4f35-8028-a3eccfd9fa71",
244
+ "metadata": {},
245
+ "outputs": [
246
+ {
247
+ "name": "stdout",
248
+ "output_type": "stream",
249
+ "text": [
250
+ "Using device: cuda\n",
251
+ "======================================================================\n",
252
+ "MODEL BUILT\n",
253
+ "======================================================================\n",
254
+ "Architecture: CyberClassifier\n",
255
+ "Input dimension: 768\n",
256
+ "Hidden layers: 512 -> 256 -> 128\n",
257
+ "Output: 1 (binary classification)\n",
258
+ "Total parameters: 561,409\n",
259
+ "Trainable parameters: 561,409\n",
260
+ "Device: cuda\n",
261
+ "======================================================================\n",
262
+ "\n"
263
+ ]
264
+ }
265
+ ],
266
+ "source": [
267
+ "from torch_models import *\n",
268
+ "\n",
269
+ "# Check GPU\n",
270
+ "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
271
+ "print(f\"Using device: {device}\")\n",
272
+ "\n",
273
+ "# Build model\n",
274
+ "model, optimizer, criterion = build_model(\n",
275
+ " input_dim=x_train.shape[1], # Auto-detect from your data\n",
276
+ " device=device\n",
277
+ ")"
278
+ ]
279
+ },
280
+ {
281
+ "cell_type": "code",
282
+ "execution_count": 27,
283
+ "id": "5ddf9bdd-4c58-4be8-a07c-dfdbabb9ff84",
284
+ "metadata": {},
285
+ "outputs": [
286
+ {
287
+ "name": "stdout",
288
+ "output_type": "stream",
289
+ "text": [
290
+ "======================================================================\n",
291
+ "TRAINING\n",
292
+ "======================================================================\n",
293
+ "Epochs: 80\n",
294
+ "Batch size: 512\n",
295
+ "Training samples: 141433\n",
296
+ "Validation samples: 24959\n",
297
+ "Early stopping patience: 15\n",
298
+ "======================================================================\n",
299
+ "\n",
300
+ "Epoch 1/80 - Time: 2.98s\n",
301
+ " Train - Loss: 0.1856, Acc: 0.9277, AUC: 0.9563\n",
302
+ " Val - Loss: 0.1602, Acc: 0.9367, AUC: 0.9684, Precision: 0.8594, Recall: 0.7713\n",
303
+ " βœ“ Best model saved (AUC: 0.9684)\n",
304
+ "\n",
305
+ "Epoch 2/80 - Time: 6.42s\n",
306
+ " Train - Loss: 0.1465, Acc: 0.9416, AUC: 0.9742\n",
307
+ " Val - Loss: 0.1597, Acc: 0.9363, AUC: 0.9684, Precision: 0.8559, Recall: 0.7735\n",
308
+ " No improvement (patience: 1/15)\n",
309
+ "\n",
310
+ "Epoch 3/80 - Time: 2.61s\n",
311
+ " Train - Loss: 0.1283, Acc: 0.9485, AUC: 0.9809\n",
312
+ " Val - Loss: 0.1628, Acc: 0.9356, AUC: 0.9675, Precision: 0.8600, Recall: 0.7636\n",
313
+ " No improvement (patience: 2/15)\n",
314
+ "\n",
315
+ "Epoch 4/80 - Time: 2.60s\n",
316
+ " Train - Loss: 0.1106, Acc: 0.9560, AUC: 0.9861\n",
317
+ " Val - Loss: 0.1670, Acc: 0.9350, AUC: 0.9673, Precision: 0.8419, Recall: 0.7827\n",
318
+ " No improvement (patience: 3/15)\n",
319
+ "\n",
320
+ "Epoch 5/80 - Time: 2.58s\n",
321
+ " Train - Loss: 0.0928, Acc: 0.9635, AUC: 0.9905\n",
322
+ " Val - Loss: 0.1775, Acc: 0.9353, AUC: 0.9666, Precision: 0.8465, Recall: 0.7785\n",
323
+ " No improvement (patience: 4/15)\n",
324
+ "\n",
325
+ "Epoch 6/80 - Time: 5.84s\n",
326
+ " Train - Loss: 0.0751, Acc: 0.9705, AUC: 0.9940\n",
327
+ " Val - Loss: 0.1964, Acc: 0.9311, AUC: 0.9616, Precision: 0.8285, Recall: 0.7744\n",
328
+ " No improvement (patience: 5/15)\n",
329
+ "\n",
330
+ "Epoch 7/80 - Time: 2.63s\n",
331
+ " Train - Loss: 0.0588, Acc: 0.9778, AUC: 0.9964\n",
332
+ " Val - Loss: 0.2131, Acc: 0.9334, AUC: 0.9642, Precision: 0.8321, Recall: 0.7854\n",
333
+ " No improvement (patience: 6/15)\n",
334
+ "\n",
335
+ "Epoch 8/80 - Time: 2.61s\n",
336
+ " Train - Loss: 0.0305, Acc: 0.9903, AUC: 0.9993\n",
337
+ " Val - Loss: 0.2407, Acc: 0.9333, AUC: 0.9630, Precision: 0.8456, Recall: 0.7661\n",
338
+ " No improvement (patience: 7/15)\n",
339
+ "\n",
340
+ "Epoch 9/80 - Time: 5.48s\n",
341
+ " Train - Loss: 0.0157, Acc: 0.9963, AUC: 0.9998\n",
342
+ " Val - Loss: 0.2672, Acc: 0.9336, AUC: 0.9619, Precision: 0.8380, Recall: 0.7782\n",
343
+ " No improvement (patience: 8/15)\n",
344
+ "\n",
345
+ "Epoch 10/80 - Time: 2.65s\n",
346
+ " Train - Loss: 0.0112, Acc: 0.9976, AUC: 0.9999\n",
347
+ " Val - Loss: 0.2903, Acc: 0.9336, AUC: 0.9607, Precision: 0.8379, Recall: 0.7785\n",
348
+ " No improvement (patience: 9/15)\n",
349
+ "\n",
350
+ "Epoch 11/80 - Time: 2.66s\n",
351
+ " Train - Loss: 0.0110, Acc: 0.9976, AUC: 0.9998\n",
352
+ " Val - Loss: 0.3117, Acc: 0.9331, AUC: 0.9601, Precision: 0.8398, Recall: 0.7722\n",
353
+ " No improvement (patience: 10/15)\n",
354
+ "\n",
355
+ "Epoch 12/80 - Time: 5.76s\n",
356
+ " Train - Loss: 0.0093, Acc: 0.9979, AUC: 0.9999\n",
357
+ " Val - Loss: 0.3281, Acc: 0.9320, AUC: 0.9602, Precision: 0.8372, Recall: 0.7688\n",
358
+ " No improvement (patience: 11/15)\n",
359
+ "\n",
360
+ "Epoch 13/80 - Time: 2.61s\n",
361
+ " Train - Loss: 0.0089, Acc: 0.9979, AUC: 0.9999\n",
362
+ " Val - Loss: 0.3364, Acc: 0.9310, AUC: 0.9587, Precision: 0.8312, Recall: 0.7695\n",
363
+ " No improvement (patience: 12/15)\n",
364
+ "\n",
365
+ "Epoch 14/80 - Time: 2.61s\n",
366
+ " Train - Loss: 0.0047, Acc: 0.9991, AUC: 1.0000\n",
367
+ " Val - Loss: 0.3367, Acc: 0.9324, AUC: 0.9605, Precision: 0.8286, Recall: 0.7834\n",
368
+ " No improvement (patience: 13/15)\n",
369
+ "\n",
370
+ "Epoch 15/80 - Time: 5.86s\n",
371
+ " Train - Loss: 0.0028, Acc: 0.9995, AUC: 1.0000\n",
372
+ " Val - Loss: 0.3522, Acc: 0.9332, AUC: 0.9601, Precision: 0.8400, Recall: 0.7731\n",
373
+ " No improvement (patience: 14/15)\n",
374
+ "\n",
375
+ "Epoch 16/80 - Time: 2.61s\n",
376
+ " Train - Loss: 0.0025, Acc: 0.9995, AUC: 1.0000\n",
377
+ " Val - Loss: 0.3577, Acc: 0.9325, AUC: 0.9589, Precision: 0.8336, Recall: 0.7771\n",
378
+ " No improvement (patience: 15/15)\n",
379
+ "\n",
380
+ "⚠️ Early stopping triggered after 16 epochs\n",
381
+ "\n",
382
+ "======================================================================\n",
383
+ "Loading best model...\n",
384
+ "βœ… Best model loaded (AUC: 0.9684)\n",
385
+ "πŸ’Ύ Model saved to: /home/knordby/Documents/labeling/models/cyber_labseEmbeddings.pt\n",
386
+ "⏱️ Total training time: 59.08s (0.98m)\n",
387
+ "======================================================================\n",
388
+ "\n"
389
+ ]
390
+ }
391
+ ],
392
+ "source": [
393
+ "# Set save path\n",
394
+ "model_path = '/home/knordby/Documents/labeling/models/cyber_labseEmbeddings.pt'\n",
395
+ "\n",
396
+ "# Train\n",
397
+ "model, history = train_model(\n",
398
+ " model, optimizer, criterion,\n",
399
+ " x_train, y_train, x_test, y_test,\n",
400
+ " device=device,\n",
401
+ " epochs=80,\n",
402
+ " batch_size=512,\n",
403
+ " model_path=model_path\n",
404
+ ")"
405
+ ]
406
+ },
407
+ {
408
+ "cell_type": "markdown",
409
+ "id": "736256f5-1fa1-4b37-b4da-5f38e6a9e9d6",
410
+ "metadata": {},
411
+ "source": [
412
+ "### Evaluate the Model's Performance Against the Test Set"
413
+ ]
414
+ },
415
+ {
416
+ "cell_type": "code",
417
+ "execution_count": 29,
418
+ "id": "a1d5e970-c4b7-4218-bf64-c23414e4bc96",
419
+ "metadata": {},
420
+ "outputs": [
421
+ {
422
+ "name": "stdout",
423
+ "output_type": "stream",
424
+ "text": [
425
+ "======================================================================\n",
426
+ "πŸ“ˆ CYBERSECURITY CLASSIFIER - FINAL TEST RESULTS\n",
427
+ "======================================================================\n",
428
+ " Loss: 0.1673\n",
429
+ " Accuracy: 0.9344 (93.44%)\n",
430
+ " Precision: 0.8494\n",
431
+ " Recall: 0.7689\n",
432
+ " AUC: 0.9650\n",
433
+ " F1 Score: 0.8071\n",
434
+ "\n",
435
+ "Confusion Matrix:\n",
436
+ " Predicted\n",
437
+ " Negative Positive\n",
438
+ "Actual Negative 33162 1012\n",
439
+ " Positive 1716 5708\n",
440
+ "\n",
441
+ "Detailed Metrics:\n",
442
+ " True Positives: 5708\n",
443
+ " True Negatives: 33162\n",
444
+ " False Positives: 1012\n",
445
+ " False Negatives: 1716\n",
446
+ " Specificity: 0.9704\n",
447
+ " NPV: 0.9508\n",
448
+ "\n",
449
+ "Classification Report:\n",
450
+ " precision recall f1-score support\n",
451
+ "\n",
452
+ " Non-Cyber 0.9508 0.9704 0.9605 34174\n",
453
+ " Cyber 0.8494 0.7689 0.8071 7424\n",
454
+ "\n",
455
+ " accuracy 0.9344 41598\n",
456
+ " macro avg 0.9001 0.8696 0.8838 41598\n",
457
+ "weighted avg 0.9327 0.9344 0.9331 41598\n",
458
+ "\n",
459
+ "======================================================================\n",
460
+ "\n",
461
+ "Test AUC: 0.9650\n"
462
+ ]
463
+ }
464
+ ],
465
+ "source": [
466
+ "# Evaluate with detailed metrics\n",
467
+ "y_pred_probs, metrics = evaluate_model(\n",
468
+ " model, x_test, y_test,\n",
469
+ " device=device\n",
470
+ ")\n",
471
+ "\n",
472
+ "# Access individual metrics if needed\n",
473
+ "print(f\"Test AUC: {metrics['auc']:.4f}\")"
474
+ ]
475
+ },
476
+ {
477
+ "cell_type": "markdown",
478
+ "id": "af1ddbc6-372a-4d2c-9f04-e4f3987165db",
479
+ "metadata": {},
480
+ "source": [
481
+ "### Push the Model"
482
+ ]
483
+ },
484
+ {
485
+ "cell_type": "code",
486
+ "execution_count": 30,
487
+ "id": "7ef72e71-49af-4d6b-9b44-3f1dcb03bcf9",
488
+ "metadata": {},
489
+ "outputs": [
490
+ {
491
+ "name": "stdout",
492
+ "output_type": "stream",
493
+ "text": [
494
+ "\n",
495
+ "======================================================================\n",
496
+ "PUSHING MODEL TO HUGGINGFACE\n",
497
+ "======================================================================\n",
498
+ "Repository: kristiangnordby/cyberLabse\n",
499
+ "Private: False\n",
500
+ "======================================================================\n",
501
+ "\n",
502
+ "βœ… Repository created/verified: kristiangnordby/cyberLabse\n",
503
+ "\n",
504
+ "πŸ“ Creating model card...\n",
505
+ "βš™οΈ Saving configuration...\n",
506
+ "πŸ—οΈ Saving model architecture...\n",
507
+ "πŸ’Ύ Preparing model checkpoint...\n",
508
+ "\n",
509
+ "πŸ“€ Uploading files to HuggingFace...\n",
510
+ " βœ“ Uploaded: README.md\n",
511
+ " βœ“ Uploaded: config.json\n",
512
+ " βœ“ Uploaded: model_architecture.py\n"
513
+ ]
514
+ },
515
+ {
516
+ "data": {
517
+ "application/vnd.jupyter.widget-view+json": {
518
+ "model_id": "268511164a864b7ca53f82ebf30f1599",
519
+ "version_major": 2,
520
+ "version_minor": 0
521
+ },
522
+ "text/plain": [
523
+ "Processing Files (0 / 0): | | 0.00B / 0.00B "
524
+ ]
525
+ },
526
+ "metadata": {},
527
+ "output_type": "display_data"
528
+ },
529
+ {
530
+ "data": {
531
+ "application/vnd.jupyter.widget-view+json": {
532
+ "model_id": "317b3bbba97846fb9547da11b76b5693",
533
+ "version_major": 2,
534
+ "version_minor": 0
535
+ },
536
+ "text/plain": [
537
+ "New Data Upload: | | 0.00B / 0.00B "
538
+ ]
539
+ },
540
+ "metadata": {},
541
+ "output_type": "display_data"
542
+ },
543
+ {
544
+ "name": "stdout",
545
+ "output_type": "stream",
546
+ "text": [
547
+ " βœ“ Uploaded: model.pt\n",
548
+ "\n",
549
+ "======================================================================\n",
550
+ "βœ… MODEL SUCCESSFULLY PUSHED TO HUGGINGFACE!\n",
551
+ "======================================================================\n",
552
+ "πŸ”— View your model at: https://huggingface.co/kristiangnordby/cyberLabse\n",
553
+ "======================================================================\n",
554
+ "\n",
555
+ "Model available at: https://huggingface.co/kristiangnordby/cyberLabse\n"
556
+ ]
557
+ }
558
+ ],
559
+ "source": [
560
+ "from push_to_huggingface import push_to_huggingface\n",
561
+ "\n",
562
+ "with open(\"hf_token.txt\",'r') as f:\n",
563
+ " token = f.read()\n",
564
+ "\n",
565
+ "# Push your model (after training and evaluation)\n",
566
+ "repo_url = push_to_huggingface(\n",
567
+ " model_path='/home/knordby/Documents/labeling/models/cyber_labseEmbeddings.pt',\n",
568
+ " repo_name='cyberLabse', # Choose your repo name\n",
569
+ " metrics=metrics, # From evaluate_model()\n",
570
+ " input_dim=x_train.shape[1], # Your embedding dimension\n",
571
+ " hf_token=token, # Your token\n",
572
+ " private=False # Set True if you want private repo\n",
573
+ ")\n",
574
+ "\n",
575
+ "print(f\"Model available at: {repo_url}\")"
576
+ ]
577
+ }
578
+ ],
579
+ "metadata": {
580
+ "kernelspec": {
581
+ "display_name": "vanilla",
582
+ "language": "python",
583
+ "name": "vanilla"
584
+ },
585
+ "language_info": {
586
+ "codemirror_mode": {
587
+ "name": "ipython",
588
+ "version": 3
589
+ },
590
+ "file_extension": ".py",
591
+ "mimetype": "text/x-python",
592
+ "name": "python",
593
+ "nbconvert_exporter": "python",
594
+ "pygments_lexer": "ipython3",
595
+ "version": "3.10.19"
596
+ }
597
+ },
598
+ "nbformat": 4,
599
+ "nbformat_minor": 5
600
+ }