Spaces:
Sleeping
Sleeping
plus rubert
Browse files- app.py +7 -0
- first.ipynb +425 -234
app.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import numpy as np
|
| 4 |
+
from sentence_transformers import SentenceTransformer
|
| 5 |
+
import faiss
|
| 6 |
+
import pickle
|
| 7 |
+
import time
|
first.ipynb
CHANGED
|
@@ -2,7 +2,7 @@
|
|
| 2 |
"cells": [
|
| 3 |
{
|
| 4 |
"cell_type": "code",
|
| 5 |
-
"execution_count":
|
| 6 |
"metadata": {},
|
| 7 |
"outputs": [],
|
| 8 |
"source": [
|
|
@@ -10,12 +10,31 @@
|
|
| 10 |
"import numpy as np\n",
|
| 11 |
"from sklearn.metrics.pairwise import cosine_similarity\n",
|
| 12 |
"import torch\n",
|
| 13 |
-
"import pandas as pd"
|
|
|
|
|
|
|
|
|
|
| 14 |
]
|
| 15 |
},
|
| 16 |
{
|
| 17 |
"cell_type": "code",
|
| 18 |
-
"execution_count":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
"metadata": {},
|
| 20 |
"outputs": [],
|
| 21 |
"source": [
|
|
@@ -25,7 +44,7 @@
|
|
| 25 |
},
|
| 26 |
{
|
| 27 |
"cell_type": "code",
|
| 28 |
-
"execution_count":
|
| 29 |
"metadata": {},
|
| 30 |
"outputs": [],
|
| 31 |
"source": [
|
|
@@ -35,7 +54,7 @@
|
|
| 35 |
},
|
| 36 |
{
|
| 37 |
"cell_type": "code",
|
| 38 |
-
"execution_count":
|
| 39 |
"metadata": {},
|
| 40 |
"outputs": [],
|
| 41 |
"source": [
|
|
@@ -44,7 +63,7 @@
|
|
| 44 |
},
|
| 45 |
{
|
| 46 |
"cell_type": "code",
|
| 47 |
-
"execution_count":
|
| 48 |
"metadata": {},
|
| 49 |
"outputs": [],
|
| 50 |
"source": [
|
|
@@ -54,7 +73,7 @@
|
|
| 54 |
},
|
| 55 |
{
|
| 56 |
"cell_type": "code",
|
| 57 |
-
"execution_count":
|
| 58 |
"metadata": {},
|
| 59 |
"outputs": [
|
| 60 |
{
|
|
@@ -82,7 +101,7 @@
|
|
| 82 |
},
|
| 83 |
{
|
| 84 |
"cell_type": "code",
|
| 85 |
-
"execution_count":
|
| 86 |
"metadata": {},
|
| 87 |
"outputs": [
|
| 88 |
{
|
|
@@ -181,7 +200,7 @@
|
|
| 181 |
"4 Компания мелких подонков старшего школьного во... 179 "
|
| 182 |
]
|
| 183 |
},
|
| 184 |
-
"execution_count":
|
| 185 |
"metadata": {},
|
| 186 |
"output_type": "execute_result"
|
| 187 |
}
|
|
@@ -192,7 +211,7 @@
|
|
| 192 |
},
|
| 193 |
{
|
| 194 |
"cell_type": "code",
|
| 195 |
-
"execution_count":
|
| 196 |
"metadata": {},
|
| 197 |
"outputs": [
|
| 198 |
{
|
|
@@ -201,7 +220,7 @@
|
|
| 201 |
"'История о Шерлоке Холмсе и докторе Ватсоне в Лондоне начала двадцать первого века успела прославиться не только как одна из самых стильных и интригующих экранизаций рассказов сэра Артура Конан-Дойла, но и как шоу, создатели которого заставляют фанатов изнывать в ожидании новых серий не меньше пары-тройки мучительных лет. Ожидание, впрочем, того стоит.\\r\\nВ чем суть?\\r\\nХромающий афганский ветеран ищет соседа по квартире и по совету знакомого отправляется в морг к возможному кандидату. К счастью, живому — но с характером, являющим собой ядерный коктейль из социопатии, презрения к интеллектуальному уровню окружающих и прочих милых личностных качеств, включая неоспоримую гениальность. Лондон, тем временем, сотрясает череда необъяснимых убийств, разобраться с которыми бравым служакам из Скотленд Ярда оказывается не по зубам. В дело под бодрый саундтрек вступают Шерлок и доктор Ватсон, вооруженные в придачу к давно знакомой дедукции еще и личным блогом, закодированными мобильниками и прочими благами цивилизации.'"
|
| 202 |
]
|
| 203 |
},
|
| 204 |
-
"execution_count":
|
| 205 |
"metadata": {},
|
| 206 |
"output_type": "execute_result"
|
| 207 |
}
|
|
@@ -212,74 +231,79 @@
|
|
| 212 |
},
|
| 213 |
{
|
| 214 |
"cell_type": "code",
|
| 215 |
-
"execution_count":
|
| 216 |
"metadata": {},
|
| 217 |
"outputs": [],
|
| 218 |
"source": [
|
| 219 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 220 |
]
|
| 221 |
},
|
| 222 |
{
|
| 223 |
"cell_type": "code",
|
| 224 |
-
"execution_count":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 225 |
"metadata": {},
|
| 226 |
"outputs": [
|
| 227 |
{
|
| 228 |
"data": {
|
| 229 |
-
"application/vnd.jupyter.widget-view+json": {
|
| 230 |
-
"model_id": "f23f50b1b1fa41b395ce4eecea9f01dd",
|
| 231 |
-
"version_major": 2,
|
| 232 |
-
"version_minor": 0
|
| 233 |
-
},
|
| 234 |
-
"text/plain": [
|
| 235 |
-
"modules.json: 0%| | 0.00/229 [00:00<?, ?B/s]"
|
| 236 |
-
]
|
| 237 |
-
},
|
| 238 |
-
"metadata": {},
|
| 239 |
-
"output_type": "display_data"
|
| 240 |
-
},
|
| 241 |
-
{
|
| 242 |
-
"data": {
|
| 243 |
-
"application/vnd.jupyter.widget-view+json": {
|
| 244 |
-
"model_id": "06a293770d35474e93f549f159422783",
|
| 245 |
-
"version_major": 2,
|
| 246 |
-
"version_minor": 0
|
| 247 |
-
},
|
| 248 |
-
"text/plain": [
|
| 249 |
-
"config_sentence_transformers.json: 0%| | 0.00/122 [00:00<?, ?B/s]"
|
| 250 |
-
]
|
| 251 |
-
},
|
| 252 |
-
"metadata": {},
|
| 253 |
-
"output_type": "display_data"
|
| 254 |
-
},
|
| 255 |
-
{
|
| 256 |
-
"data": {
|
| 257 |
-
"application/vnd.jupyter.widget-view+json": {
|
| 258 |
-
"model_id": "c89e7b45fa0347e0a94fffaf53773082",
|
| 259 |
-
"version_major": 2,
|
| 260 |
-
"version_minor": 0
|
| 261 |
-
},
|
| 262 |
-
"text/plain": [
|
| 263 |
-
"README.md: 0%| | 0.00/4.13k [00:00<?, ?B/s]"
|
| 264 |
-
]
|
| 265 |
-
},
|
| 266 |
-
"metadata": {},
|
| 267 |
-
"output_type": "display_data"
|
| 268 |
-
},
|
| 269 |
-
{
|
| 270 |
-
"data": {
|
| 271 |
-
"application/vnd.jupyter.widget-view+json": {
|
| 272 |
-
"model_id": "a843cd7d29b5444a92ebe0c3dae2da7b",
|
| 273 |
-
"version_major": 2,
|
| 274 |
-
"version_minor": 0
|
| 275 |
-
},
|
| 276 |
"text/plain": [
|
| 277 |
-
"
|
| 278 |
]
|
| 279 |
},
|
|
|
|
| 280 |
"metadata": {},
|
| 281 |
-
"output_type": "
|
| 282 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 283 |
{
|
| 284 |
"name": "stderr",
|
| 285 |
"output_type": "stream",
|
|
@@ -287,104 +311,6 @@
|
|
| 287 |
"/home/vera/miniforge3/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
|
| 288 |
" warnings.warn(\n"
|
| 289 |
]
|
| 290 |
-
},
|
| 291 |
-
{
|
| 292 |
-
"data": {
|
| 293 |
-
"application/vnd.jupyter.widget-view+json": {
|
| 294 |
-
"model_id": "1a19f9f07f554f21bb862699155745b8",
|
| 295 |
-
"version_major": 2,
|
| 296 |
-
"version_minor": 0
|
| 297 |
-
},
|
| 298 |
-
"text/plain": [
|
| 299 |
-
"config.json: 0%| | 0.00/723 [00:00<?, ?B/s]"
|
| 300 |
-
]
|
| 301 |
-
},
|
| 302 |
-
"metadata": {},
|
| 303 |
-
"output_type": "display_data"
|
| 304 |
-
},
|
| 305 |
-
{
|
| 306 |
-
"data": {
|
| 307 |
-
"application/vnd.jupyter.widget-view+json": {
|
| 308 |
-
"model_id": "b87d2b35368c4ab69bebfa4551c907f2",
|
| 309 |
-
"version_major": 2,
|
| 310 |
-
"version_minor": 0
|
| 311 |
-
},
|
| 312 |
-
"text/plain": [
|
| 313 |
-
"model.safetensors: 0%| | 0.00/1.11G [00:00<?, ?B/s]"
|
| 314 |
-
]
|
| 315 |
-
},
|
| 316 |
-
"metadata": {},
|
| 317 |
-
"output_type": "display_data"
|
| 318 |
-
},
|
| 319 |
-
{
|
| 320 |
-
"data": {
|
| 321 |
-
"application/vnd.jupyter.widget-view+json": {
|
| 322 |
-
"model_id": "b01711c9aaf048b6a79d8ebcbdbd5e7e",
|
| 323 |
-
"version_major": 2,
|
| 324 |
-
"version_minor": 0
|
| 325 |
-
},
|
| 326 |
-
"text/plain": [
|
| 327 |
-
"tokenizer_config.json: 0%| | 0.00/402 [00:00<?, ?B/s]"
|
| 328 |
-
]
|
| 329 |
-
},
|
| 330 |
-
"metadata": {},
|
| 331 |
-
"output_type": "display_data"
|
| 332 |
-
},
|
| 333 |
-
{
|
| 334 |
-
"data": {
|
| 335 |
-
"application/vnd.jupyter.widget-view+json": {
|
| 336 |
-
"model_id": "b45af46c58ae436e88b8f49b105580da",
|
| 337 |
-
"version_major": 2,
|
| 338 |
-
"version_minor": 0
|
| 339 |
-
},
|
| 340 |
-
"text/plain": [
|
| 341 |
-
"sentencepiece.bpe.model: 0%| | 0.00/5.07M [00:00<?, ?B/s]"
|
| 342 |
-
]
|
| 343 |
-
},
|
| 344 |
-
"metadata": {},
|
| 345 |
-
"output_type": "display_data"
|
| 346 |
-
},
|
| 347 |
-
{
|
| 348 |
-
"data": {
|
| 349 |
-
"application/vnd.jupyter.widget-view+json": {
|
| 350 |
-
"model_id": "19d38eb3f34a4fda889e9ed69e48768b",
|
| 351 |
-
"version_major": 2,
|
| 352 |
-
"version_minor": 0
|
| 353 |
-
},
|
| 354 |
-
"text/plain": [
|
| 355 |
-
"tokenizer.json: 0%| | 0.00/9.08M [00:00<?, ?B/s]"
|
| 356 |
-
]
|
| 357 |
-
},
|
| 358 |
-
"metadata": {},
|
| 359 |
-
"output_type": "display_data"
|
| 360 |
-
},
|
| 361 |
-
{
|
| 362 |
-
"data": {
|
| 363 |
-
"application/vnd.jupyter.widget-view+json": {
|
| 364 |
-
"model_id": "fb3b9db7b5f04921bf720a6245863e98",
|
| 365 |
-
"version_major": 2,
|
| 366 |
-
"version_minor": 0
|
| 367 |
-
},
|
| 368 |
-
"text/plain": [
|
| 369 |
-
"special_tokens_map.json: 0%| | 0.00/239 [00:00<?, ?B/s]"
|
| 370 |
-
]
|
| 371 |
-
},
|
| 372 |
-
"metadata": {},
|
| 373 |
-
"output_type": "display_data"
|
| 374 |
-
},
|
| 375 |
-
{
|
| 376 |
-
"data": {
|
| 377 |
-
"application/vnd.jupyter.widget-view+json": {
|
| 378 |
-
"model_id": "e3a43d2e151142f4a4ff990b9716731d",
|
| 379 |
-
"version_major": 2,
|
| 380 |
-
"version_minor": 0
|
| 381 |
-
},
|
| 382 |
-
"text/plain": [
|
| 383 |
-
"1_Pooling/config.json: 0%| | 0.00/190 [00:00<?, ?B/s]"
|
| 384 |
-
]
|
| 385 |
-
},
|
| 386 |
-
"metadata": {},
|
| 387 |
-
"output_type": "display_data"
|
| 388 |
}
|
| 389 |
],
|
| 390 |
"source": [
|
|
@@ -398,7 +324,7 @@
|
|
| 398 |
},
|
| 399 |
{
|
| 400 |
"cell_type": "code",
|
| 401 |
-
"execution_count":
|
| 402 |
"metadata": {},
|
| 403 |
"outputs": [],
|
| 404 |
"source": [
|
|
@@ -408,7 +334,7 @@
|
|
| 408 |
},
|
| 409 |
{
|
| 410 |
"cell_type": "code",
|
| 411 |
-
"execution_count":
|
| 412 |
"metadata": {},
|
| 413 |
"outputs": [],
|
| 414 |
"source": [
|
|
@@ -420,7 +346,7 @@
|
|
| 420 |
},
|
| 421 |
{
|
| 422 |
"cell_type": "code",
|
| 423 |
-
"execution_count":
|
| 424 |
"metadata": {},
|
| 425 |
"outputs": [],
|
| 426 |
"source": [
|
|
@@ -429,13 +355,15 @@
|
|
| 429 |
" query_embedding = model.encode([user_query], convert_to_tensor=True).cpu().numpy()\n",
|
| 430 |
" query_embedding = query_embedding / np.linalg.norm(query_embedding, axis=1, keepdims=True) # Нормализация\n",
|
| 431 |
" D, I = index.search(query_embedding, top_k)\n",
|
| 432 |
-
" results = data.iloc[I[0]]\n",
|
| 433 |
-
"
|
|
|
|
|
|
|
| 434 |
]
|
| 435 |
},
|
| 436 |
{
|
| 437 |
"cell_type": "code",
|
| 438 |
-
"execution_count":
|
| 439 |
"metadata": {},
|
| 440 |
"outputs": [
|
| 441 |
{
|
|
@@ -464,6 +392,7 @@
|
|
| 464 |
" <th>tvshow_title</th>\n",
|
| 465 |
" <th>description</th>\n",
|
| 466 |
" <th>annotation_len</th>\n",
|
|
|
|
| 467 |
" </tr>\n",
|
| 468 |
" </thead>\n",
|
| 469 |
" <tbody>\n",
|
|
@@ -472,16 +401,18 @@
|
|
| 472 |
" <td>https://myshows.me/view/187/</td>\n",
|
| 473 |
" <td>https://media.myshows.me/shows/1920/0/c2/0c296...</td>\n",
|
| 474 |
" <td>Во все тяжкие</td>\n",
|
| 475 |
-
" <td>Культовый сериал о
|
| 476 |
" <td>142</td>\n",
|
|
|
|
| 477 |
" </tr>\n",
|
| 478 |
" <tr>\n",
|
| 479 |
" <th>19</th>\n",
|
| 480 |
" <td>https://myshows.me/view/55/</td>\n",
|
| 481 |
" <td>https://media.myshows.me/shows/1920/3/21/32187...</td>\n",
|
| 482 |
" <td>Клиника</td>\n",
|
| 483 |
-
" <td>Отучившись четыре года в медицинской
|
| 484 |
" <td>46</td>\n",
|
|
|
|
| 485 |
" </tr>\n",
|
| 486 |
" <tr>\n",
|
| 487 |
" <th>43</th>\n",
|
|
@@ -490,62 +421,70 @@
|
|
| 490 |
" <td>Блудливая Калифорния</td>\n",
|
| 491 |
" <td>Талантливый и некогда известный писатель Хэнк ...</td>\n",
|
| 492 |
" <td>56</td>\n",
|
|
|
|
| 493 |
" </tr>\n",
|
| 494 |
" <tr>\n",
|
| 495 |
-
" <th>
|
| 496 |
-
" <td>https://myshows.me/view/
|
| 497 |
-
" <td>https://media.myshows.me/shows/1920/
|
| 498 |
-
" <td
|
| 499 |
-
" <td
|
| 500 |
-
" <td>
|
|
|
|
| 501 |
" </tr>\n",
|
| 502 |
" <tr>\n",
|
| 503 |
" <th>35</th>\n",
|
| 504 |
" <td>https://myshows.me/view/34737/</td>\n",
|
| 505 |
" <td>https://media.myshows.me/shows/1920/c/72/c7251...</td>\n",
|
| 506 |
" <td>Рик и Морти</td>\n",
|
| 507 |
-
" <td>В центре сюжета
|
| 508 |
" <td>81</td>\n",
|
|
|
|
| 509 |
" </tr>\n",
|
| 510 |
" <tr>\n",
|
| 511 |
-
" <th>
|
| 512 |
-
" <td>https://myshows.me/view/
|
| 513 |
-
" <td>https://media.myshows.me/shows/1920/
|
| 514 |
-
" <td
|
| 515 |
-
" <td
|
| 516 |
-
" <td>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 517 |
" </tr>\n",
|
| 518 |
" <tr>\n",
|
| 519 |
" <th>31</th>\n",
|
| 520 |
" <td>https://myshows.me/view/58610/</td>\n",
|
| 521 |
" <td>https://media.myshows.me/shows/1920/e/76/e763f...</td>\n",
|
| 522 |
" <td>Ведьмак</td>\n",
|
| 523 |
-
" <td>Геральт из Ривии
|
| 524 |
" <td>162</td>\n",
|
|
|
|
| 525 |
" </tr>\n",
|
| 526 |
" <tr>\n",
|
| 527 |
-
" <th>
|
| 528 |
-
" <td>https://myshows.me/view/
|
| 529 |
-
" <td>https://media.myshows.me/shows/1920/
|
| 530 |
-
" <td
|
| 531 |
-
" <td
|
| 532 |
-
" <td>
|
| 533 |
-
"
|
| 534 |
-
" <tr>\n",
|
| 535 |
-
" <th>46</th>\n",
|
| 536 |
-
" <td>https://myshows.me/view/34265/</td>\n",
|
| 537 |
-
" <td>https://media.myshows.me/shows/1920/c/45/c45a8...</td>\n",
|
| 538 |
-
" <td>Фарго</td>\n",
|
| 539 |
-
" <td>Первый сезон В январе 2006 года Лорн Малво (Би...</td>\n",
|
| 540 |
-
" <td>227</td>\n",
|
| 541 |
-
" </tr>\n",
|
| 542 |
-
" <tr>\n",
|
| 543 |
-
" <th>21</th>\n",
|
| 544 |
-
" <td>https://myshows.me/view/59512/</td>\n",
|
| 545 |
-
" <td>https://media.myshows.me/shows/1920/1/64/16430...</td>\n",
|
| 546 |
-
" <td>Половое воспитание</td>\n",
|
| 547 |
-
" <td>Оттис — замкнутый интроверт-девственник, живущ...</td>\n",
|
| 548 |
-
" <td>117</td>\n",
|
| 549 |
" </tr>\n",
|
| 550 |
" </tbody>\n",
|
| 551 |
"</table>\n",
|
|
@@ -556,59 +495,311 @@
|
|
| 556 |
"5 https://myshows.me/view/187/ \n",
|
| 557 |
"19 https://myshows.me/view/55/ \n",
|
| 558 |
"43 https://myshows.me/view/10/ \n",
|
| 559 |
-
"7 https://myshows.me/view/1/ \n",
|
| 560 |
-
"35 https://myshows.me/view/34737/ \n",
|
| 561 |
"40 https://myshows.me/view/37718/ \n",
|
| 562 |
-
"
|
| 563 |
-
"20 https://myshows.me/view/30443/ \n",
|
| 564 |
-
"46 https://myshows.me/view/34265/ \n",
|
| 565 |
"21 https://myshows.me/view/59512/ \n",
|
|
|
|
|
|
|
|
|
|
|
|
|
| 566 |
"\n",
|
| 567 |
" image_url \\\n",
|
| 568 |
"5 https://media.myshows.me/shows/1920/0/c2/0c296... \n",
|
| 569 |
"19 https://media.myshows.me/shows/1920/3/21/32187... \n",
|
| 570 |
"43 https://media.myshows.me/shows/1920/8/ac/8ac41... \n",
|
| 571 |
-
"7 https://media.myshows.me/shows/1920/0/73/073db... \n",
|
| 572 |
-
"35 https://media.myshows.me/shows/1920/c/72/c7251... \n",
|
| 573 |
"40 https://media.myshows.me/shows/1920/a/b3/ab397... \n",
|
| 574 |
-
"
|
| 575 |
-
"20 https://media.myshows.me/shows/1920/e/e4/ee415... \n",
|
| 576 |
-
"46 https://media.myshows.me/shows/1920/c/45/c45a8... \n",
|
| 577 |
"21 https://media.myshows.me/shows/1920/1/64/16430... \n",
|
|
|
|
|
|
|
|
|
|
|
|
|
| 578 |
"\n",
|
| 579 |
" tvshow_title \\\n",
|
| 580 |
"5 Во все тяжкие \n",
|
| 581 |
"19 Клиника \n",
|
| 582 |
"43 Блудливая Калифорния \n",
|
| 583 |
-
"7 Доктор Хаус \n",
|
| 584 |
-
"35 Рик и Морти \n",
|
| 585 |
"40 Как избежать наказания за убийство \n",
|
| 586 |
-
"
|
| 587 |
-
"20 Ганнибал \n",
|
| 588 |
-
"46 Фарго \n",
|
| 589 |
"21 Половое воспитание \n",
|
|
|
|
|
|
|
|
|
|
|
|
|
| 590 |
"\n",
|
| 591 |
-
" description annotation_len
|
| 592 |
-
"5 Культовый сериал о
|
| 593 |
-
"19 Отучившись четыре года в медицинской
|
| 594 |
-
"43 Талантливый и некогда известный писатель Хэнк ... 56
|
| 595 |
-
"
|
| 596 |
-
"35 В центре сюжета
|
| 597 |
-
"
|
| 598 |
-
"
|
| 599 |
-
"
|
| 600 |
-
"
|
| 601 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 602 |
]
|
| 603 |
},
|
| 604 |
-
"execution_count":
|
| 605 |
"metadata": {},
|
| 606 |
"output_type": "execute_result"
|
| 607 |
}
|
| 608 |
],
|
| 609 |
"source": [
|
| 610 |
"# Пример пользовательского запроса\n",
|
| 611 |
-
"user_query = \"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 612 |
"top_k_results = search_series(user_query)\n",
|
| 613 |
"top_k_results"
|
| 614 |
]
|
|
|
|
| 2 |
"cells": [
|
| 3 |
{
|
| 4 |
"cell_type": "code",
|
| 5 |
+
"execution_count": 139,
|
| 6 |
"metadata": {},
|
| 7 |
"outputs": [],
|
| 8 |
"source": [
|
|
|
|
| 10 |
"import numpy as np\n",
|
| 11 |
"from sklearn.metrics.pairwise import cosine_similarity\n",
|
| 12 |
"import torch\n",
|
| 13 |
+
"import pandas as pd\n",
|
| 14 |
+
"import re\n",
|
| 15 |
+
"import nltk\n",
|
| 16 |
+
"from nltk.corpus import stopwords"
|
| 17 |
]
|
| 18 |
},
|
| 19 |
{
|
| 20 |
"cell_type": "code",
|
| 21 |
+
"execution_count": 140,
|
| 22 |
+
"metadata": {},
|
| 23 |
+
"outputs": [],
|
| 24 |
+
"source": [
|
| 25 |
+
"# nltk.download('stopwords')"
|
| 26 |
+
]
|
| 27 |
+
},
|
| 28 |
+
{
|
| 29 |
+
"cell_type": "markdown",
|
| 30 |
+
"metadata": {},
|
| 31 |
+
"source": [
|
| 32 |
+
"# Sentence Transformers"
|
| 33 |
+
]
|
| 34 |
+
},
|
| 35 |
+
{
|
| 36 |
+
"cell_type": "code",
|
| 37 |
+
"execution_count": 141,
|
| 38 |
"metadata": {},
|
| 39 |
"outputs": [],
|
| 40 |
"source": [
|
|
|
|
| 44 |
},
|
| 45 |
{
|
| 46 |
"cell_type": "code",
|
| 47 |
+
"execution_count": 142,
|
| 48 |
"metadata": {},
|
| 49 |
"outputs": [],
|
| 50 |
"source": [
|
|
|
|
| 54 |
},
|
| 55 |
{
|
| 56 |
"cell_type": "code",
|
| 57 |
+
"execution_count": 143,
|
| 58 |
"metadata": {},
|
| 59 |
"outputs": [],
|
| 60 |
"source": [
|
|
|
|
| 63 |
},
|
| 64 |
{
|
| 65 |
"cell_type": "code",
|
| 66 |
+
"execution_count": 144,
|
| 67 |
"metadata": {},
|
| 68 |
"outputs": [],
|
| 69 |
"source": [
|
|
|
|
| 73 |
},
|
| 74 |
{
|
| 75 |
"cell_type": "code",
|
| 76 |
+
"execution_count": 145,
|
| 77 |
"metadata": {},
|
| 78 |
"outputs": [
|
| 79 |
{
|
|
|
|
| 101 |
},
|
| 102 |
{
|
| 103 |
"cell_type": "code",
|
| 104 |
+
"execution_count": 146,
|
| 105 |
"metadata": {},
|
| 106 |
"outputs": [
|
| 107 |
{
|
|
|
|
| 200 |
"4 Компания мелких подонков старшего школьного во... 179 "
|
| 201 |
]
|
| 202 |
},
|
| 203 |
+
"execution_count": 146,
|
| 204 |
"metadata": {},
|
| 205 |
"output_type": "execute_result"
|
| 206 |
}
|
|
|
|
| 211 |
},
|
| 212 |
{
|
| 213 |
"cell_type": "code",
|
| 214 |
+
"execution_count": 147,
|
| 215 |
"metadata": {},
|
| 216 |
"outputs": [
|
| 217 |
{
|
|
|
|
| 220 |
"'История о Шерлоке Холмсе и докторе Ватсоне в Лондоне начала двадцать первого века успела прославиться не только как одна из самых стильных и интригующих экранизаций рассказов сэра Артура Конан-Дойла, но и как шоу, создатели которого заставляют фанатов изнывать в ожидании новых серий не меньше пары-тройки мучительных лет. Ожидание, впрочем, того стоит.\\r\\nВ чем суть?\\r\\nХромающий афганский ветеран ищет соседа по квартире и по совету знакомого отправляется в морг к возможному кандидату. К счастью, живому — но с характером, являющим собой ядерный коктейль из социопатии, презрения к интеллектуальному уровню окружающих и прочих милых личностных качеств, включая неоспоримую гениальность. Лондон, тем временем, сотрясает череда необъяснимых убийств, разобраться с которыми бравым служакам из Скотленд Ярда оказывается не по зубам. В дело под бодрый саундтрек вступают Шерлок и доктор Ватсон, вооруженные в придачу к давно знакомой дедукции еще и личным блогом, закодированными мобильниками и прочими благами цивилизации.'"
|
| 221 |
]
|
| 222 |
},
|
| 223 |
+
"execution_count": 147,
|
| 224 |
"metadata": {},
|
| 225 |
"output_type": "execute_result"
|
| 226 |
}
|
|
|
|
| 231 |
},
|
| 232 |
{
|
| 233 |
"cell_type": "code",
|
| 234 |
+
"execution_count": 148,
|
| 235 |
"metadata": {},
|
| 236 |
"outputs": [],
|
| 237 |
"source": [
|
| 238 |
+
"# def clean_text(text):\n",
|
| 239 |
+
"# # Удаление всего, что не является буквами или знаками препинания\n",
|
| 240 |
+
"# clean_pattern = re.compile(r'[^a-zA-Zа-яА-ЯёЁ0-9.,!?;:\\s]')\n",
|
| 241 |
+
"# text = clean_pattern.sub('', text)\n",
|
| 242 |
+
"# url_pattern = re.compile(r'http\\S+|www\\S+|https\\S+')\n",
|
| 243 |
+
"# text = url_pattern.sub(r'', text)\n",
|
| 244 |
+
"# text = re.sub(\"\\s+\", \" \", text)\n",
|
| 245 |
+
"# stop_words = set(stopwords.words(\"russian\"))\n",
|
| 246 |
+
"# splitted_text = [word for word in text.split() if word not in stop_words]\n",
|
| 247 |
+
"# text = \" \".join(splitted_text)\n",
|
| 248 |
+
"# return text"
|
| 249 |
]
|
| 250 |
},
|
| 251 |
{
|
| 252 |
"cell_type": "code",
|
| 253 |
+
"execution_count": 149,
|
| 254 |
+
"metadata": {},
|
| 255 |
+
"outputs": [],
|
| 256 |
+
"source": [
|
| 257 |
+
"#Предобработка текста\n",
|
| 258 |
+
"\n",
|
| 259 |
+
"\n",
|
| 260 |
+
"def clean_text(text):\n",
|
| 261 |
+
" # Удаляем переносы строк\n",
|
| 262 |
+
" text = re.sub(r'\\r\\n', ' ', text)\n",
|
| 263 |
+
" \n",
|
| 264 |
+
" # Удаляем все, кроме букв и пробелов\n",
|
| 265 |
+
" text = re.sub(r'[^а-яА-Яa-zA-Z\\s]', '', text)\n",
|
| 266 |
+
" \n",
|
| 267 |
+
" # Удаляем лишние пробелы\n",
|
| 268 |
+
" text = re.sub(r'\\s+', ' ', text).strip()\n",
|
| 269 |
+
" \n",
|
| 270 |
+
" return text"
|
| 271 |
+
]
|
| 272 |
+
},
|
| 273 |
+
{
|
| 274 |
+
"cell_type": "code",
|
| 275 |
+
"execution_count": 150,
|
| 276 |
+
"metadata": {},
|
| 277 |
+
"outputs": [],
|
| 278 |
+
"source": [
|
| 279 |
+
"data['description'] = data['description'].apply(clean_text)"
|
| 280 |
+
]
|
| 281 |
+
},
|
| 282 |
+
{
|
| 283 |
+
"cell_type": "code",
|
| 284 |
+
"execution_count": 151,
|
| 285 |
"metadata": {},
|
| 286 |
"outputs": [
|
| 287 |
{
|
| 288 |
"data": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 289 |
"text/plain": [
|
| 290 |
+
"'История о Шерлоке Холмсе и докторе Ватсоне в Лондоне начала двадцать первого века успела прославиться не только как одна из самых стильных и интригующих экранизаций рассказов сэра Артура КонанДойла но и как шоу создатели которого заставляют фанатов изнывать в ожидании новых серий не меньше парытройки мучительных лет Ожидание впрочем того стоит В чем суть Хромающий афганский ветеран ищет соседа по квартире и по совету знакомого отправляется в морг к возможному кандидату К счастью живому но с характером являющим собой ядерный коктейль из социопатии презрения к интеллектуальному уровню окружающих и прочих милых личностных качеств включая неоспоримую гениальность Лондон тем временем сотрясает череда необъяснимых убийств разобраться с которыми бравым служакам из Скотленд Ярда оказывается не по зубам В дело под бодрый саундтрек вступают Шерлок и доктор Ватсон вооруженные в придачу к давно знакомой дедукции еще и личным блогом закодированными мобильниками и прочими благами цивилизации'"
|
| 291 |
]
|
| 292 |
},
|
| 293 |
+
"execution_count": 151,
|
| 294 |
"metadata": {},
|
| 295 |
+
"output_type": "execute_result"
|
| 296 |
+
}
|
| 297 |
+
],
|
| 298 |
+
"source": [
|
| 299 |
+
"data.loc[0, 'description']"
|
| 300 |
+
]
|
| 301 |
+
},
|
| 302 |
+
{
|
| 303 |
+
"cell_type": "code",
|
| 304 |
+
"execution_count": 152,
|
| 305 |
+
"metadata": {},
|
| 306 |
+
"outputs": [
|
| 307 |
{
|
| 308 |
"name": "stderr",
|
| 309 |
"output_type": "stream",
|
|
|
|
| 311 |
"/home/vera/miniforge3/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
|
| 312 |
" warnings.warn(\n"
|
| 313 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 314 |
}
|
| 315 |
],
|
| 316 |
"source": [
|
|
|
|
| 324 |
},
|
| 325 |
{
|
| 326 |
"cell_type": "code",
|
| 327 |
+
"execution_count": 153,
|
| 328 |
"metadata": {},
|
| 329 |
"outputs": [],
|
| 330 |
"source": [
|
|
|
|
| 334 |
},
|
| 335 |
{
|
| 336 |
"cell_type": "code",
|
| 337 |
+
"execution_count": 154,
|
| 338 |
"metadata": {},
|
| 339 |
"outputs": [],
|
| 340 |
"source": [
|
|
|
|
| 346 |
},
|
| 347 |
{
|
| 348 |
"cell_type": "code",
|
| 349 |
+
"execution_count": 155,
|
| 350 |
"metadata": {},
|
| 351 |
"outputs": [],
|
| 352 |
"source": [
|
|
|
|
| 355 |
" query_embedding = model.encode([user_query], convert_to_tensor=True).cpu().numpy()\n",
|
| 356 |
" query_embedding = query_embedding / np.linalg.norm(query_embedding, axis=1, keepdims=True) # Нормализация\n",
|
| 357 |
" D, I = index.search(query_embedding, top_k)\n",
|
| 358 |
+
" results = data.iloc[I[0]].copy()\n",
|
| 359 |
+
" cosine_similarities = D[0]\n",
|
| 360 |
+
" results['cosine_similarity'] = cosine_similarities\n",
|
| 361 |
+
" return results"
|
| 362 |
]
|
| 363 |
},
|
| 364 |
{
|
| 365 |
"cell_type": "code",
|
| 366 |
+
"execution_count": 133,
|
| 367 |
"metadata": {},
|
| 368 |
"outputs": [
|
| 369 |
{
|
|
|
|
| 392 |
" <th>tvshow_title</th>\n",
|
| 393 |
" <th>description</th>\n",
|
| 394 |
" <th>annotation_len</th>\n",
|
| 395 |
+
" <th>cosine_similarity</th>\n",
|
| 396 |
" </tr>\n",
|
| 397 |
" </thead>\n",
|
| 398 |
" <tbody>\n",
|
|
|
|
| 401 |
" <td>https://myshows.me/view/187/</td>\n",
|
| 402 |
" <td>https://media.myshows.me/shows/1920/0/c2/0c296...</td>\n",
|
| 403 |
" <td>Во все тяжкие</td>\n",
|
| 404 |
+
" <td>Культовый сериал о том что школьный учитель ок...</td>\n",
|
| 405 |
" <td>142</td>\n",
|
| 406 |
+
" <td>0.514940</td>\n",
|
| 407 |
" </tr>\n",
|
| 408 |
" <tr>\n",
|
| 409 |
" <th>19</th>\n",
|
| 410 |
" <td>https://myshows.me/view/55/</td>\n",
|
| 411 |
" <td>https://media.myshows.me/shows/1920/3/21/32187...</td>\n",
|
| 412 |
" <td>Клиника</td>\n",
|
| 413 |
+
" <td>Отучившись четыре года в медицинской школе Джо...</td>\n",
|
| 414 |
" <td>46</td>\n",
|
| 415 |
+
" <td>0.401157</td>\n",
|
| 416 |
" </tr>\n",
|
| 417 |
" <tr>\n",
|
| 418 |
" <th>43</th>\n",
|
|
|
|
| 421 |
" <td>Блудливая Калифорния</td>\n",
|
| 422 |
" <td>Талантливый и некогда известный писатель Хэнк ...</td>\n",
|
| 423 |
" <td>56</td>\n",
|
| 424 |
+
" <td>0.400791</td>\n",
|
| 425 |
" </tr>\n",
|
| 426 |
" <tr>\n",
|
| 427 |
+
" <th>40</th>\n",
|
| 428 |
+
" <td>https://myshows.me/view/37718/</td>\n",
|
| 429 |
+
" <td>https://media.myshows.me/shows/1920/a/b3/ab397...</td>\n",
|
| 430 |
+
" <td>Как избежать наказания за убийство</td>\n",
|
| 431 |
+
" <td>Практически производственная драма о нелегкой ...</td>\n",
|
| 432 |
+
" <td>132</td>\n",
|
| 433 |
+
" <td>0.355167</td>\n",
|
| 434 |
" </tr>\n",
|
| 435 |
" <tr>\n",
|
| 436 |
" <th>35</th>\n",
|
| 437 |
" <td>https://myshows.me/view/34737/</td>\n",
|
| 438 |
" <td>https://media.myshows.me/shows/1920/c/72/c7251...</td>\n",
|
| 439 |
" <td>Рик и Морти</td>\n",
|
| 440 |
+
" <td>В центре сюжета школьник по имени Морти и его ...</td>\n",
|
| 441 |
" <td>81</td>\n",
|
| 442 |
+
" <td>0.344527</td>\n",
|
| 443 |
" </tr>\n",
|
| 444 |
" <tr>\n",
|
| 445 |
+
" <th>21</th>\n",
|
| 446 |
+
" <td>https://myshows.me/view/59512/</td>\n",
|
| 447 |
+
" <td>https://media.myshows.me/shows/1920/1/64/16430...</td>\n",
|
| 448 |
+
" <td>Половое воспитание</td>\n",
|
| 449 |
+
" <td>Оттис замкнутый интровертдевственник живущий с...</td>\n",
|
| 450 |
+
" <td>117</td>\n",
|
| 451 |
+
" <td>0.340558</td>\n",
|
| 452 |
+
" </tr>\n",
|
| 453 |
+
" <tr>\n",
|
| 454 |
+
" <th>7</th>\n",
|
| 455 |
+
" <td>https://myshows.me/view/1/</td>\n",
|
| 456 |
+
" <td>https://media.myshows.me/shows/1920/0/73/073db...</td>\n",
|
| 457 |
+
" <td>Доктор Хаус</td>\n",
|
| 458 |
+
" <td>Пожалуй самый известный сериал о медицинских р...</td>\n",
|
| 459 |
+
" <td>126</td>\n",
|
| 460 |
+
" <td>0.320888</td>\n",
|
| 461 |
+
" </tr>\n",
|
| 462 |
+
" <tr>\n",
|
| 463 |
+
" <th>12</th>\n",
|
| 464 |
+
" <td>https://myshows.me/view/3/</td>\n",
|
| 465 |
+
" <td>https://media.myshows.me/shows/1920/8/7f/87fc3...</td>\n",
|
| 466 |
+
" <td>Обмани меня</td>\n",
|
| 467 |
+
" <td>Все люди лгут Каждое движение жест мимолетное ...</td>\n",
|
| 468 |
+
" <td>139</td>\n",
|
| 469 |
+
" <td>0.309987</td>\n",
|
| 470 |
" </tr>\n",
|
| 471 |
" <tr>\n",
|
| 472 |
" <th>31</th>\n",
|
| 473 |
" <td>https://myshows.me/view/58610/</td>\n",
|
| 474 |
" <td>https://media.myshows.me/shows/1920/e/76/e763f...</td>\n",
|
| 475 |
" <td>Ведьмак</td>\n",
|
| 476 |
+
" <td>Геральт из Ривии ведьмак представитель цеха ко...</td>\n",
|
| 477 |
" <td>162</td>\n",
|
| 478 |
+
" <td>0.309098</td>\n",
|
| 479 |
" </tr>\n",
|
| 480 |
" <tr>\n",
|
| 481 |
+
" <th>18</th>\n",
|
| 482 |
+
" <td>https://myshows.me/view/26428/</td>\n",
|
| 483 |
+
" <td>https://media.myshows.me/shows/1920/8/b0/8b0da...</td>\n",
|
| 484 |
+
" <td>Настоящий детектив</td>\n",
|
| 485 |
+
" <td>Давно похороненный висяк убийство молодой деву...</td>\n",
|
| 486 |
+
" <td>152</td>\n",
|
| 487 |
+
" <td>0.308820</td>\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 488 |
" </tr>\n",
|
| 489 |
" </tbody>\n",
|
| 490 |
"</table>\n",
|
|
|
|
| 495 |
"5 https://myshows.me/view/187/ \n",
|
| 496 |
"19 https://myshows.me/view/55/ \n",
|
| 497 |
"43 https://myshows.me/view/10/ \n",
|
|
|
|
|
|
|
| 498 |
"40 https://myshows.me/view/37718/ \n",
|
| 499 |
+
"35 https://myshows.me/view/34737/ \n",
|
|
|
|
|
|
|
| 500 |
"21 https://myshows.me/view/59512/ \n",
|
| 501 |
+
"7 https://myshows.me/view/1/ \n",
|
| 502 |
+
"12 https://myshows.me/view/3/ \n",
|
| 503 |
+
"31 https://myshows.me/view/58610/ \n",
|
| 504 |
+
"18 https://myshows.me/view/26428/ \n",
|
| 505 |
"\n",
|
| 506 |
" image_url \\\n",
|
| 507 |
"5 https://media.myshows.me/shows/1920/0/c2/0c296... \n",
|
| 508 |
"19 https://media.myshows.me/shows/1920/3/21/32187... \n",
|
| 509 |
"43 https://media.myshows.me/shows/1920/8/ac/8ac41... \n",
|
|
|
|
|
|
|
| 510 |
"40 https://media.myshows.me/shows/1920/a/b3/ab397... \n",
|
| 511 |
+
"35 https://media.myshows.me/shows/1920/c/72/c7251... \n",
|
|
|
|
|
|
|
| 512 |
"21 https://media.myshows.me/shows/1920/1/64/16430... \n",
|
| 513 |
+
"7 https://media.myshows.me/shows/1920/0/73/073db... \n",
|
| 514 |
+
"12 https://media.myshows.me/shows/1920/8/7f/87fc3... \n",
|
| 515 |
+
"31 https://media.myshows.me/shows/1920/e/76/e763f... \n",
|
| 516 |
+
"18 https://media.myshows.me/shows/1920/8/b0/8b0da... \n",
|
| 517 |
"\n",
|
| 518 |
" tvshow_title \\\n",
|
| 519 |
"5 Во все тяжкие \n",
|
| 520 |
"19 Клиника \n",
|
| 521 |
"43 Блудливая Калифорния \n",
|
|
|
|
|
|
|
| 522 |
"40 Как избежать наказания за убийство \n",
|
| 523 |
+
"35 Рик и Морти \n",
|
|
|
|
|
|
|
| 524 |
"21 Половое воспитание \n",
|
| 525 |
+
"7 Доктор Хаус \n",
|
| 526 |
+
"12 Обмани меня \n",
|
| 527 |
+
"31 Ведьмак \n",
|
| 528 |
+
"18 Настоящий детектив \n",
|
| 529 |
"\n",
|
| 530 |
+
" description annotation_len \\\n",
|
| 531 |
+
"5 Культовый сериал о том что школьный учитель ок... 142 \n",
|
| 532 |
+
"19 Отучившись четыре года в медицинской школе Джо... 46 \n",
|
| 533 |
+
"43 Талантливый и некогда известный писатель Хэнк ... 56 \n",
|
| 534 |
+
"40 Практически производственная драма о нелегкой ... 132 \n",
|
| 535 |
+
"35 В центре сюжета школьник по имени Морти и его ... 81 \n",
|
| 536 |
+
"21 Оттис замкнутый интровертдевственник живущий с... 117 \n",
|
| 537 |
+
"7 Пожалуй самый известный сериал о медицинских р... 126 \n",
|
| 538 |
+
"12 Все люди лгут Каждое движение жест мимолетное ... 139 \n",
|
| 539 |
+
"31 Геральт из Ривии ведьмак представитель цеха ко... 162 \n",
|
| 540 |
+
"18 Давно похороненный висяк убийство молодой деву... 152 \n",
|
| 541 |
+
"\n",
|
| 542 |
+
" cosine_similarity \n",
|
| 543 |
+
"5 0.514940 \n",
|
| 544 |
+
"19 0.401157 \n",
|
| 545 |
+
"43 0.400791 \n",
|
| 546 |
+
"40 0.355167 \n",
|
| 547 |
+
"35 0.344527 \n",
|
| 548 |
+
"21 0.340558 \n",
|
| 549 |
+
"7 0.320888 \n",
|
| 550 |
+
"12 0.309987 \n",
|
| 551 |
+
"31 0.309098 \n",
|
| 552 |
+
"18 0.308820 "
|
| 553 |
]
|
| 554 |
},
|
| 555 |
+
"execution_count": 133,
|
| 556 |
"metadata": {},
|
| 557 |
"output_type": "execute_result"
|
| 558 |
}
|
| 559 |
],
|
| 560 |
"source": [
|
| 561 |
"# Пример пользовательского запроса\n",
|
| 562 |
+
"user_query = \"Учитель, больной раком, варит метамфетамин\"\n",
|
| 563 |
+
"top_k_results = search_series(user_query)\n",
|
| 564 |
+
"top_k_results"
|
| 565 |
+
]
|
| 566 |
+
},
|
| 567 |
+
{
|
| 568 |
+
"cell_type": "markdown",
|
| 569 |
+
"metadata": {},
|
| 570 |
+
"source": [
|
| 571 |
+
"# RuBERT"
|
| 572 |
+
]
|
| 573 |
+
},
|
| 574 |
+
{
|
| 575 |
+
"cell_type": "code",
|
| 576 |
+
"execution_count": 89,
|
| 577 |
+
"metadata": {},
|
| 578 |
+
"outputs": [],
|
| 579 |
+
"source": [
|
| 580 |
+
"import torch\n",
|
| 581 |
+
"from transformers import AutoTokenizer, AutoModel\n",
|
| 582 |
+
"tokenizer = AutoTokenizer.from_pretrained(\"cointegrated/rubert-tiny2\")\n",
|
| 583 |
+
"model = AutoModel.from_pretrained(\"cointegrated/rubert-tiny2\")"
|
| 584 |
+
]
|
| 585 |
+
},
|
| 586 |
+
{
|
| 587 |
+
"cell_type": "code",
|
| 588 |
+
"execution_count": 135,
|
| 589 |
+
"metadata": {},
|
| 590 |
+
"outputs": [],
|
| 591 |
+
"source": [
|
| 592 |
+
"model = SentenceTransformer('cointegrated/rubert-tiny2') # Используем RuBERT для асимметричного семантического поиска\n",
|
| 593 |
+
"descriptions = data['description'].tolist()\n",
|
| 594 |
+
"description_embeddings = model.encode(descriptions, convert_to_tensor=True).cpu().numpy()\n",
|
| 595 |
+
"\n",
|
| 596 |
+
"# Нормализация векторов для косинусного сходства\n",
|
| 597 |
+
"description_embeddings = description_embeddings / np.linalg.norm(description_embeddings, axis=1, keepdims=True)\n",
|
| 598 |
+
"\n",
|
| 599 |
+
"# Создание индекса с использованием Faiss для косинусного сходства\n",
|
| 600 |
+
"dimension = description_embeddings.shape[1]\n",
|
| 601 |
+
"index = faiss.IndexFlatIP(dimension)\n",
|
| 602 |
+
"index.add(description_embeddings)\n"
|
| 603 |
+
]
|
| 604 |
+
},
|
| 605 |
+
{
|
| 606 |
+
"cell_type": "code",
|
| 607 |
+
"execution_count": 138,
|
| 608 |
+
"metadata": {},
|
| 609 |
+
"outputs": [
|
| 610 |
+
{
|
| 611 |
+
"data": {
|
| 612 |
+
"text/html": [
|
| 613 |
+
"<div>\n",
|
| 614 |
+
"<style scoped>\n",
|
| 615 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 616 |
+
" vertical-align: middle;\n",
|
| 617 |
+
" }\n",
|
| 618 |
+
"\n",
|
| 619 |
+
" .dataframe tbody tr th {\n",
|
| 620 |
+
" vertical-align: top;\n",
|
| 621 |
+
" }\n",
|
| 622 |
+
"\n",
|
| 623 |
+
" .dataframe thead th {\n",
|
| 624 |
+
" text-align: right;\n",
|
| 625 |
+
" }\n",
|
| 626 |
+
"</style>\n",
|
| 627 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 628 |
+
" <thead>\n",
|
| 629 |
+
" <tr style=\"text-align: right;\">\n",
|
| 630 |
+
" <th></th>\n",
|
| 631 |
+
" <th>page_url</th>\n",
|
| 632 |
+
" <th>image_url</th>\n",
|
| 633 |
+
" <th>tvshow_title</th>\n",
|
| 634 |
+
" <th>description</th>\n",
|
| 635 |
+
" <th>annotation_len</th>\n",
|
| 636 |
+
" <th>cosine_similarity</th>\n",
|
| 637 |
+
" </tr>\n",
|
| 638 |
+
" </thead>\n",
|
| 639 |
+
" <tbody>\n",
|
| 640 |
+
" <tr>\n",
|
| 641 |
+
" <th>7</th>\n",
|
| 642 |
+
" <td>https://myshows.me/view/1/</td>\n",
|
| 643 |
+
" <td>https://media.myshows.me/shows/1920/0/73/073db...</td>\n",
|
| 644 |
+
" <td>Доктор Хаус</td>\n",
|
| 645 |
+
" <td>Пожалуй самый известный сериал о медицинских р...</td>\n",
|
| 646 |
+
" <td>126</td>\n",
|
| 647 |
+
" <td>0.505964</td>\n",
|
| 648 |
+
" </tr>\n",
|
| 649 |
+
" <tr>\n",
|
| 650 |
+
" <th>43</th>\n",
|
| 651 |
+
" <td>https://myshows.me/view/10/</td>\n",
|
| 652 |
+
" <td>https://media.myshows.me/shows/1920/8/ac/8ac41...</td>\n",
|
| 653 |
+
" <td>Блудливая Калифорния</td>\n",
|
| 654 |
+
" <td>Талантливый и некогда известный писатель Хэнк ...</td>\n",
|
| 655 |
+
" <td>56</td>\n",
|
| 656 |
+
" <td>0.501868</td>\n",
|
| 657 |
+
" </tr>\n",
|
| 658 |
+
" <tr>\n",
|
| 659 |
+
" <th>19</th>\n",
|
| 660 |
+
" <td>https://myshows.me/view/55/</td>\n",
|
| 661 |
+
" <td>https://media.myshows.me/shows/1920/3/21/32187...</td>\n",
|
| 662 |
+
" <td>Клиника</td>\n",
|
| 663 |
+
" <td>Отучившись четыре года в медицинской школе Джо...</td>\n",
|
| 664 |
+
" <td>46</td>\n",
|
| 665 |
+
" <td>0.490041</td>\n",
|
| 666 |
+
" </tr>\n",
|
| 667 |
+
" <tr>\n",
|
| 668 |
+
" <th>20</th>\n",
|
| 669 |
+
" <td>https://myshows.me/view/30443/</td>\n",
|
| 670 |
+
" <td>https://media.myshows.me/shows/1920/e/e4/ee415...</td>\n",
|
| 671 |
+
" <td>Ганнибал</td>\n",
|
| 672 |
+
" <td>Интеллектуальный поединок агента ФБР Уилла Грэ...</td>\n",
|
| 673 |
+
" <td>149</td>\n",
|
| 674 |
+
" <td>0.487131</td>\n",
|
| 675 |
+
" </tr>\n",
|
| 676 |
+
" <tr>\n",
|
| 677 |
+
" <th>35</th>\n",
|
| 678 |
+
" <td>https://myshows.me/view/34737/</td>\n",
|
| 679 |
+
" <td>https://media.myshows.me/shows/1920/c/72/c7251...</td>\n",
|
| 680 |
+
" <td>Рик и Морти</td>\n",
|
| 681 |
+
" <td>В центре сюжета школьник по имени Морти и его ...</td>\n",
|
| 682 |
+
" <td>81</td>\n",
|
| 683 |
+
" <td>0.485575</td>\n",
|
| 684 |
+
" </tr>\n",
|
| 685 |
+
" <tr>\n",
|
| 686 |
+
" <th>31</th>\n",
|
| 687 |
+
" <td>https://myshows.me/view/58610/</td>\n",
|
| 688 |
+
" <td>https://media.myshows.me/shows/1920/e/76/e763f...</td>\n",
|
| 689 |
+
" <td>Ведьмак</td>\n",
|
| 690 |
+
" <td>Геральт из Ривии ведьмак представитель цеха ко...</td>\n",
|
| 691 |
+
" <td>162</td>\n",
|
| 692 |
+
" <td>0.484689</td>\n",
|
| 693 |
+
" </tr>\n",
|
| 694 |
+
" <tr>\n",
|
| 695 |
+
" <th>44</th>\n",
|
| 696 |
+
" <td>https://myshows.me/view/42735/</td>\n",
|
| 697 |
+
" <td>https://media.myshows.me/shows/1920/4/d6/4d6fa...</td>\n",
|
| 698 |
+
" <td>Мистер Робот</td>\n",
|
| 699 |
+
" <td>Эллиот Алдерсон молодой человек который живет ...</td>\n",
|
| 700 |
+
" <td>127</td>\n",
|
| 701 |
+
" <td>0.483426</td>\n",
|
| 702 |
+
" </tr>\n",
|
| 703 |
+
" <tr>\n",
|
| 704 |
+
" <th>23</th>\n",
|
| 705 |
+
" <td>https://myshows.me/view/17483/</td>\n",
|
| 706 |
+
" <td>https://media.myshows.me/shows/1920/2/c6/2c684...</td>\n",
|
| 707 |
+
" <td>Однажды в сказке</td>\n",
|
| 708 |
+
" <td>Что если все жители городка под названием Стор...</td>\n",
|
| 709 |
+
" <td>147</td>\n",
|
| 710 |
+
" <td>0.472772</td>\n",
|
| 711 |
+
" </tr>\n",
|
| 712 |
+
" <tr>\n",
|
| 713 |
+
" <th>32</th>\n",
|
| 714 |
+
" <td>https://myshows.me/view/37432/</td>\n",
|
| 715 |
+
" <td>https://media.myshows.me/shows/1920/4/1f/41fd6...</td>\n",
|
| 716 |
+
" <td>Флэш</td>\n",
|
| 717 |
+
" <td>Парень по имени Барри Аллен с детства мечтал с...</td>\n",
|
| 718 |
+
" <td>112</td>\n",
|
| 719 |
+
" <td>0.469459</td>\n",
|
| 720 |
+
" </tr>\n",
|
| 721 |
+
" <tr>\n",
|
| 722 |
+
" <th>37</th>\n",
|
| 723 |
+
" <td>https://myshows.me/view/467/</td>\n",
|
| 724 |
+
" <td>https://media.myshows.me/shows/1920/a/96/a9616...</td>\n",
|
| 725 |
+
" <td>Тетрадь смерти</td>\n",
|
| 726 |
+
" <td>Убийство дело непростое даже для хорошо подгот...</td>\n",
|
| 727 |
+
" <td>206</td>\n",
|
| 728 |
+
" <td>0.468546</td>\n",
|
| 729 |
+
" </tr>\n",
|
| 730 |
+
" </tbody>\n",
|
| 731 |
+
"</table>\n",
|
| 732 |
+
"</div>"
|
| 733 |
+
],
|
| 734 |
+
"text/plain": [
|
| 735 |
+
" page_url \\\n",
|
| 736 |
+
"7 https://myshows.me/view/1/ \n",
|
| 737 |
+
"43 https://myshows.me/view/10/ \n",
|
| 738 |
+
"19 https://myshows.me/view/55/ \n",
|
| 739 |
+
"20 https://myshows.me/view/30443/ \n",
|
| 740 |
+
"35 https://myshows.me/view/34737/ \n",
|
| 741 |
+
"31 https://myshows.me/view/58610/ \n",
|
| 742 |
+
"44 https://myshows.me/view/42735/ \n",
|
| 743 |
+
"23 https://myshows.me/view/17483/ \n",
|
| 744 |
+
"32 https://myshows.me/view/37432/ \n",
|
| 745 |
+
"37 https://myshows.me/view/467/ \n",
|
| 746 |
+
"\n",
|
| 747 |
+
" image_url tvshow_title \\\n",
|
| 748 |
+
"7 https://media.myshows.me/shows/1920/0/73/073db... Доктор Хаус \n",
|
| 749 |
+
"43 https://media.myshows.me/shows/1920/8/ac/8ac41... Блудливая Калифорния \n",
|
| 750 |
+
"19 https://media.myshows.me/shows/1920/3/21/32187... Клиника \n",
|
| 751 |
+
"20 https://media.myshows.me/shows/1920/e/e4/ee415... Ганнибал \n",
|
| 752 |
+
"35 https://media.myshows.me/shows/1920/c/72/c7251... Рик и Морти \n",
|
| 753 |
+
"31 https://media.myshows.me/shows/1920/e/76/e763f... Ведьмак \n",
|
| 754 |
+
"44 https://media.myshows.me/shows/1920/4/d6/4d6fa... Мистер Робот \n",
|
| 755 |
+
"23 https://media.myshows.me/shows/1920/2/c6/2c684... Однажды в сказке \n",
|
| 756 |
+
"32 https://media.myshows.me/shows/1920/4/1f/41fd6... Флэш \n",
|
| 757 |
+
"37 https://media.myshows.me/shows/1920/a/96/a9616... Тетрадь смерти \n",
|
| 758 |
+
"\n",
|
| 759 |
+
" description annotation_len \\\n",
|
| 760 |
+
"7 Пожалуй самый известный сериал о медицинских р... 126 \n",
|
| 761 |
+
"43 Талантливый и некогда известный писатель Хэнк ... 56 \n",
|
| 762 |
+
"19 Отучившись четыре года в медицинской школе Джо... 46 \n",
|
| 763 |
+
"20 Интеллектуальный поединок агента ФБР Уилла Грэ... 149 \n",
|
| 764 |
+
"35 В центре сюжета школьник по имени Морти и его ... 81 \n",
|
| 765 |
+
"31 Геральт из Ривии ведьмак представитель цеха ко... 162 \n",
|
| 766 |
+
"44 Эллиот Алдерсон молодой человек который живет ... 127 \n",
|
| 767 |
+
"23 Что если все жители городка под названием Стор... 147 \n",
|
| 768 |
+
"32 Парень по имени Барри Аллен с детства мечтал с... 112 \n",
|
| 769 |
+
"37 Убийство дело непростое даже для хорошо подгот... 206 \n",
|
| 770 |
+
"\n",
|
| 771 |
+
" cosine_similarity \n",
|
| 772 |
+
"7 0.505964 \n",
|
| 773 |
+
"43 0.501868 \n",
|
| 774 |
+
"19 0.490041 \n",
|
| 775 |
+
"20 0.487131 \n",
|
| 776 |
+
"35 0.485575 \n",
|
| 777 |
+
"31 0.484689 \n",
|
| 778 |
+
"44 0.483426 \n",
|
| 779 |
+
"23 0.472772 \n",
|
| 780 |
+
"32 0.469459 \n",
|
| 781 |
+
"37 0.468546 "
|
| 782 |
+
]
|
| 783 |
+
},
|
| 784 |
+
"execution_count": 138,
|
| 785 |
+
"metadata": {},
|
| 786 |
+
"output_type": "execute_result"
|
| 787 |
+
}
|
| 788 |
+
],
|
| 789 |
+
"source": [
|
| 790 |
+
"# Шаг 4: Обработка пользовательского запроса и асимметричный поиск\n",
|
| 791 |
+
"def search_series(user_query, top_k=10):\n",
|
| 792 |
+
" user_query = (user_query)\n",
|
| 793 |
+
" query_embedding = model.encode([user_query], convert_to_tensor=True).cpu().numpy()\n",
|
| 794 |
+
" query_embedding = query_embedding / np.linalg.norm(query_embedding, axis=1, keepdims=True) # Нормализация\n",
|
| 795 |
+
" D, I = index.search(query_embedding, top_k)\n",
|
| 796 |
+
" results = data.iloc[I[0]].copy()\n",
|
| 797 |
+
" cosine_similarities = D[0]\n",
|
| 798 |
+
" results['cosine_similarity'] = cosine_similarities\n",
|
| 799 |
+
" return results\n",
|
| 800 |
+
"\n",
|
| 801 |
+
"# Пример пользовательского запроса\n",
|
| 802 |
+
"user_query = \"Учитель, больной раком, варит метфмфетамин\"\n",
|
| 803 |
"top_k_results = search_series(user_query)\n",
|
| 804 |
"top_k_results"
|
| 805 |
]
|