import os

# Point Hugging Face downloads at the hf-mirror.com endpoint.
# (An earlier, disabled variant of this cell sourced /etc/network_turbo in a
# bash subprocess and copied the resulting proxy variables into os.environ.)
os.environ.update({'HF_ENDPOINT': 'https://hf-mirror.com'})

# Echo the value back to confirm the variable is set.
print(os.getenv('HF_ENDPOINT'))
import evaluate
import numpy as np
import torch
from datasets import load_dataset
from sklearn.metrics.pairwise import cosine_similarity
from transformers import (
    AutoModel,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    GPT2Model,
    GPT2Tokenizer,
    Trainer,
    TrainingArguments,
)

# Checkpoint used for both the tokenizer and the model, so the two cannot drift.
model_name = "gpt2_gene_multi_v2_ft"
# Prefer the GPU, but fall back to CPU so the notebook still runs without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model.to(device)
model.eval()  # inference only: make sure dropout is disabled


def get_text_embedding(text):
    """Return a mean-pooled embedding of ``text`` from the module-level model.

    The text is tokenized with the module-level ``tokenizer`` (truncated to at
    most 256 tokens), run through the module-level ``model`` on ``device``
    without gradient tracking, and the last hidden states are averaged over
    the sequence axis.

    Args:
        text (str): Input sequence to embed.

    Returns:
        torch.Tensor: For a single input string, a 1-D tensor of shape
        ``[hidden_size]`` located on ``device``.

    Raises:
        ValueError: If tokenizing ``text`` yields no tokens, since there is
            nothing to average over.
    """
    # Tokenize; return_tensors="pt" adds the leading batch dimension.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=256)
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    if input_ids.numel() == 0:
        raise ValueError("text produced no tokens; cannot compute an embedding")

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    hidden_states = outputs.last_hidden_state  # [batch_size, seq_length, hidden_size]

    # Mean pooling over the sequence axis. Only the batch axis is squeezed so
    # that no other size-1 axis can be collapsed by accident.
    return hidden_states.mean(dim=1).squeeze(0)


def get_sim_score(s1, s2):
    """Return the Euclidean (L2) distance between the embeddings of two texts.

    Despite the name, the value is a distance, not a similarity: 0 means the
    two embeddings are identical and larger values mean they are farther
    apart. Cosine similarity was tried as an alternative metric and is
    currently not used.

    Args:
        s1 (str): First sequence.
        s2 (str): Second sequence.

    Returns:
        numpy floating scalar: ``||embedding(s1) - embedding(s2)||_2``.
    """
    A = get_text_embedding(s1).cpu().numpy()
    B = get_text_embedding(s2).cpu().numpy()
    return np.linalg.norm(A - B)
# Sanity check on one pair: a nucleotide-letter string and an amino-acid-letter
# string (presumably a coding sequence and its translation - not verified here).
s1 = "ATGTCTCTACAGATGATAACGGTCAGTAATAACGTAACTTTAATTCAACCAGGCTTCTCACTGATGAATTTTGATGGACAAGTTTTCTTCTTTGGTCAAAAAGGCTGGCCCAAGAGATCCTGCCCCACAGGAGTTTTCCATTTTGATGTAAAGCATAACCATCTCAAACTGAAGCCTACAGTTTTCTCAAAGGATTCCTGCTACCTTCCTCCTCTTCGCTATCCAGCCACTTGCATATTTAAAGGCAACTTCGAGTCTGAAAAGCATCAGTATATCATCCATGGAGGGAAAACACCAAACAATGAACTTTCAGATAAGATGTATGTCATGTCTATTGTTTGCAAAAACAACAAAAAATTTACTTTTCGCTGCACGGAGAAAGACTTGGTAGGTGATGTTCCTGAAGGCAGATATGGCCATTCCATTGATGTAGTGTATAGTCGAGGGAAAAGTATGGGCGTTCTCTTTGGAGGACGATCTTACATGCCTTCTGCCCAAAGAACCACAGAAAAATGGAACAGTGTAGTTGACTGCTTGCCCCATCTCTTCTTGGTGGATTTTGAATTTGGGTGTTCTACATCCTACATTCTTCCCGAACTTCAGGATGGGATATCTTTTCATGTCTCCATTGCCAGAAATGATACCATTTATATTTTAGGAGGTCATTCACTCACCAATAACATCCGCCCTGCCAATCTGTTCAGAGTAAGGGTTGATCTCCCCCTGGGTAGCCCAGCTGTGAGTTGCACGGTCTTATCAGGAGGAATCTCTGTCTCCAGTGCAATCTTGACTCAAACTAATAATGATGAATTTGTCATTGTTGGTGGCTATCAGCTTGAAAATCAAAAAAGAATGGTCTGCAACATTGTCACTTTAGATGACAACAAGATAGATATTCGTGAGATGGAGGCACCAGATTGGACCCCAGATATTAAGCACAGCAAGGTATGGTTTGGAAACAACATGGGAAATGGGAGTGTTTTCCTTGGAATACCAGGAGACAATAAGCAGGCTGTTTCAGAAGCATTCTATTTCTATATGTTGAAATGTGCTGAAGATGATATAAATGAAGATGAGAAAACATTGATGAACAGTCAGACATCAACAGAAGATCCAGGAGACTCCACACCCTTTGAAGACTCGGAAGAATTTTGCTTCAGTGCAGAAGCAAATAGTTTTGGTGGGGATGATGAATTTGACACCTATAATGAAGATGATGAGGAAGATGAGTCTGAGACAGGCTACTGGATTACGTGTTGCCTTACTTGTAATGTGGATATCAACACTTGGGTACCATTCTACTCAACTGAGCTCAACAAACCTGCTATGATCTACTGCTCTCATGAGGACGGGCACTGGGTCCATGCTCAGTGCATGGATCTGGCAGAGCGCACGCTCATCCATCTGTCAGAAGGAAGCAACAAGTATTATTGCAATGAGCATGTGGAGATAGCAAGAGCACTACAAACCCCCAAAAGAGCCATGCCCTTGAAAAAGCCCCCACTGAAATCCCTCCGCAAAAAAGGCCCTGCAAAAATCTTGACTCCTGCCAAGAAATCCTTCCTTAGAAGATTGTTTGAT"
s2 = "MSLQMITVSNNVTLIQPGFSLMNFDGQVFFFGQKGWPKRSCPTGVFHFDVKHNHLKLKPTVFSKDSCYLPPLRYPATCIFKGNFESEKHQYIIHGGKTPNNELSDKMYVMSIVCKNNKKFTFRCTEKDLVGDVPEGRYGHSIDVVYSRGKSMGVLFGGRSYMPSAQRTTEKWNSVVDCLPHLFLVDFEFGCSTSYILPELQDGISFHVSIARNDTIYILGGHSLTNNIRPANLFRVRVDLPLGSPAVSCTVLSGGISVSSAILTQTNNDEFVIVGGYQLENQKRMVCNIVTLDDNKIDIREMEAPDWTPDIKHSKVWFGNNMGNGSVFLGIPGDNKQAVSEAFYFYMLKCAEDDINEDEKTLMNSQTSTEDPGDSTPFEDSEEFCFSAEANSFGGDDEFDTYNEDDEEDESETGYWITCCLTCNVDINTWVPFYSTELNKPAMIYCSHEDGHWVHAQCMDLAERTLIHLSEGSNKYYCNEHVEIARALQTPKRAMPLKKPPLKSLRKKGPAKILTPAKKSFLRRLFD"
get_sim_score(s1, s2)

# Model test: protein dataset, length 150 / 450 bp, complex version, dissimilar.
# train_test_split shuffles by default; the fixed seed makes the 50/50 split,
# and therefore every score computed below, reproducible across kernel restarts.
SPLIT_SEED = 42
raw_datasets_dna_protein = load_dataset(
    'dnagpt/gene_lan_transfer', 'dna_protein_pair_rand'
)['train'].train_test_split(test_size=0.5, seed=SPLIT_SEED)
raw_datasets_dna_protein

# Score every pair in the "train" half and bucket the scores by label:
# label == 1 goes to sim_score, everything else to dif_score.
sim_score = []
dif_score = []

for item in raw_datasets_dna_protein["train"]:
    score = get_sim_score(item["sentence1"], item["sentence2"])
    bucket = sim_score if item["label"] == 1 else dif_score
    bucket.append(score)
import numpy as np

# Mean score per group: label == 1 pairs first, then the remaining pairs.
print(*(np.mean(group) for group in (sim_score, dif_score)))

# Number of scored pairs in each group, in the same order.
print(*map(len, (sim_score, dif_score)))
Sans.\n", " fig.canvas.print_figure(bytes_io, **kw)\n", "/root/miniconda3/lib/python3.12/site-packages/IPython/core/pylabtools.py:170: UserWarning: Glyph 22270 (\\N{CJK UNIFIED IDEOGRAPH-56FE}) missing from font(s) DejaVu Sans.\n", " fig.canvas.print_figure(bytes_io, **kw)\n", "/root/miniconda3/lib/python3.12/site-packages/IPython/core/pylabtools.py:170: UserWarning: Glyph 21306 (\\N{CJK UNIFIED IDEOGRAPH-533A}) missing from font(s) DejaVu Sans.\n", " fig.canvas.print_figure(bytes_io, **kw)\n", "/root/miniconda3/lib/python3.12/site-packages/IPython/core/pylabtools.py:170: UserWarning: Glyph 38388 (\\N{CJK UNIFIED IDEOGRAPH-95F4}) missing from font(s) DejaVu Sans.\n", " fig.canvas.print_figure(bytes_io, **kw)\n" ] }, { "data": { "image/png": "", "text/plain": [ "
import matplotlib.pyplot as plt
import numpy as np

# Scores of the pairs whose label is 1 (collected in sim_score).
data = sim_score

# Histogram with an automatically chosen number of bins.
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(data, bins='auto', edgecolor='black', alpha=0.7)

# Labels are in English because the default DejaVu Sans font has no CJK
# glyphs: the previous labels triggered "Glyph ... missing from font(s)"
# warnings and were not rendered in the figure.
ax.set_xlabel('Value range', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)
ax.set_title('Value distribution histogram', fontsize=14)

# Light horizontal grid to make bar heights easier to read.
ax.grid(axis='y', linestyle='--', alpha=0.7)

plt.show()
import matplotlib.pyplot as plt
import numpy as np

# Scores of the pairs whose label is not 1 (collected in dif_score).
data = dif_score

# Histogram with an automatically chosen number of bins.
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(data, bins='auto', edgecolor='black', alpha=0.7)

# Labels are in English because the default DejaVu Sans font has no CJK
# glyphs, so CJK label text is reported as missing glyphs and not rendered.
ax.set_xlabel('Value range', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)
ax.set_title('Value distribution histogram', fontsize=14)

# Light horizontal grid to make bar heights easier to read.
ax.grid(axis='y', linestyle='--', alpha=0.7)

plt.show()
import matplotlib.pyplot as plt
import numpy as np

# The two groups to compare.
data1 = sim_score  # scores of pairs with label == 1
data2 = dif_score  # scores of all other pairs

# Use one common set of 30 bin edges spanning both groups. Letting each hist()
# call pick its own 30 bins would give the two series different bin widths and
# positions, so overlapping bars would not be comparable.
bin_edges = np.histogram_bin_edges(np.concatenate([data1, data2]), bins=30)

# Semi-transparent overlay of the two distributions.
plt.hist(data1, bins=bin_edges, alpha=0.5, label='Data 1', color='blue', edgecolor='black')
plt.hist(data2, bins=bin_edges, alpha=0.5, label='Data 2', color='red', edgecolor='black')

plt.legend()

plt.title('Histogram of Two Data Sets')
plt.xlabel('Value')
plt.ylabel('Frequency')

plt.show()