Upload folder using huggingface_hub
Browse files
- best_model/get_dna_protein_dict.ipynb +797 -0
- best_model/gpt2_gene_en_ft_dna_protein_pair_test_others.ipynb +1111 -0
- best_model/gpt2_gene_multiv1_ft_en_test_others_best.ipynb +771 -0
- best_model/gpt2_gene_multiv2_ft_en_test_others_best.ipynb +771 -0
- best_model/vect_sim_gpt2_gene_en_ft_dna_protein_pair_test_others.ipynb +468 -0
- best_model/vect_sim_protein_rand_test.ipynb +423 -0
- best_model/vect_sim_protein_test.ipynb +423 -0
- finetune/2-gpt2-gene-multi-v2-instruction-ft.ipynb +0 -0
- finetune/get_acc_stat_multiv1.ipynb +232 -0
- finetune/get_acc_stat_multiv1_2.ipynb +319 -0
- finetune/get_acc_stat_multiv1_3.ipynb +128 -0
- finetune/get_acc_stat_multiv2.ipynb +291 -0
- finetune/get_acc_stat_multiv2_2.ipynb +212 -0
- finetune/get_acc_stat_multiv2_3.ipynb +108 -0
- finetune/gpt2_gene_multiv1_ft_en.jsonl +0 -0
- finetune/gpt2_gene_multiv1_ft_en2.jsonl +108 -0
- finetune/gpt2_gene_multiv1_ft_en3.jsonl +72 -0
- finetune/gpt2_gene_multiv1_ft_en_test_others.py +342 -0
- finetune/gpt2_gene_multiv1_ft_en_test_others2.py +233 -0
- finetune/gpt2_gene_multiv1_ft_en_test_others3.py +344 -0
- finetune/gpt2_gene_multiv2_ft_en.jsonl +78 -0
- finetune/gpt2_gene_multiv2_ft_en2.jsonl +124 -0
- finetune/gpt2_gene_multiv2_ft_en3.jsonl +0 -0
- finetune/gpt2_gene_multiv2_ft_en_test_others.py +344 -0
- finetune/gpt2_gene_multiv2_ft_en_test_others2.py +233 -0
- finetune/run_ft_all_2.sh +10 -0
- pretrain/gpt2_gene_multi_v1/ds_zero2_no_offload.json +27 -0
- pretrain/gpt2_gene_multi_v1/gpt2_gene_multi_tokenizer/.ipynb_checkpoints/special_tokens_map-checkpoint.json +5 -0
- pretrain/gpt2_gene_multi_v1/gpt2_gene_multi_tokenizer/.ipynb_checkpoints/tokenizer_config-checkpoint.json +34 -0
- pretrain/gpt2_gene_multi_v1/gpt2_gene_multi_tokenizer/special_tokens_map.json +5 -0
- pretrain/gpt2_gene_multi_v1/gpt2_gene_multi_tokenizer/tokenizer.json +0 -0
- pretrain/gpt2_gene_multi_v1/gpt2_gene_multi_tokenizer/tokenizer_config.json +34 -0
- pretrain/gpt2_gene_multi_v1/run_clm_pt.py +646 -0
- pretrain/gpt2_gene_multi_v1/run_pt_gpt2.sh +49 -0
- pretrain/gpt2_gene_multi_v2/ds_zero2_no_offload.json +27 -0
- pretrain/gpt2_gene_multi_v2/gpt2_gene_multi_tokenizer/.ipynb_checkpoints/special_tokens_map-checkpoint.json +5 -0
- pretrain/gpt2_gene_multi_v2/gpt2_gene_multi_tokenizer/.ipynb_checkpoints/tokenizer_config-checkpoint.json +34 -0
- pretrain/gpt2_gene_multi_v2/gpt2_gene_multi_tokenizer/special_tokens_map.json +5 -0
- pretrain/gpt2_gene_multi_v2/gpt2_gene_multi_tokenizer/tokenizer.json +0 -0
- pretrain/gpt2_gene_multi_v2/gpt2_gene_multi_tokenizer/tokenizer_config.json +34 -0
- pretrain/gpt2_gene_multi_v2/run_clm_formal.py +657 -0
- pretrain/gpt2_gene_multi_v2/run_pt_gpt2_formal.sh +43 -0
best_model/get_dna_protein_dict.ipynb
ADDED
|
@@ -0,0 +1,797 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 1,
|
| 6 |
+
"id": "e5dcfe41-d48d-4e5a-929d-76446da83b14",
|
| 7 |
+
"metadata": {},
|
| 8 |
+
"outputs": [],
|
| 9 |
+
"source": [
|
| 10 |
+
"import subprocess\n",
|
| 11 |
+
"import os\n",
|
| 12 |
+
"import json\n",
|
| 13 |
+
"\n",
|
| 14 |
+
"result = subprocess.run('bash -c \"source /etc/network_turbo && env | grep proxy\"', shell=True, capture_output=True, text=True)\n",
|
| 15 |
+
"output = result.stdout\n",
|
| 16 |
+
"for line in output.splitlines():\n",
|
| 17 |
+
" if '=' in line:\n",
|
| 18 |
+
" var, value = line.split('=', 1)\n",
|
| 19 |
+
" os.environ[var] = value"
|
| 20 |
+
]
|
| 21 |
+
},
|
| 22 |
+
{
|
| 23 |
+
"cell_type": "code",
|
| 24 |
+
"execution_count": 2,
|
| 25 |
+
"id": "7bbac3ce-9b63-4129-a8f4-d2a445868d6b",
|
| 26 |
+
"metadata": {},
|
| 27 |
+
"outputs": [
|
| 28 |
+
{
|
| 29 |
+
"name": "stderr",
|
| 30 |
+
"output_type": "stream",
|
| 31 |
+
"text": [
|
| 32 |
+
"2025-02-01 09:48:43.582404: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
|
| 33 |
+
"2025-02-01 09:48:43.595793: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
|
| 34 |
+
"2025-02-01 09:48:43.611116: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
|
| 35 |
+
"2025-02-01 09:48:43.615723: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
|
| 36 |
+
"2025-02-01 09:48:43.627763: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
|
| 37 |
+
"To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
|
| 38 |
+
"2025-02-01 09:48:44.582401: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n"
|
| 39 |
+
]
|
| 40 |
+
}
|
| 41 |
+
],
|
| 42 |
+
"source": [
|
| 43 |
+
"from transformers import AutoTokenizer, DataCollatorWithPadding\n",
|
| 44 |
+
"\n",
|
| 45 |
+
"#分词器\n",
|
| 46 |
+
"tokenizer = AutoTokenizer.from_pretrained(\"dnagpt/gene_eng_gpt2_v1_ft\")"
|
| 47 |
+
]
|
| 48 |
+
},
|
| 49 |
+
{
|
| 50 |
+
"cell_type": "code",
|
| 51 |
+
"execution_count": 3,
|
| 52 |
+
"id": "74b6bfd2-be51-4fb1-a81e-dccd1f354194",
|
| 53 |
+
"metadata": {},
|
| 54 |
+
"outputs": [
|
| 55 |
+
{
|
| 56 |
+
"data": {
|
| 57 |
+
"text/plain": [
|
| 58 |
+
"100000"
|
| 59 |
+
]
|
| 60 |
+
},
|
| 61 |
+
"execution_count": 3,
|
| 62 |
+
"metadata": {},
|
| 63 |
+
"output_type": "execute_result"
|
| 64 |
+
}
|
| 65 |
+
],
|
| 66 |
+
"source": [
|
| 67 |
+
"word_dict = tokenizer.get_vocab()\n",
|
| 68 |
+
"len(word_dict)"
|
| 69 |
+
]
|
| 70 |
+
},
|
| 71 |
+
{
|
| 72 |
+
"cell_type": "code",
|
| 73 |
+
"execution_count": 4,
|
| 74 |
+
"id": "c4bea266-644a-478b-bf83-6f34079c8366",
|
| 75 |
+
"metadata": {},
|
| 76 |
+
"outputs": [],
|
| 77 |
+
"source": [
|
| 78 |
+
"from transformers import GPT2Tokenizer, GPT2Model,AutoModel\n",
|
| 79 |
+
"import torch\n",
|
| 80 |
+
"model_name=\"dnagpt/gene_eng_gpt2_v1_ft\"\n",
|
| 81 |
+
"device=\"cuda\"\n",
|
| 82 |
+
"tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
|
| 83 |
+
"model = AutoModel.from_pretrained(model_name)\n",
|
| 84 |
+
"model.to(device)\n",
|
| 85 |
+
"\n",
|
| 86 |
+
"def get_text_embedding(text):\n",
|
| 87 |
+
" \"\"\"\n",
|
| 88 |
+
" 使用 GPT-2 模型获取文本的向量表示。\n",
|
| 89 |
+
" \n",
|
| 90 |
+
" 参数:\n",
|
| 91 |
+
" text (str): 输入文本。\n",
|
| 92 |
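+
" Note: model_name is not a parameter of this function; the module-level model loaded from \"dnagpt/gene_eng_gpt2_v1_ft\" is used.\n",
|
| 93 |
+
" Note: device is not a parameter either; the module-level device (\"cuda\") is used.\n",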
|
| 94 |
+
" \n",
|
| 95 |
+
" 返回:\n",
|
| 96 |
+
" torch.Tensor: 文本的向量表示,维度为 [hidden_size]。\n",
|
| 97 |
+
" \"\"\"\n",
|
| 98 |
+
"\n",
|
| 99 |
+
" # 将文本编码为输入 ID 并添加批量维度\n",
|
| 100 |
+
" inputs = tokenizer(text, return_tensors=\"pt\", truncation=True, max_length=512)\n",
|
| 101 |
+
" input_ids = inputs[\"input_ids\"].to(device)\n",
|
| 102 |
+
" attention_mask = inputs[\"attention_mask\"].to(device)\n",
|
| 103 |
+
" \n",
|
| 104 |
+
" # 获取模型的隐藏层输出\n",
|
| 105 |
+
" with torch.no_grad():\n",
|
| 106 |
+
" outputs = model(input_ids, attention_mask=attention_mask)\n",
|
| 107 |
+
" hidden_states = outputs.last_hidden_state # [batch_size, seq_length, hidden_size]\n",
|
| 108 |
+
" \n",
|
| 109 |
+
" # 平均池化:获取序列中所有词向量的平均值\n",
|
| 110 |
+
" embeddings = hidden_states.mean(dim=1).squeeze() # [hidden_size]\n",
|
| 111 |
+
" \n",
|
| 112 |
+
" return embeddings"
|
| 113 |
+
]
|
| 114 |
+
},
|
| 115 |
+
{
|
| 116 |
+
"cell_type": "code",
|
| 117 |
+
"execution_count": 16,
|
| 118 |
+
"id": "3c38a694-078c-44f3-99cd-07af734a951a",
|
| 119 |
+
"metadata": {},
|
| 120 |
+
"outputs": [
|
| 121 |
+
{
|
| 122 |
+
"name": "stdout",
|
| 123 |
+
"output_type": "stream",
|
| 124 |
+
"text": [
|
| 125 |
+
"'AGCT' is classified as: DNA\n",
|
| 126 |
+
"'MVLFRSSGYV' is classified as: Protein\n",
|
| 127 |
+
"'HELLO WORLD' is classified as: English\n",
|
| 128 |
+
"'AGCZ' is classified as: English\n",
|
| 129 |
+
"'XYZ' is classified as: English\n",
|
| 130 |
+
"'A T G C' is classified as: English\n",
|
| 131 |
+
"'HELLO, WORLD!' is classified as: English\n",
|
| 132 |
+
"'M' is classified as: Protein\n"
|
| 133 |
+
]
|
| 134 |
+
}
|
| 135 |
+
],
|
| 136 |
+
"source": [
|
| 137 |
+
"def classify_sequence(sequence):\n",
|
| 138 |
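+
" # Character sets: the DNA and protein sets contain uppercase letters only; the English set also allows lowercase letters, digits and common punctuation\n",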
|
| 139 |
+
" dna_chars = set('ACGT')\n",
|
| 140 |
+
" protein_chars = set('ACDEFGHIKLMNPQRSTVWY')\n",
|
| 141 |
+
" english_chars = set('ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789 ,.!?:;-\"\\'()')\n",
|
| 142 |
+
"\n",
|
| 143 |
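+
" # Strip leading/trailing whitespace only (no length check is performed; an empty string passes the all() test below and is labelled \"DNA\")\n",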
|
| 144 |
+
" sequence = sequence.strip() # \n",
|
| 145 |
+
" \n",
|
| 146 |
+
" # 检查是否为DNA序列\n",
|
| 147 |
+
" if all(c in dna_chars for c in sequence):\n",
|
| 148 |
+
" return \"DNA\"\n",
|
| 149 |
+
" \n",
|
| 150 |
+
" # 检查是否为蛋白质序列\n",
|
| 151 |
+
" if all(c in protein_chars for c in sequence):\n",
|
| 152 |
+
" return \"Protein\"\n",
|
| 153 |
+
" \n",
|
| 154 |
+
" # 检查是否为英文文本(允许大小写字母、数字及常见标点符号)\n",
|
| 155 |
+
" if all(c in english_chars for c in sequence):\n",
|
| 156 |
+
" return \"English\"\n",
|
| 157 |
+
" \n",
|
| 158 |
+
" # 如果不符合上述任何条件,则无法明确分类\n",
|
| 159 |
+
" return \"Unknown\"\n",
|
| 160 |
+
"\n",
|
| 161 |
+
"# 示例用法\n",
|
| 162 |
+
"sequences = [\"AGCT\", \"MVLFRSSGYV\", \"HELLO WORLD\", \"AGCZ\", \"XYZ\", \"A T G C\", \"HELLO, WORLD!\", \"M\"]\n",
|
| 163 |
+
"for seq in sequences:\n",
|
| 164 |
+
" print(f\"'{seq}' is classified as: {classify_sequence(seq)}\")"
|
| 165 |
+
]
|
| 166 |
+
},
|
| 167 |
+
{
|
| 168 |
+
"cell_type": "code",
|
| 169 |
+
"execution_count": 17,
|
| 170 |
+
"id": "18f451d4-d79a-45f1-a76e-90da7b160d2d",
|
| 171 |
+
"metadata": {},
|
| 172 |
+
"outputs": [
|
| 173 |
+
{
|
| 174 |
+
"name": "stdout",
|
| 175 |
+
"output_type": "stream",
|
| 176 |
+
"text": [
|
| 177 |
+
"19413 24246 40140\n"
|
| 178 |
+
]
|
| 179 |
+
}
|
| 180 |
+
],
|
| 181 |
+
"source": [
|
| 182 |
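+
"# Build the DNA, English and protein word lists; the len(word)>=0 check below accepts every token, so single-letter tokens are kept\n",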
|
| 183 |
+
"dna_word_list = []\n",
|
| 184 |
+
"eng_word_list = []\n",
|
| 185 |
+
"protein_word_list = []\n",
|
| 186 |
+
"\n",
|
| 187 |
+
"for word in word_dict:\n",
|
| 188 |
+
" if len(word)>=0:\n",
|
| 189 |
+
" word_type = classify_sequence(word)\n",
|
| 190 |
+
" if \"DNA\"==word_type:\n",
|
| 191 |
+
" dna_word_list.append(word)\n",
|
| 192 |
+
"\n",
|
| 193 |
+
" if \"Protein\"==word_type:\n",
|
| 194 |
+
" protein_word_list.append(word)\n",
|
| 195 |
+
"\n",
|
| 196 |
+
" if \"English\"==word_type:\n",
|
| 197 |
+
" eng_word_list.append(word)\n",
|
| 198 |
+
"\n",
|
| 199 |
+
" \n",
|
| 200 |
+
"\n",
|
| 201 |
+
"print(len(dna_word_list), len(eng_word_list), len(protein_word_list))"
|
| 202 |
+
]
|
| 203 |
+
},
|
| 204 |
+
{
|
| 205 |
+
"cell_type": "code",
|
| 206 |
+
"execution_count": 18,
|
| 207 |
+
"id": "33d38d0b-b0b5-45fb-b923-09d848d3af0a",
|
| 208 |
+
"metadata": {},
|
| 209 |
+
"outputs": [],
|
| 210 |
+
"source": [
|
| 211 |
+
"dna_word_vect_dict = {}\n",
|
| 212 |
+
"for word in dna_word_list:\n",
|
| 213 |
+
" word_vect = get_text_embedding(word)\n",
|
| 214 |
+
" dna_word_vect_dict[word] = word_vect"
|
| 215 |
+
]
|
| 216 |
+
},
|
| 217 |
+
{
|
| 218 |
+
"cell_type": "code",
|
| 219 |
+
"execution_count": 19,
|
| 220 |
+
"id": "e404735d-777c-4cdb-a3c3-55651e940920",
|
| 221 |
+
"metadata": {},
|
| 222 |
+
"outputs": [],
|
| 223 |
+
"source": [
|
| 224 |
+
"protein_word_vect_dict = {}\n",
|
| 225 |
+
"\n",
|
| 226 |
+
"for word in protein_word_list:\n",
|
| 227 |
+
" word_vect = get_text_embedding(word)\n",
|
| 228 |
+
" protein_word_vect_dict[word] = word_vect"
|
| 229 |
+
]
|
| 230 |
+
},
|
| 231 |
+
{
|
| 232 |
+
"cell_type": "code",
|
| 233 |
+
"execution_count": 21,
|
| 234 |
+
"id": "d92f9503-7110-43d8-a56e-03d70ac8c121",
|
| 235 |
+
"metadata": {},
|
| 236 |
+
"outputs": [],
|
| 237 |
+
"source": [
|
| 238 |
+
"protein_letter_vect_dict = {}\n",
|
| 239 |
+
"for word in protein_word_vect_dict:\n",
|
| 240 |
+
" if 1==len(word):\n",
|
| 241 |
+
" protein_letter_vect_dict[word] = protein_word_vect_dict[word]\n",
|
| 242 |
+
"\n",
|
| 243 |
+
"#protein_letter_vect_dict"
|
| 244 |
+
]
|
| 245 |
+
},
|
| 246 |
+
{
|
| 247 |
+
"cell_type": "code",
|
| 248 |
+
"execution_count": 22,
|
| 249 |
+
"id": "e3f5fed8-dd22-4991-932a-8868e24dc00c",
|
| 250 |
+
"metadata": {},
|
| 251 |
+
"outputs": [],
|
| 252 |
+
"source": [
|
| 253 |
+
"from sklearn.neighbors import NearestNeighbors\n",
|
| 254 |
+
"import numpy as np\n",
|
| 255 |
+
"\n",
|
| 256 |
+
"def find_most_similar_optimized(dna_word_vect_dict, eng_word_vect_dict):\n",
|
| 257 |
+
" \"\"\"\n",
|
| 258 |
+
" Map each DNA word to its nearest English word using scikit-learn NearestNeighbors with the cosine metric.\n",
|
| 259 |
+
" \n",
|
| 260 |
+
" 参数:\n",
|
| 261 |
+
" dna_word_vect_dict (dict): DNA 单词与其向量的字典 {dna_word: dna_vector}.\n",
|
| 262 |
+
" eng_word_vect_dict (dict): 英文单词与其向量的字典 {eng_word: eng_vector}.\n",
|
| 263 |
+
" \n",
|
| 264 |
+
" 返回:\n",
|
| 265 |
+
" dict: DNA 单词到英文单词的映射词典 {dna_word: most_similar_eng_word}.\n",
|
| 266 |
+
" \"\"\"\n",
|
| 267 |
+
" # 构建英文单词向量矩阵和对应单词列表\n",
|
| 268 |
+
" eng_words = list(eng_word_vect_dict.keys())\n",
|
| 269 |
+
" \n",
|
| 270 |
+
" # 确保向量在 CPU 上并转换为 NumPy 数组\n",
|
| 271 |
+
" eng_vectors = np.array([v.cpu().numpy() if isinstance(v, torch.Tensor) else v for v in eng_word_vect_dict.values()])\n",
|
| 272 |
+
" \n",
|
| 273 |
+
" # 初始化最近邻搜索模型\n",
|
| 274 |
+
" nn_model = NearestNeighbors(metric=\"cosine\").fit(eng_vectors)\n",
|
| 275 |
+
" \n",
|
| 276 |
+
" dna_eng_dict = {}\n",
|
| 277 |
+
" \n",
|
| 278 |
+
" for dna_word, dna_vector in dna_word_vect_dict.items():\n",
|
| 279 |
+
" # 将 DNA 向量确保在 CPU 并转换为 NumPy 数组\n",
|
| 280 |
+
" if isinstance(dna_vector, torch.Tensor):\n",
|
| 281 |
+
" dna_vector = dna_vector.cpu().numpy()\n",
|
| 282 |
+
" \n",
|
| 283 |
+
" # 查找最近的英文单词\n",
|
| 284 |
+
" distances, indices = nn_model.kneighbors([dna_vector], n_neighbors=1)\n",
|
| 285 |
+
" most_similar_eng_word = eng_words[indices[0][0]]\n",
|
| 286 |
+
" \n",
|
| 287 |
+
" # 记录匹配结果\n",
|
| 288 |
+
" dna_eng_dict[dna_word] = most_similar_eng_word\n",
|
| 289 |
+
" \n",
|
| 290 |
+
" return dna_eng_dict\n",
|
| 291 |
+
"\n",
|
| 292 |
+
"# 示例调用\n",
|
| 293 |
+
"#dna_protein_dict_optimized = find_most_similar_optimized(dna_word_vect_dict, protein_word_vect_dict)"
|
| 294 |
+
]
|
| 295 |
+
},
|
| 296 |
+
{
|
| 297 |
+
"cell_type": "code",
|
| 298 |
+
"execution_count": 23,
|
| 299 |
+
"id": "f2d99bd3-7649-408c-be2d-6d8dfe44492b",
|
| 300 |
+
"metadata": {},
|
| 301 |
+
"outputs": [],
|
| 302 |
+
"source": [
|
| 303 |
+
"dna_protein_letter_dict = find_most_similar_optimized(dna_word_vect_dict, protein_letter_vect_dict)"
|
| 304 |
+
]
|
| 305 |
+
},
|
| 306 |
+
{
|
| 307 |
+
"cell_type": "code",
|
| 308 |
+
"execution_count": 26,
|
| 309 |
+
"id": "38ace06a-d14a-440a-be5d-1738b48eb0ef",
|
| 310 |
+
"metadata": {},
|
| 311 |
+
"outputs": [
|
| 312 |
+
{
|
| 313 |
+
"name": "stdout",
|
| 314 |
+
"output_type": "stream",
|
| 315 |
+
"text": [
|
| 316 |
+
"TTT Q\n",
|
| 317 |
+
"TGC L\n",
|
| 318 |
+
"TTA Q\n",
|
| 319 |
+
"GGC L\n",
|
| 320 |
+
"ACA Q\n",
|
| 321 |
+
"ACC L\n",
|
| 322 |
+
"AAG L\n",
|
| 323 |
+
"AAC L\n",
|
| 324 |
+
"CCC H\n",
|
| 325 |
+
"ATT L\n",
|
| 326 |
+
"ATG L\n",
|
| 327 |
+
"CAA L\n",
|
| 328 |
+
"AGT Q\n",
|
| 329 |
+
"AAA Q\n",
|
| 330 |
+
"ACT Q\n",
|
| 331 |
+
"TGA Q\n",
|
| 332 |
+
"GGG L\n",
|
| 333 |
+
"TAG L\n",
|
| 334 |
+
"TAT R\n",
|
| 335 |
+
"CAG Q\n",
|
| 336 |
+
"TCG L\n",
|
| 337 |
+
"ATC L\n",
|
| 338 |
+
"GTA L\n",
|
| 339 |
+
"CGG L\n",
|
| 340 |
+
"CGC L\n",
|
| 341 |
+
"TGT R\n",
|
| 342 |
+
"AAT Q\n",
|
| 343 |
+
"GAG L\n",
|
| 344 |
+
"TGG L\n",
|
| 345 |
+
"GTG L\n",
|
| 346 |
+
"ATA L\n",
|
| 347 |
+
"TCA Q\n",
|
| 348 |
+
"AGG L\n",
|
| 349 |
+
"GCC L\n",
|
| 350 |
+
"GCG L\n",
|
| 351 |
+
"ACG L\n",
|
| 352 |
+
"TCC L\n",
|
| 353 |
+
"TAC L\n",
|
| 354 |
+
"TCT Q\n",
|
| 355 |
+
"CCG L\n",
|
| 356 |
+
"AGC L\n",
|
| 357 |
+
"TTG L\n",
|
| 358 |
+
"TTC L\n",
|
| 359 |
+
"AGA Q\n",
|
| 360 |
+
"GAA L\n",
|
| 361 |
+
"TAA L\n"
|
| 362 |
+
]
|
| 363 |
+
}
|
| 364 |
+
],
|
| 365 |
+
"source": [
|
| 366 |
+
"for word in dna_protein_letter_dict:\n",
|
| 367 |
+
" if 3==len(word):\n",
|
| 368 |
+
" print(word,dna_protein_letter_dict[word])"
|
| 369 |
+
]
|
| 370 |
+
},
|
| 371 |
+
{
|
| 372 |
+
"cell_type": "code",
|
| 373 |
+
"execution_count": 14,
|
| 374 |
+
"id": "92fb0b32-1b6a-4f6b-9130-073a3a7a515a",
|
| 375 |
+
"metadata": {},
|
| 376 |
+
"outputs": [
|
| 377 |
+
{
|
| 378 |
+
"name": "stdout",
|
| 379 |
+
"output_type": "stream",
|
| 380 |
+
"text": [
|
| 381 |
+
"DNA-English dictionary has been saved to dna_eng_dict_optimized.json.\n"
|
| 382 |
+
]
|
| 383 |
+
}
|
| 384 |
+
],
|
| 385 |
+
"source": [
|
| 386 |
+
"import json\n",
|
| 387 |
+
"\n",
|
| 388 |
+
"# 将 dna_eng_dict_optimized 保存到 JSON 文件中\n",
|
| 389 |
+
"def save_dict_to_json(data_dict, file_path):\n",
|
| 390 |
+
" \"\"\"\n",
|
| 391 |
+
" 将字典保存为 JSON 文件。\n",
|
| 392 |
+
" \n",
|
| 393 |
+
" 参数:\n",
|
| 394 |
+
" data_dict (dict): 要保存的字典。\n",
|
| 395 |
+
" file_path (str): 保存 JSON 文件的路径。\n",
|
| 396 |
+
" \"\"\"\n",
|
| 397 |
+
" with open(file_path, 'w', encoding='utf-8') as json_file:\n",
|
| 398 |
+
" json.dump(data_dict, json_file, ensure_ascii=False, indent=4)\n",
|
| 399 |
+
"\n",
|
| 400 |
+
"# 示例调用\n",
|
| 401 |
+
"save_dict_to_json(dna_eng_dict_optimized, \"dna_eng_dict_optimized.json\")\n",
|
| 402 |
+
"print(\"DNA-English dictionary has been saved to dna_eng_dict_optimized.json.\")"
|
| 403 |
+
]
|
| 404 |
+
},
|
| 405 |
+
{
|
| 406 |
+
"cell_type": "code",
|
| 407 |
+
"execution_count": 15,
|
| 408 |
+
"id": "831dd86e-2765-4db2-9fa8-396cc9ad1f72",
|
| 409 |
+
"metadata": {},
|
| 410 |
+
"outputs": [
|
| 411 |
+
{
|
| 412 |
+
"data": {
|
| 413 |
+
"text/plain": [
|
| 414 |
+
"{'olia': 5117,\n",
|
| 415 |
+
" 'umbai': 2040,\n",
|
| 416 |
+
" 'stic': 27,\n",
|
| 417 |
+
" 'peninsula': 2966,\n",
|
| 418 |
+
" 'iciency': 22,\n",
|
| 419 |
+
" 'eleph': 73,\n",
|
| 420 |
+
" 'pson': 446,\n",
|
| 421 |
+
" 'ala': 589,\n",
|
| 422 |
+
" 'politan': 2219,\n",
|
| 423 |
+
" 'https': 2,\n",
|
| 424 |
+
" 'transported': 1883,\n",
|
| 425 |
+
" 'icking': 1249,\n",
|
| 426 |
+
" 'displaystyle': 53,\n",
|
| 427 |
+
" 'cemet': 10,\n",
|
| 428 |
+
" 'icipal': 1138,\n",
|
| 429 |
+
" 'coln': 54,\n",
|
| 430 |
+
" 'idence': 108,\n",
|
| 431 |
+
" 'atherine': 47,\n",
|
| 432 |
+
" 'olph': 108,\n",
|
| 433 |
+
" 'beha': 39,\n",
|
| 434 |
+
" 'desirable': 121,\n",
|
| 435 |
+
" 'atting': 26,\n",
|
| 436 |
+
" 'inflamm': 14,\n",
|
| 437 |
+
" 'surroundings': 85,\n",
|
| 438 |
+
" 'mamm': 221,\n",
|
| 439 |
+
" 'demean': 5,\n",
|
| 440 |
+
" 'hower': 52,\n",
|
| 441 |
+
" 'annah': 19,\n",
|
| 442 |
+
" 'ushima': 54,\n",
|
| 443 |
+
" 'oples': 3,\n",
|
| 444 |
+
" 'enty': 30,\n",
|
| 445 |
+
" 'directions': 1,\n",
|
| 446 |
+
" 'apore': 21,\n",
|
| 447 |
+
" 'duc': 31,\n",
|
| 448 |
+
" 'XXXXXXXX': 2,\n",
|
| 449 |
+
" 'unsupported': 1,\n",
|
| 450 |
+
" 'electro': 21,\n",
|
| 451 |
+
" 'ashed': 46,\n",
|
| 452 |
+
" 'T1': 4,\n",
|
| 453 |
+
" 'ometimes': 54,\n",
|
| 454 |
+
" 'ancing': 1,\n",
|
| 455 |
+
" 'mechanic': 5,\n",
|
| 456 |
+
" 'atican': 16,\n",
|
| 457 |
+
" 'entirety': 6,\n",
|
| 458 |
+
" 'archite': 2,\n",
|
| 459 |
+
" 'employs': 12,\n",
|
| 460 |
+
" 'Resour': 1,\n",
|
| 461 |
+
" 'enjoyable': 2,\n",
|
| 462 |
+
" 'ving': 3,\n",
|
| 463 |
+
" 'rance': 11,\n",
|
| 464 |
+
" 'northwest': 8,\n",
|
| 465 |
+
" 'ampions': 13,\n",
|
| 466 |
+
" 'XXXXXXXXXXXX': 5,\n",
|
| 467 |
+
" 'Weap': 5,\n",
|
| 468 |
+
" 'XT': 7,\n",
|
| 469 |
+
" 'amen': 21,\n",
|
| 470 |
+
" 'Duter': 4,\n",
|
| 471 |
+
" 'ampion': 1,\n",
|
| 472 |
+
" 'agonal': 1,\n",
|
| 473 |
+
" 'involve': 4,\n",
|
| 474 |
+
" 'underneath': 2,\n",
|
| 475 |
+
" 'rought': 19,\n",
|
| 476 |
+
" 'Carneg': 11,\n",
|
| 477 |
+
" 'antibi': 4,\n",
|
| 478 |
+
" 'inery': 13,\n",
|
| 479 |
+
" 'tural': 7,\n",
|
| 480 |
+
" 'perspect': 1,\n",
|
| 481 |
+
" 'grey': 7,\n",
|
| 482 |
+
" 'necessarily': 8,\n",
|
| 483 |
+
" 'iencies': 1,\n",
|
| 484 |
+
" 'Cinc': 3,\n",
|
| 485 |
+
" 'amy': 1,\n",
|
| 486 |
+
" 'Performan': 1,\n",
|
| 487 |
+
" 'itic': 2,\n",
|
| 488 |
+
" '003': 2,\n",
|
| 489 |
+
" 'pecu': 1,\n",
|
| 490 |
+
" 'ambers': 1,\n",
|
| 491 |
+
" 'berra': 8,\n",
|
| 492 |
+
" 'incar': 1,\n",
|
| 493 |
+
" 'itchen': 3,\n",
|
| 494 |
+
" 'BS': 2,\n",
|
| 495 |
+
" 'passeng': 2,\n",
|
| 496 |
+
" 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX': 2,\n",
|
| 497 |
+
" 'phosp': 4,\n",
|
| 498 |
+
" 'ernand': 3,\n",
|
| 499 |
+
" 'territ': 2,\n",
|
| 500 |
+
" 'inity': 7,\n",
|
| 501 |
+
" 'promin': 2,\n",
|
| 502 |
+
" 'Sul': 1,\n",
|
| 503 |
+
" 'minster': 1,\n",
|
| 504 |
+
" 'ctica': 1,\n",
|
| 505 |
+
" 'thern': 3,\n",
|
| 506 |
+
" 'minerals': 1,\n",
|
| 507 |
+
" 'solely': 1,\n",
|
| 508 |
+
" 'hydr': 1,\n",
|
| 509 |
+
" 'ribut': 2,\n",
|
| 510 |
+
" 'knocked': 5,\n",
|
| 511 |
+
" 'auded': 1,\n",
|
| 512 |
+
" 'illi': 2,\n",
|
| 513 |
+
" 'ichever': 2,\n",
|
| 514 |
+
" 'performan': 6,\n",
|
| 515 |
+
" 'A1': 2,\n",
|
| 516 |
+
" 'iance': 1,\n",
|
| 517 |
+
" 'Milwau': 2,\n",
|
| 518 |
+
" 'Feat': 1,\n",
|
| 519 |
+
" 'accompan': 1,\n",
|
| 520 |
+
" 'atche': 3,\n",
|
| 521 |
+
" 'href': 1,\n",
|
| 522 |
+
" 'Ole': 2,\n",
|
| 523 |
+
" 'gomery': 6,\n",
|
| 524 |
+
" 'rhyth': 2,\n",
|
| 525 |
+
" 'mouth': 3,\n",
|
| 526 |
+
" 'nomine': 4,\n",
|
| 527 |
+
" 'Jup': 1,\n",
|
| 528 |
+
" 'mouths': 1,\n",
|
| 529 |
+
" 'Mediter': 1,\n",
|
| 530 |
+
" 'etsk': 1,\n",
|
| 531 |
+
" 'icular': 3,\n",
|
| 532 |
+
" 'gur': 1,\n",
|
| 533 |
+
" 'dale': 1,\n",
|
| 534 |
+
" 'XG': 1,\n",
|
| 535 |
+
" 'BLO': 1,\n",
|
| 536 |
+
" 'Dire': 1,\n",
|
| 537 |
+
" 'predecess': 1,\n",
|
| 538 |
+
" 'reliant': 3,\n",
|
| 539 |
+
" 'izoph': 1,\n",
|
| 540 |
+
" 'hefty': 3,\n",
|
| 541 |
+
" 'reper': 3,\n",
|
| 542 |
+
" 'coron': 1,\n",
|
| 543 |
+
" 'occas': 1,\n",
|
| 544 |
+
" 'itating': 1,\n",
|
| 545 |
+
" 'vertisement': 1,\n",
|
| 546 |
+
" 'depos': 1,\n",
|
| 547 |
+
" 'oldown': 1,\n",
|
| 548 |
+
" 'otro': 1,\n",
|
| 549 |
+
" 'Pere': 2,\n",
|
| 550 |
+
" 'Chi': 1,\n",
|
| 551 |
+
" 'portray': 1,\n",
|
| 552 |
+
" 'Damasc': 2,\n",
|
| 553 |
+
" 'portrayed': 1,\n",
|
| 554 |
+
" 'neurological': 1,\n",
|
| 555 |
+
" 'Shakespe': 1,\n",
|
| 556 |
+
" 'vironments': 1,\n",
|
| 557 |
+
" 'IoT': 1,\n",
|
| 558 |
+
" 'solute': 1,\n",
|
| 559 |
+
" 'prefers': 1,\n",
|
| 560 |
+
" 'dinosa': 1,\n",
|
| 561 |
+
" 'thanol': 1,\n",
|
| 562 |
+
" 'respon': 1,\n",
|
| 563 |
+
" 'RX': 1,\n",
|
| 564 |
+
" 'adays': 4,\n",
|
| 565 |
+
" 'Inspe': 2,\n",
|
| 566 |
+
" 'manner': 1,\n",
|
| 567 |
+
" 'subjected': 4,\n",
|
| 568 |
+
" 'cription': 1,\n",
|
| 569 |
+
" 'inosa': 3,\n",
|
| 570 |
+
" 'whereas': 1,\n",
|
| 571 |
+
" 'Myan': 1,\n",
|
| 572 |
+
" 'headaches': 1,\n",
|
| 573 |
+
" 'admire': 1,\n",
|
| 574 |
+
" 'landsc': 2,\n",
|
| 575 |
+
" 'icul': 1,\n",
|
| 576 |
+
" 'membr': 1,\n",
|
| 577 |
+
" 'entrepre': 1,\n",
|
| 578 |
+
" 'contracep': 1,\n",
|
| 579 |
+
" 'OE': 1,\n",
|
| 580 |
+
" 'responsib': 1,\n",
|
| 581 |
+
" 'XXXXXXXXXXXXXXXX': 1,\n",
|
| 582 |
+
" 'certific': 1,\n",
|
| 583 |
+
" 'ception': 1,\n",
|
| 584 |
+
" 'inflammatory': 1,\n",
|
| 585 |
+
" 'inh': 1,\n",
|
| 586 |
+
" 'reactive': 1,\n",
|
| 587 |
+
" 'arring': 1,\n",
|
| 588 |
+
" 'ral': 1,\n",
|
| 589 |
+
" 'enormous': 1,\n",
|
| 590 |
+
" 'behavi': 1,\n",
|
| 591 |
+
" 'frontal': 2,\n",
|
| 592 |
+
" 'impe': 3,\n",
|
| 593 |
+
" 'oliber': 1,\n",
|
| 594 |
+
" 'charac': 1,\n",
|
| 595 |
+
" 'helicop': 1,\n",
|
| 596 |
+
" 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX': 1,\n",
|
| 597 |
+
" 'malink': 1,\n",
|
| 598 |
+
" 'thumb': 1,\n",
|
| 599 |
+
" 'igue': 1,\n",
|
| 600 |
+
" 'influencing': 1,\n",
|
| 601 |
+
" 'enna': 1,\n",
|
| 602 |
+
" 'discre': 1,\n",
|
| 603 |
+
" 'Pover': 1,\n",
|
| 604 |
+
" 'teous': 1,\n",
|
| 605 |
+
" 'toos': 1,\n",
|
| 606 |
+
" 'URE': 1,\n",
|
| 607 |
+
" 'DX': 1,\n",
|
| 608 |
+
" 'trig': 1}"
|
| 609 |
+
]
|
| 610 |
+
},
|
| 611 |
+
"execution_count": 15,
|
| 612 |
+
"metadata": {},
|
| 613 |
+
"output_type": "execute_result"
|
| 614 |
+
}
|
| 615 |
+
],
|
| 616 |
+
"source": [
|
| 617 |
+
"en_word_dict = {}\n",
|
| 618 |
+
"for dna_word in dna_eng_dict_optimized:\n",
|
| 619 |
+
" en_word = dna_eng_dict_optimized[dna_word]\n",
|
| 620 |
+
" en_word_dict.setdefault(en_word,0)\n",
|
| 621 |
+
" en_word_dict[en_word] = en_word_dict[en_word] + 1\n",
|
| 622 |
+
"\n",
|
| 623 |
+
"en_word_dict"
|
| 624 |
+
]
|
| 625 |
+
},
|
| 626 |
+
{
|
| 627 |
+
"cell_type": "code",
|
| 628 |
+
"execution_count": 25,
|
| 629 |
+
"id": "c36edad3-e33e-4fd7-a461-84c7c995c779",
|
| 630 |
+
"metadata": {},
|
| 631 |
+
"outputs": [],
|
| 632 |
+
"source": [
|
| 633 |
+
"import random\n",
|
| 634 |
+
"from sklearn.neighbors import NearestNeighbors\n",
|
| 635 |
+
"import numpy as np\n",
|
| 636 |
+
"import torch\n",
|
| 637 |
+
"\n",
|
| 638 |
+
"def find_most_similar_with_randomization(dna_word_vect_dict, eng_word_vect_dict, top_k=500):\n",
|
| 639 |
+
" \"\"\"\n",
|
| 640 |
+
" Find the top_k nearest English words for each DNA word (scikit-learn NearestNeighbors, cosine metric) and pick one of them at random as the mapping.\n",
|
| 641 |
+
"\n",
|
| 642 |
+
" 参数:\n",
|
| 643 |
+
" dna_word_vect_dict (dict): DNA 单词与其向量的字典 {dna_word: dna_vector}.\n",
|
| 644 |
+
" eng_word_vect_dict (dict): 英文单词与其向量的字典 {eng_word: eng_vector}.\n",
|
| 645 |
+
" top_k (int): 随机选择时从最近的 top_k 单词中选取。\n",
|
| 646 |
+
"\n",
|
| 647 |
+
" 返回:\n",
|
| 648 |
+
" dict: DNA 单词到英文单词的映射词典 {dna_word: random_eng_word_from_top_k}.\n",
|
| 649 |
+
" \"\"\"\n",
|
| 650 |
+
" # 构建英文单词向量矩阵和对应单词列表\n",
|
| 651 |
+
" eng_words = list(eng_word_vect_dict.keys())\n",
|
| 652 |
+
" \n",
|
| 653 |
+
" # 确保向量在 CPU 上并转换为 NumPy 数组\n",
|
| 654 |
+
" eng_vectors = np.array([v.cpu().numpy() if isinstance(v, torch.Tensor) else v for v in eng_word_vect_dict.values()])\n",
|
| 655 |
+
"\n",
|
| 656 |
+
" # 初始化最近邻搜索模型\n",
|
| 657 |
+
" nn_model = NearestNeighbors(metric=\"cosine\").fit(eng_vectors)\n",
|
| 658 |
+
"\n",
|
| 659 |
+
" dna_eng_dict = {}\n",
|
| 660 |
+
"\n",
|
| 661 |
+
" for dna_word, dna_vector in dna_word_vect_dict.items():\n",
|
| 662 |
+
" # 将 DNA 向量确保在 CPU 并转换为 NumPy 数组\n",
|
| 663 |
+
" if isinstance(dna_vector, torch.Tensor):\n",
|
| 664 |
+
" dna_vector = dna_vector.cpu().numpy()\n",
|
| 665 |
+
"\n",
|
| 666 |
+
" # 查找最近的 top_k 英文单词\n",
|
| 667 |
+
" distances, indices = nn_model.kneighbors([dna_vector], n_neighbors=top_k)\n",
|
| 668 |
+
" top_k_eng_words = [eng_words[idx] for idx in indices[0]]\n",
|
| 669 |
+
"\n",
|
| 670 |
+
" # 随机选择一个单词\n",
|
| 671 |
+
" random_eng_word = random.choice(top_k_eng_words)\n",
|
| 672 |
+
"\n",
|
| 673 |
+
" # 记录匹配结果\n",
|
| 674 |
+
" dna_eng_dict[dna_word] = random_eng_word\n",
|
| 675 |
+
"\n",
|
| 676 |
+
" return dna_eng_dict\n",
|
| 677 |
+
"\n",
|
| 678 |
+
"# 示例调用\n",
|
| 679 |
+
"dna_eng_dict_randomized = find_most_similar_with_randomization(dna_word_vect_dict, eng_word_vect_dict, top_k=100)"
|
| 680 |
+
]
|
| 681 |
+
},
|
| 682 |
+
{
|
| 683 |
+
"cell_type": "code",
|
| 684 |
+
"execution_count": 32,
|
| 685 |
+
"id": "8e68e6e6-3c91-402e-8cac-04d595e64c0c",
|
| 686 |
+
"metadata": {},
|
| 687 |
+
"outputs": [
|
| 688 |
+
{
|
| 689 |
+
"data": {
|
| 690 |
+
"text/plain": [
|
| 691 |
+
"618"
|
| 692 |
+
]
|
| 693 |
+
},
|
| 694 |
+
"execution_count": 32,
|
| 695 |
+
"metadata": {},
|
| 696 |
+
"output_type": "execute_result"
|
| 697 |
+
}
|
| 698 |
+
],
|
| 699 |
+
"source": [
|
| 700 |
+
"en_word_dict = {}\n",
|
| 701 |
+
"for dna_word in dna_eng_dict_randomized:\n",
|
| 702 |
+
" en_word = dna_eng_dict_randomized[dna_word]\n",
|
| 703 |
+
" en_word_dict.setdefault(en_word,0)\n",
|
| 704 |
+
" en_word_dict[en_word] = en_word_dict[en_word] + 1\n",
|
| 705 |
+
"\n",
|
| 706 |
+
"len(en_word_dict)"
|
| 707 |
+
]
|
| 708 |
+
},
|
| 709 |
+
{
|
| 710 |
+
"cell_type": "code",
|
| 711 |
+
"execution_count": 33,
|
| 712 |
+
"id": "490818e7-f635-4897-94ab-511dfd9ea78e",
|
| 713 |
+
"metadata": {},
|
| 714 |
+
"outputs": [],
|
| 715 |
+
"source": [
|
| 716 |
+
"def add_unique_suffix_to_dict(dna_eng_dict):\n",
|
| 717 |
+
" \"\"\"\n",
|
| 718 |
+
" 为 DNA 到英文单词的映射词典添加唯一后缀,防止多个 DNA 单词对应同一个英文单词。\n",
|
| 719 |
+
"\n",
|
| 720 |
+
" 参数:\n",
|
| 721 |
+
" dna_eng_dict (dict): {dna_word: eng_word} 形式的映射词典。\n",
|
| 722 |
+
"\n",
|
| 723 |
+
" 返回:\n",
|
| 724 |
+
" dict: 添加后缀后的映射词典。\n",
|
| 725 |
+
" \"\"\"\n",
|
| 726 |
+
" # 统计每个英文单词的映射次数\n",
|
| 727 |
+
" eng_word_count = {}\n",
|
| 728 |
+
" for dna_word, eng_word in dna_eng_dict.items():\n",
|
| 729 |
+
" if eng_word not in eng_word_count:\n",
|
| 730 |
+
" eng_word_count[eng_word] = 0\n",
|
| 731 |
+
" eng_word_count[eng_word] += 1\n",
|
| 732 |
+
"\n",
|
| 733 |
+
" # 为映射次数超过 1 的英文单词添加后缀\n",
|
| 734 |
+
" eng_word_suffix_count = {key: 1 for key in eng_word_count.keys()}\n",
|
| 735 |
+
" updated_dict = {}\n",
|
| 736 |
+
" for dna_word, eng_word in dna_eng_dict.items():\n",
|
| 737 |
+
" if eng_word_count[eng_word] > 1:\n",
|
| 738 |
+
" # 添加后缀\n",
|
| 739 |
+
" unique_eng_word = f\"{eng_word}{eng_word_suffix_count[eng_word]}\"\n",
|
| 740 |
+
" eng_word_suffix_count[eng_word] += 1\n",
|
| 741 |
+
" else:\n",
|
| 742 |
+
" unique_eng_word = eng_word\n",
|
| 743 |
+
" updated_dict[dna_word] = unique_eng_word\n",
|
| 744 |
+
"\n",
|
| 745 |
+
" return updated_dict\n",
|
| 746 |
+
"\n",
|
| 747 |
+
"# 示例调用\n",
|
| 748 |
+
"dna_eng_dict_unique = add_unique_suffix_to_dict(dna_eng_dict_randomized)"
|
| 749 |
+
]
|
| 750 |
+
},
|
| 751 |
+
{
|
| 752 |
+
"cell_type": "code",
|
| 753 |
+
"execution_count": 35,
|
| 754 |
+
"id": "be92a8a7-37c8-4b03-b400-026308d20a36",
|
| 755 |
+
"metadata": {},
|
| 756 |
+
"outputs": [],
|
| 757 |
+
"source": [
|
| 758 |
+
"import json\n",
|
| 759 |
+
"\n",
|
| 760 |
+
"# 保存 dna_eng_dict_unique 到 JSON 文件\n",
|
| 761 |
+
"output_file = \"dna_eng_dict_unique.json\"\n",
|
| 762 |
+
"\n",
|
| 763 |
+
"with open(output_file, \"w\") as f:\n",
|
| 764 |
+
" json.dump(dna_eng_dict_unique, f, indent=4)"
|
| 765 |
+
]
|
| 766 |
+
},
|
| 767 |
+
{
|
| 768 |
+
"cell_type": "code",
|
| 769 |
+
"execution_count": null,
|
| 770 |
+
"id": "afa3987d-8c1f-46cd-b954-e3ec0ada5faa",
|
| 771 |
+
"metadata": {},
|
| 772 |
+
"outputs": [],
|
| 773 |
+
"source": []
|
| 774 |
+
}
|
| 775 |
+
],
|
| 776 |
+
"metadata": {
|
| 777 |
+
"kernelspec": {
|
| 778 |
+
"display_name": "Python 3 (ipykernel)",
|
| 779 |
+
"language": "python",
|
| 780 |
+
"name": "python3"
|
| 781 |
+
},
|
| 782 |
+
"language_info": {
|
| 783 |
+
"codemirror_mode": {
|
| 784 |
+
"name": "ipython",
|
| 785 |
+
"version": 3
|
| 786 |
+
},
|
| 787 |
+
"file_extension": ".py",
|
| 788 |
+
"mimetype": "text/x-python",
|
| 789 |
+
"name": "python",
|
| 790 |
+
"nbconvert_exporter": "python",
|
| 791 |
+
"pygments_lexer": "ipython3",
|
| 792 |
+
"version": "3.12.3"
|
| 793 |
+
}
|
| 794 |
+
},
|
| 795 |
+
"nbformat": 4,
|
| 796 |
+
"nbformat_minor": 5
|
| 797 |
+
}
|
best_model/gpt2_gene_en_ft_dna_protein_pair_test_others.ipynb
ADDED
|
@@ -0,0 +1,1111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 1,
|
| 6 |
+
"id": "1ff351d4-ec43-4337-b526-86e641611680",
|
| 7 |
+
"metadata": {},
|
| 8 |
+
"outputs": [
|
| 9 |
+
{
|
| 10 |
+
"data": {
|
| 11 |
+
"text/plain": [
|
| 12 |
+
"\"\\nimport os\\n\\n# 设置环境变量\\nos.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'\\n\\n# 打印环境变量以确认设置成功\\nprint(os.environ.get('HF_ENDPOINT')\\n\""
|
| 13 |
+
]
|
| 14 |
+
},
|
| 15 |
+
"execution_count": 1,
|
| 16 |
+
"metadata": {},
|
| 17 |
+
"output_type": "execute_result"
|
| 18 |
+
}
|
| 19 |
+
],
|
| 20 |
+
"source": [
|
| 21 |
+
"import subprocess\n",
|
| 22 |
+
"import os\n",
|
| 23 |
+
"\n",
|
| 24 |
+
"result = subprocess.run('bash -c \"source /etc/network_turbo && env | grep proxy\"', shell=True, capture_output=True, text=True)\n",
|
| 25 |
+
"output = result.stdout\n",
|
| 26 |
+
"for line in output.splitlines():\n",
|
| 27 |
+
" if '=' in line:\n",
|
| 28 |
+
" var, value = line.split('=', 1)\n",
|
| 29 |
+
" os.environ[var] = value\n",
|
| 30 |
+
"\n",
|
| 31 |
+
"'''\n",
|
| 32 |
+
"import os\n",
|
| 33 |
+
"\n",
|
| 34 |
+
"# 设置环境变量\n",
|
| 35 |
+
"os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'\n",
|
| 36 |
+
"\n",
|
| 37 |
+
"# 打印环境变量以确认设置成功\n",
|
| 38 |
+
"print(os.environ.get('HF_ENDPOINT')\n",
|
| 39 |
+
"'''"
|
| 40 |
+
]
|
| 41 |
+
},
|
| 42 |
+
{
|
| 43 |
+
"cell_type": "code",
|
| 44 |
+
"execution_count": 2,
|
| 45 |
+
"id": "0b92a446-c25a-4ae4-b32b-8ff9e1b8f320",
|
| 46 |
+
"metadata": {},
|
| 47 |
+
"outputs": [
|
| 48 |
+
{
|
| 49 |
+
"name": "stderr",
|
| 50 |
+
"output_type": "stream",
|
| 51 |
+
"text": [
|
| 52 |
+
"2025-02-10 00:25:00.462309: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
|
| 53 |
+
"2025-02-10 00:25:00.476690: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
|
| 54 |
+
"2025-02-10 00:25:00.492214: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
|
| 55 |
+
"2025-02-10 00:25:00.496840: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
|
| 56 |
+
"2025-02-10 00:25:00.508480: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
|
| 57 |
+
"To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
|
| 58 |
+
"2025-02-10 00:25:01.341392: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n",
|
| 59 |
+
"Using the latest cached version of the dataset since dnagpt/gene_lan_transfer couldn't be found on the Hugging Face Hub\n",
|
| 60 |
+
"Found the latest cached dataset configuration 'dna_protein_pair_rand' at /root/.cache/huggingface/datasets/dnagpt___gene_lan_transfer/dna_protein_pair_rand/0.0.0/fc103580e7cda0d9bc41947f4058887fdc81188c (last modified on Mon Feb 10 00:16:09 2025).\n"
|
| 61 |
+
]
|
| 62 |
+
},
|
| 63 |
+
{
|
| 64 |
+
"data": {
|
| 65 |
+
"application/vnd.jupyter.widget-view+json": {
|
| 66 |
+
"model_id": "37c331e8249b49058653c8524ab4267c",
|
| 67 |
+
"version_major": 2,
|
| 68 |
+
"version_minor": 0
|
| 69 |
+
},
|
| 70 |
+
"text/plain": [
|
| 71 |
+
"Map: 0%| | 0/14400 [00:00<?, ? examples/s]"
|
| 72 |
+
]
|
| 73 |
+
},
|
| 74 |
+
"metadata": {},
|
| 75 |
+
"output_type": "display_data"
|
| 76 |
+
},
|
| 77 |
+
{
|
| 78 |
+
"data": {
|
| 79 |
+
"application/vnd.jupyter.widget-view+json": {
|
| 80 |
+
"model_id": "648bdf2ad1a043a3949cb629aab056dd",
|
| 81 |
+
"version_major": 2,
|
| 82 |
+
"version_minor": 0
|
| 83 |
+
},
|
| 84 |
+
"text/plain": [
|
| 85 |
+
"Map: 0%| | 0/1600 [00:00<?, ? examples/s]"
|
| 86 |
+
]
|
| 87 |
+
},
|
| 88 |
+
"metadata": {},
|
| 89 |
+
"output_type": "display_data"
|
| 90 |
+
}
|
| 91 |
+
],
|
| 92 |
+
"source": [
|
| 93 |
+
"from datasets import load_dataset\n",
|
| 94 |
+
"from transformers import AutoTokenizer, DataCollatorWithPadding\n",
|
| 95 |
+
"from transformers import Trainer\n",
|
| 96 |
+
"import evaluate\n",
|
| 97 |
+
"import numpy as np\n",
|
| 98 |
+
"from transformers import TrainingArguments\n",
|
| 99 |
+
"from transformers import AutoModelForSequenceClassification\n",
|
| 100 |
+
"\n",
|
| 101 |
+
"\n",
|
| 102 |
+
"# 从 Hugging Face Hub 加载数据集;下面两行被注释掉的是备选数据集\n",
|
| 103 |
+
"#raw_datasets = load_dataset('paws-x', 'en') # 或者指定特定语言如 'zh' 表示中文,https://huggingface.co/datasets/google-research-datasets/paws-x\n",
|
| 104 |
+
"#raw_datasets = load_dataset('dnagpt/gene_lan_transfer', 'dna_sim_pair_150bp')['train'].train_test_split(test_size=0.1)\n",
|
| 105 |
+
"raw_datasets = load_dataset('dnagpt/gene_lan_transfer', 'dna_protein_pair_rand')['train'].train_test_split(test_size=0.1) #默认已经shuffle\n",
|
| 106 |
+
"\n",
|
| 107 |
+
"\n",
|
| 108 |
+
"#分词器\n",
|
| 109 |
+
"tokenizer = AutoTokenizer.from_pretrained(\"dnagpt/gene_eng_gpt2_v1\")\n",
|
| 110 |
+
"tokenizer.pad_token = tokenizer.eos_token\n",
|
| 111 |
+
"\n",
|
| 112 |
+
"# 修改分词器的填充方向为左侧,默认为右侧,分类问题建议左侧\n",
|
| 113 |
+
"#tokenizer.padding_side = \"left\"\n",
|
| 114 |
+
"\n",
|
| 115 |
+
"\n",
|
| 116 |
+
"#分词函数\n",
|
| 117 |
+
"def tokenize_function(example):\n",
|
| 118 |
+
" return tokenizer(example[\"sentence1\"], example[\"sentence2\"], truncation=True,max_length=256, padding=\"max_length\")\n",
|
| 119 |
+
" #return tokenizer(example[\"sentence1\"], example[\"sentence2\"], truncation=True,max_length=1024) #padding=\"max_length\")\n",
|
| 120 |
+
"\n",
|
| 121 |
+
"#构建分词后的数据集\n",
|
| 122 |
+
"tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)\n",
|
| 123 |
+
"\n",
|
| 124 |
+
"#训练数据构建\n",
|
| 125 |
+
"data_collator = DataCollatorWithPadding(tokenizer=tokenizer)"
|
| 126 |
+
]
|
| 127 |
+
},
|
| 128 |
+
{
|
| 129 |
+
"cell_type": "code",
|
| 130 |
+
"execution_count": 3,
|
| 131 |
+
"id": "ba1a32f9-d548-4fae-ae66-f2961fadfc1c",
|
| 132 |
+
"metadata": {},
|
| 133 |
+
"outputs": [
|
| 134 |
+
{
|
| 135 |
+
"data": {
|
| 136 |
+
"text/plain": [
|
| 137 |
+
"{'sentence1': 'TTGAGTACCTTGAGTACCAAGGAACCCCTGGCGGCTATTGTCGGCCCCACGGCTACCGGTAAATCGACCATTGCCCTCAAGGTTGCCGCCCGGCTGGGGGCGGAGATAATCTCTGTGGATTCCGCCCAGGTTTACCGCGGTATGGATATTGGTACAGCAAAACTGCTCCCGGAGGAAAGGGTGGGGCCCGACGGCCGACCCATCCCCCACCACCTGATAGATATCGTCGATCCCGACGAGCCTTTCAGTGTAGCCGATTACCAGAAACTAGCCCGACAAACGATAACAGCCATCATCAGGAGGGGTCACCTGCCCCTTCTGGTCGGCGGTACAGGCCTGTATTATCAAGCGGTAGTCGATCCCTACCGCTTCACCCCGGAGGGGGGGGATCCCCGGGTCCGGCAGGAGCTCGAGGAGCTGGCGGCCAAGTTTGGTGACGCATATCTCCATGAACAATTGAAAAGGGTCGACCCGGAAGCGGCCAAACGGATACATCCCCATGACCGGCGCCGCCTGGTAAGGGCCCTGGAGGTTTTTAAAACAACCGGGCAACCCATATCGGCTGCCCTGGCCTGGCGCCGGCAACAGGAATCGCCCTATCATCTGGCGGCAGTAGCTCTCAGCATGCCCCGACCCCTCCTTTACCGACGCATCGAAGCCCGGGTCGATGCCATGATCGCAGCGGGCCTCATTGAAGAAGTTTCCCGCCTGCTGGCCCGGTACGATTACCGCCTACCGGCCCTCCAGGCCCTGGGTTATAAAGAAATCGGCGCTTATTTGCGTAAAGAAATAGAACTGGAGGAAGCAATAGCCATTTTAAAACGCAATACCAGACGTCTAGCTAAAAGGCAATTAACATGGTTCCGGCGCGACCGTCGCCTGCACTGGTGGGAAGTAGATCCGGATAAAATTGAGGAAATTTCAGCCGCTATTGCCGATTTTATTAGCAGGACAATTGATATTAATGTAGAATAG',\n",
|
| 138 |
+
" 'sentence2': 'MSTLSTKEPLAAIVGPTATGKSTIALKVAARLGAEIISVDSAQVYRGMDIGTAKLLPEERVGPDGRPIPHHLIDIVDPDEPFSVADYQKLARQTITAIIRRGHLPLLVGGTGLYYQAVVDPYRFTPEGGDPRVRQELEELAAKFGDAYLHEQLKRVDPEAAKRIHPHDRRRLVRALEVFKTTGQPISAALAWRRQQESPYHLAAVALSMPRPLLYRRIEARVDAMIAAGLIEEVSRLLARYDYRLPALQALGYKEIGAYLRKEIELEEAIAILKRNTRRLAKRQLTWFRRDRRLHWWEVDPDKIEEISAAIADFISRTIDINVE',\n",
|
| 139 |
+
" 'label': 1}"
|
| 140 |
+
]
|
| 141 |
+
},
|
| 142 |
+
"execution_count": 3,
|
| 143 |
+
"metadata": {},
|
| 144 |
+
"output_type": "execute_result"
|
| 145 |
+
}
|
| 146 |
+
],
|
| 147 |
+
"source": [
|
| 148 |
+
"raw_datasets[\"train\"][0]"
|
| 149 |
+
]
|
| 150 |
+
},
|
| 151 |
+
{
|
| 152 |
+
"cell_type": "code",
|
| 153 |
+
"execution_count": 4,
|
| 154 |
+
"id": "1209977a-00b4-471d-abdf-9b0264ce8cdf",
|
| 155 |
+
"metadata": {},
|
| 156 |
+
"outputs": [],
|
| 157 |
+
"source": [
|
| 158 |
+
"#指标函数定义\n",
|
| 159 |
+
"def compute_metrics(eval_pred):\n",
|
| 160 |
+
" predictions, labels = eval_pred\n",
|
| 161 |
+
" predictions = np.argmax(predictions, axis=1)\n",
|
| 162 |
+
" return {'accuracy': (predictions==labels).sum() / len(labels)}"
|
| 163 |
+
]
|
| 164 |
+
},
|
| 165 |
+
{
|
| 166 |
+
"cell_type": "code",
|
| 167 |
+
"execution_count": 5,
|
| 168 |
+
"id": "2c996c5b-0e8c-4956-a3a3-86cda2177f85",
|
| 169 |
+
"metadata": {},
|
| 170 |
+
"outputs": [
|
| 171 |
+
{
|
| 172 |
+
"name": "stderr",
|
| 173 |
+
"output_type": "stream",
|
| 174 |
+
"text": [
|
| 175 |
+
"/root/miniconda3/lib/python3.12/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead\n",
|
| 176 |
+
" warnings.warn(\n",
|
| 177 |
+
"Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at dnagpt/gene_eng_gpt2_v1 and are newly initialized: ['score.weight']\n",
|
| 178 |
+
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
|
| 179 |
+
]
|
| 180 |
+
}
|
| 181 |
+
],
|
| 182 |
+
"source": [
|
| 183 |
+
"training_args = TrainingArguments(\n",
|
| 184 |
+
" output_dir=\"ds_job_dna_2222\",\n",
|
| 185 |
+
" learning_rate=1e-5,\n",
|
| 186 |
+
" lr_scheduler_type=\"constant_with_warmup\",\n",
|
| 187 |
+
" warmup_ratio=0.1,\n",
|
| 188 |
+
" optim='adamw_torch',\n",
|
| 189 |
+
" weight_decay=0.0,\n",
|
| 190 |
+
" per_device_train_batch_size=20,\n",
|
| 191 |
+
" per_device_eval_batch_size=20,\n",
|
| 192 |
+
" num_train_epochs=5, #训练多少轮\n",
|
| 193 |
+
" evaluation_strategy=\"epoch\",\n",
|
| 194 |
+
" save_strategy=\"epoch\",\n",
|
| 195 |
+
" logging_strategy=\"epoch\",\n",
|
| 196 |
+
" load_best_model_at_end=True\n",
|
| 197 |
+
")\n",
|
| 198 |
+
"\n",
|
| 199 |
+
"#模型定义,文本分类模型\n",
|
| 200 |
+
"model = AutoModelForSequenceClassification.from_pretrained(\"dnagpt/gene_eng_gpt2_v1\", num_labels=2)\n",
|
| 201 |
+
"model.config.pad_token_id = model.config.eos_token_id\n",
|
| 202 |
+
"\n",
|
| 203 |
+
"trainer = Trainer(\n",
|
| 204 |
+
" model,\n",
|
| 205 |
+
" training_args,\n",
|
| 206 |
+
" train_dataset=tokenized_datasets[\"train\"],\n",
|
| 207 |
+
" eval_dataset=tokenized_datasets[\"test\"],\n",
|
| 208 |
+
" data_collator=data_collator,\n",
|
| 209 |
+
" tokenizer=tokenizer,\n",
|
| 210 |
+
" compute_metrics=compute_metrics,\n",
|
| 211 |
+
")"
|
| 212 |
+
]
|
| 213 |
+
},
|
| 214 |
+
{
|
| 215 |
+
"cell_type": "code",
|
| 216 |
+
"execution_count": 6,
|
| 217 |
+
"id": "0964a53d-a60d-4364-99ca-0f04461b615b",
|
| 218 |
+
"metadata": {},
|
| 219 |
+
"outputs": [
|
| 220 |
+
{
|
| 221 |
+
"name": "stdout",
|
| 222 |
+
"output_type": "stream",
|
| 223 |
+
"text": [
|
| 224 |
+
"[2025-02-10 00:26:53,442] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n"
|
| 225 |
+
]
|
| 226 |
+
},
|
| 227 |
+
{
|
| 228 |
+
"name": "stderr",
|
| 229 |
+
"output_type": "stream",
|
| 230 |
+
"text": [
|
| 231 |
+
"/root/miniconda3/compiler_compat/ld: cannot find -laio: No such file or directory\n",
|
| 232 |
+
"collect2: error: ld returned 1 exit status\n",
|
| 233 |
+
"/root/miniconda3/compiler_compat/ld: warning: libpthread.so.0, needed by /usr/local/cuda/lib64/libcufile.so, not found (try using -rpath or -rpath-link)\n",
|
| 234 |
+
"/root/miniconda3/compiler_compat/ld: warning: libstdc++.so.6, needed by /usr/local/cuda/lib64/libcufile.so, not found (try using -rpath or -rpath-link)\n",
|
| 235 |
+
"/root/miniconda3/compiler_compat/ld: warning: libm.so.6, needed by /usr/local/cuda/lib64/libcufile.so, not found (try using -rpath or -rpath-link)\n",
|
| 236 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::runtime_error::~runtime_error()@GLIBCXX_3.4'\n",
|
| 237 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__gxx_personality_v0@CXXABI_1.3'\n",
|
| 238 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::tellp()@GLIBCXX_3.4'\n",
|
| 239 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::chrono::_V2::steady_clock::now()@GLIBCXX_3.4.19'\n",
|
| 240 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_M_replace_aux(unsigned long, unsigned long, unsigned long, char)@GLIBCXX_3.4'\n",
|
| 241 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for bool@CXXABI_1.3'\n",
|
| 242 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_logic_error(char const*)@GLIBCXX_3.4'\n",
|
| 243 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_ostringstream<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
|
| 244 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::logic_error@GLIBCXX_3.4'\n",
|
| 245 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::locale::~locale()@GLIBCXX_3.4'\n",
|
| 246 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<char, std::char_traits<char>, std::allocator<char> >::basic_string(std::string const&, unsigned long, unsigned long)@GLIBCXX_3.4'\n",
|
| 247 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_end_catch@CXXABI_1.3'\n",
|
| 248 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_ofstream<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
|
| 249 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::logic_error::~logic_error()@GLIBCXX_3.4'\n",
|
| 250 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for __cxxabiv1::__si_class_type_info@CXXABI_1.3'\n",
|
| 251 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ios<char, std::char_traits<char> >::_M_cache_locale(std::locale const&)@GLIBCXX_3.4'\n",
|
| 252 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_stringstream<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
|
| 253 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `operator new[](unsigned long)@GLIBCXX_3.4'\n",
|
| 254 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_M_leak_hard()@GLIBCXX_3.4'\n",
|
| 255 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_ifstream<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
|
| 256 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_streambuf<wchar_t, std::char_traits<wchar_t> >::basic_streambuf(std::basic_streambuf<wchar_t, std::char_traits<wchar_t> > const&)@GLIBCXX_3.4'\n",
|
| 257 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::append(char const*, unsigned long)@GLIBCXX_3.4'\n",
|
| 258 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<char, std::char_traits<char>, std::allocator<char> >::basic_string(std::string const&)@GLIBCXX_3.4'\n",
|
| 259 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for unsigned short@CXXABI_1.3'\n",
|
| 260 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::resize(unsigned long, char)@GLIBCXX_3.4'\n",
|
| 261 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for char const*@CXXABI_1.3'\n",
|
| 262 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ctype<char>::_M_widen_init() const@GLIBCXX_3.4.11'\n",
|
| 263 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_invalid_argument(char const*)@GLIBCXX_3.4'\n",
|
| 264 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::locale::operator=(std::locale const&)@GLIBCXX_3.4'\n",
|
| 265 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ios<wchar_t, std::char_traits<wchar_t> >::_M_cache_locale(std::locale const&)@GLIBCXX_3.4'\n",
|
| 266 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Rb_tree_decrement(std::_Rb_tree_node_base const*)@GLIBCXX_3.4'\n",
|
| 267 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_free_exception@CXXABI_1.3'\n",
|
| 268 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::condition_variable::notify_one()@GLIBCXX_3.4.11'\n",
|
| 269 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ios_base::Init::~Init()@GLIBCXX_3.4'\n",
|
| 270 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<char, std::char_traits<char>, std::allocator<char> >::~basic_string()@GLIBCXX_3.4'\n",
|
| 271 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_pure_virtual@CXXABI_1.3'\n",
|
| 272 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::flush()@GLIBCXX_3.4'\n",
|
| 273 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for __cxxabiv1::__class_type_info@CXXABI_1.3'\n",
|
| 274 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_rethrow@CXXABI_1.3'\n",
|
| 275 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_stringbuf<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
|
| 276 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_fstream<char, std::char_traits<char> >::~basic_fstream()@GLIBCXX_3.4'\n",
|
| 277 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::compare(char const*) const@GLIBCXX_3.4'\n",
|
| 278 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_ostringstream<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
|
| 279 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::locale::locale()@GLIBCXX_3.4'\n",
|
| 280 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::chrono::_V2::system_clock::now()@GLIBCXX_3.4.19'\n",
|
| 281 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_ifstream<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
|
| 282 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Hash_bytes(void const*, unsigned long, unsigned long)@CXXABI_1.3.5'\n",
|
| 283 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<long long>(long long)@GLIBCXX_3.4.9'\n",
|
| 284 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for char*@CXXABI_1.3'\n",
|
| 285 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__detail::_Prime_rehash_policy::_M_need_rehash(unsigned long, unsigned long, unsigned long) const@GLIBCXX_3.4.18'\n",
|
| 286 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::out_of_range@GLIBCXX_3.4'\n",
|
| 287 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<unsigned long>(unsigned long)@GLIBCXX_3.4.9'\n",
|
| 288 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Rb_tree_increment(std::_Rb_tree_node_base const*)@GLIBCXX_3.4'\n",
|
| 289 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ios_base::~ios_base()@GLIBCXX_3.4'\n",
|
| 290 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::range_error::~range_error()@GLIBCXX_3.4'\n",
|
| 291 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__basic_file<char>::~__basic_file()@GLIBCXX_3.4'\n",
|
| 292 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_guard_acquire@CXXABI_1.3'\n",
|
| 293 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<bool>(bool)@GLIBCXX_3.4.9'\n",
|
| 294 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::overflow_error@GLIBCXX_3.4'\n",
|
| 295 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_fstream<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
|
| 296 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::range_error@GLIBCXX_3.4'\n",
|
| 297 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_ios<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
|
| 298 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_filebuf<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
|
| 299 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `operator delete[](void*)@GLIBCXX_3.4'\n",
|
| 300 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_stringstream<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
|
| 301 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<char, std::char_traits<char>, std::allocator<char> >::basic_string(unsigned long, char, std::allocator<char> const&)@GLIBCXX_3.4'\n",
|
| 302 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__detail::_List_node_base::_M_transfer(std::__detail::_List_node_base*, std::__detail::_List_node_base*)@GLIBCXX_3.4.15'\n",
|
| 303 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::replace(unsigned long, unsigned long, char const*, unsigned long)@GLIBCXX_3.4'\n",
|
| 304 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for std::exception@GLIBCXX_3.4'\n",
|
| 305 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >::_Rep::_M_destroy(std::allocator<wchar_t> const&)@GLIBCXX_3.4'\n",
|
| 306 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::istream& std::istream::_M_extract<double>(double&)@GLIBCXX_3.4.9'\n",
|
| 307 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_filebuf<char, std::char_traits<char> >::close()@GLIBCXX_3.4'\n",
|
| 308 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_fstream<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
|
| 309 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ifstream<char, std::char_traits<char> >::basic_ifstream(char const*, std::_Ios_Openmode)@GLIBCXX_3.4'\n",
|
| 310 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::append(std::string const&)@GLIBCXX_3.4'\n",
|
| 311 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `operator new(unsigned long)@GLIBCXX_3.4'\n",
|
| 312 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_istringstream<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
|
| 313 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for unsigned int@CXXABI_1.3'\n",
|
| 314 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::append(char const*)@GLIBCXX_3.4'\n",
|
| 315 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::domain_error@GLIBCXX_3.4'\n",
|
| 316 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::find(char, unsigned long) const@GLIBCXX_3.4'\n",
|
| 317 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::put(char)@GLIBCXX_3.4'\n",
|
| 318 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for int@CXXABI_1.3'\n",
|
| 319 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_bad_alloc()@GLIBCXX_3.4'\n",
|
| 320 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_thread_atexit@CXXABI_1.3.7'\n",
|
| 321 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Rb_tree_increment(std::_Rb_tree_node_base*)@GLIBCXX_3.4'\n",
|
| 322 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ifstream<char, std::char_traits<char> >::~basic_ifstream()@GLIBCXX_3.4'\n",
|
| 323 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ios_base::Init::Init()@GLIBCXX_3.4'\n",
|
| 324 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::condition_variable::condition_variable()@GLIBCXX_3.4.11'\n",
|
| 325 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_filebuf<char, std::char_traits<char> >::basic_filebuf()@GLIBCXX_3.4'\n",
|
| 326 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_istringstream<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
|
| 327 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::domain_error::~domain_error()@GLIBCXX_3.4'\n",
|
| 328 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::cerr@GLIBCXX_3.4'\n",
|
| 329 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::find(char const*, unsigned long, unsigned long) const@GLIBCXX_3.4'\n",
|
| 330 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_istringstream<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
|
| 331 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<char, std::char_traits<char>, std::allocator<char> >::basic_string(std::allocator<char> const&)@GLIBCXX_3.4'\n",
|
| 332 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_stringbuf<char, std::char_traits<char>, std::allocator<char> >::str() const@GLIBCXX_3.4'\n",
|
| 333 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::invalid_argument@GLIBCXX_3.4'\n",
|
| 334 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for void*@CXXABI_1.3'\n",
|
| 335 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::assign(std::string const&)@GLIBCXX_3.4'\n",
|
| 336 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ostringstream<char, std::char_traits<char>, std::allocator<char> >::~basic_ostringstream()@GLIBCXX_3.4'\n",
|
| 337 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Rb_tree_rebalance_for_erase(std::_Rb_tree_node_base*, std::_Rb_tree_node_base&)@GLIBCXX_3.4'\n",
|
| 338 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for unsigned long@CXXABI_1.3'\n",
|
| 339 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__detail::_List_node_base::_M_hook(std::__detail::_List_node_base*)@GLIBCXX_3.4.15'\n",
|
| 340 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__detail::_List_node_base::_M_unhook()@GLIBCXX_3.4.15'\n",
|
| 341 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_ostringstream<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
|
| 342 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_stringbuf<char, std::char_traits<char>, std::allocator<char> >::_M_sync(char*, unsigned long, unsigned long)@GLIBCXX_3.4'\n",
|
| 343 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_iostream<char, std::char_traits<char> >::~basic_iostream()@GLIBCXX_3.4'\n",
|
| 344 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::locale::locale(std::locale const&)@GLIBCXX_3.4'\n",
|
| 345 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_istringstream<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
|
| 346 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `log2f@GLIBC_2.2.5'\n",
|
| 347 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::operator<<(std::basic_streambuf<char, std::char_traits<char> >*)@GLIBCXX_3.4'\n",
|
| 348 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_streambuf<wchar_t, std::char_traits<wchar_t> >@GLIBCXX_3.4'\n",
|
| 349 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::exception::~exception()@GLIBCXX_3.4'\n",
|
| 350 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_Rep::_S_create(unsigned long, unsigned long, std::allocator<char> const&)@GLIBCXX_3.4'\n",
|
| 351 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__basic_file<char>::is_open() const@GLIBCXX_3.4'\n",
|
| 352 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_istringstream<char, std::char_traits<char>, std::allocator<char> >::~basic_istringstream()@GLIBCXX_3.4'\n",
|
| 353 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::swap(std::string&)@GLIBCXX_3.4'\n",
|
| 354 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for unsigned long*@CXXABI_1.3'\n",
|
| 355 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_ostringstream<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
|
| 356 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_streambuf<char, std::char_traits<char> >::basic_streambuf(std::basic_streambuf<char, std::char_traits<char> > const&)@GLIBCXX_3.4'\n",
|
| 357 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ios<char, std::char_traits<char> >::init(std::basic_streambuf<char, std::char_traits<char> >*)@GLIBCXX_3.4'\n",
|
| 358 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_bad_cast()@GLIBCXX_3.4'\n",
|
| 359 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ios<char, std::char_traits<char> >::clear(std::_Ios_Iostate)@GLIBCXX_3.4'\n",
|
| 360 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_streambuf<wchar_t, std::char_traits<wchar_t> >::operator=(std::basic_streambuf<wchar_t, std::char_traits<wchar_t> > const&)@GLIBCXX_3.4'\n",
|
| 361 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `operator delete(void*)@GLIBCXX_3.4'\n",
|
| 362 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::operator<<(int)@GLIBCXX_3.4'\n",
|
| 363 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_Rep::_S_empty_rep_storage@GLIBCXX_3.4'\n",
|
| 364 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_Rep::_M_destroy(std::allocator<char> const&)@GLIBCXX_3.4'\n",
|
| 365 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_iostream<wchar_t, std::char_traits<wchar_t> >::~basic_iostream()@GLIBCXX_3.4'\n",
|
| 366 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::runtime_error@GLIBCXX_3.4'\n",
|
| 367 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_ofstream<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
|
| 368 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Rb_tree_insert_and_rebalance(bool, std::_Rb_tree_node_base*, std::_Rb_tree_node_base*, std::_Rb_tree_node_base&)@GLIBCXX_3.4'\n",
|
| 369 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_stringstream<char, std::char_traits<char>, std::allocator<char> >::~basic_stringstream()@GLIBCXX_3.4'\n",
|
| 370 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_stringstream<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
|
| 371 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<long>(long)@GLIBCXX_3.4.9'\n",
|
| 372 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::istream::get()@GLIBCXX_3.4'\n",
|
| 373 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for unsigned long long@CXXABI_1.3'\n",
|
| 374 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ostream<char, std::char_traits<char> >& std::operator<< <std::char_traits<char> >(std::basic_ostream<char, std::char_traits<char> >&, char const*)@GLIBCXX_3.4'\n",
|
| 375 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::out_of_range::~out_of_range()@GLIBCXX_3.4'\n",
|
| 376 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::length_error::~length_error()@GLIBCXX_3.4'\n",
|
| 377 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ostream<char, std::char_traits<char> >& std::__ostream_insert<char, std::char_traits<char> >(std::basic_ostream<char, std::char_traits<char> >&, char const*, long)@GLIBCXX_3.4.9'\n",
|
| 378 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::invalid_argument::~invalid_argument()@GLIBCXX_3.4'\n",
|
| 379 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >::swap(std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >&)@GLIBCXX_3.4'\n",
|
| 380 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::cout@GLIBCXX_3.4'\n",
|
| 381 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<unsigned long long>(unsigned long long)@GLIBCXX_3.4.9'\n",
|
| 382 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for int*@CXXABI_1.3'\n",
|
| 383 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<void const*>(void const*)@GLIBCXX_3.4.9'\n",
|
| 384 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::underflow_error@GLIBCXX_3.4'\n",
|
| 385 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_streambuf<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
|
| 386 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for std::out_of_range@GLIBCXX_3.4'\n",
|
| 387 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_allocate_exception@CXXABI_1.3'\n",
|
| 388 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_ios<wchar_t, std::char_traits<wchar_t> >@GLIBCXX_3.4'\n",
|
| 389 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for void const*@CXXABI_1.3'\n",
|
| 390 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ios<wchar_t, std::char_traits<wchar_t> >::init(std::basic_streambuf<wchar_t, std::char_traits<wchar_t> >*)@GLIBCXX_3.4'\n",
|
| 391 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::reserve(unsigned long)@GLIBCXX_3.4'\n",
|
| 392 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_begin_catch@CXXABI_1.3'\n",
|
| 393 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for long@CXXABI_1.3'\n",
|
| 394 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >::_Rep::_S_empty_rep_storage@GLIBCXX_3.4'\n",
|
| 395 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_M_leak()@GLIBCXX_3.4'\n",
|
| 396 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_filebuf<char, std::char_traits<char> >::open(char const*, std::_Ios_Openmode)@GLIBCXX_3.4'\n",
|
| 397 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_stringbuf<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >::_M_sync(wchar_t*, unsigned long, unsigned long)@GLIBCXX_3.4'\n",
|
| 398 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::istream::getline(char*, long, char)@GLIBCXX_3.4'\n",
|
| 399 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_istream<char, std::char_traits<char> >& std::getline<char, std::char_traits<char>, std::allocator<char> >(std::basic_istream<char, std::char_traits<char> >&, std::basic_string<char, std::char_traits<char>, std::allocator<char> >&, char)@GLIBCXX_3.4'\n",
|
| 400 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_stringstream<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
|
| 401 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::condition_variable::~condition_variable()@GLIBCXX_3.4.11'\n",
|
| 402 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_stringbuf<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
|
| 403 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::insert(unsigned long, char const*, unsigned long)@GLIBCXX_3.4'\n",
|
| 404 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::assign(char const*, unsigned long)@GLIBCXX_3.4'\n",
|
| 405 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for unsigned char@CXXABI_1.3'\n",
|
| 406 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ios_base::ios_base()@GLIBCXX_3.4'\n",
|
| 407 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_out_of_range(char const*)@GLIBCXX_3.4'\n",
|
| 408 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::overflow_error::~overflow_error()@GLIBCXX_3.4'\n",
|
| 409 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_length_error(char const*)@GLIBCXX_3.4'\n",
|
| 410 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_system_error(int)@GLIBCXX_3.4.11'\n",
|
| 411 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ofstream<char, std::char_traits<char> >::close()@GLIBCXX_3.4'\n",
|
| 412 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<double>(double)@GLIBCXX_3.4.9'\n",
|
| 413 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_streambuf<char, std::char_traits<char> >::operator=(std::basic_streambuf<char, std::char_traits<char> > const&)@GLIBCXX_3.4'\n",
|
| 414 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for long long@CXXABI_1.3'\n",
|
| 415 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<char, std::char_traits<char>, std::allocator<char> >::basic_string(char const*, unsigned long, std::allocator<char> const&)@GLIBCXX_3.4'\n",
|
| 416 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ifstream<char, std::char_traits<char> >::close()@GLIBCXX_3.4'\n",
|
| 417 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_guard_release@CXXABI_1.3'\n",
|
| 418 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_throw@CXXABI_1.3'\n",
|
| 419 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::underflow_error::~underflow_error()@GLIBCXX_3.4'\n",
|
| 420 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Rb_tree_decrement(std::_Rb_tree_node_base*)@GLIBCXX_3.4'\n",
|
| 421 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::length_error@GLIBCXX_3.4'\n",
|
| 422 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_filebuf<char, std::char_traits<char> >::~basic_filebuf()@GLIBCXX_3.4'\n",
|
| 423 |
+
"collect2: error: ld returned 1 exit status\n"
|
| 424 |
+
]
|
| 425 |
+
},
|
| 426 |
+
{
|
| 427 |
+
"data": {
|
| 428 |
+
"text/html": [
|
| 429 |
+
"\n",
|
| 430 |
+
" <div>\n",
|
| 431 |
+
" \n",
|
| 432 |
+
" <progress value='3600' max='3600' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
|
| 433 |
+
" [3600/3600 09:30, Epoch 5/5]\n",
|
| 434 |
+
" </div>\n",
|
| 435 |
+
" <table border=\"1\" class=\"dataframe\">\n",
|
| 436 |
+
" <thead>\n",
|
| 437 |
+
" <tr style=\"text-align: left;\">\n",
|
| 438 |
+
" <th>Epoch</th>\n",
|
| 439 |
+
" <th>Training Loss</th>\n",
|
| 440 |
+
" <th>Validation Loss</th>\n",
|
| 441 |
+
" <th>Accuracy</th>\n",
|
| 442 |
+
" </tr>\n",
|
| 443 |
+
" </thead>\n",
|
| 444 |
+
" <tbody>\n",
|
| 445 |
+
" <tr>\n",
|
| 446 |
+
" <td>1</td>\n",
|
| 447 |
+
" <td>0.118400</td>\n",
|
| 448 |
+
" <td>0.017560</td>\n",
|
| 449 |
+
" <td>0.997500</td>\n",
|
| 450 |
+
" </tr>\n",
|
| 451 |
+
" <tr>\n",
|
| 452 |
+
" <td>2</td>\n",
|
| 453 |
+
" <td>0.021100</td>\n",
|
| 454 |
+
" <td>0.012475</td>\n",
|
| 455 |
+
" <td>0.997500</td>\n",
|
| 456 |
+
" </tr>\n",
|
| 457 |
+
" <tr>\n",
|
| 458 |
+
" <td>3</td>\n",
|
| 459 |
+
" <td>0.015300</td>\n",
|
| 460 |
+
" <td>0.008951</td>\n",
|
| 461 |
+
" <td>0.998125</td>\n",
|
| 462 |
+
" </tr>\n",
|
| 463 |
+
" <tr>\n",
|
| 464 |
+
" <td>4</td>\n",
|
| 465 |
+
" <td>0.007200</td>\n",
|
| 466 |
+
" <td>0.038701</td>\n",
|
| 467 |
+
" <td>0.994375</td>\n",
|
| 468 |
+
" </tr>\n",
|
| 469 |
+
" <tr>\n",
|
| 470 |
+
" <td>5</td>\n",
|
| 471 |
+
" <td>0.010600</td>\n",
|
| 472 |
+
" <td>0.021079</td>\n",
|
| 473 |
+
" <td>0.996875</td>\n",
|
| 474 |
+
" </tr>\n",
|
| 475 |
+
" </tbody>\n",
|
| 476 |
+
"</table><p>"
|
| 477 |
+
],
|
| 478 |
+
"text/plain": [
|
| 479 |
+
"<IPython.core.display.HTML object>"
|
| 480 |
+
]
|
| 481 |
+
},
|
| 482 |
+
"metadata": {},
|
| 483 |
+
"output_type": "display_data"
|
| 484 |
+
},
|
| 485 |
+
{
|
| 486 |
+
"data": {
|
| 487 |
+
"text/plain": [
|
| 488 |
+
"TrainOutput(global_step=3600, training_loss=0.03451399882634481, metrics={'train_runtime': 571.3197, 'train_samples_per_second': 126.024, 'train_steps_per_second': 6.301, 'total_flos': 9406683021312000.0, 'train_loss': 0.03451399882634481, 'epoch': 5.0})"
|
| 489 |
+
]
|
| 490 |
+
},
|
| 491 |
+
"execution_count": 6,
|
| 492 |
+
"metadata": {},
|
| 493 |
+
"output_type": "execute_result"
|
| 494 |
+
}
|
| 495 |
+
],
|
| 496 |
+
"source": [
|
| 497 |
+
"trainer.train() #模型训练"
|
| 498 |
+
]
|
| 499 |
+
},
|
| 500 |
+
{
|
| 501 |
+
"cell_type": "code",
|
| 502 |
+
"execution_count": 7,
|
| 503 |
+
"id": "59f75608-0756-4783-b8e0-96d0286c9502",
|
| 504 |
+
"metadata": {},
|
| 505 |
+
"outputs": [],
|
| 506 |
+
"source": [
|
| 507 |
+
"model.save_pretrained(\"gpt2_dna_ft_5\")"
|
| 508 |
+
]
|
| 509 |
+
},
|
| 510 |
+
{
|
| 511 |
+
"cell_type": "code",
|
| 512 |
+
"execution_count": 8,
|
| 513 |
+
"id": "2a621183-2fe9-404c-9680-2872713155a0",
|
| 514 |
+
"metadata": {},
|
| 515 |
+
"outputs": [
|
| 516 |
+
{
|
| 517 |
+
"data": {
|
| 518 |
+
"text/html": [],
|
| 519 |
+
"text/plain": [
|
| 520 |
+
"<IPython.core.display.HTML object>"
|
| 521 |
+
]
|
| 522 |
+
},
|
| 523 |
+
"metadata": {},
|
| 524 |
+
"output_type": "display_data"
|
| 525 |
+
},
|
| 526 |
+
{
|
| 527 |
+
"name": "stderr",
|
| 528 |
+
"output_type": "stream",
|
| 529 |
+
"text": [
|
| 530 |
+
"Using the latest cached version of the module from /root/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--glue/05234ba7acc44554edcca0978db5fa3bc600eeee66229abe79ff9887eacaf3ed (last modified on Sat Feb 1 18:33:18 2025) since it couldn't be found locally at evaluate-metric--glue, or remotely on the Hugging Face Hub.\n"
|
| 531 |
+
]
|
| 532 |
+
},
|
| 533 |
+
{
|
| 534 |
+
"name": "stdout",
|
| 535 |
+
"output_type": "stream",
|
| 536 |
+
"text": [
|
| 537 |
+
"{'accuracy': 0.998125, 'f1': 0.998104864181933}\n"
|
| 538 |
+
]
|
| 539 |
+
}
|
| 540 |
+
],
|
| 541 |
+
"source": [
|
| 542 |
+
"#模型测试,英文数据集\n",
|
| 543 |
+
"result = {}\n",
|
| 544 |
+
"predictions = trainer.predict(tokenized_datasets[\"test\"])\n",
|
| 545 |
+
"preds = np.argmax(predictions.predictions, axis=-1)\n",
|
| 546 |
+
"metric = evaluate.load(\"glue\", \"mrpc\")\n",
|
| 547 |
+
"ret = metric.compute(predictions=preds, references=predictions.label_ids)\n",
|
| 548 |
+
"print(ret)\n",
|
| 549 |
+
"result[\"en\"] = ret"
|
| 550 |
+
]
|
| 551 |
+
},
|
| 552 |
+
{
|
| 553 |
+
"cell_type": "markdown",
|
| 554 |
+
"id": "7c0d2d33-1eca-44df-9bee-a4a9ba60ce1b",
|
| 555 |
+
"metadata": {},
|
| 556 |
+
"source": [
|
| 557 |
+
"## 其他自然语言测试"
|
| 558 |
+
]
|
| 559 |
+
},
|
| 560 |
+
{
|
| 561 |
+
"cell_type": "code",
|
| 562 |
+
"execution_count": 9,
|
| 563 |
+
"id": "c149b679-1347-4c97-a726-28469b38e629",
|
| 564 |
+
"metadata": {},
|
| 565 |
+
"outputs": [
|
| 566 |
+
{
|
| 567 |
+
"name": "stderr",
|
| 568 |
+
"output_type": "stream",
|
| 569 |
+
"text": [
|
| 570 |
+
"Using the latest cached version of the dataset since paws-x couldn't be found on the Hugging Face Hub\n",
|
| 571 |
+
"Found the latest cached dataset configuration 'fr' at /root/.cache/huggingface/datasets/paws-x/fr/0.0.0/4cd8187c404bda33cb1f62b49b001115862acf37 (last modified on Sat Feb 8 09:41:42 2025).\n"
|
| 572 |
+
]
|
| 573 |
+
},
|
| 574 |
+
{
|
| 575 |
+
"data": {
|
| 576 |
+
"text/html": [],
|
| 577 |
+
"text/plain": [
|
| 578 |
+
"<IPython.core.display.HTML object>"
|
| 579 |
+
]
|
| 580 |
+
},
|
| 581 |
+
"metadata": {},
|
| 582 |
+
"output_type": "display_data"
|
| 583 |
+
},
|
| 584 |
+
{
|
| 585 |
+
"name": "stderr",
|
| 586 |
+
"output_type": "stream",
|
| 587 |
+
"text": [
|
| 588 |
+
"Using the latest cached version of the module from /root/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--glue/05234ba7acc44554edcca0978db5fa3bc600eeee66229abe79ff9887eacaf3ed (last modified on Sat Feb 1 18:33:18 2025) since it couldn't be found locally at evaluate-metric--glue, or remotely on the Hugging Face Hub.\n"
|
| 589 |
+
]
|
| 590 |
+
},
|
| 591 |
+
{
|
| 592 |
+
"name": "stdout",
|
| 593 |
+
"output_type": "stream",
|
| 594 |
+
"text": [
|
| 595 |
+
"{'accuracy': 0.535, 'f1': 0.3857331571994716}\n"
|
| 596 |
+
]
|
| 597 |
+
}
|
| 598 |
+
],
|
| 599 |
+
"source": [
|
| 600 |
+
"#模型测试,法文数据集\n",
|
| 601 |
+
"raw_datasets_fr = load_dataset('paws-x', 'fr') # 或者指定特定语言如 'zh' 表示中文,https://huggingface.co/datasets/google-research-datasets/paws-x\n",
|
| 602 |
+
"tokenized_datasets_fr = raw_datasets_fr.map(tokenize_function, batched=True)\n",
|
| 603 |
+
"\n",
|
| 604 |
+
"predictions = trainer.predict(tokenized_datasets_fr[\"test\"])\n",
|
| 605 |
+
"preds = np.argmax(predictions.predictions, axis=-1)\n",
|
| 606 |
+
"metric = evaluate.load(\"glue\", \"mrpc\")\n",
|
| 607 |
+
"ret = metric.compute(predictions=preds, references=predictions.label_ids)\n",
|
| 608 |
+
"print(ret)\n",
|
| 609 |
+
"result[\"fr\"] = ret"
|
| 610 |
+
]
|
| 611 |
+
},
|
| 612 |
+
{
|
| 613 |
+
"cell_type": "code",
|
| 614 |
+
"execution_count": 10,
|
| 615 |
+
"id": "158fe03f-4cf1-456a-b5b3-cf46e3ca47eb",
|
| 616 |
+
"metadata": {},
|
| 617 |
+
"outputs": [
|
| 618 |
+
{
|
| 619 |
+
"name": "stderr",
|
| 620 |
+
"output_type": "stream",
|
| 621 |
+
"text": [
|
| 622 |
+
"Using the latest cached version of the dataset since google-research-datasets/paws-x couldn't be found on the Hugging Face Hub\n",
|
| 623 |
+
"Found the latest cached dataset configuration 'de' at /root/.cache/huggingface/datasets/google-research-datasets___paws-x/de/0.0.0/4cd8187c404bda33cb1f62b49b001115862acf37 (last modified on Fri Feb 7 11:21:19 2025).\n"
|
| 624 |
+
]
|
| 625 |
+
},
|
| 626 |
+
{
|
| 627 |
+
"data": {
|
| 628 |
+
"text/html": [],
|
| 629 |
+
"text/plain": [
|
| 630 |
+
"<IPython.core.display.HTML object>"
|
| 631 |
+
]
|
| 632 |
+
},
|
| 633 |
+
"metadata": {},
|
| 634 |
+
"output_type": "display_data"
|
| 635 |
+
},
|
| 636 |
+
{
|
| 637 |
+
"name": "stderr",
|
| 638 |
+
"output_type": "stream",
|
| 639 |
+
"text": [
|
| 640 |
+
"Using the latest cached version of the module from /root/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--glue/05234ba7acc44554edcca0978db5fa3bc600eeee66229abe79ff9887eacaf3ed (last modified on Sat Feb 1 18:33:18 2025) since it couldn't be found locally at evaluate-metric--glue, or remotely on the Hugging Face Hub.\n"
|
| 641 |
+
]
|
| 642 |
+
},
|
| 643 |
+
{
|
| 644 |
+
"name": "stdout",
|
| 645 |
+
"output_type": "stream",
|
| 646 |
+
"text": [
|
| 647 |
+
"{'accuracy': 0.46, 'f1': 0.5690343176376695}\n"
|
| 648 |
+
]
|
| 649 |
+
}
|
| 650 |
+
],
|
| 651 |
+
"source": [
|
| 652 |
+
"#模型测试,德文数据集\n",
|
| 653 |
+
"raw_datasets_de = load_dataset('google-research-datasets/paws-x', 'de') # 或者指定特定语言如 'zh' 表示中文,https://huggingface.co/datasets/google-research-datasets/paws-x\n",
|
| 654 |
+
"tokenized_datasets_de = raw_datasets_de.map(tokenize_function, batched=True)\n",
|
| 655 |
+
"predictions = trainer.predict(tokenized_datasets_de[\"test\"])\n",
|
| 656 |
+
"preds = np.argmax(predictions.predictions, axis=-1)\n",
|
| 657 |
+
"metric = evaluate.load(\"glue\", \"mrpc\")\n",
|
| 658 |
+
"ret = metric.compute(predictions=preds, references=predictions.label_ids)\n",
|
| 659 |
+
"print(ret)\n",
|
| 660 |
+
"result[\"de\"] = ret"
|
| 661 |
+
]
|
| 662 |
+
},
|
| 663 |
+
{
|
| 664 |
+
"cell_type": "code",
|
| 665 |
+
"execution_count": 11,
|
| 666 |
+
"id": "51800395-5a52-4eab-90fc-acecd3b16a9d",
|
| 667 |
+
"metadata": {},
|
| 668 |
+
"outputs": [
|
| 669 |
+
{
|
| 670 |
+
"name": "stderr",
|
| 671 |
+
"output_type": "stream",
|
| 672 |
+
"text": [
|
| 673 |
+
"Using the latest cached version of the dataset since google-research-datasets/paws-x couldn't be found on the Hugging Face Hub\n",
|
| 674 |
+
"Found the latest cached dataset configuration 'zh' at /root/.cache/huggingface/datasets/google-research-datasets___paws-x/zh/0.0.0/4cd8187c404bda33cb1f62b49b001115862acf37 (last modified on Fri Feb 7 11:21:38 2025).\n"
|
| 675 |
+
]
|
| 676 |
+
},
|
| 677 |
+
{
|
| 678 |
+
"data": {
|
| 679 |
+
"text/html": [],
|
| 680 |
+
"text/plain": [
|
| 681 |
+
"<IPython.core.display.HTML object>"
|
| 682 |
+
]
|
| 683 |
+
},
|
| 684 |
+
"metadata": {},
|
| 685 |
+
"output_type": "display_data"
|
| 686 |
+
},
|
| 687 |
+
{
|
| 688 |
+
"name": "stderr",
|
| 689 |
+
"output_type": "stream",
|
| 690 |
+
"text": [
|
| 691 |
+
"Using the latest cached version of the module from /root/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--glue/05234ba7acc44554edcca0978db5fa3bc600eeee66229abe79ff9887eacaf3ed (last modified on Sat Feb 1 18:33:18 2025) since it couldn't be found locally at evaluate-metric--glue, or remotely on the Hugging Face Hub.\n"
|
| 692 |
+
]
|
| 693 |
+
},
|
| 694 |
+
{
|
| 695 |
+
"name": "stdout",
|
| 696 |
+
"output_type": "stream",
|
| 697 |
+
"text": [
|
| 698 |
+
"{'accuracy': 0.539, 'f1': 0.1811722912966252}\n"
|
| 699 |
+
]
|
| 700 |
+
}
|
| 701 |
+
],
|
| 702 |
+
"source": [
|
| 703 |
+
"#模型测试,中文数据集\n",
|
| 704 |
+
"raw_datasets_zh = load_dataset('google-research-datasets/paws-x', 'zh') # 或者指定特定语言如 'zh' 表示中文,https://huggingface.co/datasets/google-research-datasets/paws-x\n",
|
| 705 |
+
"tokenized_datasets_zh = raw_datasets_zh.map(tokenize_function, batched=True)\n",
|
| 706 |
+
"\n",
|
| 707 |
+
"predictions = trainer.predict(tokenized_datasets_zh[\"test\"])\n",
|
| 708 |
+
"preds = np.argmax(predictions.predictions, axis=-1)\n",
|
| 709 |
+
"metric = evaluate.load(\"glue\", \"mrpc\")\n",
|
| 710 |
+
"ret = metric.compute(predictions=preds, references=predictions.label_ids)\n",
|
| 711 |
+
"\n",
|
| 712 |
+
"print(ret)\n",
|
| 713 |
+
"result[\"zh\"] = ret"
|
| 714 |
+
]
|
| 715 |
+
},
|
| 716 |
+
{
|
| 717 |
+
"cell_type": "code",
|
| 718 |
+
"execution_count": 12,
|
| 719 |
+
"id": "ff9680bc-d7f8-4cb0-a1da-da056016fd5d",
|
| 720 |
+
"metadata": {},
|
| 721 |
+
"outputs": [
|
| 722 |
+
{
|
| 723 |
+
"data": {
|
| 724 |
+
"text/plain": [
|
| 725 |
+
"{'id': 111,\n",
|
| 726 |
+
" 'sentence1': '这在澳大利亚地区和澳大利亚南部更为常见,但在澳大利亚城市已经普遍存在了数十年。',\n",
|
| 727 |
+
" 'sentence2': '这种情况在澳大利亚城市更为常见,但几十年来一直在澳大利亚和澳大利亚南部地区普遍使用。',\n",
|
| 728 |
+
" 'label': 0}"
|
| 729 |
+
]
|
| 730 |
+
},
|
| 731 |
+
"execution_count": 12,
|
| 732 |
+
"metadata": {},
|
| 733 |
+
"output_type": "execute_result"
|
| 734 |
+
}
|
| 735 |
+
],
|
| 736 |
+
"source": [
|
| 737 |
+
"raw_datasets_zh[\"train\"][110]"
|
| 738 |
+
]
|
| 739 |
+
},
|
| 740 |
+
{
|
| 741 |
+
"cell_type": "markdown",
|
| 742 |
+
"id": "12ab9c6f-2b9d-432e-87be-85c22a46d89b",
|
| 743 |
+
"metadata": {},
|
| 744 |
+
"source": [
|
| 745 |
+
"## 生物序列测试"
|
| 746 |
+
]
|
| 747 |
+
},
|
| 748 |
+
{
|
| 749 |
+
"cell_type": "markdown",
|
| 750 |
+
"id": "cf9e0f8c-f74e-4eb6-83bc-6b83220e6122",
|
| 751 |
+
"metadata": {},
|
| 752 |
+
"source": [
|
| 753 |
+
"### DNA 150bp simple"
|
| 754 |
+
]
|
| 755 |
+
},
|
| 756 |
+
{
|
| 757 |
+
"cell_type": "code",
|
| 758 |
+
"execution_count": 13,
|
| 759 |
+
"id": "29ca633a-f7fa-49fe-bb8d-842dd573a0fc",
|
| 760 |
+
"metadata": {},
|
| 761 |
+
"outputs": [
|
| 762 |
+
{
|
| 763 |
+
"name": "stderr",
|
| 764 |
+
"output_type": "stream",
|
| 765 |
+
"text": [
|
| 766 |
+
"Using the latest cached version of the dataset since dnagpt/gene_lan_transfer couldn't be found on the Hugging Face Hub\n",
|
| 767 |
+
"Found the latest cached dataset configuration 'dna_sim_pair_simple_150bp' at /root/.cache/huggingface/datasets/dnagpt___gene_lan_transfer/dna_sim_pair_simple_150bp/0.0.0/fc103580e7cda0d9bc41947f4058887fdc81188c (last modified on Fri Feb 7 11:21:53 2025).\n"
|
| 768 |
+
]
|
| 769 |
+
},
|
| 770 |
+
{
|
| 771 |
+
"data": {
|
| 772 |
+
"text/html": [],
|
| 773 |
+
"text/plain": [
|
| 774 |
+
"<IPython.core.display.HTML object>"
|
| 775 |
+
]
|
| 776 |
+
},
|
| 777 |
+
"metadata": {},
|
| 778 |
+
"output_type": "display_data"
|
| 779 |
+
},
|
| 780 |
+
{
|
| 781 |
+
"name": "stderr",
|
| 782 |
+
"output_type": "stream",
|
| 783 |
+
"text": [
|
| 784 |
+
"\n",
|
| 785 |
+
"KeyboardInterrupt\n",
|
| 786 |
+
"\n"
|
| 787 |
+
]
|
| 788 |
+
}
|
| 789 |
+
],
|
| 790 |
+
"source": [
|
| 791 |
+
"#模型测试 dna数据集,150 bp长度 简单版本\n",
|
| 792 |
+
"raw_datasets_dna =load_dataset('dnagpt/gene_lan_transfer', 'dna_sim_pair_simple_150bp')['train'].train_test_split(test_size=0.2) #默认已经shuffle\n",
|
| 793 |
+
"tokenized_datasets_dna = raw_datasets_dna.map(tokenize_function, batched=True)\n",
|
| 794 |
+
"predictions = trainer.predict(tokenized_datasets_dna[\"test\"])\n",
|
| 795 |
+
"preds = np.argmax(predictions.predictions, axis=-1)\n",
|
| 796 |
+
"metric = evaluate.load(\"glue\", \"mrpc\")\n",
|
| 797 |
+
"ret = metric.compute(predictions=preds, references=predictions.label_ids)\n",
|
| 798 |
+
"\n",
|
| 799 |
+
"print(ret)\n",
|
| 800 |
+
"result[\"dna_sim_pair_simple_150bp\"] = ret"
|
| 801 |
+
]
|
| 802 |
+
},
|
| 803 |
+
{
|
| 804 |
+
"cell_type": "code",
|
| 805 |
+
"execution_count": null,
|
| 806 |
+
"id": "2da2c6b8-409f-462a-a0de-178c7d66b40b",
|
| 807 |
+
"metadata": {},
|
| 808 |
+
"outputs": [],
|
| 809 |
+
"source": [
|
| 810 |
+
"from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay\n",
|
| 811 |
+
"import matplotlib.pyplot as plt\n",
|
| 812 |
+
"\n",
|
| 813 |
+
"# 假设 predictions.label_ids 是真实的标签,preds 是模型的预测\n",
|
| 814 |
+
"cm = confusion_matrix(predictions.label_ids, preds)\n",
|
| 815 |
+
"\n",
|
| 816 |
+
"# 可视化混淆矩阵\n",
|
| 817 |
+
"disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Class 0', 'Class 1'])\n",
|
| 818 |
+
"disp.plot(cmap=plt.cm.Blues)\n",
|
| 819 |
+
"plt.title('Confusion Matrix')\n",
|
| 820 |
+
"plt.show()"
|
| 821 |
+
]
|
| 822 |
+
},
|
| 823 |
+
{
|
| 824 |
+
"cell_type": "markdown",
|
| 825 |
+
"id": "fb436436-20f3-4bec-8fb2-5ab58d837f42",
|
| 826 |
+
"metadata": {},
|
| 827 |
+
"source": [
|
| 828 |
+
"### DNA 150 bp"
|
| 829 |
+
]
|
| 830 |
+
},
|
| 831 |
+
{
|
| 832 |
+
"cell_type": "code",
|
| 833 |
+
"execution_count": null,
|
| 834 |
+
"id": "be3aa946-c697-4392-80ea-a2d31710ca5c",
|
| 835 |
+
"metadata": {},
|
| 836 |
+
"outputs": [],
|
| 837 |
+
"source": [
|
| 838 |
+
"#模型测试 dna数据集,150长度,复杂版本 不相似\n",
|
| 839 |
+
"raw_datasets_dna = load_dataset('dnagpt/gene_lan_transfer', 'dna_sim_pair_150bp')['train'].train_test_split(test_size=0.2) #默认已经shuffle\n",
|
| 840 |
+
"tokenized_datasets_dna= raw_datasets_dna.map(tokenize_function, batched=True)\n",
|
| 841 |
+
"\n",
|
| 842 |
+
"predictions = trainer.predict(tokenized_datasets_dna[\"test\"])\n",
|
| 843 |
+
"preds = np.argmax(predictions.predictions, axis=-1)\n",
|
| 844 |
+
"metric = evaluate.load(\"glue\", \"mrpc\")\n",
|
| 845 |
+
"ret = metric.compute(predictions=preds, references=predictions.label_ids)\n",
|
| 846 |
+
"\n",
|
| 847 |
+
"print(ret)\n",
|
| 848 |
+
"result[\"dna_sim_pair_150bp\"] = ret"
|
| 849 |
+
]
|
| 850 |
+
},
|
| 851 |
+
{
|
| 852 |
+
"cell_type": "code",
|
| 853 |
+
"execution_count": null,
|
| 854 |
+
"id": "fa2918b9-6ef2-4f82-8262-88372c624a90",
|
| 855 |
+
"metadata": {},
|
| 856 |
+
"outputs": [],
|
| 857 |
+
"source": [
|
| 858 |
+
"from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay\n",
|
| 859 |
+
"import matplotlib.pyplot as plt\n",
|
| 860 |
+
"\n",
|
| 861 |
+
"# 假设 predictions.label_ids 是真实的标签,preds 是模型的预测\n",
|
| 862 |
+
"cm = confusion_matrix(predictions.label_ids, preds)\n",
|
| 863 |
+
"\n",
|
| 864 |
+
"# 可视化混淆矩阵\n",
|
| 865 |
+
"disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Class 0', 'Class 1'])\n",
|
| 866 |
+
"disp.plot(cmap=plt.cm.Blues)\n",
|
| 867 |
+
"plt.title('Confusion Matrix')\n",
|
| 868 |
+
"plt.show()"
|
| 869 |
+
]
|
| 870 |
+
},
|
| 871 |
+
{
|
| 872 |
+
"cell_type": "markdown",
|
| 873 |
+
"id": "c32158a3-6fd0-4ef8-9847-d50644ff9257",
|
| 874 |
+
"metadata": {},
|
| 875 |
+
"source": [
|
| 876 |
+
"### DNA 50bp"
|
| 877 |
+
]
|
| 878 |
+
},
|
| 879 |
+
{
|
| 880 |
+
"cell_type": "code",
|
| 881 |
+
"execution_count": null,
|
| 882 |
+
"id": "0f93b4e6-6243-4ebd-8e34-5f8b8a8a24b6",
|
| 883 |
+
"metadata": {},
|
| 884 |
+
"outputs": [],
|
| 885 |
+
"source": [
|
| 886 |
+
"#模型测试 dna数据集,50长度,复杂版本 不相似\n",
|
| 887 |
+
"raw_datasets_dna = load_dataset('dnagpt/gene_lan_transfer', 'dna_sim_pair_50bp')['train'].train_test_split(test_size=0.1) #默认已经shuffle\n",
|
| 888 |
+
"tokenized_datasets_dna = raw_datasets_dna.map(tokenize_function, batched=True)\n",
|
| 889 |
+
"predictions = trainer.predict(tokenized_datasets_dna[\"test\"])\n",
|
| 890 |
+
"preds = np.argmax(predictions.predictions, axis=-1)\n",
|
| 891 |
+
"metric = evaluate.load(\"glue\", \"mrpc\")\n",
|
| 892 |
+
"ret = metric.compute(predictions=preds, references=predictions.label_ids)\n",
|
| 893 |
+
"\n",
|
| 894 |
+
"print(ret)\n",
|
| 895 |
+
"result[\"dna_sim_pair_50bp\"] = ret"
|
| 896 |
+
]
|
| 897 |
+
},
|
| 898 |
+
{
|
| 899 |
+
"cell_type": "code",
|
| 900 |
+
"execution_count": null,
|
| 901 |
+
"id": "27ab77c8-a1b4-40d3-b342-debc04e5517a",
|
| 902 |
+
"metadata": {},
|
| 903 |
+
"outputs": [],
|
| 904 |
+
"source": [
|
| 905 |
+
"from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay\n",
|
| 906 |
+
"import matplotlib.pyplot as plt\n",
|
| 907 |
+
"\n",
|
| 908 |
+
"# 假设 predictions.label_ids 是真实的标签,preds 是模型的预测\n",
|
| 909 |
+
"cm = confusion_matrix(predictions.label_ids, preds)\n",
|
| 910 |
+
"\n",
|
| 911 |
+
"# 可视化混淆矩阵\n",
|
| 912 |
+
"disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Class 0', 'Class 1'])\n",
|
| 913 |
+
"disp.plot(cmap=plt.cm.Blues)\n",
|
| 914 |
+
"plt.title('Confusion Matrix')\n",
|
| 915 |
+
"plt.show()"
|
| 916 |
+
]
|
| 917 |
+
},
|
| 918 |
+
{
|
| 919 |
+
"cell_type": "markdown",
|
| 920 |
+
"id": "613be8ca-afcf-4115-8f1e-28c4b21e6e99",
|
| 921 |
+
"metadata": {},
|
| 922 |
+
"source": [
|
| 923 |
+
"### protein 150bp 50 len"
|
| 924 |
+
]
|
| 925 |
+
},
|
| 926 |
+
{
|
| 927 |
+
"cell_type": "code",
|
| 928 |
+
"execution_count": null,
|
| 929 |
+
"id": "3785407d-de4f-42a4-b755-8365255bf72b",
|
| 930 |
+
"metadata": {},
|
| 931 |
+
"outputs": [],
|
| 932 |
+
"source": [
|
| 933 |
+
"#模型测试 蛋白质数据集,50长度/150bp,复杂版本 不相似\n",
|
| 934 |
+
"\n",
|
| 935 |
+
"raw_datasets_dna_protein = load_dataset('dnagpt/gene_lan_transfer', 'protein_sim_pair_150bp')['train'].train_test_split(test_size=0.1) #默认已经shuffle\n",
|
| 936 |
+
"tokenized_datasets_dna_protein = raw_datasets_dna_protein.map(tokenize_function, batched=True)\n",
|
| 937 |
+
"predictions = trainer.predict(tokenized_datasets_dna_protein[\"test\"])\n",
|
| 938 |
+
"preds = np.argmax(predictions.predictions, axis=-1)\n",
|
| 939 |
+
"metric = evaluate.load(\"glue\", \"mrpc\")\n",
|
| 940 |
+
"ret = metric.compute(predictions=preds, references=predictions.label_ids)\n",
|
| 941 |
+
"\n",
|
| 942 |
+
"\n",
|
| 943 |
+
"print(ret)\n",
|
| 944 |
+
"result[\"protein_sim_pair_150bp\"] = ret"
|
| 945 |
+
]
|
| 946 |
+
},
|
| 947 |
+
{
|
| 948 |
+
"cell_type": "markdown",
|
| 949 |
+
"id": "55ed3d1d-38e7-430a-bec4-bcc59e9dd288",
|
| 950 |
+
"metadata": {},
|
| 951 |
+
"source": [
|
| 952 |
+
"### protein 450bp 150 len"
|
| 953 |
+
]
|
| 954 |
+
},
|
| 955 |
+
{
|
| 956 |
+
"cell_type": "code",
|
| 957 |
+
"execution_count": null,
|
| 958 |
+
"id": "c268e40d-c973-4166-a9ff-a5ba5478e125",
|
| 959 |
+
"metadata": {},
|
| 960 |
+
"outputs": [],
|
| 961 |
+
"source": [
|
| 962 |
+
"#模型测试 蛋白质数据集,150长度/450bp,复杂版本 不相似\n",
|
| 963 |
+
"\n",
|
| 964 |
+
"raw_datasets_dna_protein = load_dataset('dnagpt/gene_lan_transfer', 'protein_sim_pair_450bp')['train'].train_test_split(test_size=0.1) #默认已经shuffle\n",
|
| 965 |
+
"tokenized_datasets_dna_protein = raw_datasets_dna_protein.map(tokenize_function, batched=True)\n",
|
| 966 |
+
"predictions = trainer.predict(tokenized_datasets_dna_protein[\"test\"])\n",
|
| 967 |
+
"preds = np.argmax(predictions.predictions, axis=-1)\n",
|
| 968 |
+
"metric = evaluate.load(\"glue\", \"mrpc\")\n",
|
| 969 |
+
"ret = metric.compute(predictions=preds, references=predictions.label_ids)\n",
|
| 970 |
+
"\n",
|
| 971 |
+
"print(ret)\n",
|
| 972 |
+
"result[\"protein_sim_pair_450bp\"] = ret"
|
| 973 |
+
]
|
| 974 |
+
},
|
| 975 |
+
{
|
| 976 |
+
"cell_type": "markdown",
|
| 977 |
+
"id": "58c56540-ca70-4f97-9533-6144cbede218",
|
| 978 |
+
"metadata": {},
|
| 979 |
+
"source": [
|
| 980 |
+
"### dna-protein"
|
| 981 |
+
]
|
| 982 |
+
},
|
| 983 |
+
{
|
| 984 |
+
"cell_type": "code",
|
| 985 |
+
"execution_count": null,
|
| 986 |
+
"id": "b5b91fdc-3003-4d46-bd0c-6c14f92395e9",
|
| 987 |
+
"metadata": {},
|
| 988 |
+
"outputs": [],
|
| 989 |
+
"source": [
|
| 990 |
+
"#模型测试 dna-蛋白质配对数据集(dna_protein_pair)\n",
|
| 991 |
+
"raw_datasets_dna_protein = load_dataset('dnagpt/gene_lan_transfer', 'dna_protein_pair')['train'].train_test_split(test_size=0.1) #默认已经shuffle\n",
|
| 992 |
+
"\n",
|
| 993 |
+
"# 定义翻转标签的函数\n",
|
| 994 |
+
"def flip_labels(example):\n",
|
| 995 |
+
" # 原计划截取 sentence1 和 sentence2 的前 50 个字符(如果dna序列过长,bert分词会产生错误,只生成unk一个token);当前未做截取,原样保留\n",
|
| 996 |
+
" example[\"sentence1\"] = example[\"sentence1\"]\n",
|
| 997 |
+
" example[\"sentence2\"] = example[\"sentence2\"]\n",
|
| 998 |
+
" #example['label'] = 1 - example['label']\n",
|
| 999 |
+
" return example\n",
|
| 1000 |
+
"\n",
|
| 1001 |
+
"# 应用翻转标签函数\n",
|
| 1002 |
+
"flipped_datasets_dna_protein = raw_datasets_dna_protein.map(flip_labels, batched=False)\n",
|
| 1003 |
+
"\n",
|
| 1004 |
+
"tokenized_datasets_dna_protein = flipped_datasets_dna_protein.map(tokenize_function, batched=True)\n",
|
| 1005 |
+
"predictions = trainer.predict(tokenized_datasets_dna_protein[\"test\"])\n",
|
| 1006 |
+
"preds = np.argmax(predictions.predictions, axis=-1)\n",
|
| 1007 |
+
"metric = evaluate.load(\"glue\", \"mrpc\")\n",
|
| 1008 |
+
"ret = metric.compute(predictions=preds, references=predictions.label_ids)\n",
|
| 1009 |
+
"print(ret)\n",
|
| 1010 |
+
"result[\"dna_protein_pair\"] = ret"
|
| 1011 |
+
]
|
| 1012 |
+
},
|
| 1013 |
+
{
|
| 1014 |
+
"cell_type": "markdown",
|
| 1015 |
+
"id": "6ba865ea-5dfa-4917-80ec-a6e2109fb92a",
|
| 1016 |
+
"metadata": {},
|
| 1017 |
+
"source": [
|
| 1018 |
+
"## dna protein 2"
|
| 1019 |
+
]
|
| 1020 |
+
},
|
| 1021 |
+
{
|
| 1022 |
+
"cell_type": "code",
|
| 1023 |
+
"execution_count": null,
|
| 1024 |
+
"id": "e854a058-e902-4f63-b44a-cfb6633fdc3f",
|
| 1025 |
+
"metadata": {},
|
| 1026 |
+
"outputs": [],
|
| 1027 |
+
"source": [
|
| 1028 |
+
"#模型测试 dna-蛋白质配对数据集(dna_protein_pair_rand)\n",
|
| 1029 |
+
"raw_datasets_dna_protein = load_dataset('dnagpt/gene_lan_transfer', 'dna_protein_pair_rand')['train'].train_test_split(test_size=0.1) #默认已经shuffle\n",
|
| 1030 |
+
"\n",
|
| 1031 |
+
"# 定义翻转标签的函数\n",
|
| 1032 |
+
"def flip_labels(example):\n",
|
| 1033 |
+
" # 原计划截取 sentence1 和 sentence2 的前 50 个字符(如果dna序列过长,bert分词会产生错误,只生成unk一个token);当前未做截取,原样保留\n",
|
| 1034 |
+
" example[\"sentence1\"] = example[\"sentence1\"]\n",
|
| 1035 |
+
" example[\"sentence2\"] = example[\"sentence2\"]\n",
|
| 1036 |
+
" #example['label'] = 1 - example['label']\n",
|
| 1037 |
+
" return example\n",
|
| 1038 |
+
"\n",
|
| 1039 |
+
"# 应用翻转标签函数\n",
|
| 1040 |
+
"flipped_datasets_dna_protein = raw_datasets_dna_protein.map(flip_labels, batched=False)\n",
|
| 1041 |
+
"\n",
|
| 1042 |
+
"tokenized_datasets_dna_protein = flipped_datasets_dna_protein.map(tokenize_function, batched=True)\n",
|
| 1043 |
+
"predictions = trainer.predict(tokenized_datasets_dna_protein[\"test\"])\n",
|
| 1044 |
+
"preds = np.argmax(predictions.predictions, axis=-1)\n",
|
| 1045 |
+
"metric = evaluate.load(\"glue\", \"mrpc\")\n",
|
| 1046 |
+
"ret = metric.compute(predictions=preds, references=predictions.label_ids)\n",
|
| 1047 |
+
"print(ret)\n",
|
| 1048 |
+
"result[\"dna_protein_pair_rand\"] = ret"
|
| 1049 |
+
]
|
| 1050 |
+
},
|
| 1051 |
+
{
|
| 1052 |
+
"cell_type": "code",
|
| 1053 |
+
"execution_count": null,
|
| 1054 |
+
"id": "3b3b3504-ba21-429d-9129-3b2a2409eb3f",
|
| 1055 |
+
"metadata": {},
|
| 1056 |
+
"outputs": [],
|
| 1057 |
+
"source": [
|
| 1058 |
+
"from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay\n",
|
| 1059 |
+
"import matplotlib.pyplot as plt\n",
|
| 1060 |
+
"\n",
|
| 1061 |
+
"# 假设 predictions.label_ids 是真实的标签,preds 是模型的预测\n",
|
| 1062 |
+
"cm = confusion_matrix(predictions.label_ids, preds)\n",
|
| 1063 |
+
"\n",
|
| 1064 |
+
"# 可视化混淆矩阵\n",
|
| 1065 |
+
"disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Class 0', 'Class 1'])\n",
|
| 1066 |
+
"disp.plot(cmap=plt.cm.Blues)\n",
|
| 1067 |
+
"plt.title('Confusion Matrix')\n",
|
| 1068 |
+
"plt.show()"
|
| 1069 |
+
]
|
| 1070 |
+
},
|
| 1071 |
+
{
|
| 1072 |
+
"cell_type": "code",
|
| 1073 |
+
"execution_count": null,
|
| 1074 |
+
"id": "3b6e4efe-7158-4ea3-8700-447ea56f0f27",
|
| 1075 |
+
"metadata": {},
|
| 1076 |
+
"outputs": [],
|
| 1077 |
+
"source": [
|
| 1078 |
+
"result"
|
| 1079 |
+
]
|
| 1080 |
+
},
|
| 1081 |
+
{
|
| 1082 |
+
"cell_type": "code",
|
| 1083 |
+
"execution_count": null,
|
| 1084 |
+
"id": "c38fe4c3-7f1a-4544-8451-ed4ac29b80f0",
|
| 1085 |
+
"metadata": {},
|
| 1086 |
+
"outputs": [],
|
| 1087 |
+
"source": []
|
| 1088 |
+
}
|
| 1089 |
+
],
|
| 1090 |
+
"metadata": {
|
| 1091 |
+
"kernelspec": {
|
| 1092 |
+
"display_name": "Python 3 (ipykernel)",
|
| 1093 |
+
"language": "python",
|
| 1094 |
+
"name": "python3"
|
| 1095 |
+
},
|
| 1096 |
+
"language_info": {
|
| 1097 |
+
"codemirror_mode": {
|
| 1098 |
+
"name": "ipython",
|
| 1099 |
+
"version": 3
|
| 1100 |
+
},
|
| 1101 |
+
"file_extension": ".py",
|
| 1102 |
+
"mimetype": "text/x-python",
|
| 1103 |
+
"name": "python",
|
| 1104 |
+
"nbconvert_exporter": "python",
|
| 1105 |
+
"pygments_lexer": "ipython3",
|
| 1106 |
+
"version": "3.12.3"
|
| 1107 |
+
}
|
| 1108 |
+
},
|
| 1109 |
+
"nbformat": 4,
|
| 1110 |
+
"nbformat_minor": 5
|
| 1111 |
+
}
|
best_model/gpt2_gene_multiv1_ft_en_test_others_best.ipynb
ADDED
|
@@ -0,0 +1,771 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 1,
|
| 6 |
+
"id": "dcaea864-b707-4651-965d-b8eefa1b0e07",
|
| 7 |
+
"metadata": {},
|
| 8 |
+
"outputs": [
|
| 9 |
+
{
|
| 10 |
+
"name": "stdout",
|
| 11 |
+
"output_type": "stream",
|
| 12 |
+
"text": [
|
| 13 |
+
"https://hf-mirror.com\n"
|
| 14 |
+
]
|
| 15 |
+
}
|
| 16 |
+
],
|
| 17 |
+
"source": [
|
| 18 |
+
"import os\n",
|
| 19 |
+
"\n",
|
| 20 |
+
"# 设置环境变量\n",
|
| 21 |
+
"os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'\n",
|
| 22 |
+
"\n",
|
| 23 |
+
"# 打印环境变量以确认设置成功\n",
|
| 24 |
+
"print(os.environ.get('HF_ENDPOINT'))\n",
|
| 25 |
+
"\n",
|
| 26 |
+
"# import subprocess\n",
|
| 27 |
+
"# import os\n",
|
| 28 |
+
"\n",
|
| 29 |
+
"# result = subprocess.run('bash -c \"source /etc/network_turbo && env | grep proxy\"', shell=True, capture_output=True, text=True)\n",
|
| 30 |
+
"# output = result.stdout\n",
|
| 31 |
+
"# for line in output.splitlines():\n",
|
| 32 |
+
"# if '=' in line:\n",
|
| 33 |
+
"# var, value = line.split('=', 1)\n",
|
| 34 |
+
"# os.environ[var] = value"
|
| 35 |
+
]
|
| 36 |
+
},
|
| 37 |
+
{
|
| 38 |
+
"cell_type": "code",
|
| 39 |
+
"execution_count": 2,
|
| 40 |
+
"id": "73cc9d73-f0cb-4392-8280-782275dc7036",
|
| 41 |
+
"metadata": {},
|
| 42 |
+
"outputs": [
|
| 43 |
+
{
|
| 44 |
+
"name": "stderr",
|
| 45 |
+
"output_type": "stream",
|
| 46 |
+
"text": [
|
| 47 |
+
"2025-02-09 19:41:53.922324: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
|
| 48 |
+
"2025-02-09 19:41:53.935891: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
|
| 49 |
+
"2025-02-09 19:41:53.951620: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
|
| 50 |
+
"2025-02-09 19:41:53.956337: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
|
| 51 |
+
"2025-02-09 19:41:53.968352: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
|
| 52 |
+
"To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
|
| 53 |
+
"2025-02-09 19:41:54.782860: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n"
|
| 54 |
+
]
|
| 55 |
+
}
|
| 56 |
+
],
|
| 57 |
+
"source": [
|
| 58 |
+
"from datasets import load_dataset\n",
|
| 59 |
+
"from transformers import AutoTokenizer, DataCollatorWithPadding\n",
|
| 60 |
+
"from transformers import Trainer\n",
|
| 61 |
+
"import evaluate\n",
|
| 62 |
+
"import numpy as np\n",
|
| 63 |
+
"from transformers import TrainingArguments\n",
|
| 64 |
+
"from transformers import AutoModelForSequenceClassification\n",
|
| 65 |
+
"import json\n",
|
| 66 |
+
"from transformers import set_seed\n",
|
| 67 |
+
"import random\n",
|
| 68 |
+
"import numpy as np\n",
|
| 69 |
+
"import torch\n",
|
| 70 |
+
"from tqdm import tqdm"
|
| 71 |
+
]
|
| 72 |
+
},
|
| 73 |
+
{
|
| 74 |
+
"cell_type": "code",
|
| 75 |
+
"execution_count": 3,
|
| 76 |
+
"id": "8255851b-d2c4-437c-a306-fcb00ccab684",
|
| 77 |
+
"metadata": {},
|
| 78 |
+
"outputs": [],
|
| 79 |
+
"source": [
|
| 80 |
+
"# seed = 42\n",
|
| 81 |
+
"# random.seed(seed)\n",
|
| 82 |
+
"# np.random.seed(seed)\n",
|
| 83 |
+
"# torch.manual_seed(seed)\n",
|
| 84 |
+
"# torch.cuda.manual_seed_all(seed)\n",
|
| 85 |
+
"\n",
|
| 86 |
+
"\n",
|
| 87 |
+
"# 固定随机种子(动态生成的写法已注释掉)\n",
|
| 88 |
+
"import random\n",
|
| 89 |
+
"#seed = random.randint(0, 10000)\n",
|
| 90 |
+
"seed = 7967\n",
|
| 91 |
+
"#print(f\"Generated seed: {seed}\")\n",
|
| 92 |
+
"set_seed(seed)\n",
|
| 93 |
+
"result = {}\n",
|
| 94 |
+
"result[\"seed\"] = seed"
|
| 95 |
+
]
|
| 96 |
+
},
|
| 97 |
+
{
|
| 98 |
+
"cell_type": "code",
|
| 99 |
+
"execution_count": 4,
|
| 100 |
+
"id": "2a744bd7-b674-4048-b303-d4f85bdd694d",
|
| 101 |
+
"metadata": {},
|
| 102 |
+
"outputs": [],
|
| 103 |
+
"source": [
|
| 104 |
+
"# 从 Hugging Face Hub 加载 PAWS-X 英文数据集\n",
|
| 105 |
+
"raw_datasets = load_dataset('google-research-datasets/paws-x', 'en') # 或者指定特定语言如 'zh' 表示中文,https://huggingface.co/datasets/google-research-datasets/paws-x\n",
|
| 106 |
+
"\n",
|
| 107 |
+
"#分词器\n",
|
| 108 |
+
"tokenizer = AutoTokenizer.from_pretrained(\"dnagpt/gpt2_gene_multi_v1\")\n",
|
| 109 |
+
"tokenizer.pad_token = tokenizer.eos_token\n",
|
| 110 |
+
"\n",
|
| 111 |
+
"\n",
|
| 112 |
+
"#分词函数\n",
|
| 113 |
+
"def tokenize_function(example):\n",
|
| 114 |
+
" return tokenizer(example[\"sentence1\"], example[\"sentence2\"], truncation=True,max_length=256, padding=\"max_length\")\n",
|
| 115 |
+
"\n",
|
| 116 |
+
"#构建分词后的数据集\n",
|
| 117 |
+
"tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)\n",
|
| 118 |
+
"\n",
|
| 119 |
+
"#训练数据构建\n",
|
| 120 |
+
"data_collator = DataCollatorWithPadding(tokenizer=tokenizer)"
|
| 121 |
+
]
|
| 122 |
+
},
|
| 123 |
+
{
|
| 124 |
+
"cell_type": "code",
|
| 125 |
+
"execution_count": 5,
|
| 126 |
+
"id": "b2d3081f-1df2-45fa-a3f6-1516c19d7318",
|
| 127 |
+
"metadata": {},
|
| 128 |
+
"outputs": [
|
| 129 |
+
{
|
| 130 |
+
"name": "stderr",
|
| 131 |
+
"output_type": "stream",
|
| 132 |
+
"text": [
|
| 133 |
+
"/root/miniconda3/lib/python3.12/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead\n",
|
| 134 |
+
" warnings.warn(\n",
|
| 135 |
+
"Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at dnagpt/gpt2_gene_multi_v1 and are newly initialized: ['score.weight']\n",
|
| 136 |
+
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
|
| 137 |
+
]
|
| 138 |
+
}
|
| 139 |
+
],
|
| 140 |
+
"source": [
|
| 141 |
+
"#指标函数定义\n",
|
| 142 |
+
"def compute_metrics(eval_pred):\n",
|
| 143 |
+
" predictions, labels = eval_pred\n",
|
| 144 |
+
" predictions = np.argmax(predictions, axis=1)\n",
|
| 145 |
+
" return {'accuracy': (predictions==labels).sum() / len(labels)}\n",
|
| 146 |
+
"\n",
|
| 147 |
+
"\n",
|
| 148 |
+
"\n",
|
| 149 |
+
"training_args = TrainingArguments(\n",
|
| 150 |
+
" output_dir=\"ds_job_dna_2222\",\n",
|
| 151 |
+
" learning_rate=1e-5,\n",
|
| 152 |
+
" lr_scheduler_type=\"constant_with_warmup\",\n",
|
| 153 |
+
" warmup_ratio=0.1,\n",
|
| 154 |
+
" optim='adamw_torch',\n",
|
| 155 |
+
" weight_decay=0.0,\n",
|
| 156 |
+
" seed=seed, # use the fixed seed defined above\n",
|
| 157 |
+
" per_device_train_batch_size=64,\n",
|
| 158 |
+
" per_device_eval_batch_size=64,\n",
|
| 159 |
+
" num_train_epochs=4, #训练多少轮\n",
|
| 160 |
+
" evaluation_strategy=\"epoch\",\n",
|
| 161 |
+
" save_strategy=\"epoch\",\n",
|
| 162 |
+
" logging_strategy=\"epoch\",\n",
|
| 163 |
+
" load_best_model_at_end=True\n",
|
| 164 |
+
")\n",
|
| 165 |
+
"\n",
|
| 166 |
+
"#模型定义,文本分类模型\n",
|
| 167 |
+
"model = AutoModelForSequenceClassification.from_pretrained(\"dnagpt/gpt2_gene_multi_v1\", num_labels=2)\n",
|
| 168 |
+
"model.config.pad_token_id = model.config.eos_token_id\n",
|
| 169 |
+
"\n",
|
| 170 |
+
"trainer = Trainer(\n",
|
| 171 |
+
" model,\n",
|
| 172 |
+
" training_args,\n",
|
| 173 |
+
" train_dataset=tokenized_datasets[\"train\"],\n",
|
| 174 |
+
" eval_dataset=tokenized_datasets[\"validation\"],\n",
|
| 175 |
+
" data_collator=data_collator,\n",
|
| 176 |
+
" tokenizer=tokenizer,\n",
|
| 177 |
+
" compute_metrics=compute_metrics,\n",
|
| 178 |
+
")"
|
| 179 |
+
]
|
| 180 |
+
},
|
| 181 |
+
{
|
| 182 |
+
"cell_type": "code",
|
| 183 |
+
"execution_count": 6,
|
| 184 |
+
"id": "06f4ea9e-dbac-405e-8875-aea569c708cf",
|
| 185 |
+
"metadata": {},
|
| 186 |
+
"outputs": [
|
| 187 |
+
{
|
| 188 |
+
"name": "stdout",
|
| 189 |
+
"output_type": "stream",
|
| 190 |
+
"text": [
|
| 191 |
+
"[2025-02-09 19:42:11,162] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n"
|
| 192 |
+
]
|
| 193 |
+
},
|
| 194 |
+
{
|
| 195 |
+
"name": "stderr",
|
| 196 |
+
"output_type": "stream",
|
| 197 |
+
"text": [
|
| 198 |
+
"/root/miniconda3/compiler_compat/ld: cannot find -laio: No such file or directory\n",
|
| 199 |
+
"collect2: error: ld returned 1 exit status\n",
|
| 200 |
+
"/root/miniconda3/compiler_compat/ld: warning: libpthread.so.0, needed by /usr/local/cuda/lib64/libcufile.so, not found (try using -rpath or -rpath-link)\n",
|
| 201 |
+
"/root/miniconda3/compiler_compat/ld: warning: libstdc++.so.6, needed by /usr/local/cuda/lib64/libcufile.so, not found (try using -rpath or -rpath-link)\n",
|
| 202 |
+
"/root/miniconda3/compiler_compat/ld: warning: libm.so.6, needed by /usr/local/cuda/lib64/libcufile.so, not found (try using -rpath or -rpath-link)\n",
|
| 203 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::runtime_error::~runtime_error()@GLIBCXX_3.4'\n",
|
| 204 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__gxx_personality_v0@CXXABI_1.3'\n",
|
| 205 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::tellp()@GLIBCXX_3.4'\n",
|
| 206 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::chrono::_V2::steady_clock::now()@GLIBCXX_3.4.19'\n",
|
| 207 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_M_replace_aux(unsigned long, unsigned long, unsigned long, char)@GLIBCXX_3.4'\n",
|
| 208 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for bool@CXXABI_1.3'\n",
|
| 209 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_logic_error(char const*)@GLIBCXX_3.4'\n",
|
| 210 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_ostringstream<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
|
| 211 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::logic_error@GLIBCXX_3.4'\n",
|
| 212 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::locale::~locale()@GLIBCXX_3.4'\n",
|
| 213 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<char, std::char_traits<char>, std::allocator<char> >::basic_string(std::string const&, unsigned long, unsigned long)@GLIBCXX_3.4'\n",
|
| 214 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_end_catch@CXXABI_1.3'\n",
|
| 215 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_ofstream<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
|
| 216 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::logic_error::~logic_error()@GLIBCXX_3.4'\n",
|
| 217 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for __cxxabiv1::__si_class_type_info@CXXABI_1.3'\n",
|
| 218 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ios<char, std::char_traits<char> >::_M_cache_locale(std::locale const&)@GLIBCXX_3.4'\n",
|
| 219 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_stringstream<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
|
| 220 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `operator new[](unsigned long)@GLIBCXX_3.4'\n",
|
| 221 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_M_leak_hard()@GLIBCXX_3.4'\n",
|
| 222 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_ifstream<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
|
| 223 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_streambuf<wchar_t, std::char_traits<wchar_t> >::basic_streambuf(std::basic_streambuf<wchar_t, std::char_traits<wchar_t> > const&)@GLIBCXX_3.4'\n",
|
| 224 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::append(char const*, unsigned long)@GLIBCXX_3.4'\n",
|
| 225 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<char, std::char_traits<char>, std::allocator<char> >::basic_string(std::string const&)@GLIBCXX_3.4'\n",
|
| 226 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for unsigned short@CXXABI_1.3'\n",
|
| 227 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::resize(unsigned long, char)@GLIBCXX_3.4'\n",
|
| 228 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for char const*@CXXABI_1.3'\n",
|
| 229 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ctype<char>::_M_widen_init() const@GLIBCXX_3.4.11'\n",
|
| 230 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_invalid_argument(char const*)@GLIBCXX_3.4'\n",
|
| 231 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::locale::operator=(std::locale const&)@GLIBCXX_3.4'\n",
|
| 232 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ios<wchar_t, std::char_traits<wchar_t> >::_M_cache_locale(std::locale const&)@GLIBCXX_3.4'\n",
|
| 233 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Rb_tree_decrement(std::_Rb_tree_node_base const*)@GLIBCXX_3.4'\n",
|
| 234 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_free_exception@CXXABI_1.3'\n",
|
| 235 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::condition_variable::notify_one()@GLIBCXX_3.4.11'\n",
|
| 236 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ios_base::Init::~Init()@GLIBCXX_3.4'\n",
|
| 237 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<char, std::char_traits<char>, std::allocator<char> >::~basic_string()@GLIBCXX_3.4'\n",
|
| 238 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_pure_virtual@CXXABI_1.3'\n",
|
| 239 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::flush()@GLIBCXX_3.4'\n",
|
| 240 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for __cxxabiv1::__class_type_info@CXXABI_1.3'\n",
|
| 241 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_rethrow@CXXABI_1.3'\n",
|
| 242 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_stringbuf<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
|
| 243 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_fstream<char, std::char_traits<char> >::~basic_fstream()@GLIBCXX_3.4'\n",
|
| 244 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::compare(char const*) const@GLIBCXX_3.4'\n",
|
| 245 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_ostringstream<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
|
| 246 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::locale::locale()@GLIBCXX_3.4'\n",
|
| 247 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::chrono::_V2::system_clock::now()@GLIBCXX_3.4.19'\n",
|
| 248 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_ifstream<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
|
| 249 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Hash_bytes(void const*, unsigned long, unsigned long)@CXXABI_1.3.5'\n",
|
| 250 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<long long>(long long)@GLIBCXX_3.4.9'\n",
|
| 251 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for char*@CXXABI_1.3'\n",
|
| 252 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__detail::_Prime_rehash_policy::_M_need_rehash(unsigned long, unsigned long, unsigned long) const@GLIBCXX_3.4.18'\n",
|
| 253 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::out_of_range@GLIBCXX_3.4'\n",
|
| 254 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<unsigned long>(unsigned long)@GLIBCXX_3.4.9'\n",
|
| 255 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Rb_tree_increment(std::_Rb_tree_node_base const*)@GLIBCXX_3.4'\n",
|
| 256 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ios_base::~ios_base()@GLIBCXX_3.4'\n",
|
| 257 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::range_error::~range_error()@GLIBCXX_3.4'\n",
|
| 258 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__basic_file<char>::~__basic_file()@GLIBCXX_3.4'\n",
|
| 259 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_guard_acquire@CXXABI_1.3'\n",
|
| 260 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<bool>(bool)@GLIBCXX_3.4.9'\n",
|
| 261 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::overflow_error@GLIBCXX_3.4'\n",
|
| 262 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_fstream<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
|
| 263 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::range_error@GLIBCXX_3.4'\n",
|
| 264 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_ios<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
|
| 265 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_filebuf<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
|
| 266 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `operator delete[](void*)@GLIBCXX_3.4'\n",
|
| 267 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_stringstream<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
|
| 268 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<char, std::char_traits<char>, std::allocator<char> >::basic_string(unsigned long, char, std::allocator<char> const&)@GLIBCXX_3.4'\n",
|
| 269 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__detail::_List_node_base::_M_transfer(std::__detail::_List_node_base*, std::__detail::_List_node_base*)@GLIBCXX_3.4.15'\n",
|
| 270 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::replace(unsigned long, unsigned long, char const*, unsigned long)@GLIBCXX_3.4'\n",
|
| 271 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for std::exception@GLIBCXX_3.4'\n",
|
| 272 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >::_Rep::_M_destroy(std::allocator<wchar_t> const&)@GLIBCXX_3.4'\n",
|
| 273 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::istream& std::istream::_M_extract<double>(double&)@GLIBCXX_3.4.9'\n",
|
| 274 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_filebuf<char, std::char_traits<char> >::close()@GLIBCXX_3.4'\n",
|
| 275 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_fstream<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
|
| 276 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ifstream<char, std::char_traits<char> >::basic_ifstream(char const*, std::_Ios_Openmode)@GLIBCXX_3.4'\n",
|
| 277 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::append(std::string const&)@GLIBCXX_3.4'\n",
|
| 278 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `operator new(unsigned long)@GLIBCXX_3.4'\n",
|
| 279 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_istringstream<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
|
| 280 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for unsigned int@CXXABI_1.3'\n",
|
| 281 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::append(char const*)@GLIBCXX_3.4'\n",
|
| 282 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::domain_error@GLIBCXX_3.4'\n",
|
| 283 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::find(char, unsigned long) const@GLIBCXX_3.4'\n",
|
| 284 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::put(char)@GLIBCXX_3.4'\n",
|
| 285 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for int@CXXABI_1.3'\n",
|
| 286 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_bad_alloc()@GLIBCXX_3.4'\n",
|
| 287 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_thread_atexit@CXXABI_1.3.7'\n",
|
| 288 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Rb_tree_increment(std::_Rb_tree_node_base*)@GLIBCXX_3.4'\n",
|
| 289 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ifstream<char, std::char_traits<char> >::~basic_ifstream()@GLIBCXX_3.4'\n",
|
| 290 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ios_base::Init::Init()@GLIBCXX_3.4'\n",
|
| 291 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::condition_variable::condition_variable()@GLIBCXX_3.4.11'\n",
|
| 292 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_filebuf<char, std::char_traits<char> >::basic_filebuf()@GLIBCXX_3.4'\n",
|
| 293 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_istringstream<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
|
| 294 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::domain_error::~domain_error()@GLIBCXX_3.4'\n",
|
| 295 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::cerr@GLIBCXX_3.4'\n",
|
| 296 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::find(char const*, unsigned long, unsigned long) const@GLIBCXX_3.4'\n",
|
| 297 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_istringstream<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
|
| 298 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<char, std::char_traits<char>, std::allocator<char> >::basic_string(std::allocator<char> const&)@GLIBCXX_3.4'\n",
|
| 299 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_stringbuf<char, std::char_traits<char>, std::allocator<char> >::str() const@GLIBCXX_3.4'\n",
|
| 300 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::invalid_argument@GLIBCXX_3.4'\n",
|
| 301 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for void*@CXXABI_1.3'\n",
|
| 302 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::assign(std::string const&)@GLIBCXX_3.4'\n",
|
| 303 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ostringstream<char, std::char_traits<char>, std::allocator<char> >::~basic_ostringstream()@GLIBCXX_3.4'\n",
|
| 304 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Rb_tree_rebalance_for_erase(std::_Rb_tree_node_base*, std::_Rb_tree_node_base&)@GLIBCXX_3.4'\n",
|
| 305 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for unsigned long@CXXABI_1.3'\n",
|
| 306 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__detail::_List_node_base::_M_hook(std::__detail::_List_node_base*)@GLIBCXX_3.4.15'\n",
|
| 307 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__detail::_List_node_base::_M_unhook()@GLIBCXX_3.4.15'\n",
|
| 308 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_ostringstream<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
|
| 309 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_stringbuf<char, std::char_traits<char>, std::allocator<char> >::_M_sync(char*, unsigned long, unsigned long)@GLIBCXX_3.4'\n",
|
| 310 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_iostream<char, std::char_traits<char> >::~basic_iostream()@GLIBCXX_3.4'\n",
|
| 311 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::locale::locale(std::locale const&)@GLIBCXX_3.4'\n",
|
| 312 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_istringstream<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
|
| 313 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `log2f@GLIBC_2.2.5'\n",
|
| 314 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::operator<<(std::basic_streambuf<char, std::char_traits<char> >*)@GLIBCXX_3.4'\n",
|
| 315 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_streambuf<wchar_t, std::char_traits<wchar_t> >@GLIBCXX_3.4'\n",
|
| 316 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::exception::~exception()@GLIBCXX_3.4'\n",
|
| 317 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_Rep::_S_create(unsigned long, unsigned long, std::allocator<char> const&)@GLIBCXX_3.4'\n",
|
| 318 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__basic_file<char>::is_open() const@GLIBCXX_3.4'\n",
|
| 319 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_istringstream<char, std::char_traits<char>, std::allocator<char> >::~basic_istringstream()@GLIBCXX_3.4'\n",
|
| 320 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::swap(std::string&)@GLIBCXX_3.4'\n",
|
| 321 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for unsigned long*@CXXABI_1.3'\n",
|
| 322 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_ostringstream<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
|
| 323 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_streambuf<char, std::char_traits<char> >::basic_streambuf(std::basic_streambuf<char, std::char_traits<char> > const&)@GLIBCXX_3.4'\n",
|
| 324 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ios<char, std::char_traits<char> >::init(std::basic_streambuf<char, std::char_traits<char> >*)@GLIBCXX_3.4'\n",
|
| 325 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_bad_cast()@GLIBCXX_3.4'\n",
|
| 326 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ios<char, std::char_traits<char> >::clear(std::_Ios_Iostate)@GLIBCXX_3.4'\n",
|
| 327 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_streambuf<wchar_t, std::char_traits<wchar_t> >::operator=(std::basic_streambuf<wchar_t, std::char_traits<wchar_t> > const&)@GLIBCXX_3.4'\n",
|
| 328 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `operator delete(void*)@GLIBCXX_3.4'\n",
|
| 329 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::operator<<(int)@GLIBCXX_3.4'\n",
|
| 330 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_Rep::_S_empty_rep_storage@GLIBCXX_3.4'\n",
|
| 331 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_Rep::_M_destroy(std::allocator<char> const&)@GLIBCXX_3.4'\n",
|
| 332 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_iostream<wchar_t, std::char_traits<wchar_t> >::~basic_iostream()@GLIBCXX_3.4'\n",
|
| 333 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::runtime_error@GLIBCXX_3.4'\n",
|
| 334 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_ofstream<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
|
| 335 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Rb_tree_insert_and_rebalance(bool, std::_Rb_tree_node_base*, std::_Rb_tree_node_base*, std::_Rb_tree_node_base&)@GLIBCXX_3.4'\n",
|
| 336 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_stringstream<char, std::char_traits<char>, std::allocator<char> >::~basic_stringstream()@GLIBCXX_3.4'\n",
|
| 337 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_stringstream<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
|
| 338 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<long>(long)@GLIBCXX_3.4.9'\n",
|
| 339 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::istream::get()@GLIBCXX_3.4'\n",
|
| 340 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for unsigned long long@CXXABI_1.3'\n",
|
| 341 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ostream<char, std::char_traits<char> >& std::operator<< <std::char_traits<char> >(std::basic_ostream<char, std::char_traits<char> >&, char const*)@GLIBCXX_3.4'\n",
|
| 342 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::out_of_range::~out_of_range()@GLIBCXX_3.4'\n",
|
| 343 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::length_error::~length_error()@GLIBCXX_3.4'\n",
|
| 344 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ostream<char, std::char_traits<char> >& std::__ostream_insert<char, std::char_traits<char> >(std::basic_ostream<char, std::char_traits<char> >&, char const*, long)@GLIBCXX_3.4.9'\n",
|
| 345 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::invalid_argument::~invalid_argument()@GLIBCXX_3.4'\n",
|
| 346 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >::swap(std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >&)@GLIBCXX_3.4'\n",
|
| 347 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::cout@GLIBCXX_3.4'\n",
|
| 348 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<unsigned long long>(unsigned long long)@GLIBCXX_3.4.9'\n",
|
| 349 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for int*@CXXABI_1.3'\n",
|
| 350 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<void const*>(void const*)@GLIBCXX_3.4.9'\n",
|
| 351 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::underflow_error@GLIBCXX_3.4'\n",
|
| 352 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_streambuf<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
|
| 353 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for std::out_of_range@GLIBCXX_3.4'\n",
|
| 354 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_allocate_exception@CXXABI_1.3'\n",
|
| 355 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_ios<wchar_t, std::char_traits<wchar_t> >@GLIBCXX_3.4'\n",
|
| 356 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for void const*@CXXABI_1.3'\n",
|
| 357 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ios<wchar_t, std::char_traits<wchar_t> >::init(std::basic_streambuf<wchar_t, std::char_traits<wchar_t> >*)@GLIBCXX_3.4'\n",
|
| 358 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::reserve(unsigned long)@GLIBCXX_3.4'\n",
|
| 359 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_begin_catch@CXXABI_1.3'\n",
|
| 360 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for long@CXXABI_1.3'\n",
|
| 361 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >::_Rep::_S_empty_rep_storage@GLIBCXX_3.4'\n",
|
| 362 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_M_leak()@GLIBCXX_3.4'\n",
|
| 363 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_filebuf<char, std::char_traits<char> >::open(char const*, std::_Ios_Openmode)@GLIBCXX_3.4'\n",
|
| 364 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_stringbuf<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >::_M_sync(wchar_t*, unsigned long, unsigned long)@GLIBCXX_3.4'\n",
|
| 365 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::istream::getline(char*, long, char)@GLIBCXX_3.4'\n",
|
| 366 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_istream<char, std::char_traits<char> >& std::getline<char, std::char_traits<char>, std::allocator<char> >(std::basic_istream<char, std::char_traits<char> >&, std::basic_string<char, std::char_traits<char>, std::allocator<char> >&, char)@GLIBCXX_3.4'\n",
|
| 367 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_stringstream<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
|
| 368 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::condition_variable::~condition_variable()@GLIBCXX_3.4.11'\n",
|
| 369 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_stringbuf<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
|
| 370 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::insert(unsigned long, char const*, unsigned long)@GLIBCXX_3.4'\n",
|
| 371 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::assign(char const*, unsigned long)@GLIBCXX_3.4'\n",
|
| 372 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for unsigned char@CXXABI_1.3'\n",
|
| 373 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ios_base::ios_base()@GLIBCXX_3.4'\n",
|
| 374 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_out_of_range(char const*)@GLIBCXX_3.4'\n",
|
| 375 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::overflow_error::~overflow_error()@GLIBCXX_3.4'\n",
|
| 376 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_length_error(char const*)@GLIBCXX_3.4'\n",
|
| 377 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_system_error(int)@GLIBCXX_3.4.11'\n",
|
| 378 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ofstream<char, std::char_traits<char> >::close()@GLIBCXX_3.4'\n",
|
| 379 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<double>(double)@GLIBCXX_3.4.9'\n",
|
| 380 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_streambuf<char, std::char_traits<char> >::operator=(std::basic_streambuf<char, std::char_traits<char> > const&)@GLIBCXX_3.4'\n",
|
| 381 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for long long@CXXABI_1.3'\n",
|
| 382 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<char, std::char_traits<char>, std::allocator<char> >::basic_string(char const*, unsigned long, std::allocator<char> const&)@GLIBCXX_3.4'\n",
|
| 383 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ifstream<char, std::char_traits<char> >::close()@GLIBCXX_3.4'\n",
|
| 384 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_guard_release@CXXABI_1.3'\n",
|
| 385 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_throw@CXXABI_1.3'\n",
|
| 386 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::underflow_error::~underflow_error()@GLIBCXX_3.4'\n",
|
| 387 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Rb_tree_decrement(std::_Rb_tree_node_base*)@GLIBCXX_3.4'\n",
|
| 388 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::length_error@GLIBCXX_3.4'\n",
|
| 389 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_filebuf<char, std::char_traits<char> >::~basic_filebuf()@GLIBCXX_3.4'\n",
|
| 390 |
+
"collect2: error: ld returned 1 exit status\n"
|
| 391 |
+
]
|
| 392 |
+
},
|
| 393 |
+
{
|
| 394 |
+
"data": {
|
| 395 |
+
"text/html": [
|
| 396 |
+
"\n",
|
| 397 |
+
" <div>\n",
|
| 398 |
+
" \n",
|
| 399 |
+
" <progress value='3088' max='3088' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
|
| 400 |
+
" [3088/3088 22:01, Epoch 4/4]\n",
|
| 401 |
+
" </div>\n",
|
| 402 |
+
" <table border=\"1\" class=\"dataframe\">\n",
|
| 403 |
+
" <thead>\n",
|
| 404 |
+
" <tr style=\"text-align: left;\">\n",
|
| 405 |
+
" <th>Epoch</th>\n",
|
| 406 |
+
" <th>Training Loss</th>\n",
|
| 407 |
+
" <th>Validation Loss</th>\n",
|
| 408 |
+
" <th>Accuracy</th>\n",
|
| 409 |
+
" </tr>\n",
|
| 410 |
+
" </thead>\n",
|
| 411 |
+
" <tbody>\n",
|
| 412 |
+
" <tr>\n",
|
| 413 |
+
" <td>1</td>\n",
|
| 414 |
+
" <td>0.633200</td>\n",
|
| 415 |
+
" <td>0.448454</td>\n",
|
| 416 |
+
" <td>0.803000</td>\n",
|
| 417 |
+
" </tr>\n",
|
| 418 |
+
" <tr>\n",
|
| 419 |
+
" <td>2</td>\n",
|
| 420 |
+
" <td>0.376200</td>\n",
|
| 421 |
+
" <td>0.471739</td>\n",
|
| 422 |
+
" <td>0.821000</td>\n",
|
| 423 |
+
" </tr>\n",
|
| 424 |
+
" <tr>\n",
|
| 425 |
+
" <td>3</td>\n",
|
| 426 |
+
" <td>0.267600</td>\n",
|
| 427 |
+
" <td>0.354833</td>\n",
|
| 428 |
+
" <td>0.865000</td>\n",
|
| 429 |
+
" </tr>\n",
|
| 430 |
+
" <tr>\n",
|
| 431 |
+
" <td>4</td>\n",
|
| 432 |
+
" <td>0.204300</td>\n",
|
| 433 |
+
" <td>0.383833</td>\n",
|
| 434 |
+
" <td>0.873500</td>\n",
|
| 435 |
+
" </tr>\n",
|
| 436 |
+
" </tbody>\n",
|
| 437 |
+
"</table><p>"
|
| 438 |
+
],
|
| 439 |
+
"text/plain": [
|
| 440 |
+
"<IPython.core.display.HTML object>"
|
| 441 |
+
]
|
| 442 |
+
},
|
| 443 |
+
"metadata": {},
|
| 444 |
+
"output_type": "display_data"
|
| 445 |
+
},
|
| 446 |
+
{
|
| 447 |
+
"data": {
|
| 448 |
+
"text/plain": [
|
| 449 |
+
"TrainOutput(global_step=3088, training_loss=0.37034162215000604, metrics={'train_runtime': 1321.9045, 'train_samples_per_second': 149.484, 'train_steps_per_second': 2.336, 'total_flos': 2.5816641551990784e+16, 'train_loss': 0.37034162215000604, 'epoch': 4.0})"
|
| 450 |
+
]
|
| 451 |
+
},
|
| 452 |
+
"execution_count": 6,
|
| 453 |
+
"metadata": {},
|
| 454 |
+
"output_type": "execute_result"
|
| 455 |
+
}
|
| 456 |
+
],
|
| 457 |
+
"source": [
|
| 458 |
+
"trainer.train() #模型训练"
|
| 459 |
+
]
|
| 460 |
+
},
|
| 461 |
+
{
|
| 462 |
+
"cell_type": "code",
|
| 463 |
+
"execution_count": 7,
|
| 464 |
+
"id": "adacc2bb-bda6-4e9f-92fb-54d64c324147",
|
| 465 |
+
"metadata": {},
|
| 466 |
+
"outputs": [
|
| 467 |
+
{
|
| 468 |
+
"data": {
|
| 469 |
+
"application/vnd.jupyter.widget-view+json": {
|
| 470 |
+
"model_id": "2a404a3810d442ecb077b5e0794cf21f",
|
| 471 |
+
"version_major": 2,
|
| 472 |
+
"version_minor": 0
|
| 473 |
+
},
|
| 474 |
+
"text/plain": [
|
| 475 |
+
"Map (num_proc=4): 0%| | 0/2000 [00:00<?, ? examples/s]"
|
| 476 |
+
]
|
| 477 |
+
},
|
| 478 |
+
"metadata": {},
|
| 479 |
+
"output_type": "display_data"
|
| 480 |
+
},
|
| 481 |
+
{
|
| 482 |
+
"data": {
|
| 483 |
+
"application/vnd.jupyter.widget-view+json": {
|
| 484 |
+
"model_id": "90bf9102d4d446ea83c037dcd0bd78bc",
|
| 485 |
+
"version_major": 2,
|
| 486 |
+
"version_minor": 0
|
| 487 |
+
},
|
| 488 |
+
"text/plain": [
|
| 489 |
+
"Map (num_proc=4): 0%| | 0/2000 [00:00<?, ? examples/s]"
|
| 490 |
+
]
|
| 491 |
+
},
|
| 492 |
+
"metadata": {},
|
| 493 |
+
"output_type": "display_data"
|
| 494 |
+
},
|
| 495 |
+
{
|
| 496 |
+
"name": "stderr",
|
| 497 |
+
"output_type": "stream",
|
| 498 |
+
"text": [
|
| 499 |
+
"Predicting: 100%|██████████| 32/32 [00:04<00:00, 6.82it/s]\n"
|
| 500 |
+
]
|
| 501 |
+
}
|
| 502 |
+
],
|
| 503 |
+
"source": [
|
| 504 |
+
"#模型测试 蛋白质数据集,150长度/450bp,复杂版本 不相似\n",
|
| 505 |
+
"raw_datasets_dna_protein = load_dataset('dnagpt/gene_lan_transfer', 'dna_protein_pair')['train'].train_test_split(test_size=0.5) #默认已经shuffle\n",
|
| 506 |
+
"\n",
|
| 507 |
+
"# 定义翻转标签的函数\n",
|
| 508 |
+
"def flip_labels(example):\n",
|
| 509 |
+
"    # sentence1 and sentence2 are passed through unchanged (no truncation is applied); only the label is inverted (0 <-> 1)\n",
|
| 510 |
+
" example[\"sentence1\"] = example[\"sentence1\"]\n",
|
| 511 |
+
" example[\"sentence2\"] = example[\"sentence2\"]\n",
|
| 512 |
+
" example['label'] = 1 - example['label']\n",
|
| 513 |
+
" return example\n",
|
| 514 |
+
"\n",
|
| 515 |
+
"# 应用翻转标签函数\n",
|
| 516 |
+
"flipped_datasets_dna_protein = raw_datasets_dna_protein.map(flip_labels, batched=False)\n",
|
| 517 |
+
"tokenized_datasets_dna_protein = flipped_datasets_dna_protein.map(tokenize_function, batched=True,num_proc=4)\n",
|
| 518 |
+
"\n",
|
| 519 |
+
"\n",
|
| 520 |
+
"\n",
|
| 521 |
+
"# 确保模型在 GPU 上\n",
|
| 522 |
+
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
|
| 523 |
+
"model.to(device)\n",
|
| 524 |
+
"model.eval() # 进入推理模式,加速推理\n",
|
| 525 |
+
"\n",
|
| 526 |
+
"# 取出测试集数据\n",
|
| 527 |
+
"test_dataset = tokenized_datasets_dna_protein[\"test\"]\n",
|
| 528 |
+
"\n",
|
| 529 |
+
"# 预存预测结果\n",
|
| 530 |
+
"preds = []\n",
|
| 531 |
+
"labels = []\n",
|
| 532 |
+
"\n",
|
| 533 |
+
"# 批量大小(建议 64、128、256 视显存大小调整)\n",
|
| 534 |
+
"batch_size = 64\n",
|
| 535 |
+
"\n",
|
| 536 |
+
"# 直接遍历数据集进行推理\n",
|
| 537 |
+
"for i in tqdm(range(0, len(test_dataset), batch_size), desc=\"Predicting\"):\n",
|
| 538 |
+
" batch = test_dataset[i : i + batch_size]\n",
|
| 539 |
+
" \n",
|
| 540 |
+
" # 转换为 Tensor 并移动到 GPU\n",
|
| 541 |
+
" inputs = {\n",
|
| 542 |
+
" \"input_ids\": torch.tensor(batch[\"input_ids\"]).to(device),\n",
|
| 543 |
+
" \"attention_mask\": torch.tensor(batch[\"attention_mask\"]).to(device),\n",
|
| 544 |
+
" }\n",
|
| 545 |
+
" batch_labels = batch[\"label\"] # 原始标签\n",
|
| 546 |
+
"\n",
|
| 547 |
+
" with torch.no_grad(): # 关闭梯度计算,减少内存占用\n",
|
| 548 |
+
" outputs = model(**inputs)\n",
|
| 549 |
+
" batch_preds = torch.argmax(outputs.logits, axis=-1).cpu().numpy() # 取最大概率的类别\n",
|
| 550 |
+
"\n",
|
| 551 |
+
" preds.extend(batch_preds)\n",
|
| 552 |
+
" labels.extend(batch_labels)\n",
|
| 553 |
+
" \n",
|
| 554 |
+
"metric = evaluate.load(\"glue\", \"mrpc\")\n",
|
| 555 |
+
"ret = metric.compute(predictions=preds, references=labels)\n",
|
| 556 |
+
"\n",
|
| 557 |
+
"\n",
|
| 558 |
+
"result[\"dna_protein_pair_full\"] = ret"
|
| 559 |
+
]
|
| 560 |
+
},
|
| 561 |
+
{
|
| 562 |
+
"cell_type": "code",
|
| 563 |
+
"execution_count": 8,
|
| 564 |
+
"id": "55d7df83-ff43-4dc3-970d-8124d0f4b534",
|
| 565 |
+
"metadata": {},
|
| 566 |
+
"outputs": [
|
| 567 |
+
{
|
| 568 |
+
"data": {
|
| 569 |
+
"image/png": "",
|
| 570 |
+
"text/plain": [
|
| 571 |
+
"<Figure size 640x480 with 2 Axes>"
|
| 572 |
+
]
|
| 573 |
+
},
|
| 574 |
+
"metadata": {},
|
| 575 |
+
"output_type": "display_data"
|
| 576 |
+
}
|
| 577 |
+
],
|
| 578 |
+
"source": [
|
| 579 |
+
"from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay\n",
|
| 580 |
+
"import matplotlib.pyplot as plt\n",
|
| 581 |
+
"\n",
|
| 582 |
+
"# labels holds the ground-truth (flipped) labels and preds holds the model predictions collected by the batched inference loop\n",
|
| 583 |
+
"cm = confusion_matrix(labels, preds)\n",
|
| 584 |
+
"\n",
|
| 585 |
+
"# 可视化混淆矩阵\n",
|
| 586 |
+
"disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Class 0', 'Class 1'])\n",
|
| 587 |
+
"disp.plot(cmap=plt.cm.Blues)\n",
|
| 588 |
+
"plt.title('Confusion Matrix')\n",
|
| 589 |
+
"plt.show()"
|
| 590 |
+
]
|
| 591 |
+
},
|
| 592 |
+
{
|
| 593 |
+
"cell_type": "code",
|
| 594 |
+
"execution_count": 9,
|
| 595 |
+
"id": "ab2de8ae-c04d-4de7-aef7-25aa66551a0a",
|
| 596 |
+
"metadata": {},
|
| 597 |
+
"outputs": [
|
| 598 |
+
{
|
| 599 |
+
"data": {
|
| 600 |
+
"application/vnd.jupyter.widget-view+json": {
|
| 601 |
+
"model_id": "062918eaa1034dc588d04114d6b0b5e3",
|
| 602 |
+
"version_major": 2,
|
| 603 |
+
"version_minor": 0
|
| 604 |
+
},
|
| 605 |
+
"text/plain": [
|
| 606 |
+
"Map (num_proc=4): 0%| | 0/8000 [00:00<?, ? examples/s]"
|
| 607 |
+
]
|
| 608 |
+
},
|
| 609 |
+
"metadata": {},
|
| 610 |
+
"output_type": "display_data"
|
| 611 |
+
},
|
| 612 |
+
{
|
| 613 |
+
"data": {
|
| 614 |
+
"application/vnd.jupyter.widget-view+json": {
|
| 615 |
+
"model_id": "afbefd7d107843459de649cf5e603472",
|
| 616 |
+
"version_major": 2,
|
| 617 |
+
"version_minor": 0
|
| 618 |
+
},
|
| 619 |
+
"text/plain": [
|
| 620 |
+
"Map (num_proc=4): 0%| | 0/8000 [00:00<?, ? examples/s]"
|
| 621 |
+
]
|
| 622 |
+
},
|
| 623 |
+
"metadata": {},
|
| 624 |
+
"output_type": "display_data"
|
| 625 |
+
},
|
| 626 |
+
{
|
| 627 |
+
"name": "stderr",
|
| 628 |
+
"output_type": "stream",
|
| 629 |
+
"text": [
|
| 630 |
+
"Predicting: 100%|██████████| 125/125 [00:18<00:00, 6.63it/s]\n"
|
| 631 |
+
]
|
| 632 |
+
}
|
| 633 |
+
],
|
| 634 |
+
"source": [
|
| 635 |
+
"#############################################################\n",
|
| 636 |
+
"#模型测试 蛋白质数据集,随机版本\n",
|
| 637 |
+
"raw_datasets_dna_protein = load_dataset('dnagpt/gene_lan_transfer', 'dna_protein_pair_rand')['train'].train_test_split(test_size=0.5) #默认已经shuffle\n",
|
| 638 |
+
"\n",
|
| 639 |
+
"# 定义翻转标签的函数\n",
|
| 640 |
+
"def flip_labels(example):\n",
|
| 641 |
+
"    # sentence1 and sentence2 are passed through unchanged (no truncation is applied); only the label is inverted (0 <-> 1)\n",
|
| 642 |
+
" example[\"sentence1\"] = example[\"sentence1\"]\n",
|
| 643 |
+
" example[\"sentence2\"] = example[\"sentence2\"]\n",
|
| 644 |
+
" example['label'] = 1 - example['label']\n",
|
| 645 |
+
" return example\n",
|
| 646 |
+
"\n",
|
| 647 |
+
"# 应用翻转标签函数\n",
|
| 648 |
+
"flipped_datasets_dna_protein = raw_datasets_dna_protein.map(flip_labels, batched=False)\n",
|
| 649 |
+
"tokenized_datasets_dna_protein = flipped_datasets_dna_protein.map(tokenize_function, batched=True,num_proc=4)\n",
|
| 650 |
+
"\n",
|
| 651 |
+
"\n",
|
| 652 |
+
"# 确保模型在 GPU 上\n",
|
| 653 |
+
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
|
| 654 |
+
"model.to(device)\n",
|
| 655 |
+
"model.eval() # 进入推理模式,加速推理\n",
|
| 656 |
+
"\n",
|
| 657 |
+
"# 取出测试集数据\n",
|
| 658 |
+
"test_dataset = tokenized_datasets_dna_protein[\"test\"]\n",
|
| 659 |
+
"\n",
|
| 660 |
+
"# 预存预测结果\n",
|
| 661 |
+
"preds = []\n",
|
| 662 |
+
"labels = []\n",
|
| 663 |
+
"\n",
|
| 664 |
+
"# 批量大小(建议 64、128、256 视显存大小调整)\n",
|
| 665 |
+
"batch_size = 64\n",
|
| 666 |
+
"\n",
|
| 667 |
+
"# 直接遍历数据集进行推理\n",
|
| 668 |
+
"for i in tqdm(range(0, len(test_dataset), batch_size), desc=\"Predicting\"):\n",
|
| 669 |
+
" batch = test_dataset[i : i + batch_size]\n",
|
| 670 |
+
" \n",
|
| 671 |
+
" # 转换为 Tensor 并移动到 GPU\n",
|
| 672 |
+
" inputs = {\n",
|
| 673 |
+
" \"input_ids\": torch.tensor(batch[\"input_ids\"]).to(device),\n",
|
| 674 |
+
" \"attention_mask\": torch.tensor(batch[\"attention_mask\"]).to(device),\n",
|
| 675 |
+
" }\n",
|
| 676 |
+
" batch_labels = batch[\"label\"] # 原始标签\n",
|
| 677 |
+
"\n",
|
| 678 |
+
" with torch.no_grad(): # 关闭梯度计算,减少内存占用\n",
|
| 679 |
+
" outputs = model(**inputs)\n",
|
| 680 |
+
" batch_preds = torch.argmax(outputs.logits, axis=-1).cpu().numpy() # 取最大概率的类别\n",
|
| 681 |
+
"\n",
|
| 682 |
+
" preds.extend(batch_preds)\n",
|
| 683 |
+
" labels.extend(batch_labels)\n",
|
| 684 |
+
"metric = evaluate.load(\"glue\", \"mrpc\")\n",
|
| 685 |
+
"ret = metric.compute(predictions=preds, references=labels)\n",
|
| 686 |
+
"\n",
|
| 687 |
+
"result[\"dna_protein_pair_rand_full\"] = ret"
|
| 688 |
+
]
|
| 689 |
+
},
|
| 690 |
+
{
|
| 691 |
+
"cell_type": "code",
|
| 692 |
+
"execution_count": 10,
|
| 693 |
+
"id": "b6fc8ee6-89d3-4d32-b67d-2980a0be79cb",
|
| 694 |
+
"metadata": {},
|
| 695 |
+
"outputs": [
|
| 696 |
+
{
|
| 697 |
+
"name": "stdout",
|
| 698 |
+
"output_type": "stream",
|
| 699 |
+
"text": [
|
| 700 |
+
"{\"seed\": 7967, \"dna_protein_pair_full\": {\"accuracy\": 0.694, \"f1\": 0.6412661195779601}, \"dna_protein_pair_rand_full\": {\"accuracy\": 0.6, \"f1\": 0.6077469968129443}}\n"
|
| 701 |
+
]
|
| 702 |
+
}
|
| 703 |
+
],
|
| 704 |
+
"source": [
|
| 705 |
+
"print(json.dumps(result))"
|
| 706 |
+
]
|
| 707 |
+
},
|
| 708 |
+
{
|
| 709 |
+
"cell_type": "code",
|
| 710 |
+
"execution_count": 11,
|
| 711 |
+
"id": "f7db7f38-a3f1-45e8-bf92-af50b7b67ece",
|
| 712 |
+
"metadata": {},
|
| 713 |
+
"outputs": [],
|
| 714 |
+
"source": [
|
| 715 |
+
"model.save_pretrained(\"gpt2_gene_multi_v1_ft\")"
|
| 716 |
+
]
|
| 717 |
+
},
|
| 718 |
+
{
|
| 719 |
+
"cell_type": "code",
|
| 720 |
+
"execution_count": 12,
|
| 721 |
+
"id": "c34d5df5-7baf-4a05-ba25-7d3b99856d8f",
|
| 722 |
+
"metadata": {},
|
| 723 |
+
"outputs": [
|
| 724 |
+
{
|
| 725 |
+
"data": {
|
| 726 |
+
"text/plain": [
|
| 727 |
+
"('gpt2_gene_multi_v1_ft/tokenizer_config.json',\n",
|
| 728 |
+
" 'gpt2_gene_multi_v1_ft/special_tokens_map.json',\n",
|
| 729 |
+
" 'gpt2_gene_multi_v1_ft/tokenizer.json')"
|
| 730 |
+
]
|
| 731 |
+
},
|
| 732 |
+
"execution_count": 12,
|
| 733 |
+
"metadata": {},
|
| 734 |
+
"output_type": "execute_result"
|
| 735 |
+
}
|
| 736 |
+
],
|
| 737 |
+
"source": [
|
| 738 |
+
"tokenizer.save_pretrained(\"gpt2_gene_multi_v1_ft\")"
|
| 739 |
+
]
|
| 740 |
+
},
|
| 741 |
+
{
|
| 742 |
+
"cell_type": "code",
|
| 743 |
+
"execution_count": null,
|
| 744 |
+
"id": "15feb59b-73c1-462d-b9ed-ceb607a38f6f",
|
| 745 |
+
"metadata": {},
|
| 746 |
+
"outputs": [],
|
| 747 |
+
"source": []
|
| 748 |
+
}
|
| 749 |
+
],
|
| 750 |
+
"metadata": {
|
| 751 |
+
"kernelspec": {
|
| 752 |
+
"display_name": "Python 3 (ipykernel)",
|
| 753 |
+
"language": "python",
|
| 754 |
+
"name": "python3"
|
| 755 |
+
},
|
| 756 |
+
"language_info": {
|
| 757 |
+
"codemirror_mode": {
|
| 758 |
+
"name": "ipython",
|
| 759 |
+
"version": 3
|
| 760 |
+
},
|
| 761 |
+
"file_extension": ".py",
|
| 762 |
+
"mimetype": "text/x-python",
|
| 763 |
+
"name": "python",
|
| 764 |
+
"nbconvert_exporter": "python",
|
| 765 |
+
"pygments_lexer": "ipython3",
|
| 766 |
+
"version": "3.12.3"
|
| 767 |
+
}
|
| 768 |
+
},
|
| 769 |
+
"nbformat": 4,
|
| 770 |
+
"nbformat_minor": 5
|
| 771 |
+
}
|
best_model/gpt2_gene_multiv2_ft_en_test_others_best.ipynb
ADDED
|
@@ -0,0 +1,771 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 1,
|
| 6 |
+
"id": "dcaea864-b707-4651-965d-b8eefa1b0e07",
|
| 7 |
+
"metadata": {},
|
| 8 |
+
"outputs": [
|
| 9 |
+
{
|
| 10 |
+
"name": "stdout",
|
| 11 |
+
"output_type": "stream",
|
| 12 |
+
"text": [
|
| 13 |
+
"https://hf-mirror.com\n"
|
| 14 |
+
]
|
| 15 |
+
}
|
| 16 |
+
],
|
| 17 |
+
"source": [
|
| 18 |
+
"import os\n",
|
| 19 |
+
"\n",
|
| 20 |
+
"# 设置环境变量\n",
|
| 21 |
+
"os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'\n",
|
| 22 |
+
"\n",
|
| 23 |
+
"# 打印环境变量以确认设置成功\n",
|
| 24 |
+
"print(os.environ.get('HF_ENDPOINT'))\n",
|
| 25 |
+
"\n",
|
| 26 |
+
"# import subprocess\n",
|
| 27 |
+
"# import os\n",
|
| 28 |
+
"\n",
|
| 29 |
+
"# result = subprocess.run('bash -c \"source /etc/network_turbo && env | grep proxy\"', shell=True, capture_output=True, text=True)\n",
|
| 30 |
+
"# output = result.stdout\n",
|
| 31 |
+
"# for line in output.splitlines():\n",
|
| 32 |
+
"# if '=' in line:\n",
|
| 33 |
+
"# var, value = line.split('=', 1)\n",
|
| 34 |
+
"# os.environ[var] = value"
|
| 35 |
+
]
|
| 36 |
+
},
|
| 37 |
+
{
|
| 38 |
+
"cell_type": "code",
|
| 39 |
+
"execution_count": 2,
|
| 40 |
+
"id": "73cc9d73-f0cb-4392-8280-782275dc7036",
|
| 41 |
+
"metadata": {},
|
| 42 |
+
"outputs": [
|
| 43 |
+
{
|
| 44 |
+
"name": "stderr",
|
| 45 |
+
"output_type": "stream",
|
| 46 |
+
"text": [
|
| 47 |
+
"2025-02-09 16:27:19.297328: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
|
| 48 |
+
"2025-02-09 16:27:19.311074: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
|
| 49 |
+
"2025-02-09 16:27:19.327136: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
|
| 50 |
+
"2025-02-09 16:27:19.331944: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
|
| 51 |
+
"2025-02-09 16:27:19.344392: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
|
| 52 |
+
"To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
|
| 53 |
+
"2025-02-09 16:27:20.187982: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n"
|
| 54 |
+
]
|
| 55 |
+
}
|
| 56 |
+
],
|
| 57 |
+
"source": [
|
| 58 |
+
"from datasets import load_dataset\n",
|
| 59 |
+
"from transformers import AutoTokenizer, DataCollatorWithPadding\n",
|
| 60 |
+
"from transformers import Trainer\n",
|
| 61 |
+
"import evaluate\n",
|
| 62 |
+
"import numpy as np\n",
|
| 63 |
+
"from transformers import TrainingArguments\n",
|
| 64 |
+
"from transformers import AutoModelForSequenceClassification\n",
|
| 65 |
+
"import json\n",
|
| 66 |
+
"from transformers import set_seed\n",
|
| 67 |
+
"import random\n",
|
| 68 |
+
"import numpy as np\n",
|
| 69 |
+
"import torch\n",
|
| 70 |
+
"from tqdm import tqdm"
|
| 71 |
+
]
|
| 72 |
+
},
|
| 73 |
+
{
|
| 74 |
+
"cell_type": "code",
|
| 75 |
+
"execution_count": 3,
|
| 76 |
+
"id": "8255851b-d2c4-437c-a306-fcb00ccab684",
|
| 77 |
+
"metadata": {},
|
| 78 |
+
"outputs": [],
|
| 79 |
+
"source": [
|
| 80 |
+
"# seed = 42\n",
|
| 81 |
+
"# random.seed(seed)\n",
|
| 82 |
+
"# np.random.seed(seed)\n",
|
| 83 |
+
"# torch.manual_seed(seed)\n",
|
| 84 |
+
"# torch.cuda.manual_seed_all(seed)\n",
|
| 85 |
+
"\n",
|
| 86 |
+
"\n",
|
| 87 |
+
"# Fixed random seed (the random.randint-based generation below is commented out)\n",
|
| 88 |
+
"import random\n",
|
| 89 |
+
"#seed = random.randint(0, 10000)\n",
|
| 90 |
+
"seed = 2621\n",
|
| 91 |
+
"#print(f\"Generated seed: {seed}\")\n",
|
| 92 |
+
"set_seed(seed)\n",
|
| 93 |
+
"result = {}\n",
|
| 94 |
+
"result[\"seed\"] = seed"
|
| 95 |
+
]
|
| 96 |
+
},
|
| 97 |
+
{
|
| 98 |
+
"cell_type": "code",
|
| 99 |
+
"execution_count": 4,
|
| 100 |
+
"id": "2a744bd7-b674-4048-b303-d4f85bdd694d",
|
| 101 |
+
"metadata": {},
|
| 102 |
+
"outputs": [],
|
| 103 |
+
"source": [
|
| 104 |
+
"# Load the English ('en') configuration of the PAWS-X dataset from the Hugging Face Hub\n",
|
| 105 |
+
"raw_datasets = load_dataset('google-research-datasets/paws-x', 'en') # 或者指定特定语言如 'zh' 表示中文,https://huggingface.co/datasets/google-research-datasets/paws-x\n",
|
| 106 |
+
"\n",
|
| 107 |
+
"#分词器\n",
|
| 108 |
+
"tokenizer = AutoTokenizer.from_pretrained(\"dnagpt/gpt2_gene_multi_v2\")\n",
|
| 109 |
+
"tokenizer.pad_token = tokenizer.eos_token\n",
|
| 110 |
+
"\n",
|
| 111 |
+
"\n",
|
| 112 |
+
"#分词函数\n",
|
| 113 |
+
"def tokenize_function(example):\n",
|
| 114 |
+
" return tokenizer(example[\"sentence1\"], example[\"sentence2\"], truncation=True,max_length=256, padding=\"max_length\")\n",
|
| 115 |
+
"\n",
|
| 116 |
+
"#构建分词后的数据集\n",
|
| 117 |
+
"tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)\n",
|
| 118 |
+
"\n",
|
| 119 |
+
"#训练数据构建\n",
|
| 120 |
+
"data_collator = DataCollatorWithPadding(tokenizer=tokenizer)"
|
| 121 |
+
]
|
| 122 |
+
},
|
| 123 |
+
{
|
| 124 |
+
"cell_type": "code",
|
| 125 |
+
"execution_count": 5,
|
| 126 |
+
"id": "b2d3081f-1df2-45fa-a3f6-1516c19d7318",
|
| 127 |
+
"metadata": {},
|
| 128 |
+
"outputs": [
|
| 129 |
+
{
|
| 130 |
+
"name": "stderr",
|
| 131 |
+
"output_type": "stream",
|
| 132 |
+
"text": [
|
| 133 |
+
"/root/miniconda3/lib/python3.12/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead\n",
|
| 134 |
+
" warnings.warn(\n",
|
| 135 |
+
"Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at dnagpt/gpt2_gene_multi_v2 and are newly initialized: ['score.weight']\n",
|
| 136 |
+
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
|
| 137 |
+
]
|
| 138 |
+
}
|
| 139 |
+
],
|
| 140 |
+
"source": [
|
| 141 |
+
"#指标函数定义\n",
|
| 142 |
+
"def compute_metrics(eval_pred):\n",
|
| 143 |
+
" predictions, labels = eval_pred\n",
|
| 144 |
+
" predictions = np.argmax(predictions, axis=1)\n",
|
| 145 |
+
" return {'accuracy': (predictions==labels).sum() / len(labels)}\n",
|
| 146 |
+
"\n",
|
| 147 |
+
"\n",
|
| 148 |
+
"\n",
|
| 149 |
+
"training_args = TrainingArguments(\n",
|
| 150 |
+
" output_dir=\"ds_job_dna_2222\",\n",
|
| 151 |
+
" learning_rate=1e-5,\n",
|
| 152 |
+
" lr_scheduler_type=\"constant_with_warmup\",\n",
|
| 153 |
+
" warmup_ratio=0.1,\n",
|
| 154 |
+
" optim='adamw_torch',\n",
|
| 155 |
+
" weight_decay=0.0,\n",
|
| 156 |
+
" seed=seed, # 使用动态生成的随机种子\n",
|
| 157 |
+
" per_device_train_batch_size=64,\n",
|
| 158 |
+
" per_device_eval_batch_size=64,\n",
|
| 159 |
+
" num_train_epochs=4, #训练多少轮\n",
|
| 160 |
+
" evaluation_strategy=\"epoch\",\n",
|
| 161 |
+
" save_strategy=\"epoch\",\n",
|
| 162 |
+
" logging_strategy=\"epoch\",\n",
|
| 163 |
+
" load_best_model_at_end=True\n",
|
| 164 |
+
")\n",
|
| 165 |
+
"\n",
|
| 166 |
+
"#模型定义,文本分类模型\n",
|
| 167 |
+
"model = AutoModelForSequenceClassification.from_pretrained(\"dnagpt/gpt2_gene_multi_v2\", num_labels=2)\n",
|
| 168 |
+
"model.config.pad_token_id = model.config.eos_token_id\n",
|
| 169 |
+
"\n",
|
| 170 |
+
"trainer = Trainer(\n",
|
| 171 |
+
" model,\n",
|
| 172 |
+
" training_args,\n",
|
| 173 |
+
" train_dataset=tokenized_datasets[\"train\"],\n",
|
| 174 |
+
" eval_dataset=tokenized_datasets[\"validation\"],\n",
|
| 175 |
+
" data_collator=data_collator,\n",
|
| 176 |
+
" tokenizer=tokenizer,\n",
|
| 177 |
+
" compute_metrics=compute_metrics,\n",
|
| 178 |
+
")"
|
| 179 |
+
]
|
| 180 |
+
},
|
| 181 |
+
{
|
| 182 |
+
"cell_type": "code",
|
| 183 |
+
"execution_count": 6,
|
| 184 |
+
"id": "06f4ea9e-dbac-405e-8875-aea569c708cf",
|
| 185 |
+
"metadata": {},
|
| 186 |
+
"outputs": [
|
| 187 |
+
{
|
| 188 |
+
"name": "stdout",
|
| 189 |
+
"output_type": "stream",
|
| 190 |
+
"text": [
|
| 191 |
+
"[2025-02-09 16:27:34,332] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n"
|
| 192 |
+
]
|
| 193 |
+
},
|
| 194 |
+
{
|
| 195 |
+
"name": "stderr",
|
| 196 |
+
"output_type": "stream",
|
| 197 |
+
"text": [
|
| 198 |
+
"/root/miniconda3/compiler_compat/ld: cannot find -laio: No such file or directory\n",
|
| 199 |
+
"collect2: error: ld returned 1 exit status\n",
|
| 200 |
+
"/root/miniconda3/compiler_compat/ld: warning: libpthread.so.0, needed by /usr/local/cuda/lib64/libcufile.so, not found (try using -rpath or -rpath-link)\n",
|
| 201 |
+
"/root/miniconda3/compiler_compat/ld: warning: libstdc++.so.6, needed by /usr/local/cuda/lib64/libcufile.so, not found (try using -rpath or -rpath-link)\n",
|
| 202 |
+
"/root/miniconda3/compiler_compat/ld: warning: libm.so.6, needed by /usr/local/cuda/lib64/libcufile.so, not found (try using -rpath or -rpath-link)\n",
|
| 203 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::runtime_error::~runtime_error()@GLIBCXX_3.4'\n",
|
| 204 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__gxx_personality_v0@CXXABI_1.3'\n",
|
| 205 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::tellp()@GLIBCXX_3.4'\n",
|
| 206 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::chrono::_V2::steady_clock::now()@GLIBCXX_3.4.19'\n",
|
| 207 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_M_replace_aux(unsigned long, unsigned long, unsigned long, char)@GLIBCXX_3.4'\n",
|
| 208 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for bool@CXXABI_1.3'\n",
|
| 209 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_logic_error(char const*)@GLIBCXX_3.4'\n",
|
| 210 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_ostringstream<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
|
| 211 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::logic_error@GLIBCXX_3.4'\n",
|
| 212 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::locale::~locale()@GLIBCXX_3.4'\n",
|
| 213 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<char, std::char_traits<char>, std::allocator<char> >::basic_string(std::string const&, unsigned long, unsigned long)@GLIBCXX_3.4'\n",
|
| 214 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_end_catch@CXXABI_1.3'\n",
|
| 215 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_ofstream<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
|
| 216 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::logic_error::~logic_error()@GLIBCXX_3.4'\n",
|
| 217 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for __cxxabiv1::__si_class_type_info@CXXABI_1.3'\n",
|
| 218 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ios<char, std::char_traits<char> >::_M_cache_locale(std::locale const&)@GLIBCXX_3.4'\n",
|
| 219 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_stringstream<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
|
| 220 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `operator new[](unsigned long)@GLIBCXX_3.4'\n",
|
| 221 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_M_leak_hard()@GLIBCXX_3.4'\n",
|
| 222 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_ifstream<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
|
| 223 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_streambuf<wchar_t, std::char_traits<wchar_t> >::basic_streambuf(std::basic_streambuf<wchar_t, std::char_traits<wchar_t> > const&)@GLIBCXX_3.4'\n",
|
| 224 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::append(char const*, unsigned long)@GLIBCXX_3.4'\n",
|
| 225 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<char, std::char_traits<char>, std::allocator<char> >::basic_string(std::string const&)@GLIBCXX_3.4'\n",
|
| 226 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for unsigned short@CXXABI_1.3'\n",
|
| 227 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::resize(unsigned long, char)@GLIBCXX_3.4'\n",
|
| 228 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for char const*@CXXABI_1.3'\n",
|
| 229 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ctype<char>::_M_widen_init() const@GLIBCXX_3.4.11'\n",
|
| 230 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_invalid_argument(char const*)@GLIBCXX_3.4'\n",
|
| 231 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::locale::operator=(std::locale const&)@GLIBCXX_3.4'\n",
|
| 232 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ios<wchar_t, std::char_traits<wchar_t> >::_M_cache_locale(std::locale const&)@GLIBCXX_3.4'\n",
|
| 233 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Rb_tree_decrement(std::_Rb_tree_node_base const*)@GLIBCXX_3.4'\n",
|
| 234 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_free_exception@CXXABI_1.3'\n",
|
| 235 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::condition_variable::notify_one()@GLIBCXX_3.4.11'\n",
|
| 236 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ios_base::Init::~Init()@GLIBCXX_3.4'\n",
|
| 237 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<char, std::char_traits<char>, std::allocator<char> >::~basic_string()@GLIBCXX_3.4'\n",
|
| 238 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_pure_virtual@CXXABI_1.3'\n",
|
| 239 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::flush()@GLIBCXX_3.4'\n",
|
| 240 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for __cxxabiv1::__class_type_info@CXXABI_1.3'\n",
|
| 241 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_rethrow@CXXABI_1.3'\n",
|
| 242 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_stringbuf<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
|
| 243 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_fstream<char, std::char_traits<char> >::~basic_fstream()@GLIBCXX_3.4'\n",
|
| 244 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::compare(char const*) const@GLIBCXX_3.4'\n",
|
| 245 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_ostringstream<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
|
| 246 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::locale::locale()@GLIBCXX_3.4'\n",
|
| 247 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::chrono::_V2::system_clock::now()@GLIBCXX_3.4.19'\n",
|
| 248 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_ifstream<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
|
| 249 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Hash_bytes(void const*, unsigned long, unsigned long)@CXXABI_1.3.5'\n",
|
| 250 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<long long>(long long)@GLIBCXX_3.4.9'\n",
|
| 251 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for char*@CXXABI_1.3'\n",
|
| 252 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__detail::_Prime_rehash_policy::_M_need_rehash(unsigned long, unsigned long, unsigned long) const@GLIBCXX_3.4.18'\n",
|
| 253 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::out_of_range@GLIBCXX_3.4'\n",
|
| 254 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<unsigned long>(unsigned long)@GLIBCXX_3.4.9'\n",
|
| 255 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Rb_tree_increment(std::_Rb_tree_node_base const*)@GLIBCXX_3.4'\n",
|
| 256 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ios_base::~ios_base()@GLIBCXX_3.4'\n",
|
| 257 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::range_error::~range_error()@GLIBCXX_3.4'\n",
|
| 258 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__basic_file<char>::~__basic_file()@GLIBCXX_3.4'\n",
|
| 259 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_guard_acquire@CXXABI_1.3'\n",
|
| 260 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<bool>(bool)@GLIBCXX_3.4.9'\n",
|
| 261 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::overflow_error@GLIBCXX_3.4'\n",
|
| 262 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_fstream<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
|
| 263 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::range_error@GLIBCXX_3.4'\n",
|
| 264 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_ios<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
|
| 265 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_filebuf<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
|
| 266 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `operator delete[](void*)@GLIBCXX_3.4'\n",
|
| 267 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_stringstream<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
|
| 268 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<char, std::char_traits<char>, std::allocator<char> >::basic_string(unsigned long, char, std::allocator<char> const&)@GLIBCXX_3.4'\n",
|
| 269 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__detail::_List_node_base::_M_transfer(std::__detail::_List_node_base*, std::__detail::_List_node_base*)@GLIBCXX_3.4.15'\n",
|
| 270 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::replace(unsigned long, unsigned long, char const*, unsigned long)@GLIBCXX_3.4'\n",
|
| 271 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for std::exception@GLIBCXX_3.4'\n",
|
| 272 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >::_Rep::_M_destroy(std::allocator<wchar_t> const&)@GLIBCXX_3.4'\n",
|
| 273 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::istream& std::istream::_M_extract<double>(double&)@GLIBCXX_3.4.9'\n",
|
| 274 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_filebuf<char, std::char_traits<char> >::close()@GLIBCXX_3.4'\n",
|
| 275 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_fstream<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
|
| 276 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ifstream<char, std::char_traits<char> >::basic_ifstream(char const*, std::_Ios_Openmode)@GLIBCXX_3.4'\n",
|
| 277 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::append(std::string const&)@GLIBCXX_3.4'\n",
|
| 278 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `operator new(unsigned long)@GLIBCXX_3.4'\n",
|
| 279 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_istringstream<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
|
| 280 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for unsigned int@CXXABI_1.3'\n",
|
| 281 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::append(char const*)@GLIBCXX_3.4'\n",
|
| 282 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::domain_error@GLIBCXX_3.4'\n",
|
| 283 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::find(char, unsigned long) const@GLIBCXX_3.4'\n",
|
| 284 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::put(char)@GLIBCXX_3.4'\n",
|
| 285 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for int@CXXABI_1.3'\n",
|
| 286 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_bad_alloc()@GLIBCXX_3.4'\n",
|
| 287 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_thread_atexit@CXXABI_1.3.7'\n",
|
| 288 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Rb_tree_increment(std::_Rb_tree_node_base*)@GLIBCXX_3.4'\n",
|
| 289 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ifstream<char, std::char_traits<char> >::~basic_ifstream()@GLIBCXX_3.4'\n",
|
| 290 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ios_base::Init::Init()@GLIBCXX_3.4'\n",
|
| 291 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::condition_variable::condition_variable()@GLIBCXX_3.4.11'\n",
|
| 292 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_filebuf<char, std::char_traits<char> >::basic_filebuf()@GLIBCXX_3.4'\n",
|
| 293 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_istringstream<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
|
| 294 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::domain_error::~domain_error()@GLIBCXX_3.4'\n",
|
| 295 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::cerr@GLIBCXX_3.4'\n",
|
| 296 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::find(char const*, unsigned long, unsigned long) const@GLIBCXX_3.4'\n",
|
| 297 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_istringstream<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
|
| 298 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<char, std::char_traits<char>, std::allocator<char> >::basic_string(std::allocator<char> const&)@GLIBCXX_3.4'\n",
|
| 299 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_stringbuf<char, std::char_traits<char>, std::allocator<char> >::str() const@GLIBCXX_3.4'\n",
|
| 300 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::invalid_argument@GLIBCXX_3.4'\n",
|
| 301 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for void*@CXXABI_1.3'\n",
|
| 302 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::assign(std::string const&)@GLIBCXX_3.4'\n",
|
| 303 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ostringstream<char, std::char_traits<char>, std::allocator<char> >::~basic_ostringstream()@GLIBCXX_3.4'\n",
|
| 304 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Rb_tree_rebalance_for_erase(std::_Rb_tree_node_base*, std::_Rb_tree_node_base&)@GLIBCXX_3.4'\n",
|
| 305 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for unsigned long@CXXABI_1.3'\n",
|
| 306 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__detail::_List_node_base::_M_hook(std::__detail::_List_node_base*)@GLIBCXX_3.4.15'\n",
|
| 307 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__detail::_List_node_base::_M_unhook()@GLIBCXX_3.4.15'\n",
|
| 308 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_ostringstream<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
|
| 309 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_stringbuf<char, std::char_traits<char>, std::allocator<char> >::_M_sync(char*, unsigned long, unsigned long)@GLIBCXX_3.4'\n",
|
| 310 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_iostream<char, std::char_traits<char> >::~basic_iostream()@GLIBCXX_3.4'\n",
|
| 311 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::locale::locale(std::locale const&)@GLIBCXX_3.4'\n",
|
| 312 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_istringstream<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
|
| 313 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `log2f@GLIBC_2.2.5'\n",
|
| 314 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::operator<<(std::basic_streambuf<char, std::char_traits<char> >*)@GLIBCXX_3.4'\n",
|
| 315 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_streambuf<wchar_t, std::char_traits<wchar_t> >@GLIBCXX_3.4'\n",
|
| 316 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::exception::~exception()@GLIBCXX_3.4'\n",
|
| 317 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_Rep::_S_create(unsigned long, unsigned long, std::allocator<char> const&)@GLIBCXX_3.4'\n",
|
| 318 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__basic_file<char>::is_open() const@GLIBCXX_3.4'\n",
|
| 319 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_istringstream<char, std::char_traits<char>, std::allocator<char> >::~basic_istringstream()@GLIBCXX_3.4'\n",
|
| 320 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::swap(std::string&)@GLIBCXX_3.4'\n",
|
| 321 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for unsigned long*@CXXABI_1.3'\n",
|
| 322 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_ostringstream<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
|
| 323 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_streambuf<char, std::char_traits<char> >::basic_streambuf(std::basic_streambuf<char, std::char_traits<char> > const&)@GLIBCXX_3.4'\n",
|
| 324 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ios<char, std::char_traits<char> >::init(std::basic_streambuf<char, std::char_traits<char> >*)@GLIBCXX_3.4'\n",
|
| 325 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_bad_cast()@GLIBCXX_3.4'\n",
|
| 326 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ios<char, std::char_traits<char> >::clear(std::_Ios_Iostate)@GLIBCXX_3.4'\n",
|
| 327 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_streambuf<wchar_t, std::char_traits<wchar_t> >::operator=(std::basic_streambuf<wchar_t, std::char_traits<wchar_t> > const&)@GLIBCXX_3.4'\n",
|
| 328 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `operator delete(void*)@GLIBCXX_3.4'\n",
|
| 329 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::operator<<(int)@GLIBCXX_3.4'\n",
|
| 330 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_Rep::_S_empty_rep_storage@GLIBCXX_3.4'\n",
|
| 331 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_Rep::_M_destroy(std::allocator<char> const&)@GLIBCXX_3.4'\n",
|
| 332 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_iostream<wchar_t, std::char_traits<wchar_t> >::~basic_iostream()@GLIBCXX_3.4'\n",
|
| 333 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::runtime_error@GLIBCXX_3.4'\n",
|
| 334 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_ofstream<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
|
| 335 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Rb_tree_insert_and_rebalance(bool, std::_Rb_tree_node_base*, std::_Rb_tree_node_base*, std::_Rb_tree_node_base&)@GLIBCXX_3.4'\n",
|
| 336 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_stringstream<char, std::char_traits<char>, std::allocator<char> >::~basic_stringstream()@GLIBCXX_3.4'\n",
|
| 337 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_stringstream<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
|
| 338 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<long>(long)@GLIBCXX_3.4.9'\n",
|
| 339 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::istream::get()@GLIBCXX_3.4'\n",
|
| 340 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for unsigned long long@CXXABI_1.3'\n",
|
| 341 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ostream<char, std::char_traits<char> >& std::operator<< <std::char_traits<char> >(std::basic_ostream<char, std::char_traits<char> >&, char const*)@GLIBCXX_3.4'\n",
|
| 342 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::out_of_range::~out_of_range()@GLIBCXX_3.4'\n",
|
| 343 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::length_error::~length_error()@GLIBCXX_3.4'\n",
|
| 344 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ostream<char, std::char_traits<char> >& std::__ostream_insert<char, std::char_traits<char> >(std::basic_ostream<char, std::char_traits<char> >&, char const*, long)@GLIBCXX_3.4.9'\n",
|
| 345 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::invalid_argument::~invalid_argument()@GLIBCXX_3.4'\n",
|
| 346 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >::swap(std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >&)@GLIBCXX_3.4'\n",
|
| 347 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::cout@GLIBCXX_3.4'\n",
|
| 348 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<unsigned long long>(unsigned long long)@GLIBCXX_3.4.9'\n",
|
| 349 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for int*@CXXABI_1.3'\n",
|
| 350 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<void const*>(void const*)@GLIBCXX_3.4.9'\n",
|
| 351 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::underflow_error@GLIBCXX_3.4'\n",
|
| 352 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_streambuf<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
|
| 353 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for std::out_of_range@GLIBCXX_3.4'\n",
|
| 354 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_allocate_exception@CXXABI_1.3'\n",
|
| 355 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_ios<wchar_t, std::char_traits<wchar_t> >@GLIBCXX_3.4'\n",
|
| 356 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for void const*@CXXABI_1.3'\n",
|
| 357 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ios<wchar_t, std::char_traits<wchar_t> >::init(std::basic_streambuf<wchar_t, std::char_traits<wchar_t> >*)@GLIBCXX_3.4'\n",
|
| 358 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::reserve(unsigned long)@GLIBCXX_3.4'\n",
|
| 359 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_begin_catch@CXXABI_1.3'\n",
|
| 360 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for long@CXXABI_1.3'\n",
|
| 361 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >::_Rep::_S_empty_rep_storage@GLIBCXX_3.4'\n",
|
| 362 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_M_leak()@GLIBCXX_3.4'\n",
|
| 363 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_filebuf<char, std::char_traits<char> >::open(char const*, std::_Ios_Openmode)@GLIBCXX_3.4'\n",
|
| 364 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_stringbuf<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >::_M_sync(wchar_t*, unsigned long, unsigned long)@GLIBCXX_3.4'\n",
|
| 365 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::istream::getline(char*, long, char)@GLIBCXX_3.4'\n",
|
| 366 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_istream<char, std::char_traits<char> >& std::getline<char, std::char_traits<char>, std::allocator<char> >(std::basic_istream<char, std::char_traits<char> >&, std::basic_string<char, std::char_traits<char>, std::allocator<char> >&, char)@GLIBCXX_3.4'\n",
|
| 367 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_stringstream<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
|
| 368 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::condition_variable::~condition_variable()@GLIBCXX_3.4.11'\n",
|
| 369 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_stringbuf<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
|
| 370 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::insert(unsigned long, char const*, unsigned long)@GLIBCXX_3.4'\n",
|
| 371 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::assign(char const*, unsigned long)@GLIBCXX_3.4'\n",
|
| 372 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for unsigned char@CXXABI_1.3'\n",
|
| 373 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ios_base::ios_base()@GLIBCXX_3.4'\n",
|
| 374 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_out_of_range(char const*)@GLIBCXX_3.4'\n",
|
| 375 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::overflow_error::~overflow_error()@GLIBCXX_3.4'\n",
|
| 376 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_length_error(char const*)@GLIBCXX_3.4'\n",
|
| 377 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_system_error(int)@GLIBCXX_3.4.11'\n",
|
| 378 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ofstream<char, std::char_traits<char> >::close()@GLIBCXX_3.4'\n",
|
| 379 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<double>(double)@GLIBCXX_3.4.9'\n",
|
| 380 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_streambuf<char, std::char_traits<char> >::operator=(std::basic_streambuf<char, std::char_traits<char> > const&)@GLIBCXX_3.4'\n",
|
| 381 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for long long@CXXABI_1.3'\n",
|
| 382 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<char, std::char_traits<char>, std::allocator<char> >::basic_string(char const*, unsigned long, std::allocator<char> const&)@GLIBCXX_3.4'\n",
|
| 383 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ifstream<char, std::char_traits<char> >::close()@GLIBCXX_3.4'\n",
|
| 384 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_guard_release@CXXABI_1.3'\n",
|
| 385 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_throw@CXXABI_1.3'\n",
|
| 386 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::underflow_error::~underflow_error()@GLIBCXX_3.4'\n",
|
| 387 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Rb_tree_decrement(std::_Rb_tree_node_base*)@GLIBCXX_3.4'\n",
|
| 388 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::length_error@GLIBCXX_3.4'\n",
|
| 389 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_filebuf<char, std::char_traits<char> >::~basic_filebuf()@GLIBCXX_3.4'\n",
|
| 390 |
+
"collect2: error: ld returned 1 exit status\n"
|
| 391 |
+
]
|
| 392 |
+
},
|
| 393 |
+
{
|
| 394 |
+
"data": {
|
| 395 |
+
"text/html": [
|
| 396 |
+
"\n",
|
| 397 |
+
" <div>\n",
|
| 398 |
+
" \n",
|
| 399 |
+
" <progress value='3088' max='3088' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
|
| 400 |
+
" [3088/3088 22:00, Epoch 4/4]\n",
|
| 401 |
+
" </div>\n",
|
| 402 |
+
" <table border=\"1\" class=\"dataframe\">\n",
|
| 403 |
+
" <thead>\n",
|
| 404 |
+
" <tr style=\"text-align: left;\">\n",
|
| 405 |
+
" <th>Epoch</th>\n",
|
| 406 |
+
" <th>Training Loss</th>\n",
|
| 407 |
+
" <th>Validation Loss</th>\n",
|
| 408 |
+
" <th>Accuracy</th>\n",
|
| 409 |
+
" </tr>\n",
|
| 410 |
+
" </thead>\n",
|
| 411 |
+
" <tbody>\n",
|
| 412 |
+
" <tr>\n",
|
| 413 |
+
" <td>1</td>\n",
|
| 414 |
+
" <td>0.649400</td>\n",
|
| 415 |
+
" <td>0.484690</td>\n",
|
| 416 |
+
" <td>0.771000</td>\n",
|
| 417 |
+
" </tr>\n",
|
| 418 |
+
" <tr>\n",
|
| 419 |
+
" <td>2</td>\n",
|
| 420 |
+
" <td>0.402200</td>\n",
|
| 421 |
+
" <td>0.359919</td>\n",
|
| 422 |
+
" <td>0.849500</td>\n",
|
| 423 |
+
" </tr>\n",
|
| 424 |
+
" <tr>\n",
|
| 425 |
+
" <td>3</td>\n",
|
| 426 |
+
" <td>0.262500</td>\n",
|
| 427 |
+
" <td>0.431527</td>\n",
|
| 428 |
+
" <td>0.846500</td>\n",
|
| 429 |
+
" </tr>\n",
|
| 430 |
+
" <tr>\n",
|
| 431 |
+
" <td>4</td>\n",
|
| 432 |
+
" <td>0.194800</td>\n",
|
| 433 |
+
" <td>0.344590</td>\n",
|
| 434 |
+
" <td>0.868000</td>\n",
|
| 435 |
+
" </tr>\n",
|
| 436 |
+
" </tbody>\n",
|
| 437 |
+
"</table><p>"
|
| 438 |
+
],
|
| 439 |
+
"text/plain": [
|
| 440 |
+
"<IPython.core.display.HTML object>"
|
| 441 |
+
]
|
| 442 |
+
},
|
| 443 |
+
"metadata": {},
|
| 444 |
+
"output_type": "display_data"
|
| 445 |
+
},
|
| 446 |
+
{
|
| 447 |
+
"data": {
|
| 448 |
+
"text/plain": [
|
| 449 |
+
"TrainOutput(global_step=3088, training_loss=0.37721804880725285, metrics={'train_runtime': 1321.4067, 'train_samples_per_second': 149.541, 'train_steps_per_second': 2.337, 'total_flos': 2.5816641551990784e+16, 'train_loss': 0.37721804880725285, 'epoch': 4.0})"
|
| 450 |
+
]
|
| 451 |
+
},
|
| 452 |
+
"execution_count": 6,
|
| 453 |
+
"metadata": {},
|
| 454 |
+
"output_type": "execute_result"
|
| 455 |
+
}
|
| 456 |
+
],
|
| 457 |
+
"source": [
|
| 458 |
+
"trainer.train() #模型训练"
|
| 459 |
+
]
|
| 460 |
+
},
|
| 461 |
+
{
|
| 462 |
+
"cell_type": "code",
|
| 463 |
+
"execution_count": 7,
|
| 464 |
+
"id": "adacc2bb-bda6-4e9f-92fb-54d64c324147",
|
| 465 |
+
"metadata": {},
|
| 466 |
+
"outputs": [
|
| 467 |
+
{
|
| 468 |
+
"data": {
|
| 469 |
+
"application/vnd.jupyter.widget-view+json": {
|
| 470 |
+
"model_id": "459c442921c14438bd4b8059814e82bb",
|
| 471 |
+
"version_major": 2,
|
| 472 |
+
"version_minor": 0
|
| 473 |
+
},
|
| 474 |
+
"text/plain": [
|
| 475 |
+
"Map (num_proc=4): 0%| | 0/3600 [00:00<?, ? examples/s]"
|
| 476 |
+
]
|
| 477 |
+
},
|
| 478 |
+
"metadata": {},
|
| 479 |
+
"output_type": "display_data"
|
| 480 |
+
},
|
| 481 |
+
{
|
| 482 |
+
"data": {
|
| 483 |
+
"application/vnd.jupyter.widget-view+json": {
|
| 484 |
+
"model_id": "557a2495904c43feba126bcc26cae03e",
|
| 485 |
+
"version_major": 2,
|
| 486 |
+
"version_minor": 0
|
| 487 |
+
},
|
| 488 |
+
"text/plain": [
|
| 489 |
+
"Map (num_proc=4): 0%| | 0/400 [00:00<?, ? examples/s]"
|
| 490 |
+
]
|
| 491 |
+
},
|
| 492 |
+
"metadata": {},
|
| 493 |
+
"output_type": "display_data"
|
| 494 |
+
},
|
| 495 |
+
{
|
| 496 |
+
"name": "stderr",
|
| 497 |
+
"output_type": "stream",
|
| 498 |
+
"text": [
|
| 499 |
+
"Predicting: 100%|██████████| 7/7 [00:00<00:00, 7.28it/s]\n"
|
| 500 |
+
]
|
| 501 |
+
}
|
| 502 |
+
],
|
| 503 |
+
"source": [
|
| 504 |
+
"#模型测试 蛋白质数据集,150长度/450bp,复杂版本 不相似\n",
|
| 505 |
+
"raw_datasets_dna_protein = load_dataset('dnagpt/gene_lan_transfer', 'dna_protein_pair')['train'].train_test_split(test_size=0.1) #默认已经shuffle\n",
|
| 506 |
+
"\n",
|
| 507 |
+
"# 定义翻转标签的函数\n",
|
| 508 |
+
"def flip_labels(example):\n",
|
| 509 |
+
"    # sentence1 and sentence2 are kept unchanged (no truncation is applied here); only the label is inverted below\n",
|
| 510 |
+
" example[\"sentence1\"] = example[\"sentence1\"]\n",
|
| 511 |
+
" example[\"sentence2\"] = example[\"sentence2\"]\n",
|
| 512 |
+
" example['label'] = 1 - example['label']\n",
|
| 513 |
+
" return example\n",
|
| 514 |
+
"\n",
|
| 515 |
+
"# 应用翻转标签函数\n",
|
| 516 |
+
"flipped_datasets_dna_protein = raw_datasets_dna_protein.map(flip_labels, batched=False)\n",
|
| 517 |
+
"tokenized_datasets_dna_protein = flipped_datasets_dna_protein.map(tokenize_function, batched=True,num_proc=4)\n",
|
| 518 |
+
"\n",
|
| 519 |
+
"\n",
|
| 520 |
+
"\n",
|
| 521 |
+
"# 确保模型在 GPU 上\n",
|
| 522 |
+
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
|
| 523 |
+
"model.to(device)\n",
|
| 524 |
+
"model.eval() # 进入推理模式,加速推理\n",
|
| 525 |
+
"\n",
|
| 526 |
+
"# 取出测试集数据\n",
|
| 527 |
+
"test_dataset = tokenized_datasets_dna_protein[\"test\"]\n",
|
| 528 |
+
"\n",
|
| 529 |
+
"# 预存预测结果\n",
|
| 530 |
+
"preds = []\n",
|
| 531 |
+
"labels = []\n",
|
| 532 |
+
"\n",
|
| 533 |
+
"# 批量大小(建议 64、128、256 视显存大小调整)\n",
|
| 534 |
+
"batch_size = 64\n",
|
| 535 |
+
"\n",
|
| 536 |
+
"# 直接遍历数据集进行推理\n",
|
| 537 |
+
"for i in tqdm(range(0, len(test_dataset), batch_size), desc=\"Predicting\"):\n",
|
| 538 |
+
" batch = test_dataset[i : i + batch_size]\n",
|
| 539 |
+
" \n",
|
| 540 |
+
" # 转换为 Tensor 并移动到 GPU\n",
|
| 541 |
+
" inputs = {\n",
|
| 542 |
+
" \"input_ids\": torch.tensor(batch[\"input_ids\"]).to(device),\n",
|
| 543 |
+
" \"attention_mask\": torch.tensor(batch[\"attention_mask\"]).to(device),\n",
|
| 544 |
+
" }\n",
|
| 545 |
+
" batch_labels = batch[\"label\"] # 原始标签\n",
|
| 546 |
+
"\n",
|
| 547 |
+
" with torch.no_grad(): # 关闭梯度计算,减少内存占用\n",
|
| 548 |
+
" outputs = model(**inputs)\n",
|
| 549 |
+
" batch_preds = torch.argmax(outputs.logits, axis=-1).cpu().numpy() # 取最大概率的类别\n",
|
| 550 |
+
"\n",
|
| 551 |
+
" preds.extend(batch_preds)\n",
|
| 552 |
+
" labels.extend(batch_labels)\n",
|
| 553 |
+
" \n",
|
| 554 |
+
"metric = evaluate.load(\"glue\", \"mrpc\")\n",
|
| 555 |
+
"ret = metric.compute(predictions=preds, references=labels)\n",
|
| 556 |
+
"\n",
|
| 557 |
+
"\n",
|
| 558 |
+
"result[\"dna_protein_pair_full\"] = ret"
|
| 559 |
+
]
|
| 560 |
+
},
|
| 561 |
+
{
|
| 562 |
+
"cell_type": "code",
|
| 563 |
+
"execution_count": 8,
|
| 564 |
+
"id": "ab2de8ae-c04d-4de7-aef7-25aa66551a0a",
|
| 565 |
+
"metadata": {},
|
| 566 |
+
"outputs": [
|
| 567 |
+
{
|
| 568 |
+
"data": {
|
| 569 |
+
"application/vnd.jupyter.widget-view+json": {
|
| 570 |
+
"model_id": "0f685ff06b6549bf99c52993afff9c6d",
|
| 571 |
+
"version_major": 2,
|
| 572 |
+
"version_minor": 0
|
| 573 |
+
},
|
| 574 |
+
"text/plain": [
|
| 575 |
+
"Map (num_proc=4): 0%| | 0/8000 [00:00<?, ? examples/s]"
|
| 576 |
+
]
|
| 577 |
+
},
|
| 578 |
+
"metadata": {},
|
| 579 |
+
"output_type": "display_data"
|
| 580 |
+
},
|
| 581 |
+
{
|
| 582 |
+
"data": {
|
| 583 |
+
"application/vnd.jupyter.widget-view+json": {
|
| 584 |
+
"model_id": "e2fddae113df4960ae02ab4801dde622",
|
| 585 |
+
"version_major": 2,
|
| 586 |
+
"version_minor": 0
|
| 587 |
+
},
|
| 588 |
+
"text/plain": [
|
| 589 |
+
"Map (num_proc=4): 0%| | 0/8000 [00:00<?, ? examples/s]"
|
| 590 |
+
]
|
| 591 |
+
},
|
| 592 |
+
"metadata": {},
|
| 593 |
+
"output_type": "display_data"
|
| 594 |
+
},
|
| 595 |
+
{
|
| 596 |
+
"name": "stderr",
|
| 597 |
+
"output_type": "stream",
|
| 598 |
+
"text": [
|
| 599 |
+
"Predicting: 100%|██████████| 125/125 [00:18<00:00, 6.63it/s]\n"
|
| 600 |
+
]
|
| 601 |
+
}
|
| 602 |
+
],
|
| 603 |
+
"source": [
|
| 604 |
+
"#############################################################\n",
|
| 605 |
+
"#模型测试 蛋白质数据集,随机版本\n",
|
| 606 |
+
"raw_datasets_dna_protein = load_dataset('dnagpt/gene_lan_transfer', 'dna_protein_pair_rand')['train'].train_test_split(test_size=0.5) #默认已经shuffle\n",
|
| 607 |
+
"\n",
|
| 608 |
+
"# 定义翻转标签的函数\n",
|
| 609 |
+
"def flip_labels(example):\n",
|
| 610 |
+
"    # sentence1 and sentence2 are kept unchanged (no truncation is applied here); only the label is inverted below\n",
|
| 611 |
+
" example[\"sentence1\"] = example[\"sentence1\"]\n",
|
| 612 |
+
" example[\"sentence2\"] = example[\"sentence2\"]\n",
|
| 613 |
+
" example['label'] = 1 - example['label']\n",
|
| 614 |
+
" return example\n",
|
| 615 |
+
"\n",
|
| 616 |
+
"# 应用翻转标签函数\n",
|
| 617 |
+
"flipped_datasets_dna_protein = raw_datasets_dna_protein.map(flip_labels, batched=False)\n",
|
| 618 |
+
"tokenized_datasets_dna_protein = flipped_datasets_dna_protein.map(tokenize_function, batched=True,num_proc=4)\n",
|
| 619 |
+
"\n",
|
| 620 |
+
"\n",
|
| 621 |
+
"# 确保模型在 GPU 上\n",
|
| 622 |
+
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
|
| 623 |
+
"model.to(device)\n",
|
| 624 |
+
"model.eval() # 进入推理模式,加速推理\n",
|
| 625 |
+
"\n",
|
| 626 |
+
"# 取出测试集数据\n",
|
| 627 |
+
"test_dataset = tokenized_datasets_dna_protein[\"test\"]\n",
|
| 628 |
+
"\n",
|
| 629 |
+
"# 预存预测结果\n",
|
| 630 |
+
"preds = []\n",
|
| 631 |
+
"labels = []\n",
|
| 632 |
+
"\n",
|
| 633 |
+
"# 批量大小(建议 64、128、256 视显存大小调整)\n",
|
| 634 |
+
"batch_size = 64\n",
|
| 635 |
+
"\n",
|
| 636 |
+
"# 直接遍历数据集进行推理\n",
|
| 637 |
+
"for i in tqdm(range(0, len(test_dataset), batch_size), desc=\"Predicting\"):\n",
|
| 638 |
+
" batch = test_dataset[i : i + batch_size]\n",
|
| 639 |
+
" \n",
|
| 640 |
+
" # 转换为 Tensor 并移动到 GPU\n",
|
| 641 |
+
" inputs = {\n",
|
| 642 |
+
" \"input_ids\": torch.tensor(batch[\"input_ids\"]).to(device),\n",
|
| 643 |
+
" \"attention_mask\": torch.tensor(batch[\"attention_mask\"]).to(device),\n",
|
| 644 |
+
" }\n",
|
| 645 |
+
" batch_labels = batch[\"label\"] # 原始标签\n",
|
| 646 |
+
"\n",
|
| 647 |
+
" with torch.no_grad(): # 关闭梯度计算,减少内存占用\n",
|
| 648 |
+
" outputs = model(**inputs)\n",
|
| 649 |
+
" batch_preds = torch.argmax(outputs.logits, axis=-1).cpu().numpy() # 取最大概率的类别\n",
|
| 650 |
+
"\n",
|
| 651 |
+
" preds.extend(batch_preds)\n",
|
| 652 |
+
" labels.extend(batch_labels)\n",
|
| 653 |
+
"metric = evaluate.load(\"glue\", \"mrpc\")\n",
|
| 654 |
+
"ret = metric.compute(predictions=preds, references=labels)\n",
|
| 655 |
+
"\n",
|
| 656 |
+
"result[\"dna_protein_pair_rand_full\"] = ret"
|
| 657 |
+
]
|
| 658 |
+
},
|
| 659 |
+
{
|
| 660 |
+
"cell_type": "code",
|
| 661 |
+
"execution_count": 9,
|
| 662 |
+
"id": "b12be26b-8f77-4b90-8aef-52ecd4ca242e",
|
| 663 |
+
"metadata": {},
|
| 664 |
+
"outputs": [
|
| 665 |
+
{
|
| 666 |
+
"data": {
|
| 667 |
+
"image/png": "",
|
| 668 |
+
"text/plain": [
|
| 669 |
+
"<Figure size 640x480 with 2 Axes>"
|
| 670 |
+
]
|
| 671 |
+
},
|
| 672 |
+
"metadata": {},
|
| 673 |
+
"output_type": "display_data"
|
| 674 |
+
}
|
| 675 |
+
],
|
| 676 |
+
"source": [
|
| 677 |
+
"from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay\n",
|
| 678 |
+
"import matplotlib.pyplot as plt\n",
|
| 679 |
+
"\n",
|
| 680 |
+
"# labels holds the reference labels and preds the model predictions collected by the most recently run evaluation loop\n",
|
| 681 |
+
"cm = confusion_matrix(labels, preds)\n",
|
| 682 |
+
"\n",
|
| 683 |
+
"# 可视化混淆矩阵\n",
|
| 684 |
+
"disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Class 0', 'Class 1'])\n",
|
| 685 |
+
"disp.plot(cmap=plt.cm.Blues)\n",
|
| 686 |
+
"plt.title('Confusion Matrix')\n",
|
| 687 |
+
"plt.show()"
|
| 688 |
+
]
|
| 689 |
+
},
|
| 690 |
+
{
|
| 691 |
+
"cell_type": "code",
|
| 692 |
+
"execution_count": 10,
|
| 693 |
+
"id": "b6fc8ee6-89d3-4d32-b67d-2980a0be79cb",
|
| 694 |
+
"metadata": {},
|
| 695 |
+
"outputs": [
|
| 696 |
+
{
|
| 697 |
+
"name": "stdout",
|
| 698 |
+
"output_type": "stream",
|
| 699 |
+
"text": [
|
| 700 |
+
"{\"seed\": 2621, \"dna_protein_pair_full\": {\"accuracy\": 0.41, \"f1\": 0.4326923076923077}, \"dna_protein_pair_rand_full\": {\"accuracy\": 0.73725, \"f1\": 0.757050392972723}}\n"
|
| 701 |
+
]
|
| 702 |
+
}
|
| 703 |
+
],
|
| 704 |
+
"source": [
|
| 705 |
+
"print(json.dumps(result))"
|
| 706 |
+
]
|
| 707 |
+
},
|
| 708 |
+
{
|
| 709 |
+
"cell_type": "code",
|
| 710 |
+
"execution_count": 11,
|
| 711 |
+
"id": "f7db7f38-a3f1-45e8-bf92-af50b7b67ece",
|
| 712 |
+
"metadata": {},
|
| 713 |
+
"outputs": [],
|
| 714 |
+
"source": [
|
| 715 |
+
"model.save_pretrained(\"gpt2_gene_multi_v2_ft\")"
|
| 716 |
+
]
|
| 717 |
+
},
|
| 718 |
+
{
|
| 719 |
+
"cell_type": "code",
|
| 720 |
+
"execution_count": 12,
|
| 721 |
+
"id": "c34d5df5-7baf-4a05-ba25-7d3b99856d8f",
|
| 722 |
+
"metadata": {},
|
| 723 |
+
"outputs": [
|
| 724 |
+
{
|
| 725 |
+
"data": {
|
| 726 |
+
"text/plain": [
|
| 727 |
+
"('gpt2_gene_multi_v2_ft/tokenizer_config.json',\n",
|
| 728 |
+
" 'gpt2_gene_multi_v2_ft/special_tokens_map.json',\n",
|
| 729 |
+
" 'gpt2_gene_multi_v2_ft/tokenizer.json')"
|
| 730 |
+
]
|
| 731 |
+
},
|
| 732 |
+
"execution_count": 12,
|
| 733 |
+
"metadata": {},
|
| 734 |
+
"output_type": "execute_result"
|
| 735 |
+
}
|
| 736 |
+
],
|
| 737 |
+
"source": [
|
| 738 |
+
"tokenizer.save_pretrained(\"gpt2_gene_multi_v2_ft\")"
|
| 739 |
+
]
|
| 740 |
+
},
|
| 741 |
+
{
|
| 742 |
+
"cell_type": "code",
|
| 743 |
+
"execution_count": null,
|
| 744 |
+
"id": "15feb59b-73c1-462d-b9ed-ceb607a38f6f",
|
| 745 |
+
"metadata": {},
|
| 746 |
+
"outputs": [],
|
| 747 |
+
"source": []
|
| 748 |
+
}
|
| 749 |
+
],
|
| 750 |
+
"metadata": {
|
| 751 |
+
"kernelspec": {
|
| 752 |
+
"display_name": "Python 3 (ipykernel)",
|
| 753 |
+
"language": "python",
|
| 754 |
+
"name": "python3"
|
| 755 |
+
},
|
| 756 |
+
"language_info": {
|
| 757 |
+
"codemirror_mode": {
|
| 758 |
+
"name": "ipython",
|
| 759 |
+
"version": 3
|
| 760 |
+
},
|
| 761 |
+
"file_extension": ".py",
|
| 762 |
+
"mimetype": "text/x-python",
|
| 763 |
+
"name": "python",
|
| 764 |
+
"nbconvert_exporter": "python",
|
| 765 |
+
"pygments_lexer": "ipython3",
|
| 766 |
+
"version": "3.12.3"
|
| 767 |
+
}
|
| 768 |
+
},
|
| 769 |
+
"nbformat": 4,
|
| 770 |
+
"nbformat_minor": 5
|
| 771 |
+
}
|
best_model/vect_sim_gpt2_gene_en_ft_dna_protein_pair_test_others.ipynb
ADDED
|
@@ -0,0 +1,468 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 1,
|
| 6 |
+
"id": "c0314d29-7fc8-4c11-8bfc-5440a442629a",
|
| 7 |
+
"metadata": {},
|
| 8 |
+
"outputs": [
|
| 9 |
+
{
|
| 10 |
+
"data": {
|
| 11 |
+
"text/plain": [
|
| 12 |
+
"\"\\nimport os\\n\\n# 设置环境变量\\nos.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'\\n\\n# 打印环境变量以确认设置成功\\nprint(os.environ.get('HF_ENDPOINT')\\n\""
|
| 13 |
+
]
|
| 14 |
+
},
|
| 15 |
+
"execution_count": 1,
|
| 16 |
+
"metadata": {},
|
| 17 |
+
"output_type": "execute_result"
|
| 18 |
+
}
|
| 19 |
+
],
|
| 20 |
+
"source": [
|
| 21 |
+
"import subprocess\n",
|
| 22 |
+
"import os\n",
|
| 23 |
+
"\n",
|
| 24 |
+
"result = subprocess.run('bash -c \"source /etc/network_turbo && env | grep proxy\"', shell=True, capture_output=True, text=True)\n",
|
| 25 |
+
"output = result.stdout\n",
|
| 26 |
+
"for line in output.splitlines():\n",
|
| 27 |
+
" if '=' in line:\n",
|
| 28 |
+
" var, value = line.split('=', 1)\n",
|
| 29 |
+
" os.environ[var] = value\n",
|
| 30 |
+
"\n",
|
| 31 |
+
"'''\n",
|
| 32 |
+
"import os\n",
|
| 33 |
+
"\n",
|
| 34 |
+
"# 设置环境变量\n",
|
| 35 |
+
"os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'\n",
|
| 36 |
+
"\n",
|
| 37 |
+
"# 打印环境变量以确认设置成功\n",
|
| 38 |
+
"print(os.environ.get('HF_ENDPOINT')\n",
|
| 39 |
+
"'''"
|
| 40 |
+
]
|
| 41 |
+
},
|
| 42 |
+
{
|
| 43 |
+
"cell_type": "code",
|
| 44 |
+
"execution_count": 2,
|
| 45 |
+
"id": "1a09a2f5-dda6-4d86-badd-60bf2fda3983",
|
| 46 |
+
"metadata": {},
|
| 47 |
+
"outputs": [
|
| 48 |
+
{
|
| 49 |
+
"name": "stderr",
|
| 50 |
+
"output_type": "stream",
|
| 51 |
+
"text": [
|
| 52 |
+
"2025-02-10 00:53:51.873159: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
|
| 53 |
+
"2025-02-10 00:53:51.888142: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
|
| 54 |
+
"2025-02-10 00:53:51.903163: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
|
| 55 |
+
"2025-02-10 00:53:51.907736: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
|
| 56 |
+
"2025-02-10 00:53:51.920017: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
|
| 57 |
+
"To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
|
| 58 |
+
"2025-02-10 00:53:52.889296: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n"
|
| 59 |
+
]
|
| 60 |
+
}
|
| 61 |
+
],
|
| 62 |
+
"source": [
|
| 63 |
+
"from transformers import GPT2Tokenizer, GPT2Model,AutoModel\n",
|
| 64 |
+
"import torch\n",
|
| 65 |
+
"from transformers import AutoTokenizer, DataCollatorWithPadding\n",
|
| 66 |
+
"from datasets import load_dataset\n",
|
| 67 |
+
"from transformers import AutoTokenizer, DataCollatorWithPadding\n",
|
| 68 |
+
"from transformers import Trainer\n",
|
| 69 |
+
"import evaluate\n",
|
| 70 |
+
"import numpy as np\n",
|
| 71 |
+
"from transformers import TrainingArguments\n",
|
| 72 |
+
"from transformers import AutoModelForSequenceClassification"
|
| 73 |
+
]
|
| 74 |
+
},
|
| 75 |
+
{
|
| 76 |
+
"cell_type": "code",
|
| 77 |
+
"execution_count": 3,
|
| 78 |
+
"id": "7c5ab8f5-b8a9-4930-ac30-49d0bf26d579",
|
| 79 |
+
"metadata": {},
|
| 80 |
+
"outputs": [],
|
| 81 |
+
"source": [
|
| 82 |
+
"model_name=\"gpt2_dna_ft_5\"\n",
|
| 83 |
+
"device=\"cuda\"\n",
|
| 84 |
+
"tokenizer = AutoTokenizer.from_pretrained(\"dnagpt/gene_eng_gpt2_v1\")\n",
|
| 85 |
+
"model = AutoModel.from_pretrained(model_name)\n",
|
| 86 |
+
"model.to(device)\n",
|
| 87 |
+
"\n",
|
| 88 |
+
"def get_text_embedding(text):\n",
|
| 89 |
+
" \"\"\"\n",
|
| 90 |
+
" 使用 GPT-2 模型获取文本的向量表示。\n",
|
| 91 |
+
" \n",
|
| 92 |
+
" 参数:\n",
|
| 93 |
+
" text (str): 输入文本。\n",
|
| 94 |
+
"        Note: the tokenizer and model are the module-level objects loaded above, not parameters of this function.\n",
|
| 95 |
+
"        Note: device is the module-level setting defined above, not a parameter of this function.\n",
|
| 96 |
+
" \n",
|
| 97 |
+
" 返回:\n",
|
| 98 |
+
" torch.Tensor: 文本的向量表示,维度为 [hidden_size]。\n",
|
| 99 |
+
" \"\"\"\n",
|
| 100 |
+
"\n",
|
| 101 |
+
" # 将文本编码为输入 ID 并添加批量维度\n",
|
| 102 |
+
" inputs = tokenizer(text, return_tensors=\"pt\", truncation=True, max_length=256)\n",
|
| 103 |
+
" input_ids = inputs[\"input_ids\"].to(device)\n",
|
| 104 |
+
" attention_mask = inputs[\"attention_mask\"].to(device)\n",
|
| 105 |
+
" \n",
|
| 106 |
+
" # 获取模型的隐藏层输出\n",
|
| 107 |
+
" with torch.no_grad():\n",
|
| 108 |
+
" outputs = model(input_ids, attention_mask=attention_mask)\n",
|
| 109 |
+
" hidden_states = outputs.last_hidden_state # [batch_size, seq_length, hidden_size]\n",
|
| 110 |
+
" \n",
|
| 111 |
+
" # 平均池化:获取序列中所有词向量的平均值\n",
|
| 112 |
+
" embeddings = hidden_states.mean(dim=1).squeeze() # [hidden_size]\n",
|
| 113 |
+
" \n",
|
| 114 |
+
" return embeddings"
|
| 115 |
+
]
|
| 116 |
+
},
|
| 117 |
+
{
|
| 118 |
+
"cell_type": "code",
|
| 119 |
+
"execution_count": 4,
|
| 120 |
+
"id": "5aa4d742-2104-4f24-861b-3e3c0d08e262",
|
| 121 |
+
"metadata": {},
|
| 122 |
+
"outputs": [
|
| 123 |
+
{
|
| 124 |
+
"name": "stderr",
|
| 125 |
+
"output_type": "stream",
|
| 126 |
+
"text": [
|
| 127 |
+
"Using the latest cached version of the dataset since dnagpt/gene_lan_transfer couldn't be found on the Hugging Face Hub\n",
|
| 128 |
+
"Found the latest cached dataset configuration 'dna_protein_pair' at /root/.cache/huggingface/datasets/dnagpt___gene_lan_transfer/dna_protein_pair/0.0.0/fc103580e7cda0d9bc41947f4058887fdc81188c (last modified on Mon Feb 10 00:14:56 2025).\n"
|
| 129 |
+
]
|
| 130 |
+
},
|
| 131 |
+
{
|
| 132 |
+
"data": {
|
| 133 |
+
"text/plain": [
|
| 134 |
+
"DatasetDict({\n",
|
| 135 |
+
" train: Dataset({\n",
|
| 136 |
+
" features: ['sentence1', 'sentence2', 'label'],\n",
|
| 137 |
+
" num_rows: 3600\n",
|
| 138 |
+
" })\n",
|
| 139 |
+
" test: Dataset({\n",
|
| 140 |
+
" features: ['sentence1', 'sentence2', 'label'],\n",
|
| 141 |
+
" num_rows: 400\n",
|
| 142 |
+
" })\n",
|
| 143 |
+
"})"
|
| 144 |
+
]
|
| 145 |
+
},
|
| 146 |
+
"execution_count": 4,
|
| 147 |
+
"metadata": {},
|
| 148 |
+
"output_type": "execute_result"
|
| 149 |
+
}
|
| 150 |
+
],
|
| 151 |
+
"source": [
|
| 152 |
+
"#模型测试 蛋白质数据集,150长度/450bp,复杂版本 不相似\n",
|
| 153 |
+
"raw_datasets_dna_protein = load_dataset('dnagpt/gene_lan_transfer', 'dna_protein_pair')['train'].train_test_split(test_size=0.1) #默认已经shuffle\n",
|
| 154 |
+
"raw_datasets_dna_protein"
|
| 155 |
+
]
|
| 156 |
+
},
|
| 157 |
+
{
|
| 158 |
+
"cell_type": "code",
|
| 159 |
+
"execution_count": 5,
|
| 160 |
+
"id": "76ff7fed-fc17-421e-8a57-2a5de33d4ba6",
|
| 161 |
+
"metadata": {},
|
| 162 |
+
"outputs": [
|
| 163 |
+
{
|
| 164 |
+
"data": {
|
| 165 |
+
"text/plain": [
|
| 166 |
+
"77.26516"
|
| 167 |
+
]
|
| 168 |
+
},
|
| 169 |
+
"execution_count": 5,
|
| 170 |
+
"metadata": {},
|
| 171 |
+
"output_type": "execute_result"
|
| 172 |
+
}
|
| 173 |
+
],
|
| 174 |
+
"source": [
|
| 175 |
+
"from sklearn.metrics.pairwise import cosine_similarity\n",
|
| 176 |
+
"\n",
|
| 177 |
+
"def get_sim_score(s1, s2):\n",
|
| 178 |
+
" v1 = get_text_embedding(s1)\n",
|
| 179 |
+
" v2 = get_text_embedding(s2)\n",
|
| 180 |
+
" \n",
|
| 181 |
+
"    # Convert both embeddings to numpy arrays; the value returned below is the Euclidean (L2) distance, so smaller means more similar\n",
|
| 182 |
+
" #similarity = cosine_similarity([v1.cpu().numpy()], [v2.cpu().numpy()])[0][0]\n",
|
| 183 |
+
" A = v1.cpu().numpy()\n",
|
| 184 |
+
" B = v2.cpu().numpy()\n",
|
| 185 |
+
" #similarity = np.dot(A, B) / (np.linalg.norm(A) * np.linalg.norm(B))\n",
|
| 186 |
+
" similarity = np.linalg.norm(A - B)\n",
|
| 187 |
+
" return similarity\n",
|
| 188 |
+
"\n",
|
| 189 |
+
"s1 = \"ATCCCCCCCCCCCCCCCCCCCCCCCCCCCCC\"\n",
|
| 190 |
+
"s2 = \"ATCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC\"\n",
|
| 191 |
+
"s1 = \"ATGTCTCTACAGATGATAACGGTCAGTAATAACGTAACTTTAATTCAACCAGGCTTCTCACTGATGAATTTTGATGGACAAGTTTTCTTCTTTGGTCAAAAAGGCTGGCCCAAGAGATCCTGCCCCACAGGAGTTTTCCATTTTGATGTAAAGCATAACCATCTCAAACTGAAGCCTACAGTTTTCTCAAAGGATTCCTGCTACCTTCCTCCTCTTCGCTATCCAGCCACTTGCATATTTAAAGGCAACTTCGAGTCTGAAAAGCATCAGTATATCATCCATGGAGGGAAAACACCAAACAATGAACTTTCAGATAAGATGTATGTCATGTCTATTGTTTGCAAAAACAACAAAAAATTTACTTTTCGCTGCACGGAGAAAGACTTGGTAGGTGATGTTCCTGAAGGCAGATATGGCCATTCCATTGATGTAGTGTATAGTCGAGGGAAAAGTATGGGCGTTCTCTTTGGAGGACGATCTTACATGCCTTCTGCCCAAAGAACCACAGAAAAATGGAACAGTGTAGTTGACTGCTTGCCCCATCTCTTCTTGGTGGATTTTGAATTTGGGTGTTCTACATCCTACATTCTTCCCGAACTTCAGGATGGGATATCTTTTCATGTCTCCATTGCCAGAAATGATACCATTTATATTTTAGGAGGTCATTCACTCACCAATAACATCCGCCCTGCCAATCTGTTCAGAGTAAGGGTTGATCTCCCCCTGGGTAGCCCAGCTGTGAGTTGCACGGTCTTATCAGGAGGAATCTCTGTCTCCAGTGCAATCTTGACTCAAACTAATAATGATGAATTTGTCATTGTTGGTGGCTATCAGCTTGAAAATCAAAAAAGAATGGTCTGCAACATTGTCACTTTAGATGACAACAAGATAGATATTCGTGAGATGGAGGCACCAGATTGGACCCCAGATATTAAGCACAGCAAGGTATGGTTTGGAAACAACATGGGAAATGGGAGTGTTTTCCTTGGAATACCAGGAGACAATAAGCAGGCTGTTTCAGAAGCATTCTATTTCTATATGTTGAAATGTGCTGAAGATGATATAAATGAAGATGAGAAAACATTGATGAACAGTCAGACATCAACAGAAGATCCAGGAGACTCCACACCCTTTGAAGACTCGGAAGAATTTTGCTTCAGTGCAGAAGCAAATAGTTTTGGTGGGGATGATGAATTTGACACCTATAATGAAGATGATGAGGAAGATGAGTCTGAGACAGGCTACTGGATTACGTGTTGCCTTACTTGTAATGTGGATATCAACACTTGGGTACCATTCTACTCAACTGAGCTCAACAAACCTGCTATGATCTACTGCTCTCATGAGGACGGGCACTGGGTCCATGCTCAGTGCATGGATCTGGCAGAGCGCACGCTCATCCATCTGTCAGAAGGAAGCAACAAGTATTATTGCAATGAGCATGTGGAGATAGCAAGAGCACTACAAACCCCCAAAAGAGCCATGCCCTTGAAAAAGCCCCCACTGAAATCCCTCCGCAAAAAAGGCCCTGCAAAAATCTTGACTCCTGCCAAGAAATCCTTCCTTAGAAGATTGTTTGAT\"\n",
|
| 192 |
+
"s2 = \"MSLQMITVSNNVTLIQPGFSLMNFDGQVFFFGQKGWPKRSCPTGVFHFDVKHNHLKLKPTVFSKDSCYLPPLRYPATCIFKGNFESEKHQYIIHGGKTPNNELSDKMYVMSIVCKNNKKFTFRCTEKDLVGDVPEGRYGHSIDVVYSRGKSMGVLFGGRSYMPSAQRTTEKWNSVVDCLPHLFLVDFEFGCSTSYILPELQDGISFHVSIARNDTIYILGGHSLTNNIRPANLFRVRVDLPLGSPAVSCTVLSGGISVSSAILTQTNNDEFVIVGGYQLENQKRMVCNIVTLDDNKIDIREMEAPDWTPDIKHSKVWFGNNMGNGSVFLGIPGDNKQAVSEAFYFYMLKCAEDDINEDEKTLMNSQTSTEDPGDSTPFEDSEEFCFSAEANSFGGDDEFDTYNEDDEEDESETGYWITCCLTCNVDINTWVPFYSTELNKPAMIYCSHEDGHWVHAQCMDLAERTLIHLSEGSNKYYCNEHVEIARALQTPKRAMPLKKPPLKSLRKKGPAKILTPAKKSFLRRLFD\"\n",
|
| 193 |
+
"get_sim_score(s1, s2)"
|
| 194 |
+
]
|
| 195 |
+
},
|
| 196 |
+
{
|
| 197 |
+
"cell_type": "code",
|
| 198 |
+
"execution_count": 6,
|
| 199 |
+
"id": "8c14a02c-4493-45b1-9e87-e9a87e16638f",
|
| 200 |
+
"metadata": {},
|
| 201 |
+
"outputs": [],
|
| 202 |
+
"source": [
|
| 203 |
+
"sim_score = []\n",
|
| 204 |
+
"dif_score = []\n",
|
| 205 |
+
"\n",
|
| 206 |
+
"for item in raw_datasets_dna_protein[\"train\"]:\n",
|
| 207 |
+
" #print(item)\n",
|
| 208 |
+
" sentence1 = item[\"sentence1\"]\n",
|
| 209 |
+
" sentence2 = item[\"sentence2\"]\n",
|
| 210 |
+
" label = item[\"label\"]\n",
|
| 211 |
+
" score = get_sim_score(sentence1, sentence2)\n",
|
| 212 |
+
"\n",
|
| 213 |
+
" if 1 == label:\n",
|
| 214 |
+
" sim_score.append(score)\n",
|
| 215 |
+
" else:\n",
|
| 216 |
+
" dif_score.append(score)"
|
| 217 |
+
]
|
| 218 |
+
},
|
| 219 |
+
{
|
| 220 |
+
"cell_type": "code",
|
| 221 |
+
"execution_count": 7,
|
| 222 |
+
"id": "049f400e-ff0b-40df-9c15-a803ff0db1fd",
|
| 223 |
+
"metadata": {},
|
| 224 |
+
"outputs": [
|
| 225 |
+
{
|
| 226 |
+
"name": "stdout",
|
| 227 |
+
"output_type": "stream",
|
| 228 |
+
"text": [
|
| 229 |
+
"67.42139 64.66472\n"
|
| 230 |
+
]
|
| 231 |
+
}
|
| 232 |
+
],
|
| 233 |
+
"source": [
|
| 234 |
+
"import numpy as np\n",
|
| 235 |
+
"print(np.mean(sim_score), np.mean(dif_score))"
|
| 236 |
+
]
|
| 237 |
+
},
|
| 238 |
+
{
|
| 239 |
+
"cell_type": "code",
|
| 240 |
+
"execution_count": 8,
|
| 241 |
+
"id": "ad2fd985-e398-4077-b1b9-4d54300967b2",
|
| 242 |
+
"metadata": {},
|
| 243 |
+
"outputs": [
|
| 244 |
+
{
|
| 245 |
+
"name": "stdout",
|
| 246 |
+
"output_type": "stream",
|
| 247 |
+
"text": [
|
| 248 |
+
"1806 1794\n"
|
| 249 |
+
]
|
| 250 |
+
}
|
| 251 |
+
],
|
| 252 |
+
"source": [
|
| 253 |
+
"print(len(sim_score),len(dif_score))"
|
| 254 |
+
]
|
| 255 |
+
},
|
| 256 |
+
{
|
| 257 |
+
"cell_type": "code",
|
| 258 |
+
"execution_count": 9,
|
| 259 |
+
"id": "53bd3216-8a65-4358-9493-5affbde4fabb",
|
| 260 |
+
"metadata": {},
|
| 261 |
+
"outputs": [
|
| 262 |
+
{
|
| 263 |
+
"data": {
|
| 264 |
+
"image/png": "",
|
| 265 |
+
"text/plain": [
|
| 266 |
+
"<Figure size 640x480 with 1 Axes>"
|
| 267 |
+
]
|
| 268 |
+
},
|
| 269 |
+
"metadata": {},
|
| 270 |
+
"output_type": "display_data"
|
| 271 |
+
}
|
| 272 |
+
],
|
| 273 |
+
"source": [
|
| 274 |
+
"import matplotlib.pyplot as plt\n",
|
| 275 |
+
"import numpy as np\n",
|
| 276 |
+
"\n",
|
| 277 |
+
"# Scores returned by get_sim_score, grouped by pair label\n",
|
| 278 |
+
"data1 = sim_score # scores of the pairs whose label == 1\n",
|
| 279 |
+
"data2 = dif_score # scores of the pairs whose label != 1\n",
|
| 280 |
+
"\n",
|
| 281 |
+
"# 绘制直方图\n",
|
| 282 |
+
"plt.hist(data1, bins=30, alpha=0.5, label='Data 1', color='blue', edgecolor='black')\n",
|
| 283 |
+
"plt.hist(data2, bins=30, alpha=0.5, label='Data 2', color='red', edgecolor='black')\n",
|
| 284 |
+
"\n",
|
| 285 |
+
"# 添加图例\n",
|
| 286 |
+
"plt.legend()\n",
|
| 287 |
+
"\n",
|
| 288 |
+
"# 添加标题和标签\n",
|
| 289 |
+
"plt.title('Histogram of Two Data Sets')\n",
|
| 290 |
+
"plt.xlabel('Value')\n",
|
| 291 |
+
"plt.ylabel('Frequency')\n",
|
| 292 |
+
"\n",
|
| 293 |
+
"# 显示图形\n",
|
| 294 |
+
"plt.show()"
|
| 295 |
+
]
|
| 296 |
+
},
|
| 297 |
+
{
|
| 298 |
+
"cell_type": "code",
|
| 299 |
+
"execution_count": 10,
|
| 300 |
+
"id": "2116f787-781f-4bfc-b12d-c36efe26cfa9",
|
| 301 |
+
"metadata": {},
|
| 302 |
+
"outputs": [
|
| 303 |
+
{
|
| 304 |
+
"name": "stderr",
|
| 305 |
+
"output_type": "stream",
|
| 306 |
+
"text": [
|
| 307 |
+
"Using the latest cached version of the dataset since dnagpt/gene_lan_transfer couldn't be found on the Hugging Face Hub\n",
|
| 308 |
+
"Found the latest cached dataset configuration 'dna_protein_pair_rand' at /root/.cache/huggingface/datasets/dnagpt___gene_lan_transfer/dna_protein_pair_rand/0.0.0/fc103580e7cda0d9bc41947f4058887fdc81188c (last modified on Mon Feb 10 00:26:42 2025).\n"
|
| 309 |
+
]
|
| 310 |
+
},
|
| 311 |
+
{
|
| 312 |
+
"data": {
|
| 313 |
+
"text/plain": [
|
| 314 |
+
"DatasetDict({\n",
|
| 315 |
+
" train: Dataset({\n",
|
| 316 |
+
" features: ['sentence1', 'sentence2', 'label'],\n",
|
| 317 |
+
" num_rows: 8000\n",
|
| 318 |
+
" })\n",
|
| 319 |
+
" test: Dataset({\n",
|
| 320 |
+
" features: ['sentence1', 'sentence2', 'label'],\n",
|
| 321 |
+
" num_rows: 8000\n",
|
| 322 |
+
" })\n",
|
| 323 |
+
"})"
|
| 324 |
+
]
|
| 325 |
+
},
|
| 326 |
+
"execution_count": 10,
|
| 327 |
+
"metadata": {},
|
| 328 |
+
"output_type": "execute_result"
|
| 329 |
+
}
|
| 330 |
+
],
|
| 331 |
+
"source": [
|
| 332 |
+
"#模型测试 蛋白质数据集,150长度/450bp,复杂版本 不相似\n",
|
| 333 |
+
"raw_datasets_dna_protein = load_dataset('dnagpt/gene_lan_transfer', 'dna_protein_pair_rand')['train'].train_test_split(test_size=0.5) #默认已经shuffle\n",
|
| 334 |
+
"raw_datasets_dna_protein"
|
| 335 |
+
]
|
| 336 |
+
},
|
| 337 |
+
{
|
| 338 |
+
"cell_type": "code",
|
| 339 |
+
"execution_count": 11,
|
| 340 |
+
"id": "6e6b1d3d-3d05-40b0-a96a-537a4dc324d6",
|
| 341 |
+
"metadata": {},
|
| 342 |
+
"outputs": [],
|
| 343 |
+
"source": [
|
| 344 |
+
"sim_score = []\n",
|
| 345 |
+
"dif_score = []\n",
|
| 346 |
+
"\n",
|
| 347 |
+
"for item in raw_datasets_dna_protein[\"train\"]:\n",
|
| 348 |
+
" #print(item)\n",
|
| 349 |
+
" sentence1 = item[\"sentence1\"]\n",
|
| 350 |
+
" sentence2 = item[\"sentence2\"]\n",
|
| 351 |
+
" label = item[\"label\"]\n",
|
| 352 |
+
" score = get_sim_score(sentence1, sentence2)\n",
|
| 353 |
+
"\n",
|
| 354 |
+
" if 1 == label:\n",
|
| 355 |
+
" sim_score.append(score)\n",
|
| 356 |
+
" else:\n",
|
| 357 |
+
" dif_score.append(score)"
|
| 358 |
+
]
|
| 359 |
+
},
|
| 360 |
+
{
|
| 361 |
+
"cell_type": "code",
|
| 362 |
+
"execution_count": 12,
|
| 363 |
+
"id": "a515f319-254b-4675-9ca5-fb15da6a62e5",
|
| 364 |
+
"metadata": {},
|
| 365 |
+
"outputs": [
|
| 366 |
+
{
|
| 367 |
+
"name": "stdout",
|
| 368 |
+
"output_type": "stream",
|
| 369 |
+
"text": [
|
| 370 |
+
"68.03318 49.366604\n"
|
| 371 |
+
]
|
| 372 |
+
}
|
| 373 |
+
],
|
| 374 |
+
"source": [
|
| 375 |
+
"import numpy as np\n",
|
| 376 |
+
"print(np.mean(sim_score), np.mean(dif_score))"
|
| 377 |
+
]
|
| 378 |
+
},
|
| 379 |
+
{
|
| 380 |
+
"cell_type": "code",
|
| 381 |
+
"execution_count": 13,
|
| 382 |
+
"id": "4417c7b5-8019-4a53-968a-4dee311acef3",
|
| 383 |
+
"metadata": {},
|
| 384 |
+
"outputs": [
|
| 385 |
+
{
|
| 386 |
+
"name": "stdout",
|
| 387 |
+
"output_type": "stream",
|
| 388 |
+
"text": [
|
| 389 |
+
"4018 3982\n"
|
| 390 |
+
]
|
| 391 |
+
}
|
| 392 |
+
],
|
| 393 |
+
"source": [
|
| 394 |
+
"print(len(sim_score),len(dif_score))"
|
| 395 |
+
]
|
| 396 |
+
},
|
| 397 |
+
{
|
| 398 |
+
"cell_type": "code",
|
| 399 |
+
"execution_count": 14,
|
| 400 |
+
"id": "adc022c4-7bec-4381-b80b-6ac1b18be00c",
|
| 401 |
+
"metadata": {},
|
| 402 |
+
"outputs": [
|
| 403 |
+
{
|
| 404 |
+
"data": {
|
| 405 |
+
"image/png": "",
|
| 406 |
+
"text/plain": [
|
| 407 |
+
"<Figure size 640x480 with 1 Axes>"
|
| 408 |
+
]
|
| 409 |
+
},
|
| 410 |
+
"metadata": {},
|
| 411 |
+
"output_type": "display_data"
|
| 412 |
+
}
|
| 413 |
+
],
|
| 414 |
+
"source": [
|
| 415 |
+
"import matplotlib.pyplot as plt\n",
|
| 416 |
+
"import numpy as np\n",
|
| 417 |
+
"\n",
|
| 418 |
+
"# Scores returned by get_sim_score, grouped by pair label\n",
|
| 419 |
+
"data1 = sim_score # scores of the pairs whose label == 1\n",
|
| 420 |
+
"data2 = dif_score # scores of the pairs whose label != 1\n",
|
| 421 |
+
"\n",
|
| 422 |
+
"# 绘制直方图\n",
|
| 423 |
+
"plt.hist(data1, bins=30, alpha=0.5, label='Data 1', color='blue', edgecolor='black')\n",
|
| 424 |
+
"plt.hist(data2, bins=30, alpha=0.5, label='Data 2', color='red', edgecolor='black')\n",
|
| 425 |
+
"\n",
|
| 426 |
+
"# 添加图例\n",
|
| 427 |
+
"plt.legend()\n",
|
| 428 |
+
"\n",
|
| 429 |
+
"# 添加标题和标签\n",
|
| 430 |
+
"plt.title('Histogram of Two Data Sets')\n",
|
| 431 |
+
"plt.xlabel('Value')\n",
|
| 432 |
+
"plt.ylabel('Frequency')\n",
|
| 433 |
+
"\n",
|
| 434 |
+
"# 显示图形\n",
|
| 435 |
+
"plt.show()"
|
| 436 |
+
]
|
| 437 |
+
},
|
| 438 |
+
{
|
| 439 |
+
"cell_type": "code",
|
| 440 |
+
"execution_count": null,
|
| 441 |
+
"id": "f314d408-18ff-4e59-92bd-eb7a767ca262",
|
| 442 |
+
"metadata": {},
|
| 443 |
+
"outputs": [],
|
| 444 |
+
"source": []
|
| 445 |
+
}
|
| 446 |
+
],
|
| 447 |
+
"metadata": {
|
| 448 |
+
"kernelspec": {
|
| 449 |
+
"display_name": "Python 3 (ipykernel)",
|
| 450 |
+
"language": "python",
|
| 451 |
+
"name": "python3"
|
| 452 |
+
},
|
| 453 |
+
"language_info": {
|
| 454 |
+
"codemirror_mode": {
|
| 455 |
+
"name": "ipython",
|
| 456 |
+
"version": 3
|
| 457 |
+
},
|
| 458 |
+
"file_extension": ".py",
|
| 459 |
+
"mimetype": "text/x-python",
|
| 460 |
+
"name": "python",
|
| 461 |
+
"nbconvert_exporter": "python",
|
| 462 |
+
"pygments_lexer": "ipython3",
|
| 463 |
+
"version": "3.12.3"
|
| 464 |
+
}
|
| 465 |
+
},
|
| 466 |
+
"nbformat": 4,
|
| 467 |
+
"nbformat_minor": 5
|
| 468 |
+
}
|
best_model/vect_sim_protein_rand_test.ipynb
ADDED
|
@@ -0,0 +1,423 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 1,
|
| 6 |
+
"id": "c0314d29-7fc8-4c11-8bfc-5440a442629a",
|
| 7 |
+
"metadata": {},
|
| 8 |
+
"outputs": [
|
| 9 |
+
{
|
| 10 |
+
"name": "stdout",
|
| 11 |
+
"output_type": "stream",
|
| 12 |
+
"text": [
|
| 13 |
+
"https://hf-mirror.com\n"
|
| 14 |
+
]
|
| 15 |
+
}
|
| 16 |
+
],
|
| 17 |
+
"source": [
|
| 18 |
+
"# import subprocess\n",
|
| 19 |
+
"# import os\n",
|
| 20 |
+
"\n",
|
| 21 |
+
"# result = subprocess.run('bash -c \"source /etc/network_turbo && env | grep proxy\"', shell=True, capture_output=True, text=True)\n",
|
| 22 |
+
"# output = result.stdout\n",
|
| 23 |
+
"# for line in output.splitlines():\n",
|
| 24 |
+
"# if '=' in line:\n",
|
| 25 |
+
"# var, value = line.split('=', 1)\n",
|
| 26 |
+
"# os.environ[var] = value\n",
|
| 27 |
+
"\n",
|
| 28 |
+
"\n",
|
| 29 |
+
"import os\n",
|
| 30 |
+
"\n",
|
| 31 |
+
"# 设置环境变量\n",
|
| 32 |
+
"os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'\n",
|
| 33 |
+
"\n",
|
| 34 |
+
"# 打印环境变量以确认设置成功\n",
|
| 35 |
+
"print(os.environ.get('HF_ENDPOINT'))"
|
| 36 |
+
]
|
| 37 |
+
},
|
| 38 |
+
{
|
| 39 |
+
"cell_type": "code",
|
| 40 |
+
"execution_count": 2,
|
| 41 |
+
"id": "1a09a2f5-dda6-4d86-badd-60bf2fda3983",
|
| 42 |
+
"metadata": {},
|
| 43 |
+
"outputs": [
|
| 44 |
+
{
|
| 45 |
+
"name": "stderr",
|
| 46 |
+
"output_type": "stream",
|
| 47 |
+
"text": [
|
| 48 |
+
"2025-02-10 09:46:42.348539: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
|
| 49 |
+
"2025-02-10 09:46:42.362158: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
|
| 50 |
+
"2025-02-10 09:46:42.378104: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
|
| 51 |
+
"2025-02-10 09:46:42.382810: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
|
| 52 |
+
"2025-02-10 09:46:42.394903: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
|
| 53 |
+
"To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
|
| 54 |
+
"2025-02-10 09:46:43.362404: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n"
|
| 55 |
+
]
|
| 56 |
+
}
|
| 57 |
+
],
|
| 58 |
+
"source": [
|
| 59 |
+
"from transformers import GPT2Tokenizer, GPT2Model,AutoModel\n",
|
| 60 |
+
"import torch\n",
|
| 61 |
+
"from transformers import AutoTokenizer, DataCollatorWithPadding\n",
|
| 62 |
+
"from datasets import load_dataset\n",
|
| 63 |
+
"from transformers import AutoTokenizer, DataCollatorWithPadding\n",
|
| 64 |
+
"from transformers import Trainer\n",
|
| 65 |
+
"import evaluate\n",
|
| 66 |
+
"import numpy as np\n",
|
| 67 |
+
"from transformers import TrainingArguments\n",
|
| 68 |
+
"from transformers import AutoModelForSequenceClassification"
|
| 69 |
+
]
|
| 70 |
+
},
|
| 71 |
+
{
|
| 72 |
+
"cell_type": "code",
|
| 73 |
+
"execution_count": 3,
|
| 74 |
+
"id": "7c5ab8f5-b8a9-4930-ac30-49d0bf26d579",
|
| 75 |
+
"metadata": {},
|
| 76 |
+
"outputs": [],
|
| 77 |
+
"source": [
|
| 78 |
+
"model_name=\"gpt2_gene_multi_v2_ft\"\n",
|
| 79 |
+
"device=\"cuda\"\n",
|
| 80 |
+
"tokenizer = AutoTokenizer.from_pretrained(\"gpt2_gene_multi_v2_ft\")\n",
|
| 81 |
+
"model = AutoModel.from_pretrained(model_name)\n",
|
| 82 |
+
"model.to(device)\n",
|
| 83 |
+
"\n",
|
| 84 |
+
"def get_text_embedding(text):\n",
|
| 85 |
+
" \"\"\"\n",
|
| 86 |
+
" 使用 GPT-2 模型获取文本的向量表示。\n",
|
| 87 |
+
" \n",
|
| 88 |
+
" 参数:\n",
|
| 89 |
+
" text (str): 输入文本。\n",
|
| 90 |
+
" Note: `model_name` and `device` are not parameters of this function;\n",
|
| 91 |
+
" the module-level `tokenizer`, `model` and `device` are used instead.\n",
|
| 92 |
+
" \n",
|
| 93 |
+
" 返回:\n",
|
| 94 |
+
" torch.Tensor: 文本的向量表示,维度为 [hidden_size]。\n",
|
| 95 |
+
" \"\"\"\n",
|
| 96 |
+
"\n",
|
| 97 |
+
" # 将文本编码为输入 ID 并添加批量维度\n",
|
| 98 |
+
" inputs = tokenizer(text, return_tensors=\"pt\", truncation=True, max_length=256)\n",
|
| 99 |
+
" input_ids = inputs[\"input_ids\"].to(device)\n",
|
| 100 |
+
" attention_mask = inputs[\"attention_mask\"].to(device)\n",
|
| 101 |
+
" \n",
|
| 102 |
+
" # 获取模型的隐藏层输出\n",
|
| 103 |
+
" with torch.no_grad():\n",
|
| 104 |
+
" outputs = model(input_ids, attention_mask=attention_mask)\n",
|
| 105 |
+
" hidden_states = outputs.last_hidden_state # [batch_size, seq_length, hidden_size]\n",
|
| 106 |
+
" \n",
|
| 107 |
+
" # 平均池化:获取序列中所有词向量的平均值\n",
|
| 108 |
+
" embeddings = hidden_states.mean(dim=1).squeeze() # [hidden_size]\n",
|
| 109 |
+
" \n",
|
| 110 |
+
" return embeddings"
|
| 111 |
+
]
|
| 112 |
+
},
|
| 113 |
+
{
|
| 114 |
+
"cell_type": "code",
|
| 115 |
+
"execution_count": 4,
|
| 116 |
+
"id": "76ff7fed-fc17-421e-8a57-2a5de33d4ba6",
|
| 117 |
+
"metadata": {},
|
| 118 |
+
"outputs": [
|
| 119 |
+
{
|
| 120 |
+
"data": {
|
| 121 |
+
"text/plain": [
|
| 122 |
+
"39.938614"
|
| 123 |
+
]
|
| 124 |
+
},
|
| 125 |
+
"execution_count": 4,
|
| 126 |
+
"metadata": {},
|
| 127 |
+
"output_type": "execute_result"
|
| 128 |
+
}
|
| 129 |
+
],
|
| 130 |
+
"source": [
|
| 131 |
+
"from sklearn.metrics.pairwise import cosine_similarity\n",
|
| 132 |
+
"\n",
|
| 133 |
+
"def get_sim_score(s1, s2):\n",
|
| 134 |
+
" v1 = get_text_embedding(s1)\n",
|
| 135 |
+
" v2 = get_text_embedding(s2)\n",
|
| 136 |
+
" \n",
|
| 137 |
+
" # NOTE: the value returned below is the Euclidean (L2) distance between the two embeddings (smaller = closer), not a cosine similarity\n",
|
| 138 |
+
" #similarity = cosine_similarity([v1.cpu().numpy()], [v2.cpu().numpy()])[0][0]\n",
|
| 139 |
+
" A = v1.cpu().numpy()\n",
|
| 140 |
+
" B = v2.cpu().numpy()\n",
|
| 141 |
+
" #similarity = np.dot(A, B) / (np.linalg.norm(A) * np.linalg.norm(B))\n",
|
| 142 |
+
" similarity = np.linalg.norm(A - B)\n",
|
| 143 |
+
" return similarity\n",
|
| 144 |
+
"\n",
|
| 145 |
+
"s1 = \"ATCCCCCCCCCCCCCCCCCCCCCCCCCCCCC\"\n",
|
| 146 |
+
"s2 = \"ATCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC\"\n",
|
| 147 |
+
"s1 = \"ATGTCTCTACAGATGATAACGGTCAGTAATAACGTAACTTTAATTCAACCAGGCTTCTCACTGATGAATTTTGATGGACAAGTTTTCTTCTTTGGTCAAAAAGGCTGGCCCAAGAGATCCTGCCCCACAGGAGTTTTCCATTTTGATGTAAAGCATAACCATCTCAAACTGAAGCCTACAGTTTTCTCAAAGGATTCCTGCTACCTTCCTCCTCTTCGCTATCCAGCCACTTGCATATTTAAAGGCAACTTCGAGTCTGAAAAGCATCAGTATATCATCCATGGAGGGAAAACACCAAACAATGAACTTTCAGATAAGATGTATGTCATGTCTATTGTTTGCAAAAACAACAAAAAATTTACTTTTCGCTGCACGGAGAAAGACTTGGTAGGTGATGTTCCTGAAGGCAGATATGGCCATTCCATTGATGTAGTGTATAGTCGAGGGAAAAGTATGGGCGTTCTCTTTGGAGGACGATCTTACATGCCTTCTGCCCAAAGAACCACAGAAAAATGGAACAGTGTAGTTGACTGCTTGCCCCATCTCTTCTTGGTGGATTTTGAATTTGGGTGTTCTACATCCTACATTCTTCCCGAACTTCAGGATGGGATATCTTTTCATGTCTCCATTGCCAGAAATGATACCATTTATATTTTAGGAGGTCATTCACTCACCAATAACATCCGCCCTGCCAATCTGTTCAGAGTAAGGGTTGATCTCCCCCTGGGTAGCCCAGCTGTGAGTTGCACGGTCTTATCAGGAGGAATCTCTGTCTCCAGTGCAATCTTGACTCAAACTAATAATGATGAATTTGTCATTGTTGGTGGCTATCAGCTTGAAAATCAAAAAAGAATGGTCTGCAACATTGTCACTTTAGATGACAACAAGATAGATATTCGTGAGATGGAGGCACCAGATTGGACCCCAGATATTAAGCACAGCAAGGTATGGTTTGGAAACAACATGGGAAATGGGAGTGTTTTCCTTGGAATACCAGGAGACAATAAGCAGGCTGTTTCAGAAGCATTCTATTTCTATATGTTGAAATGTGCTGAAGATGATATAAATGAAGATGAGAAAACATTGATGAACAGTCAGACATCAACAGAAGATCCAGGAGACTCCACACCCTTTGAAGACTCGGAAGAATTTTGCTTCAGTGCAGAAGCAAATAGTTTTGGTGGGGATGATGAATTTGACACCTATAATGAAGATGATGAGGAAGATGAGTCTGAGACAGGCTACTGGATTACGTGTTGCCTTACTTGTAATGTGGATATCAACACTTGGGTACCATTCTACTCAACTGAGCTCAACAAACCTGCTATGATCTACTGCTCTCATGAGGACGGGCACTGGGTCCATGCTCAGTGCATGGATCTGGCAGAGCGCACGCTCATCCATCTGTCAGAAGGAAGCAACAAGTATTATTGCAATGAGCATGTGGAGATAGCAAGAGCACTACAAACCCCCAAAAGAGCCATGCCCTTGAAAAAGCCCCCACTGAAATCCCTCCGCAAAAAAGGCCCTGCAAAAATCTTGACTCCTGCCAAGAAATCCTTCCTTAGAAGATTGTTTGAT\"\n",
|
| 148 |
+
"s2 = \"MSLQMITVSNNVTLIQPGFSLMNFDGQVFFFGQKGWPKRSCPTGVFHFDVKHNHLKLKPTVFSKDSCYLPPLRYPATCIFKGNFESEKHQYIIHGGKTPNNELSDKMYVMSIVCKNNKKFTFRCTEKDLVGDVPEGRYGHSIDVVYSRGKSMGVLFGGRSYMPSAQRTTEKWNSVVDCLPHLFLVDFEFGCSTSYILPELQDGISFHVSIARNDTIYILGGHSLTNNIRPANLFRVRVDLPLGSPAVSCTVLSGGISVSSAILTQTNNDEFVIVGGYQLENQKRMVCNIVTLDDNKIDIREMEAPDWTPDIKHSKVWFGNNMGNGSVFLGIPGDNKQAVSEAFYFYMLKCAEDDINEDEKTLMNSQTSTEDPGDSTPFEDSEEFCFSAEANSFGGDDEFDTYNEDDEEDESETGYWITCCLTCNVDINTWVPFYSTELNKPAMIYCSHEDGHWVHAQCMDLAERTLIHLSEGSNKYYCNEHVEIARALQTPKRAMPLKKPPLKSLRKKGPAKILTPAKKSFLRRLFD\"\n",
|
| 149 |
+
"get_sim_score(s1, s2)"
|
| 150 |
+
]
|
| 151 |
+
},
|
| 152 |
+
{
|
| 153 |
+
"cell_type": "code",
|
| 154 |
+
"execution_count": 5,
|
| 155 |
+
"id": "2116f787-781f-4bfc-b12d-c36efe26cfa9",
|
| 156 |
+
"metadata": {},
|
| 157 |
+
"outputs": [
|
| 158 |
+
{
|
| 159 |
+
"data": {
|
| 160 |
+
"text/plain": [
|
| 161 |
+
"DatasetDict({\n",
|
| 162 |
+
" train: Dataset({\n",
|
| 163 |
+
" features: ['sentence1', 'sentence2', 'label'],\n",
|
| 164 |
+
" num_rows: 8000\n",
|
| 165 |
+
" })\n",
|
| 166 |
+
" test: Dataset({\n",
|
| 167 |
+
" features: ['sentence1', 'sentence2', 'label'],\n",
|
| 168 |
+
" num_rows: 8000\n",
|
| 169 |
+
" })\n",
|
| 170 |
+
"})"
|
| 171 |
+
]
|
| 172 |
+
},
|
| 173 |
+
"execution_count": 5,
|
| 174 |
+
"metadata": {},
|
| 175 |
+
"output_type": "execute_result"
|
| 176 |
+
}
|
| 177 |
+
],
|
| 178 |
+
"source": [
|
| 179 |
+
"#模型测试 蛋白质数据集,150长度/450bp,复杂版本 不相似\n",
|
| 180 |
+
"raw_datasets_dna_protein = load_dataset('dnagpt/gene_lan_transfer', 'dna_protein_pair_rand')['train'].train_test_split(test_size=0.5) #默认已经shuffle\n",
|
| 181 |
+
"raw_datasets_dna_protein"
|
| 182 |
+
]
|
| 183 |
+
},
|
| 184 |
+
{
|
| 185 |
+
"cell_type": "code",
|
| 186 |
+
"execution_count": 6,
|
| 187 |
+
"id": "6e6b1d3d-3d05-40b0-a96a-537a4dc324d6",
|
| 188 |
+
"metadata": {},
|
| 189 |
+
"outputs": [],
|
| 190 |
+
"source": [
|
| 191 |
+
"sim_score = []\n",
|
| 192 |
+
"dif_score = []\n",
|
| 193 |
+
"\n",
|
| 194 |
+
"for item in raw_datasets_dna_protein[\"train\"]:\n",
|
| 195 |
+
" #print(item)\n",
|
| 196 |
+
" sentence1 = item[\"sentence1\"]\n",
|
| 197 |
+
" sentence2 = item[\"sentence2\"]\n",
|
| 198 |
+
" label = item[\"label\"]\n",
|
| 199 |
+
" score = get_sim_score(sentence1, sentence2)\n",
|
| 200 |
+
"\n",
|
| 201 |
+
" if 1 == label:\n",
|
| 202 |
+
" sim_score.append(score)\n",
|
| 203 |
+
" else:\n",
|
| 204 |
+
" dif_score.append(score)"
|
| 205 |
+
]
|
| 206 |
+
},
|
| 207 |
+
{
|
| 208 |
+
"cell_type": "code",
|
| 209 |
+
"execution_count": 7,
|
| 210 |
+
"id": "a515f319-254b-4675-9ca5-fb15da6a62e5",
|
| 211 |
+
"metadata": {},
|
| 212 |
+
"outputs": [
|
| 213 |
+
{
|
| 214 |
+
"name": "stdout",
|
| 215 |
+
"output_type": "stream",
|
| 216 |
+
"text": [
|
| 217 |
+
"38.87972 37.672188\n"
|
| 218 |
+
]
|
| 219 |
+
}
|
| 220 |
+
],
|
| 221 |
+
"source": [
|
| 222 |
+
"import numpy as np\n",
|
| 223 |
+
"print(np.mean(sim_score), np.mean(dif_score))"
|
| 224 |
+
]
|
| 225 |
+
},
|
| 226 |
+
{
|
| 227 |
+
"cell_type": "code",
|
| 228 |
+
"execution_count": 8,
|
| 229 |
+
"id": "4417c7b5-8019-4a53-968a-4dee311acef3",
|
| 230 |
+
"metadata": {},
|
| 231 |
+
"outputs": [
|
| 232 |
+
{
|
| 233 |
+
"name": "stdout",
|
| 234 |
+
"output_type": "stream",
|
| 235 |
+
"text": [
|
| 236 |
+
"4047 3953\n"
|
| 237 |
+
]
|
| 238 |
+
}
|
| 239 |
+
],
|
| 240 |
+
"source": [
|
| 241 |
+
"print(len(sim_score),len(dif_score))"
|
| 242 |
+
]
|
| 243 |
+
},
|
| 244 |
+
{
|
| 245 |
+
"cell_type": "code",
|
| 246 |
+
"execution_count": 9,
|
| 247 |
+
"id": "adc022c4-7bec-4381-b80b-6ac1b18be00c",
|
| 248 |
+
"metadata": {},
|
| 249 |
+
"outputs": [
|
| 250 |
+
{
|
| 251 |
+
"name": "stderr",
|
| 252 |
+
"output_type": "stream",
|
| 253 |
+
"text": [
|
| 254 |
+
"/root/miniconda3/lib/python3.12/site-packages/IPython/core/pylabtools.py:170: UserWarning: Glyph 39057 (\\N{CJK UNIFIED IDEOGRAPH-9891}) missing from font(s) DejaVu Sans.\n",
|
| 255 |
+
" fig.canvas.print_figure(bytes_io, **kw)\n",
|
| 256 |
+
"/root/miniconda3/lib/python3.12/site-packages/IPython/core/pylabtools.py:170: UserWarning: Glyph 25968 (\\N{CJK UNIFIED IDEOGRAPH-6570}) missing from font(s) DejaVu Sans.\n",
|
| 257 |
+
" fig.canvas.print_figure(bytes_io, **kw)\n",
|
| 258 |
+
"/root/miniconda3/lib/python3.12/site-packages/IPython/core/pylabtools.py:170: UserWarning: Glyph 20540 (\\N{CJK UNIFIED IDEOGRAPH-503C}) missing from font(s) DejaVu Sans.\n",
|
| 259 |
+
" fig.canvas.print_figure(bytes_io, **kw)\n",
|
| 260 |
+
"/root/miniconda3/lib/python3.12/site-packages/IPython/core/pylabtools.py:170: UserWarning: Glyph 20998 (\\N{CJK UNIFIED IDEOGRAPH-5206}) missing from font(s) DejaVu Sans.\n",
|
| 261 |
+
" fig.canvas.print_figure(bytes_io, **kw)\n",
|
| 262 |
+
"/root/miniconda3/lib/python3.12/site-packages/IPython/core/pylabtools.py:170: UserWarning: Glyph 24067 (\\N{CJK UNIFIED IDEOGRAPH-5E03}) missing from font(s) DejaVu Sans.\n",
|
| 263 |
+
" fig.canvas.print_figure(bytes_io, **kw)\n",
|
| 264 |
+
"/root/miniconda3/lib/python3.12/site-packages/IPython/core/pylabtools.py:170: UserWarning: Glyph 30452 (\\N{CJK UNIFIED IDEOGRAPH-76F4}) missing from font(s) DejaVu Sans.\n",
|
| 265 |
+
" fig.canvas.print_figure(bytes_io, **kw)\n",
|
| 266 |
+
"/root/miniconda3/lib/python3.12/site-packages/IPython/core/pylabtools.py:170: UserWarning: Glyph 26041 (\\N{CJK UNIFIED IDEOGRAPH-65B9}) missing from font(s) DejaVu Sans.\n",
|
| 267 |
+
" fig.canvas.print_figure(bytes_io, **kw)\n",
|
| 268 |
+
"/root/miniconda3/lib/python3.12/site-packages/IPython/core/pylabtools.py:170: UserWarning: Glyph 22270 (\\N{CJK UNIFIED IDEOGRAPH-56FE}) missing from font(s) DejaVu Sans.\n",
|
| 269 |
+
" fig.canvas.print_figure(bytes_io, **kw)\n",
|
| 270 |
+
"/root/miniconda3/lib/python3.12/site-packages/IPython/core/pylabtools.py:170: UserWarning: Glyph 21306 (\\N{CJK UNIFIED IDEOGRAPH-533A}) missing from font(s) DejaVu Sans.\n",
|
| 271 |
+
" fig.canvas.print_figure(bytes_io, **kw)\n",
|
| 272 |
+
"/root/miniconda3/lib/python3.12/site-packages/IPython/core/pylabtools.py:170: UserWarning: Glyph 38388 (\\N{CJK UNIFIED IDEOGRAPH-95F4}) missing from font(s) DejaVu Sans.\n",
|
| 273 |
+
" fig.canvas.print_figure(bytes_io, **kw)\n"
|
| 274 |
+
]
|
| 275 |
+
},
|
| 276 |
+
{
|
| 277 |
+
"data": {
|
| 278 |
+
"image/png": "",
|
| 279 |
+
"text/plain": [
|
| 280 |
+
"<Figure size 800x600 with 1 Axes>"
|
| 281 |
+
]
|
| 282 |
+
},
|
| 283 |
+
"metadata": {},
|
| 284 |
+
"output_type": "display_data"
|
| 285 |
+
}
|
| 286 |
+
],
|
| 287 |
+
"source": [
|
| 288 |
+
"import matplotlib.pyplot as plt\n",
|
| 289 |
+
"import numpy as np\n",
|
| 290 |
+
"\n",
|
| 291 |
+
"# 示例数据(Python list,float 类型)\n",
|
| 292 |
+
"#data = np.random.normal(loc=50, scale=15, size=200) # 生成200个服从正态分布的随机数\n",
|
| 293 |
+
"data = sim_score\n",
|
| 294 |
+
"\n",
|
| 295 |
+
"# 计算直方图并自动确定分区数\n",
|
| 296 |
+
"plt.figure(figsize=(8, 6)) # 设置图像大小\n",
|
| 297 |
+
"plt.hist(data, bins='auto', edgecolor='black', alpha=0.7)\n",
|
| 298 |
+
"\n",
|
| 299 |
+
"# 添加标题和标签\n",
|
| 300 |
+
"plt.xlabel('数值区间', fontsize=12)\n",
|
| 301 |
+
"plt.ylabel('频数', fontsize=12)\n",
|
| 302 |
+
"plt.title('数值分布直方图', fontsize=14)\n",
|
| 303 |
+
"\n",
|
| 304 |
+
"# 显示网格\n",
|
| 305 |
+
"plt.grid(axis='y', linestyle='--', alpha=0.7)\n",
|
| 306 |
+
"\n",
|
| 307 |
+
"# 显示直方图\n",
|
| 308 |
+
"plt.show()"
|
| 309 |
+
]
|
| 310 |
+
},
|
| 311 |
+
{
|
| 312 |
+
"cell_type": "code",
|
| 313 |
+
"execution_count": 10,
|
| 314 |
+
"id": "71ff0b65-1ced-49de-8bf3-60c9715916db",
|
| 315 |
+
"metadata": {},
|
| 316 |
+
"outputs": [
|
| 317 |
+
{
|
| 318 |
+
"data": {
|
| 319 |
+
"image/png": "",
|
| 320 |
+
"text/plain": [
|
| 321 |
+
"<Figure size 800x600 with 1 Axes>"
|
| 322 |
+
]
|
| 323 |
+
},
|
| 324 |
+
"metadata": {},
|
| 325 |
+
"output_type": "display_data"
|
| 326 |
+
}
|
| 327 |
+
],
|
| 328 |
+
"source": [
|
| 329 |
+
"import matplotlib.pyplot as plt\n",
|
| 330 |
+
"import numpy as np\n",
|
| 331 |
+
"\n",
|
| 332 |
+
"# 示例数据(Python list,float 类型)\n",
|
| 333 |
+
"#data = np.random.normal(loc=50, scale=15, size=200) # 生成200个服从正态分布的随机数\n",
|
| 334 |
+
"data = dif_score\n",
|
| 335 |
+
"\n",
|
| 336 |
+
"# 计算直方图并自动确定分区数\n",
|
| 337 |
+
"plt.figure(figsize=(8, 6)) # 设置图像大小\n",
|
| 338 |
+
"plt.hist(data, bins='auto', edgecolor='black', alpha=0.7)\n",
|
| 339 |
+
"\n",
|
| 340 |
+
"# 添加标题和标签\n",
|
| 341 |
+
"plt.xlabel('数值区间', fontsize=12)\n",
|
| 342 |
+
"plt.ylabel('频数', fontsize=12)\n",
|
| 343 |
+
"plt.title('数值分布直方图', fontsize=14)\n",
|
| 344 |
+
"\n",
|
| 345 |
+
"# 显示网格\n",
|
| 346 |
+
"plt.grid(axis='y', linestyle='--', alpha=0.7)\n",
|
| 347 |
+
"\n",
|
| 348 |
+
"# 显示直方图\n",
|
| 349 |
+
"plt.show()"
|
| 350 |
+
]
|
| 351 |
+
},
|
| 352 |
+
{
|
| 353 |
+
"cell_type": "code",
|
| 354 |
+
"execution_count": 11,
|
| 355 |
+
"id": "f314d408-18ff-4e59-92bd-eb7a767ca262",
|
| 356 |
+
"metadata": {},
|
| 357 |
+
"outputs": [
|
| 358 |
+
{
|
| 359 |
+
"data": {
|
| 360 |
+
"image/png": "",
|
| 361 |
+
"text/plain": [
|
| 362 |
+
"<Figure size 640x480 with 1 Axes>"
|
| 363 |
+
]
|
| 364 |
+
},
|
| 365 |
+
"metadata": {},
|
| 366 |
+
"output_type": "display_data"
|
| 367 |
+
}
|
| 368 |
+
],
|
| 369 |
+
"source": [
|
| 370 |
+
"import matplotlib.pyplot as plt\n",
|
| 371 |
+
"import numpy as np\n",
|
| 372 |
+
"\n",
|
| 373 |
+
"# Scores returned by get_sim_score, grouped by pair label\n",
|
| 374 |
+
"data1 = sim_score # scores of the pairs whose label == 1\n",
|
| 375 |
+
"data2 = dif_score # scores of the pairs whose label != 1\n",
|
| 376 |
+
"\n",
|
| 377 |
+
"# 绘制直方图\n",
|
| 378 |
+
"plt.hist(data1, bins=30, alpha=0.5, label='Data 1', color='blue', edgecolor='black')\n",
|
| 379 |
+
"plt.hist(data2, bins=30, alpha=0.5, label='Data 2', color='red', edgecolor='black')\n",
|
| 380 |
+
"\n",
|
| 381 |
+
"# 添加图例\n",
|
| 382 |
+
"plt.legend()\n",
|
| 383 |
+
"\n",
|
| 384 |
+
"# 添加标题和标签\n",
|
| 385 |
+
"plt.title('Histogram of Two Data Sets')\n",
|
| 386 |
+
"plt.xlabel('Value')\n",
|
| 387 |
+
"plt.ylabel('Frequency')\n",
|
| 388 |
+
"\n",
|
| 389 |
+
"# 显示图形\n",
|
| 390 |
+
"plt.show()\n"
|
| 391 |
+
]
|
| 392 |
+
},
|
| 393 |
+
{
|
| 394 |
+
"cell_type": "code",
|
| 395 |
+
"execution_count": null,
|
| 396 |
+
"id": "ff7610d0-0487-418b-905e-969f4cd4f321",
|
| 397 |
+
"metadata": {},
|
| 398 |
+
"outputs": [],
|
| 399 |
+
"source": []
|
| 400 |
+
}
|
| 401 |
+
],
|
| 402 |
+
"metadata": {
|
| 403 |
+
"kernelspec": {
|
| 404 |
+
"display_name": "Python 3 (ipykernel)",
|
| 405 |
+
"language": "python",
|
| 406 |
+
"name": "python3"
|
| 407 |
+
},
|
| 408 |
+
"language_info": {
|
| 409 |
+
"codemirror_mode": {
|
| 410 |
+
"name": "ipython",
|
| 411 |
+
"version": 3
|
| 412 |
+
},
|
| 413 |
+
"file_extension": ".py",
|
| 414 |
+
"mimetype": "text/x-python",
|
| 415 |
+
"name": "python",
|
| 416 |
+
"nbconvert_exporter": "python",
|
| 417 |
+
"pygments_lexer": "ipython3",
|
| 418 |
+
"version": "3.12.3"
|
| 419 |
+
}
|
| 420 |
+
},
|
| 421 |
+
"nbformat": 4,
|
| 422 |
+
"nbformat_minor": 5
|
| 423 |
+
}
|
best_model/vect_sim_protein_test.ipynb
ADDED
|
@@ -0,0 +1,423 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 1,
|
| 6 |
+
"id": "c0314d29-7fc8-4c11-8bfc-5440a442629a",
|
| 7 |
+
"metadata": {},
|
| 8 |
+
"outputs": [
|
| 9 |
+
{
|
| 10 |
+
"name": "stdout",
|
| 11 |
+
"output_type": "stream",
|
| 12 |
+
"text": [
|
| 13 |
+
"https://hf-mirror.com\n"
|
| 14 |
+
]
|
| 15 |
+
}
|
| 16 |
+
],
|
| 17 |
+
"source": [
|
| 18 |
+
"# import subprocess\n",
|
| 19 |
+
"# import os\n",
|
| 20 |
+
"\n",
|
| 21 |
+
"# result = subprocess.run('bash -c \"source /etc/network_turbo && env | grep proxy\"', shell=True, capture_output=True, text=True)\n",
|
| 22 |
+
"# output = result.stdout\n",
|
| 23 |
+
"# for line in output.splitlines():\n",
|
| 24 |
+
"# if '=' in line:\n",
|
| 25 |
+
"# var, value = line.split('=', 1)\n",
|
| 26 |
+
"# os.environ[var] = value\n",
|
| 27 |
+
"\n",
|
| 28 |
+
"\n",
|
| 29 |
+
"import os\n",
|
| 30 |
+
"\n",
|
| 31 |
+
"# 设置环境变量\n",
|
| 32 |
+
"os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'\n",
|
| 33 |
+
"\n",
|
| 34 |
+
"# 打印环境变量以确认设置成功\n",
|
| 35 |
+
"print(os.environ.get('HF_ENDPOINT'))"
|
| 36 |
+
]
|
| 37 |
+
},
|
| 38 |
+
{
|
| 39 |
+
"cell_type": "code",
|
| 40 |
+
"execution_count": 2,
|
| 41 |
+
"id": "1a09a2f5-dda6-4d86-badd-60bf2fda3983",
|
| 42 |
+
"metadata": {},
|
| 43 |
+
"outputs": [
|
| 44 |
+
{
|
| 45 |
+
"name": "stderr",
|
| 46 |
+
"output_type": "stream",
|
| 47 |
+
"text": [
|
| 48 |
+
"2025-02-09 17:52:09.693182: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
|
| 49 |
+
"2025-02-09 17:52:09.706600: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
|
| 50 |
+
"2025-02-09 17:52:09.722220: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
|
| 51 |
+
"2025-02-09 17:52:09.726994: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
|
| 52 |
+
"2025-02-09 17:52:09.739072: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
|
| 53 |
+
"To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
|
| 54 |
+
"2025-02-09 17:52:10.723436: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n"
|
| 55 |
+
]
|
| 56 |
+
}
|
| 57 |
+
],
|
| 58 |
+
"source": [
|
| 59 |
+
"from transformers import GPT2Tokenizer, GPT2Model,AutoModel\n",
|
| 60 |
+
"import torch\n",
|
| 61 |
+
"from transformers import AutoTokenizer, DataCollatorWithPadding\n",
|
| 62 |
+
"from datasets import load_dataset\n",
|
| 63 |
+
"from transformers import AutoTokenizer, DataCollatorWithPadding\n",
|
| 64 |
+
"from transformers import Trainer\n",
|
| 65 |
+
"import evaluate\n",
|
| 66 |
+
"import numpy as np\n",
|
| 67 |
+
"from transformers import TrainingArguments\n",
|
| 68 |
+
"from transformers import AutoModelForSequenceClassification"
|
| 69 |
+
]
|
| 70 |
+
},
|
| 71 |
+
{
|
| 72 |
+
"cell_type": "code",
|
| 73 |
+
"execution_count": 3,
|
| 74 |
+
"id": "7c5ab8f5-b8a9-4930-ac30-49d0bf26d579",
|
| 75 |
+
"metadata": {},
|
| 76 |
+
"outputs": [],
|
| 77 |
+
"source": [
|
| 78 |
+
"model_name=\"gpt2_gene_multi_v1_ft\"\n",
|
| 79 |
+
"device=\"cuda\"\n",
|
| 80 |
+
"tokenizer = AutoTokenizer.from_pretrained(\"gpt2_gene_multi_v1_ft\")\n",
|
| 81 |
+
"model = AutoModel.from_pretrained(model_name)\n",
|
| 82 |
+
"model.to(device)\n",
|
| 83 |
+
"\n",
|
| 84 |
+
"def get_text_embedding(text):\n",
|
| 85 |
+
" \"\"\"\n",
|
| 86 |
+
" 使用 GPT-2 模型获取文本的向量表示。\n",
|
| 87 |
+
" \n",
|
| 88 |
+
" 参数:\n",
|
| 89 |
+
" text (str): 输入文本。\n",
|
| 90 |
+
" 注意: model_name 与 device 不是本函数的参数;函数直接使用模块级全局变量 tokenizer、model 和 device。\n",
|
| 91 |
+
" 输入超过 256 个 token 时会被截断(truncation=True, max_length=256)。\n",
|
| 92 |
+
" \n",
|
| 93 |
+
" 返回:\n",
|
| 94 |
+
" torch.Tensor: 文本的向量表示,维度为 [hidden_size]。\n",
|
| 95 |
+
" \"\"\"\n",
|
| 96 |
+
"\n",
|
| 97 |
+
" # 将文本编码为输入 ID 并添加批量维度\n",
|
| 98 |
+
" inputs = tokenizer(text, return_tensors=\"pt\", truncation=True, max_length=256)\n",
|
| 99 |
+
" input_ids = inputs[\"input_ids\"].to(device)\n",
|
| 100 |
+
" attention_mask = inputs[\"attention_mask\"].to(device)\n",
|
| 101 |
+
" \n",
|
| 102 |
+
" # 获取模型的隐藏层输出\n",
|
| 103 |
+
" with torch.no_grad():\n",
|
| 104 |
+
" outputs = model(input_ids, attention_mask=attention_mask)\n",
|
| 105 |
+
" hidden_states = outputs.last_hidden_state # [batch_size, seq_length, hidden_size]\n",
|
| 106 |
+
" \n",
|
| 107 |
+
" # 平均池化:获取序列中所有词向量的平均值\n",
|
| 108 |
+
" embeddings = hidden_states.mean(dim=1).squeeze() # [hidden_size]\n",
|
| 109 |
+
" \n",
|
| 110 |
+
" return embeddings"
|
| 111 |
+
]
|
| 112 |
+
},
|
| 113 |
+
{
|
| 114 |
+
"cell_type": "code",
|
| 115 |
+
"execution_count": 4,
|
| 116 |
+
"id": "76ff7fed-fc17-421e-8a57-2a5de33d4ba6",
|
| 117 |
+
"metadata": {},
|
| 118 |
+
"outputs": [
|
| 119 |
+
{
|
| 120 |
+
"data": {
|
| 121 |
+
"text/plain": [
|
| 122 |
+
"0.3379075"
|
| 123 |
+
]
|
| 124 |
+
},
|
| 125 |
+
"execution_count": 4,
|
| 126 |
+
"metadata": {},
|
| 127 |
+
"output_type": "execute_result"
|
| 128 |
+
}
|
| 129 |
+
],
|
| 130 |
+
"source": [
|
| 131 |
+
"from sklearn.metrics.pairwise import cosine_similarity\n",
|
| 132 |
+
"\n",
|
| 133 |
+
"def get_sim_score(s1, s2):\n",
|
| 134 |
+
" v1 = get_text_embedding(s1)\n",
|
| 135 |
+
" v2 = get_text_embedding(s2)\n",
|
| 136 |
+
" \n",
|
| 137 |
+
" # 将 v1、v2(torch 张量)转到 CPU 并转换为 numpy 数组后,计算二者的余弦相似度\n",
|
| 138 |
+
" #similarity = cosine_similarity([v1.cpu().numpy()], [v2.cpu().numpy()])[0][0]\n",
|
| 139 |
+
" A = v1.cpu().numpy()\n",
|
| 140 |
+
" B = v2.cpu().numpy()\n",
|
| 141 |
+
" similarity = np.dot(A, B) / (np.linalg.norm(A) * np.linalg.norm(B))\n",
|
| 142 |
+
" #similarity = np.linalg.norm(A - B)\n",
|
| 143 |
+
" return similarity\n",
|
| 144 |
+
"\n",
|
| 145 |
+
"s1 = \"ATCCCCCCCCCCCCCCCCCCCCCCCCCCCCC\"\n",
|
| 146 |
+
"s2 = \"ATCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC\"\n",
|
| 147 |
+
"s1 = \"ATGTCTCTACAGATGATAACGGTCAGTAATAACGTAACTTTAATTCAACCAGGCTTCTCACTGATGAATTTTGATGGACAAGTTTTCTTCTTTGGTCAAAAAGGCTGGCCCAAGAGATCCTGCCCCACAGGAGTTTTCCATTTTGATGTAAAGCATAACCATCTCAAACTGAAGCCTACAGTTTTCTCAAAGGATTCCTGCTACCTTCCTCCTCTTCGCTATCCAGCCACTTGCATATTTAAAGGCAACTTCGAGTCTGAAAAGCATCAGTATATCATCCATGGAGGGAAAACACCAAACAATGAACTTTCAGATAAGATGTATGTCATGTCTATTGTTTGCAAAAACAACAAAAAATTTACTTTTCGCTGCACGGAGAAAGACTTGGTAGGTGATGTTCCTGAAGGCAGATATGGCCATTCCATTGATGTAGTGTATAGTCGAGGGAAAAGTATGGGCGTTCTCTTTGGAGGACGATCTTACATGCCTTCTGCCCAAAGAACCACAGAAAAATGGAACAGTGTAGTTGACTGCTTGCCCCATCTCTTCTTGGTGGATTTTGAATTTGGGTGTTCTACATCCTACATTCTTCCCGAACTTCAGGATGGGATATCTTTTCATGTCTCCATTGCCAGAAATGATACCATTTATATTTTAGGAGGTCATTCACTCACCAATAACATCCGCCCTGCCAATCTGTTCAGAGTAAGGGTTGATCTCCCCCTGGGTAGCCCAGCTGTGAGTTGCACGGTCTTATCAGGAGGAATCTCTGTCTCCAGTGCAATCTTGACTCAAACTAATAATGATGAATTTGTCATTGTTGGTGGCTATCAGCTTGAAAATCAAAAAAGAATGGTCTGCAACATTGTCACTTTAGATGACAACAAGATAGATATTCGTGAGATGGAGGCACCAGATTGGACCCCAGATATTAAGCACAGCAAGGTATGGTTTGGAAACAACATGGGAAATGGGAGTGTTTTCCTTGGAATACCAGGAGACAATAAGCAGGCTGTTTCAGAAGCATTCTATTTCTATATGTTGAAATGTGCTGAAGATGATATAAATGAAGATGAGAAAACATTGATGAACAGTCAGACATCAACAGAAGATCCAGGAGACTCCACACCCTTTGAAGACTCGGAAGAATTTTGCTTCAGTGCAGAAGCAAATAGTTTTGGTGGGGATGATGAATTTGACACCTATAATGAAGATGATGAGGAAGATGAGTCTGAGACAGGCTACTGGATTACGTGTTGCCTTACTTGTAATGTGGATATCAACACTTGGGTACCATTCTACTCAACTGAGCTCAACAAACCTGCTATGATCTACTGCTCTCATGAGGACGGGCACTGGGTCCATGCTCAGTGCATGGATCTGGCAGAGCGCACGCTCATCCATCTGTCAGAAGGAAGCAACAAGTATTATTGCAATGAGCATGTGGAGATAGCAAGAGCACTACAAACCCCCAAAAGAGCCATGCCCTTGAAAAAGCCCCCACTGAAATCCCTCCGCAAAAAAGGCCCTGCAAAAATCTTGACTCCTGCCAAGAAATCCTTCCTTAGAAGATTGTTTGAT\"\n",
|
| 148 |
+
"s2 = \"MSLQMITVSNNVTLIQPGFSLMNFDGQVFFFGQKGWPKRSCPTGVFHFDVKHNHLKLKPTVFSKDSCYLPPLRYPATCIFKGNFESEKHQYIIHGGKTPNNELSDKMYVMSIVCKNNKKFTFRCTEKDLVGDVPEGRYGHSIDVVYSRGKSMGVLFGGRSYMPSAQRTTEKWNSVVDCLPHLFLVDFEFGCSTSYILPELQDGISFHVSIARNDTIYILGGHSLTNNIRPANLFRVRVDLPLGSPAVSCTVLSGGISVSSAILTQTNNDEFVIVGGYQLENQKRMVCNIVTLDDNKIDIREMEAPDWTPDIKHSKVWFGNNMGNGSVFLGIPGDNKQAVSEAFYFYMLKCAEDDINEDEKTLMNSQTSTEDPGDSTPFEDSEEFCFSAEANSFGGDDEFDTYNEDDEEDESETGYWITCCLTCNVDINTWVPFYSTELNKPAMIYCSHEDGHWVHAQCMDLAERTLIHLSEGSNKYYCNEHVEIARALQTPKRAMPLKKPPLKSLRKKGPAKILTPAKKSFLRRLFD\"\n",
|
| 149 |
+
"get_sim_score(s1, s2)"
|
| 150 |
+
]
|
| 151 |
+
},
|
| 152 |
+
{
|
| 153 |
+
"cell_type": "code",
|
| 154 |
+
"execution_count": 5,
|
| 155 |
+
"id": "2116f787-781f-4bfc-b12d-c36efe26cfa9",
|
| 156 |
+
"metadata": {},
|
| 157 |
+
"outputs": [
|
| 158 |
+
{
|
| 159 |
+
"data": {
|
| 160 |
+
"text/plain": [
|
| 161 |
+
"DatasetDict({\n",
|
| 162 |
+
" train: Dataset({\n",
|
| 163 |
+
" features: ['sentence1', 'sentence2', 'label'],\n",
|
| 164 |
+
" num_rows: 2000\n",
|
| 165 |
+
" })\n",
|
| 166 |
+
" test: Dataset({\n",
|
| 167 |
+
" features: ['sentence1', 'sentence2', 'label'],\n",
|
| 168 |
+
" num_rows: 2000\n",
|
| 169 |
+
" })\n",
|
| 170 |
+
"})"
|
| 171 |
+
]
|
| 172 |
+
},
|
| 173 |
+
"execution_count": 5,
|
| 174 |
+
"metadata": {},
|
| 175 |
+
"output_type": "execute_result"
|
| 176 |
+
}
|
| 177 |
+
],
|
| 178 |
+
"source": [
|
| 179 |
+
"#模型测试 蛋白质数据集,150长度/450bp,复杂版本 不相似\n",
|
| 180 |
+
"raw_datasets_dna_protein = load_dataset('dnagpt/gene_lan_transfer', 'dna_protein_pair')['train'].train_test_split(test_size=0.5) #默认已经shuffle\n",
|
| 181 |
+
"raw_datasets_dna_protein"
|
| 182 |
+
]
|
| 183 |
+
},
|
| 184 |
+
{
|
| 185 |
+
"cell_type": "code",
|
| 186 |
+
"execution_count": 6,
|
| 187 |
+
"id": "6e6b1d3d-3d05-40b0-a96a-537a4dc324d6",
|
| 188 |
+
"metadata": {},
|
| 189 |
+
"outputs": [],
|
| 190 |
+
"source": [
|
| 191 |
+
"sim_score = []\n",
|
| 192 |
+
"dif_score = []\n",
|
| 193 |
+
"\n",
|
| 194 |
+
"for item in raw_datasets_dna_protein[\"train\"]:\n",
|
| 195 |
+
" #print(item)\n",
|
| 196 |
+
" sentence1 = item[\"sentence1\"]\n",
|
| 197 |
+
" sentence2 = item[\"sentence2\"]\n",
|
| 198 |
+
" label = item[\"label\"]\n",
|
| 199 |
+
" score = get_sim_score(sentence1, sentence2)\n",
|
| 200 |
+
"\n",
|
| 201 |
+
" if 1 == label:\n",
|
| 202 |
+
" sim_score.append(score)\n",
|
| 203 |
+
" else:\n",
|
| 204 |
+
" dif_score.append(score)"
|
| 205 |
+
]
|
| 206 |
+
},
|
| 207 |
+
{
|
| 208 |
+
"cell_type": "code",
|
| 209 |
+
"execution_count": 7,
|
| 210 |
+
"id": "a515f319-254b-4675-9ca5-fb15da6a62e5",
|
| 211 |
+
"metadata": {},
|
| 212 |
+
"outputs": [
|
| 213 |
+
{
|
| 214 |
+
"name": "stdout",
|
| 215 |
+
"output_type": "stream",
|
| 216 |
+
"text": [
|
| 217 |
+
"0.34443858 0.3725082\n"
|
| 218 |
+
]
|
| 219 |
+
}
|
| 220 |
+
],
|
| 221 |
+
"source": [
|
| 222 |
+
"import numpy as np\n",
|
| 223 |
+
"print(np.mean(sim_score), np.mean(dif_score))"
|
| 224 |
+
]
|
| 225 |
+
},
|
| 226 |
+
{
|
| 227 |
+
"cell_type": "code",
|
| 228 |
+
"execution_count": 8,
|
| 229 |
+
"id": "4417c7b5-8019-4a53-968a-4dee311acef3",
|
| 230 |
+
"metadata": {},
|
| 231 |
+
"outputs": [
|
| 232 |
+
{
|
| 233 |
+
"name": "stdout",
|
| 234 |
+
"output_type": "stream",
|
| 235 |
+
"text": [
|
| 236 |
+
"983 1017\n"
|
| 237 |
+
]
|
| 238 |
+
}
|
| 239 |
+
],
|
| 240 |
+
"source": [
|
| 241 |
+
"print(len(sim_score),len(dif_score))"
|
| 242 |
+
]
|
| 243 |
+
},
|
| 244 |
+
{
|
| 245 |
+
"cell_type": "code",
|
| 246 |
+
"execution_count": 9,
|
| 247 |
+
"id": "adc022c4-7bec-4381-b80b-6ac1b18be00c",
|
| 248 |
+
"metadata": {},
|
| 249 |
+
"outputs": [
|
| 250 |
+
{
|
| 251 |
+
"name": "stderr",
|
| 252 |
+
"output_type": "stream",
|
| 253 |
+
"text": [
|
| 254 |
+
"/root/miniconda3/lib/python3.12/site-packages/IPython/core/pylabtools.py:170: UserWarning: Glyph 39057 (\\N{CJK UNIFIED IDEOGRAPH-9891}) missing from font(s) DejaVu Sans.\n",
|
| 255 |
+
" fig.canvas.print_figure(bytes_io, **kw)\n",
|
| 256 |
+
"/root/miniconda3/lib/python3.12/site-packages/IPython/core/pylabtools.py:170: UserWarning: Glyph 25968 (\\N{CJK UNIFIED IDEOGRAPH-6570}) missing from font(s) DejaVu Sans.\n",
|
| 257 |
+
" fig.canvas.print_figure(bytes_io, **kw)\n",
|
| 258 |
+
"/root/miniconda3/lib/python3.12/site-packages/IPython/core/pylabtools.py:170: UserWarning: Glyph 20540 (\\N{CJK UNIFIED IDEOGRAPH-503C}) missing from font(s) DejaVu Sans.\n",
|
| 259 |
+
" fig.canvas.print_figure(bytes_io, **kw)\n",
|
| 260 |
+
"/root/miniconda3/lib/python3.12/site-packages/IPython/core/pylabtools.py:170: UserWarning: Glyph 20998 (\\N{CJK UNIFIED IDEOGRAPH-5206}) missing from font(s) DejaVu Sans.\n",
|
| 261 |
+
" fig.canvas.print_figure(bytes_io, **kw)\n",
|
| 262 |
+
"/root/miniconda3/lib/python3.12/site-packages/IPython/core/pylabtools.py:170: UserWarning: Glyph 24067 (\\N{CJK UNIFIED IDEOGRAPH-5E03}) missing from font(s) DejaVu Sans.\n",
|
| 263 |
+
" fig.canvas.print_figure(bytes_io, **kw)\n",
|
| 264 |
+
"/root/miniconda3/lib/python3.12/site-packages/IPython/core/pylabtools.py:170: UserWarning: Glyph 30452 (\\N{CJK UNIFIED IDEOGRAPH-76F4}) missing from font(s) DejaVu Sans.\n",
|
| 265 |
+
" fig.canvas.print_figure(bytes_io, **kw)\n",
|
| 266 |
+
"/root/miniconda3/lib/python3.12/site-packages/IPython/core/pylabtools.py:170: UserWarning: Glyph 26041 (\\N{CJK UNIFIED IDEOGRAPH-65B9}) missing from font(s) DejaVu Sans.\n",
|
| 267 |
+
" fig.canvas.print_figure(bytes_io, **kw)\n",
|
| 268 |
+
"/root/miniconda3/lib/python3.12/site-packages/IPython/core/pylabtools.py:170: UserWarning: Glyph 22270 (\\N{CJK UNIFIED IDEOGRAPH-56FE}) missing from font(s) DejaVu Sans.\n",
|
| 269 |
+
" fig.canvas.print_figure(bytes_io, **kw)\n",
|
| 270 |
+
"/root/miniconda3/lib/python3.12/site-packages/IPython/core/pylabtools.py:170: UserWarning: Glyph 21306 (\\N{CJK UNIFIED IDEOGRAPH-533A}) missing from font(s) DejaVu Sans.\n",
|
| 271 |
+
" fig.canvas.print_figure(bytes_io, **kw)\n",
|
| 272 |
+
"/root/miniconda3/lib/python3.12/site-packages/IPython/core/pylabtools.py:170: UserWarning: Glyph 38388 (\\N{CJK UNIFIED IDEOGRAPH-95F4}) missing from font(s) DejaVu Sans.\n",
|
| 273 |
+
" fig.canvas.print_figure(bytes_io, **kw)\n"
|
| 274 |
+
]
|
| 275 |
+
},
|
| 276 |
+
{
|
| 277 |
+
"data": {
|
| 278 |
+
"image/png": "",
|
| 279 |
+
"text/plain": [
|
| 280 |
+
"<Figure size 800x600 with 1 Axes>"
|
| 281 |
+
]
|
| 282 |
+
},
|
| 283 |
+
"metadata": {},
|
| 284 |
+
"output_type": "display_data"
|
| 285 |
+
}
|
| 286 |
+
],
|
| 287 |
+
"source": [
|
| 288 |
+
"import matplotlib.pyplot as plt\n",
|
| 289 |
+
"import numpy as np\n",
|
| 290 |
+
"\n",
|
| 291 |
+
"# 待绘制的数据:label == 1 的样本对的相似度得分(Python list,float 类型)\n",
|
| 292 |
+
"#data = np.random.normal(loc=50, scale=15, size=200) # 生成200个服从正态分布的随机数\n",
|
| 293 |
+
"data = sim_score\n",
|
| 294 |
+
"\n",
|
| 295 |
+
"# 计算直方图并自动确定分区数\n",
|
| 296 |
+
"plt.figure(figsize=(8, 6)) # 设置图像大小\n",
|
| 297 |
+
"plt.hist(data, bins='auto', edgecolor='black', alpha=0.7)\n",
|
| 298 |
+
"\n",
|
| 299 |
+
"# 添加标题和标签\n",
|
| 300 |
+
"plt.xlabel('数值区间', fontsize=12)\n",
|
| 301 |
+
"plt.ylabel('频数', fontsize=12)\n",
|
| 302 |
+
"plt.title('数值分布直方图', fontsize=14)\n",
|
| 303 |
+
"\n",
|
| 304 |
+
"# 显示网格\n",
|
| 305 |
+
"plt.grid(axis='y', linestyle='--', alpha=0.7)\n",
|
| 306 |
+
"\n",
|
| 307 |
+
"# 显示直方图\n",
|
| 308 |
+
"plt.show()"
|
| 309 |
+
]
|
| 310 |
+
},
|
| 311 |
+
{
|
| 312 |
+
"cell_type": "code",
|
| 313 |
+
"execution_count": 10,
|
| 314 |
+
"id": "71ff0b65-1ced-49de-8bf3-60c9715916db",
|
| 315 |
+
"metadata": {},
|
| 316 |
+
"outputs": [
|
| 317 |
+
{
|
| 318 |
+
"data": {
|
| 319 |
+
"image/png": "",
|
| 320 |
+
"text/plain": [
|
| 321 |
+
"<Figure size 800x600 with 1 Axes>"
|
| 322 |
+
]
|
| 323 |
+
},
|
| 324 |
+
"metadata": {},
|
| 325 |
+
"output_type": "display_data"
|
| 326 |
+
}
|
| 327 |
+
],
|
| 328 |
+
"source": [
|
| 329 |
+
"import matplotlib.pyplot as plt\n",
|
| 330 |
+
"import numpy as np\n",
|
| 331 |
+
"\n",
|
| 332 |
+
"# 待绘制的数据:label != 1 的样本对的相似度得分(Python list,float 类型)\n",
|
| 333 |
+
"#data = np.random.normal(loc=50, scale=15, size=200) # 生成200个服从正态分布的随机数\n",
|
| 334 |
+
"data = dif_score\n",
|
| 335 |
+
"\n",
|
| 336 |
+
"# 计算直方图并自动确定分区数\n",
|
| 337 |
+
"plt.figure(figsize=(8, 6)) # 设置图像大小\n",
|
| 338 |
+
"plt.hist(data, bins='auto', edgecolor='black', alpha=0.7)\n",
|
| 339 |
+
"\n",
|
| 340 |
+
"# 添加标题和标签\n",
|
| 341 |
+
"plt.xlabel('数值区间', fontsize=12)\n",
|
| 342 |
+
"plt.ylabel('频数', fontsize=12)\n",
|
| 343 |
+
"plt.title('数值分布直方图', fontsize=14)\n",
|
| 344 |
+
"\n",
|
| 345 |
+
"# 显示网格\n",
|
| 346 |
+
"plt.grid(axis='y', linestyle='--', alpha=0.7)\n",
|
| 347 |
+
"\n",
|
| 348 |
+
"# 显示直方图\n",
|
| 349 |
+
"plt.show()"
|
| 350 |
+
]
|
| 351 |
+
},
|
| 352 |
+
{
|
| 353 |
+
"cell_type": "code",
|
| 354 |
+
"execution_count": 11,
|
| 355 |
+
"id": "f314d408-18ff-4e59-92bd-eb7a767ca262",
|
| 356 |
+
"metadata": {},
|
| 357 |
+
"outputs": [
|
| 358 |
+
{
|
| 359 |
+
"data": {
|
| 360 |
+
"image/png": "",
|
| 361 |
+
"text/plain": [
|
| 362 |
+
"<Figure size 640x480 with 1 Axes>"
|
| 363 |
+
]
|
| 364 |
+
},
|
| 365 |
+
"metadata": {},
|
| 366 |
+
"output_type": "display_data"
|
| 367 |
+
}
|
| 368 |
+
],
|
| 369 |
+
"source": [
|
| 370 |
+
"import matplotlib.pyplot as plt\n",
|
| 371 |
+
"import numpy as np\n",
|
| 372 |
+
"\n",
|
| 373 |
+
"# 待比较的两组相似度得分\n",
|
| 374 |
+
"data1 = sim_score # label == 1 的样本对的相似度得分\n",
|
| 375 |
+
"data2 = dif_score # label != 1 的样本对的相似度得分\n",
|
| 376 |
+
"\n",
|
| 377 |
+
"# 绘制直方图\n",
|
| 378 |
+
"plt.hist(data1, bins=30, alpha=0.5, label='Data 1', color='blue', edgecolor='black')\n",
|
| 379 |
+
"plt.hist(data2, bins=30, alpha=0.5, label='Data 2', color='red', edgecolor='black')\n",
|
| 380 |
+
"\n",
|
| 381 |
+
"# 添加图例\n",
|
| 382 |
+
"plt.legend()\n",
|
| 383 |
+
"\n",
|
| 384 |
+
"# 添加标题和标签\n",
|
| 385 |
+
"plt.title('Histogram of Two Data Sets')\n",
|
| 386 |
+
"plt.xlabel('Value')\n",
|
| 387 |
+
"plt.ylabel('Frequency')\n",
|
| 388 |
+
"\n",
|
| 389 |
+
"# 显示图形\n",
|
| 390 |
+
"plt.show()\n"
|
| 391 |
+
]
|
| 392 |
+
},
|
| 393 |
+
{
|
| 394 |
+
"cell_type": "code",
|
| 395 |
+
"execution_count": null,
|
| 396 |
+
"id": "ff7610d0-0487-418b-905e-969f4cd4f321",
|
| 397 |
+
"metadata": {},
|
| 398 |
+
"outputs": [],
|
| 399 |
+
"source": []
|
| 400 |
+
}
|
| 401 |
+
],
|
| 402 |
+
"metadata": {
|
| 403 |
+
"kernelspec": {
|
| 404 |
+
"display_name": "Python 3 (ipykernel)",
|
| 405 |
+
"language": "python",
|
| 406 |
+
"name": "python3"
|
| 407 |
+
},
|
| 408 |
+
"language_info": {
|
| 409 |
+
"codemirror_mode": {
|
| 410 |
+
"name": "ipython",
|
| 411 |
+
"version": 3
|
| 412 |
+
},
|
| 413 |
+
"file_extension": ".py",
|
| 414 |
+
"mimetype": "text/x-python",
|
| 415 |
+
"name": "python",
|
| 416 |
+
"nbconvert_exporter": "python",
|
| 417 |
+
"pygments_lexer": "ipython3",
|
| 418 |
+
"version": "3.12.3"
|
| 419 |
+
}
|
| 420 |
+
},
|
| 421 |
+
"nbformat": 4,
|
| 422 |
+
"nbformat_minor": 5
|
| 423 |
+
}
|
finetune/2-gpt2-gene-multi-v2-instruction-ft.ipynb
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
finetune/get_acc_stat_multiv1.ipynb
ADDED
|
@@ -0,0 +1,232 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 1,
|
| 6 |
+
"id": "8159f2eb-88ce-4c45-b1ae-584ce3a1976f",
|
| 7 |
+
"metadata": {},
|
| 8 |
+
"outputs": [],
|
| 9 |
+
"source": [
|
| 10 |
+
"import json"
|
| 11 |
+
]
|
| 12 |
+
},
|
| 13 |
+
{
|
| 14 |
+
"cell_type": "code",
|
| 15 |
+
"execution_count": 2,
|
| 16 |
+
"id": "179a6741-6649-4bea-be83-7fc9fd6c13c6",
|
| 17 |
+
"metadata": {},
|
| 18 |
+
"outputs": [],
|
| 19 |
+
"source": [
|
| 20 |
+
"filename = \"gpt2_gene_multiv1_ft_en.jsonl\"\n",
|
| 21 |
+
"data_list = []\n",
|
| 22 |
+
"for line in open(filename):\n",
|
| 23 |
+
" data = json.loads(line)\n",
|
| 24 |
+
" data_list.append(data)\n",
|
| 25 |
+
" "
|
| 26 |
+
]
|
| 27 |
+
},
|
| 28 |
+
{
|
| 29 |
+
"cell_type": "code",
|
| 30 |
+
"execution_count": 10,
|
| 31 |
+
"id": "c8cc78e9-fbdf-4c95-847f-44ea953a38ec",
|
| 32 |
+
"metadata": {},
|
| 33 |
+
"outputs": [
|
| 34 |
+
{
|
| 35 |
+
"name": "stdout",
|
| 36 |
+
"output_type": "stream",
|
| 37 |
+
"text": [
|
| 38 |
+
"en: 0.899\n",
|
| 39 |
+
"fr: 0.8245\n",
|
| 40 |
+
"de: 0.79\n",
|
| 41 |
+
"zh: 0.7395\n",
|
| 42 |
+
"dna_sim_pair_simple_150bp: 0.955\n",
|
| 43 |
+
"dna_sim_pair_150bp: 0.86975\n",
|
| 44 |
+
"dna_sim_pair_50bp: 0.8185\n",
|
| 45 |
+
"protein_sim_pair_150bp: 0.9738888888888889\n",
|
| 46 |
+
"protein_sim_pair_450bp: 0.975\n",
|
| 47 |
+
"dna_protein_pair: 0.5575\n",
|
| 48 |
+
"dna_protein_pair_100: 0.5725\n",
|
| 49 |
+
"dna_protein_pair_full: 0.7324999999999999\n",
|
| 50 |
+
"dna_protein_pair_rand: 0.6\n",
|
| 51 |
+
"dna_protein_pair_rand_100: 0.60375\n",
|
| 52 |
+
"dna_protein_pair_rand_full: 0.61125\n"
|
| 53 |
+
]
|
| 54 |
+
}
|
| 55 |
+
],
|
| 56 |
+
"source": [
|
| 57 |
+
"# 假设您的数据存储在一个名为data_list的列表中\n",
|
| 58 |
+
"# 初始化一个字典来保存每个键的最大accuracy值\n",
|
| 59 |
+
"max_accuracies = {}\n",
|
| 60 |
+
"\n",
|
| 61 |
+
"dna_protein_pair_full_list = []\n",
|
| 62 |
+
"\n",
|
| 63 |
+
"\n",
|
| 64 |
+
"# 遍历列表中的每个字典\n",
|
| 65 |
+
"for data in data_list:\n",
|
| 66 |
+
" for key, metrics in data.items():\n",
|
| 67 |
+
" if key not in ['seed']: # 忽略非目标键,例如'seed'\n",
|
| 68 |
+
" if isinstance(metrics, dict) and 'accuracy' in metrics:\n",
|
| 69 |
+
" accuracy = metrics['accuracy']\n",
|
| 70 |
+
" if accuracy<0.5:\n",
|
| 71 |
+
" accuracy = 1-accuracy\n",
|
| 72 |
+
"\n",
|
| 73 |
+
" if key==\"dna_protein_pair_full\":\n",
|
| 74 |
+
" dna_protein_pair_full_list.append(accuracy)\n",
|
| 75 |
+
" \n",
|
| 76 |
+
" if key not in max_accuracies or accuracy > max_accuracies[key]:\n",
|
| 77 |
+
" max_accuracies[key] = accuracy\n",
|
| 78 |
+
"\n",
|
| 79 |
+
"# 打印每个键的最大accuracy值\n",
|
| 80 |
+
"for key, max_accuracy in max_accuracies.items():\n",
|
| 81 |
+
" print(f\"{key}: {max_accuracy}\")"
|
| 82 |
+
]
|
| 83 |
+
},
|
| 84 |
+
{
|
| 85 |
+
"cell_type": "code",
|
| 86 |
+
"execution_count": 9,
|
| 87 |
+
"id": "0d2f40f8-a817-4b6b-ae17-310478f6f8d8",
|
| 88 |
+
"metadata": {},
|
| 89 |
+
"outputs": [
|
| 90 |
+
{
|
| 91 |
+
"data": {
|
| 92 |
+
"text/plain": [
|
| 93 |
+
"[0.505,\n",
|
| 94 |
+
" 0.4975,\n",
|
| 95 |
+
" 0.575,\n",
|
| 96 |
+
" 0.5025,\n",
|
| 97 |
+
" 0.45,\n",
|
| 98 |
+
" 0.4775,\n",
|
| 99 |
+
" 0.515,\n",
|
| 100 |
+
" 0.505,\n",
|
| 101 |
+
" 0.475,\n",
|
| 102 |
+
" 0.515,\n",
|
| 103 |
+
" 0.5675,\n",
|
| 104 |
+
" 0.4275,\n",
|
| 105 |
+
" 0.4875,\n",
|
| 106 |
+
" 0.5125,\n",
|
| 107 |
+
" 0.505,\n",
|
| 108 |
+
" 0.5025,\n",
|
| 109 |
+
" 0.53,\n",
|
| 110 |
+
" 0.5425,\n",
|
| 111 |
+
" 0.51,\n",
|
| 112 |
+
" 0.49,\n",
|
| 113 |
+
" 0.3925,\n",
|
| 114 |
+
" 0.4825,\n",
|
| 115 |
+
" 0.5425,\n",
|
| 116 |
+
" 0.385,\n",
|
| 117 |
+
" 0.36,\n",
|
| 118 |
+
" 0.5125,\n",
|
| 119 |
+
" 0.535,\n",
|
| 120 |
+
" 0.4825,\n",
|
| 121 |
+
" 0.5025,\n",
|
| 122 |
+
" 0.485,\n",
|
| 123 |
+
" 0.5125,\n",
|
| 124 |
+
" 0.3325,\n",
|
| 125 |
+
" 0.6225,\n",
|
| 126 |
+
" 0.4975,\n",
|
| 127 |
+
" 0.5375,\n",
|
| 128 |
+
" 0.4975,\n",
|
| 129 |
+
" 0.5325,\n",
|
| 130 |
+
" 0.2925,\n",
|
| 131 |
+
" 0.4825,\n",
|
| 132 |
+
" 0.4875,\n",
|
| 133 |
+
" 0.4975,\n",
|
| 134 |
+
" 0.53,\n",
|
| 135 |
+
" 0.285,\n",
|
| 136 |
+
" 0.4625,\n",
|
| 137 |
+
" 0.4275,\n",
|
| 138 |
+
" 0.48,\n",
|
| 139 |
+
" 0.4225,\n",
|
| 140 |
+
" 0.55,\n",
|
| 141 |
+
" 0.385,\n",
|
| 142 |
+
" 0.5175,\n",
|
| 143 |
+
" 0.53,\n",
|
| 144 |
+
" 0.4375,\n",
|
| 145 |
+
" 0.495,\n",
|
| 146 |
+
" 0.485,\n",
|
| 147 |
+
" 0.3425,\n",
|
| 148 |
+
" 0.4875,\n",
|
| 149 |
+
" 0.5575,\n",
|
| 150 |
+
" 0.4825,\n",
|
| 151 |
+
" 0.2675,\n",
|
| 152 |
+
" 0.4975,\n",
|
| 153 |
+
" 0.5375,\n",
|
| 154 |
+
" 0.5375,\n",
|
| 155 |
+
" 0.475,\n",
|
| 156 |
+
" 0.3525,\n",
|
| 157 |
+
" 0.485,\n",
|
| 158 |
+
" 0.34,\n",
|
| 159 |
+
" 0.4625,\n",
|
| 160 |
+
" 0.5,\n",
|
| 161 |
+
" 0.505,\n",
|
| 162 |
+
" 0.5075,\n",
|
| 163 |
+
" 0.515,\n",
|
| 164 |
+
" 0.4925,\n",
|
| 165 |
+
" 0.445,\n",
|
| 166 |
+
" 0.3675,\n",
|
| 167 |
+
" 0.5125,\n",
|
| 168 |
+
" 0.495,\n",
|
| 169 |
+
" 0.4175,\n",
|
| 170 |
+
" 0.4725,\n",
|
| 171 |
+
" 0.5025,\n",
|
| 172 |
+
" 0.4875,\n",
|
| 173 |
+
" 0.53,\n",
|
| 174 |
+
" 0.5425,\n",
|
| 175 |
+
" 0.4175,\n",
|
| 176 |
+
" 0.34,\n",
|
| 177 |
+
" 0.5225,\n",
|
| 178 |
+
" 0.49,\n",
|
| 179 |
+
" 0.4125,\n",
|
| 180 |
+
" 0.3575,\n",
|
| 181 |
+
" 0.4925,\n",
|
| 182 |
+
" 0.535,\n",
|
| 183 |
+
" 0.51,\n",
|
| 184 |
+
" 0.49,\n",
|
| 185 |
+
" 0.535,\n",
|
| 186 |
+
" 0.4975,\n",
|
| 187 |
+
" 0.3825,\n",
|
| 188 |
+
" 0.48,\n",
|
| 189 |
+
" 0.485,\n",
|
| 190 |
+
" 0.5]"
|
| 191 |
+
]
|
| 192 |
+
},
|
| 193 |
+
"execution_count": 9,
|
| 194 |
+
"metadata": {},
|
| 195 |
+
"output_type": "execute_result"
|
| 196 |
+
}
|
| 197 |
+
],
|
| 198 |
+
"source": [
|
| 199 |
+
"dna_protein_pair_full_list"
|
| 200 |
+
]
|
| 201 |
+
},
|
| 202 |
+
{
|
| 203 |
+
"cell_type": "code",
|
| 204 |
+
"execution_count": null,
|
| 205 |
+
"id": "110b1efd-1ccb-43d1-9033-53cbf92146e2",
|
| 206 |
+
"metadata": {},
|
| 207 |
+
"outputs": [],
|
| 208 |
+
"source": []
|
| 209 |
+
}
|
| 210 |
+
],
|
| 211 |
+
"metadata": {
|
| 212 |
+
"kernelspec": {
|
| 213 |
+
"display_name": "Python 3 (ipykernel)",
|
| 214 |
+
"language": "python",
|
| 215 |
+
"name": "python3"
|
| 216 |
+
},
|
| 217 |
+
"language_info": {
|
| 218 |
+
"codemirror_mode": {
|
| 219 |
+
"name": "ipython",
|
| 220 |
+
"version": 3
|
| 221 |
+
},
|
| 222 |
+
"file_extension": ".py",
|
| 223 |
+
"mimetype": "text/x-python",
|
| 224 |
+
"name": "python",
|
| 225 |
+
"nbconvert_exporter": "python",
|
| 226 |
+
"pygments_lexer": "ipython3",
|
| 227 |
+
"version": "3.12.3"
|
| 228 |
+
}
|
| 229 |
+
},
|
| 230 |
+
"nbformat": 4,
|
| 231 |
+
"nbformat_minor": 5
|
| 232 |
+
}
|
finetune/get_acc_stat_multiv1_2.ipynb
ADDED
|
@@ -0,0 +1,319 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 1,
|
| 6 |
+
"id": "8159f2eb-88ce-4c45-b1ae-584ce3a1976f",
|
| 7 |
+
"metadata": {},
|
| 8 |
+
"outputs": [],
|
| 9 |
+
"source": [
|
| 10 |
+
"import json"
|
| 11 |
+
]
|
| 12 |
+
},
|
| 13 |
+
{
|
| 14 |
+
"cell_type": "code",
|
| 15 |
+
"execution_count": 2,
|
| 16 |
+
"id": "179a6741-6649-4bea-be83-7fc9fd6c13c6",
|
| 17 |
+
"metadata": {},
|
| 18 |
+
"outputs": [],
|
| 19 |
+
"source": [
|
| 20 |
+
"filename = \"gpt2_gene_multiv1_ft_en2.jsonl\"\n",
|
| 21 |
+
"data_list = []\n",
|
| 22 |
+
"for line in open(filename):\n",
|
| 23 |
+
" data = json.loads(line)\n",
|
| 24 |
+
" data_list.append(data)\n",
|
| 25 |
+
" "
|
| 26 |
+
]
|
| 27 |
+
},
|
| 28 |
+
{
|
| 29 |
+
"cell_type": "code",
|
| 30 |
+
"execution_count": 3,
|
| 31 |
+
"id": "c8cc78e9-fbdf-4c95-847f-44ea953a38ec",
|
| 32 |
+
"metadata": {},
|
| 33 |
+
"outputs": [
|
| 34 |
+
{
|
| 35 |
+
"name": "stdout",
|
| 36 |
+
"output_type": "stream",
|
| 37 |
+
"text": [
|
| 38 |
+
"dna_protein_pair_full: 0.7\n",
|
| 39 |
+
"dna_protein_pair_rand_full: 0.708375\n"
|
| 40 |
+
]
|
| 41 |
+
}
|
| 42 |
+
],
|
| 43 |
+
"source": [
|
| 44 |
+
"# 假设您的数据存储在一个名为data_list的列表中\n",
|
| 45 |
+
"# 初始化一个字典来保存每个键的最大accuracy值\n",
|
| 46 |
+
"max_accuracies = {}\n",
|
| 47 |
+
"dna_protein_pair_full_list = []\n",
|
| 48 |
+
"fr_list = []\n",
|
| 49 |
+
"zh_list = []\n",
|
| 50 |
+
"\n",
|
| 51 |
+
"# 遍历列表中的每个字典\n",
|
| 52 |
+
"for data in data_list:\n",
|
| 53 |
+
" for key, metrics in data.items():\n",
|
| 54 |
+
" if key not in ['seed']: # 忽略非目标键,例如'seed'\n",
|
| 55 |
+
" if isinstance(metrics, dict) and 'accuracy' in metrics:\n",
|
| 56 |
+
" accuracy = metrics['accuracy']\n",
|
| 57 |
+
" if accuracy<0.5:\n",
|
| 58 |
+
" accuracy = 1 - accuracy\n",
|
| 59 |
+
" if \"dna_protein_pair_full\"==key:\n",
|
| 60 |
+
" dna_protein_pair_full_list.append(accuracy)\n",
|
| 61 |
+
"\n",
|
| 62 |
+
" if \"fr\"==key:\n",
|
| 63 |
+
" fr_list.append(accuracy)\n",
|
| 64 |
+
"\n",
|
| 65 |
+
" if \"zh\"==key:\n",
|
| 66 |
+
" zh_list.append(accuracy)\n",
|
| 67 |
+
" \n",
|
| 68 |
+
" if key not in max_accuracies or accuracy > max_accuracies[key]:\n",
|
| 69 |
+
" max_accuracies[key] = accuracy\n",
|
| 70 |
+
"\n",
|
| 71 |
+
"# 打印每个键的最大accuracy值\n",
|
| 72 |
+
"for key, max_accuracy in max_accuracies.items():\n",
|
| 73 |
+
" print(f\"{key}: {max_accuracy}\")"
|
| 74 |
+
]
|
| 75 |
+
},
|
| 76 |
+
{
|
| 77 |
+
"cell_type": "code",
|
| 78 |
+
"execution_count": 4,
|
| 79 |
+
"id": "0d2f40f8-a817-4b6b-ae17-310478f6f8d8",
|
| 80 |
+
"metadata": {},
|
| 81 |
+
"outputs": [],
|
| 82 |
+
"source": [
|
| 83 |
+
"#!pip install matplotlib seaborn"
|
| 84 |
+
]
|
| 85 |
+
},
|
| 86 |
+
{
|
| 87 |
+
"cell_type": "code",
|
| 88 |
+
"execution_count": 5,
|
| 89 |
+
"id": "367b765c-ec8d-4cbb-a76c-d1e891816e14",
|
| 90 |
+
"metadata": {},
|
| 91 |
+
"outputs": [
|
| 92 |
+
{
|
| 93 |
+
"data": {
|
| 94 |
+
"text/plain": [
|
| 95 |
+
"Text(0, 0.5, 'Frequency')"
|
| 96 |
+
]
|
| 97 |
+
},
|
| 98 |
+
"execution_count": 5,
|
| 99 |
+
"metadata": {},
|
| 100 |
+
"output_type": "execute_result"
|
| 101 |
+
},
|
| 102 |
+
{
|
| 103 |
+
"data": {
|
| 104 |
+
"image/png": "",
|
| 105 |
+
"text/plain": [
|
| 106 |
+
"<Figure size 1000x600 with 1 Axes>"
|
| 107 |
+
]
|
| 108 |
+
},
|
| 109 |
+
"metadata": {},
|
| 110 |
+
"output_type": "display_data"
|
| 111 |
+
}
|
| 112 |
+
],
|
| 113 |
+
"source": [
|
| 114 |
+
"import numpy as np\n",
|
| 115 |
+
"import matplotlib.pyplot as plt\n",
|
| 116 |
+
"import seaborn as sns\n",
|
| 117 |
+
"\n",
|
| 118 |
+
"# 假设这是你的列表\n",
|
| 119 |
+
"#dna_protein_pair_rand_full_list = [0.1, 0.2, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2] # 示例数据\n",
|
| 120 |
+
"\n",
|
| 121 |
+
"# 使用Freedman-Diaconis规则自动确定bin的数量\n",
|
| 122 |
+
"bins = np.histogram_bin_edges(dna_protein_pair_full_list, bins='fd')\n",
|
| 123 |
+
"\n",
|
| 124 |
+
"# 设置图形大小\n",
|
| 125 |
+
"plt.figure(figsize=(10, 6))\n",
|
| 126 |
+
"\n",
|
| 127 |
+
"# 使用seaborn绘制直方图\n",
|
| 128 |
+
"sns.histplot(data=dna_protein_pair_full_list, bins=bins, kde=True)\n",
|
| 129 |
+
"\n",
|
| 130 |
+
"# 添加标题和轴标签\n",
|
| 131 |
+
"plt.title('Distribution of Accuracy Value (Test dna_protein_pair)')\n",
|
| 132 |
+
"plt.xlabel('Values')\n",
|
| 133 |
+
"plt.ylabel('Frequency')\n",
|
| 134 |
+
"\n",
|
| 135 |
+
"# 显示图形"
|
| 136 |
+
]
|
| 137 |
+
},
|
| 138 |
+
{
|
| 139 |
+
"cell_type": "code",
|
| 140 |
+
"execution_count": 8,
|
| 141 |
+
"id": "094948c7-3797-46ef-96cb-8a784ec937b5",
|
| 142 |
+
"metadata": {},
|
| 143 |
+
"outputs": [
|
| 144 |
+
{
|
| 145 |
+
"name": "stdout",
|
| 146 |
+
"output_type": "stream",
|
| 147 |
+
"text": [
|
| 148 |
+
"平均值: 0.542962962962963\n",
|
| 149 |
+
"方差: 0.0017891375171467762\n",
|
| 150 |
+
"标准差: 0.04229819756380614\n"
|
| 151 |
+
]
|
| 152 |
+
}
|
| 153 |
+
],
|
| 154 |
+
"source": [
|
| 155 |
+
"import numpy as np\n",
|
| 156 |
+
"\n",
|
| 157 |
+
"# 示例数据(Python list,包含 float 类型的数值)\n",
|
| 158 |
+
"data = dna_protein_pair_full_list\n",
|
| 159 |
+
"\n",
|
| 160 |
+
"# 计算平均值\n",
|
| 161 |
+
"mean_value = np.mean(data)\n",
|
| 162 |
+
"\n",
|
| 163 |
+
"# 计算方差(注意:np.var默认计算总体方差,如果需要样本方差,请设置ddof=1)\n",
|
| 164 |
+
"variance_value = np.var(data)\n",
|
| 165 |
+
"\n",
|
| 166 |
+
"# 计算标准差\n",
|
| 167 |
+
"std_value = np.std(data)\n",
|
| 168 |
+
"\n",
|
| 169 |
+
"print(\"平均值:\", mean_value)\n",
|
| 170 |
+
"print(\"方差:\", variance_value)\n",
|
| 171 |
+
"print(\"标准差:\", std_value)"
|
| 172 |
+
]
|
| 173 |
+
},
|
| 174 |
+
{
|
| 175 |
+
"cell_type": "code",
|
| 176 |
+
"execution_count": 9,
|
| 177 |
+
"id": "8ce7f481-1753-4682-960f-9cd459b6a383",
|
| 178 |
+
"metadata": {},
|
| 179 |
+
"outputs": [
|
| 180 |
+
{
|
| 181 |
+
"name": "stderr",
|
| 182 |
+
"output_type": "stream",
|
| 183 |
+
"text": [
|
| 184 |
+
"/root/miniconda3/lib/python3.12/site-packages/IPython/core/pylabtools.py:170: UserWarning: Glyph 39057 (\\N{CJK UNIFIED IDEOGRAPH-9891}) missing from font(s) DejaVu Sans.\n",
|
| 185 |
+
" fig.canvas.print_figure(bytes_io, **kw)\n",
|
| 186 |
+
"/root/miniconda3/lib/python3.12/site-packages/IPython/core/pylabtools.py:170: UserWarning: Glyph 25968 (\\N{CJK UNIFIED IDEOGRAPH-6570}) missing from font(s) DejaVu Sans.\n",
|
| 187 |
+
" fig.canvas.print_figure(bytes_io, **kw)\n",
|
| 188 |
+
"/root/miniconda3/lib/python3.12/site-packages/IPython/core/pylabtools.py:170: UserWarning: Glyph 20540 (\\N{CJK UNIFIED IDEOGRAPH-503C}) missing from font(s) DejaVu Sans.\n",
|
| 189 |
+
" fig.canvas.print_figure(bytes_io, **kw)\n",
|
| 190 |
+
"/root/miniconda3/lib/python3.12/site-packages/IPython/core/pylabtools.py:170: UserWarning: Glyph 20998 (\\N{CJK UNIFIED IDEOGRAPH-5206}) missing from font(s) DejaVu Sans.\n",
|
| 191 |
+
" fig.canvas.print_figure(bytes_io, **kw)\n",
|
| 192 |
+
"/root/miniconda3/lib/python3.12/site-packages/IPython/core/pylabtools.py:170: UserWarning: Glyph 24067 (\\N{CJK UNIFIED IDEOGRAPH-5E03}) missing from font(s) DejaVu Sans.\n",
|
| 193 |
+
" fig.canvas.print_figure(bytes_io, **kw)\n",
|
| 194 |
+
"/root/miniconda3/lib/python3.12/site-packages/IPython/core/pylabtools.py:170: UserWarning: Glyph 30452 (\\N{CJK UNIFIED IDEOGRAPH-76F4}) missing from font(s) DejaVu Sans.\n",
|
| 195 |
+
" fig.canvas.print_figure(bytes_io, **kw)\n",
|
| 196 |
+
"/root/miniconda3/lib/python3.12/site-packages/IPython/core/pylabtools.py:170: UserWarning: Glyph 26041 (\\N{CJK UNIFIED IDEOGRAPH-65B9}) missing from font(s) DejaVu Sans.\n",
|
| 197 |
+
" fig.canvas.print_figure(bytes_io, **kw)\n",
|
| 198 |
+
"/root/miniconda3/lib/python3.12/site-packages/IPython/core/pylabtools.py:170: UserWarning: Glyph 22270 (\\N{CJK UNIFIED IDEOGRAPH-56FE}) missing from font(s) DejaVu Sans.\n",
|
| 199 |
+
" fig.canvas.print_figure(bytes_io, **kw)\n",
|
| 200 |
+
"/root/miniconda3/lib/python3.12/site-packages/IPython/core/pylabtools.py:170: UserWarning: Glyph 21306 (\\N{CJK UNIFIED IDEOGRAPH-533A}) missing from font(s) DejaVu Sans.\n",
|
| 201 |
+
" fig.canvas.print_figure(bytes_io, **kw)\n",
|
| 202 |
+
"/root/miniconda3/lib/python3.12/site-packages/IPython/core/pylabtools.py:170: UserWarning: Glyph 38388 (\\N{CJK UNIFIED IDEOGRAPH-95F4}) missing from font(s) DejaVu Sans.\n",
|
| 203 |
+
" fig.canvas.print_figure(bytes_io, **kw)\n"
|
| 204 |
+
]
|
| 205 |
+
},
|
| 206 |
+
{
|
| 207 |
+
"data": {
|
| 208 |
+
"image/png": "",
|
| 209 |
+
"text/plain": [
|
| 210 |
+
"<Figure size 800x600 with 1 Axes>"
|
| 211 |
+
]
|
| 212 |
+
},
|
| 213 |
+
"metadata": {},
|
| 214 |
+
"output_type": "display_data"
|
| 215 |
+
}
|
| 216 |
+
],
|
| 217 |
+
"source": [
|
| 218 |
+
"import matplotlib.pyplot as plt\n",
|
| 219 |
+
"import numpy as np\n",
|
| 220 |
+
"\n",
|
| 221 |
+
"# 示例数据(Python list,float 类型)\n",
|
| 222 |
+
"#data = np.random.normal(loc=50, scale=15, size=200) # 生成200个服从正态分布的随机数\n",
|
| 223 |
+
"data = dna_protein_pair_full_list\n",
|
| 224 |
+
"\n",
|
| 225 |
+
"# 计算直方图并自动确定分区数\n",
|
| 226 |
+
"plt.figure(figsize=(8, 6)) # 设置图像大小\n",
|
| 227 |
+
"plt.hist(data, bins='auto', edgecolor='black', alpha=0.7)\n",
|
| 228 |
+
"\n",
|
| 229 |
+
"# 添加标题和标签\n",
|
| 230 |
+
"plt.xlabel('数值区间', fontsize=12)\n",
|
| 231 |
+
"plt.ylabel('频数', fontsize=12)\n",
|
| 232 |
+
"plt.title('数值分布直方图', fontsize=14)\n",
|
| 233 |
+
"\n",
|
| 234 |
+
"# 显示网格\n",
|
| 235 |
+
"plt.grid(axis='y', linestyle='--', alpha=0.7)\n",
|
| 236 |
+
"\n",
|
| 237 |
+
"# 显示直方图\n",
|
| 238 |
+
"plt.show()\n"
|
| 239 |
+
]
|
| 240 |
+
},
|
| 241 |
+
{
|
| 242 |
+
"cell_type": "code",
|
| 243 |
+
"execution_count": 10,
|
| 244 |
+
"id": "8d9c667f-2620-4039-85b5-fa8b24a8e14f",
|
| 245 |
+
"metadata": {},
|
| 246 |
+
"outputs": [
|
| 247 |
+
{
|
| 248 |
+
"name": "stdout",
|
| 249 |
+
"output_type": "stream",
|
| 250 |
+
"text": [
|
| 251 |
+
"大于0.7的元素个数: 1\n"
|
| 252 |
+
]
|
| 253 |
+
}
|
| 254 |
+
],
|
| 255 |
+
"source": [
|
| 256 |
+
"count = sum(1 for x in dna_protein_pair_full_list if x >= 0.7)\n",
|
| 257 |
+
"print(\"大于0.7的元素个数:\", count)"
|
| 258 |
+
]
|
| 259 |
+
},
|
| 260 |
+
{
|
| 261 |
+
"cell_type": "code",
|
| 262 |
+
"execution_count": null,
|
| 263 |
+
"id": "276f8fe0-7227-4c12-999b-8c025cb963cb",
|
| 264 |
+
"metadata": {},
|
| 265 |
+
"outputs": [],
|
| 266 |
+
"source": []
|
| 267 |
+
},
|
| 268 |
+
{
|
| 269 |
+
"cell_type": "code",
|
| 270 |
+
"execution_count": 12,
|
| 271 |
+
"id": "b550e6b5-a89f-48e7-95ca-7b6097867d0b",
|
| 272 |
+
"metadata": {},
|
| 273 |
+
"outputs": [
|
| 274 |
+
{
|
| 275 |
+
"data": {
|
| 276 |
+
"text/plain": [
|
| 277 |
+
"108"
|
| 278 |
+
]
|
| 279 |
+
},
|
| 280 |
+
"execution_count": 12,
|
| 281 |
+
"metadata": {},
|
| 282 |
+
"output_type": "execute_result"
|
| 283 |
+
}
|
| 284 |
+
],
|
| 285 |
+
"source": [
|
| 286 |
+
"len(dna_protein_pair_full_list)"
|
| 287 |
+
]
|
| 288 |
+
},
|
| 289 |
+
{
|
| 290 |
+
"cell_type": "code",
|
| 291 |
+
"execution_count": null,
|
| 292 |
+
"id": "35e0c7c6-bb4b-4333-8dae-1536f545f421",
|
| 293 |
+
"metadata": {},
|
| 294 |
+
"outputs": [],
|
| 295 |
+
"source": []
|
| 296 |
+
}
|
| 297 |
+
],
|
| 298 |
+
"metadata": {
|
| 299 |
+
"kernelspec": {
|
| 300 |
+
"display_name": "Python 3 (ipykernel)",
|
| 301 |
+
"language": "python",
|
| 302 |
+
"name": "python3"
|
| 303 |
+
},
|
| 304 |
+
"language_info": {
|
| 305 |
+
"codemirror_mode": {
|
| 306 |
+
"name": "ipython",
|
| 307 |
+
"version": 3
|
| 308 |
+
},
|
| 309 |
+
"file_extension": ".py",
|
| 310 |
+
"mimetype": "text/x-python",
|
| 311 |
+
"name": "python",
|
| 312 |
+
"nbconvert_exporter": "python",
|
| 313 |
+
"pygments_lexer": "ipython3",
|
| 314 |
+
"version": "3.12.3"
|
| 315 |
+
}
|
| 316 |
+
},
|
| 317 |
+
"nbformat": 4,
|
| 318 |
+
"nbformat_minor": 5
|
| 319 |
+
}
|
finetune/get_acc_stat_multiv1_3.ipynb
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 1,
|
| 6 |
+
"id": "8159f2eb-88ce-4c45-b1ae-584ce3a1976f",
|
| 7 |
+
"metadata": {},
|
| 8 |
+
"outputs": [],
|
| 9 |
+
"source": [
|
| 10 |
+
"import json"
|
| 11 |
+
]
|
| 12 |
+
},
|
| 13 |
+
{
|
| 14 |
+
"cell_type": "code",
|
| 15 |
+
"execution_count": 5,
|
| 16 |
+
"id": "179a6741-6649-4bea-be83-7fc9fd6c13c6",
|
| 17 |
+
"metadata": {},
|
| 18 |
+
"outputs": [],
|
| 19 |
+
"source": [
|
| 20 |
+
"filename = \"gpt2_gene_multiv1_ft_en3.jsonl\"\n",
|
| 21 |
+
"data_list = []\n",
|
| 22 |
+
"for line in open(filename):\n",
|
| 23 |
+
" data = json.loads(line)\n",
|
| 24 |
+
" data_list.append(data)\n",
|
| 25 |
+
" "
|
| 26 |
+
]
|
| 27 |
+
},
|
| 28 |
+
{
|
| 29 |
+
"cell_type": "code",
|
| 30 |
+
"execution_count": 6,
|
| 31 |
+
"id": "c8cc78e9-fbdf-4c95-847f-44ea953a38ec",
|
| 32 |
+
"metadata": {},
|
| 33 |
+
"outputs": [
|
| 34 |
+
{
|
| 35 |
+
"name": "stdout",
|
| 36 |
+
"output_type": "stream",
|
| 37 |
+
"text": [
|
| 38 |
+
"en: 0.9015\n",
|
| 39 |
+
"fr: 0.827\n",
|
| 40 |
+
"de: 0.791\n",
|
| 41 |
+
"zh: 0.731\n",
|
| 42 |
+
"dna_sim_pair_simple_150bp: 0.955\n",
|
| 43 |
+
"dna_sim_pair_150bp: 0.8715\n",
|
| 44 |
+
"dna_sim_pair_50bp: 0.813\n",
|
| 45 |
+
"protein_sim_pair_150bp: 0.9705555555555555\n",
|
| 46 |
+
"protein_sim_pair_450bp: 0.9738888888888889\n",
|
| 47 |
+
"dna_protein_pair: 0.585\n",
|
| 48 |
+
"dna_protein_pair_100: 0.5925\n",
|
| 49 |
+
"dna_protein_pair_full: 0.69\n",
|
| 50 |
+
"dna_protein_pair_rand: 0.580625\n",
|
| 51 |
+
"dna_protein_pair_rand_100: 0.5775\n",
|
| 52 |
+
"dna_protein_pair_rand_full: 0.6\n"
|
| 53 |
+
]
|
| 54 |
+
}
|
| 55 |
+
],
|
| 56 |
+
"source": [
|
| 57 |
+
"# 假设您的数据存储在一个名为data_list的列表中\n",
|
| 58 |
+
"# 初始化一个字典来保存每个键的最大accuracy值\n",
|
| 59 |
+
"max_accuracies = {}\n",
|
| 60 |
+
"\n",
|
| 61 |
+
"# 遍历列表中的每个字典\n",
|
| 62 |
+
"for data in data_list:\n",
|
| 63 |
+
" for key, metrics in data.items():\n",
|
| 64 |
+
" if key not in ['seed']: # 忽略非目标键,例如'seed'\n",
|
| 65 |
+
" if isinstance(metrics, dict) and 'accuracy' in metrics:\n",
|
| 66 |
+
" accuracy = metrics['accuracy']\n",
|
| 67 |
+
" if accuracy<0.5:\n",
|
| 68 |
+
" accuracy = 1 - accuracy\n",
|
| 69 |
+
" if key not in max_accuracies or accuracy > max_accuracies[key]:\n",
|
| 70 |
+
" max_accuracies[key] = accuracy\n",
|
| 71 |
+
"\n",
|
| 72 |
+
"# 打印每个键的最大accuracy值\n",
|
| 73 |
+
"for key, max_accuracy in max_accuracies.items():\n",
|
| 74 |
+
" print(f\"{key}: {max_accuracy}\")"
|
| 75 |
+
]
|
| 76 |
+
},
|
| 77 |
+
{
|
| 78 |
+
"cell_type": "code",
|
| 79 |
+
"execution_count": 7,
|
| 80 |
+
"id": "0d2f40f8-a817-4b6b-ae17-310478f6f8d8",
|
| 81 |
+
"metadata": {},
|
| 82 |
+
"outputs": [
|
| 83 |
+
{
|
| 84 |
+
"data": {
|
| 85 |
+
"text/plain": [
|
| 86 |
+
"72"
|
| 87 |
+
]
|
| 88 |
+
},
|
| 89 |
+
"execution_count": 7,
|
| 90 |
+
"metadata": {},
|
| 91 |
+
"output_type": "execute_result"
|
| 92 |
+
}
|
| 93 |
+
],
|
| 94 |
+
"source": [
|
| 95 |
+
"len(data_list)"
|
| 96 |
+
]
|
| 97 |
+
},
|
| 98 |
+
{
|
| 99 |
+
"cell_type": "code",
|
| 100 |
+
"execution_count": null,
|
| 101 |
+
"id": "99f4b3d1-a96c-4ac8-88c4-72f9e3302c8e",
|
| 102 |
+
"metadata": {},
|
| 103 |
+
"outputs": [],
|
| 104 |
+
"source": []
|
| 105 |
+
}
|
| 106 |
+
],
|
| 107 |
+
"metadata": {
|
| 108 |
+
"kernelspec": {
|
| 109 |
+
"display_name": "Python 3 (ipykernel)",
|
| 110 |
+
"language": "python",
|
| 111 |
+
"name": "python3"
|
| 112 |
+
},
|
| 113 |
+
"language_info": {
|
| 114 |
+
"codemirror_mode": {
|
| 115 |
+
"name": "ipython",
|
| 116 |
+
"version": 3
|
| 117 |
+
},
|
| 118 |
+
"file_extension": ".py",
|
| 119 |
+
"mimetype": "text/x-python",
|
| 120 |
+
"name": "python",
|
| 121 |
+
"nbconvert_exporter": "python",
|
| 122 |
+
"pygments_lexer": "ipython3",
|
| 123 |
+
"version": "3.12.3"
|
| 124 |
+
}
|
| 125 |
+
},
|
| 126 |
+
"nbformat": 4,
|
| 127 |
+
"nbformat_minor": 5
|
| 128 |
+
}
|
finetune/get_acc_stat_multiv2.ipynb
ADDED
|
@@ -0,0 +1,291 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 1,
|
| 6 |
+
"id": "8159f2eb-88ce-4c45-b1ae-584ce3a1976f",
|
| 7 |
+
"metadata": {},
|
| 8 |
+
"outputs": [],
|
| 9 |
+
"source": [
|
| 10 |
+
"import json"
|
| 11 |
+
]
|
| 12 |
+
},
|
| 13 |
+
{
|
| 14 |
+
"cell_type": "code",
|
| 15 |
+
"execution_count": 2,
|
| 16 |
+
"id": "179a6741-6649-4bea-be83-7fc9fd6c13c6",
|
| 17 |
+
"metadata": {},
|
| 18 |
+
"outputs": [],
|
| 19 |
+
"source": [
|
| 20 |
+
"filename = \"gpt2_gene_multiv2_ft_en.jsonl\"\n",
|
| 21 |
+
"data_list = []\n",
|
| 22 |
+
"for line in open(filename):\n",
|
| 23 |
+
" data = json.loads(line)\n",
|
| 24 |
+
" data_list.append(data)\n",
|
| 25 |
+
" "
|
| 26 |
+
]
|
| 27 |
+
},
|
| 28 |
+
{
|
| 29 |
+
"cell_type": "code",
|
| 30 |
+
"execution_count": 3,
|
| 31 |
+
"id": "c8cc78e9-fbdf-4c95-847f-44ea953a38ec",
|
| 32 |
+
"metadata": {},
|
| 33 |
+
"outputs": [
|
| 34 |
+
{
|
| 35 |
+
"name": "stdout",
|
| 36 |
+
"output_type": "stream",
|
| 37 |
+
"text": [
|
| 38 |
+
"en: 0.9045\n",
|
| 39 |
+
"fr: 0.8145\n",
|
| 40 |
+
"de: 0.7875\n",
|
| 41 |
+
"zh: 0.744\n",
|
| 42 |
+
"dna_sim_pair_simple_150bp: 0.9583333333333334\n",
|
| 43 |
+
"dna_sim_pair_150bp: 0.87725\n",
|
| 44 |
+
"dna_sim_pair_50bp: 0.8295\n",
|
| 45 |
+
"protein_sim_pair_150bp: 0.9855555555555555\n",
|
| 46 |
+
"protein_sim_pair_450bp: 0.9727777777777777\n",
|
| 47 |
+
"dna_protein_pair: 0.58\n",
|
| 48 |
+
"dna_protein_pair_100: 0.57\n",
|
| 49 |
+
"dna_protein_pair_full: 0.665\n",
|
| 50 |
+
"dna_protein_pair_rand: 0.59375\n",
|
| 51 |
+
"dna_protein_pair_rand_100: 0.55875\n",
|
| 52 |
+
"dna_protein_pair_rand_full: 0.66625\n"
|
| 53 |
+
]
|
| 54 |
+
}
|
| 55 |
+
],
|
| 56 |
+
"source": [
|
| 57 |
+
"max_accuracies = {}\n",
|
| 58 |
+
"dna_protein_pair_rand_full_list = []\n",
|
| 59 |
+
"fr_list = []\n",
|
| 60 |
+
"zh_list = []\n",
|
| 61 |
+
"\n",
|
| 62 |
+
"# 遍历列表中的每个字典\n",
|
| 63 |
+
"for data in data_list:\n",
|
| 64 |
+
" for key, metrics in data.items():\n",
|
| 65 |
+
" if key not in ['seed']: # 忽略非目标键,例如'seed'\n",
|
| 66 |
+
" if isinstance(metrics, dict) and 'accuracy' in metrics:\n",
|
| 67 |
+
" accuracy = metrics['accuracy']\n",
|
| 68 |
+
" if accuracy<0.5:\n",
|
| 69 |
+
" accuracy = 1 - accuracy\n",
|
| 70 |
+
" if \"dna_protein_pair_rand_full\"==key:\n",
|
| 71 |
+
" dna_protein_pair_rand_full_list.append(accuracy)\n",
|
| 72 |
+
"\n",
|
| 73 |
+
" if \"fr\"==key:\n",
|
| 74 |
+
" fr_list.append(accuracy)\n",
|
| 75 |
+
"\n",
|
| 76 |
+
" if \"zh\"==key:\n",
|
| 77 |
+
" zh_list.append(accuracy)\n",
|
| 78 |
+
" \n",
|
| 79 |
+
" if key not in max_accuracies or accuracy > max_accuracies[key]:\n",
|
| 80 |
+
" max_accuracies[key] = accuracy\n",
|
| 81 |
+
"\n",
|
| 82 |
+
"# 打印每个键的最大accuracy值\n",
|
| 83 |
+
"for key, max_accuracy in max_accuracies.items():\n",
|
| 84 |
+
" print(f\"{key}: {max_accuracy}\")"
|
| 85 |
+
]
|
| 86 |
+
},
|
| 87 |
+
{
|
| 88 |
+
"cell_type": "code",
|
| 89 |
+
"execution_count": 6,
|
| 90 |
+
"id": "0d2f40f8-a817-4b6b-ae17-310478f6f8d8",
|
| 91 |
+
"metadata": {},
|
| 92 |
+
"outputs": [
|
| 93 |
+
{
|
| 94 |
+
"data": {
|
| 95 |
+
"text/plain": [
|
| 96 |
+
"Text(0, 0.5, 'Frequency')"
|
| 97 |
+
]
|
| 98 |
+
},
|
| 99 |
+
"execution_count": 6,
|
| 100 |
+
"metadata": {},
|
| 101 |
+
"output_type": "execute_result"
|
| 102 |
+
},
|
| 103 |
+
{
|
| 104 |
+
"data": {
|
| 105 |
+
"image/png": "",
|
| 106 |
+
"text/plain": [
|
| 107 |
+
"<Figure size 1000x600 with 1 Axes>"
|
| 108 |
+
]
|
| 109 |
+
},
|
| 110 |
+
"metadata": {},
|
| 111 |
+
"output_type": "display_data"
|
| 112 |
+
}
|
| 113 |
+
],
|
| 114 |
+
"source": [
|
| 115 |
+
"import numpy as np\n",
|
| 116 |
+
"import matplotlib.pyplot as plt\n",
|
| 117 |
+
"import seaborn as sns\n",
|
| 118 |
+
"\n",
|
| 119 |
+
"# 假设这是你的列表\n",
|
| 120 |
+
"data = fr_list \n",
|
| 121 |
+
"\n",
|
| 122 |
+
"# 使用Freedman-Diaconis规则自动确定bin的数量\n",
|
| 123 |
+
"bins = np.histogram_bin_edges(data, bins='fd')\n",
|
| 124 |
+
"\n",
|
| 125 |
+
"# 设置图形大小\n",
|
| 126 |
+
"plt.figure(figsize=(10, 6))\n",
|
| 127 |
+
"\n",
|
| 128 |
+
"# 使用seaborn绘制直方图\n",
|
| 129 |
+
"sns.histplot(data=data, bins=bins, kde=True)\n",
|
| 130 |
+
"\n",
|
| 131 |
+
"# 添加标题和轴标签\n",
|
| 132 |
+
"plt.title('Distribution of Accuracy Value(French text similar Test)')\n",
|
| 133 |
+
"plt.xlabel('Values')\n",
|
| 134 |
+
"plt.ylabel('Frequency')\n",
|
| 135 |
+
"\n",
|
| 136 |
+
"# 显示图形"
|
| 137 |
+
]
|
| 138 |
+
},
|
| 139 |
+
{
|
| 140 |
+
"cell_type": "code",
|
| 141 |
+
"execution_count": 7,
|
| 142 |
+
"id": "b5daf611-35c4-4f16-bf95-089307a30128",
|
| 143 |
+
"metadata": {},
|
| 144 |
+
"outputs": [
|
| 145 |
+
{
|
| 146 |
+
"data": {
|
| 147 |
+
"text/plain": [
|
| 148 |
+
"Text(0, 0.5, 'Frequency')"
|
| 149 |
+
]
|
| 150 |
+
},
|
| 151 |
+
"execution_count": 7,
|
| 152 |
+
"metadata": {},
|
| 153 |
+
"output_type": "execute_result"
|
| 154 |
+
},
|
| 155 |
+
{
|
| 156 |
+
"data": {
|
| 157 |
+
"image/png": "",
|
| 158 |
+
"text/plain": [
|
| 159 |
+
"<Figure size 1000x600 with 1 Axes>"
|
| 160 |
+
]
|
| 161 |
+
},
|
| 162 |
+
"metadata": {},
|
| 163 |
+
"output_type": "display_data"
|
| 164 |
+
}
|
| 165 |
+
],
|
| 166 |
+
"source": [
|
| 167 |
+
"import numpy as np\n",
|
| 168 |
+
"import matplotlib.pyplot as plt\n",
|
| 169 |
+
"import seaborn as sns\n",
|
| 170 |
+
"\n",
|
| 171 |
+
"# 假设这是你的列表\n",
|
| 172 |
+
"data = zh_list \n",
|
| 173 |
+
"\n",
|
| 174 |
+
"# 使用Freedman-Diaconis规则自动确定bin的数量\n",
|
| 175 |
+
"bins = np.histogram_bin_edges(data, bins='fd')\n",
|
| 176 |
+
"\n",
|
| 177 |
+
"# 设置图形大小\n",
|
| 178 |
+
"plt.figure(figsize=(10, 6))\n",
|
| 179 |
+
"\n",
|
| 180 |
+
"# 使用seaborn绘制直方图\n",
|
| 181 |
+
"sns.histplot(data=data, bins=bins, kde=True)\n",
|
| 182 |
+
"\n",
|
| 183 |
+
"# 添加标题和轴标签\n",
|
| 184 |
+
"plt.title('Distribution of Accuracy Value(Chinese text similar Test)')\n",
|
| 185 |
+
"plt.xlabel('Values')\n",
|
| 186 |
+
"plt.ylabel('Frequency')\n",
|
| 187 |
+
"\n",
|
| 188 |
+
"# 显示图形"
|
| 189 |
+
]
|
| 190 |
+
},
|
| 191 |
+
{
|
| 192 |
+
"cell_type": "code",
|
| 193 |
+
"execution_count": 8,
|
| 194 |
+
"id": "2d48c7e7-a218-4c2e-b01a-b981cb3e4018",
|
| 195 |
+
"metadata": {},
|
| 196 |
+
"outputs": [
|
| 197 |
+
{
|
| 198 |
+
"name": "stdout",
|
| 199 |
+
"output_type": "stream",
|
| 200 |
+
"text": [
|
| 201 |
+
"平均值: 0.7900793650793653\n",
|
| 202 |
+
"方差: 0.00020592227261274904\n",
|
| 203 |
+
"标准差: 0.014349992077097083\n"
|
| 204 |
+
]
|
| 205 |
+
}
|
| 206 |
+
],
|
| 207 |
+
"source": [
|
| 208 |
+
"import numpy as np\n",
|
| 209 |
+
"\n",
|
| 210 |
+
"# 示例数据(Python list,包含 float 类型的数值)\n",
|
| 211 |
+
"data = fr_list\n",
|
| 212 |
+
"\n",
|
| 213 |
+
"# 计算平均值\n",
|
| 214 |
+
"mean_value = np.mean(data)\n",
|
| 215 |
+
"\n",
|
| 216 |
+
"# 计算方差(注意:np.var默认计算总体方差,如果需要样本方差,请设置ddof=1)\n",
|
| 217 |
+
"variance_value = np.var(data)\n",
|
| 218 |
+
"\n",
|
| 219 |
+
"# 计算标准差\n",
|
| 220 |
+
"std_value = np.std(data)\n",
|
| 221 |
+
"\n",
|
| 222 |
+
"print(\"平均值:\", mean_value)\n",
|
| 223 |
+
"print(\"方差:\", variance_value)\n",
|
| 224 |
+
"print(\"标准差:\", std_value)"
|
| 225 |
+
]
|
| 226 |
+
},
|
| 227 |
+
{
|
| 228 |
+
"cell_type": "code",
|
| 229 |
+
"execution_count": 9,
|
| 230 |
+
"id": "af6ec113-8c12-4724-bbd4-a7089c969700",
|
| 231 |
+
"metadata": {},
|
| 232 |
+
"outputs": [
|
| 233 |
+
{
|
| 234 |
+
"name": "stdout",
|
| 235 |
+
"output_type": "stream",
|
| 236 |
+
"text": [
|
| 237 |
+
"平均值: 0.7147619047619047\n",
|
| 238 |
+
"方差: 0.0002495226757369616\n",
|
| 239 |
+
"标准差: 0.01579628677053445\n"
|
| 240 |
+
]
|
| 241 |
+
}
|
| 242 |
+
],
|
| 243 |
+
"source": [
|
| 244 |
+
"# 示例数据(Python list,包含 float 类型的数值)\n",
|
| 245 |
+
"data = zh_list\n",
|
| 246 |
+
"\n",
|
| 247 |
+
"# 计算平均值\n",
|
| 248 |
+
"mean_value = np.mean(data)\n",
|
| 249 |
+
"\n",
|
| 250 |
+
"# 计算方差(注意:np.var默认计算总体方差,如果需要样本方差,请设置ddof=1)\n",
|
| 251 |
+
"variance_value = np.var(data)\n",
|
| 252 |
+
"\n",
|
| 253 |
+
"# 计算标准差\n",
|
| 254 |
+
"std_value = np.std(data)\n",
|
| 255 |
+
"\n",
|
| 256 |
+
"print(\"平均值:\", mean_value)\n",
|
| 257 |
+
"print(\"方差:\", variance_value)\n",
|
| 258 |
+
"print(\"标准差:\", std_value)"
|
| 259 |
+
]
|
| 260 |
+
},
|
| 261 |
+
{
|
| 262 |
+
"cell_type": "code",
|
| 263 |
+
"execution_count": null,
|
| 264 |
+
"id": "ae5db37b-2301-480a-ab33-a4a798089596",
|
| 265 |
+
"metadata": {},
|
| 266 |
+
"outputs": [],
|
| 267 |
+
"source": []
|
| 268 |
+
}
|
| 269 |
+
],
|
| 270 |
+
"metadata": {
|
| 271 |
+
"kernelspec": {
|
| 272 |
+
"display_name": "Python 3 (ipykernel)",
|
| 273 |
+
"language": "python",
|
| 274 |
+
"name": "python3"
|
| 275 |
+
},
|
| 276 |
+
"language_info": {
|
| 277 |
+
"codemirror_mode": {
|
| 278 |
+
"name": "ipython",
|
| 279 |
+
"version": 3
|
| 280 |
+
},
|
| 281 |
+
"file_extension": ".py",
|
| 282 |
+
"mimetype": "text/x-python",
|
| 283 |
+
"name": "python",
|
| 284 |
+
"nbconvert_exporter": "python",
|
| 285 |
+
"pygments_lexer": "ipython3",
|
| 286 |
+
"version": "3.12.3"
|
| 287 |
+
}
|
| 288 |
+
},
|
| 289 |
+
"nbformat": 4,
|
| 290 |
+
"nbformat_minor": 5
|
| 291 |
+
}
|
finetune/get_acc_stat_multiv2_2.ipynb
ADDED
|
@@ -0,0 +1,212 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 1,
|
| 6 |
+
"id": "8159f2eb-88ce-4c45-b1ae-584ce3a1976f",
|
| 7 |
+
"metadata": {},
|
| 8 |
+
"outputs": [],
|
| 9 |
+
"source": [
|
| 10 |
+
"import json"
|
| 11 |
+
]
|
| 12 |
+
},
|
| 13 |
+
{
|
| 14 |
+
"cell_type": "code",
|
| 15 |
+
"execution_count": 2,
|
| 16 |
+
"id": "179a6741-6649-4bea-be83-7fc9fd6c13c6",
|
| 17 |
+
"metadata": {},
|
| 18 |
+
"outputs": [],
|
| 19 |
+
"source": [
|
| 20 |
+
"filename = \"gpt2_gene_multiv2_ft_en2.jsonl\"\n",
|
| 21 |
+
"data_list = []\n",
|
| 22 |
+
"for line in open(filename):\n",
|
| 23 |
+
" data = json.loads(line)\n",
|
| 24 |
+
" data_list.append(data)\n",
|
| 25 |
+
" "
|
| 26 |
+
]
|
| 27 |
+
},
|
| 28 |
+
{
|
| 29 |
+
"cell_type": "code",
|
| 30 |
+
"execution_count": 3,
|
| 31 |
+
"id": "c8cc78e9-fbdf-4c95-847f-44ea953a38ec",
|
| 32 |
+
"metadata": {},
|
| 33 |
+
"outputs": [
|
| 34 |
+
{
|
| 35 |
+
"name": "stdout",
|
| 36 |
+
"output_type": "stream",
|
| 37 |
+
"text": [
|
| 38 |
+
"dna_protein_pair_full: 0.6675\n",
|
| 39 |
+
"dna_protein_pair_rand_full: 0.76025\n"
|
| 40 |
+
]
|
| 41 |
+
}
|
| 42 |
+
],
|
| 43 |
+
"source": [
|
| 44 |
+
"# 假设您的数据存储在一个名为data_list的列表中\n",
|
| 45 |
+
"# 初始化一个字典来保存每个键的最大accuracy值\n",
|
| 46 |
+
"max_accuracies = {}\n",
|
| 47 |
+
"\n",
|
| 48 |
+
"dna_protein_pair_rand_full_list = []\n",
|
| 49 |
+
"\n",
|
| 50 |
+
"# 遍历列表中的每个字典\n",
|
| 51 |
+
"for data in data_list:\n",
|
| 52 |
+
" for key, metrics in data.items():\n",
|
| 53 |
+
" if key not in ['seed']: # 忽略非目标键,例如'seed'\n",
|
| 54 |
+
" if isinstance(metrics, dict) and 'accuracy' in metrics:\n",
|
| 55 |
+
" accuracy = metrics['accuracy']\n",
|
| 56 |
+
" if accuracy<0.5:\n",
|
| 57 |
+
" accuracy = 1 - accuracy\n",
|
| 58 |
+
" if \"dna_protein_pair_rand_full\"==key:\n",
|
| 59 |
+
" dna_protein_pair_rand_full_list.append(accuracy)\n",
|
| 60 |
+
" if key not in max_accuracies or accuracy > max_accuracies[key]:\n",
|
| 61 |
+
" max_accuracies[key] = accuracy\n",
|
| 62 |
+
"\n",
|
| 63 |
+
"# 打印每个键的最大accuracy值\n",
|
| 64 |
+
"for key, max_accuracy in max_accuracies.items():\n",
|
| 65 |
+
" print(f\"{key}: {max_accuracy}\")"
|
| 66 |
+
]
|
| 67 |
+
},
|
| 68 |
+
{
|
| 69 |
+
"cell_type": "code",
|
| 70 |
+
"execution_count": 4,
|
| 71 |
+
"id": "0d2f40f8-a817-4b6b-ae17-310478f6f8d8",
|
| 72 |
+
"metadata": {},
|
| 73 |
+
"outputs": [],
|
| 74 |
+
"source": [
|
| 75 |
+
"#dna_protein_pair_rand_full_list"
|
| 76 |
+
]
|
| 77 |
+
},
|
| 78 |
+
{
|
| 79 |
+
"cell_type": "code",
|
| 80 |
+
"execution_count": 11,
|
| 81 |
+
"id": "13ad1fd4-5c99-4c06-b92c-6fe417d28c32",
|
| 82 |
+
"metadata": {},
|
| 83 |
+
"outputs": [
|
| 84 |
+
{
|
| 85 |
+
"data": {
|
| 86 |
+
"image/png": "",
|
| 87 |
+
"text/plain": [
|
| 88 |
+
"<Figure size 1000x600 with 1 Axes>"
|
| 89 |
+
]
|
| 90 |
+
},
|
| 91 |
+
"metadata": {},
|
| 92 |
+
"output_type": "display_data"
|
| 93 |
+
}
|
| 94 |
+
],
|
| 95 |
+
"source": [
|
| 96 |
+
"import numpy as np\n",
|
| 97 |
+
"import matplotlib.pyplot as plt\n",
|
| 98 |
+
"import seaborn as sns\n",
|
| 99 |
+
"\n",
|
| 100 |
+
"# 假设这是你的列表\n",
|
| 101 |
+
"#dna_protein_pair_rand_full_list = [0.1, 0.2, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2] # 示例数据\n",
|
| 102 |
+
"\n",
|
| 103 |
+
"# 使用Freedman-Diaconis规则自动确定bin的数量\n",
|
| 104 |
+
"bins = np.histogram_bin_edges(dna_protein_pair_rand_full_list, bins='fd')\n",
|
| 105 |
+
"\n",
|
| 106 |
+
"# 设置图形大小\n",
|
| 107 |
+
"plt.figure(figsize=(10, 6))\n",
|
| 108 |
+
"\n",
|
| 109 |
+
"# 使用seaborn绘制直方图\n",
|
| 110 |
+
"sns.histplot(data=dna_protein_pair_rand_full_list, bins=bins, kde=True)\n",
|
| 111 |
+
"\n",
|
| 112 |
+
"# 添加标题和轴标签\n",
|
| 113 |
+
"plt.title('Distribution of Accuracy Value (Test dna_protein_pair_rand)')\n",
|
| 114 |
+
"plt.xlabel('Values')\n",
|
| 115 |
+
"plt.ylabel('Frequency')\n",
|
| 116 |
+
"\n",
|
| 117 |
+
"# 显示图形\n",
|
| 118 |
+
"plt.show()"
|
| 119 |
+
]
|
| 120 |
+
},
|
| 121 |
+
{
|
| 122 |
+
"cell_type": "code",
|
| 123 |
+
"execution_count": 6,
|
| 124 |
+
"id": "d19b9579-30a7-4442-a8a4-786631f52e88",
|
| 125 |
+
"metadata": {},
|
| 126 |
+
"outputs": [
|
| 127 |
+
{
|
| 128 |
+
"name": "stdout",
|
| 129 |
+
"output_type": "stream",
|
| 130 |
+
"text": [
|
| 131 |
+
"大于0.7的元素个数: 13\n"
|
| 132 |
+
]
|
| 133 |
+
}
|
| 134 |
+
],
|
| 135 |
+
"source": [
|
| 136 |
+
"count = sum(1 for x in dna_protein_pair_rand_full_list if x > 0.7)\n",
|
| 137 |
+
"print(\"大于0.7的元素个数:\", count)"
|
| 138 |
+
]
|
| 139 |
+
},
|
| 140 |
+
{
|
| 141 |
+
"cell_type": "code",
|
| 142 |
+
"execution_count": 7,
|
| 143 |
+
"id": "7f1a63ad-9f94-42bb-8380-5163dd8efff0",
|
| 144 |
+
"metadata": {},
|
| 145 |
+
"outputs": [
|
| 146 |
+
{
|
| 147 |
+
"data": {
|
| 148 |
+
"text/plain": [
|
| 149 |
+
"124"
|
| 150 |
+
]
|
| 151 |
+
},
|
| 152 |
+
"execution_count": 7,
|
| 153 |
+
"metadata": {},
|
| 154 |
+
"output_type": "execute_result"
|
| 155 |
+
}
|
| 156 |
+
],
|
| 157 |
+
"source": [
|
| 158 |
+
"len(dna_protein_pair_rand_full_list)"
|
| 159 |
+
]
|
| 160 |
+
},
|
| 161 |
+
{
|
| 162 |
+
"cell_type": "code",
|
| 163 |
+
"execution_count": 9,
|
| 164 |
+
"id": "bc31dd0e-e56e-4fda-8071-f11139922173",
|
| 165 |
+
"metadata": {},
|
| 166 |
+
"outputs": [
|
| 167 |
+
{
|
| 168 |
+
"data": {
|
| 169 |
+
"text/plain": [
|
| 170 |
+
"0.08870967741935484"
|
| 171 |
+
]
|
| 172 |
+
},
|
| 173 |
+
"execution_count": 9,
|
| 174 |
+
"metadata": {},
|
| 175 |
+
"output_type": "execute_result"
|
| 176 |
+
}
|
| 177 |
+
],
|
| 178 |
+
"source": [
|
| 179 |
+
"11/124"
|
| 180 |
+
]
|
| 181 |
+
},
|
| 182 |
+
{
|
| 183 |
+
"cell_type": "code",
|
| 184 |
+
"execution_count": null,
|
| 185 |
+
"id": "b71d0c97-c962-4bb7-8ada-a970ffb9cbbc",
|
| 186 |
+
"metadata": {},
|
| 187 |
+
"outputs": [],
|
| 188 |
+
"source": []
|
| 189 |
+
}
|
| 190 |
+
],
|
| 191 |
+
"metadata": {
|
| 192 |
+
"kernelspec": {
|
| 193 |
+
"display_name": "Python 3 (ipykernel)",
|
| 194 |
+
"language": "python",
|
| 195 |
+
"name": "python3"
|
| 196 |
+
},
|
| 197 |
+
"language_info": {
|
| 198 |
+
"codemirror_mode": {
|
| 199 |
+
"name": "ipython",
|
| 200 |
+
"version": 3
|
| 201 |
+
},
|
| 202 |
+
"file_extension": ".py",
|
| 203 |
+
"mimetype": "text/x-python",
|
| 204 |
+
"name": "python",
|
| 205 |
+
"nbconvert_exporter": "python",
|
| 206 |
+
"pygments_lexer": "ipython3",
|
| 207 |
+
"version": "3.12.3"
|
| 208 |
+
}
|
| 209 |
+
},
|
| 210 |
+
"nbformat": 4,
|
| 211 |
+
"nbformat_minor": 5
|
| 212 |
+
}
|
finetune/get_acc_stat_multiv2_3.ipynb
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 1,
|
| 6 |
+
"id": "8159f2eb-88ce-4c45-b1ae-584ce3a1976f",
|
| 7 |
+
"metadata": {},
|
| 8 |
+
"outputs": [],
|
| 9 |
+
"source": [
|
| 10 |
+
"import json"
|
| 11 |
+
]
|
| 12 |
+
},
|
| 13 |
+
{
|
| 14 |
+
"cell_type": "code",
|
| 15 |
+
"execution_count": 2,
|
| 16 |
+
"id": "179a6741-6649-4bea-be83-7fc9fd6c13c6",
|
| 17 |
+
"metadata": {},
|
| 18 |
+
"outputs": [],
|
| 19 |
+
"source": [
|
| 20 |
+
"filename = \"gpt2_gene_multiv2_ft_en3.jsonl\"\n",
|
| 21 |
+
"data_list = []\n",
|
| 22 |
+
"for line in open(filename):\n",
|
| 23 |
+
" data = json.loads(line)\n",
|
| 24 |
+
" data_list.append(data)\n",
|
| 25 |
+
" "
|
| 26 |
+
]
|
| 27 |
+
},
|
| 28 |
+
{
|
| 29 |
+
"cell_type": "code",
|
| 30 |
+
"execution_count": 4,
|
| 31 |
+
"id": "c8cc78e9-fbdf-4c95-847f-44ea953a38ec",
|
| 32 |
+
"metadata": {},
|
| 33 |
+
"outputs": [
|
| 34 |
+
{
|
| 35 |
+
"name": "stdout",
|
| 36 |
+
"output_type": "stream",
|
| 37 |
+
"text": [
|
| 38 |
+
"en: 0.9065\n",
|
| 39 |
+
"fr: 0.8245\n",
|
| 40 |
+
"de: 0.798\n",
|
| 41 |
+
"zh: 0.7415\n",
|
| 42 |
+
"dna_sim_pair_simple_150bp: 0.9577777777777777\n",
|
| 43 |
+
"dna_sim_pair_150bp: 0.84825\n",
|
| 44 |
+
"dna_sim_pair_50bp: 0.8615\n",
|
| 45 |
+
"protein_sim_pair_150bp: 0.9711111111111111\n",
|
| 46 |
+
"protein_sim_pair_450bp: 0.9705555555555555\n",
|
| 47 |
+
"dna_protein_pair: 0.585\n",
|
| 48 |
+
"dna_protein_pair_100: 0.57\n",
|
| 49 |
+
"dna_protein_pair_full: 0.7025\n",
|
| 50 |
+
"dna_protein_pair_rand: 0.570625\n",
|
| 51 |
+
"dna_protein_pair_rand_100: 0.58625\n",
|
| 52 |
+
"dna_protein_pair_rand_full: 0.695\n"
|
| 53 |
+
]
|
| 54 |
+
}
|
| 55 |
+
],
|
| 56 |
+
"source": [
|
| 57 |
+
"# 假设您的数据存储在一个名为data_list的列表中\n",
|
| 58 |
+
"# 初始化一个字典来保存每个键的最大accuracy值\n",
|
| 59 |
+
"max_accuracies = {}\n",
|
| 60 |
+
"\n",
|
| 61 |
+
"# 遍历列表中的每个字典\n",
|
| 62 |
+
"for data in data_list:\n",
|
| 63 |
+
" for key, metrics in data.items():\n",
|
| 64 |
+
" if key not in ['seed']: # 忽略非目标键,例如'seed'\n",
|
| 65 |
+
" if isinstance(metrics, dict) and 'accuracy' in metrics:\n",
|
| 66 |
+
" accuracy = metrics['accuracy']\n",
|
| 67 |
+
"\n",
|
| 68 |
+
" if accuracy<0.5:\n",
|
| 69 |
+
" accuracy = 1-accuracy\n",
|
| 70 |
+
" if key not in max_accuracies or accuracy > max_accuracies[key]:\n",
|
| 71 |
+
" max_accuracies[key] = accuracy\n",
|
| 72 |
+
"\n",
|
| 73 |
+
"# 打印每个键的最大accuracy值\n",
|
| 74 |
+
"for key, max_accuracy in max_accuracies.items():\n",
|
| 75 |
+
" print(f\"{key}: {max_accuracy}\")"
|
| 76 |
+
]
|
| 77 |
+
},
|
| 78 |
+
{
|
| 79 |
+
"cell_type": "code",
|
| 80 |
+
"execution_count": null,
|
| 81 |
+
"id": "0d2f40f8-a817-4b6b-ae17-310478f6f8d8",
|
| 82 |
+
"metadata": {},
|
| 83 |
+
"outputs": [],
|
| 84 |
+
"source": []
|
| 85 |
+
}
|
| 86 |
+
],
|
| 87 |
+
"metadata": {
|
| 88 |
+
"kernelspec": {
|
| 89 |
+
"display_name": "Python 3 (ipykernel)",
|
| 90 |
+
"language": "python",
|
| 91 |
+
"name": "python3"
|
| 92 |
+
},
|
| 93 |
+
"language_info": {
|
| 94 |
+
"codemirror_mode": {
|
| 95 |
+
"name": "ipython",
|
| 96 |
+
"version": 3
|
| 97 |
+
},
|
| 98 |
+
"file_extension": ".py",
|
| 99 |
+
"mimetype": "text/x-python",
|
| 100 |
+
"name": "python",
|
| 101 |
+
"nbconvert_exporter": "python",
|
| 102 |
+
"pygments_lexer": "ipython3",
|
| 103 |
+
"version": "3.12.3"
|
| 104 |
+
}
|
| 105 |
+
},
|
| 106 |
+
"nbformat": 4,
|
| 107 |
+
"nbformat_minor": 5
|
| 108 |
+
}
|
finetune/gpt2_gene_multiv1_ft_en.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
finetune/gpt2_gene_multiv1_ft_en2.jsonl
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"seed": 5445, "dna_protein_pair_full": {"accuracy": 0.475, "f1": 0.16}, "dna_protein_pair_rand_full": {"accuracy": 0.565375, "f1": 0.3523933693425219}}
|
| 2 |
+
{"seed": 990, "dna_protein_pair_full": {"accuracy": 0.6575, "f1": 0.573208722741433}, "dna_protein_pair_rand_full": {"accuracy": 0.560375, "f1": 0.5041590300296066}}
|
| 3 |
+
{"seed": 1531, "dna_protein_pair_full": {"accuracy": 0.5025, "f1": 0.3559870550161812}, "dna_protein_pair_rand_full": {"accuracy": 0.614125, "f1": 0.548618219037871}}
|
| 4 |
+
{"seed": 3500, "dna_protein_pair_full": {"accuracy": 0.4675, "f1": 0.05333333333333334}, "dna_protein_pair_rand_full": {"accuracy": 0.531375, "f1": 0.15046453659641967}}
|
| 5 |
+
{"seed": 9923, "dna_protein_pair_full": {"accuracy": 0.495, "f1": 0.0}, "dna_protein_pair_rand_full": {"accuracy": 0.4965, "f1": 0.007392804337111878}}
|
| 6 |
+
{"seed": 3546, "dna_protein_pair_full": {"accuracy": 0.455, "f1": 0.0603448275862069}, "dna_protein_pair_rand_full": {"accuracy": 0.494, "f1": 0.0878774222622803}}
|
| 7 |
+
{"seed": 6026, "dna_protein_pair_full": {"accuracy": 0.4075, "f1": 0.2379421221864952}, "dna_protein_pair_rand_full": {"accuracy": 0.5185, "f1": 0.45531674208144796}}
|
| 8 |
+
{"seed": 4172, "dna_protein_pair_full": {"accuracy": 0.52, "f1": 0.02040816326530612}, "dna_protein_pair_rand_full": {"accuracy": 0.503625, "f1": 0.04336304504938569}}
|
| 9 |
+
{"seed": 6437, "dna_protein_pair_full": {"accuracy": 0.39, "f1": 0.32967032967032966}, "dna_protein_pair_rand_full": {"accuracy": 0.490875, "f1": 0.4291520672740014}}
|
| 10 |
+
{"seed": 779, "dna_protein_pair_full": {"accuracy": 0.5125, "f1": 0.0}, "dna_protein_pair_rand_full": {"accuracy": 0.50525, "f1": 0.0124750499001996}}
|
| 11 |
+
{"seed": 7005, "dna_protein_pair_full": {"accuracy": 0.6025, "f1": 0.3510204081632653}, "dna_protein_pair_rand_full": {"accuracy": 0.57575, "f1": 0.4011997177134792}}
|
| 12 |
+
{"seed": 2353, "dna_protein_pair_full": {"accuracy": 0.5425, "f1": 0.05181347150259067}, "dna_protein_pair_rand_full": {"accuracy": 0.53, "f1": 0.13958810068649885}}
|
| 13 |
+
{"seed": 4224, "dna_protein_pair_full": {"accuracy": 0.5025, "f1": 0.03864734299516908}, "dna_protein_pair_rand_full": {"accuracy": 0.534625, "f1": 0.15367128892930212}}
|
| 14 |
+
{"seed": 6703, "dna_protein_pair_full": {"accuracy": 0.4875, "f1": 0.0}, "dna_protein_pair_rand_full": {"accuracy": 0.514625, "f1": 0.06501324343847821}}
|
| 15 |
+
{"seed": 4990, "dna_protein_pair_full": {"accuracy": 0.4875, "f1": 0.1276595744680851}, "dna_protein_pair_rand_full": {"accuracy": 0.5055, "f1": 0.14074717636837533}}
|
| 16 |
+
{"seed": 6775, "dna_protein_pair_full": {"accuracy": 0.4825, "f1": 0.2188679245283019}, "dna_protein_pair_rand_full": {"accuracy": 0.59225, "f1": 0.4393262289446545}}
|
| 17 |
+
{"seed": 7854, "dna_protein_pair_full": {"accuracy": 0.515, "f1": 0.039603960396039604}, "dna_protein_pair_rand_full": {"accuracy": 0.510375, "f1": 0.09725743258815395}}
|
| 18 |
+
{"seed": 1088, "dna_protein_pair_full": {"accuracy": 0.515, "f1": 0.5380952380952381}, "dna_protein_pair_rand_full": {"accuracy": 0.489875, "f1": 0.40082219938335045}}
|
| 19 |
+
{"seed": 5911, "dna_protein_pair_full": {"accuracy": 0.5075, "f1": 0.06635071090047394}, "dna_protein_pair_rand_full": {"accuracy": 0.537375, "f1": 0.17992466208730334}}
|
| 20 |
+
{"seed": 2543, "dna_protein_pair_full": {"accuracy": 0.585, "f1": 0.5931372549019608}, "dna_protein_pair_rand_full": {"accuracy": 0.609125, "f1": 0.6461468824261627}}
|
| 21 |
+
{"seed": 8490, "dna_protein_pair_full": {"accuracy": 0.505, "f1": 0.08333333333333333}, "dna_protein_pair_rand_full": {"accuracy": 0.540625, "f1": 0.22745427790624342}}
|
| 22 |
+
{"seed": 8304, "dna_protein_pair_full": {"accuracy": 0.5025, "f1": 0.09954751131221719}, "dna_protein_pair_rand_full": {"accuracy": 0.536625, "f1": 0.20535905680600214}}
|
| 23 |
+
{"seed": 8670, "dna_protein_pair_full": {"accuracy": 0.535, "f1": 0.37583892617449666}, "dna_protein_pair_rand_full": {"accuracy": 0.568125, "f1": 0.5093026558727454}}
|
| 24 |
+
{"seed": 6225, "dna_protein_pair_full": {"accuracy": 0.52, "f1": 0.04}, "dna_protein_pair_rand_full": {"accuracy": 0.514625, "f1": 0.08527679623085983}}
|
| 25 |
+
{"seed": 5908, "dna_protein_pair_full": {"accuracy": 0.4975, "f1": 0.03827751196172249}, "dna_protein_pair_rand_full": {"accuracy": 0.526625, "f1": 0.14841466156959748}}
|
| 26 |
+
{"seed": 7500, "dna_protein_pair_full": {"accuracy": 0.5625, "f1": 0.5862884160756501}, "dna_protein_pair_rand_full": {"accuracy": 0.585, "f1": 0.6457533077251387}}
|
| 27 |
+
{"seed": 8310, "dna_protein_pair_full": {"accuracy": 0.45, "f1": 0.043478260869565216}, "dna_protein_pair_rand_full": {"accuracy": 0.51475, "f1": 0.08356940509915015}}
|
| 28 |
+
{"seed": 2652, "dna_protein_pair_full": {"accuracy": 0.5125, "f1": 0.01015228426395939}, "dna_protein_pair_rand_full": {"accuracy": 0.5015, "f1": 0.009438648782911079}}
|
| 29 |
+
{"seed": 446, "dna_protein_pair_full": {"accuracy": 0.5175, "f1": 0.010256410256410256}, "dna_protein_pair_rand_full": {"accuracy": 0.51875, "f1": 0.09539473684210527}}
|
| 30 |
+
{"seed": 6273, "dna_protein_pair_full": {"accuracy": 0.5325, "f1": 0.5961123110151187}, "dna_protein_pair_rand_full": {"accuracy": 0.661375, "f1": 0.7237687366167024}}
|
| 31 |
+
{"seed": 4050, "dna_protein_pair_full": {"accuracy": 0.3725, "f1": 0.49292929292929294}, "dna_protein_pair_rand_full": {"accuracy": 0.46975, "f1": 0.5407102641836293}}
|
| 32 |
+
{"seed": 5642, "dna_protein_pair_full": {"accuracy": 0.48, "f1": 0.14049586776859505}, "dna_protein_pair_rand_full": {"accuracy": 0.547, "f1": 0.32134831460674157}}
|
| 33 |
+
{"seed": 5062, "dna_protein_pair_full": {"accuracy": 0.365, "f1": 0.4009433962264151}, "dna_protein_pair_rand_full": {"accuracy": 0.458, "f1": 0.44123711340206184}}
|
| 34 |
+
{"seed": 7347, "dna_protein_pair_full": {"accuracy": 0.4625, "f1": 0.5691382765531062}, "dna_protein_pair_rand_full": {"accuracy": 0.588375, "f1": 0.681127142442142}}
|
| 35 |
+
{"seed": 3111, "dna_protein_pair_full": {"accuracy": 0.375, "f1": 0.3489583333333333}, "dna_protein_pair_rand_full": {"accuracy": 0.492375, "f1": 0.37320574162679426}}
|
| 36 |
+
{"seed": 3101, "dna_protein_pair_full": {"accuracy": 0.64, "f1": 0.6230366492146597}, "dna_protein_pair_rand_full": {"accuracy": 0.597125, "f1": 0.6494834148994019}}
|
| 37 |
+
{"seed": 5382, "dna_protein_pair_full": {"accuracy": 0.485, "f1": 0.037383177570093455}, "dna_protein_pair_rand_full": {"accuracy": 0.522875, "f1": 0.12273040680303378}}
|
| 38 |
+
{"seed": 3385, "dna_protein_pair_full": {"accuracy": 0.54, "f1": 0.041666666666666664}, "dna_protein_pair_rand_full": {"accuracy": 0.500125, "f1": 0.043072505384063174}}
|
| 39 |
+
{"seed": 1798, "dna_protein_pair_full": {"accuracy": 0.515, "f1": 0.039603960396039604}, "dna_protein_pair_rand_full": {"accuracy": 0.510125, "f1": 0.06445452375268561}}
|
| 40 |
+
{"seed": 2096, "dna_protein_pair_full": {"accuracy": 0.485, "f1": 0.009615384615384616}, "dna_protein_pair_rand_full": {"accuracy": 0.502, "f1": 0.033478893740902474}}
|
| 41 |
+
{"seed": 7605, "dna_protein_pair_full": {"accuracy": 0.495, "f1": 0.6505190311418685}, "dna_protein_pair_rand_full": {"accuracy": 0.537625, "f1": 0.6737807566804833}}
|
| 42 |
+
{"seed": 6915, "dna_protein_pair_full": {"accuracy": 0.5675, "f1": 0.32684824902723736}, "dna_protein_pair_rand_full": {"accuracy": 0.559625, "f1": 0.37833068643021}}
|
| 43 |
+
{"seed": 3022, "dna_protein_pair_full": {"accuracy": 0.4525, "f1": 0.043668122270742356}, "dna_protein_pair_rand_full": {"accuracy": 0.529, "f1": 0.1313969571230982}}
|
| 44 |
+
{"seed": 1000, "dna_protein_pair_full": {"accuracy": 0.505, "f1": 0.029411764705882353}, "dna_protein_pair_rand_full": {"accuracy": 0.511375, "f1": 0.04402054292002935}}
|
| 45 |
+
{"seed": 3167, "dna_protein_pair_full": {"accuracy": 0.495, "f1": 0.2462686567164179}, "dna_protein_pair_rand_full": {"accuracy": 0.521375, "f1": 0.3548441449031171}}
|
| 46 |
+
{"seed": 2501, "dna_protein_pair_full": {"accuracy": 0.495, "f1": 0.1440677966101695}, "dna_protein_pair_rand_full": {"accuracy": 0.515, "f1": 0.2673716012084592}}
|
| 47 |
+
{"seed": 8523, "dna_protein_pair_full": {"accuracy": 0.475, "f1": 0.03669724770642202}, "dna_protein_pair_rand_full": {"accuracy": 0.511375, "f1": 0.049598832968636035}}
|
| 48 |
+
{"seed": 4205, "dna_protein_pair_full": {"accuracy": 0.47, "f1": 0.21481481481481482}, "dna_protein_pair_rand_full": {"accuracy": 0.50875, "f1": 0.2738359201773836}}
|
| 49 |
+
{"seed": 6034, "dna_protein_pair_full": {"accuracy": 0.5175, "f1": 0.0}, "dna_protein_pair_rand_full": {"accuracy": 0.51, "f1": 0.02729528535980149}}
|
| 50 |
+
{"seed": 8902, "dna_protein_pair_full": {"accuracy": 0.52, "f1": 0.010309278350515464}, "dna_protein_pair_rand_full": {"accuracy": 0.510875, "f1": 0.0378657487091222}}
|
| 51 |
+
{"seed": 3000, "dna_protein_pair_full": {"accuracy": 0.4825, "f1": 0.3591331269349845}, "dna_protein_pair_rand_full": {"accuracy": 0.551125, "f1": 0.5069339557874503}}
|
| 52 |
+
{"seed": 7135, "dna_protein_pair_full": {"accuracy": 0.4775, "f1": 0.027906976744186046}, "dna_protein_pair_rand_full": {"accuracy": 0.518, "f1": 0.12917795844625113}}
|
| 53 |
+
{"seed": 5425, "dna_protein_pair_full": {"accuracy": 0.5175, "f1": 0.5038560411311054}, "dna_protein_pair_rand_full": {"accuracy": 0.531125, "f1": 0.5447263017356475}}
|
| 54 |
+
{"seed": 680, "dna_protein_pair_full": {"accuracy": 0.505, "f1": 0.01}, "dna_protein_pair_rand_full": {"accuracy": 0.513375, "f1": 0.060796139927623644}}
|
| 55 |
+
{"seed": 5307, "dna_protein_pair_full": {"accuracy": 0.52, "f1": 0.09433962264150944}, "dna_protein_pair_rand_full": {"accuracy": 0.508, "f1": 0.07778819119025304}}
|
| 56 |
+
{"seed": 5904, "dna_protein_pair_full": {"accuracy": 0.4575, "f1": 0.2977346278317152}, "dna_protein_pair_rand_full": {"accuracy": 0.598875, "f1": 0.4916838270236021}}
|
| 57 |
+
{"seed": 4481, "dna_protein_pair_full": {"accuracy": 0.4775, "f1": 0.1606425702811245}, "dna_protein_pair_rand_full": {"accuracy": 0.58175, "f1": 0.47157296272899557}}
|
| 58 |
+
{"seed": 6550, "dna_protein_pair_full": {"accuracy": 0.49, "f1": 0.009708737864077669}, "dna_protein_pair_rand_full": {"accuracy": 0.50075, "f1": 0.048141086749285036}}
|
| 59 |
+
{"seed": 8337, "dna_protein_pair_full": {"accuracy": 0.505, "f1": 0.08333333333333333}, "dna_protein_pair_rand_full": {"accuracy": 0.5075, "f1": 0.11261261261261261}}
|
| 60 |
+
{"seed": 8229, "dna_protein_pair_full": {"accuracy": 0.5075, "f1": 0.0}, "dna_protein_pair_rand_full": {"accuracy": 0.500125, "f1": 0.002494387627837366}}
|
| 61 |
+
{"seed": 8437, "dna_protein_pair_full": {"accuracy": 0.47, "f1": 0.061946902654867256}, "dna_protein_pair_rand_full": {"accuracy": 0.4805, "f1": 0.09336823734729494}}
|
| 62 |
+
{"seed": 1130, "dna_protein_pair_full": {"accuracy": 0.615, "f1": 0.4338235294117647}, "dna_protein_pair_rand_full": {"accuracy": 0.708375, "f1": 0.7002441218039317}}
|
| 63 |
+
{"seed": 4028, "dna_protein_pair_full": {"accuracy": 0.4825, "f1": 0.2527075812274368}, "dna_protein_pair_rand_full": {"accuracy": 0.581625, "f1": 0.40306759407883}}
|
| 64 |
+
{"seed": 516, "dna_protein_pair_full": {"accuracy": 0.485, "f1": 0.6448275862068965}, "dna_protein_pair_rand_full": {"accuracy": 0.529375, "f1": 0.6790007673288431}}
|
| 65 |
+
{"seed": 3075, "dna_protein_pair_full": {"accuracy": 0.3525, "f1": 0.3573200992555831}, "dna_protein_pair_rand_full": {"accuracy": 0.459125, "f1": 0.2960793883195055}}
|
| 66 |
+
{"seed": 9428, "dna_protein_pair_full": {"accuracy": 0.5, "f1": 0.009900990099009901}, "dna_protein_pair_rand_full": {"accuracy": 0.49725, "f1": 0.01614481409001957}}
|
| 67 |
+
{"seed": 4633, "dna_protein_pair_full": {"accuracy": 0.5575, "f1": 0.3166023166023166}, "dna_protein_pair_rand_full": {"accuracy": 0.661125, "f1": 0.5886815354270976}}
|
| 68 |
+
{"seed": 7170, "dna_protein_pair_full": {"accuracy": 0.3725, "f1": 0.3863080684596577}, "dna_protein_pair_rand_full": {"accuracy": 0.479625, "f1": 0.3284400709791902}}
|
| 69 |
+
{"seed": 7273, "dna_protein_pair_full": {"accuracy": 0.4475, "f1": 0.2939297124600639}, "dna_protein_pair_rand_full": {"accuracy": 0.4345, "f1": 0.3253802564867283}}
|
| 70 |
+
{"seed": 941, "dna_protein_pair_full": {"accuracy": 0.535, "f1": 0.6678571428571428}, "dna_protein_pair_rand_full": {"accuracy": 0.5015, "f1": 0.6613451086956522}}
|
| 71 |
+
{"seed": 4612, "dna_protein_pair_full": {"accuracy": 0.4225, "f1": 0.2572347266881029}, "dna_protein_pair_rand_full": {"accuracy": 0.4665, "f1": 0.3229695431472081}}
|
| 72 |
+
{"seed": 3411, "dna_protein_pair_full": {"accuracy": 0.4475, "f1": 0.5424430641821946}, "dna_protein_pair_rand_full": {"accuracy": 0.551375, "f1": 0.6532702154381219}}
|
| 73 |
+
{"seed": 8413, "dna_protein_pair_full": {"accuracy": 0.4725, "f1": 0.0}, "dna_protein_pair_rand_full": {"accuracy": 0.500375, "f1": 0.016244154565591928}}
|
| 74 |
+
{"seed": 4748, "dna_protein_pair_full": {"accuracy": 0.515, "f1": 0.0}, "dna_protein_pair_rand_full": {"accuracy": 0.502625, "f1": 0.00200652119388011}}
|
| 75 |
+
{"seed": 823, "dna_protein_pair_full": {"accuracy": 0.4025, "f1": 0.3979848866498741}, "dna_protein_pair_rand_full": {"accuracy": 0.5055, "f1": 0.45070813662871423}}
|
| 76 |
+
{"seed": 9869, "dna_protein_pair_full": {"accuracy": 0.5525, "f1": 0.10945273631840796}, "dna_protein_pair_rand_full": {"accuracy": 0.509, "f1": 0.10605370960400547}}
|
| 77 |
+
{"seed": 1627, "dna_protein_pair_full": {"accuracy": 0.485, "f1": 0.05504587155963303}, "dna_protein_pair_rand_full": {"accuracy": 0.509125, "f1": 0.11732973701955496}}
|
| 78 |
+
{"seed": 88, "dna_protein_pair_full": {"accuracy": 0.5375, "f1": 0.0975609756097561}, "dna_protein_pair_rand_full": {"accuracy": 0.50775, "f1": 0.0934622467771639}}
|
| 79 |
+
{"seed": 5390, "dna_protein_pair_full": {"accuracy": 0.4525, "f1": 0.23693379790940766}, "dna_protein_pair_rand_full": {"accuracy": 0.645375, "f1": 0.585051923358198}}
|
| 80 |
+
{"seed": 7967, "dna_protein_pair_full": {"accuracy": 0.7, "f1": 0.6842105263157895}, "dna_protein_pair_rand_full": {"accuracy": 0.60475, "f1": 0.6294820717131474}}
|
| 81 |
+
{"seed": 7296, "dna_protein_pair_full": {"accuracy": 0.5125, "f1": 0.01015228426395939}, "dna_protein_pair_rand_full": {"accuracy": 0.498, "f1": 0.015203531142717018}}
|
| 82 |
+
{"seed": 5129, "dna_protein_pair_full": {"accuracy": 0.4775, "f1": 0.027906976744186046}, "dna_protein_pair_rand_full": {"accuracy": 0.547375, "f1": 0.20713816509743815}}
|
| 83 |
+
{"seed": 649, "dna_protein_pair_full": {"accuracy": 0.4225, "f1": 0.3565459610027855}, "dna_protein_pair_rand_full": {"accuracy": 0.569875, "f1": 0.5604802656788862}}
|
| 84 |
+
{"seed": 9280, "dna_protein_pair_full": {"accuracy": 0.435, "f1": 0.19858156028368795}, "dna_protein_pair_rand_full": {"accuracy": 0.58725, "f1": 0.38987435328898745}}
|
| 85 |
+
{"seed": 5570, "dna_protein_pair_full": {"accuracy": 0.44, "f1": 0.15151515151515152}, "dna_protein_pair_rand_full": {"accuracy": 0.544625, "f1": 0.2566823097327076}}
|
| 86 |
+
{"seed": 1413, "dna_protein_pair_full": {"accuracy": 0.4675, "f1": 0.0}, "dna_protein_pair_rand_full": {"accuracy": 0.494125, "f1": 0.03389830508474576}}
|
| 87 |
+
{"seed": 1608, "dna_protein_pair_full": {"accuracy": 0.4775, "f1": 0.5324384787472036}, "dna_protein_pair_rand_full": {"accuracy": 0.47375, "f1": 0.48870536798639785}}
|
| 88 |
+
{"seed": 4393, "dna_protein_pair_full": {"accuracy": 0.515, "f1": 0.0673076923076923}, "dna_protein_pair_rand_full": {"accuracy": 0.52225, "f1": 0.08036573628488931}}
|
| 89 |
+
{"seed": 7211, "dna_protein_pair_full": {"accuracy": 0.5025, "f1": 0.091324200913242}, "dna_protein_pair_rand_full": {"accuracy": 0.564, "f1": 0.355268022181146}}
|
| 90 |
+
{"seed": 7095, "dna_protein_pair_full": {"accuracy": 0.365, "f1": 0.3520408163265306}, "dna_protein_pair_rand_full": {"accuracy": 0.497, "f1": 0.476995061086561}}
|
| 91 |
+
{"seed": 4250, "dna_protein_pair_full": {"accuracy": 0.47, "f1": 0.13114754098360656}, "dna_protein_pair_rand_full": {"accuracy": 0.53075, "f1": 0.26622361219702895}}
|
| 92 |
+
{"seed": 9997, "dna_protein_pair_full": {"accuracy": 0.4775, "f1": 0.10300429184549356}, "dna_protein_pair_rand_full": {"accuracy": 0.521375, "f1": 0.18928647046368832}}
|
| 93 |
+
{"seed": 7553, "dna_protein_pair_full": {"accuracy": 0.375, "f1": 0.4158878504672897}, "dna_protein_pair_rand_full": {"accuracy": 0.54, "f1": 0.5315682281059063}}
|
| 94 |
+
{"seed": 87, "dna_protein_pair_full": {"accuracy": 0.6225, "f1": 0.48109965635738833}, "dna_protein_pair_rand_full": {"accuracy": 0.6245, "f1": 0.6419547079856972}}
|
| 95 |
+
{"seed": 9399, "dna_protein_pair_full": {"accuracy": 0.5475, "f1": 0.27309236947791166}, "dna_protein_pair_rand_full": {"accuracy": 0.5495, "f1": 0.2662866449511401}}
|
| 96 |
+
{"seed": 8228, "dna_protein_pair_full": {"accuracy": 0.505, "f1": 0.01}, "dna_protein_pair_rand_full": {"accuracy": 0.493125, "f1": 0.024536925667548712}}
|
| 97 |
+
{"seed": 5918, "dna_protein_pair_full": {"accuracy": 0.57, "f1": 0.4027777777777778}, "dna_protein_pair_rand_full": {"accuracy": 0.60825, "f1": 0.5965499485066942}}
|
| 98 |
+
{"seed": 9110, "dna_protein_pair_full": {"accuracy": 0.4025, "f1": 0.4860215053763441}, "dna_protein_pair_rand_full": {"accuracy": 0.474875, "f1": 0.5089421390999416}}
|
| 99 |
+
{"seed": 5520, "dna_protein_pair_full": {"accuracy": 0.4525, "f1": 0.5228758169934641}, "dna_protein_pair_rand_full": {"accuracy": 0.53775, "f1": 0.6076808826649692}}
|
| 100 |
+
{"seed": 6322, "dna_protein_pair_full": {"accuracy": 0.565, "f1": 0.40816326530612246}, "dna_protein_pair_rand_full": {"accuracy": 0.625375, "f1": 0.527063279154174}}
|
| 101 |
+
{"seed": 285, "dna_protein_pair_full": {"accuracy": 0.475, "f1": 0.0}, "dna_protein_pair_rand_full": {"accuracy": 0.5105, "f1": 0.03165182987141444}}
|
| 102 |
+
{"seed": 1443, "dna_protein_pair_full": {"accuracy": 0.52, "f1": 0.08571428571428572}, "dna_protein_pair_rand_full": {"accuracy": 0.536, "f1": 0.1586582048957389}}
|
| 103 |
+
{"seed": 2518, "dna_protein_pair_full": {"accuracy": 0.4675, "f1": 0.03619909502262444}, "dna_protein_pair_rand_full": {"accuracy": 0.520625, "f1": 0.1238291066940827}}
|
| 104 |
+
{"seed": 5828, "dna_protein_pair_full": {"accuracy": 0.5075, "f1": 0.01990049751243781}, "dna_protein_pair_rand_full": {"accuracy": 0.504875, "f1": 0.039291777831675964}}
|
| 105 |
+
{"seed": 2377, "dna_protein_pair_full": {"accuracy": 0.4725, "f1": 0.0}, "dna_protein_pair_rand_full": {"accuracy": 0.501625, "f1": 0.020152371590071272}}
|
| 106 |
+
{"seed": 3332, "dna_protein_pair_full": {"accuracy": 0.4175, "f1": 0.3206997084548105}, "dna_protein_pair_rand_full": {"accuracy": 0.533625, "f1": 0.4542928184876408}}
|
| 107 |
+
{"seed": 4033, "dna_protein_pair_full": {"accuracy": 0.49, "f1": 0.08108108108108109}, "dna_protein_pair_rand_full": {"accuracy": 0.505875, "f1": 0.09063722107200368}}
|
| 108 |
+
{"seed": 4457, "dna_protein_pair_full": {"accuracy": 0.485, "f1": 0.02830188679245283}, "dna_protein_pair_rand_full": {"accuracy": 0.531625, "f1": 0.1423666742961776}}
|
finetune/gpt2_gene_multiv1_ft_en3.jsonl
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"seed": 0, "en": {"accuracy": 0.875, "f1": 0.8636859323882224}, "fr": {"accuracy": 0.794, "f1": 0.7900101936799184}, "de": {"accuracy": 0.7715, "f1": 0.7570441254651781}, "zh": {"accuracy": 0.7075, "f1": 0.699537750385208}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9097222222222222, "f1": 0.900398406374502}, "dna_sim_pair_150bp": {"accuracy": 0.6925, "f1": 0.5533769063180828}, "dna_sim_pair_50bp": {"accuracy": 0.737, "f1": 0.67125}, "protein_sim_pair_150bp": {"accuracy": 0.9283333333333333, "f1": 0.9269121813031161}, "protein_sim_pair_450bp": {"accuracy": 0.8027777777777778, "f1": 0.7779862414008756}, "dna_protein_pair": {"accuracy": 0.515, "f1": 0.0}, "dna_protein_pair_100": {"accuracy": 0.5275, "f1": 0.0}, "dna_protein_pair_full": {"accuracy": 0.475, "f1": 0.0}, "dna_protein_pair_rand": {"accuracy": 0.513125, "f1": 0.002560819462227913}, "dna_protein_pair_rand_100": {"accuracy": 0.510625, "f1": 0.0}, "dna_protein_pair_rand_full": {"accuracy": 0.480625, "f1": 0.0024009603841536613}}
|
| 2 |
+
{"seed": 1, "en": {"accuracy": 0.872, "f1": 0.8642629904559915}, "fr": {"accuracy": 0.802, "f1": 0.7987804878048781}, "de": {"accuracy": 0.777, "f1": 0.7740628166160081}, "zh": {"accuracy": 0.712, "f1": 0.7}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9322222222222222, "f1": 0.933911159263272}, "dna_sim_pair_150bp": {"accuracy": 0.78125, "f1": 0.7412008281573499}, "dna_sim_pair_50bp": {"accuracy": 0.804, "f1": 0.7923728813559322}, "protein_sim_pair_150bp": {"accuracy": 0.9216666666666666, "f1": 0.923907177549919}, "protein_sim_pair_450bp": {"accuracy": 0.8622222222222222, "f1": 0.8772277227722772}, "dna_protein_pair": {"accuracy": 0.52, "f1": 0.19327731092436976}, "dna_protein_pair_100": {"accuracy": 0.515, "f1": 0.23015873015873015}, "dna_protein_pair_full": {"accuracy": 0.3475, "f1": 0.4161073825503356}, "dna_protein_pair_rand": {"accuracy": 0.55, "f1": 0.21568627450980393}, "dna_protein_pair_rand_100": {"accuracy": 0.53, "f1": 0.27552986512524086}, "dna_protein_pair_rand_full": {"accuracy": 0.5525, "f1": 0.5295663600525624}}
|
| 3 |
+
{"seed": 3, "en": {"accuracy": 0.868, "f1": 0.8622129436325678}, "fr": {"accuracy": 0.788, "f1": 0.7911330049261084}, "de": {"accuracy": 0.7655, "f1": 0.7670144063586687}, "zh": {"accuracy": 0.692, "f1": 0.7029893924783028}, "dna_sim_pair_simple_150bp": {"accuracy": 0.8419444444444445, "f1": 0.8528575122834239}, "dna_sim_pair_150bp": {"accuracy": 0.7905, "f1": 0.8017037387600567}, "dna_sim_pair_50bp": {"accuracy": 0.5055, "f1": 0.6550401116149285}, "protein_sim_pair_150bp": {"accuracy": 0.8805555555555555, "f1": 0.8934060485870104}, "protein_sim_pair_450bp": {"accuracy": 0.7927777777777778, "f1": 0.8132198297446169}, "dna_protein_pair": {"accuracy": 0.585, "f1": 0.2905982905982906}, "dna_protein_pair_100": {"accuracy": 0.5925, "f1": 0.4359861591695502}, "dna_protein_pair_full": {"accuracy": 0.4175, "f1": 0.5801801801801801}, "dna_protein_pair_rand": {"accuracy": 0.545625, "f1": 0.4333593141075604}, "dna_protein_pair_rand_100": {"accuracy": 0.575625, "f1": 0.5535831689677844}, "dna_protein_pair_rand_full": {"accuracy": 0.520625, "f1": 0.663153271848924}}
|
| 4 |
+
{"seed": 4, "en": {"accuracy": 0.883, "f1": 0.8764519535374868}, "fr": {"accuracy": 0.807, "f1": 0.8090999010880316}, "de": {"accuracy": 0.779, "f1": 0.7805362462760675}, "zh": {"accuracy": 0.6995, "f1": 0.7153008053055424}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9233333333333333, "f1": 0.9172165566886623}, "dna_sim_pair_150bp": {"accuracy": 0.69875, "f1": 0.5728465083303793}, "dna_sim_pair_50bp": {"accuracy": 0.6045, "f1": 0.3563873067534581}, "protein_sim_pair_150bp": {"accuracy": 0.965, "f1": 0.9637722829212191}, "protein_sim_pair_450bp": {"accuracy": 0.9544444444444444, "f1": 0.9524361948955916}, "dna_protein_pair": {"accuracy": 0.515, "f1": 0.0}, "dna_protein_pair_100": {"accuracy": 0.515, "f1": 0.01020408163265306}, "dna_protein_pair_full": {"accuracy": 0.5125, "f1": 0.0}, "dna_protein_pair_rand": {"accuracy": 0.504375, "f1": 0.005018820577164366}, "dna_protein_pair_rand_100": {"accuracy": 0.49875, "f1": 0.0024875621890547263}, "dna_protein_pair_rand_full": {"accuracy": 0.489375, "f1": 0.0}}
|
| 5 |
+
{"seed": 5, "en": {"accuracy": 0.8935, "f1": 0.8892355694227769}, "fr": {"accuracy": 0.801, "f1": 0.8015952143569293}, "de": {"accuracy": 0.7725, "f1": 0.769620253164557}, "zh": {"accuracy": 0.678, "f1": 0.6912751677852349}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9347222222222222, "f1": 0.9319432377642629}, "dna_sim_pair_150bp": {"accuracy": 0.75125, "f1": 0.687008493236867}, "dna_sim_pair_50bp": {"accuracy": 0.711, "f1": 0.7008281573498965}, "protein_sim_pair_150bp": {"accuracy": 0.9505555555555556, "f1": 0.9504178272980501}, "protein_sim_pair_450bp": {"accuracy": 0.82, "f1": 0.7925736235595391}, "dna_protein_pair": {"accuracy": 0.5075, "f1": 0.03902439024390244}, "dna_protein_pair_100": {"accuracy": 0.4875, "f1": 0.03755868544600939}, "dna_protein_pair_full": {"accuracy": 0.5325, "f1": 0.0966183574879227}, "dna_protein_pair_rand": {"accuracy": 0.51625, "f1": 0.046798029556650245}, "dna_protein_pair_rand_100": {"accuracy": 0.48375, "f1": 0.078125}, "dna_protein_pair_rand_full": {"accuracy": 0.538125, "f1": 0.16308040770101925}}
|
| 6 |
+
{"seed": 6, "en": {"accuracy": 0.859, "f1": 0.8528183716075156}, "fr": {"accuracy": 0.7635, "f1": 0.7718282682103232}, "de": {"accuracy": 0.751, "f1": 0.7514970059880239}, "zh": {"accuracy": 0.7165, "f1": 0.704841228526809}, "dna_sim_pair_simple_150bp": {"accuracy": 0.7169444444444445, "f1": 0.7613024127430311}, "dna_sim_pair_150bp": {"accuracy": 0.67725, "f1": 0.7200173498156582}, "dna_sim_pair_50bp": {"accuracy": 0.531, "f1": 0.65}, "protein_sim_pair_150bp": {"accuracy": 0.9127777777777778, "f1": 0.9128262076624097}, "protein_sim_pair_450bp": {"accuracy": 0.7894444444444444, "f1": 0.829202343397927}, "dna_protein_pair": {"accuracy": 0.49, "f1": 0.019230769230769232}, "dna_protein_pair_100": {"accuracy": 0.515, "f1": 0.03}, "dna_protein_pair_full": {"accuracy": 0.31, "f1": 0.38392857142857145}, "dna_protein_pair_rand": {"accuracy": 0.496875, "f1": 0.03592814371257485}, "dna_protein_pair_rand_100": {"accuracy": 0.520625, "f1": 0.04005006257822278}, "dna_protein_pair_rand_full": {"accuracy": 0.523125, "f1": 0.5428400239664469}}
|
| 7 |
+
{"seed": 7, "en": {"accuracy": 0.871, "f1": 0.8632025450689289}, "fr": {"accuracy": 0.785, "f1": 0.7918683446272992}, "de": {"accuracy": 0.7535, "f1": 0.7624096385542168}, "zh": {"accuracy": 0.699, "f1": 0.7111324376199616}, "dna_sim_pair_simple_150bp": {"accuracy": 0.7925, "f1": 0.8184690157958687}, "dna_sim_pair_150bp": {"accuracy": 0.70325, "f1": 0.7220791383750877}, "dna_sim_pair_50bp": {"accuracy": 0.554, "f1": 0.6646616541353384}, "protein_sim_pair_150bp": {"accuracy": 0.7016666666666667, "f1": 0.7710021321961621}, "protein_sim_pair_450bp": {"accuracy": 0.7394444444444445, "f1": 0.7914628723877278}, "dna_protein_pair": {"accuracy": 0.5425, "f1": 0.4369230769230769}, "dna_protein_pair_100": {"accuracy": 0.585, "f1": 0.3897058823529412}, "dna_protein_pair_full": {"accuracy": 0.5325, "f1": 0.44510385756676557}, "dna_protein_pair_rand": {"accuracy": 0.5575, "f1": 0.512396694214876}, "dna_protein_pair_rand_100": {"accuracy": 0.5775, "f1": 0.4643423137876387}, "dna_protein_pair_rand_full": {"accuracy": 0.569375, "f1": 0.46297739672642246}}
|
| 8 |
+
{"seed": 8, "en": {"accuracy": 0.8765, "f1": 0.8685470995210218}, "fr": {"accuracy": 0.7965, "f1": 0.7974116475858636}, "de": {"accuracy": 0.7425, "f1": 0.7560397915679773}, "zh": {"accuracy": 0.7145, "f1": 0.7134972403411942}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9269444444444445, "f1": 0.9227152512488981}, "dna_sim_pair_150bp": {"accuracy": 0.8225, "f1": 0.7928821470245041}, "dna_sim_pair_50bp": {"accuracy": 0.7675, "f1": 0.7203848466626579}, "protein_sim_pair_150bp": {"accuracy": 0.9577777777777777, "f1": 0.9581497797356828}, "protein_sim_pair_450bp": {"accuracy": 0.9244444444444444, "f1": 0.9250275633958104}, "dna_protein_pair": {"accuracy": 0.51, "f1": 0.010101010101010102}, "dna_protein_pair_100": {"accuracy": 0.49, "f1": 0.009708737864077669}, "dna_protein_pair_full": {"accuracy": 0.4825, "f1": 0.018957345971563982}, "dna_protein_pair_rand": {"accuracy": 0.52125, "f1": 0.0103359173126615}, "dna_protein_pair_rand_100": {"accuracy": 0.500625, "f1": 0.0024968789013732834}, "dna_protein_pair_rand_full": {"accuracy": 0.491875, "f1": 0.0378698224852071}}
|
| 9 |
+
{"seed": 9, "en": {"accuracy": 0.885, "f1": 0.8756756756756757}, "fr": {"accuracy": 0.7875, "f1": 0.7919725893294175}, "de": {"accuracy": 0.758, "f1": 0.7611056268509379}, "zh": {"accuracy": 0.7035, "f1": 0.7144920558497834}, "dna_sim_pair_simple_150bp": {"accuracy": 0.7744444444444445, "f1": 0.8088512241054614}, "dna_sim_pair_150bp": {"accuracy": 0.65875, "f1": 0.6930514953901507}, "dna_sim_pair_50bp": {"accuracy": 0.667, "f1": 0.7111882046834345}, "protein_sim_pair_150bp": {"accuracy": 0.9122222222222223, "f1": 0.9166666666666666}, "protein_sim_pair_450bp": {"accuracy": 0.8638888888888889, "f1": 0.8709847288046341}, "dna_protein_pair": {"accuracy": 0.5675, "f1": 0.3215686274509804}, "dna_protein_pair_100": {"accuracy": 0.5425, "f1": 0.37542662116040953}, "dna_protein_pair_full": {"accuracy": 0.5575, "f1": 0.4380952380952381}, "dna_protein_pair_rand": {"accuracy": 0.50375, "f1": 0.3873456790123457}, "dna_protein_pair_rand_100": {"accuracy": 0.490625, "f1": 0.42484121383203954}, "dna_protein_pair_rand_full": {"accuracy": 0.491875, "f1": 0.3722007722007722}}
|
| 10 |
+
{"seed": 10, "en": {"accuracy": 0.8725, "f1": 0.8634172469201928}, "fr": {"accuracy": 0.7965, "f1": 0.7996061053668144}, "de": {"accuracy": 0.76, "f1": 0.7611940298507462}, "zh": {"accuracy": 0.7265, "f1": 0.7143603133159269}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9241666666666667, "f1": 0.9209840810419682}, "dna_sim_pair_150bp": {"accuracy": 0.80175, "f1": 0.7613602166716822}, "dna_sim_pair_50bp": {"accuracy": 0.625, "f1": 0.6023329798515377}, "protein_sim_pair_150bp": {"accuracy": 0.9155555555555556, "f1": 0.9142212189616253}, "protein_sim_pair_450bp": {"accuracy": 0.9577777777777777, "f1": 0.9583333333333334}, "dna_protein_pair": {"accuracy": 0.495, "f1": 0.0380952380952381}, "dna_protein_pair_100": {"accuracy": 0.4475, "f1": 0.0}, "dna_protein_pair_full": {"accuracy": 0.4925, "f1": 0.01932367149758454}, "dna_protein_pair_rand": {"accuracy": 0.506875, "f1": 0.02952029520295203}, "dna_protein_pair_rand_100": {"accuracy": 0.520625, "f1": 0.01287001287001287}, "dna_protein_pair_rand_full": {"accuracy": 0.474375, "f1": 0.01866977829638273}}
|
| 11 |
+
{"seed": 11, "en": {"accuracy": 0.8835, "f1": 0.8738494856524093}, "fr": {"accuracy": 0.8025, "f1": 0.8027958062905641}, "de": {"accuracy": 0.7605, "f1": 0.7511688311688312}, "zh": {"accuracy": 0.708, "f1": 0.6620370370370371}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9258333333333333, "f1": 0.9206066012488849}, "dna_sim_pair_150bp": {"accuracy": 0.7435, "f1": 0.6361702127659574}, "dna_sim_pair_50bp": {"accuracy": 0.6915, "f1": 0.6087507926442612}, "protein_sim_pair_150bp": {"accuracy": 0.925, "f1": 0.923425978445831}, "protein_sim_pair_450bp": {"accuracy": 0.8894444444444445, "f1": 0.8763206960845246}, "dna_protein_pair": {"accuracy": 0.4675, "f1": 0.009302325581395349}, "dna_protein_pair_100": {"accuracy": 0.5425, "f1": 0.0}, "dna_protein_pair_full": {"accuracy": 0.465, "f1": 0.02727272727272727}, "dna_protein_pair_rand": {"accuracy": 0.511875, "f1": 0.03936039360393604}, "dna_protein_pair_rand_100": {"accuracy": 0.50625, "f1": 0.017412935323383085}, "dna_protein_pair_rand_full": {"accuracy": 0.509375, "f1": 0.024844720496894408}}
|
| 12 |
+
{"seed": 12, "en": {"accuracy": 0.886, "f1": 0.8759521218715995}, "fr": {"accuracy": 0.7965, "f1": 0.7890098496630379}, "de": {"accuracy": 0.7705, "f1": 0.7552}, "zh": {"accuracy": 0.7305, "f1": 0.6914710933028048}, "dna_sim_pair_simple_150bp": {"accuracy": 0.8952777777777777, "f1": 0.8899270072992701}, "dna_sim_pair_150bp": {"accuracy": 0.6635, "f1": 0.5213371266002845}, "dna_sim_pair_50bp": {"accuracy": 0.472, "f1": 0.5735056542810986}, "protein_sim_pair_150bp": {"accuracy": 0.9188888888888889, "f1": 0.9140164899882215}, "protein_sim_pair_450bp": {"accuracy": 0.95, "f1": 0.9497767857142857}, "dna_protein_pair": {"accuracy": 0.475, "f1": 0.027777777777777776}, "dna_protein_pair_100": {"accuracy": 0.515, "f1": 0.0}, "dna_protein_pair_full": {"accuracy": 0.4525, "f1": 0.08368200836820083}, "dna_protein_pair_rand": {"accuracy": 0.48875, "f1": 0.007281553398058253}, "dna_protein_pair_rand_100": {"accuracy": 0.5025, "f1": 0.01485148514851485}, "dna_protein_pair_rand_full": {"accuracy": 0.474375, "f1": 0.06243032329988852}}
|
| 13 |
+
{"seed": 13, "en": {"accuracy": 0.8955, "f1": 0.8873315363881401}, "fr": {"accuracy": 0.812, "f1": 0.8039624608967675}, "de": {"accuracy": 0.7785, "f1": 0.7611859838274933}, "zh": {"accuracy": 0.719, "f1": 0.6915477497255763}, "dna_sim_pair_simple_150bp": {"accuracy": 0.92, "f1": 0.9119266055045872}, "dna_sim_pair_150bp": {"accuracy": 0.70925, "f1": 0.5929296464823242}, "dna_sim_pair_50bp": {"accuracy": 0.594, "f1": 0.2852112676056338}, "protein_sim_pair_150bp": {"accuracy": 0.9455555555555556, "f1": 0.9439359267734554}, "protein_sim_pair_450bp": {"accuracy": 0.8977777777777778, "f1": 0.8908659549228944}, "dna_protein_pair": {"accuracy": 0.535, "f1": 0.021052631578947368}, "dna_protein_pair_100": {"accuracy": 0.485, "f1": 0.009615384615384616}, "dna_protein_pair_full": {"accuracy": 0.45, "f1": 0.0}, "dna_protein_pair_rand": {"accuracy": 0.499375, "f1": 0.012330456226880395}, "dna_protein_pair_rand_100": {"accuracy": 0.51, "f1": 0.005076142131979695}, "dna_protein_pair_rand_full": {"accuracy": 0.4975, "f1": 0.0049504950495049506}}
|
| 14 |
+
{"seed": 14, "en": {"accuracy": 0.8825, "f1": 0.8759894459102903}, "fr": {"accuracy": 0.7905, "f1": 0.7992333493052228}, "de": {"accuracy": 0.755, "f1": 0.7651006711409396}, "zh": {"accuracy": 0.697, "f1": 0.7173507462686567}, "dna_sim_pair_simple_150bp": {"accuracy": 0.6244444444444445, "f1": 0.7167993297025556}, "dna_sim_pair_150bp": {"accuracy": 0.57425, "f1": 0.6696411251212415}, "dna_sim_pair_50bp": {"accuracy": 0.459, "f1": 0.62560553633218}, "protein_sim_pair_150bp": {"accuracy": 0.6972222222222222, "f1": 0.7570218457423094}, "protein_sim_pair_450bp": {"accuracy": 0.8488888888888889, "f1": 0.846674182638106}, "dna_protein_pair": {"accuracy": 0.4575, "f1": 0.42133333333333334}, "dna_protein_pair_100": {"accuracy": 0.5325, "f1": 0.20425531914893616}, "dna_protein_pair_full": {"accuracy": 0.4975, "f1": 0.12987012987012986}, "dna_protein_pair_rand": {"accuracy": 0.53875, "f1": 0.4383561643835616}, "dna_protein_pair_rand_100": {"accuracy": 0.524375, "f1": 0.17011995637949837}, "dna_protein_pair_rand_full": {"accuracy": 0.496875, "f1": 0.08209806157354618}}
|
| 15 |
+
{"seed": 15, "en": {"accuracy": 0.8945, "f1": 0.8857606930157012}, "fr": {"accuracy": 0.807, "f1": 0.8058350100603622}, "de": {"accuracy": 0.768, "f1": 0.768}, "zh": {"accuracy": 0.7265, "f1": 0.6969529085872577}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9091666666666667, "f1": 0.9004566210045662}, "dna_sim_pair_150bp": {"accuracy": 0.6815, "f1": 0.510752688172043}, "dna_sim_pair_50bp": {"accuracy": 0.6495, "f1": 0.4811250925240563}, "protein_sim_pair_150bp": {"accuracy": 0.9372222222222222, "f1": 0.9338794616734932}, "protein_sim_pair_450bp": {"accuracy": 0.845, "f1": 0.8203477141017386}, "dna_protein_pair": {"accuracy": 0.5, "f1": 0.0}, "dna_protein_pair_100": {"accuracy": 0.4975, "f1": 0.0}, "dna_protein_pair_full": {"accuracy": 0.505, "f1": 0.0}, "dna_protein_pair_rand": {"accuracy": 0.51875, "f1": 0.0}, "dna_protein_pair_rand_100": {"accuracy": 0.519375, "f1": 0.0}, "dna_protein_pair_rand_full": {"accuracy": 0.498125, "f1": 0.0}}
|
| 16 |
+
{"seed": 16, "en": {"accuracy": 0.8725, "f1": 0.8614883215643672}, "fr": {"accuracy": 0.795, "f1": 0.7964250248262165}, "de": {"accuracy": 0.759, "f1": 0.76162215628091}, "zh": {"accuracy": 0.7105, "f1": 0.7065382665990877}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9227777777777778, "f1": 0.9177027827116637}, "dna_sim_pair_150bp": {"accuracy": 0.703, "f1": 0.5875}, "dna_sim_pair_50bp": {"accuracy": 0.727, "f1": 0.6305818673883626}, "protein_sim_pair_150bp": {"accuracy": 0.9683333333333334, "f1": 0.9681386249301286}, "protein_sim_pair_450bp": {"accuracy": 0.9072222222222223, "f1": 0.9022820362785254}, "dna_protein_pair": {"accuracy": 0.52, "f1": 0.0}, "dna_protein_pair_100": {"accuracy": 0.515, "f1": 0.0}, "dna_protein_pair_full": {"accuracy": 0.51, "f1": 0.0}, "dna_protein_pair_rand": {"accuracy": 0.515625, "f1": 0.002574002574002574}, "dna_protein_pair_rand_100": {"accuracy": 0.500625, "f1": 0.0}, "dna_protein_pair_rand_full": {"accuracy": 0.50375, "f1": 0.02457002457002457}}
|
| 17 |
+
{"seed": 17, "en": {"accuracy": 0.886, "f1": 0.8758169934640523}, "fr": {"accuracy": 0.8155, "f1": 0.8050713153724247}, "de": {"accuracy": 0.7715, "f1": 0.7565263718700054}, "zh": {"accuracy": 0.7275, "f1": 0.712401055408971}, "dna_sim_pair_simple_150bp": {"accuracy": 0.8986111111111111, "f1": 0.8996425625515535}, "dna_sim_pair_150bp": {"accuracy": 0.7325, "f1": 0.6749696233292831}, "dna_sim_pair_50bp": {"accuracy": 0.5555, "f1": 0.44885306881587106}, "protein_sim_pair_150bp": {"accuracy": 0.9422222222222222, "f1": 0.9395348837209302}, "protein_sim_pair_450bp": {"accuracy": 0.9577777777777777, "f1": 0.9563218390804598}, "dna_protein_pair": {"accuracy": 0.505, "f1": 0.0}, "dna_protein_pair_100": {"accuracy": 0.4725, "f1": 0.0}, "dna_protein_pair_full": {"accuracy": 0.5025, "f1": 0.0}, "dna_protein_pair_rand": {"accuracy": 0.495, "f1": 0.0024691358024691358}, "dna_protein_pair_rand_100": {"accuracy": 0.49625, "f1": 0.0024752475247524753}, "dna_protein_pair_rand_full": {"accuracy": 0.501875, "f1": 0.0}}
|
| 18 |
+
{"seed": 18, "en": {"accuracy": 0.872, "f1": 0.8659685863874346}, "fr": {"accuracy": 0.7995, "f1": 0.7981882234524409}, "de": {"accuracy": 0.774, "f1": 0.7648283038501561}, "zh": {"accuracy": 0.724, "f1": 0.7100840336134454}, "dna_sim_pair_simple_150bp": {"accuracy": 0.8791666666666667, "f1": 0.8627327232565478}, "dna_sim_pair_150bp": {"accuracy": 0.593, "f1": 0.3165407220822838}, "dna_sim_pair_50bp": {"accuracy": 0.6615, "f1": 0.5125989920806335}, "protein_sim_pair_150bp": {"accuracy": 0.9394444444444444, "f1": 0.9339794064203513}, "protein_sim_pair_450bp": {"accuracy": 0.9161111111111111, "f1": 0.9104919976289271}, "dna_protein_pair": {"accuracy": 0.555, "f1": 0.011111111111111112}, "dna_protein_pair_100": {"accuracy": 0.4925, "f1": 0.00975609756097561}, "dna_protein_pair_full": {"accuracy": 0.5025, "f1": 0.0}, "dna_protein_pair_rand": {"accuracy": 0.503125, "f1": 0.0}, "dna_protein_pair_rand_100": {"accuracy": 0.479375, "f1": 0.01652892561983471}, "dna_protein_pair_rand_full": {"accuracy": 0.520625, "f1": 0.01540436456996149}}
|
| 19 |
+
{"seed": 19, "en": {"accuracy": 0.887, "f1": 0.8810526315789474}, "fr": {"accuracy": 0.8005, "f1": 0.797153024911032}, "de": {"accuracy": 0.778, "f1": 0.76875}, "zh": {"accuracy": 0.713, "f1": 0.7007299270072993}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9422222222222222, "f1": 0.93885949441505}, "dna_sim_pair_150bp": {"accuracy": 0.7455, "f1": 0.6651315789473684}, "dna_sim_pair_50bp": {"accuracy": 0.794, "f1": 0.7927565392354124}, "protein_sim_pair_150bp": {"accuracy": 0.9272222222222222, "f1": 0.9322997416020672}, "protein_sim_pair_450bp": {"accuracy": 0.8827777777777778, "f1": 0.878386167146974}, "dna_protein_pair": {"accuracy": 0.53, "f1": 0.19658119658119658}, "dna_protein_pair_100": {"accuracy": 0.485, "f1": 0.10434782608695652}, "dna_protein_pair_full": {"accuracy": 0.485, "f1": 0.1693548387096774}, "dna_protein_pair_rand": {"accuracy": 0.513125, "f1": 0.21708542713567838}, "dna_protein_pair_rand_100": {"accuracy": 0.468125, "f1": 0.10137275607180571}, "dna_protein_pair_rand_full": {"accuracy": 0.49, "f1": 0.1724137931034483}}
|
| 20 |
+
{"seed": 20, "en": {"accuracy": 0.8855, "f1": 0.8757460661964189}, "fr": {"accuracy": 0.827, "f1": 0.8220164609053497}, "de": {"accuracy": 0.784, "f1": 0.767491926803014}, "zh": {"accuracy": 0.716, "f1": 0.6833890746934225}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9355555555555556, "f1": 0.9318048206937096}, "dna_sim_pair_150bp": {"accuracy": 0.70825, "f1": 0.5938043856595893}, "dna_sim_pair_50bp": {"accuracy": 0.644, "f1": 0.538860103626943}, "protein_sim_pair_150bp": {"accuracy": 0.9644444444444444, "f1": 0.9646799116997793}, "protein_sim_pair_450bp": {"accuracy": 0.9266666666666666, "f1": 0.9231664726426076}, "dna_protein_pair": {"accuracy": 0.4775, "f1": 0.0}, "dna_protein_pair_100": {"accuracy": 0.5325, "f1": 0.0}, "dna_protein_pair_full": {"accuracy": 0.475, "f1": 0.0}, "dna_protein_pair_rand": {"accuracy": 0.505625, "f1": 0.005031446540880503}, "dna_protein_pair_rand_100": {"accuracy": 0.5075, "f1": 0.005050505050505051}, "dna_protein_pair_rand_full": {"accuracy": 0.51875, "f1": 0.007731958762886598}}
|
| 21 |
+
{"seed": 21, "en": {"accuracy": 0.874, "f1": 0.8675078864353313}, "fr": {"accuracy": 0.771, "f1": 0.7806513409961686}, "de": {"accuracy": 0.7475, "f1": 0.7607768829938418}, "zh": {"accuracy": 0.6995, "f1": 0.7069722086786934}, "dna_sim_pair_simple_150bp": {"accuracy": 0.8816666666666667, "f1": 0.8688423645320197}, "dna_sim_pair_150bp": {"accuracy": 0.705, "f1": 0.5958904109589042}, "dna_sim_pair_50bp": {"accuracy": 0.768, "f1": 0.7289719626168224}, "protein_sim_pair_150bp": {"accuracy": 0.9177777777777778, "f1": 0.9247202441505595}, "protein_sim_pair_450bp": {"accuracy": 0.8983333333333333, "f1": 0.9048361934477379}, "dna_protein_pair": {"accuracy": 0.525, "f1": 0.15178571428571427}, "dna_protein_pair_100": {"accuracy": 0.475, "f1": 0.11764705882352941}, "dna_protein_pair_full": {"accuracy": 0.39, "f1": 0.42990654205607476}, "dna_protein_pair_rand": {"accuracy": 0.49, "f1": 0.15702479338842976}, "dna_protein_pair_rand_100": {"accuracy": 0.535625, "f1": 0.14302191464821223}, "dna_protein_pair_rand_full": {"accuracy": 0.490625, "f1": 0.4526527871054399}}
|
| 22 |
+
{"seed": 22, "en": {"accuracy": 0.84, "f1": 0.8247535596933188}, "fr": {"accuracy": 0.7595, "f1": 0.7708432586946165}, "de": {"accuracy": 0.716, "f1": 0.7389705882352942}, "zh": {"accuracy": 0.6735, "f1": 0.700870361887311}, "dna_sim_pair_simple_150bp": {"accuracy": 0.705, "f1": 0.7620967741935484}, "dna_sim_pair_150bp": {"accuracy": 0.67325, "f1": 0.7286692962424746}, "dna_sim_pair_50bp": {"accuracy": 0.4845, "f1": 0.6520418494768815}, "protein_sim_pair_150bp": {"accuracy": 0.5811111111111111, "f1": 0.70546875}, "protein_sim_pair_450bp": {"accuracy": 0.8172222222222222, "f1": 0.8495656149977138}, "dna_protein_pair": {"accuracy": 0.5375, "f1": 0.5268542199488491}, "dna_protein_pair_100": {"accuracy": 0.5275, "f1": 0.08695652173913043}, "dna_protein_pair_full": {"accuracy": 0.38, "f1": 0.5373134328358209}, "dna_protein_pair_rand": {"accuracy": 0.525, "f1": 0.5084087968952135}, "dna_protein_pair_rand_100": {"accuracy": 0.489375, "f1": 0.0932297447280799}, "dna_protein_pair_rand_full": {"accuracy": 0.459375, "f1": 0.5963602426504899}}
|
| 23 |
+
{"seed": 23, "en": {"accuracy": 0.8695, "f1": 0.8628481345244351}, "fr": {"accuracy": 0.774, "f1": 0.7757936507936508}, "de": {"accuracy": 0.748, "f1": 0.7495029821073559}, "zh": {"accuracy": 0.7065, "f1": 0.7086848635235732}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9219444444444445, "f1": 0.9214864487286951}, "dna_sim_pair_150bp": {"accuracy": 0.7935, "f1": 0.7767567567567567}, "dna_sim_pair_50bp": {"accuracy": 0.6235, "f1": 0.5388854868340478}, "protein_sim_pair_150bp": {"accuracy": 0.9066666666666666, "f1": 0.9074889867841409}, "protein_sim_pair_450bp": {"accuracy": 0.91, "f1": 0.9124324324324324}, "dna_protein_pair": {"accuracy": 0.51, "f1": 0.11711711711711711}, "dna_protein_pair_100": {"accuracy": 0.5225, "f1": 0.1511111111111111}, "dna_protein_pair_full": {"accuracy": 0.3575, "f1": 0.4145785876993166}, "dna_protein_pair_rand": {"accuracy": 0.544375, "f1": 0.21528525296017223}, "dna_protein_pair_rand_100": {"accuracy": 0.506875, "f1": 0.10847457627118644}, "dna_protein_pair_rand_full": {"accuracy": 0.475625, "f1": 0.424159231297186}}
|
| 24 |
+
{"seed": 24, "en": {"accuracy": 0.8735, "f1": 0.8691153647180548}, "fr": {"accuracy": 0.7815, "f1": 0.7923990498812351}, "de": {"accuracy": 0.7545, "f1": 0.7656324582338903}, "zh": {"accuracy": 0.7125, "f1": 0.7193753050268423}, "dna_sim_pair_simple_150bp": {"accuracy": 0.8238888888888889, "f1": 0.8467858869018849}, "dna_sim_pair_150bp": {"accuracy": 0.72225, "f1": 0.7203624465139693}, "dna_sim_pair_50bp": {"accuracy": 0.508, "f1": 0.6675675675675675}, "protein_sim_pair_150bp": {"accuracy": 0.7261111111111112, "f1": 0.7780279153534444}, "protein_sim_pair_450bp": {"accuracy": 0.8838888888888888, "f1": 0.8878153515834675}, "dna_protein_pair": {"accuracy": 0.47, "f1": 0.28859060402684567}, "dna_protein_pair_100": {"accuracy": 0.4775, "f1": 0.03686635944700461}, "dna_protein_pair_full": {"accuracy": 0.5125, "f1": 0.14847161572052403}, "dna_protein_pair_rand": {"accuracy": 0.536875, "f1": 0.3220494053064959}, "dna_protein_pair_rand_100": {"accuracy": 0.50625, "f1": 0.04819277108433735}, "dna_protein_pair_rand_full": {"accuracy": 0.551875, "f1": 0.3059051306873185}}
|
| 25 |
+
{"seed": 25, "en": {"accuracy": 0.8805, "f1": 0.8728046833422033}, "fr": {"accuracy": 0.783, "f1": 0.7893203883495146}, "de": {"accuracy": 0.7745, "f1": 0.7711821410451547}, "zh": {"accuracy": 0.693, "f1": 0.6870540265035678}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9272222222222222, "f1": 0.9254835039817975}, "dna_sim_pair_150bp": {"accuracy": 0.74775, "f1": 0.6934062594955941}, "dna_sim_pair_50bp": {"accuracy": 0.611, "f1": 0.69609375}, "protein_sim_pair_150bp": {"accuracy": 0.9244444444444444, "f1": 0.9287211740041929}, "protein_sim_pair_450bp": {"accuracy": 0.8683333333333333, "f1": 0.8539741219963032}, "dna_protein_pair": {"accuracy": 0.5275, "f1": 0.16}, "dna_protein_pair_100": {"accuracy": 0.545, "f1": 0.10784313725490197}, "dna_protein_pair_full": {"accuracy": 0.4775, "f1": 0.05429864253393665}, "dna_protein_pair_rand": {"accuracy": 0.525, "f1": 0.21161825726141079}, "dna_protein_pair_rand_100": {"accuracy": 0.5175, "f1": 0.06987951807228916}, "dna_protein_pair_rand_full": {"accuracy": 0.51, "f1": 0.04156479217603912}}
|
| 26 |
+
{"seed": 26, "en": {"accuracy": 0.8615, "f1": 0.8544403573305307}, "fr": {"accuracy": 0.7635, "f1": 0.7716079188797682}, "de": {"accuracy": 0.748, "f1": 0.7459677419354839}, "zh": {"accuracy": 0.6955, "f1": 0.7120567375886525}, "dna_sim_pair_simple_150bp": {"accuracy": 0.7847222222222222, "f1": 0.8073576932637335}, "dna_sim_pair_150bp": {"accuracy": 0.72825, "f1": 0.7416211076776801}, "dna_sim_pair_50bp": {"accuracy": 0.503, "f1": 0.6403762662807525}, "protein_sim_pair_150bp": {"accuracy": 0.9255555555555556, "f1": 0.9270152505446623}, "protein_sim_pair_450bp": {"accuracy": 0.9738888888888889, "f1": 0.9743589743589743}, "dna_protein_pair": {"accuracy": 0.45, "f1": 0.03508771929824561}, "dna_protein_pair_100": {"accuracy": 0.48, "f1": 0.0}, "dna_protein_pair_full": {"accuracy": 0.525, "f1": 0.030612244897959183}, "dna_protein_pair_rand": {"accuracy": 0.505625, "f1": 0.11422172452407615}, "dna_protein_pair_rand_100": {"accuracy": 0.51125, "f1": 0.017587939698492462}, "dna_protein_pair_rand_full": {"accuracy": 0.491875, "f1": 0.04014167650531287}}
|
| 27 |
+
{"seed": 27, "en": {"accuracy": 0.8815, "f1": 0.8681135225375626}, "fr": {"accuracy": 0.812, "f1": 0.8008474576271186}, "de": {"accuracy": 0.778, "f1": 0.7623126338329764}, "zh": {"accuracy": 0.7175, "f1": 0.699308142629058}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9061111111111111, "f1": 0.9016870273414777}, "dna_sim_pair_150bp": {"accuracy": 0.7425, "f1": 0.6836609336609336}, "dna_sim_pair_50bp": {"accuracy": 0.622, "f1": 0.6698689956331878}, "protein_sim_pair_150bp": {"accuracy": 0.8916666666666667, "f1": 0.8993288590604027}, "protein_sim_pair_450bp": {"accuracy": 0.8988888888888888, "f1": 0.8969422423556059}, "dna_protein_pair": {"accuracy": 0.5175, "f1": 0.10232558139534884}, "dna_protein_pair_100": {"accuracy": 0.505, "f1": 0.038834951456310676}, "dna_protein_pair_full": {"accuracy": 0.4875, "f1": 0.18326693227091634}, "dna_protein_pair_rand": {"accuracy": 0.523125, "f1": 0.1839572192513369}, "dna_protein_pair_rand_100": {"accuracy": 0.52625, "f1": 0.07334963325183375}, "dna_protein_pair_rand_full": {"accuracy": 0.52, "f1": 0.1794871794871795}}
|
| 28 |
+
{"seed": 28, "en": {"accuracy": 0.8775, "f1": 0.8744233726294208}, "fr": {"accuracy": 0.77, "f1": 0.786046511627907}, "de": {"accuracy": 0.748, "f1": 0.76}, "zh": {"accuracy": 0.695, "f1": 0.7183748845798708}, "dna_sim_pair_simple_150bp": {"accuracy": 0.8963888888888889, "f1": 0.9019710906701708}, "dna_sim_pair_150bp": {"accuracy": 0.79675, "f1": 0.7919119529050422}, "dna_sim_pair_50bp": {"accuracy": 0.609, "f1": 0.718299711815562}, "protein_sim_pair_150bp": {"accuracy": 0.855, "f1": 0.8718703976435935}, "protein_sim_pair_450bp": {"accuracy": 0.9211111111111111, "f1": 0.9232432432432433}, "dna_protein_pair": {"accuracy": 0.4625, "f1": 0.0851063829787234}, "dna_protein_pair_100": {"accuracy": 0.4775, "f1": 0.027906976744186046}, "dna_protein_pair_full": {"accuracy": 0.4875, "f1": 0.00966183574879227}, "dna_protein_pair_rand": {"accuracy": 0.5125, "f1": 0.1352549889135255}, "dna_protein_pair_rand_100": {"accuracy": 0.4975, "f1": 0.0405727923627685}, "dna_protein_pair_rand_full": {"accuracy": 0.505625, "f1": 0.03654080389768575}}
|
| 29 |
+
{"seed": 29, "en": {"accuracy": 0.8855, "f1": 0.8804177545691906}, "fr": {"accuracy": 0.7855, "f1": 0.7936507936507936}, "de": {"accuracy": 0.762, "f1": 0.7652859960552268}, "zh": {"accuracy": 0.7115, "f1": 0.7039507439712673}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9522222222222222, "f1": 0.9517666853617499}, "dna_sim_pair_150bp": {"accuracy": 0.7995, "f1": 0.7530788177339901}, "dna_sim_pair_50bp": {"accuracy": 0.7345, "f1": 0.6885630498533725}, "protein_sim_pair_150bp": {"accuracy": 0.9705555555555555, "f1": 0.9718832891246685}, "protein_sim_pair_450bp": {"accuracy": 0.9588888888888889, "f1": 0.9587513935340022}, "dna_protein_pair": {"accuracy": 0.5325, "f1": 0.020942408376963352}, "dna_protein_pair_100": {"accuracy": 0.54, "f1": 0.010752688172043012}, "dna_protein_pair_full": {"accuracy": 0.4725, "f1": 0.0}, "dna_protein_pair_rand": {"accuracy": 0.505, "f1": 0.019801980198019802}, "dna_protein_pair_rand_100": {"accuracy": 0.504375, "f1": 0.005018820577164366}, "dna_protein_pair_rand_full": {"accuracy": 0.503125, "f1": 0.009962640099626401}}
|
| 30 |
+
{"seed": 30, "en": {"accuracy": 0.868, "f1": 0.8563656147986942}, "fr": {"accuracy": 0.7905, "f1": 0.79369768586903}, "de": {"accuracy": 0.766, "f1": 0.7523809523809524}, "zh": {"accuracy": 0.7155, "f1": 0.709545686574783}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9177777777777778, "f1": 0.9157655093910074}, "dna_sim_pair_150bp": {"accuracy": 0.7755, "f1": 0.7328970850684117}, "dna_sim_pair_50bp": {"accuracy": 0.53, "f1": 0.5837023914969}, "protein_sim_pair_150bp": {"accuracy": 0.9261111111111111, "f1": 0.9264787175234936}, "protein_sim_pair_450bp": {"accuracy": 0.9327777777777778, "f1": 0.9328896283971159}, "dna_protein_pair": {"accuracy": 0.4875, "f1": 0.019138755980861243}, "dna_protein_pair_100": {"accuracy": 0.5225, "f1": 0.0}, "dna_protein_pair_full": {"accuracy": 0.5, "f1": 0.06542056074766354}, "dna_protein_pair_rand": {"accuracy": 0.506875, "f1": 0.024721878862793572}, "dna_protein_pair_rand_100": {"accuracy": 0.524375, "f1": 0.010403120936280884}, "dna_protein_pair_rand_full": {"accuracy": 0.521875, "f1": 0.12571428571428572}}
|
| 31 |
+
{"seed": 31, "en": {"accuracy": 0.8725, "f1": 0.8661417322834646}, "fr": {"accuracy": 0.771, "f1": 0.7845719661335842}, "de": {"accuracy": 0.766, "f1": 0.7701375245579568}, "zh": {"accuracy": 0.6825, "f1": 0.7166443551985721}, "dna_sim_pair_simple_150bp": {"accuracy": 0.7336111111111111, "f1": 0.7831788378928329}, "dna_sim_pair_150bp": {"accuracy": 0.66125, "f1": 0.7085394708539471}, "dna_sim_pair_50bp": {"accuracy": 0.5115, "f1": 0.6662111376836352}, "protein_sim_pair_150bp": {"accuracy": 0.8411111111111111, "f1": 0.8565697091273822}, "protein_sim_pair_450bp": {"accuracy": 0.9055555555555556, "f1": 0.9113660062565172}, "dna_protein_pair": {"accuracy": 0.4625, "f1": 0.3042071197411003}, "dna_protein_pair_100": {"accuracy": 0.49, "f1": 0.02857142857142857}, "dna_protein_pair_full": {"accuracy": 0.4475, "f1": 0.2706270627062706}, "dna_protein_pair_rand": {"accuracy": 0.545, "f1": 0.3591549295774648}, "dna_protein_pair_rand_100": {"accuracy": 0.53125, "f1": 0.07862407862407862}, "dna_protein_pair_rand_full": {"accuracy": 0.52625, "f1": 0.3374125874125874}}
|
| 32 |
+
{"seed": 32, "en": {"accuracy": 0.8635, "f1": 0.8577384054194893}, "fr": {"accuracy": 0.7775, "f1": 0.782820888238165}, "de": {"accuracy": 0.751, "f1": 0.7549212598425197}, "zh": {"accuracy": 0.692, "f1": 0.7069457659372027}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9380555555555555, "f1": 0.9354184766869389}, "dna_sim_pair_150bp": {"accuracy": 0.76125, "f1": 0.701095461658842}, "dna_sim_pair_50bp": {"accuracy": 0.727, "f1": 0.6650306748466258}, "protein_sim_pair_150bp": {"accuracy": 0.9688888888888889, "f1": 0.9701492537313433}, "protein_sim_pair_450bp": {"accuracy": 0.9294444444444444, "f1": 0.9258610624635143}, "dna_protein_pair": {"accuracy": 0.505, "f1": 0.1}, "dna_protein_pair_100": {"accuracy": 0.4575, "f1": 0.0091324200913242}, "dna_protein_pair_full": {"accuracy": 0.54, "f1": 0.0}, "dna_protein_pair_rand": {"accuracy": 0.5125, "f1": 0.09302325581395349}, "dna_protein_pair_rand_100": {"accuracy": 0.496875, "f1": 0.060676779463243874}, "dna_protein_pair_rand_full": {"accuracy": 0.498125, "f1": 0.019536019536019536}}
|
| 33 |
+
{"seed": 33, "en": {"accuracy": 0.8955, "f1": 0.8894764674775251}, "fr": {"accuracy": 0.816, "f1": 0.8137651821862348}, "de": {"accuracy": 0.7805, "f1": 0.7768174885612608}, "zh": {"accuracy": 0.7145, "f1": 0.703068122724909}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9097222222222222, "f1": 0.912233324331623}, "dna_sim_pair_150bp": {"accuracy": 0.7485, "f1": 0.7110855829982768}, "dna_sim_pair_50bp": {"accuracy": 0.589, "f1": 0.691904047976012}, "protein_sim_pair_150bp": {"accuracy": 0.9272222222222222, "f1": 0.9297587131367292}, "protein_sim_pair_450bp": {"accuracy": 0.8955555555555555, "f1": 0.8879618593563766}, "dna_protein_pair": {"accuracy": 0.5325, "f1": 0.13023255813953488}, "dna_protein_pair_100": {"accuracy": 0.5, "f1": 0.009900990099009901}, "dna_protein_pair_full": {"accuracy": 0.51, "f1": 0.19008264462809918}, "dna_protein_pair_rand": {"accuracy": 0.535625, "f1": 0.2694198623402163}, "dna_protein_pair_rand_100": {"accuracy": 0.520625, "f1": 0.06349206349206349}, "dna_protein_pair_rand_full": {"accuracy": 0.536875, "f1": 0.2949571836346337}}
|
| 34 |
+
{"seed": 34, "en": {"accuracy": 0.8805, "f1": 0.867590027700831}, "fr": {"accuracy": 0.8005, "f1": 0.7851373182552503}, "de": {"accuracy": 0.7655, "f1": 0.7525065963060686}, "zh": {"accuracy": 0.723, "f1": 0.7065677966101694}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9258333333333333, "f1": 0.9232538085656798}, "dna_sim_pair_150bp": {"accuracy": 0.816, "f1": 0.7841642228739003}, "dna_sim_pair_50bp": {"accuracy": 0.798, "f1": 0.7799564270152506}, "protein_sim_pair_150bp": {"accuracy": 0.9611111111111111, "f1": 0.9604072398190046}, "protein_sim_pair_450bp": {"accuracy": 0.6638888888888889, "f1": 0.49958643507030603}, "dna_protein_pair": {"accuracy": 0.56, "f1": 0.022222222222222223}, "dna_protein_pair_100": {"accuracy": 0.5275, "f1": 0.02072538860103627}, "dna_protein_pair_full": {"accuracy": 0.36, "f1": 0.47107438016528924}, "dna_protein_pair_rand": {"accuracy": 0.50375, "f1": 0.00997506234413965}, "dna_protein_pair_rand_100": {"accuracy": 0.52625, "f1": 0.013020833333333334}, "dna_protein_pair_rand_full": {"accuracy": 0.525625, "f1": 0.5474060822898033}}
|
| 35 |
+
{"seed": 35, "en": {"accuracy": 0.8865, "f1": 0.8791910590739755}, "fr": {"accuracy": 0.788, "f1": 0.7975167144221585}, "de": {"accuracy": 0.7565, "f1": 0.7575908412145346}, "zh": {"accuracy": 0.719, "f1": 0.7175879396984924}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9175, "f1": 0.9117909117909118}, "dna_sim_pair_150bp": {"accuracy": 0.80625, "f1": 0.7683109118086696}, "dna_sim_pair_50bp": {"accuracy": 0.716, "f1": 0.7380073800738007}, "protein_sim_pair_150bp": {"accuracy": 0.9472222222222222, "f1": 0.9480021893814997}, "protein_sim_pair_450bp": {"accuracy": 0.8955555555555555, "f1": 0.8973799126637555}, "dna_protein_pair": {"accuracy": 0.5275, "f1": 0.010471204188481676}, "dna_protein_pair_100": {"accuracy": 0.495, "f1": 0.056074766355140186}, "dna_protein_pair_full": {"accuracy": 0.37, "f1": 0.5097276264591439}, "dna_protein_pair_rand": {"accuracy": 0.488125, "f1": 0.023837902264600714}, "dna_protein_pair_rand_100": {"accuracy": 0.5025, "f1": 0.031630170316301706}, "dna_protein_pair_rand_full": {"accuracy": 0.46375, "f1": 0.5760869565217391}}
|
| 36 |
+
{"seed": 36, "en": {"accuracy": 0.875, "f1": 0.8708677685950413}, "fr": {"accuracy": 0.7935, "f1": 0.7974497302599314}, "de": {"accuracy": 0.764, "f1": 0.7665677546983185}, "zh": {"accuracy": 0.704, "f1": 0.7145612343297975}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9427777777777778, "f1": 0.9409742120343839}, "dna_sim_pair_150bp": {"accuracy": 0.81775, "f1": 0.7816711590296496}, "dna_sim_pair_50bp": {"accuracy": 0.813, "f1": 0.7889390519187359}, "protein_sim_pair_150bp": {"accuracy": 0.9388888888888889, "f1": 0.9431818181818182}, "protein_sim_pair_450bp": {"accuracy": 0.9122222222222223, "f1": 0.9159574468085107}, "dna_protein_pair": {"accuracy": 0.5375, "f1": 0.07960199004975124}, "dna_protein_pair_100": {"accuracy": 0.505, "f1": 0.038834951456310676}, "dna_protein_pair_full": {"accuracy": 0.535, "f1": 0.17699115044247787}, "dna_protein_pair_rand": {"accuracy": 0.511875, "f1": 0.10946408209806158}, "dna_protein_pair_rand_100": {"accuracy": 0.53, "f1": 0.10476190476190476}, "dna_protein_pair_rand_full": {"accuracy": 0.5225, "f1": 0.2899628252788104}}
|
| 37 |
+
{"seed": 37, "en": {"accuracy": 0.861, "f1": 0.8547544409613375}, "fr": {"accuracy": 0.7825, "f1": 0.7804139323573952}, "de": {"accuracy": 0.75, "f1": 0.7443762781186094}, "zh": {"accuracy": 0.6715, "f1": 0.669350780070458}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9205555555555556, "f1": 0.9148302561048243}, "dna_sim_pair_150bp": {"accuracy": 0.72, "f1": 0.6174863387978142}, "dna_sim_pair_50bp": {"accuracy": 0.743, "f1": 0.7283298097251586}, "protein_sim_pair_150bp": {"accuracy": 0.9472222222222222, "f1": 0.9471341124095715}, "protein_sim_pair_450bp": {"accuracy": 0.8788888888888889, "f1": 0.8604353393085787}, "dna_protein_pair": {"accuracy": 0.4825, "f1": 0.0}, "dna_protein_pair_100": {"accuracy": 0.5375, "f1": 0.0}, "dna_protein_pair_full": {"accuracy": 0.47, "f1": 0.0}, "dna_protein_pair_rand": {"accuracy": 0.5025, "f1": 0.022113022113022112}, "dna_protein_pair_rand_100": {"accuracy": 0.48375, "f1": 0.009592326139088728}, "dna_protein_pair_rand_full": {"accuracy": 0.495625, "f1": 0.002472187886279357}}
|
| 38 |
+
{"seed": 38, "en": {"accuracy": 0.8315, "f1": 0.8064330844342332}, "fr": {"accuracy": 0.7655, "f1": 0.7463493780421849}, "de": {"accuracy": 0.7345, "f1": 0.7143625605164067}, "zh": {"accuracy": 0.711, "f1": 0.6995841995841996}, "dna_sim_pair_simple_150bp": {"accuracy": 0.89, "f1": 0.8933764135702746}, "dna_sim_pair_150bp": {"accuracy": 0.78825, "f1": 0.7789089010702167}, "dna_sim_pair_50bp": {"accuracy": 0.5365, "f1": 0.6713931230060263}, "protein_sim_pair_150bp": {"accuracy": 0.9444444444444444, "f1": 0.9431171786120591}, "protein_sim_pair_450bp": {"accuracy": 0.9111111111111111, "f1": 0.909706546275395}, "dna_protein_pair": {"accuracy": 0.515, "f1": 0.17796610169491525}, "dna_protein_pair_100": {"accuracy": 0.4875, "f1": 0.00966183574879227}, "dna_protein_pair_full": {"accuracy": 0.5025, "f1": 0.2546816479400749}, "dna_protein_pair_rand": {"accuracy": 0.51, "f1": 0.05084745762711865}, "dna_protein_pair_rand_100": {"accuracy": 0.495625, "f1": 0.02181818181818182}, "dna_protein_pair_rand_full": {"accuracy": 0.48875, "f1": 0.2164750957854406}}
|
| 39 |
+
{"seed": 39, "en": {"accuracy": 0.8735, "f1": 0.8640515851692638}, "fr": {"accuracy": 0.795, "f1": 0.7910295616717635}, "de": {"accuracy": 0.766, "f1": 0.7577639751552795}, "zh": {"accuracy": 0.704, "f1": 0.6887486855941115}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9336111111111111, "f1": 0.9321600908316775}, "dna_sim_pair_150bp": {"accuracy": 0.785, "f1": 0.7345679012345679}, "dna_sim_pair_50bp": {"accuracy": 0.679, "f1": 0.7251712328767124}, "protein_sim_pair_150bp": {"accuracy": 0.9577777777777777, "f1": 0.9575418994413408}, "protein_sim_pair_450bp": {"accuracy": 0.8761111111111111, "f1": 0.862600123228589}, "dna_protein_pair": {"accuracy": 0.52, "f1": 0.08571428571428572}, "dna_protein_pair_100": {"accuracy": 0.53, "f1": 0.050505050505050504}, "dna_protein_pair_full": {"accuracy": 0.53, "f1": 0.06930693069306931}, "dna_protein_pair_rand": {"accuracy": 0.515, "f1": 0.07177033492822966}, "dna_protein_pair_rand_100": {"accuracy": 0.47875, "f1": 0.04576659038901602}, "dna_protein_pair_rand_full": {"accuracy": 0.510625, "f1": 0.10718358038768529}}
|
| 40 |
+
{"seed": 41, "en": {"accuracy": 0.882, "f1": 0.8770833333333333}, "fr": {"accuracy": 0.799, "f1": 0.8061716489874639}, "de": {"accuracy": 0.77, "f1": 0.7725024727992087}, "zh": {"accuracy": 0.7095, "f1": 0.7055245818550431}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9077777777777778, "f1": 0.9029807130333138}, "dna_sim_pair_150bp": {"accuracy": 0.688, "f1": 0.5714285714285714}, "dna_sim_pair_50bp": {"accuracy": 0.667, "f1": 0.7245657568238213}, "protein_sim_pair_150bp": {"accuracy": 0.8961111111111111, "f1": 0.9018372703412073}, "protein_sim_pair_450bp": {"accuracy": 0.835, "f1": 0.8251912889935256}, "dna_protein_pair": {"accuracy": 0.465, "f1": 0.09322033898305085}, "dna_protein_pair_100": {"accuracy": 0.54, "f1": 0.0891089108910891}, "dna_protein_pair_full": {"accuracy": 0.47, "f1": 0.32051282051282054}, "dna_protein_pair_rand": {"accuracy": 0.523125, "f1": 0.16429353778751368}, "dna_protein_pair_rand_100": {"accuracy": 0.52625, "f1": 0.11448598130841121}, "dna_protein_pair_rand_full": {"accuracy": 0.494375, "f1": 0.43307638402242465}}
|
| 41 |
+
{"seed": 42, "en": {"accuracy": 0.874, "f1": 0.8679245283018868}, "fr": {"accuracy": 0.8015, "f1": 0.8015992003998001}, "de": {"accuracy": 0.761, "f1": 0.7642998027613412}, "zh": {"accuracy": 0.71, "f1": 0.702258726899384}, "dna_sim_pair_simple_150bp": {"accuracy": 0.8925, "f1": 0.8897121687090339}, "dna_sim_pair_150bp": {"accuracy": 0.76025, "f1": 0.7151767151767152}, "dna_sim_pair_50bp": {"accuracy": 0.53, "f1": 0.6495152870991797}, "protein_sim_pair_150bp": {"accuracy": 0.9566666666666667, "f1": 0.9555808656036446}, "protein_sim_pair_450bp": {"accuracy": 0.9355555555555556, "f1": 0.9344632768361582}, "dna_protein_pair": {"accuracy": 0.5375, "f1": 0.0}, "dna_protein_pair_100": {"accuracy": 0.475, "f1": 0.0}, "dna_protein_pair_full": {"accuracy": 0.4425, "f1": 0.10441767068273092}, "dna_protein_pair_rand": {"accuracy": 0.516875, "f1": 0.015286624203821656}, "dna_protein_pair_rand_100": {"accuracy": 0.5075, "f1": 0.005050505050505051}, "dna_protein_pair_rand_full": {"accuracy": 0.52375, "f1": 0.12814645308924486}}
|
| 42 |
+
{"seed": 43, "en": {"accuracy": 0.885, "f1": 0.8755411255411255}, "fr": {"accuracy": 0.8205, "f1": 0.8058409951325041}, "de": {"accuracy": 0.791, "f1": 0.7764705882352941}, "zh": {"accuracy": 0.718, "f1": 0.6901098901098901}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9047222222222222, "f1": 0.8987902035998819}, "dna_sim_pair_150bp": {"accuracy": 0.744, "f1": 0.64762560220234}, "dna_sim_pair_50bp": {"accuracy": 0.752, "f1": 0.6852791878172588}, "protein_sim_pair_150bp": {"accuracy": 0.9388888888888889, "f1": 0.9354460093896714}, "protein_sim_pair_450bp": {"accuracy": 0.9511111111111111, "f1": 0.9505061867266592}, "dna_protein_pair": {"accuracy": 0.4975, "f1": 0.0}, "dna_protein_pair_100": {"accuracy": 0.53, "f1": 0.0}, "dna_protein_pair_full": {"accuracy": 0.46, "f1": 0.3032258064516129}, "dna_protein_pair_rand": {"accuracy": 0.495, "f1": 0.0049261083743842365}, "dna_protein_pair_rand_100": {"accuracy": 0.495, "f1": 0.0024691358024691358}, "dna_protein_pair_rand_full": {"accuracy": 0.55375, "f1": 0.39285714285714285}}
|
| 43 |
+
{"seed": 44, "en": {"accuracy": 0.884, "f1": 0.875268817204301}, "fr": {"accuracy": 0.788, "f1": 0.788}, "de": {"accuracy": 0.763, "f1": 0.7601214574898786}, "zh": {"accuracy": 0.686, "f1": 0.7158371040723982}, "dna_sim_pair_simple_150bp": {"accuracy": 0.8136111111111111, "f1": 0.8380400675838764}, "dna_sim_pair_150bp": {"accuracy": 0.72575, "f1": 0.7355989395034949}, "dna_sim_pair_50bp": {"accuracy": 0.48, "f1": 0.6264367816091954}, "protein_sim_pair_150bp": {"accuracy": 0.8744444444444445, "f1": 0.8831437435367114}, "protein_sim_pair_450bp": {"accuracy": 0.9072222222222223, "f1": 0.909386869234943}, "dna_protein_pair": {"accuracy": 0.5075, "f1": 0.3746031746031746}, "dna_protein_pair_100": {"accuracy": 0.485, "f1": 0.1271186440677966}, "dna_protein_pair_full": {"accuracy": 0.3975, "f1": 0.23492063492063492}, "dna_protein_pair_rand": {"accuracy": 0.484375, "f1": 0.3037974683544304}, "dna_protein_pair_rand_100": {"accuracy": 0.506875, "f1": 0.11843575418994413}, "dna_protein_pair_rand_full": {"accuracy": 0.498125, "f1": 0.22265246853823814}}
|
| 44 |
+
{"seed": 45, "en": {"accuracy": 0.883, "f1": 0.8737864077669902}, "fr": {"accuracy": 0.795, "f1": 0.7918781725888325}, "de": {"accuracy": 0.764, "f1": 0.7518401682439537}, "zh": {"accuracy": 0.703, "f1": 0.6799568965517241}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9208333333333333, "f1": 0.9229104679469841}, "dna_sim_pair_150bp": {"accuracy": 0.7655, "f1": 0.7205005959475567}, "dna_sim_pair_50bp": {"accuracy": 0.476, "f1": 0.5604026845637584}, "protein_sim_pair_150bp": {"accuracy": 0.96, "f1": 0.9593220338983051}, "protein_sim_pair_450bp": {"accuracy": 0.9283333333333333, "f1": 0.9299293862031505}, "dna_protein_pair": {"accuracy": 0.5225, "f1": 0.0}, "dna_protein_pair_100": {"accuracy": 0.505, "f1": 0.0}, "dna_protein_pair_full": {"accuracy": 0.5025, "f1": 0.009950248756218905}, "dna_protein_pair_rand": {"accuracy": 0.5125, "f1": 0.020100502512562814}, "dna_protein_pair_rand_100": {"accuracy": 0.506875, "f1": 0.022304832713754646}, "dna_protein_pair_rand_full": {"accuracy": 0.511875, "f1": 0.02252816020025031}}
|
| 45 |
+
{"seed": 47, "en": {"accuracy": 0.8745, "f1": 0.8678251711427067}, "fr": {"accuracy": 0.8035, "f1": 0.8016153457849571}, "de": {"accuracy": 0.776, "f1": 0.7700205338809035}, "zh": {"accuracy": 0.7045, "f1": 0.7081481481481482}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9119444444444444, "f1": 0.912358307990047}, "dna_sim_pair_150bp": {"accuracy": 0.76875, "f1": 0.7300846221184709}, "dna_sim_pair_50bp": {"accuracy": 0.675, "f1": 0.6968283582089553}, "protein_sim_pair_150bp": {"accuracy": 0.9316666666666666, "f1": 0.9364341085271318}, "protein_sim_pair_450bp": {"accuracy": 0.9172222222222223, "f1": 0.9176340519624102}, "dna_protein_pair": {"accuracy": 0.5225, "f1": 0.13574660633484162}, "dna_protein_pair_100": {"accuracy": 0.485, "f1": 0.01904761904761905}, "dna_protein_pair_full": {"accuracy": 0.3175, "f1": 0.3695150115473441}, "dna_protein_pair_rand": {"accuracy": 0.53375, "f1": 0.10336538461538461}, "dna_protein_pair_rand_100": {"accuracy": 0.4925, "f1": 0.021686746987951807}, "dna_protein_pair_rand_full": {"accuracy": 0.488125, "f1": 0.49287925696594426}}
|
| 46 |
+
{"seed": 48, "en": {"accuracy": 0.866, "f1": 0.8593913955928646}, "fr": {"accuracy": 0.7795, "f1": 0.7856101118133204}, "de": {"accuracy": 0.7405, "f1": 0.7513176808816483}, "zh": {"accuracy": 0.709, "f1": 0.7036659877800407}, "dna_sim_pair_simple_150bp": {"accuracy": 0.8758333333333334, "f1": 0.8839262529213191}, "dna_sim_pair_150bp": {"accuracy": 0.7665, "f1": 0.7566440854611777}, "dna_sim_pair_50bp": {"accuracy": 0.509, "f1": 0.6480286738351254}, "protein_sim_pair_150bp": {"accuracy": 0.9016666666666666, "f1": 0.9071840587309911}, "protein_sim_pair_450bp": {"accuracy": 0.9083333333333333, "f1": 0.9146404552509053}, "dna_protein_pair": {"accuracy": 0.5125, "f1": 0.14847161572052403}, "dna_protein_pair_100": {"accuracy": 0.485, "f1": 0.01904761904761905}, "dna_protein_pair_full": {"accuracy": 0.3675, "f1": 0.22629969418960244}, "dna_protein_pair_rand": {"accuracy": 0.5325, "f1": 0.1282051282051282}, "dna_protein_pair_rand_100": {"accuracy": 0.484375, "f1": 0.0440324449594438}, "dna_protein_pair_rand_full": {"accuracy": 0.519375, "f1": 0.27793427230046946}}
|
| 47 |
+
{"seed": 49, "en": {"accuracy": 0.9005, "f1": 0.890958904109589}, "fr": {"accuracy": 0.802, "f1": 0.797752808988764}, "de": {"accuracy": 0.785, "f1": 0.7695605573419079}, "zh": {"accuracy": 0.7295, "f1": 0.7142102482831485}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9319444444444445, "f1": 0.9285088999124599}, "dna_sim_pair_150bp": {"accuracy": 0.718, "f1": 0.6224899598393574}, "dna_sim_pair_50bp": {"accuracy": 0.6335, "f1": 0.6094832179009056}, "protein_sim_pair_150bp": {"accuracy": 0.9055555555555556, "f1": 0.8957055214723927}, "protein_sim_pair_450bp": {"accuracy": 0.8561111111111112, "f1": 0.8334405144694533}, "dna_protein_pair": {"accuracy": 0.5, "f1": 0.0}, "dna_protein_pair_100": {"accuracy": 0.52, "f1": 0.0}, "dna_protein_pair_full": {"accuracy": 0.5, "f1": 0.0}, "dna_protein_pair_rand": {"accuracy": 0.499375, "f1": 0.01717791411042945}, "dna_protein_pair_rand_100": {"accuracy": 0.501875, "f1": 0.0}, "dna_protein_pair_rand_full": {"accuracy": 0.51, "f1": 0.002544529262086514}}
|
| 48 |
+
{"seed": 50, "en": {"accuracy": 0.8865, "f1": 0.8754799780581459}, "fr": {"accuracy": 0.8215, "f1": 0.8108108108108109}, "de": {"accuracy": 0.776, "f1": 0.7593984962406015}, "zh": {"accuracy": 0.712, "f1": 0.7012448132780082}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9397222222222222, "f1": 0.9370101596516691}, "dna_sim_pair_150bp": {"accuracy": 0.651, "f1": 0.45891472868217054}, "dna_sim_pair_50bp": {"accuracy": 0.763, "f1": 0.6945876288659794}, "protein_sim_pair_150bp": {"accuracy": 0.9516666666666667, "f1": 0.9509859154929577}, "protein_sim_pair_450bp": {"accuracy": 0.7427777777777778, "f1": 0.664735698769008}, "dna_protein_pair": {"accuracy": 0.505, "f1": 0.0}, "dna_protein_pair_100": {"accuracy": 0.4775, "f1": 0.0}, "dna_protein_pair_full": {"accuracy": 0.515, "f1": 0.0}, "dna_protein_pair_rand": {"accuracy": 0.516875, "f1": 0.027672955974842768}, "dna_protein_pair_rand_100": {"accuracy": 0.51, "f1": 0.002544529262086514}, "dna_protein_pair_rand_full": {"accuracy": 0.509375, "f1": 0.0025412960609911056}}
|
| 49 |
+
{"seed": 52, "en": {"accuracy": 0.8945, "f1": 0.8862533692722372}, "fr": {"accuracy": 0.801, "f1": 0.7971457696228338}, "de": {"accuracy": 0.7735, "f1": 0.7636932707355243}, "zh": {"accuracy": 0.715, "f1": 0.698093220338983}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9305555555555556, "f1": 0.9290578887627696}, "dna_sim_pair_150bp": {"accuracy": 0.74675, "f1": 0.6692784851452824}, "dna_sim_pair_50bp": {"accuracy": 0.5685, "f1": 0.3335907335907336}, "protein_sim_pair_150bp": {"accuracy": 0.9388888888888889, "f1": 0.9384098544232923}, "protein_sim_pair_450bp": {"accuracy": 0.9244444444444444, "f1": 0.9227272727272727}, "dna_protein_pair": {"accuracy": 0.5, "f1": 0.009900990099009901}, "dna_protein_pair_100": {"accuracy": 0.4575, "f1": 0.0091324200913242}, "dna_protein_pair_full": {"accuracy": 0.4925, "f1": 0.0}, "dna_protein_pair_rand": {"accuracy": 0.50125, "f1": 0.026829268292682926}, "dna_protein_pair_rand_100": {"accuracy": 0.493125, "f1": 0.012180267965895249}, "dna_protein_pair_rand_full": {"accuracy": 0.489375, "f1": 0.028537455410225922}}
|
| 50 |
+
{"seed": 53, "en": {"accuracy": 0.883, "f1": 0.8725490196078431}, "fr": {"accuracy": 0.8065, "f1": 0.8006182380216383}, "de": {"accuracy": 0.772, "f1": 0.7642192347466391}, "zh": {"accuracy": 0.727, "f1": 0.7061356297093649}, "dna_sim_pair_simple_150bp": {"accuracy": 0.955, "f1": 0.9546218487394958}, "dna_sim_pair_150bp": {"accuracy": 0.7155, "f1": 0.6216755319148937}, "dna_sim_pair_50bp": {"accuracy": 0.772, "f1": 0.7921604375569735}, "protein_sim_pair_150bp": {"accuracy": 0.9044444444444445, "f1": 0.907725321888412}, "protein_sim_pair_450bp": {"accuracy": 0.835, "f1": 0.8147223955084217}, "dna_protein_pair": {"accuracy": 0.49, "f1": 0.05555555555555555}, "dna_protein_pair_100": {"accuracy": 0.5425, "f1": 0.010810810810810811}, "dna_protein_pair_full": {"accuracy": 0.4975, "f1": 0.028985507246376812}, "dna_protein_pair_rand": {"accuracy": 0.51625, "f1": 0.11845102505694761}, "dna_protein_pair_rand_100": {"accuracy": 0.4825, "f1": 0.018957345971563982}, "dna_protein_pair_rand_full": {"accuracy": 0.510625, "f1": 0.0345252774352651}}
|
| 51 |
+
{"seed": 54, "en": {"accuracy": 0.9015, "f1": 0.8933405522468868}, "fr": {"accuracy": 0.8225, "f1": 0.8213387015601409}, "de": {"accuracy": 0.773, "f1": 0.7774509803921569}, "zh": {"accuracy": 0.72, "f1": 0.7134083930399181}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9294444444444444, "f1": 0.9291689905186837}, "dna_sim_pair_150bp": {"accuracy": 0.8265, "f1": 0.8025042686397268}, "dna_sim_pair_50bp": {"accuracy": 0.791, "f1": 0.7517814726840855}, "protein_sim_pair_150bp": {"accuracy": 0.9477777777777778, "f1": 0.9489685124864278}, "protein_sim_pair_450bp": {"accuracy": 0.8216666666666667, "f1": 0.8241095890410959}, "dna_protein_pair": {"accuracy": 0.5375, "f1": 0.021164021164021163}, "dna_protein_pair_100": {"accuracy": 0.5, "f1": 0.05660377358490566}, "dna_protein_pair_full": {"accuracy": 0.47, "f1": 0.13821138211382114}, "dna_protein_pair_rand": {"accuracy": 0.519375, "f1": 0.04472049689440994}, "dna_protein_pair_rand_100": {"accuracy": 0.509375, "f1": 0.048484848484848485}, "dna_protein_pair_rand_full": {"accuracy": 0.4975, "f1": 0.056338028169014086}}
|
| 52 |
+
{"seed": 55, "en": {"accuracy": 0.8795, "f1": 0.8683779355543418}, "fr": {"accuracy": 0.796, "f1": 0.7978196233894945}, "de": {"accuracy": 0.756, "f1": 0.7502558853633572}, "zh": {"accuracy": 0.7235, "f1": 0.6989657049537289}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9127777777777778, "f1": 0.9037400367872471}, "dna_sim_pair_150bp": {"accuracy": 0.68975, "f1": 0.5674451028232834}, "dna_sim_pair_50bp": {"accuracy": 0.662, "f1": 0.559322033898305}, "protein_sim_pair_150bp": {"accuracy": 0.9372222222222222, "f1": 0.9334119033588686}, "protein_sim_pair_450bp": {"accuracy": 0.8361111111111111, "f1": 0.8110185778347213}, "dna_protein_pair": {"accuracy": 0.5075, "f1": 0.0}, "dna_protein_pair_100": {"accuracy": 0.465, "f1": 0.0}, "dna_protein_pair_full": {"accuracy": 0.495, "f1": 0.0}, "dna_protein_pair_rand": {"accuracy": 0.494375, "f1": 0.014616321559074299}, "dna_protein_pair_rand_100": {"accuracy": 0.50125, "f1": 0.0}, "dna_protein_pair_rand_full": {"accuracy": 0.48, "f1": 0.011876484560570071}}
|
| 53 |
+
{"seed": 58, "en": {"accuracy": 0.8805, "f1": 0.8732095490716181}, "fr": {"accuracy": 0.7945, "f1": 0.7976366322008862}, "de": {"accuracy": 0.7665, "f1": 0.7682382133995037}, "zh": {"accuracy": 0.709, "f1": 0.7063572149344097}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9072222222222223, "f1": 0.9066517607602013}, "dna_sim_pair_150bp": {"accuracy": 0.733, "f1": 0.6815742397137746}, "dna_sim_pair_50bp": {"accuracy": 0.5355, "f1": 0.679323438039351}, "protein_sim_pair_150bp": {"accuracy": 0.9405555555555556, "f1": 0.9421308815575987}, "protein_sim_pair_450bp": {"accuracy": 0.8772222222222222, "f1": 0.8686868686868687}, "dna_protein_pair": {"accuracy": 0.53, "f1": 0.08737864077669903}, "dna_protein_pair_100": {"accuracy": 0.51, "f1": 0.02}, "dna_protein_pair_full": {"accuracy": 0.52, "f1": 0.010309278350515464}, "dna_protein_pair_rand": {"accuracy": 0.491875, "f1": 0.11147540983606558}, "dna_protein_pair_rand_100": {"accuracy": 0.500625, "f1": 0.03151515151515152}, "dna_protein_pair_rand_full": {"accuracy": 0.51, "f1": 0.01507537688442211}}
|
| 54 |
+
{"seed": 59, "en": {"accuracy": 0.882, "f1": 0.8793456032719836}, "fr": {"accuracy": 0.7905, "f1": 0.7984607984607984}, "de": {"accuracy": 0.757, "f1": 0.7631578947368421}, "zh": {"accuracy": 0.6745, "f1": 0.6895565092989986}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9433333333333334, "f1": 0.941747572815534}, "dna_sim_pair_150bp": {"accuracy": 0.716, "f1": 0.6125511596180082}, "dna_sim_pair_50bp": {"accuracy": 0.5705, "f1": 0.5795398923152227}, "protein_sim_pair_150bp": {"accuracy": 0.9638888888888889, "f1": 0.9638688160088938}, "protein_sim_pair_450bp": {"accuracy": 0.9077777777777778, "f1": 0.904707233065442}, "dna_protein_pair": {"accuracy": 0.4675, "f1": 0.018433179723502304}, "dna_protein_pair_100": {"accuracy": 0.505, "f1": 0.01}, "dna_protein_pair_full": {"accuracy": 0.495, "f1": 0.019417475728155338}, "dna_protein_pair_rand": {"accuracy": 0.4925, "f1": 0.026378896882494004}, "dna_protein_pair_rand_100": {"accuracy": 0.493125, "f1": 0.033373063170441}, "dna_protein_pair_rand_full": {"accuracy": 0.486875, "f1": 0.028402366863905324}}
|
| 55 |
+
{"seed": 63, "en": {"accuracy": 0.8895, "f1": 0.8827586206896552}, "fr": {"accuracy": 0.794, "f1": 0.798828125}, "de": {"accuracy": 0.7725, "f1": 0.7748639287481445}, "zh": {"accuracy": 0.701, "f1": 0.7155090390104663}, "dna_sim_pair_simple_150bp": {"accuracy": 0.8088888888888889, "f1": 0.8302073050345509}, "dna_sim_pair_150bp": {"accuracy": 0.71975, "f1": 0.7321385902031063}, "dna_sim_pair_50bp": {"accuracy": 0.521, "f1": 0.6610049539985846}, "protein_sim_pair_150bp": {"accuracy": 0.9683333333333334, "f1": 0.9686985172981878}, "protein_sim_pair_450bp": {"accuracy": 0.8355555555555556, "f1": 0.8348214285714286}, "dna_protein_pair": {"accuracy": 0.535, "f1": 0.07920792079207921}, "dna_protein_pair_100": {"accuracy": 0.47, "f1": 0.5391304347826087}, "dna_protein_pair_full": {"accuracy": 0.515, "f1": 0.5}, "dna_protein_pair_rand": {"accuracy": 0.503125, "f1": 0.1657922350472193}, "dna_protein_pair_rand_100": {"accuracy": 0.5275, "f1": 0.6115107913669064}, "dna_protein_pair_rand_full": {"accuracy": 0.6, "f1": 0.627906976744186}}
|
| 56 |
+
{"seed": 64, "en": {"accuracy": 0.889, "f1": 0.880901287553648}, "fr": {"accuracy": 0.8035, "f1": 0.8010126582278481}, "de": {"accuracy": 0.781, "f1": 0.7616974972796517}, "zh": {"accuracy": 0.731, "f1": 0.7126068376068376}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9102777777777777, "f1": 0.9068358811652726}, "dna_sim_pair_150bp": {"accuracy": 0.8365, "f1": 0.8161888701517707}, "dna_sim_pair_50bp": {"accuracy": 0.744, "f1": 0.7282377919320594}, "protein_sim_pair_150bp": {"accuracy": 0.9511111111111111, "f1": 0.9483568075117371}, "protein_sim_pair_450bp": {"accuracy": 0.9361111111111111, "f1": 0.9380053908355795}, "dna_protein_pair": {"accuracy": 0.465, "f1": 0.01834862385321101}, "dna_protein_pair_100": {"accuracy": 0.4875, "f1": 0.0}, "dna_protein_pair_full": {"accuracy": 0.355, "f1": 0.5132075471698113}, "dna_protein_pair_rand": {"accuracy": 0.4825, "f1": 0.011933174224343675}, "dna_protein_pair_rand_100": {"accuracy": 0.508125, "f1": 0.03907203907203907}, "dna_protein_pair_rand_full": {"accuracy": 0.5175, "f1": 0.6284889316650626}}
|
| 57 |
+
{"seed": 65, "en": {"accuracy": 0.8695, "f1": 0.8658097686375321}, "fr": {"accuracy": 0.784, "f1": 0.7886497064579256}, "de": {"accuracy": 0.7495, "f1": 0.7583212735166426}, "zh": {"accuracy": 0.6915, "f1": 0.6982885085574572}, "dna_sim_pair_simple_150bp": {"accuracy": 0.8608333333333333, "f1": 0.8720959918304825}, "dna_sim_pair_150bp": {"accuracy": 0.7205, "f1": 0.7071765322158198}, "dna_sim_pair_50bp": {"accuracy": 0.572, "f1": 0.672782874617737}, "protein_sim_pair_150bp": {"accuracy": 0.8488888888888889, "f1": 0.8667972575905974}, "protein_sim_pair_450bp": {"accuracy": 0.8194444444444444, "f1": 0.8369292523833417}, "dna_protein_pair": {"accuracy": 0.4525, "f1": 0.2912621359223301}, "dna_protein_pair_100": {"accuracy": 0.4975, "f1": 0.4401114206128134}, "dna_protein_pair_full": {"accuracy": 0.4275, "f1": 0.5888689407540395}, "dna_protein_pair_rand": {"accuracy": 0.5175, "f1": 0.3723577235772358}, "dna_protein_pair_rand_100": {"accuracy": 0.47875, "f1": 0.39124087591240875}, "dna_protein_pair_rand_full": {"accuracy": 0.443125, "f1": 0.57225156024964}}
|
| 58 |
+
{"seed": 70, "en": {"accuracy": 0.8565, "f1": 0.8507540301612064}, "fr": {"accuracy": 0.7905, "f1": 0.7951100244498778}, "de": {"accuracy": 0.7395, "f1": 0.7486734201640135}, "zh": {"accuracy": 0.6855, "f1": 0.70648623425105}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9052777777777777, "f1": 0.9097644879597777}, "dna_sim_pair_150bp": {"accuracy": 0.8715, "f1": 0.8703983862834089}, "dna_sim_pair_50bp": {"accuracy": 0.6345, "f1": 0.716116504854369}, "protein_sim_pair_150bp": {"accuracy": 0.9583333333333334, "f1": 0.9593054801953337}, "protein_sim_pair_450bp": {"accuracy": 0.8833333333333333, "f1": 0.8939393939393939}, "dna_protein_pair": {"accuracy": 0.5, "f1": 0.0196078431372549}, "dna_protein_pair_100": {"accuracy": 0.4525, "f1": 0.05194805194805195}, "dna_protein_pair_full": {"accuracy": 0.3575, "f1": 0.3558897243107769}, "dna_protein_pair_rand": {"accuracy": 0.5, "f1": 0.06103286384976526}, "dna_protein_pair_rand_100": {"accuracy": 0.50125, "f1": 0.052256532066508314}, "dna_protein_pair_rand_full": {"accuracy": 0.481875, "f1": 0.3988397389412618}}
|
| 59 |
+
{"seed": 71, "en": {"accuracy": 0.8805, "f1": 0.8745406824146982}, "fr": {"accuracy": 0.791, "f1": 0.7986512524084779}, "de": {"accuracy": 0.7455, "f1": 0.7544621321755909}, "zh": {"accuracy": 0.693, "f1": 0.7056567593480345}, "dna_sim_pair_simple_150bp": {"accuracy": 0.8836111111111111, "f1": 0.8835140394773422}, "dna_sim_pair_150bp": {"accuracy": 0.7385, "f1": 0.691627358490566}, "dna_sim_pair_50bp": {"accuracy": 0.5345, "f1": 0.6803982148987299}, "protein_sim_pair_150bp": {"accuracy": 0.84, "f1": 0.8526100307062436}, "protein_sim_pair_450bp": {"accuracy": 0.8827777777777778, "f1": 0.8837465564738292}, "dna_protein_pair": {"accuracy": 0.4825, "f1": 0.15510204081632653}, "dna_protein_pair_100": {"accuracy": 0.5125, "f1": 0.04878048780487805}, "dna_protein_pair_full": {"accuracy": 0.525, "f1": 0.15178571428571427}, "dna_protein_pair_rand": {"accuracy": 0.515, "f1": 0.16379310344827586}, "dna_protein_pair_rand_100": {"accuracy": 0.508125, "f1": 0.05748502994011976}, "dna_protein_pair_rand_full": {"accuracy": 0.495625, "f1": 0.23796033994334279}}
|
| 60 |
+
{"seed": 72, "en": {"accuracy": 0.8655, "f1": 0.859968766267569}, "fr": {"accuracy": 0.7675, "f1": 0.7797252486972999}, "de": {"accuracy": 0.746, "f1": 0.7502458210422812}, "zh": {"accuracy": 0.6945, "f1": 0.6982716049382716}, "dna_sim_pair_simple_150bp": {"accuracy": 0.8830555555555556, "f1": 0.8906209405040271}, "dna_sim_pair_150bp": {"accuracy": 0.73675, "f1": 0.7142469470827679}, "dna_sim_pair_50bp": {"accuracy": 0.548, "f1": 0.6814658210007047}, "protein_sim_pair_150bp": {"accuracy": 0.7616666666666667, "f1": 0.8054421768707483}, "protein_sim_pair_450bp": {"accuracy": 0.8461111111111111, "f1": 0.8607340372046255}, "dna_protein_pair": {"accuracy": 0.5325, "f1": 0.32974910394265233}, "dna_protein_pair_100": {"accuracy": 0.5, "f1": 0.2857142857142857}, "dna_protein_pair_full": {"accuracy": 0.4125, "f1": 0.5607476635514018}, "dna_protein_pair_rand": {"accuracy": 0.55, "f1": 0.40298507462686567}, "dna_protein_pair_rand_100": {"accuracy": 0.501875, "f1": 0.25583566760037346}, "dna_protein_pair_rand_full": {"accuracy": 0.48875, "f1": 0.5942460317460317}}
|
| 61 |
+
{"seed": 74, "en": {"accuracy": 0.877, "f1": 0.869287991498406}, "fr": {"accuracy": 0.782, "f1": 0.7903846153846154}, "de": {"accuracy": 0.7595, "f1": 0.7608155146693187}, "zh": {"accuracy": 0.6975, "f1": 0.7128618889416232}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9369444444444445, "f1": 0.9369969469886206}, "dna_sim_pair_150bp": {"accuracy": 0.80925, "f1": 0.7794160161896502}, "dna_sim_pair_50bp": {"accuracy": 0.7695, "f1": 0.7709885742672627}, "protein_sim_pair_150bp": {"accuracy": 0.9572222222222222, "f1": 0.9574820541137493}, "protein_sim_pair_450bp": {"accuracy": 0.9544444444444444, "f1": 0.9542920847268673}, "dna_protein_pair": {"accuracy": 0.4875, "f1": 0.019138755980861243}, "dna_protein_pair_100": {"accuracy": 0.5275, "f1": 0.07804878048780488}, "dna_protein_pair_full": {"accuracy": 0.3375, "f1": 0.19939577039274925}, "dna_protein_pair_rand": {"accuracy": 0.506875, "f1": 0.04825090470446321}, "dna_protein_pair_rand_100": {"accuracy": 0.478125, "f1": 0.043528064146620846}, "dna_protein_pair_rand_full": {"accuracy": 0.505, "f1": 0.1885245901639344}}
|
| 62 |
+
{"seed": 75, "en": {"accuracy": 0.875, "f1": 0.8674443266171792}, "fr": {"accuracy": 0.809, "f1": 0.8006263048016702}, "de": {"accuracy": 0.786, "f1": 0.7780082987551867}, "zh": {"accuracy": 0.719, "f1": 0.7054507337526206}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9375, "f1": 0.9343832020997376}, "dna_sim_pair_150bp": {"accuracy": 0.714, "f1": 0.6030534351145038}, "dna_sim_pair_50bp": {"accuracy": 0.679, "f1": 0.5776315789473684}, "protein_sim_pair_150bp": {"accuracy": 0.9161111111111111, "f1": 0.91101944608132}, "protein_sim_pair_450bp": {"accuracy": 0.78, "f1": 0.7117903930131004}, "dna_protein_pair": {"accuracy": 0.51, "f1": 0.0}, "dna_protein_pair_100": {"accuracy": 0.475, "f1": 0.0}, "dna_protein_pair_full": {"accuracy": 0.5125, "f1": 0.0}, "dna_protein_pair_rand": {"accuracy": 0.5125, "f1": 0.0025575447570332483}, "dna_protein_pair_rand_100": {"accuracy": 0.50375, "f1": 0.0}, "dna_protein_pair_rand_full": {"accuracy": 0.48875, "f1": 0.0}}
|
| 63 |
+
{"seed": 78, "en": {"accuracy": 0.873, "f1": 0.8678459937565036}, "fr": {"accuracy": 0.7815, "f1": 0.7865168539325843}, "de": {"accuracy": 0.761, "f1": 0.7663734115347018}, "zh": {"accuracy": 0.6985, "f1": 0.6962216624685138}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9330555555555555, "f1": 0.9337365960956833}, "dna_sim_pair_150bp": {"accuracy": 0.79225, "f1": 0.7590605972745723}, "dna_sim_pair_50bp": {"accuracy": 0.7325, "f1": 0.7722435078756917}, "protein_sim_pair_150bp": {"accuracy": 0.9322222222222222, "f1": 0.936656282450675}, "protein_sim_pair_450bp": {"accuracy": 0.8888888888888888, "f1": 0.8932764140875133}, "dna_protein_pair": {"accuracy": 0.5325, "f1": 0.19047619047619047}, "dna_protein_pair_100": {"accuracy": 0.5225, "f1": 0.3696369636963696}, "dna_protein_pair_full": {"accuracy": 0.555, "f1": 0.22608695652173913}, "dna_protein_pair_rand": {"accuracy": 0.54, "f1": 0.28957528957528955}, "dna_protein_pair_rand_100": {"accuracy": 0.5225, "f1": 0.37886178861788616}, "dna_protein_pair_rand_full": {"accuracy": 0.573125, "f1": 0.32841691248770893}}
|
| 64 |
+
{"seed": 80, "en": {"accuracy": 0.8745, "f1": 0.8692027097446586}, "fr": {"accuracy": 0.779, "f1": 0.790521327014218}, "de": {"accuracy": 0.7435, "f1": 0.7597189695550352}, "zh": {"accuracy": 0.6935, "f1": 0.7131492746841367}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9038888888888889, "f1": 0.911327524346489}, "dna_sim_pair_150bp": {"accuracy": 0.80425, "f1": 0.796887159533074}, "dna_sim_pair_50bp": {"accuracy": 0.518, "f1": 0.6612789880534083}, "protein_sim_pair_150bp": {"accuracy": 0.925, "f1": 0.9280767181672882}, "protein_sim_pair_450bp": {"accuracy": 0.9211111111111111, "f1": 0.925026399155227}, "dna_protein_pair": {"accuracy": 0.505, "f1": 0.1}, "dna_protein_pair_100": {"accuracy": 0.5125, "f1": 0.029850746268656716}, "dna_protein_pair_full": {"accuracy": 0.445, "f1": 0.034782608695652174}, "dna_protein_pair_rand": {"accuracy": 0.53, "f1": 0.12761020881670534}, "dna_protein_pair_rand_100": {"accuracy": 0.5125, "f1": 0.060240963855421686}, "dna_protein_pair_rand_full": {"accuracy": 0.52, "f1": 0.13122171945701358}}
|
| 65 |
+
{"seed": 82, "en": {"accuracy": 0.8885, "f1": 0.8778082191780822}, "fr": {"accuracy": 0.801, "f1": 0.7903055848261328}, "de": {"accuracy": 0.781, "f1": 0.7632432432432432}, "zh": {"accuracy": 0.704, "f1": 0.6740088105726872}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9244444444444444, "f1": 0.9238095238095239}, "dna_sim_pair_150bp": {"accuracy": 0.74225, "f1": 0.6752755905511811}, "dna_sim_pair_50bp": {"accuracy": 0.8045, "f1": 0.791243993593166}, "protein_sim_pair_150bp": {"accuracy": 0.91, "f1": 0.9111842105263158}, "protein_sim_pair_450bp": {"accuracy": 0.7411111111111112, "f1": 0.6732117812061711}, "dna_protein_pair": {"accuracy": 0.4825, "f1": 0.18823529411764706}, "dna_protein_pair_100": {"accuracy": 0.5025, "f1": 0.019704433497536946}, "dna_protein_pair_full": {"accuracy": 0.4725, "f1": 0.04524886877828054}, "dna_protein_pair_rand": {"accuracy": 0.506875, "f1": 0.207035175879397}, "dna_protein_pair_rand_100": {"accuracy": 0.496875, "f1": 0.026602176541717048}, "dna_protein_pair_rand_full": {"accuracy": 0.503125, "f1": 0.10573678290213723}}
|
| 66 |
+
{"seed": 84, "en": {"accuracy": 0.877, "f1": 0.8625698324022346}, "fr": {"accuracy": 0.8055, "f1": 0.7949393779652082}, "de": {"accuracy": 0.7695, "f1": 0.7473972602739726}, "zh": {"accuracy": 0.7275, "f1": 0.6980609418282548}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9108333333333334, "f1": 0.906875543951262}, "dna_sim_pair_150bp": {"accuracy": 0.7395, "f1": 0.6731493099121706}, "dna_sim_pair_50bp": {"accuracy": 0.6585, "f1": 0.6499231163505894}, "protein_sim_pair_150bp": {"accuracy": 0.9388888888888889, "f1": 0.936046511627907}, "protein_sim_pair_450bp": {"accuracy": 0.8927777777777778, "f1": 0.8786926461345066}, "dna_protein_pair": {"accuracy": 0.5325, "f1": 0.0}, "dna_protein_pair_100": {"accuracy": 0.48, "f1": 0.0}, "dna_protein_pair_full": {"accuracy": 0.49, "f1": 0.009708737864077669}, "dna_protein_pair_rand": {"accuracy": 0.48875, "f1": 0.0024390243902439024}, "dna_protein_pair_rand_100": {"accuracy": 0.475, "f1": 0.0023752969121140144}, "dna_protein_pair_rand_full": {"accuracy": 0.504375, "f1": 0.0457280385078219}}
|
| 67 |
+
{"seed": 85, "en": {"accuracy": 0.873, "f1": 0.8641711229946524}, "fr": {"accuracy": 0.787, "f1": 0.7901477832512315}, "de": {"accuracy": 0.7615, "f1": 0.7587253414264037}, "zh": {"accuracy": 0.718, "f1": 0.7113613101330604}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9155555555555556, "f1": 0.9143661971830986}, "dna_sim_pair_150bp": {"accuracy": 0.742, "f1": 0.6957547169811321}, "dna_sim_pair_50bp": {"accuracy": 0.584, "f1": 0.6797536566589685}, "protein_sim_pair_150bp": {"accuracy": 0.9544444444444444, "f1": 0.9522144522144522}, "protein_sim_pair_450bp": {"accuracy": 0.9683333333333334, "f1": 0.9693383539537386}, "dna_protein_pair": {"accuracy": 0.51, "f1": 0.02}, "dna_protein_pair_100": {"accuracy": 0.5075, "f1": 0.0}, "dna_protein_pair_full": {"accuracy": 0.51, "f1": 0.10091743119266056}, "dna_protein_pair_rand": {"accuracy": 0.498125, "f1": 0.02902055622732769}, "dna_protein_pair_rand_100": {"accuracy": 0.513125, "f1": 0.015170670037926675}, "dna_protein_pair_rand_full": {"accuracy": 0.48875, "f1": 0.06407322654462243}}
|
| 68 |
+
{"seed": 86, "en": {"accuracy": 0.893, "f1": 0.8830601092896175}, "fr": {"accuracy": 0.816, "f1": 0.811088295687885}, "de": {"accuracy": 0.78, "f1": 0.7676874340021119}, "zh": {"accuracy": 0.7075, "f1": 0.7147732813261823}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9316666666666666, "f1": 0.9290657439446367}, "dna_sim_pair_150bp": {"accuracy": 0.7735, "f1": 0.7175810473815462}, "dna_sim_pair_50bp": {"accuracy": 0.663, "f1": 0.5818858560794045}, "protein_sim_pair_150bp": {"accuracy": 0.9588888888888889, "f1": 0.9593852908891328}, "protein_sim_pair_450bp": {"accuracy": 0.9661111111111111, "f1": 0.9651627641347801}, "dna_protein_pair": {"accuracy": 0.4875, "f1": 0.0}, "dna_protein_pair_100": {"accuracy": 0.4375, "f1": 0.0}, "dna_protein_pair_full": {"accuracy": 0.47, "f1": 0.0}, "dna_protein_pair_rand": {"accuracy": 0.495625, "f1": 0.014652014652014652}, "dna_protein_pair_rand_100": {"accuracy": 0.5075, "f1": 0.012531328320802004}, "dna_protein_pair_rand_full": {"accuracy": 0.490625, "f1": 0.033214709371293}}
|
| 69 |
+
{"seed": 87, "en": {"accuracy": 0.875, "f1": 0.8626373626373627}, "fr": {"accuracy": 0.7955, "f1": 0.785751702462022}, "de": {"accuracy": 0.769, "f1": 0.7494577006507592}, "zh": {"accuracy": 0.7205, "f1": 0.7009095773140717}, "dna_sim_pair_simple_150bp": {"accuracy": 0.8622222222222222, "f1": 0.8702250130821559}, "dna_sim_pair_150bp": {"accuracy": 0.6625, "f1": 0.6184284906726965}, "dna_sim_pair_50bp": {"accuracy": 0.477, "f1": 0.6168498168498169}, "protein_sim_pair_150bp": {"accuracy": 0.91, "f1": 0.9073226544622426}, "protein_sim_pair_450bp": {"accuracy": 0.9061111111111111, "f1": 0.8989838613269575}, "dna_protein_pair": {"accuracy": 0.5025, "f1": 0.0}, "dna_protein_pair_100": {"accuracy": 0.5075, "f1": 0.010050251256281407}, "dna_protein_pair_full": {"accuracy": 0.5275, "f1": 0.05025125628140704}, "dna_protein_pair_rand": {"accuracy": 0.5175, "f1": 0.02278481012658228}, "dna_protein_pair_rand_100": {"accuracy": 0.525625, "f1": 0.002628120893561104}, "dna_protein_pair_rand_full": {"accuracy": 0.533125, "f1": 0.16722408026755853}}
|
| 70 |
+
{"seed": 88, "en": {"accuracy": 0.8645, "f1": 0.8571428571428571}, "fr": {"accuracy": 0.7735, "f1": 0.7814761215629522}, "de": {"accuracy": 0.7525, "f1": 0.7619047619047619}, "zh": {"accuracy": 0.7115, "f1": 0.7153428712382832}, "dna_sim_pair_simple_150bp": {"accuracy": 0.8080555555555555, "f1": 0.8328899637243047}, "dna_sim_pair_150bp": {"accuracy": 0.6895, "f1": 0.6937869822485208}, "dna_sim_pair_50bp": {"accuracy": 0.455, "f1": 0.6164672765657987}, "protein_sim_pair_150bp": {"accuracy": 0.9144444444444444, "f1": 0.9182590233545648}, "protein_sim_pair_450bp": {"accuracy": 0.9305555555555556, "f1": 0.9279538904899135}, "dna_protein_pair": {"accuracy": 0.5325, "f1": 0.30996309963099633}, "dna_protein_pair_100": {"accuracy": 0.51, "f1": 0.04854368932038835}, "dna_protein_pair_full": {"accuracy": 0.505, "f1": 0.038834951456310676}, "dna_protein_pair_rand": {"accuracy": 0.531875, "f1": 0.30840258541089566}, "dna_protein_pair_rand_100": {"accuracy": 0.495625, "f1": 0.026537997587454766}, "dna_protein_pair_rand_full": {"accuracy": 0.4875, "f1": 0.07029478458049887}}
|
| 71 |
+
{"seed": 89, "en": {"accuracy": 0.894, "f1": 0.887592788971368}, "fr": {"accuracy": 0.8105, "f1": 0.8130241736556487}, "de": {"accuracy": 0.768, "f1": 0.7705242334322453}, "zh": {"accuracy": 0.707, "f1": 0.7081673306772909}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9283333333333333, "f1": 0.9263278126784694}, "dna_sim_pair_150bp": {"accuracy": 0.818, "f1": 0.7875072971395213}, "dna_sim_pair_50bp": {"accuracy": 0.6355, "f1": 0.6826295167609926}, "protein_sim_pair_150bp": {"accuracy": 0.9477777777777778, "f1": 0.9482948294829483}, "protein_sim_pair_450bp": {"accuracy": 0.9522222222222222, "f1": 0.9523281596452328}, "dna_protein_pair": {"accuracy": 0.5375, "f1": 0.0106951871657754}, "dna_protein_pair_100": {"accuracy": 0.5, "f1": 0.0}, "dna_protein_pair_full": {"accuracy": 0.4425, "f1": 0.026200873362445413}, "dna_protein_pair_rand": {"accuracy": 0.498125, "f1": 0.02902055622732769}, "dna_protein_pair_rand_100": {"accuracy": 0.5025, "f1": 0.01240694789081886}, "dna_protein_pair_rand_full": {"accuracy": 0.51125, "f1": 0.0645933014354067}}
|
| 72 |
+
{"seed": 90, "en": {"accuracy": 0.862, "f1": 0.8517722878625135}, "fr": {"accuracy": 0.7765, "f1": 0.7788223651657595}, "de": {"accuracy": 0.737, "f1": 0.7429130009775171}, "zh": {"accuracy": 0.707, "f1": 0.7101879327398615}, "dna_sim_pair_simple_150bp": {"accuracy": 0.8041666666666667, "f1": 0.8267387564512165}, "dna_sim_pair_150bp": {"accuracy": 0.6845, "f1": 0.6996668253212756}, "dna_sim_pair_50bp": {"accuracy": 0.493, "f1": 0.656271186440678}, "protein_sim_pair_150bp": {"accuracy": 0.8527777777777777, "f1": 0.8654139156932453}, "protein_sim_pair_450bp": {"accuracy": 0.945, "f1": 0.9473124002128792}, "dna_protein_pair": {"accuracy": 0.54, "f1": 0.21367521367521367}, "dna_protein_pair_100": {"accuracy": 0.4875, "f1": 0.00966183574879227}, "dna_protein_pair_full": {"accuracy": 0.3525, "f1": 0.4746450304259635}, "dna_protein_pair_rand": {"accuracy": 0.580625, "f1": 0.42204995693367786}, "dna_protein_pair_rand_100": {"accuracy": 0.51, "f1": 0.022443890274314215}, "dna_protein_pair_rand_full": {"accuracy": 0.459375, "f1": 0.5361930294906166}}
|
finetune/gpt2_gene_multiv1_ft_en_test_others.py
ADDED
|
@@ -0,0 +1,342 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# import os
|
| 2 |
+
|
| 3 |
+
# # 设置环境变量
|
| 4 |
+
# os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
|
| 5 |
+
|
| 6 |
+
# # 打印环境变量以确认设置成功
|
| 7 |
+
# print(os.environ.get('HF_ENDPOINT'))
|
| 8 |
+
|
| 9 |
+
# import subprocess
|
| 10 |
+
# import os
|
| 11 |
+
|
| 12 |
+
# result = subprocess.run('bash -c "source /etc/network_turbo && env | grep proxy"', shell=True, capture_output=True, text=True)
|
| 13 |
+
# output = result.stdout
|
| 14 |
+
# for line in output.splitlines():
|
| 15 |
+
# if '=' in line:
|
| 16 |
+
# var, value = line.split('=', 1)
|
| 17 |
+
# os.environ[var] = value
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
from datasets import load_dataset
|
| 21 |
+
from transformers import AutoTokenizer, DataCollatorWithPadding
|
| 22 |
+
from transformers import Trainer
|
| 23 |
+
import evaluate
|
| 24 |
+
import numpy as np
|
| 25 |
+
from transformers import TrainingArguments
|
| 26 |
+
from transformers import AutoModelForSequenceClassification
|
| 27 |
+
import json
|
| 28 |
+
from transformers import set_seed
|
| 29 |
+
import random
|
| 30 |
+
import numpy as np
|
| 31 |
+
import torch
|
| 32 |
+
|
| 33 |
+
# seed = 42
|
| 34 |
+
# random.seed(seed)
|
| 35 |
+
# np.random.seed(seed)
|
| 36 |
+
# torch.manual_seed(seed)
|
| 37 |
+
# torch.cuda.manual_seed_all(seed)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
# 动态生成随机种子
|
| 41 |
+
import random
|
| 42 |
+
seed = random.randint(0, 10000)
|
| 43 |
+
#print(f"Generated seed: {seed}")
|
| 44 |
+
set_seed(seed)
|
| 45 |
+
result = {}
|
| 46 |
+
result["seed"] = seed
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
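# Load the English PAWS-X paraphrase-pair dataset from the Hugging Face Hub (not a local JSON file); its train/validation/test splits are used below.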
|
| 51 |
+
raw_datasets = load_dataset('google-research-datasets/paws-x', 'en') # 或者指定特定语言如 'zh' 表示中文,https://huggingface.co/datasets/google-research-datasets/paws-x
|
| 52 |
+
|
| 53 |
+
#分词器
|
| 54 |
+
tokenizer = AutoTokenizer.from_pretrained("dnagpt/gpt2_gene_multi_v1")
|
| 55 |
+
tokenizer.pad_token = tokenizer.eos_token
|
| 56 |
+
|
| 57 |
+
# 修改分词器的填充方向为左侧,默认有右侧,分类问题建议左侧
|
| 58 |
+
#tokenizer.padding_side = "left"
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
#分词函数
|
| 62 |
+
def tokenize_function(example):
    """Tokenize a sentence pair (or a batch of pairs) for the pair classifier.

    Args:
        example: Mapping that provides "sentence1" and "sentence2"; both
            values are handed unchanged to the module-level ``tokenizer``.

    Returns:
        Whatever ``tokenizer`` returns when asked to truncate the pair and
        pad it to ``max_length=256``.
    """
    # Alternative kept from the original (truncation only, no fixed-length padding):
    #return tokenizer(example["sentence1"], example["sentence2"], truncation=True,max_length=256)
    first, second = example["sentence1"], example["sentence2"]
    encoding_options = {"truncation": True, "max_length": 256, "padding": "max_length"}
    return tokenizer(first, second, **encoding_options)
|
| 65 |
+
#return tokenizer(example["sentence1"], example["sentence2"], truncation=True,max_length=1024) #padding="max_length")
|
| 66 |
+
|
| 67 |
+
#构建分词后的数据集
|
| 68 |
+
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
|
| 69 |
+
|
| 70 |
+
#训练数据构建
|
| 71 |
+
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
#指标函数定义
|
| 75 |
+
def compute_metrics(eval_pred):
    """Accuracy callback passed to the Trainer as ``compute_metrics``.

    Args:
        eval_pred: A pair ``(scores, labels)``. ``scores`` is reduced with
            ``argmax`` over axis 1 (presumably per-class logits, one row per
            example); ``labels`` holds the reference class ids.

    Returns:
        dict with the single key ``'accuracy'``: the fraction of examples
        whose arg-max class equals the label.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=1)
    n_correct = (predicted == gold).sum()
    return {'accuracy': n_correct / len(gold)}
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
training_args = TrainingArguments(
|
| 83 |
+
output_dir="ds_job_dna_2222",
|
| 84 |
+
learning_rate=1e-5,
|
| 85 |
+
lr_scheduler_type="constant_with_warmup",
|
| 86 |
+
warmup_ratio=0.1,
|
| 87 |
+
optim='adamw_torch',
|
| 88 |
+
weight_decay=0.0,
|
| 89 |
+
seed=seed, # 使用动态生成的随机种子
|
| 90 |
+
per_device_train_batch_size=20,
|
| 91 |
+
per_device_eval_batch_size=20,
|
| 92 |
+
num_train_epochs=4, #训练多少轮
|
| 93 |
+
evaluation_strategy="epoch",
|
| 94 |
+
save_strategy="epoch",
|
| 95 |
+
logging_strategy="epoch",
|
| 96 |
+
load_best_model_at_end=True
|
| 97 |
+
)
|
| 98 |
+
|
| 99 |
+
#模型定义,文本分类模型
|
| 100 |
+
model = AutoModelForSequenceClassification.from_pretrained("dnagpt/gpt2_gene_multi_v1", num_labels=2)
# Take the pad id from the tokenizer (its pad token was set to its EOS token above),
# so the classifier looks for exactly the id that padding writes into input_ids,
# even if the checkpoint config's eos_token_id differs from the tokenizer's.
model.config.pad_token_id = tokenizer.pad_token_id
|
| 102 |
+
|
| 103 |
+
trainer = Trainer(
|
| 104 |
+
model,
|
| 105 |
+
training_args,
|
| 106 |
+
train_dataset=tokenized_datasets["train"],
|
| 107 |
+
eval_dataset=tokenized_datasets["validation"],
|
| 108 |
+
data_collator=data_collator,
|
| 109 |
+
tokenizer=tokenizer,
|
| 110 |
+
compute_metrics=compute_metrics,
|
| 111 |
+
)
|
| 112 |
+
|
| 113 |
+
trainer.train() #模型训练
|
| 114 |
+
|
| 115 |
+
#模型测试,英文数据集
|
| 116 |
+
predictions = trainer.predict(tokenized_datasets["test"])
|
| 117 |
+
preds = np.argmax(predictions.predictions, axis=-1)
|
| 118 |
+
metric = evaluate.load("glue", "mrpc")
|
| 119 |
+
ret = metric.compute(predictions=preds, references=predictions.label_ids)
|
| 120 |
+
result["en"] = ret
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
#模型测试,法文数据集
|
| 124 |
+
raw_datasets_fr = load_dataset('google-research-datasets/paws-x', 'fr')  # French PAWS-X; same namespaced Hub id as the en/de/zh loads in this script, https://huggingface.co/datasets/google-research-datasets/paws-x
|
| 125 |
+
tokenized_datasets_fr = raw_datasets_fr.map(tokenize_function, batched=True)
|
| 126 |
+
|
| 127 |
+
predictions = trainer.predict(tokenized_datasets_fr["test"])
|
| 128 |
+
preds = np.argmax(predictions.predictions, axis=-1)
|
| 129 |
+
metric = evaluate.load("glue", "mrpc")
|
| 130 |
+
ret = metric.compute(predictions=preds, references=predictions.label_ids)
|
| 131 |
+
result["fr"] = ret
|
| 132 |
+
|
| 133 |
+
#模型测试,德文数据集
|
| 134 |
+
raw_datasets_de = load_dataset('google-research-datasets/paws-x', 'de') # 或者指定特定语言如 'zh' 表示中文,https://huggingface.co/datasets/google-research-datasets/paws-
|
| 135 |
+
tokenized_datasets_de = raw_datasets_de.map(tokenize_function, batched=True)
|
| 136 |
+
predictions = trainer.predict(tokenized_datasets_de["test"])
|
| 137 |
+
preds = np.argmax(predictions.predictions, axis=-1)
|
| 138 |
+
metric = evaluate.load("glue", "mrpc")
|
| 139 |
+
ret = metric.compute(predictions=preds, references=predictions.label_ids)
|
| 140 |
+
result["de"] = ret
|
| 141 |
+
|
| 142 |
+
#模型测试,中文数据集
|
| 143 |
+
raw_datasets_zh = load_dataset('google-research-datasets/paws-x', 'zh') # 或者指定特定语言如 'zh' 表示中文,https://huggingface.co/datasets/google-research-datasets/paws-
|
| 144 |
+
tokenized_datasets_zh = raw_datasets_zh.map(tokenize_function, batched=True)
|
| 145 |
+
|
| 146 |
+
predictions = trainer.predict(tokenized_datasets_zh["test"])
|
| 147 |
+
preds = np.argmax(predictions.predictions, axis=-1)
|
| 148 |
+
metric = evaluate.load("glue", "mrpc")
|
| 149 |
+
ret = metric.compute(predictions=preds, references=predictions.label_ids)
|
| 150 |
+
result["zh"] = ret
|
| 151 |
+
|
| 152 |
+
#模型测试 dna数据集,150 bp长度 简单版本
|
| 153 |
+
raw_datasets_dna =load_dataset('dnagpt/gene_lan_transfer', 'dna_sim_pair_simple_150bp')['train'].train_test_split(test_size=0.2) #默认已经shuffle
|
| 154 |
+
tokenized_datasets_dna = raw_datasets_dna.map(tokenize_function, batched=True)
|
| 155 |
+
predictions = trainer.predict(tokenized_datasets_dna["test"])
|
| 156 |
+
preds = np.argmax(predictions.predictions, axis=-1)
|
| 157 |
+
metric = evaluate.load("glue", "mrpc")
|
| 158 |
+
ret = metric.compute(predictions=preds, references=predictions.label_ids)
|
| 159 |
+
result["dna_sim_pair_simple_150bp"] = ret
|
| 160 |
+
|
| 161 |
+
#模型测试 dna数据集,150长度,复杂版本 不相似
|
| 162 |
+
raw_datasets_dna = load_dataset('dnagpt/gene_lan_transfer', 'dna_sim_pair_150bp')['train'].train_test_split(test_size=0.2) #默认已经shuffle
|
| 163 |
+
tokenized_datasets_dna= raw_datasets_dna.map(tokenize_function, batched=True)
|
| 164 |
+
|
| 165 |
+
predictions = trainer.predict(tokenized_datasets_dna["test"])
|
| 166 |
+
preds = np.argmax(predictions.predictions, axis=-1)
|
| 167 |
+
metric = evaluate.load("glue", "mrpc")
|
| 168 |
+
ret = metric.compute(predictions=preds, references=predictions.label_ids)
|
| 169 |
+
result["dna_sim_pair_150bp"] = ret
|
| 170 |
+
|
| 171 |
+
#模型测试 dna数据集,50长度,复杂版本 不相似
|
| 172 |
+
raw_datasets_dna = load_dataset('dnagpt/gene_lan_transfer', 'dna_sim_pair_50bp')['train'].train_test_split(test_size=0.1) #默认已经shuffle
|
| 173 |
+
tokenized_datasets_dna = raw_datasets_dna.map(tokenize_function, batched=True)
|
| 174 |
+
predictions = trainer.predict(tokenized_datasets_dna["test"])
|
| 175 |
+
preds = np.argmax(predictions.predictions, axis=-1)
|
| 176 |
+
metric = evaluate.load("glue", "mrpc")
|
| 177 |
+
ret = metric.compute(predictions=preds, references=predictions.label_ids)
|
| 178 |
+
result["dna_sim_pair_50bp"] = ret
|
| 179 |
+
|
| 180 |
+
#模型测试 蛋白质数据集,50长度/150bp,复杂版本 不相似
|
| 181 |
+
raw_datasets_dna_protein = load_dataset('dnagpt/gene_lan_transfer', 'protein_sim_pair_150bp')['train'].train_test_split(test_size=0.1) #默认已经shuffle
|
| 182 |
+
tokenized_datasets_dna_protein = raw_datasets_dna_protein.map(tokenize_function, batched=True)
|
| 183 |
+
predictions = trainer.predict(tokenized_datasets_dna_protein["test"])
|
| 184 |
+
preds = np.argmax(predictions.predictions, axis=-1)
|
| 185 |
+
metric = evaluate.load("glue", "mrpc")
|
| 186 |
+
ret = metric.compute(predictions=preds, references=predictions.label_ids)
|
| 187 |
+
result["protein_sim_pair_150bp"] = ret
|
| 188 |
+
|
| 189 |
+
#模型测试 蛋白质数据集,150长度/450bp,复杂版本 不相似
|
| 190 |
+
|
| 191 |
+
raw_datasets_dna_protein = load_dataset('dnagpt/gene_lan_transfer', 'protein_sim_pair_450bp')['train'].train_test_split(test_size=0.1) #默认已经shuffle
|
| 192 |
+
tokenized_datasets_dna_protein = raw_datasets_dna_protein.map(tokenize_function, batched=True)
|
| 193 |
+
predictions = trainer.predict(tokenized_datasets_dna_protein["test"])
|
| 194 |
+
preds = np.argmax(predictions.predictions, axis=-1)
|
| 195 |
+
metric = evaluate.load("glue", "mrpc")
|
| 196 |
+
ret = metric.compute(predictions=preds, references=predictions.label_ids)
|
| 197 |
+
result["protein_sim_pair_450bp"] = ret
|
| 198 |
+
|
| 199 |
+
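# Model test: DNA-protein pair dataset (dna_protein_pair); sentence1 cut to 150 characters, sentence2 to 50 characters, labels flipped (1 - label)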
|
| 200 |
+
raw_datasets_dna_protein = load_dataset('dnagpt/gene_lan_transfer', 'dna_protein_pair')['train'].train_test_split(test_size=0.1) #默认已经shuffle
|
| 201 |
+
|
| 202 |
+
# 定义翻转标签的函数
|
| 203 |
+
def flip_labels(example, max_len1=150, max_len2=50):
    """Truncate both sequences of a pair and invert its label.

    Args:
        example: Mapping with sliceable "sentence1" / "sentence2" values and
            a numeric "label" (assumed to be 0 or 1 -- confirm against the
            dataset). It is modified in place.
        max_len1: Number of leading characters of "sentence1" to keep
            (default 150); ``None`` keeps the whole value.
        max_len2: Number of leading characters of "sentence2" to keep
            (default 50); ``None`` keeps the whole value.

    Returns:
        The same ``example`` object, with both sequences truncated and
        ``label`` replaced by ``1 - label``.
    """
    # Truncation is by characters and happens before tokenization. The original
    # note says an over-long DNA string can make the tokenizer emit a single
    # unknown token -- not verified here.
    example["sentence1"] = example["sentence1"][:max_len1]
    example["sentence2"] = example["sentence2"][:max_len2]
    example['label'] = 1 - example['label']
    return example
|
| 209 |
+
|
| 210 |
+
# 应用翻转标签函数
|
| 211 |
+
flipped_datasets_dna_protein = raw_datasets_dna_protein.map(flip_labels, batched=False)
|
| 212 |
+
|
| 213 |
+
tokenized_datasets_dna_protein = flipped_datasets_dna_protein.map(tokenize_function, batched=True)
|
| 214 |
+
predictions = trainer.predict(tokenized_datasets_dna_protein["test"])
|
| 215 |
+
preds = np.argmax(predictions.predictions, axis=-1)
|
| 216 |
+
metric = evaluate.load("glue", "mrpc")
|
| 217 |
+
ret = metric.compute(predictions=preds, references=predictions.label_ids)
|
| 218 |
+
result["dna_protein_pair"] = ret
|
| 219 |
+
|
| 220 |
+
|
| 221 |
+
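# Model test: DNA-protein pair dataset (dna_protein_pair); sentence1 cut to 300 characters, sentence2 to 100 characters, labels flipped (1 - label)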
|
| 222 |
+
raw_datasets_dna_protein = load_dataset('dnagpt/gene_lan_transfer', 'dna_protein_pair')['train'].train_test_split(test_size=0.1) #默认已经shuffle
|
| 223 |
+
|
| 224 |
+
# 定义翻转标签的函数
|
| 225 |
+
def flip_labels(example, max_len1=300, max_len2=100):
    """Truncate both sequences of a pair and invert its label.

    Args:
        example: Mapping with sliceable "sentence1" / "sentence2" values and
            a numeric "label" (assumed to be 0 or 1 -- confirm against the
            dataset). It is modified in place.
        max_len1: Number of leading characters of "sentence1" to keep
            (default 300); ``None`` keeps the whole value.
        max_len2: Number of leading characters of "sentence2" to keep
            (default 100); ``None`` keeps the whole value.

    Returns:
        The same ``example`` object, with both sequences truncated and
        ``label`` replaced by ``1 - label``.
    """
    # Truncation is by characters and happens before tokenization. The original
    # note says an over-long DNA string can make the tokenizer emit a single
    # unknown token -- not verified here.
    example["sentence1"] = example["sentence1"][:max_len1]
    example["sentence2"] = example["sentence2"][:max_len2]
    example['label'] = 1 - example['label']
    return example
|
| 231 |
+
|
| 232 |
+
# 应用翻转标签函数
|
| 233 |
+
flipped_datasets_dna_protein = raw_datasets_dna_protein.map(flip_labels, batched=False)
|
| 234 |
+
|
| 235 |
+
tokenized_datasets_dna_protein = flipped_datasets_dna_protein.map(tokenize_function, batched=True)
|
| 236 |
+
predictions = trainer.predict(tokenized_datasets_dna_protein["test"])
|
| 237 |
+
preds = np.argmax(predictions.predictions, axis=-1)
|
| 238 |
+
metric = evaluate.load("glue", "mrpc")
|
| 239 |
+
ret = metric.compute(predictions=preds, references=predictions.label_ids)
|
| 240 |
+
result["dna_protein_pair_100"] = ret
|
| 241 |
+
|
| 242 |
+
|
| 243 |
+
|
| 244 |
+
|
| 245 |
+
|
| 246 |
+
|
| 247 |
+
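# Model test: DNA-protein pair dataset (dna_protein_pair); full-length sequences (no character truncation), labels flipped (1 - label)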
|
| 248 |
+
raw_datasets_dna_protein = load_dataset('dnagpt/gene_lan_transfer', 'dna_protein_pair')['train'].train_test_split(test_size=0.1) #默认已经shuffle
|
| 249 |
+
|
| 250 |
+
# 定义翻转标签的函数
|
| 251 |
+
def flip_labels(example, max_len1=None, max_len2=None):
    """Invert the label of a pair, optionally truncating its sequences.

    Args:
        example: Mapping with "sentence1" / "sentence2" values and a numeric
            "label" (assumed to be 0 or 1 -- confirm against the dataset).
            It is modified in place.
        max_len1: If given, keep only this many leading characters of
            "sentence1"; the default ``None`` leaves it untouched.
        max_len2: If given, keep only this many leading characters of
            "sentence2"; the default ``None`` leaves it untouched.

    Returns:
        The same ``example`` object with ``label`` replaced by ``1 - label``.
    """
    # This is the "full" variant: by default the sequences are passed through
    # unchanged (the original contained no-op self-assignments here).
    if max_len1 is not None:
        example["sentence1"] = example["sentence1"][:max_len1]
    if max_len2 is not None:
        example["sentence2"] = example["sentence2"][:max_len2]
    example['label'] = 1 - example['label']
    return example
|
| 257 |
+
|
| 258 |
+
# 应用翻转标签函数
|
| 259 |
+
flipped_datasets_dna_protein = raw_datasets_dna_protein.map(flip_labels, batched=False)
|
| 260 |
+
|
| 261 |
+
tokenized_datasets_dna_protein = flipped_datasets_dna_protein.map(tokenize_function, batched=True)
|
| 262 |
+
predictions = trainer.predict(tokenized_datasets_dna_protein["test"])
|
| 263 |
+
preds = np.argmax(predictions.predictions, axis=-1)
|
| 264 |
+
metric = evaluate.load("glue", "mrpc")
|
| 265 |
+
ret = metric.compute(predictions=preds, references=predictions.label_ids)
|
| 266 |
+
result["dna_protein_pair_full"] = ret
|
| 267 |
+
|
| 268 |
+
|
| 269 |
+
|
| 270 |
+
|
| 271 |
+
|
| 272 |
+
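# Model test: DNA-protein pair dataset, random version (dna_protein_pair_rand); sentence1 cut to 150 characters, sentence2 to 50 characters, labels flipped (1 - label)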
|
| 273 |
+
raw_datasets_dna_protein = load_dataset('dnagpt/gene_lan_transfer', 'dna_protein_pair_rand')['train'].train_test_split(test_size=0.1) #默认已经shuffle
|
| 274 |
+
|
| 275 |
+
# 定义翻转标签的函数
|
| 276 |
+
def flip_labels(example, max_len1=150, max_len2=50):
    """Truncate both sequences of a pair and invert its label.

    Args:
        example: Mapping with sliceable "sentence1" / "sentence2" values and
            a numeric "label" (assumed to be 0 or 1 -- confirm against the
            dataset). It is modified in place.
        max_len1: Number of leading characters of "sentence1" to keep
            (default 150); ``None`` keeps the whole value.
        max_len2: Number of leading characters of "sentence2" to keep
            (default 50); ``None`` keeps the whole value.

    Returns:
        The same ``example`` object, with both sequences truncated and
        ``label`` replaced by ``1 - label``.
    """
    # Truncation is by characters and happens before tokenization. The original
    # note says an over-long DNA string can make the tokenizer emit a single
    # unknown token -- not verified here.
    example["sentence1"] = example["sentence1"][:max_len1]
    example["sentence2"] = example["sentence2"][:max_len2]
    example['label'] = 1 - example['label']
    return example
|
| 282 |
+
|
| 283 |
+
# 应用翻转标签函数
|
| 284 |
+
flipped_datasets_dna_protein = raw_datasets_dna_protein.map(flip_labels, batched=False)
|
| 285 |
+
|
| 286 |
+
tokenized_datasets_dna_protein = flipped_datasets_dna_protein.map(tokenize_function, batched=True)
|
| 287 |
+
predictions = trainer.predict(tokenized_datasets_dna_protein["test"])
|
| 288 |
+
preds = np.argmax(predictions.predictions, axis=-1)
|
| 289 |
+
metric = evaluate.load("glue", "mrpc")
|
| 290 |
+
ret = metric.compute(predictions=preds, references=predictions.label_ids)
|
| 291 |
+
result["dna_protein_pair_rand"] = ret
|
| 292 |
+
|
| 293 |
+
|
| 294 |
+
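# Model test: DNA-protein pair dataset, random version (dna_protein_pair_rand); sentence1 cut to 300 characters, sentence2 to 100 characters, labels flipped (1 - label)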
|
| 295 |
+
raw_datasets_dna_protein = load_dataset('dnagpt/gene_lan_transfer', 'dna_protein_pair_rand')['train'].train_test_split(test_size=0.1) #默认已经shuffle
|
| 296 |
+
|
| 297 |
+
# 定义翻转标签的函数
|
| 298 |
+
def flip_labels(example, max_len1=300, max_len2=100):
    """Truncate both sequences of a pair and invert its label.

    Args:
        example: Mapping with sliceable "sentence1" / "sentence2" values and
            a numeric "label" (assumed to be 0 or 1 -- confirm against the
            dataset). It is modified in place.
        max_len1: Number of leading characters of "sentence1" to keep
            (default 300); ``None`` keeps the whole value.
        max_len2: Number of leading characters of "sentence2" to keep
            (default 100); ``None`` keeps the whole value.

    Returns:
        The same ``example`` object, with both sequences truncated and
        ``label`` replaced by ``1 - label``.
    """
    # Truncation is by characters and happens before tokenization. The original
    # note says an over-long DNA string can make the tokenizer emit a single
    # unknown token -- not verified here.
    example["sentence1"] = example["sentence1"][:max_len1]
    example["sentence2"] = example["sentence2"][:max_len2]
    example['label'] = 1 - example['label']
    return example
|
| 304 |
+
|
| 305 |
+
# 应用翻转标签函数
|
| 306 |
+
flipped_datasets_dna_protein = raw_datasets_dna_protein.map(flip_labels, batched=False)
|
| 307 |
+
|
| 308 |
+
tokenized_datasets_dna_protein = flipped_datasets_dna_protein.map(tokenize_function, batched=True)
|
| 309 |
+
predictions = trainer.predict(tokenized_datasets_dna_protein["test"])
|
| 310 |
+
preds = np.argmax(predictions.predictions, axis=-1)
|
| 311 |
+
metric = evaluate.load("glue", "mrpc")
|
| 312 |
+
ret = metric.compute(predictions=preds, references=predictions.label_ids)
|
| 313 |
+
result["dna_protein_pair_rand_100"] = ret
|
| 314 |
+
|
| 315 |
+
|
| 316 |
+
|
| 317 |
+
|
| 318 |
+
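# Model test: DNA-protein pair dataset, random version (dna_protein_pair_rand); full-length sequences (no character truncation), labels flipped (1 - label)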
|
| 319 |
+
raw_datasets_dna_protein = load_dataset('dnagpt/gene_lan_transfer', 'dna_protein_pair_rand')['train'].train_test_split(test_size=0.1) #默认已经shuffle
|
| 320 |
+
|
| 321 |
+
# 定义翻转标签的函数
|
| 322 |
+
def flip_labels(example, max_len1=None, max_len2=None):
    """Invert the label of a pair, optionally truncating its sequences.

    Args:
        example: Mapping with "sentence1" / "sentence2" values and a numeric
            "label" (assumed to be 0 or 1 -- confirm against the dataset).
            It is modified in place.
        max_len1: If given, keep only this many leading characters of
            "sentence1"; the default ``None`` leaves it untouched.
        max_len2: If given, keep only this many leading characters of
            "sentence2"; the default ``None`` leaves it untouched.

    Returns:
        The same ``example`` object with ``label`` replaced by ``1 - label``.
    """
    # This is the "full" variant: by default the sequences are passed through
    # unchanged (the original contained no-op self-assignments here).
    if max_len1 is not None:
        example["sentence1"] = example["sentence1"][:max_len1]
    if max_len2 is not None:
        example["sentence2"] = example["sentence2"][:max_len2]
    example['label'] = 1 - example['label']
    return example
|
| 328 |
+
|
| 329 |
+
# 应用翻转标签函数
|
| 330 |
+
flipped_datasets_dna_protein = raw_datasets_dna_protein.map(flip_labels, batched=False)
|
| 331 |
+
|
| 332 |
+
tokenized_datasets_dna_protein = flipped_datasets_dna_protein.map(tokenize_function, batched=True)
|
| 333 |
+
predictions = trainer.predict(tokenized_datasets_dna_protein["test"])
|
| 334 |
+
preds = np.argmax(predictions.predictions, axis=-1)
|
| 335 |
+
metric = evaluate.load("glue", "mrpc")
|
| 336 |
+
ret = metric.compute(predictions=preds, references=predictions.label_ids)
|
| 337 |
+
result["dna_protein_pair_rand_full"] = ret
|
| 338 |
+
|
| 339 |
+
|
| 340 |
+
|
| 341 |
+
print(json.dumps(result))
|
| 342 |
+
|
finetune/gpt2_gene_multiv1_ft_en_test_others2.py
ADDED
|
@@ -0,0 +1,233 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# import os
|
| 2 |
+
|
| 3 |
+
# # 设置环境变量
|
| 4 |
+
# os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
|
| 5 |
+
|
| 6 |
+
# # 打印环境变量以确认设置成功
|
| 7 |
+
# print(os.environ.get('HF_ENDPOINT'))
|
| 8 |
+
|
| 9 |
+
# import subprocess
|
| 10 |
+
# import os
|
| 11 |
+
|
| 12 |
+
# result = subprocess.run('bash -c "source /etc/network_turbo && env | grep proxy"', shell=True, capture_output=True, text=True)
|
| 13 |
+
# output = result.stdout
|
| 14 |
+
# for line in output.splitlines():
|
| 15 |
+
# if '=' in line:
|
| 16 |
+
# var, value = line.split('=', 1)
|
| 17 |
+
# os.environ[var] = value
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
from datasets import load_dataset
|
| 21 |
+
from transformers import AutoTokenizer, DataCollatorWithPadding
|
| 22 |
+
from transformers import Trainer
|
| 23 |
+
import evaluate
|
| 24 |
+
import numpy as np
|
| 25 |
+
from transformers import TrainingArguments
|
| 26 |
+
from transformers import AutoModelForSequenceClassification
|
| 27 |
+
import json
|
| 28 |
+
from transformers import set_seed
|
| 29 |
+
import random
|
| 30 |
+
import numpy as np
|
| 31 |
+
import torch
|
| 32 |
+
from tqdm import tqdm
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
# seed = 42
|
| 36 |
+
# random.seed(seed)
|
| 37 |
+
# np.random.seed(seed)
|
| 38 |
+
# torch.manual_seed(seed)
|
| 39 |
+
# torch.cuda.manual_seed_all(seed)
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
# 动态生成随机种子
|
| 43 |
+
import random
|
| 44 |
+
# Draw a fresh seed in [0, 10000] for this run and keep it in the result record
# so the run can be identified later.
seed = random.randint(0, 10000)
set_seed(seed)
result = {"seed": seed}

# English configuration of PAWS-X; pass another config name (e.g. 'zh') for a
# different language.
# https://huggingface.co/datasets/google-research-datasets/paws-x
raw_datasets = load_dataset('paws-x', 'en')

# Tokenizer of the pretrained checkpoint; the end-of-sequence token doubles as
# the padding token.
tokenizer = AutoTokenizer.from_pretrained("dnagpt/gpt2_gene_multi_v1")
tokenizer.pad_token = tokenizer.eos_token
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
#分词函数
|
| 61 |
+
def tokenize_function(example):
    """Encode the ``sentence1``/``sentence2`` pair(s) of ``example`` to 256 tokens.

    Uses the module-level ``tokenizer``. Inputs longer than 256 tokens are
    truncated and shorter ones are padded up to that length.
    """
    first, second = example["sentence1"], example["sentence2"]
    return tokenizer(
        first,
        second,
        truncation=True,
        max_length=256,
        padding="max_length",
    )
|
| 63 |
+
|
| 64 |
+
#构建分词后的数据集
|
| 65 |
+
# Tokenize every split of the raw dataset, one batch of rows at a time.
tokenized_datasets = raw_datasets.map(
    function=tokenize_function,
    batched=True,
)

# Batch collator built around the same tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
#指标函数定义
|
| 72 |
+
def compute_metrics(eval_pred):
    """Return ``{'accuracy': ...}`` for one evaluation pass.

    ``eval_pred`` unpacks into ``(scores, labels)``. The predicted class of a
    row is the index of its largest score along axis 1; accuracy is the number
    of rows whose prediction equals the label divided by the number of labels.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=1)
    hits = (predicted == gold).sum()
    return {'accuracy': hits / len(gold)}
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
# Fine-tuning configuration: constant learning rate after a 10% warm-up, with
# evaluation, checkpointing and logging once per epoch.
training_config = {
    "output_dir": "ds_job_dna_2222",
    "learning_rate": 1e-5,
    "lr_scheduler_type": "constant_with_warmup",
    "warmup_ratio": 0.1,
    "optim": 'adamw_torch',
    "weight_decay": 0.0,
    "seed": seed,  # the seed drawn at the top of the script
    "per_device_train_batch_size": 64,
    "per_device_eval_batch_size": 64,
    "num_train_epochs": 4,  # number of passes over the training split
    "evaluation_strategy": "epoch",
    "save_strategy": "epoch",
    "logging_strategy": "epoch",
    "load_best_model_at_end": True,
}
training_args = TrainingArguments(**training_config)

# Sequence-classification model with two output labels; its padding id is set
# to the end-of-sequence id, mirroring the tokenizer setup.
model = AutoModelForSequenceClassification.from_pretrained(
    "dnagpt/gpt2_gene_multi_v1",
    num_labels=2,
)
model.config.pad_token_id = model.config.eos_token_id

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Fine-tune the model.
trainer.train()

# Evaluation data: the 'dna_protein_pair' config, with 10% of its 'train' split
# held out as the test set (train_test_split shuffles by default).
raw_datasets_dna_protein = load_dataset(
    'dnagpt/gene_lan_transfer', 'dna_protein_pair'
)['train'].train_test_split(test_size=0.1)
|
| 117 |
+
|
| 118 |
+
# 定义翻转标签的函数
|
| 119 |
+
def flip_labels(example):
    """Invert the binary ``label`` of one example (0 <-> 1).

    This variant keeps ``sentence1`` and ``sentence2`` at full length: no
    prefix truncation is applied. ``example`` is modified in place and
    returned.
    """
    example['label'] = 1 - example['label']
    return example
|
| 125 |
+
|
| 126 |
+
# 应用翻转标签函数
|
| 127 |
+
# Flip the labels row by row, then tokenize in batches with 4 worker processes.
flipped_datasets_dna_protein = raw_datasets_dna_protein.map(flip_labels, batched=False)
tokenized_datasets_dna_protein = flipped_datasets_dna_protein.map(
    tokenize_function, batched=True, num_proc=4
)

# Run the model on the GPU when one is available, in evaluation mode.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Held-out split to score, plus accumulators for predictions and references.
test_dataset = tokenized_datasets_dna_protein["test"]
preds, labels = [], []

# Rows per forward pass; adjust to the available memory.
batch_size = 64

for start in tqdm(range(0, len(test_dataset), batch_size), desc="Predicting"):
    batch = test_dataset[start : start + batch_size]

    # Build the model inputs as tensors on the target device.
    inputs = {
        field: torch.tensor(batch[field]).to(device)
        for field in ("input_ids", "attention_mask")
    }

    with torch.no_grad():  # gradients are not needed for prediction
        logits = model(**inputs).logits

    # Predicted class = index of the largest logit.
    preds.extend(torch.argmax(logits, dim=-1).cpu().numpy())
    labels.extend(batch["label"])

metric = evaluate.load("glue", "mrpc")
ret = metric.compute(predictions=preds, references=labels)
result["dna_protein_pair_full"] = ret

#############################################################
# Next evaluation set: the 'dna_protein_pair_rand' config, with half of its
# 'train' split held out as the test set (train_test_split shuffles by default).
raw_datasets_dna_protein = load_dataset(
    'dnagpt/gene_lan_transfer', 'dna_protein_pair_rand'
)['train'].train_test_split(test_size=0.5)
|
| 177 |
+
|
| 178 |
+
# 定义翻转标签的函数
|
| 179 |
+
def flip_labels(example):
    """Invert the binary ``label`` of one example (0 <-> 1).

    This variant keeps ``sentence1`` and ``sentence2`` at full length: no
    prefix truncation is applied. ``example`` is modified in place and
    returned.
    """
    example['label'] = 1 - example['label']
    return example
|
| 185 |
+
|
| 186 |
+
# 应用翻转标签函数
|
| 187 |
+
# Flip the labels row by row, then tokenize in batches with 4 worker processes.
flipped_datasets_dna_protein = raw_datasets_dna_protein.map(flip_labels, batched=False)
tokenized_datasets_dna_protein = flipped_datasets_dna_protein.map(
    tokenize_function, batched=True, num_proc=4
)

# Run the model on the GPU when one is available, in evaluation mode.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Held-out split to score, plus accumulators for predictions and references.
test_dataset = tokenized_datasets_dna_protein["test"]
preds, labels = [], []

# Rows per forward pass; adjust to the available memory.
batch_size = 64

for start in tqdm(range(0, len(test_dataset), batch_size), desc="Predicting"):
    batch = test_dataset[start : start + batch_size]

    # Build the model inputs as tensors on the target device.
    inputs = {
        field: torch.tensor(batch[field]).to(device)
        for field in ("input_ids", "attention_mask")
    }

    with torch.no_grad():  # gradients are not needed for prediction
        logits = model(**inputs).logits

    # Predicted class = index of the largest logit.
    preds.extend(torch.argmax(logits, dim=-1).cpu().numpy())
    labels.extend(batch["label"])

metric = evaluate.load("glue", "mrpc")
ret = metric.compute(predictions=preds, references=labels)
result["dna_protein_pair_rand_full"] = ret

# Emit the seed and all collected metrics as a single JSON line.
print(json.dumps(result))
|
| 233 |
+
|
finetune/gpt2_gene_multiv1_ft_en_test_others3.py
ADDED
|
@@ -0,0 +1,344 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# import os
|
| 2 |
+
|
| 3 |
+
# # 设置环境变量
|
| 4 |
+
# os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
|
| 5 |
+
|
| 6 |
+
# # 打印环境变量以确认设置成功
|
| 7 |
+
# print(os.environ.get('HF_ENDPOINT'))
|
| 8 |
+
|
| 9 |
+
# import subprocess
|
| 10 |
+
# import os
|
| 11 |
+
|
| 12 |
+
# result = subprocess.run('bash -c "source /etc/network_turbo && env | grep proxy"', shell=True, capture_output=True, text=True)
|
| 13 |
+
# output = result.stdout
|
| 14 |
+
# for line in output.splitlines():
|
| 15 |
+
# if '=' in line:
|
| 16 |
+
# var, value = line.split('=', 1)
|
| 17 |
+
# os.environ[var] = value
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
from datasets import load_dataset
|
| 21 |
+
from transformers import AutoTokenizer, DataCollatorWithPadding
|
| 22 |
+
from transformers import Trainer
|
| 23 |
+
import evaluate
|
| 24 |
+
import numpy as np
|
| 25 |
+
from transformers import TrainingArguments
|
| 26 |
+
from transformers import AutoModelForSequenceClassification
|
| 27 |
+
import json
|
| 28 |
+
from transformers import set_seed
|
| 29 |
+
import random
|
| 30 |
+
import numpy as np
|
| 31 |
+
import torch
|
| 32 |
+
import sys
|
| 33 |
+
|
| 34 |
+
# seed = 42
|
| 35 |
+
# random.seed(seed)
|
| 36 |
+
# np.random.seed(seed)
|
| 37 |
+
# torch.manual_seed(seed)
|
| 38 |
+
# torch.cuda.manual_seed_all(seed)
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
# 动态生成随机种子
|
| 42 |
+
import random
|
| 43 |
+
#seed = random.randint(0, 10000)
|
| 44 |
+
# The seed is taken from the first command-line argument so a batch of runs can
# be reproduced; when no argument is given, fall back to a random seed in
# [0, 10000] instead of failing with an IndexError.
seed = int(sys.argv[1]) if len(sys.argv) > 1 else random.randint(0, 10000)
set_seed(seed)
result = {"seed": seed}

# English configuration of PAWS-X, addressed by the same namespaced repository
# id that the German and Chinese evaluations of this script use.
# https://huggingface.co/datasets/google-research-datasets/paws-x
raw_datasets = load_dataset('google-research-datasets/paws-x', 'en')

# Tokenizer stored with the local checkpoint; the end-of-sequence token doubles
# as the padding token.
tokenizer = AutoTokenizer.from_pretrained("pt_lora_model")
tokenizer.pad_token = tokenizer.eos_token
|
| 58 |
+
|
| 59 |
+
# 修改分词器的填充方向为左侧,默认有右侧,分类问题建议左侧
|
| 60 |
+
#tokenizer.padding_side = "left"
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
#分词函数
|
| 64 |
+
def tokenize_function(example):
    """Encode the ``sentence1``/``sentence2`` pair(s) of ``example`` to 256 tokens.

    Uses the module-level ``tokenizer``. Inputs longer than 256 tokens are
    truncated and shorter ones are padded up to that length.
    """
    first, second = example["sentence1"], example["sentence2"]
    return tokenizer(
        first,
        second,
        truncation=True,
        max_length=256,
        padding="max_length",
    )
|
| 68 |
+
|
| 69 |
+
#构建分词后的数据集
|
| 70 |
+
# Tokenize every split of the raw dataset, one batch of rows at a time.
tokenized_datasets = raw_datasets.map(
    function=tokenize_function,
    batched=True,
)

# Batch collator built around the same tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
#指标函数定义
|
| 77 |
+
def compute_metrics(eval_pred):
    """Return ``{'accuracy': ...}`` for one evaluation pass.

    ``eval_pred`` unpacks into ``(scores, labels)``. The predicted class of a
    row is the index of its largest score along axis 1; accuracy is the number
    of rows whose prediction equals the label divided by the number of labels.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=1)
    hits = (predicted == gold).sum()
    return {'accuracy': hits / len(gold)}
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
# Fine-tuning configuration: constant learning rate after a 10% warm-up, with
# evaluation, checkpointing and logging once per epoch.
training_config = {
    "output_dir": "ds_job_dna_2222",
    "learning_rate": 1e-5,
    "lr_scheduler_type": "constant_with_warmup",
    "warmup_ratio": 0.1,
    "optim": 'adamw_torch',
    "weight_decay": 0.0,
    "seed": seed,  # the run seed set at the top of the script
    "per_device_train_batch_size": 20,
    "per_device_eval_batch_size": 20,
    "num_train_epochs": 4,  # number of passes over the training split
    "evaluation_strategy": "epoch",
    "save_strategy": "epoch",
    "logging_strategy": "epoch",
    "load_best_model_at_end": True,
}
training_args = TrainingArguments(**training_config)

# Sequence-classification model with two output labels; its padding id is set
# to the end-of-sequence id, mirroring the tokenizer setup.
model = AutoModelForSequenceClassification.from_pretrained(
    "pt_lora_model",
    num_labels=2,
)
model.config.pad_token_id = model.config.eos_token_id

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Fine-tune the model.
trainer.train()
|
| 116 |
+
|
| 117 |
+
#模型测试,英文数据集
|
| 118 |
+
# Load the GLUE/MRPC metric (accuracy and F1) once and reuse it for every
# evaluation in this section.
pair_metric = evaluate.load("glue", "mrpc")


def _score_split(split):
    """Predict on one tokenized split with ``trainer`` and return its GLUE/MRPC metrics."""
    output = trainer.predict(split)
    predicted = np.argmax(output.predictions, axis=-1)
    return pair_metric.compute(predictions=predicted, references=output.label_ids)


# English: test split of the dataset the model was fine-tuned on.
result["en"] = _score_split(tokenized_datasets["test"])

# French, German and Chinese PAWS-X test splits, all loaded through the same
# namespaced repository id.
for lang in ("fr", "de", "zh"):
    lang_datasets = load_dataset('google-research-datasets/paws-x', lang)
    result[lang] = _score_split(
        lang_datasets.map(tokenize_function, batched=True)["test"]
    )

# DNA and protein similarity configs of 'dnagpt/gene_lan_transfer'. Each entry
# is (config name, fraction of the 'train' split held out as the test set);
# train_test_split shuffles by default. Results are stored under the config name.
for config, held_out in (
    ("dna_sim_pair_simple_150bp", 0.2),
    ("dna_sim_pair_150bp", 0.2),
    ("dna_sim_pair_50bp", 0.1),
    ("protein_sim_pair_150bp", 0.1),
    ("protein_sim_pair_450bp", 0.1),
):
    splits = load_dataset('dnagpt/gene_lan_transfer', config)['train'].train_test_split(test_size=held_out)
    result[config] = _score_split(
        splits.map(tokenize_function, batched=True)["test"]
    )

# Data for the next evaluation: the 'dna_protein_pair' config with 10% held out.
raw_datasets_dna_protein = load_dataset(
    'dnagpt/gene_lan_transfer', 'dna_protein_pair'
)['train'].train_test_split(test_size=0.1)
|
| 203 |
+
|
| 204 |
+
# 定义翻转标签的函数
|
| 205 |
+
def flip_labels(example):
    """Shorten both sentences and invert the binary ``label`` of one example.

    ``sentence1`` is cut to its first 150 characters and ``sentence2`` to its
    first 50; ``label`` becomes ``1 - label``. ``example`` is modified in place
    and returned.
    """
    for field, limit in (("sentence1", 150), ("sentence2", 50)):
        example[field] = example[field][:limit]
    example["label"] = 1 - example["label"]
    return example
|
| 211 |
+
|
| 212 |
+
# 应用翻转标签函数
|
| 213 |
+
# Apply the label flip row by row, tokenize in batches, then score the held-out
# split with the GLUE/MRPC metric.
flipped_datasets_dna_protein = raw_datasets_dna_protein.map(flip_labels, batched=False)
tokenized_datasets_dna_protein = flipped_datasets_dna_protein.map(tokenize_function, batched=True)

test_output = trainer.predict(tokenized_datasets_dna_protein["test"])
result["dna_protein_pair"] = evaluate.load("glue", "mrpc").compute(
    predictions=np.argmax(test_output.predictions, axis=-1),
    references=test_output.label_ids,
)

# Fresh 90/10 split of the 'dna_protein_pair' config for the next evaluation
# (train_test_split shuffles by default).
raw_datasets_dna_protein = load_dataset(
    'dnagpt/gene_lan_transfer', 'dna_protein_pair'
)['train'].train_test_split(test_size=0.1)
|
| 225 |
+
|
| 226 |
+
# 定义翻转标签的函数
|
| 227 |
+
def flip_labels(example):
    """Shorten both sentences and invert the binary ``label`` of one example.

    ``sentence1`` is cut to its first 300 characters and ``sentence2`` to its
    first 100; ``label`` becomes ``1 - label``. ``example`` is modified in
    place and returned.
    """
    for field, limit in (("sentence1", 300), ("sentence2", 100)):
        example[field] = example[field][:limit]
    example["label"] = 1 - example["label"]
    return example
|
| 233 |
+
|
| 234 |
+
# 应用翻转标签函数
|
| 235 |
+
# Apply the label flip row by row, tokenize in batches, then score the held-out
# split with the GLUE/MRPC metric.
flipped_datasets_dna_protein = raw_datasets_dna_protein.map(flip_labels, batched=False)
tokenized_datasets_dna_protein = flipped_datasets_dna_protein.map(tokenize_function, batched=True)

test_output = trainer.predict(tokenized_datasets_dna_protein["test"])
result["dna_protein_pair_100"] = evaluate.load("glue", "mrpc").compute(
    predictions=np.argmax(test_output.predictions, axis=-1),
    references=test_output.label_ids,
)

# Fresh 90/10 split of the 'dna_protein_pair' config for the next evaluation
# (train_test_split shuffles by default).
raw_datasets_dna_protein = load_dataset(
    'dnagpt/gene_lan_transfer', 'dna_protein_pair'
)['train'].train_test_split(test_size=0.1)
|
| 251 |
+
|
| 252 |
+
# 定义翻转标签的函数
|
| 253 |
+
def flip_labels(example):
    """Invert the binary ``label`` of one example (0 <-> 1).

    This variant keeps ``sentence1`` and ``sentence2`` at full length: no
    prefix truncation is applied. ``example`` is modified in place and
    returned.
    """
    example['label'] = 1 - example['label']
    return example
|
| 259 |
+
|
| 260 |
+
# 应用翻转标签函数
|
| 261 |
+
# Apply the label flip row by row, tokenize in batches, then score the held-out
# split with the GLUE/MRPC metric.
flipped_datasets_dna_protein = raw_datasets_dna_protein.map(flip_labels, batched=False)
tokenized_datasets_dna_protein = flipped_datasets_dna_protein.map(tokenize_function, batched=True)

test_output = trainer.predict(tokenized_datasets_dna_protein["test"])
result["dna_protein_pair_full"] = evaluate.load("glue", "mrpc").compute(
    predictions=np.argmax(test_output.predictions, axis=-1),
    references=test_output.label_ids,
)

# Next evaluation set: 90/10 split of the 'dna_protein_pair_rand' config
# (train_test_split shuffles by default).
raw_datasets_dna_protein = load_dataset(
    'dnagpt/gene_lan_transfer', 'dna_protein_pair_rand'
)['train'].train_test_split(test_size=0.1)
|
| 276 |
+
|
| 277 |
+
# 定义翻转标签的函数
|
| 278 |
+
def flip_labels(example):
    """Shorten both sentences and invert the binary ``label`` of one example.

    ``sentence1`` is cut to its first 150 characters and ``sentence2`` to its
    first 50; ``label`` becomes ``1 - label``. ``example`` is modified in place
    and returned.
    """
    for field, limit in (("sentence1", 150), ("sentence2", 50)):
        example[field] = example[field][:limit]
    example["label"] = 1 - example["label"]
    return example
|
| 284 |
+
|
| 285 |
+
# 应用翻转标签函数
|
| 286 |
+
# Apply the label flip row by row, tokenize in batches, then score the held-out
# split with the GLUE/MRPC metric.
flipped_datasets_dna_protein = raw_datasets_dna_protein.map(flip_labels, batched=False)
tokenized_datasets_dna_protein = flipped_datasets_dna_protein.map(tokenize_function, batched=True)

test_output = trainer.predict(tokenized_datasets_dna_protein["test"])
result["dna_protein_pair_rand"] = evaluate.load("glue", "mrpc").compute(
    predictions=np.argmax(test_output.predictions, axis=-1),
    references=test_output.label_ids,
)

# Fresh 90/10 split of the 'dna_protein_pair_rand' config for the next
# evaluation (train_test_split shuffles by default).
raw_datasets_dna_protein = load_dataset(
    'dnagpt/gene_lan_transfer', 'dna_protein_pair_rand'
)['train'].train_test_split(test_size=0.1)
|
| 298 |
+
|
| 299 |
+
# 定义翻转标签的函数
|
| 300 |
+
def flip_labels(example):
    """Shorten both sentences and invert the binary ``label`` of one example.

    ``sentence1`` is cut to its first 300 characters and ``sentence2`` to its
    first 100; ``label`` becomes ``1 - label``. ``example`` is modified in
    place and returned.
    """
    for field, limit in (("sentence1", 300), ("sentence2", 100)):
        example[field] = example[field][:limit]
    example["label"] = 1 - example["label"]
    return example
|
| 306 |
+
|
| 307 |
+
# 应用翻转标签函数
|
| 308 |
+
# Apply the label flip row by row, tokenize in batches, then score the held-out
# split with the GLUE/MRPC metric.
flipped_datasets_dna_protein = raw_datasets_dna_protein.map(flip_labels, batched=False)
tokenized_datasets_dna_protein = flipped_datasets_dna_protein.map(tokenize_function, batched=True)

test_output = trainer.predict(tokenized_datasets_dna_protein["test"])
result["dna_protein_pair_rand_100"] = evaluate.load("glue", "mrpc").compute(
    predictions=np.argmax(test_output.predictions, axis=-1),
    references=test_output.label_ids,
)

# Fresh 90/10 split of the 'dna_protein_pair_rand' config for the next
# evaluation (train_test_split shuffles by default).
raw_datasets_dna_protein = load_dataset(
    'dnagpt/gene_lan_transfer', 'dna_protein_pair_rand'
)['train'].train_test_split(test_size=0.1)
|
| 322 |
+
|
| 323 |
+
# 定义翻转标签的函数
|
| 324 |
+
def flip_labels(example):
    """Invert the binary ``label`` of one example (0 <-> 1).

    This variant keeps ``sentence1`` and ``sentence2`` at full length: no
    prefix truncation is applied. ``example`` is modified in place and
    returned.
    """
    example['label'] = 1 - example['label']
    return example
|
| 330 |
+
|
| 331 |
+
# 应用翻转标签函数
|
| 332 |
+
# Apply the label flip row by row, tokenize in batches, then score the held-out
# split with the GLUE/MRPC metric.
flipped_datasets_dna_protein = raw_datasets_dna_protein.map(flip_labels, batched=False)
tokenized_datasets_dna_protein = flipped_datasets_dna_protein.map(tokenize_function, batched=True)

test_output = trainer.predict(tokenized_datasets_dna_protein["test"])
result["dna_protein_pair_rand_full"] = evaluate.load("glue", "mrpc").compute(
    predictions=np.argmax(test_output.predictions, axis=-1),
    references=test_output.label_ids,
)

# Emit the seed and all collected metrics as a single JSON line.
print(json.dumps(result))
|
| 344 |
+
|
finetune/gpt2_gene_multiv2_ft_en.jsonl
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"seed": 0, "en": {"accuracy": 0.8785, "f1": 0.8682926829268293}, "fr": {"accuracy": 0.7845, "f1": 0.783308195072901}, "de": {"accuracy": 0.7845, "f1": 0.7725593667546174}, "zh": {"accuracy": 0.713, "f1": 0.6828729281767956}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9358333333333333, "f1": 0.9312704552216602}, "dna_sim_pair_150bp": {"accuracy": 0.7265, "f1": 0.6240549828178694}, "dna_sim_pair_50bp": {"accuracy": 0.816, "f1": 0.8071278825995807}, "protein_sim_pair_150bp": {"accuracy": 0.9477777777777778, "f1": 0.9476614699331849}, "protein_sim_pair_450bp": {"accuracy": 0.8955555555555555, "f1": 0.8934240362811792}, "dna_protein_pair": {"accuracy": 0.5175, "f1": 0.010256410256410256}, "dna_protein_pair_100": {"accuracy": 0.53, "f1": 0.010526315789473684}, "dna_protein_pair_full": {"accuracy": 0.4825, "f1": 0.028169014084507043}, "dna_protein_pair_rand": {"accuracy": 0.51875, "f1": 0.0399002493765586}, "dna_protein_pair_rand_100": {"accuracy": 0.510625, "f1": 0.0076045627376425855}, "dna_protein_pair_rand_full": {"accuracy": 0.480625, "f1": 0.014234875444839857}}
|
| 2 |
+
{"seed": 1, "en": {"accuracy": 0.8695, "f1": 0.8610963278339542}, "fr": {"accuracy": 0.8045, "f1": 0.8047928107838243}, "de": {"accuracy": 0.7565, "f1": 0.7526663280853225}, "zh": {"accuracy": 0.7185, "f1": 0.7246943765281174}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9308333333333333, "f1": 0.9312344656172328}, "dna_sim_pair_150bp": {"accuracy": 0.87725, "f1": 0.8644769528015457}, "dna_sim_pair_50bp": {"accuracy": 0.7425, "f1": 0.768954688200987}, "protein_sim_pair_150bp": {"accuracy": 0.9472222222222222, "f1": 0.9463579898362507}, "protein_sim_pair_450bp": {"accuracy": 0.8344444444444444, "f1": 0.8558994197292069}, "dna_protein_pair": {"accuracy": 0.5025, "f1": 0.009950248756218905}, "dna_protein_pair_100": {"accuracy": 0.4975, "f1": 0.06511627906976744}, "dna_protein_pair_full": {"accuracy": 0.335, "f1": 0.46586345381526106}, "dna_protein_pair_rand": {"accuracy": 0.5325, "f1": 0.065}, "dna_protein_pair_rand_100": {"accuracy": 0.5175, "f1": 0.12866817155756208}, "dna_protein_pair_rand_full": {"accuracy": 0.540625, "f1": 0.6185781006746238}}
|
| 3 |
+
{"seed": 3, "en": {"accuracy": 0.8905, "f1": 0.8855201254573968}, "fr": {"accuracy": 0.791, "f1": 0.7972841901066925}, "de": {"accuracy": 0.764, "f1": 0.776303317535545}, "zh": {"accuracy": 0.719, "f1": 0.7013815090329437}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9502777777777778, "f1": 0.9484595450619061}, "dna_sim_pair_150bp": {"accuracy": 0.749, "f1": 0.6710353866317169}, "dna_sim_pair_50bp": {"accuracy": 0.7525, "f1": 0.7086521483225426}, "protein_sim_pair_150bp": {"accuracy": 0.9533333333333334, "f1": 0.9522184300341296}, "protein_sim_pair_450bp": {"accuracy": 0.945, "f1": 0.9422066549912435}, "dna_protein_pair": {"accuracy": 0.5425, "f1": 0.0213903743315508}, "dna_protein_pair_100": {"accuracy": 0.52, "f1": 0.04}, "dna_protein_pair_full": {"accuracy": 0.5125, "f1": 0.06698564593301436}, "dna_protein_pair_rand": {"accuracy": 0.49375, "f1": 0.04929577464788732}, "dna_protein_pair_rand_100": {"accuracy": 0.495, "f1": 0.024154589371980676}, "dna_protein_pair_rand_full": {"accuracy": 0.495, "f1": 0.04941176470588235}}
|
| 4 |
+
{"seed": 4, "en": {"accuracy": 0.888, "f1": 0.8848920863309353}, "fr": {"accuracy": 0.775, "f1": 0.7922437673130194}, "de": {"accuracy": 0.7515, "f1": 0.7678654834189631}, "zh": {"accuracy": 0.6985, "f1": 0.7207040296433534}, "dna_sim_pair_simple_150bp": {"accuracy": 0.8472222222222222, "f1": 0.8643315244203256}, "dna_sim_pair_150bp": {"accuracy": 0.80775, "f1": 0.8225248096007385}, "dna_sim_pair_50bp": {"accuracy": 0.516, "f1": 0.6675824175824175}, "protein_sim_pair_150bp": {"accuracy": 0.8961111111111111, "f1": 0.9051243023845763}, "protein_sim_pair_450bp": {"accuracy": 0.8694444444444445, "f1": 0.8806500761808025}, "dna_protein_pair": {"accuracy": 0.4675, "f1": 0.25263157894736843}, "dna_protein_pair_100": {"accuracy": 0.5025, "f1": 0.16033755274261605}, "dna_protein_pair_full": {"accuracy": 0.5275, "f1": 0.1888412017167382}, "dna_protein_pair_rand": {"accuracy": 0.52875, "f1": 0.300556586270872}, "dna_protein_pair_rand_100": {"accuracy": 0.518125, "f1": 0.27332704995287466}, "dna_protein_pair_rand_full": {"accuracy": 0.513125, "f1": 0.24}}
|
| 5 |
+
{"seed": 5, "en": {"accuracy": 0.878, "f1": 0.872784150156413}, "fr": {"accuracy": 0.7845, "f1": 0.7936811871708952}, "de": {"accuracy": 0.767, "f1": 0.7681592039800995}, "zh": {"accuracy": 0.719, "f1": 0.7085062240663901}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9263888888888889, "f1": 0.9274568847522584}, "dna_sim_pair_150bp": {"accuracy": 0.85375, "f1": 0.8475371383893667}, "dna_sim_pair_50bp": {"accuracy": 0.658, "f1": 0.7274900398406374}, "protein_sim_pair_150bp": {"accuracy": 0.9838888888888889, "f1": 0.9839335180055402}, "protein_sim_pair_450bp": {"accuracy": 0.9505555555555556, "f1": 0.9493454752418896}, "dna_protein_pair": {"accuracy": 0.505, "f1": 0.05714285714285714}, "dna_protein_pair_100": {"accuracy": 0.5, "f1": 0.0}, "dna_protein_pair_full": {"accuracy": 0.5475, "f1": 0.1339712918660287}, "dna_protein_pair_rand": {"accuracy": 0.51625, "f1": 0.03970223325062035}, "dna_protein_pair_rand_100": {"accuracy": 0.483125, "f1": 0.03274853801169591}, "dna_protein_pair_rand_full": {"accuracy": 0.496875, "f1": 0.08626560726447219}}
|
| 6 |
+
{"seed": 6, "en": {"accuracy": 0.886, "f1": 0.8791092258748674}, "fr": {"accuracy": 0.783, "f1": 0.7925430210325047}, "de": {"accuracy": 0.764, "f1": 0.7618567103935419}, "zh": {"accuracy": 0.7235, "f1": 0.7159732922444787}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9247222222222222, "f1": 0.9215629522431259}, "dna_sim_pair_150bp": {"accuracy": 0.73, "f1": 0.6465968586387435}, "dna_sim_pair_50bp": {"accuracy": 0.632, "f1": 0.7100078802206462}, "protein_sim_pair_150bp": {"accuracy": 0.9344444444444444, "f1": 0.9355895196506551}, "protein_sim_pair_450bp": {"accuracy": 0.905, "f1": 0.8989958653278204}, "dna_protein_pair": {"accuracy": 0.495, "f1": 0.07339449541284404}, "dna_protein_pair_100": {"accuracy": 0.5175, "f1": 0.0}, "dna_protein_pair_full": {"accuracy": 0.5125, "f1": 0.0}, "dna_protein_pair_rand": {"accuracy": 0.49375, "f1": 0.04929577464788732}, "dna_protein_pair_rand_100": {"accuracy": 0.519375, "f1": 0.0}, "dna_protein_pair_rand_full": {"accuracy": 0.490625, "f1": 0.0024479804161566705}}
|
| 7 |
+
{"seed": 7, "en": {"accuracy": 0.885, "f1": 0.8773987206823027}, "fr": {"accuracy": 0.795, "f1": 0.7966269841269841}, "de": {"accuracy": 0.754, "f1": 0.7604673807205453}, "zh": {"accuracy": 0.714, "f1": 0.7069672131147541}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9416666666666667, "f1": 0.9387040280210157}, "dna_sim_pair_150bp": {"accuracy": 0.73675, "f1": 0.6541871921182266}, "dna_sim_pair_50bp": {"accuracy": 0.7805, "f1": 0.7939934303144064}, "protein_sim_pair_150bp": {"accuracy": 0.9455555555555556, "f1": 0.9476495726495726}, "protein_sim_pair_450bp": {"accuracy": 0.8877777777777778, "f1": 0.875615763546798}, "dna_protein_pair": {"accuracy": 0.5025, "f1": 0.019704433497536946}, "dna_protein_pair_100": {"accuracy": 0.51, "f1": 0.0}, "dna_protein_pair_full": {"accuracy": 0.455, "f1": 0.043859649122807015}, "dna_protein_pair_rand": {"accuracy": 0.513125, "f1": 0.12570145903479238}, "dna_protein_pair_rand_100": {"accuracy": 0.49625, "f1": 0.026570048309178744}, "dna_protein_pair_rand_full": {"accuracy": 0.53125, "f1": 0.1591928251121076}}
|
| 8 |
+
{"seed": 8, "en": {"accuracy": 0.872, "f1": 0.8649789029535865}, "fr": {"accuracy": 0.774, "f1": 0.7818532818532818}, "de": {"accuracy": 0.753, "f1": 0.7564102564102564}, "zh": {"accuracy": 0.7175, "f1": 0.6920980926430518}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9580555555555555, "f1": 0.9565217391304348}, "dna_sim_pair_150bp": {"accuracy": 0.77125, "f1": 0.7092469018112488}, "dna_sim_pair_50bp": {"accuracy": 0.8295, "f1": 0.8165680473372781}, "protein_sim_pair_150bp": {"accuracy": 0.9166666666666666, "f1": 0.9197860962566845}, "protein_sim_pair_450bp": {"accuracy": 0.9083333333333333, "f1": 0.9032258064516129}, "dna_protein_pair": {"accuracy": 0.51, "f1": 0.0}, "dna_protein_pair_100": {"accuracy": 0.4925, "f1": 0.0}, "dna_protein_pair_full": {"accuracy": 0.4925, "f1": 0.1506276150627615}, "dna_protein_pair_rand": {"accuracy": 0.520625, "f1": 0.005188067444876783}, "dna_protein_pair_rand_100": {"accuracy": 0.501875, "f1": 0.0025031289111389237}, "dna_protein_pair_rand_full": {"accuracy": 0.623125, "f1": 0.4928511354079058}}
|
| 9 |
+
{"seed": 9, "en": {"accuracy": 0.8525, "f1": 0.8381788261108064}, "fr": {"accuracy": 0.7825, "f1": 0.768}, "de": {"accuracy": 0.74, "f1": 0.7302904564315352}, "zh": {"accuracy": 0.6945, "f1": 0.6673924877517692}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9363888888888889, "f1": 0.9338723650014439}, "dna_sim_pair_150bp": {"accuracy": 0.6815, "f1": 0.5475852272727273}, "dna_sim_pair_50bp": {"accuracy": 0.7825, "f1": 0.7495682210708118}, "protein_sim_pair_150bp": {"accuracy": 0.9416666666666667, "f1": 0.9413735343383585}, "protein_sim_pair_450bp": {"accuracy": 0.8, "f1": 0.7643979057591623}, "dna_protein_pair": {"accuracy": 0.49, "f1": 0.05555555555555555}, "dna_protein_pair_100": {"accuracy": 0.495, "f1": 0.00980392156862745}, "dna_protein_pair_full": {"accuracy": 0.4875, "f1": 0.2807017543859649}, "dna_protein_pair_rand": {"accuracy": 0.4875, "f1": 0.07029478458049887}, "dna_protein_pair_rand_100": {"accuracy": 0.491875, "f1": 0.0097442143727162}, "dna_protein_pair_rand_full": {"accuracy": 0.4875, "f1": 0.24493554327808473}}
|
| 10 |
+
{"seed": 10, "en": {"accuracy": 0.8845, "f1": 0.8789942378208486}, "fr": {"accuracy": 0.7975, "f1": 0.8015678588926997}, "de": {"accuracy": 0.7715, "f1": 0.7669556348801632}, "zh": {"accuracy": 0.7095, "f1": 0.6871297792137857}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9347222222222222, "f1": 0.9358098880087408}, "dna_sim_pair_150bp": {"accuracy": 0.85325, "f1": 0.8461336828309305}, "dna_sim_pair_50bp": {"accuracy": 0.755, "f1": 0.7829937998228521}, "protein_sim_pair_150bp": {"accuracy": 0.935, "f1": 0.9360306178239475}, "protein_sim_pair_450bp": {"accuracy": 0.9333333333333333, "f1": 0.9349945828819068}, "dna_protein_pair": {"accuracy": 0.4825, "f1": 0.1038961038961039}, "dna_protein_pair_100": {"accuracy": 0.4625, "f1": 0.13654618473895583}, "dna_protein_pair_full": {"accuracy": 0.5125, "f1": 0.15584415584415584}, "dna_protein_pair_rand": {"accuracy": 0.514375, "f1": 0.112}, "dna_protein_pair_rand_100": {"accuracy": 0.5275, "f1": 0.13302752293577982}, "dna_protein_pair_rand_full": {"accuracy": 0.5075, "f1": 0.20883534136546184}}
|
| 11 |
+
{"seed": 11, "en": {"accuracy": 0.8995, "f1": 0.8914100486223663}, "fr": {"accuracy": 0.811, "f1": 0.8100502512562814}, "de": {"accuracy": 0.778, "f1": 0.7720739219712526}, "zh": {"accuracy": 0.713, "f1": 0.7172413793103448}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9369444444444445, "f1": 0.9322185727082711}, "dna_sim_pair_150bp": {"accuracy": 0.783, "f1": 0.7065584854631508}, "dna_sim_pair_50bp": {"accuracy": 0.795, "f1": 0.7704367301231803}, "protein_sim_pair_150bp": {"accuracy": 0.9516666666666667, "f1": 0.9509859154929577}, "protein_sim_pair_450bp": {"accuracy": 0.9688888888888889, "f1": 0.9681818181818181}, "dna_protein_pair": {"accuracy": 0.465, "f1": 0.044642857142857144}, "dna_protein_pair_100": {"accuracy": 0.535, "f1": 0.010638297872340425}, "dna_protein_pair_full": {"accuracy": 0.475, "f1": 0.0625}, "dna_protein_pair_rand": {"accuracy": 0.509375, "f1": 0.05078597339782346}, "dna_protein_pair_rand_100": {"accuracy": 0.505625, "f1": 0.07485380116959064}, "dna_protein_pair_rand_full": {"accuracy": 0.518125, "f1": 0.07220216606498195}}
|
| 12 |
+
{"seed": 12, "en": {"accuracy": 0.8885, "f1": 0.8799138395261173}, "fr": {"accuracy": 0.792, "f1": 0.7964774951076321}, "de": {"accuracy": 0.7705, "f1": 0.7682988389702171}, "zh": {"accuracy": 0.711, "f1": 0.7035897435897436}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9516666666666667, "f1": 0.949798038084247}, "dna_sim_pair_150bp": {"accuracy": 0.8075, "f1": 0.7566371681415929}, "dna_sim_pair_50bp": {"accuracy": 0.72, "f1": 0.7592433361994841}, "protein_sim_pair_150bp": {"accuracy": 0.9283333333333333, "f1": 0.9293150684931507}, "protein_sim_pair_450bp": {"accuracy": 0.885, "f1": 0.8805539526832084}, "dna_protein_pair": {"accuracy": 0.4725, "f1": 0.009389671361502348}, "dna_protein_pair_100": {"accuracy": 0.515, "f1": 0.0}, "dna_protein_pair_full": {"accuracy": 0.4925, "f1": 0.046948356807511735}, "dna_protein_pair_rand": {"accuracy": 0.493125, "f1": 0.01696969696969697}, "dna_protein_pair_rand_100": {"accuracy": 0.501875, "f1": 0.009937888198757764}, "dna_protein_pair_rand_full": {"accuracy": 0.588125, "f1": 0.3812206572769953}}
|
| 13 |
+
{"seed": 13, "en": {"accuracy": 0.9045, "f1": 0.8947658402203856}, "fr": {"accuracy": 0.814, "f1": 0.805439330543933}, "de": {"accuracy": 0.7875, "f1": 0.7742963356346256}, "zh": {"accuracy": 0.7335, "f1": 0.699718309859155}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9525, "f1": 0.949926793557833}, "dna_sim_pair_150bp": {"accuracy": 0.77875, "f1": 0.721785602011946}, "dna_sim_pair_50bp": {"accuracy": 0.7955, "f1": 0.7685342388228636}, "protein_sim_pair_150bp": {"accuracy": 0.9527777777777777, "f1": 0.9523275378575434}, "protein_sim_pair_450bp": {"accuracy": 0.92, "f1": 0.9164733178654292}, "dna_protein_pair": {"accuracy": 0.5275, "f1": 0.010471204188481676}, "dna_protein_pair_100": {"accuracy": 0.4875, "f1": 0.02843601895734597}, "dna_protein_pair_full": {"accuracy": 0.4625, "f1": 0.09282700421940929}, "dna_protein_pair_rand": {"accuracy": 0.495, "f1": 0.0170316301703163}, "dna_protein_pair_rand_100": {"accuracy": 0.510625, "f1": 0.01756587202007528}, "dna_protein_pair_rand_full": {"accuracy": 0.58125, "f1": 0.35946462715105165}}
|
| 14 |
+
{"seed": 14, "en": {"accuracy": 0.877, "f1": 0.8699788583509513}, "fr": {"accuracy": 0.786, "f1": 0.7934362934362934}, "de": {"accuracy": 0.751, "f1": 0.7575462512171373}, "zh": {"accuracy": 0.727, "f1": 0.7242424242424242}, "dna_sim_pair_simple_150bp": {"accuracy": 0.6475, "f1": 0.7286722257857601}, "dna_sim_pair_150bp": {"accuracy": 0.5595, "f1": 0.6436084142394822}, "dna_sim_pair_50bp": {"accuracy": 0.485, "f1": 0.6411149825783972}, "protein_sim_pair_150bp": {"accuracy": 0.9088888888888889, "f1": 0.9096916299559471}, "protein_sim_pair_450bp": {"accuracy": 0.9283333333333333, "f1": 0.9207129686539643}, "dna_protein_pair": {"accuracy": 0.515, "f1": 0.11009174311926606}, "dna_protein_pair_100": {"accuracy": 0.525, "f1": 0.020618556701030927}, "dna_protein_pair_full": {"accuracy": 0.4675, "f1": 0.0}, "dna_protein_pair_rand": {"accuracy": 0.51875, "f1": 0.12100456621004566}, "dna_protein_pair_rand_100": {"accuracy": 0.51375, "f1": 0.005115089514066497}, "dna_protein_pair_rand_full": {"accuracy": 0.491875, "f1": 0.0}}
|
| 15 |
+
{"seed": 15, "en": {"accuracy": 0.8955, "f1": 0.8892421833598304}, "fr": {"accuracy": 0.803, "f1": 0.8081791626095424}, "de": {"accuracy": 0.7685, "f1": 0.7655696202531646}, "zh": {"accuracy": 0.7205, "f1": 0.7183879093198993}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9452777777777778, "f1": 0.9423808131032466}, "dna_sim_pair_150bp": {"accuracy": 0.8075, "f1": 0.7543075941289088}, "dna_sim_pair_50bp": {"accuracy": 0.6715, "f1": 0.5186813186813187}, "protein_sim_pair_150bp": {"accuracy": 0.9416666666666667, "f1": 0.9392712550607287}, "protein_sim_pair_450bp": {"accuracy": 0.9666666666666667, "f1": 0.9661399548532731}, "dna_protein_pair": {"accuracy": 0.4925, "f1": 0.00975609756097561}, "dna_protein_pair_100": {"accuracy": 0.495, "f1": 0.0}, "dna_protein_pair_full": {"accuracy": 0.5075, "f1": 0.010050251256281407}, "dna_protein_pair_rand": {"accuracy": 0.52, "f1": 0.017902813299232736}, "dna_protein_pair_rand_100": {"accuracy": 0.520625, "f1": 0.005188067444876783}, "dna_protein_pair_rand_full": {"accuracy": 0.49875, "f1": 0.0024875621890547263}}
|
| 16 |
+
{"seed": 16, "en": {"accuracy": 0.877, "f1": 0.8695652173913043}, "fr": {"accuracy": 0.795, "f1": 0.7988223748773308}, "de": {"accuracy": 0.7535, "f1": 0.7598636142230881}, "zh": {"accuracy": 0.7175, "f1": 0.6904109589041096}, "dna_sim_pair_simple_150bp": {"accuracy": 0.7672222222222222, "f1": 0.8031015037593985}, "dna_sim_pair_150bp": {"accuracy": 0.67225, "f1": 0.7074313769247935}, "dna_sim_pair_50bp": {"accuracy": 0.466, "f1": 0.6339958875942426}, "protein_sim_pair_150bp": {"accuracy": 0.9316666666666666, "f1": 0.9321566464423607}, "protein_sim_pair_450bp": {"accuracy": 0.8872222222222222, "f1": 0.886781929726715}, "dna_protein_pair": {"accuracy": 0.525, "f1": 0.030612244897959183}, "dna_protein_pair_100": {"accuracy": 0.515, "f1": 0.020202020202020204}, "dna_protein_pair_full": {"accuracy": 0.4425, "f1": 0.19494584837545126}, "dna_protein_pair_rand": {"accuracy": 0.534375, "f1": 0.09915356711003627}, "dna_protein_pair_rand_100": {"accuracy": 0.514375, "f1": 0.06498194945848375}, "dna_protein_pair_rand_full": {"accuracy": 0.57125, "f1": 0.3875}}
|
| 17 |
+
{"seed": 17, "en": {"accuracy": 0.881, "f1": 0.8760416666666667}, "fr": {"accuracy": 0.772, "f1": 0.7853107344632768}, "de": {"accuracy": 0.7535, "f1": 0.7617206379893668}, "zh": {"accuracy": 0.7115, "f1": 0.7084386053562405}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9452777777777778, "f1": 0.9452321378926883}, "dna_sim_pair_150bp": {"accuracy": 0.80475, "f1": 0.7631179860479224}, "dna_sim_pair_50bp": {"accuracy": 0.7765, "f1": 0.780559646539028}, "protein_sim_pair_150bp": {"accuracy": 0.9438888888888889, "f1": 0.9449591280653951}, "protein_sim_pair_450bp": {"accuracy": 0.8183333333333334, "f1": 0.8399412628487518}, "dna_protein_pair": {"accuracy": 0.49, "f1": 0.24444444444444444}, "dna_protein_pair_100": {"accuracy": 0.53, "f1": 0.37748344370860926}, "dna_protein_pair_full": {"accuracy": 0.4275, "f1": 0.5219206680584552}, "dna_protein_pair_rand": {"accuracy": 0.5075, "f1": 0.20724346076458752}, "dna_protein_pair_rand_100": {"accuracy": 0.55875, "f1": 0.400679117147708}, "dna_protein_pair_rand_full": {"accuracy": 0.5275, "f1": 0.5837004405286343}}
|
| 18 |
+
{"seed": 18, "en": {"accuracy": 0.884, "f1": 0.8758029978586723}, "fr": {"accuracy": 0.7905, "f1": 0.7968977217644208}, "de": {"accuracy": 0.749, "f1": 0.7584215591915303}, "zh": {"accuracy": 0.7295, "f1": 0.7142102482831485}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9155555555555556, "f1": 0.9146546883773161}, "dna_sim_pair_150bp": {"accuracy": 0.71725, "f1": 0.6435549952726126}, "dna_sim_pair_50bp": {"accuracy": 0.5355, "f1": 0.6583302684810592}, "protein_sim_pair_150bp": {"accuracy": 0.9538888888888889, "f1": 0.9528140989198408}, "protein_sim_pair_450bp": {"accuracy": 0.9527777777777777, "f1": 0.9512893982808023}, "dna_protein_pair": {"accuracy": 0.5525, "f1": 0.05291005291005291}, "dna_protein_pair_100": {"accuracy": 0.4925, "f1": 0.0}, "dna_protein_pair_full": {"accuracy": 0.5075, "f1": 0.01990049751243781}, "dna_protein_pair_rand": {"accuracy": 0.51125, "f1": 0.041666666666666664}, "dna_protein_pair_rand_100": {"accuracy": 0.47625, "f1": 0.004750593824228029}, "dna_protein_pair_rand_full": {"accuracy": 0.545625, "f1": 0.12091898428053205}}
|
| 19 |
+
{"seed": 19, "en": {"accuracy": 0.8985, "f1": 0.8908015061861215}, "fr": {"accuracy": 0.7955, "f1": 0.8013598834385625}, "de": {"accuracy": 0.7795, "f1": 0.776255707762557}, "zh": {"accuracy": 0.725, "f1": 0.7132429614181439}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9572222222222222, "f1": 0.9553364269141531}, "dna_sim_pair_150bp": {"accuracy": 0.82225, "f1": 0.7853908843948083}, "dna_sim_pair_50bp": {"accuracy": 0.741, "f1": 0.7349027635619243}, "protein_sim_pair_150bp": {"accuracy": 0.9566666666666667, "f1": 0.9573304157549234}, "protein_sim_pair_450bp": {"accuracy": 0.9727777777777777, "f1": 0.9731506849315068}, "dna_protein_pair": {"accuracy": 0.51, "f1": 0.0297029702970297}, "dna_protein_pair_100": {"accuracy": 0.48, "f1": 0.028037383177570093}, "dna_protein_pair_full": {"accuracy": 0.47, "f1": 0.061946902654867256}, "dna_protein_pair_rand": {"accuracy": 0.500625, "f1": 0.04767580452920143}, "dna_protein_pair_rand_100": {"accuracy": 0.4625, "f1": 0.03803131991051454}, "dna_protein_pair_rand_full": {"accuracy": 0.515, "f1": 0.12415349887133183}}
|
| 20 |
+
{"seed": 20, "en": {"accuracy": 0.872, "f1": 0.8651211801896733}, "fr": {"accuracy": 0.779, "f1": 0.7833333333333333}, "de": {"accuracy": 0.7525, "f1": 0.7521281922884326}, "zh": {"accuracy": 0.734, "f1": 0.7007874015748031}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9041666666666667, "f1": 0.9040867389491243}, "dna_sim_pair_150bp": {"accuracy": 0.742, "f1": 0.693950177935943}, "dna_sim_pair_50bp": {"accuracy": 0.673, "f1": 0.7154046997389034}, "protein_sim_pair_150bp": {"accuracy": 0.9355555555555556, "f1": 0.9347581552305961}, "protein_sim_pair_450bp": {"accuracy": 0.9472222222222222, "f1": 0.9446709376820035}, "dna_protein_pair": {"accuracy": 0.48, "f1": 0.037037037037037035}, "dna_protein_pair_100": {"accuracy": 0.5325, "f1": 0.0}, "dna_protein_pair_full": {"accuracy": 0.4725, "f1": 0.0}, "dna_protein_pair_rand": {"accuracy": 0.51125, "f1": 0.041666666666666664}, "dna_protein_pair_rand_100": {"accuracy": 0.505625, "f1": 0.0}, "dna_protein_pair_rand_full": {"accuracy": 0.519375, "f1": 0.015364916773367477}}
|
| 21 |
+
{"seed": 22, "en": {"accuracy": 0.882, "f1": 0.8737967914438503}, "fr": {"accuracy": 0.782, "f1": 0.7895752895752896}, "de": {"accuracy": 0.7575, "f1": 0.7597820703318474}, "zh": {"accuracy": 0.726, "f1": 0.7018498367791077}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9222222222222223, "f1": 0.92}, "dna_sim_pair_150bp": {"accuracy": 0.8355, "f1": 0.8183324130314743}, "dna_sim_pair_50bp": {"accuracy": 0.617, "f1": 0.6770657672849916}, "protein_sim_pair_150bp": {"accuracy": 0.8383333333333334, "f1": 0.8581179912237933}, "protein_sim_pair_450bp": {"accuracy": 0.8461111111111111, "f1": 0.8661188980183664}, "dna_protein_pair": {"accuracy": 0.515, "f1": 0.049019607843137254}, "dna_protein_pair_100": {"accuracy": 0.55, "f1": 0.19642857142857142}, "dna_protein_pair_full": {"accuracy": 0.3575, "f1": 0.48906560636182905}, "dna_protein_pair_rand": {"accuracy": 0.535625, "f1": 0.1753607103218646}, "dna_protein_pair_rand_100": {"accuracy": 0.545, "f1": 0.30134357005758156}, "dna_protein_pair_rand_full": {"accuracy": 0.578125, "f1": 0.6441750131787032}}
|
| 22 |
+
{"seed": 23, "en": {"accuracy": 0.88, "f1": 0.8727465535524921}, "fr": {"accuracy": 0.7975, "f1": 0.7978032950574139}, "de": {"accuracy": 0.768, "f1": 0.760577915376677}, "zh": {"accuracy": 0.715, "f1": 0.6928879310344828}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9355555555555556, "f1": 0.9343891402714932}, "dna_sim_pair_150bp": {"accuracy": 0.747, "f1": 0.6993464052287581}, "dna_sim_pair_50bp": {"accuracy": 0.724, "f1": 0.7639007698887939}, "protein_sim_pair_150bp": {"accuracy": 0.8972222222222223, "f1": 0.8980716253443526}, "protein_sim_pair_450bp": {"accuracy": 0.9283333333333333, "f1": 0.9270774448841154}, "dna_protein_pair": {"accuracy": 0.5175, "f1": 0.06763285024154589}, "dna_protein_pair_100": {"accuracy": 0.5275, "f1": 0.02072538860103627}, "dna_protein_pair_full": {"accuracy": 0.4275, "f1": 0.14232209737827714}, "dna_protein_pair_rand": {"accuracy": 0.53375, "f1": 0.12028301886792453}, "dna_protein_pair_rand_100": {"accuracy": 0.504375, "f1": 0.02937576499388005}, "dna_protein_pair_rand_full": {"accuracy": 0.5425, "f1": 0.2294736842105263}}
|
| 23 |
+
{"seed": 24, "en": {"accuracy": 0.8735, "f1": 0.8701898409440739}, "fr": {"accuracy": 0.7905, "f1": 0.797682279092226}, "de": {"accuracy": 0.74, "f1": 0.7549481621112158}, "zh": {"accuracy": 0.7135, "f1": 0.7041817243159525}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9419444444444445, "f1": 0.9409103760248798}, "dna_sim_pair_150bp": {"accuracy": 0.75075, "f1": 0.6650990930466913}, "dna_sim_pair_50bp": {"accuracy": 0.632, "f1": 0.6912751677852349}, "protein_sim_pair_150bp": {"accuracy": 0.9472222222222222, "f1": 0.9440847557386698}, "protein_sim_pair_450bp": {"accuracy": 0.8816666666666667, "f1": 0.8671241422333126}, "dna_protein_pair": {"accuracy": 0.47, "f1": 0.0}, "dna_protein_pair_100": {"accuracy": 0.4675, "f1": 0.0}, "dna_protein_pair_full": {"accuracy": 0.5275, "f1": 0.03076923076923077}, "dna_protein_pair_rand": {"accuracy": 0.485, "f1": 0.009615384615384616}, "dna_protein_pair_rand_100": {"accuracy": 0.49875, "f1": 0.0}, "dna_protein_pair_rand_full": {"accuracy": 0.5025, "f1": 0.009950248756218905}}
|
| 24 |
+
{"seed": 25, "en": {"accuracy": 0.8795, "f1": 0.8733578560168156}, "fr": {"accuracy": 0.7755, "f1": 0.7852702056432329}, "de": {"accuracy": 0.771, "f1": 0.7607105538140021}, "zh": {"accuracy": 0.706, "f1": 0.6842105263157895}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9325, "f1": 0.9326683291770573}, "dna_sim_pair_150bp": {"accuracy": 0.77225, "f1": 0.7369332948310713}, "dna_sim_pair_50bp": {"accuracy": 0.5825, "f1": 0.6840711312902006}, "protein_sim_pair_150bp": {"accuracy": 0.9466666666666667, "f1": 0.9470198675496688}, "protein_sim_pair_450bp": {"accuracy": 0.8538888888888889, "f1": 0.8383527965580824}, "dna_protein_pair": {"accuracy": 0.535, "f1": 0.10576923076923077}, "dna_protein_pair_100": {"accuracy": 0.5325, "f1": 0.010582010582010581}, "dna_protein_pair_full": {"accuracy": 0.4675, "f1": 0.018433179723502304}, "dna_protein_pair_rand": {"accuracy": 0.5275, "f1": 0.12296983758700696}, "dna_protein_pair_rand_100": {"accuracy": 0.52875, "f1": 0.0893719806763285}, "dna_protein_pair_rand_full": {"accuracy": 0.5175, "f1": 0.05623471882640587}}
|
| 25 |
+
{"seed": 26, "en": {"accuracy": 0.888, "f1": 0.8827225130890053}, "fr": {"accuracy": 0.8, "f1": 0.8073217726396917}, "de": {"accuracy": 0.7645, "f1": 0.7678659438146871}, "zh": {"accuracy": 0.7145, "f1": 0.7094147582697201}, "dna_sim_pair_simple_150bp": {"accuracy": 0.935, "f1": 0.9311359623307828}, "dna_sim_pair_150bp": {"accuracy": 0.751, "f1": 0.6682211858760826}, "dna_sim_pair_50bp": {"accuracy": 0.7545, "f1": 0.6906112161310649}, "protein_sim_pair_150bp": {"accuracy": 0.9283333333333333, "f1": 0.9308310991957105}, "protein_sim_pair_450bp": {"accuracy": 0.8711111111111111, "f1": 0.8576687116564418}, "dna_protein_pair": {"accuracy": 0.47, "f1": 0.12396694214876033}, "dna_protein_pair_100": {"accuracy": 0.495, "f1": 0.06481481481481481}, "dna_protein_pair_full": {"accuracy": 0.5325, "f1": 0.13023255813953488}, "dna_protein_pair_rand": {"accuracy": 0.5025, "f1": 0.17083333333333334}, "dna_protein_pair_rand_100": {"accuracy": 0.5125, "f1": 0.051094890510948905}, "dna_protein_pair_rand_full": {"accuracy": 0.555625, "f1": 0.2767039674465921}}
|
| 26 |
+
{"seed": 27, "en": {"accuracy": 0.8905, "f1": 0.8807838867719108}, "fr": {"accuracy": 0.7985, "f1": 0.8027410670582477}, "de": {"accuracy": 0.7705, "f1": 0.771072319201995}, "zh": {"accuracy": 0.727, "f1": 0.7039045553145337}, "dna_sim_pair_simple_150bp": {"accuracy": 0.925, "f1": 0.9259462424574877}, "dna_sim_pair_150bp": {"accuracy": 0.7845, "f1": 0.7562217194570136}, "dna_sim_pair_50bp": {"accuracy": 0.6275, "f1": 0.707728520988623}, "protein_sim_pair_150bp": {"accuracy": 0.7683333333333333, "f1": 0.8093278463648834}, "protein_sim_pair_450bp": {"accuracy": 0.8972222222222223, "f1": 0.8988518316019682}, "dna_protein_pair": {"accuracy": 0.5, "f1": 0.12280701754385964}, "dna_protein_pair_100": {"accuracy": 0.5075, "f1": 0.0}, "dna_protein_pair_full": {"accuracy": 0.5325, "f1": 0.020942408376963352}, "dna_protein_pair_rand": {"accuracy": 0.5325, "f1": 0.20930232558139536}, "dna_protein_pair_rand_100": {"accuracy": 0.52625, "f1": 0.04050632911392405}, "dna_protein_pair_rand_full": {"accuracy": 0.550625, "f1": 0.1764032073310424}}
|
| 27 |
+
{"seed": 28, "en": {"accuracy": 0.8895, "f1": 0.8815013404825738}, "fr": {"accuracy": 0.803, "f1": 0.7947916666666667}, "de": {"accuracy": 0.778, "f1": 0.7675392670157068}, "zh": {"accuracy": 0.7335, "f1": 0.7139023081052066}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9344444444444444, "f1": 0.9334085778781038}, "dna_sim_pair_150bp": {"accuracy": 0.72525, "f1": 0.6650411459920755}, "dna_sim_pair_50bp": {"accuracy": 0.602, "f1": 0.7084249084249085}, "protein_sim_pair_150bp": {"accuracy": 0.9538888888888889, "f1": 0.952050837666089}, "protein_sim_pair_450bp": {"accuracy": 0.8344444444444444, "f1": 0.8054830287206266}, "dna_protein_pair": {"accuracy": 0.4675, "f1": 0.009302325581395349}, "dna_protein_pair_100": {"accuracy": 0.4725, "f1": 0.0}, "dna_protein_pair_full": {"accuracy": 0.4875, "f1": 0.0}, "dna_protein_pair_rand": {"accuracy": 0.493125, "f1": 0.01696969696969697}, "dna_protein_pair_rand_100": {"accuracy": 0.491875, "f1": 0.0}, "dna_protein_pair_rand_full": {"accuracy": 0.5025, "f1": 0.005}}
|
| 28 |
+
{"seed": 29, "en": {"accuracy": 0.883, "f1": 0.8769716088328076}, "fr": {"accuracy": 0.791, "f1": 0.7944936086529006}, "de": {"accuracy": 0.755, "f1": 0.7635135135135135}, "zh": {"accuracy": 0.707, "f1": 0.7004089979550102}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9533333333333334, "f1": 0.9537953795379538}, "dna_sim_pair_150bp": {"accuracy": 0.797, "f1": 0.7597633136094675}, "dna_sim_pair_50bp": {"accuracy": 0.6665, "f1": 0.7267513314215486}, "protein_sim_pair_150bp": {"accuracy": 0.9055555555555556, "f1": 0.915506958250497}, "protein_sim_pair_450bp": {"accuracy": 0.81, "f1": 0.773209549071618}, "dna_protein_pair": {"accuracy": 0.54, "f1": 0.1559633027522936}, "dna_protein_pair_100": {"accuracy": 0.545, "f1": 0.031914893617021274}, "dna_protein_pair_full": {"accuracy": 0.4875, "f1": 0.055299539170506916}, "dna_protein_pair_rand": {"accuracy": 0.553125, "f1": 0.2969518190757129}, "dna_protein_pair_rand_100": {"accuracy": 0.516875, "f1": 0.07202881152460984}, "dna_protein_pair_rand_full": {"accuracy": 0.5275, "f1": 0.11267605633802817}}
|
| 29 |
+
{"seed": 30, "en": {"accuracy": 0.8895, "f1": 0.8836229594523434}, "fr": {"accuracy": 0.7805, "f1": 0.7932171455487518}, "de": {"accuracy": 0.757, "f1": 0.7629268292682927}, "zh": {"accuracy": 0.734, "f1": 0.718816067653277}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9388888888888889, "f1": 0.9386503067484663}, "dna_sim_pair_150bp": {"accuracy": 0.83525, "f1": 0.8192043895747599}, "dna_sim_pair_50bp": {"accuracy": 0.666, "f1": 0.7155025553662692}, "protein_sim_pair_150bp": {"accuracy": 0.8966666666666666, "f1": 0.905295315682281}, "protein_sim_pair_450bp": {"accuracy": 0.9094444444444445, "f1": 0.9139841688654353}, "dna_protein_pair": {"accuracy": 0.4975, "f1": 0.14468085106382977}, "dna_protein_pair_100": {"accuracy": 0.5325, "f1": 0.06965174129353234}, "dna_protein_pair_full": {"accuracy": 0.5225, "f1": 0.1511111111111111}, "dna_protein_pair_rand": {"accuracy": 0.53125, "f1": 0.19181034482758622}, "dna_protein_pair_rand_100": {"accuracy": 0.538125, "f1": 0.1515499425947187}, "dna_protein_pair_rand_full": {"accuracy": 0.516875, "f1": 0.1245753114382786}}
|
| 30 |
+
{"seed": 31, "en": {"accuracy": 0.8675, "f1": 0.8637532133676092}, "fr": {"accuracy": 0.755, "f1": 0.7774750227066304}, "de": {"accuracy": 0.748, "f1": 0.7644859813084112}, "zh": {"accuracy": 0.7105, "f1": 0.7230989956958394}, "dna_sim_pair_simple_150bp": {"accuracy": 0.8333333333333334, "f1": 0.8527245949926362}, "dna_sim_pair_150bp": {"accuracy": 0.74375, "f1": 0.7511531925224569}, "dna_sim_pair_50bp": {"accuracy": 0.498, "f1": 0.6624075319435104}, "protein_sim_pair_150bp": {"accuracy": 0.8222222222222222, "f1": 0.8461538461538461}, "protein_sim_pair_450bp": {"accuracy": 0.9288888888888889, "f1": 0.9262672811059908}, "dna_protein_pair": {"accuracy": 0.435, "f1": 0.3651685393258427}, "dna_protein_pair_100": {"accuracy": 0.4575, "f1": 0.1685823754789272}, "dna_protein_pair_full": {"accuracy": 0.3725, "f1": 0.512621359223301}, "dna_protein_pair_rand": {"accuracy": 0.511875, "f1": 0.37269076305220883}, "dna_protein_pair_rand_100": {"accuracy": 0.51375, "f1": 0.15250544662309368}, "dna_protein_pair_rand_full": {"accuracy": 0.464375, "f1": 0.5598356445814073}}
|
| 31 |
+
{"seed": 32, "en": {"accuracy": 0.882, "f1": 0.8747346072186837}, "fr": {"accuracy": 0.7955, "f1": 0.7970223325062035}, "de": {"accuracy": 0.7675, "f1": 0.7635993899339095}, "zh": {"accuracy": 0.708, "f1": 0.7165048543689321}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9505555555555556, "f1": 0.9480443666082895}, "dna_sim_pair_150bp": {"accuracy": 0.75125, "f1": 0.6801671488267438}, "dna_sim_pair_50bp": {"accuracy": 0.787, "f1": 0.7633333333333333}, "protein_sim_pair_150bp": {"accuracy": 0.9661111111111111, "f1": 0.9661674986134221}, "protein_sim_pair_450bp": {"accuracy": 0.9144444444444444, "f1": 0.903387703889586}, "dna_protein_pair": {"accuracy": 0.5025, "f1": 0.019704433497536946}, "dna_protein_pair_100": {"accuracy": 0.46, "f1": 0.0}, "dna_protein_pair_full": {"accuracy": 0.54, "f1": 0.0}, "dna_protein_pair_rand": {"accuracy": 0.50875, "f1": 0.02962962962962963}, "dna_protein_pair_rand_100": {"accuracy": 0.496875, "f1": 0.0024783147459727386}, "dna_protein_pair_rand_full": {"accuracy": 0.5, "f1": 0.007444168734491315}}
|
| 32 |
+
{"seed": 33, "en": {"accuracy": 0.87, "f1": 0.8576122672508215}, "fr": {"accuracy": 0.7915, "f1": 0.785824345146379}, "de": {"accuracy": 0.7695, "f1": 0.7385138967668746}, "zh": {"accuracy": 0.718, "f1": 0.7065556711758585}, "dna_sim_pair_simple_150bp": {"accuracy": 0.5802777777777778, "f1": 0.6843534572801337}, "dna_sim_pair_150bp": {"accuracy": 0.515, "f1": 0.6231546231546231}, "dna_sim_pair_50bp": {"accuracy": 0.4905, "f1": 0.6511468675111264}, "protein_sim_pair_150bp": {"accuracy": 0.8583333333333333, "f1": 0.855031267765776}, "protein_sim_pair_450bp": {"accuracy": 0.82, "f1": 0.7931034482758621}, "dna_protein_pair": {"accuracy": 0.58, "f1": 0.288135593220339}, "dna_protein_pair_100": {"accuracy": 0.57, "f1": 0.47878787878787876}, "dna_protein_pair_full": {"accuracy": 0.5275, "f1": 0.1888412017167382}, "dna_protein_pair_rand": {"accuracy": 0.56625, "f1": 0.4403225806451613}, "dna_protein_pair_rand_100": {"accuracy": 0.51, "f1": 0.49872122762148335}, "dna_protein_pair_rand_full": {"accuracy": 0.573125, "f1": 0.3928888888888889}}
|
| 33 |
+
{"seed": 34, "en": {"accuracy": 0.879, "f1": 0.8707264957264957}, "fr": {"accuracy": 0.7825, "f1": 0.7911665866538646}, "de": {"accuracy": 0.765, "f1": 0.7659362549800797}, "zh": {"accuracy": 0.7105, "f1": 0.7023136246786632}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9525, "f1": 0.9524868018894137}, "dna_sim_pair_150bp": {"accuracy": 0.857, "f1": 0.8405797101449275}, "dna_sim_pair_50bp": {"accuracy": 0.749, "f1": 0.7972536348949919}, "protein_sim_pair_150bp": {"accuracy": 0.9072222222222223, "f1": 0.9117802430005283}, "protein_sim_pair_450bp": {"accuracy": 0.8788888888888889, "f1": 0.8716136631330977}, "dna_protein_pair": {"accuracy": 0.555, "f1": 0.03260869565217391}, "dna_protein_pair_100": {"accuracy": 0.525, "f1": 0.010416666666666666}, "dna_protein_pair_full": {"accuracy": 0.435, "f1": 0.02586206896551724}, "dna_protein_pair_rand": {"accuracy": 0.510625, "f1": 0.0416156670746634}, "dna_protein_pair_rand_100": {"accuracy": 0.529375, "f1": 0.0258732212160414}, "dna_protein_pair_rand_full": {"accuracy": 0.528125, "f1": 0.13516609392898052}}
|
| 34 |
+
{"seed": 35, "en": {"accuracy": 0.894, "f1": 0.8823529411764706}, "fr": {"accuracy": 0.811, "f1": 0.8045501551189245}, "de": {"accuracy": 0.777, "f1": 0.7669801462904912}, "zh": {"accuracy": 0.7325, "f1": 0.7032723239046035}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9419444444444445, "f1": 0.9372937293729373}, "dna_sim_pair_150bp": {"accuracy": 0.68575, "f1": 0.5404021937842779}, "dna_sim_pair_50bp": {"accuracy": 0.736, "f1": 0.6413043478260869}, "protein_sim_pair_150bp": {"accuracy": 0.9344444444444444, "f1": 0.929678188319428}, "protein_sim_pair_450bp": {"accuracy": 0.8605555555555555, "f1": 0.8330006653359947}, "dna_protein_pair": {"accuracy": 0.525, "f1": 0.0}, "dna_protein_pair_100": {"accuracy": 0.4875, "f1": 0.0}, "dna_protein_pair_full": {"accuracy": 0.4925, "f1": 0.0}, "dna_protein_pair_rand": {"accuracy": 0.483125, "f1": 0.0048134777376654635}, "dna_protein_pair_rand_100": {"accuracy": 0.4975, "f1": 0.0}, "dna_protein_pair_rand_full": {"accuracy": 0.495625, "f1": 0.0}}
|
| 35 |
+
{"seed": 36, "en": {"accuracy": 0.8955, "f1": 0.8862275449101796}, "fr": {"accuracy": 0.8045, "f1": 0.7989717223650386}, "de": {"accuracy": 0.7755, "f1": 0.766510660426417}, "zh": {"accuracy": 0.7325, "f1": 0.718272775144813}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9427777777777778, "f1": 0.9418079096045198}, "dna_sim_pair_150bp": {"accuracy": 0.74525, "f1": 0.6788528206744406}, "dna_sim_pair_50bp": {"accuracy": 0.582, "f1": 0.672156862745098}, "protein_sim_pair_150bp": {"accuracy": 0.9377777777777778, "f1": 0.940552016985138}, "protein_sim_pair_450bp": {"accuracy": 0.8783333333333333, "f1": 0.864898210980876}, "dna_protein_pair": {"accuracy": 0.5275, "f1": 0.010471204188481676}, "dna_protein_pair_100": {"accuracy": 0.5125, "f1": 0.0}, "dna_protein_pair_full": {"accuracy": 0.5375, "f1": 0.0}, "dna_protein_pair_rand": {"accuracy": 0.503125, "f1": 0.01486988847583643}, "dna_protein_pair_rand_100": {"accuracy": 0.515625, "f1": 0.002574002574002574}, "dna_protein_pair_rand_full": {"accuracy": 0.48875, "f1": 0.0}}
|
| 36 |
+
{"seed": 37, "en": {"accuracy": 0.8795, "f1": 0.8702207862143242}, "fr": {"accuracy": 0.792, "f1": 0.7956777996070727}, "de": {"accuracy": 0.76, "f1": 0.7538461538461538}, "zh": {"accuracy": 0.7235, "f1": 0.7127272727272728}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9097222222222222, "f1": 0.9002149217070924}, "dna_sim_pair_150bp": {"accuracy": 0.65675, "f1": 0.47933257489571485}, "dna_sim_pair_50bp": {"accuracy": 0.758, "f1": 0.7305122494432071}, "protein_sim_pair_150bp": {"accuracy": 0.9716666666666667, "f1": 0.971900826446281}, "protein_sim_pair_450bp": {"accuracy": 0.9605555555555556, "f1": 0.9589358010410642}, "dna_protein_pair": {"accuracy": 0.48, "f1": 0.0}, "dna_protein_pair_100": {"accuracy": 0.54, "f1": 0.0}, "dna_protein_pair_full": {"accuracy": 0.47, "f1": 0.0}, "dna_protein_pair_rand": {"accuracy": 0.50375, "f1": 0.012437810945273632}, "dna_protein_pair_rand_100": {"accuracy": 0.48125, "f1": 0.002403846153846154}, "dna_protein_pair_rand_full": {"accuracy": 0.4975, "f1": 0.007407407407407408}}
|
| 37 |
+
{"seed": 38, "en": {"accuracy": 0.887, "f1": 0.8825363825363826}, "fr": {"accuracy": 0.779, "f1": 0.790719696969697}, "de": {"accuracy": 0.755, "f1": 0.7574257425742574}, "zh": {"accuracy": 0.7115, "f1": 0.7030365414307771}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9530555555555555, "f1": 0.9527008116428771}, "dna_sim_pair_150bp": {"accuracy": 0.80025, "f1": 0.7631188852653424}, "dna_sim_pair_50bp": {"accuracy": 0.575, "f1": 0.6858832224685883}, "protein_sim_pair_150bp": {"accuracy": 0.8816666666666667, "f1": 0.8913819479857216}, "protein_sim_pair_450bp": {"accuracy": 0.9511111111111111, "f1": 0.9517014270032931}, "dna_protein_pair": {"accuracy": 0.535, "f1": 0.08823529411764706}, "dna_protein_pair_100": {"accuracy": 0.4925, "f1": 0.01932367149758454}, "dna_protein_pair_full": {"accuracy": 0.56, "f1": 0.10204081632653061}, "dna_protein_pair_rand": {"accuracy": 0.531875, "f1": 0.1814207650273224}, "dna_protein_pair_rand_100": {"accuracy": 0.505625, "f1": 0.08342989571263036}, "dna_protein_pair_rand_full": {"accuracy": 0.525, "f1": 0.16483516483516483}}
|
| 38 |
+
{"seed": 40, "en": {"accuracy": 0.895, "f1": 0.8884165781083954}, "fr": {"accuracy": 0.804, "f1": 0.8082191780821918}, "de": {"accuracy": 0.7735, "f1": 0.7742899850523169}, "zh": {"accuracy": 0.7045, "f1": 0.7055306427503737}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9422222222222222, "f1": 0.9411764705882353}, "dna_sim_pair_150bp": {"accuracy": 0.80025, "f1": 0.761989871909443}, "dna_sim_pair_50bp": {"accuracy": 0.6735, "f1": 0.6666666666666666}, "protein_sim_pair_150bp": {"accuracy": 0.975, "f1": 0.9745618993781797}, "protein_sim_pair_450bp": {"accuracy": 0.9477777777777778, "f1": 0.9439809296781884}, "dna_protein_pair": {"accuracy": 0.525, "f1": 0.020618556701030927}, "dna_protein_pair_100": {"accuracy": 0.505, "f1": 0.0}, "dna_protein_pair_full": {"accuracy": 0.5025, "f1": 0.0}, "dna_protein_pair_rand": {"accuracy": 0.518125, "f1": 0.007722007722007722}, "dna_protein_pair_rand_100": {"accuracy": 0.49625, "f1": 0.0024752475247524753}, "dna_protein_pair_rand_full": {"accuracy": 0.489375, "f1": 0.0}}
|
| 39 |
+
{"seed": 41, "en": {"accuracy": 0.8945, "f1": 0.8893550078657577}, "fr": {"accuracy": 0.7835, "f1": 0.7931199235547062}, "de": {"accuracy": 0.7635, "f1": 0.7645594823295172}, "zh": {"accuracy": 0.711, "f1": 0.7026748971193416}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9358333333333333, "f1": 0.930776146239137}, "dna_sim_pair_150bp": {"accuracy": 0.70325, "f1": 0.5750089509488006}, "dna_sim_pair_50bp": {"accuracy": 0.825, "f1": 0.8099891422366993}, "protein_sim_pair_150bp": {"accuracy": 0.935, "f1": 0.9323308270676691}, "protein_sim_pair_450bp": {"accuracy": 0.8722222222222222, "f1": 0.8587223587223587}, "dna_protein_pair": {"accuracy": 0.4575, "f1": 0.0091324200913242}, "dna_protein_pair_100": {"accuracy": 0.5225, "f1": 0.0}, "dna_protein_pair_full": {"accuracy": 0.5375, "f1": 0.0}, "dna_protein_pair_rand": {"accuracy": 0.50625, "f1": 0.0025252525252525255}, "dna_protein_pair_rand_100": {"accuracy": 0.51875, "f1": 0.0}, "dna_protein_pair_rand_full": {"accuracy": 0.5075, "f1": 0.0}}
|
| 40 |
+
{"seed": 42, "en": {"accuracy": 0.8865, "f1": 0.8777598276790523}, "fr": {"accuracy": 0.7995, "f1": 0.7946748591909882}, "de": {"accuracy": 0.773, "f1": 0.7686034658511722}, "zh": {"accuracy": 0.717, "f1": 0.6933911159263272}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9583333333333334, "f1": 0.9569460390355913}, "dna_sim_pair_150bp": {"accuracy": 0.81025, "f1": 0.7633302151543498}, "dna_sim_pair_50bp": {"accuracy": 0.828, "f1": 0.843351548269581}, "protein_sim_pair_150bp": {"accuracy": 0.9772222222222222, "f1": 0.9770821688093907}, "protein_sim_pair_450bp": {"accuracy": 0.885, "f1": 0.8726153846153846}, "dna_protein_pair": {"accuracy": 0.5325, "f1": 0.020942408376963352}, "dna_protein_pair_100": {"accuracy": 0.48, "f1": 0.0}, "dna_protein_pair_full": {"accuracy": 0.475, "f1": 0.0}, "dna_protein_pair_rand": {"accuracy": 0.51625, "f1": 0.034912718204488775}, "dna_protein_pair_rand_100": {"accuracy": 0.50625, "f1": 0.0025252525252525255}, "dna_protein_pair_rand_full": {"accuracy": 0.508125, "f1": 0.0050568900126422255}}
|
| 41 |
+
{"seed": 43, "en": {"accuracy": 0.8895, "f1": 0.8826340945300053}, "fr": {"accuracy": 0.7975, "f1": 0.8013732221677293}, "de": {"accuracy": 0.769, "f1": 0.7694610778443114}, "zh": {"accuracy": 0.7145, "f1": 0.7024491922876498}, "dna_sim_pair_simple_150bp": {"accuracy": 0.945, "f1": 0.9437819420783645}, "dna_sim_pair_150bp": {"accuracy": 0.7815, "f1": 0.7125}, "dna_sim_pair_50bp": {"accuracy": 0.8105, "f1": 0.7761370348493798}, "protein_sim_pair_150bp": {"accuracy": 0.9561111111111111, "f1": 0.9566648381788261}, "protein_sim_pair_450bp": {"accuracy": 0.9411111111111111, "f1": 0.9394977168949772}, "dna_protein_pair": {"accuracy": 0.5025, "f1": 0.019704433497536946}, "dna_protein_pair_100": {"accuracy": 0.53, "f1": 0.0}, "dna_protein_pair_full": {"accuracy": 0.51, "f1": 0.02}, "dna_protein_pair_rand": {"accuracy": 0.5, "f1": 0.02912621359223301}, "dna_protein_pair_rand_100": {"accuracy": 0.495625, "f1": 0.007380073800738007}, "dna_protein_pair_rand_full": {"accuracy": 0.51625, "f1": 0.11238532110091744}}
|
| 42 |
+
{"seed": 44, "en": {"accuracy": 0.89, "f1": 0.8822269807280514}, "fr": {"accuracy": 0.7965, "f1": 0.8}, "de": {"accuracy": 0.7715, "f1": 0.7669556348801632}, "zh": {"accuracy": 0.7265, "f1": 0.7266366816591704}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9227777777777778, "f1": 0.9227777777777778}, "dna_sim_pair_150bp": {"accuracy": 0.7145, "f1": 0.6383787207093097}, "dna_sim_pair_50bp": {"accuracy": 0.583, "f1": 0.6669329073482428}, "protein_sim_pair_150bp": {"accuracy": 0.895, "f1": 0.9009952854897852}, "protein_sim_pair_450bp": {"accuracy": 0.8855555555555555, "f1": 0.8762019230769231}, "dna_protein_pair": {"accuracy": 0.55, "f1": 0.0425531914893617}, "dna_protein_pair_100": {"accuracy": 0.495, "f1": 0.0}, "dna_protein_pair_full": {"accuracy": 0.495, "f1": 0.0380952380952381}, "dna_protein_pair_rand": {"accuracy": 0.52, "f1": 0.03759398496240601}, "dna_protein_pair_rand_100": {"accuracy": 0.501875, "f1": 0.02447980416156671}, "dna_protein_pair_rand_full": {"accuracy": 0.530625, "f1": 0.18102508178844057}}
|
| 43 |
+
{"seed": 46, "en": {"accuracy": 0.881, "f1": 0.8717672413793104}, "fr": {"accuracy": 0.799, "f1": 0.7981927710843374}, "de": {"accuracy": 0.7675, "f1": 0.7606793618116315}, "zh": {"accuracy": 0.719, "f1": 0.6938997821350763}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9386111111111111, "f1": 0.9387981168651343}, "dna_sim_pair_150bp": {"accuracy": 0.84275, "f1": 0.8282828282828283}, "dna_sim_pair_50bp": {"accuracy": 0.669, "f1": 0.7328490718321227}, "protein_sim_pair_150bp": {"accuracy": 0.9388888888888889, "f1": 0.9405405405405406}, "protein_sim_pair_450bp": {"accuracy": 0.9205555555555556, "f1": 0.9228278467350243}, "dna_protein_pair": {"accuracy": 0.4975, "f1": 0.04739336492890995}, "dna_protein_pair_100": {"accuracy": 0.5325, "f1": 0.06030150753768844}, "dna_protein_pair_full": {"accuracy": 0.52, "f1": 0.08571428571428572}, "dna_protein_pair_rand": {"accuracy": 0.5, "f1": 0.0888382687927107}, "dna_protein_pair_rand_100": {"accuracy": 0.5175, "f1": 0.10440835266821345}, "dna_protein_pair_rand_full": {"accuracy": 0.531875, "f1": 0.1380897583429229}}
|
| 44 |
+
{"seed": 47, "en": {"accuracy": 0.834, "f1": 0.8224598930481284}, "fr": {"accuracy": 0.7635, "f1": 0.7689301416707377}, "de": {"accuracy": 0.749, "f1": 0.7507447864945382}, "zh": {"accuracy": 0.6865, "f1": 0.7041057102406796}, "dna_sim_pair_simple_150bp": {"accuracy": 0.7297222222222223, "f1": 0.7819852117409815}, "dna_sim_pair_150bp": {"accuracy": 0.65575, "f1": 0.7032967032967034}, "dna_sim_pair_50bp": {"accuracy": 0.4865, "f1": 0.6524534686971235}, "protein_sim_pair_150bp": {"accuracy": 0.5927777777777777, "f1": 0.716441005802708}, "protein_sim_pair_450bp": {"accuracy": 0.7344444444444445, "f1": 0.7864164432529044}, "dna_protein_pair": {"accuracy": 0.4925, "f1": 0.3867069486404834}, "dna_protein_pair_100": {"accuracy": 0.49, "f1": 0.08108108108108109}, "dna_protein_pair_full": {"accuracy": 0.4025, "f1": 0.37597911227154046}, "dna_protein_pair_rand": {"accuracy": 0.59375, "f1": 0.5015337423312883}, "dna_protein_pair_rand_100": {"accuracy": 0.536875, "f1": 0.21752903907074975}, "dna_protein_pair_rand_full": {"accuracy": 0.611875, "f1": 0.5806887238352465}}
|
| 45 |
+
{"seed": 48, "en": {"accuracy": 0.8855, "f1": 0.8785145888594165}, "fr": {"accuracy": 0.8, "f1": 0.8054474708171206}, "de": {"accuracy": 0.7585, "f1": 0.7509025270758123}, "zh": {"accuracy": 0.7235, "f1": 0.6939679026009962}, "dna_sim_pair_simple_150bp": {"accuracy": 0.8538888888888889, "f1": 0.8681043129388164}, "dna_sim_pair_150bp": {"accuracy": 0.76, "f1": 0.7650513950073421}, "dna_sim_pair_50bp": {"accuracy": 0.4805, "f1": 0.6386086956521739}, "protein_sim_pair_150bp": {"accuracy": 0.9316666666666666, "f1": 0.9336211548839719}, "protein_sim_pair_450bp": {"accuracy": 0.8783333333333333, "f1": 0.8667072428484479}, "dna_protein_pair": {"accuracy": 0.495, "f1": 0.11403508771929824}, "dna_protein_pair_100": {"accuracy": 0.485, "f1": 0.009615384615384616}, "dna_protein_pair_full": {"accuracy": 0.49, "f1": 0.009708737864077669}, "dna_protein_pair_rand": {"accuracy": 0.534375, "f1": 0.15819209039548024}, "dna_protein_pair_rand_100": {"accuracy": 0.475, "f1": 0.011764705882352941}, "dna_protein_pair_rand_full": {"accuracy": 0.5125, "f1": 0.007633587786259542}}
|
| 46 |
+
{"seed": 49, "en": {"accuracy": 0.8725, "f1": 0.8665620094191523}, "fr": {"accuracy": 0.7945, "f1": 0.7984306032368809}, "de": {"accuracy": 0.757, "f1": 0.7633885102239533}, "zh": {"accuracy": 0.709, "f1": 0.7133004926108374}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9344444444444444, "f1": 0.9349503858875413}, "dna_sim_pair_150bp": {"accuracy": 0.761, "f1": 0.7193188490898414}, "dna_sim_pair_50bp": {"accuracy": 0.5405, "f1": 0.6866689396522332}, "protein_sim_pair_150bp": {"accuracy": 0.8266666666666667, "f1": 0.8469087340529932}, "protein_sim_pair_450bp": {"accuracy": 0.9038888888888889, "f1": 0.904367053620785}, "dna_protein_pair": {"accuracy": 0.51, "f1": 0.11711711711711711}, "dna_protein_pair_100": {"accuracy": 0.5225, "f1": 0.059113300492610835}, "dna_protein_pair_full": {"accuracy": 0.415, "f1": 0.39690721649484534}, "dna_protein_pair_rand": {"accuracy": 0.516875, "f1": 0.14773980154355015}, "dna_protein_pair_rand_100": {"accuracy": 0.518125, "f1": 0.089728453364817}, "dna_protein_pair_rand_full": {"accuracy": 0.594375, "f1": 0.5647216633132126}}
|
| 47 |
+
{"seed": 50, "en": {"accuracy": 0.8835, "f1": 0.8739859383450513}, "fr": {"accuracy": 0.8145, "f1": 0.8086642599277978}, "de": {"accuracy": 0.7875, "f1": 0.7703943814154511}, "zh": {"accuracy": 0.721, "f1": 0.6807780320366132}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9288888888888889, "f1": 0.9247943595769683}, "dna_sim_pair_150bp": {"accuracy": 0.643, "f1": 0.4408770555990603}, "dna_sim_pair_50bp": {"accuracy": 0.601, "f1": 0.6329346826126955}, "protein_sim_pair_150bp": {"accuracy": 0.925, "f1": 0.9175320708613317}, "protein_sim_pair_450bp": {"accuracy": 0.8094444444444444, "f1": 0.7687120701281187}, "dna_protein_pair": {"accuracy": 0.5075, "f1": 0.0}, "dna_protein_pair_100": {"accuracy": 0.4775, "f1": 0.0}, "dna_protein_pair_full": {"accuracy": 0.515, "f1": 0.0}, "dna_protein_pair_rand": {"accuracy": 0.51, "f1": 0.0}, "dna_protein_pair_rand_100": {"accuracy": 0.509375, "f1": 0.0}, "dna_protein_pair_rand_full": {"accuracy": 0.509375, "f1": 0.0025412960609911056}}
|
| 48 |
+
{"seed": 51, "en": {"accuracy": 0.8515, "f1": 0.837793555434189}, "fr": {"accuracy": 0.787, "f1": 0.780638516992791}, "de": {"accuracy": 0.7605, "f1": 0.7503908285565398}, "zh": {"accuracy": 0.7315, "f1": 0.6904899135446686}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9180555555555555, "f1": 0.9096477794793262}, "dna_sim_pair_150bp": {"accuracy": 0.71175, "f1": 0.5947275922671353}, "dna_sim_pair_50bp": {"accuracy": 0.575, "f1": 0.5350109409190372}, "protein_sim_pair_150bp": {"accuracy": 0.9122222222222223, "f1": 0.9035409035409036}, "protein_sim_pair_450bp": {"accuracy": 0.9566666666666667, "f1": 0.9544924154025671}, "dna_protein_pair": {"accuracy": 0.51, "f1": 0.0}, "dna_protein_pair_100": {"accuracy": 0.505, "f1": 0.0}, "dna_protein_pair_full": {"accuracy": 0.425, "f1": 0.29878048780487804}, "dna_protein_pair_rand": {"accuracy": 0.50875, "f1": 0.0025380710659898475}, "dna_protein_pair_rand_100": {"accuracy": 0.499375, "f1": 0.0}, "dna_protein_pair_rand_full": {"accuracy": 0.569375, "f1": 0.40034812880765885}}
|
| 49 |
+
{"seed": 52, "en": {"accuracy": 0.8845, "f1": 0.8798751950078003}, "fr": {"accuracy": 0.7985, "f1": 0.802547770700637}, "de": {"accuracy": 0.767, "f1": 0.7761767531219981}, "zh": {"accuracy": 0.7185, "f1": 0.7087428867046043}, "dna_sim_pair_simple_150bp": {"accuracy": 0.8886111111111111, "f1": 0.898455305140542}, "dna_sim_pair_150bp": {"accuracy": 0.817, "f1": 0.8174563591022443}, "dna_sim_pair_50bp": {"accuracy": 0.6125, "f1": 0.689875950380152}, "protein_sim_pair_150bp": {"accuracy": 0.6583333333333333, "f1": 0.7453416149068323}, "protein_sim_pair_450bp": {"accuracy": 0.9066666666666666, "f1": 0.9127725856697819}, "dna_protein_pair": {"accuracy": 0.5125, "f1": 0.2471042471042471}, "dna_protein_pair_100": {"accuracy": 0.4775, "f1": 0.11814345991561181}, "dna_protein_pair_full": {"accuracy": 0.4425, "f1": 0.4053333333333333}, "dna_protein_pair_rand": {"accuracy": 0.55875, "f1": 0.36510791366906475}, "dna_protein_pair_rand_100": {"accuracy": 0.535, "f1": 0.25301204819277107}, "dna_protein_pair_rand_full": {"accuracy": 0.66625, "f1": 0.6817640047675805}}
|
| 50 |
+
{"seed": 53, "en": {"accuracy": 0.871, "f1": 0.8679631525076765}, "fr": {"accuracy": 0.7545, "f1": 0.7754915409236397}, "de": {"accuracy": 0.727, "f1": 0.7524932003626473}, "zh": {"accuracy": 0.638, "f1": 0.6957983193277311}, "dna_sim_pair_simple_150bp": {"accuracy": 0.7597222222222222, "f1": 0.8027366020524516}, "dna_sim_pair_150bp": {"accuracy": 0.6675, "f1": 0.7074351077870655}, "dna_sim_pair_50bp": {"accuracy": 0.5065, "f1": 0.6680121089808274}, "protein_sim_pair_150bp": {"accuracy": 0.8994444444444445, "f1": 0.9042834479111581}, "protein_sim_pair_450bp": {"accuracy": 0.925, "f1": 0.9220103986135182}, "dna_protein_pair": {"accuracy": 0.4825, "f1": 0.14107883817427386}, "dna_protein_pair_100": {"accuracy": 0.545, "f1": 0.031914893617021274}, "dna_protein_pair_full": {"accuracy": 0.4925, "f1": 0.0}, "dna_protein_pair_rand": {"accuracy": 0.541875, "f1": 0.22597676874340022}, "dna_protein_pair_rand_100": {"accuracy": 0.480625, "f1": 0.02120141342756184}, "dna_protein_pair_rand_full": {"accuracy": 0.505, "f1": 0.007518796992481203}}
|
| 51 |
+
{"seed": 54, "en": {"accuracy": 0.88, "f1": 0.8740818467995802}, "fr": {"accuracy": 0.767, "f1": 0.7803958529688972}, "de": {"accuracy": 0.761, "f1": 0.767056530214425}, "zh": {"accuracy": 0.694, "f1": 0.7054860442733397}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9063888888888889, "f1": 0.905892208880201}, "dna_sim_pair_150bp": {"accuracy": 0.7835, "f1": 0.7525714285714286}, "dna_sim_pair_50bp": {"accuracy": 0.5625, "f1": 0.6873883529832083}, "protein_sim_pair_150bp": {"accuracy": 0.98, "f1": 0.9793577981651376}, "protein_sim_pair_450bp": {"accuracy": 0.9127777777777778, "f1": 0.9051359516616314}, "dna_protein_pair": {"accuracy": 0.5275, "f1": 0.04060913705583756}, "dna_protein_pair_100": {"accuracy": 0.5075, "f1": 0.0}, "dna_protein_pair_full": {"accuracy": 0.4525, "f1": 0.0}, "dna_protein_pair_rand": {"accuracy": 0.53375, "f1": 0.09466019417475728}, "dna_protein_pair_rand_100": {"accuracy": 0.51, "f1": 0.02}, "dna_protein_pair_rand_full": {"accuracy": 0.490625, "f1": 0.004884004884004884}}
|
| 52 |
+
{"seed": 55, "en": {"accuracy": 0.8785, "f1": 0.867574931880109}, "fr": {"accuracy": 0.8015, "f1": 0.7969309462915601}, "de": {"accuracy": 0.761, "f1": 0.7536082474226804}, "zh": {"accuracy": 0.7275, "f1": 0.7016967706622879}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9163888888888889, "f1": 0.9167358229598893}, "dna_sim_pair_150bp": {"accuracy": 0.77325, "f1": 0.7457246986262966}, "dna_sim_pair_50bp": {"accuracy": 0.692, "f1": 0.7192342752962625}, "protein_sim_pair_150bp": {"accuracy": 0.8861111111111111, "f1": 0.8881614839061648}, "protein_sim_pair_450bp": {"accuracy": 0.8755555555555555, "f1": 0.8761061946902655}, "dna_protein_pair": {"accuracy": 0.505, "f1": 0.16101694915254236}, "dna_protein_pair_100": {"accuracy": 0.4625, "f1": 0.009216589861751152}, "dna_protein_pair_full": {"accuracy": 0.4925, "f1": 0.028708133971291867}, "dna_protein_pair_rand": {"accuracy": 0.524375, "f1": 0.18435155412647375}, "dna_protein_pair_rand_100": {"accuracy": 0.511875, "f1": 0.044063647490820076}, "dna_protein_pair_rand_full": {"accuracy": 0.5475, "f1": 0.2505175983436853}}
|
| 53 |
+
{"seed": 56, "en": {"accuracy": 0.8905, "f1": 0.8836962294211365}, "fr": {"accuracy": 0.7965, "f1": 0.803666184273999}, "de": {"accuracy": 0.763, "f1": 0.76441351888668}, "zh": {"accuracy": 0.716, "f1": 0.7072164948453609}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9258333333333333, "f1": 0.928129205921938}, "dna_sim_pair_150bp": {"accuracy": 0.8035, "f1": 0.7763232783153102}, "dna_sim_pair_50bp": {"accuracy": 0.5485, "f1": 0.6859130434782609}, "protein_sim_pair_150bp": {"accuracy": 0.9416666666666667, "f1": 0.9421487603305785}, "protein_sim_pair_450bp": {"accuracy": 0.9088888888888889, "f1": 0.9021479713603818}, "dna_protein_pair": {"accuracy": 0.51, "f1": 0.06666666666666667}, "dna_protein_pair_100": {"accuracy": 0.4525, "f1": 0.00904977375565611}, "dna_protein_pair_full": {"accuracy": 0.5375, "f1": 0.04145077720207254}, "dna_protein_pair_rand": {"accuracy": 0.498125, "f1": 0.09470124013528748}, "dna_protein_pair_rand_100": {"accuracy": 0.516875, "f1": 0.022756005056890013}, "dna_protein_pair_rand_full": {"accuracy": 0.498125, "f1": 0.04290822407628129}}
|
| 54 |
+
{"seed": 57, "en": {"accuracy": 0.8785, "f1": 0.8662630709961475}, "fr": {"accuracy": 0.809, "f1": 0.8012486992715921}, "de": {"accuracy": 0.762, "f1": 0.7492096944151738}, "zh": {"accuracy": 0.744, "f1": 0.716500553709856}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9330555555555555, "f1": 0.9331484049930652}, "dna_sim_pair_150bp": {"accuracy": 0.72675, "f1": 0.6589703588143526}, "dna_sim_pair_50bp": {"accuracy": 0.495, "f1": 0.5483005366726297}, "protein_sim_pair_150bp": {"accuracy": 0.9361111111111111, "f1": 0.931178934769599}, "protein_sim_pair_450bp": {"accuracy": 0.9422222222222222, "f1": 0.9388954171562868}, "dna_protein_pair": {"accuracy": 0.5325, "f1": 0.010582010582010581}, "dna_protein_pair_100": {"accuracy": 0.515, "f1": 0.0}, "dna_protein_pair_full": {"accuracy": 0.49, "f1": 0.0}, "dna_protein_pair_rand": {"accuracy": 0.50875, "f1": 0.0199501246882793}, "dna_protein_pair_rand_100": {"accuracy": 0.4975, "f1": 0.007407407407407408}, "dna_protein_pair_rand_full": {"accuracy": 0.504375, "f1": 0.0}}
|
| 55 |
+
{"seed": 58, "en": {"accuracy": 0.884, "f1": 0.8744588744588745}, "fr": {"accuracy": 0.7825, "f1": 0.7935453251067869}, "de": {"accuracy": 0.775, "f1": 0.7661122661122661}, "zh": {"accuracy": 0.713, "f1": 0.6985294117647058}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9380555555555555, "f1": 0.9378657007522987}, "dna_sim_pair_150bp": {"accuracy": 0.763, "f1": 0.7219941348973608}, "dna_sim_pair_50bp": {"accuracy": 0.534, "f1": 0.6783988957902002}, "protein_sim_pair_150bp": {"accuracy": 0.8894444444444445, "f1": 0.8958660387231816}, "protein_sim_pair_450bp": {"accuracy": 0.9477777777777778, "f1": 0.948180815876516}, "dna_protein_pair": {"accuracy": 0.515, "f1": 0.10185185185185185}, "dna_protein_pair_100": {"accuracy": 0.5125, "f1": 0.03940886699507389}, "dna_protein_pair_full": {"accuracy": 0.51, "f1": 0.057692307692307696}, "dna_protein_pair_rand": {"accuracy": 0.50625, "f1": 0.1649048625792812}, "dna_protein_pair_rand_100": {"accuracy": 0.503125, "f1": 0.03636363636363636}, "dna_protein_pair_rand_full": {"accuracy": 0.524375, "f1": 0.14205186020293123}}
|
| 56 |
+
{"seed": 59, "en": {"accuracy": 0.885, "f1": 0.8783068783068783}, "fr": {"accuracy": 0.771, "f1": 0.7829383886255924}, "de": {"accuracy": 0.75, "f1": 0.7591522157996147}, "zh": {"accuracy": 0.7105, "f1": 0.7050433010697912}, "dna_sim_pair_simple_150bp": {"accuracy": 0.8911111111111111, "f1": 0.8955780500799148}, "dna_sim_pair_150bp": {"accuracy": 0.773, "f1": 0.7527233115468409}, "dna_sim_pair_50bp": {"accuracy": 0.572, "f1": 0.666406858924396}, "protein_sim_pair_150bp": {"accuracy": 0.9172222222222223, "f1": 0.9148084619782733}, "protein_sim_pair_450bp": {"accuracy": 0.9116666666666666, "f1": 0.9073966220151427}, "dna_protein_pair": {"accuracy": 0.4825, "f1": 0.06334841628959276}, "dna_protein_pair_100": {"accuracy": 0.51, "f1": 0.010101010101010102}, "dna_protein_pair_full": {"accuracy": 0.505, "f1": 0.1}, "dna_protein_pair_rand": {"accuracy": 0.501875, "f1": 0.06783625730994151}, "dna_protein_pair_rand_100": {"accuracy": 0.489375, "f1": 0.01684717208182912}, "dna_protein_pair_rand_full": {"accuracy": 0.525, "f1": 0.20168067226890757}}
|
| 57 |
+
{"seed": 60, "en": {"accuracy": 0.8575, "f1": 0.8534704370179949}, "fr": {"accuracy": 0.7545, "f1": 0.7732101616628175}, "de": {"accuracy": 0.7335, "f1": 0.7487034417727487}, "zh": {"accuracy": 0.683, "f1": 0.7067530064754857}, "dna_sim_pair_simple_150bp": {"accuracy": 0.8894444444444445, "f1": 0.8941489361702127}, "dna_sim_pair_150bp": {"accuracy": 0.71375, "f1": 0.6800782341436156}, "dna_sim_pair_50bp": {"accuracy": 0.593, "f1": 0.7147862648913805}, "protein_sim_pair_150bp": {"accuracy": 0.9288888888888889, "f1": 0.9298245614035088}, "protein_sim_pair_450bp": {"accuracy": 0.9444444444444444, "f1": 0.943630214205186}, "dna_protein_pair": {"accuracy": 0.5275, "f1": 0.25882352941176473}, "dna_protein_pair_100": {"accuracy": 0.4875, "f1": 0.019138755980861243}, "dna_protein_pair_full": {"accuracy": 0.495, "f1": 0.019417475728155338}, "dna_protein_pair_rand": {"accuracy": 0.5175, "f1": 0.22954091816367264}, "dna_protein_pair_rand_100": {"accuracy": 0.490625, "f1": 0.06857142857142857}, "dna_protein_pair_rand_full": {"accuracy": 0.519375, "f1": 0.04234122042341221}}
|
| 58 |
+
{"seed": 63, "en": {"accuracy": 0.8895, "f1": 0.8845953002610966}, "fr": {"accuracy": 0.813, "f1": 0.8137450199203188}, "de": {"accuracy": 0.767, "f1": 0.77179236043095}, "zh": {"accuracy": 0.717, "f1": 0.7088477366255144}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9436111111111111, "f1": 0.9424113475177305}, "dna_sim_pair_150bp": {"accuracy": 0.751, "f1": 0.6710700132100397}, "dna_sim_pair_50bp": {"accuracy": 0.76, "f1": 0.7266514806378133}, "protein_sim_pair_150bp": {"accuracy": 0.9855555555555555, "f1": 0.9859002169197397}, "protein_sim_pair_450bp": {"accuracy": 0.9572222222222222, "f1": 0.9544108940201302}, "dna_protein_pair": {"accuracy": 0.515, "f1": 0.0}, "dna_protein_pair_100": {"accuracy": 0.495, "f1": 0.0}, "dna_protein_pair_full": {"accuracy": 0.45, "f1": 0.043478260869565216}, "dna_protein_pair_rand": {"accuracy": 0.494375, "f1": 0.01701093560145808}, "dna_protein_pair_rand_100": {"accuracy": 0.503125, "f1": 0.02692778457772338}, "dna_protein_pair_rand_full": {"accuracy": 0.50625, "f1": 0.1595744680851064}}
|
| 59 |
+
{"seed": 64, "en": {"accuracy": 0.884, "f1": 0.8755364806866953}, "fr": {"accuracy": 0.7845, "f1": 0.7904715605250364}, "de": {"accuracy": 0.7595, "f1": 0.76340383669454}, "zh": {"accuracy": 0.693, "f1": 0.7111947318908749}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9211111111111111, "f1": 0.915826911677534}, "dna_sim_pair_150bp": {"accuracy": 0.7215, "f1": 0.6210884353741497}, "dna_sim_pair_50bp": {"accuracy": 0.683, "f1": 0.7174688057040999}, "protein_sim_pair_150bp": {"accuracy": 0.9, "f1": 0.9024918743228603}, "protein_sim_pair_450bp": {"accuracy": 0.9038888888888889, "f1": 0.8982951205173427}, "dna_protein_pair": {"accuracy": 0.4575, "f1": 0.0}, "dna_protein_pair_100": {"accuracy": 0.495, "f1": 0.00980392156862745}, "dna_protein_pair_full": {"accuracy": 0.505, "f1": 0.208}, "dna_protein_pair_rand": {"accuracy": 0.486875, "f1": 0.03976608187134503}, "dna_protein_pair_rand_100": {"accuracy": 0.506875, "f1": 0.027127003699136867}, "dna_protein_pair_rand_full": {"accuracy": 0.566875, "f1": 0.34560906515580736}}
|
| 60 |
+
{"seed": 65, "en": {"accuracy": 0.875, "f1": 0.8697916666666666}, "fr": {"accuracy": 0.7785, "f1": 0.7850557981562348}, "de": {"accuracy": 0.762, "f1": 0.7650542941757157}, "zh": {"accuracy": 0.7245, "f1": 0.6990715456034954}, "dna_sim_pair_simple_150bp": {"accuracy": 0.95, "f1": 0.9495515695067265}, "dna_sim_pair_150bp": {"accuracy": 0.7825, "f1": 0.7284644194756554}, "dna_sim_pair_50bp": {"accuracy": 0.642, "f1": 0.6809269162210339}, "protein_sim_pair_150bp": {"accuracy": 0.9188888888888889, "f1": 0.9248197734294542}, "protein_sim_pair_450bp": {"accuracy": 0.9422222222222222, "f1": 0.9405714285714286}, "dna_protein_pair": {"accuracy": 0.47, "f1": 0.04504504504504504}, "dna_protein_pair_100": {"accuracy": 0.525, "f1": 0.010416666666666666}, "dna_protein_pair_full": {"accuracy": 0.4425, "f1": 0.2440677966101695}, "dna_protein_pair_rand": {"accuracy": 0.521875, "f1": 0.17118093174431204}, "dna_protein_pair_rand_100": {"accuracy": 0.51625, "f1": 0.046798029556650245}, "dna_protein_pair_rand_full": {"accuracy": 0.491875, "f1": 0.25753424657534246}}
|
| 61 |
+
{"seed": 66, "en": {"accuracy": 0.875, "f1": 0.8700623700623701}, "fr": {"accuracy": 0.7775, "f1": 0.789598108747045}, "de": {"accuracy": 0.762, "f1": 0.772683858643744}, "zh": {"accuracy": 0.69, "f1": 0.7024952015355086}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9405555555555556, "f1": 0.9398200224971879}, "dna_sim_pair_150bp": {"accuracy": 0.744, "f1": 0.6895087932080048}, "dna_sim_pair_50bp": {"accuracy": 0.717, "f1": 0.7647547797173733}, "protein_sim_pair_150bp": {"accuracy": 0.96, "f1": 0.9600443951165372}, "protein_sim_pair_450bp": {"accuracy": 0.8788888888888889, "f1": 0.8634085213032582}, "dna_protein_pair": {"accuracy": 0.49, "f1": 0.10526315789473684}, "dna_protein_pair_100": {"accuracy": 0.4875, "f1": 0.08071748878923767}, "dna_protein_pair_full": {"accuracy": 0.5, "f1": 0.33774834437086093}, "dna_protein_pair_rand": {"accuracy": 0.5025, "f1": 0.14038876889848811}, "dna_protein_pair_rand_100": {"accuracy": 0.47375, "f1": 0.09267241379310345}, "dna_protein_pair_rand_full": {"accuracy": 0.47625, "f1": 0.2049335863377609}}
|
| 62 |
+
{"seed": 67, "en": {"accuracy": 0.8685, "f1": 0.8616517622304051}, "fr": {"accuracy": 0.7935, "f1": 0.7908860759493671}, "de": {"accuracy": 0.753, "f1": 0.753}, "zh": {"accuracy": 0.695, "f1": 0.6894093686354379}, "dna_sim_pair_simple_150bp": {"accuracy": 0.8944444444444445, "f1": 0.90041928721174}, "dna_sim_pair_150bp": {"accuracy": 0.80025, "f1": 0.7966403665054721}, "dna_sim_pair_50bp": {"accuracy": 0.493, "f1": 0.6558044806517311}, "protein_sim_pair_150bp": {"accuracy": 0.9533333333333334, "f1": 0.9538461538461539}, "protein_sim_pair_450bp": {"accuracy": 0.955, "f1": 0.9527696793002915}, "dna_protein_pair": {"accuracy": 0.495, "f1": 0.10619469026548672}, "dna_protein_pair_100": {"accuracy": 0.4775, "f1": 0.05429864253393665}, "dna_protein_pair_full": {"accuracy": 0.4625, "f1": 0.24028268551236748}, "dna_protein_pair_rand": {"accuracy": 0.5275, "f1": 0.17647058823529413}, "dna_protein_pair_rand_100": {"accuracy": 0.525, "f1": 0.11214953271028037}, "dna_protein_pair_rand_full": {"accuracy": 0.61125, "f1": 0.45818815331010454}}
|
| 63 |
+
{"seed": 68, "en": {"accuracy": 0.9045, "f1": 0.8957992362247681}, "fr": {"accuracy": 0.8145, "f1": 0.8070722828913156}, "de": {"accuracy": 0.7795, "f1": 0.7556786703601108}, "zh": {"accuracy": 0.729, "f1": 0.7057546145494028}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9486111111111111, "f1": 0.9454116258483328}, "dna_sim_pair_150bp": {"accuracy": 0.76025, "f1": 0.6869082598759386}, "dna_sim_pair_50bp": {"accuracy": 0.7865, "f1": 0.7423053711526856}, "protein_sim_pair_150bp": {"accuracy": 0.9677777777777777, "f1": 0.9662790697674418}, "protein_sim_pair_450bp": {"accuracy": 0.9522222222222222, "f1": 0.9514124293785311}, "dna_protein_pair": {"accuracy": 0.4975, "f1": 0.009852216748768473}, "dna_protein_pair_100": {"accuracy": 0.515, "f1": 0.0}, "dna_protein_pair_full": {"accuracy": 0.51, "f1": 0.0}, "dna_protein_pair_rand": {"accuracy": 0.496875, "f1": 0.0024783147459727386}, "dna_protein_pair_rand_100": {"accuracy": 0.510625, "f1": 0.005082592121982211}, "dna_protein_pair_rand_full": {"accuracy": 0.513125, "f1": 0.002560819462227913}}
|
| 64 |
+
{"seed": 69, "en": {"accuracy": 0.883, "f1": 0.8728260869565218}, "fr": {"accuracy": 0.783, "f1": 0.783433133732535}, "de": {"accuracy": 0.763, "f1": 0.7569230769230769}, "zh": {"accuracy": 0.7285, "f1": 0.6998341625207297}, "dna_sim_pair_simple_150bp": {"accuracy": 0.8816666666666667, "f1": 0.8869426751592356}, "dna_sim_pair_150bp": {"accuracy": 0.782, "f1": 0.7826520438683948}, "dna_sim_pair_50bp": {"accuracy": 0.4945, "f1": 0.6039952996474736}, "protein_sim_pair_150bp": {"accuracy": 0.8827777777777778, "f1": 0.8888888888888888}, "protein_sim_pair_450bp": {"accuracy": 0.8127777777777778, "f1": 0.8303975842979366}, "dna_protein_pair": {"accuracy": 0.48, "f1": 0.028037383177570093}, "dna_protein_pair_100": {"accuracy": 0.5325, "f1": 0.06030150753768844}, "dna_protein_pair_full": {"accuracy": 0.5075, "f1": 0.20242914979757085}, "dna_protein_pair_rand": {"accuracy": 0.5125, "f1": 0.0625}, "dna_protein_pair_rand_100": {"accuracy": 0.5725, "f1": 0.24168514412416853}, "dna_protein_pair_rand_full": {"accuracy": 0.57, "f1": 0.3497164461247637}}
|
| 65 |
+
{"seed": 70, "en": {"accuracy": 0.8975, "f1": 0.8877941981390257}, "fr": {"accuracy": 0.8075, "f1": 0.8022598870056498}, "de": {"accuracy": 0.7795, "f1": 0.7534935718278368}, "zh": {"accuracy": 0.7145, "f1": 0.6790331646992692}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9011111111111111, "f1": 0.8921865536038764}, "dna_sim_pair_150bp": {"accuracy": 0.59625, "f1": 0.3290402991275447}, "dna_sim_pair_50bp": {"accuracy": 0.5805, "f1": 0.44620462046204623}, "protein_sim_pair_150bp": {"accuracy": 0.8294444444444444, "f1": 0.8070395977372722}, "protein_sim_pair_450bp": {"accuracy": 0.73, "f1": 0.6261538461538462}, "dna_protein_pair": {"accuracy": 0.515, "f1": 0.01020408163265306}, "dna_protein_pair_100": {"accuracy": 0.445, "f1": 0.0}, "dna_protein_pair_full": {"accuracy": 0.5125, "f1": 0.01015228426395939}, "dna_protein_pair_rand": {"accuracy": 0.491875, "f1": 0.00245398773006135}, "dna_protein_pair_rand_100": {"accuracy": 0.493125, "f1": 0.0}, "dna_protein_pair_rand_full": {"accuracy": 0.528125, "f1": 0.03821656050955414}}
|
| 66 |
+
{"seed": 73, "en": {"accuracy": 0.8935, "f1": 0.8875989445910291}, "fr": {"accuracy": 0.7875, "f1": 0.7933884297520661}, "de": {"accuracy": 0.766, "f1": 0.7636363636363637}, "zh": {"accuracy": 0.706, "f1": 0.7074626865671642}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9566666666666667, "f1": 0.9558073654390935}, "dna_sim_pair_150bp": {"accuracy": 0.8345, "f1": 0.8091118800461361}, "dna_sim_pair_50bp": {"accuracy": 0.6435, "f1": 0.7267152165580683}, "protein_sim_pair_150bp": {"accuracy": 0.9361111111111111, "f1": 0.9386666666666666}, "protein_sim_pair_450bp": {"accuracy": 0.915, "f1": 0.9094138543516874}, "dna_protein_pair": {"accuracy": 0.5075, "f1": 0.029556650246305417}, "dna_protein_pair_100": {"accuracy": 0.4825, "f1": 0.0}, "dna_protein_pair_full": {"accuracy": 0.505, "f1": 0.05714285714285714}, "dna_protein_pair_rand": {"accuracy": 0.511875, "f1": 0.048721071863580996}, "dna_protein_pair_rand_100": {"accuracy": 0.496875, "f1": 0.0}, "dna_protein_pair_rand_full": {"accuracy": 0.561875, "f1": 0.28976697061803447}}
|
| 67 |
+
{"seed": 75, "en": {"accuracy": 0.8905, "f1": 0.8797364085667215}, "fr": {"accuracy": 0.82, "f1": 0.8119122257053292}, "de": {"accuracy": 0.7825, "f1": 0.768}, "zh": {"accuracy": 0.7245, "f1": 0.6802089378990134}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9475, "f1": 0.945010183299389}, "dna_sim_pair_150bp": {"accuracy": 0.72675, "f1": 0.6232333678042055}, "dna_sim_pair_50bp": {"accuracy": 0.7485, "f1": 0.6838466373350094}, "protein_sim_pair_150bp": {"accuracy": 0.9327777777777778, "f1": 0.9301788805539527}, "protein_sim_pair_450bp": {"accuracy": 0.775, "f1": 0.7024246877296105}, "dna_protein_pair": {"accuracy": 0.515, "f1": 0.01020408163265306}, "dna_protein_pair_100": {"accuracy": 0.4775, "f1": 0.009478672985781991}, "dna_protein_pair_full": {"accuracy": 0.5075, "f1": 0.010050251256281407}, "dna_protein_pair_rand": {"accuracy": 0.513125, "f1": 0.010165184243964422}, "dna_protein_pair_rand_100": {"accuracy": 0.503125, "f1": 0.002509410288582183}, "dna_protein_pair_rand_full": {"accuracy": 0.495625, "f1": 0.035842293906810034}}
|
| 68 |
+
{"seed": 76, "en": {"accuracy": 0.888, "f1": 0.8816067653276956}, "fr": {"accuracy": 0.799, "f1": 0.804284323271665}, "de": {"accuracy": 0.775, "f1": 0.7722672064777328}, "zh": {"accuracy": 0.7135, "f1": 0.6995280545359203}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9447222222222222, "f1": 0.9440854172520371}, "dna_sim_pair_150bp": {"accuracy": 0.82475, "f1": 0.7976911976911977}, "dna_sim_pair_50bp": {"accuracy": 0.7565, "f1": 0.7243916242218449}, "protein_sim_pair_150bp": {"accuracy": 0.9566666666666667, "f1": 0.9568106312292359}, "protein_sim_pair_450bp": {"accuracy": 0.915, "f1": 0.9115095430884904}, "dna_protein_pair": {"accuracy": 0.51, "f1": 0.06666666666666667}, "dna_protein_pair_100": {"accuracy": 0.53, "f1": 0.09615384615384616}, "dna_protein_pair_full": {"accuracy": 0.47, "f1": 0.1092436974789916}, "dna_protein_pair_rand": {"accuracy": 0.49625, "f1": 0.09029345372460497}, "dna_protein_pair_rand_100": {"accuracy": 0.53625, "f1": 0.17738359201773837}, "dna_protein_pair_rand_full": {"accuracy": 0.52625, "f1": 0.0889423076923077}}
|
| 69 |
+
{"seed": 77, "en": {"accuracy": 0.8645, "f1": 0.8559276980329612}, "fr": {"accuracy": 0.7805, "f1": 0.782350024789291}, "de": {"accuracy": 0.756, "f1": 0.7489711934156379}, "zh": {"accuracy": 0.699, "f1": 0.6968781470292045}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9441666666666667, "f1": 0.9456021650879567}, "dna_sim_pair_150bp": {"accuracy": 0.824, "f1": 0.8041179744017808}, "dna_sim_pair_50bp": {"accuracy": 0.68, "f1": 0.7464342313787639}, "protein_sim_pair_150bp": {"accuracy": 0.935, "f1": 0.9366540335679481}, "protein_sim_pair_450bp": {"accuracy": 0.8944444444444445, "f1": 0.8879716981132075}, "dna_protein_pair": {"accuracy": 0.45, "f1": 0.20863309352517986}, "dna_protein_pair_100": {"accuracy": 0.4575, "f1": 0.08438818565400844}, "dna_protein_pair_full": {"accuracy": 0.45, "f1": 0.20863309352517986}, "dna_protein_pair_rand": {"accuracy": 0.56, "f1": 0.3669064748201439}, "dna_protein_pair_rand_100": {"accuracy": 0.556875, "f1": 0.19887005649717515}, "dna_protein_pair_rand_full": {"accuracy": 0.503125, "f1": 0.1829393627954779}}
|
| 70 |
+
{"seed": 78, "en": {"accuracy": 0.8585, "f1": 0.8571428571428571}, "fr": {"accuracy": 0.768, "f1": 0.7869605142332415}, "de": {"accuracy": 0.75, "f1": 0.7621313035204567}, "zh": {"accuracy": 0.6755, "f1": 0.7111704494882065}, "dna_sim_pair_simple_150bp": {"accuracy": 0.88, "f1": 0.8911290322580645}, "dna_sim_pair_150bp": {"accuracy": 0.7975, "f1": 0.8030155642023347}, "dna_sim_pair_50bp": {"accuracy": 0.522, "f1": 0.6802675585284281}, "protein_sim_pair_150bp": {"accuracy": 0.78, "f1": 0.8219424460431655}, "protein_sim_pair_450bp": {"accuracy": 0.9477777777777778, "f1": 0.9481236203090507}, "dna_protein_pair": {"accuracy": 0.5075, "f1": 0.2939068100358423}, "dna_protein_pair_100": {"accuracy": 0.4875, "f1": 0.00966183574879227}, "dna_protein_pair_full": {"accuracy": 0.4175, "f1": 0.3086053412462908}, "dna_protein_pair_rand": {"accuracy": 0.559375, "f1": 0.36083408884859475}, "dna_protein_pair_rand_100": {"accuracy": 0.485, "f1": 0.03286384976525822}, "dna_protein_pair_rand_full": {"accuracy": 0.591875, "f1": 0.47039740470397406}}
|
| 71 |
+
{"seed": 79, "en": {"accuracy": 0.872, "f1": 0.871356783919598}, "fr": {"accuracy": 0.7545, "f1": 0.7789284106258442}, "de": {"accuracy": 0.7325, "f1": 0.7558192606115929}, "zh": {"accuracy": 0.672, "f1": 0.7130358705161854}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9, "f1": 0.9051133368476542}, "dna_sim_pair_150bp": {"accuracy": 0.80025, "f1": 0.793058793058793}, "dna_sim_pair_50bp": {"accuracy": 0.5835, "f1": 0.6958744067177802}, "protein_sim_pair_150bp": {"accuracy": 0.9011111111111111, "f1": 0.9078674948240165}, "protein_sim_pair_450bp": {"accuracy": 0.9511111111111111, "f1": 0.9506172839506173}, "dna_protein_pair": {"accuracy": 0.4875, "f1": 0.14225941422594143}, "dna_protein_pair_100": {"accuracy": 0.51, "f1": 0.0297029702970297}, "dna_protein_pair_full": {"accuracy": 0.4975, "f1": 0.10666666666666667}, "dna_protein_pair_rand": {"accuracy": 0.500625, "f1": 0.18386108273748722}, "dna_protein_pair_rand_100": {"accuracy": 0.483125, "f1": 0.04171494785631518}, "dna_protein_pair_rand_full": {"accuracy": 0.505, "f1": 0.10609480812641084}}
|
| 72 |
+
{"seed": 80, "en": {"accuracy": 0.869, "f1": 0.8649484536082475}, "fr": {"accuracy": 0.769, "f1": 0.7830985915492957}, "de": {"accuracy": 0.739, "f1": 0.7528409090909091}, "zh": {"accuracy": 0.706, "f1": 0.6947040498442367}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9655555555555555, "f1": 0.9663226507332972}, "dna_sim_pair_150bp": {"accuracy": 0.83775, "f1": 0.8129143845488613}, "dna_sim_pair_50bp": {"accuracy": 0.862, "f1": 0.8614457831325302}, "protein_sim_pair_150bp": {"accuracy": 0.9805555555555555, "f1": 0.9801474758933636}, "protein_sim_pair_450bp": {"accuracy": 0.9594444444444444, "f1": 0.9598238855255916}, "dna_protein_pair": {"accuracy": 0.435, "f1": 0.12403100775193798}, "dna_protein_pair_100": {"accuracy": 0.42, "f1": 0.46543778801843316}, "dna_protein_pair_full": {"accuracy": 0.5175, "f1": 0.34576271186440677}, "dna_protein_pair_rand": {"accuracy": 0.520625, "f1": 0.1562156215621562}, "dna_protein_pair_rand_100": {"accuracy": 0.431875, "f1": 0.328159645232816}, "dna_protein_pair_rand_full": {"accuracy": 0.439375, "f1": 0.1416267942583732}}
|
| 73 |
+
{"seed": 81, "en": {"accuracy": 0.9075, "f1": 0.8987411056376574}, "fr": {"accuracy": 0.802, "f1": 0.7958762886597938}, "de": {"accuracy": 0.792, "f1": 0.7716794731064764}, "zh": {"accuracy": 0.72, "f1": 0.6755504055619931}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9544444444444444, "f1": 0.9519061583577713}, "dna_sim_pair_150bp": {"accuracy": 0.77575, "f1": 0.7079127320091175}, "dna_sim_pair_50bp": {"accuracy": 0.794, "f1": 0.7506053268765133}, "protein_sim_pair_150bp": {"accuracy": 0.9627777777777777, "f1": 0.9613832853025936}, "protein_sim_pair_450bp": {"accuracy": 0.9388888888888889, "f1": 0.9375}, "dna_protein_pair": {"accuracy": 0.5175, "f1": 0.02030456852791878}, "dna_protein_pair_100": {"accuracy": 0.4775, "f1": 0.009478672985781991}, "dna_protein_pair_full": {"accuracy": 0.4925, "f1": 0.0}, "dna_protein_pair_rand": {"accuracy": 0.4825, "f1": 0.028169014084507043}, "dna_protein_pair_rand_100": {"accuracy": 0.50625, "f1": 0.0125}, "dna_protein_pair_rand_full": {"accuracy": 0.485625, "f1": 0.02140309155766944}}
|
| 74 |
+
{"seed": 82, "en": {"accuracy": 0.891, "f1": 0.8778026905829597}, "fr": {"accuracy": 0.8055, "f1": 0.7914209115281501}, "de": {"accuracy": 0.7645, "f1": 0.7543035993740219}, "zh": {"accuracy": 0.7195, "f1": 0.6835871404399323}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9416666666666667, "f1": 0.9387397899649942}, "dna_sim_pair_150bp": {"accuracy": 0.69175, "f1": 0.5550342836521112}, "dna_sim_pair_50bp": {"accuracy": 0.7675, "f1": 0.7576862949452841}, "protein_sim_pair_150bp": {"accuracy": 0.9355555555555556, "f1": 0.9292682926829269}, "protein_sim_pair_450bp": {"accuracy": 0.7711111111111111, "f1": 0.7086280056577087}, "dna_protein_pair": {"accuracy": 0.4975, "f1": 0.0}, "dna_protein_pair_100": {"accuracy": 0.5075, "f1": 0.0}, "dna_protein_pair_full": {"accuracy": 0.4775, "f1": 0.0}, "dna_protein_pair_rand": {"accuracy": 0.496875, "f1": 0.0024783147459727386}, "dna_protein_pair_rand_100": {"accuracy": 0.49375, "f1": 0.004914004914004914}, "dna_protein_pair_rand_full": {"accuracy": 0.503125, "f1": 0.07450523864959255}}
|
| 75 |
+
{"seed": 83, "en": {"accuracy": 0.876, "f1": 0.8693361433087461}, "fr": {"accuracy": 0.773, "f1": 0.7866541353383458}, "de": {"accuracy": 0.747, "f1": 0.7529296875}, "zh": {"accuracy": 0.7115, "f1": 0.7208514755684567}, "dna_sim_pair_simple_150bp": {"accuracy": 0.8608333333333333, "f1": 0.8733249051833123}, "dna_sim_pair_150bp": {"accuracy": 0.74925, "f1": 0.7611336032388664}, "dna_sim_pair_50bp": {"accuracy": 0.4935, "f1": 0.6590373611578593}, "protein_sim_pair_150bp": {"accuracy": 0.9144444444444444, "f1": 0.9165763813651138}, "protein_sim_pair_450bp": {"accuracy": 0.9205555555555556, "f1": 0.9230769230769231}, "dna_protein_pair": {"accuracy": 0.49, "f1": 0.2916666666666667}, "dna_protein_pair_100": {"accuracy": 0.4475, "f1": 0.19047619047619047}, "dna_protein_pair_full": {"accuracy": 0.3725, "f1": 0.4308390022675737}, "dna_protein_pair_rand": {"accuracy": 0.559375, "f1": 0.3710972346119536}, "dna_protein_pair_rand_100": {"accuracy": 0.52875, "f1": 0.22745901639344263}, "dna_protein_pair_rand_full": {"accuracy": 0.52875, "f1": 0.49395973154362416}}
|
| 76 |
+
{"seed": 84, "en": {"accuracy": 0.889, "f1": 0.8776185226019846}, "fr": {"accuracy": 0.7965, "f1": 0.7845420857596612}, "de": {"accuracy": 0.775, "f1": 0.7511061946902655}, "zh": {"accuracy": 0.7285, "f1": 0.7146610614818707}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9291666666666667, "f1": 0.9300027449903925}, "dna_sim_pair_150bp": {"accuracy": 0.79625, "f1": 0.7724099413571628}, "dna_sim_pair_50bp": {"accuracy": 0.5215, "f1": 0.6414387411015362}, "protein_sim_pair_150bp": {"accuracy": 0.9294444444444444, "f1": 0.9333333333333333}, "protein_sim_pair_450bp": {"accuracy": 0.9694444444444444, "f1": 0.9688737973967176}, "dna_protein_pair": {"accuracy": 0.54, "f1": 0.031578947368421054}, "dna_protein_pair_100": {"accuracy": 0.475, "f1": 0.0}, "dna_protein_pair_full": {"accuracy": 0.495, "f1": 0.09009009009009009}, "dna_protein_pair_rand": {"accuracy": 0.494375, "f1": 0.03575685339690107}, "dna_protein_pair_rand_100": {"accuracy": 0.476875, "f1": 0.009467455621301775}, "dna_protein_pair_rand_full": {"accuracy": 0.5375, "f1": 0.1759465478841871}}
|
| 77 |
+
{"seed": 85, "en": {"accuracy": 0.878, "f1": 0.8675352877307275}, "fr": {"accuracy": 0.7915, "f1": 0.7822454308093995}, "de": {"accuracy": 0.763, "f1": 0.7489406779661016}, "zh": {"accuracy": 0.722, "f1": 0.7154554759467758}, "dna_sim_pair_simple_150bp": {"accuracy": 0.7969444444444445, "f1": 0.8218376797465269}, "dna_sim_pair_150bp": {"accuracy": 0.71975, "f1": 0.7503896682253396}, "dna_sim_pair_50bp": {"accuracy": 0.507, "f1": 0.6574009728978457}, "protein_sim_pair_150bp": {"accuracy": 0.8277777777777777, "f1": 0.8451548451548452}, "protein_sim_pair_450bp": {"accuracy": 0.9555555555555556, "f1": 0.9572649572649573}, "dna_protein_pair": {"accuracy": 0.515, "f1": 0.049019607843137254}, "dna_protein_pair_100": {"accuracy": 0.51, "f1": 0.0}, "dna_protein_pair_full": {"accuracy": 0.5375, "f1": 0.021164021164021163}, "dna_protein_pair_rand": {"accuracy": 0.525, "f1": 0.15178571428571427}, "dna_protein_pair_rand_100": {"accuracy": 0.515, "f1": 0.022670025188916875}, "dna_protein_pair_rand_full": {"accuracy": 0.510625, "f1": 0.024906600249066}}
|
| 78 |
+
{"seed": 86, "en": {"accuracy": 0.887, "f1": 0.8774403470715835}, "fr": {"accuracy": 0.802, "f1": 0.7995951417004049}, "de": {"accuracy": 0.7835, "f1": 0.7650569723277265}, "zh": {"accuracy": 0.7165, "f1": 0.6879471656576774}, "dna_sim_pair_simple_150bp": {"accuracy": 0.9388888888888889, "f1": 0.9373932840068299}, "dna_sim_pair_150bp": {"accuracy": 0.72075, "f1": 0.6309877766765775}, "dna_sim_pair_50bp": {"accuracy": 0.739, "f1": 0.7661290322580645}, "protein_sim_pair_150bp": {"accuracy": 0.9722222222222222, "f1": 0.972972972972973}, "protein_sim_pair_450bp": {"accuracy": 0.9411111111111111, "f1": 0.9368295589988082}, "dna_protein_pair": {"accuracy": 0.49, "f1": 0.009708737864077669}, "dna_protein_pair_100": {"accuracy": 0.4375, "f1": 0.0}, "dna_protein_pair_full": {"accuracy": 0.48, "f1": 0.018867924528301886}, "dna_protein_pair_rand": {"accuracy": 0.493125, "f1": 0.0073439412484700125}, "dna_protein_pair_rand_100": {"accuracy": 0.505625, "f1": 0.0025220680958385876}, "dna_protein_pair_rand_full": {"accuracy": 0.4875, "f1": 0.00966183574879227}}
|
finetune/gpt2_gene_multiv2_ft_en2.jsonl
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"seed": 9377, "dna_protein_pair_full": {"accuracy": 0.49, "f1": 0.24444444444444444}, "dna_protein_pair_rand_full": {"accuracy": 0.633, "f1": 0.5068861269734632}}
|
| 2 |
+
{"seed": 4277, "dna_protein_pair_full": {"accuracy": 0.3975, "f1": 0.5111561866125761}, "dna_protein_pair_rand_full": {"accuracy": 0.611375, "f1": 0.7062269677785127}}
|
| 3 |
+
{"seed": 4248, "dna_protein_pair_full": {"accuracy": 0.47, "f1": 0.12396694214876033}, "dna_protein_pair_rand_full": {"accuracy": 0.573125, "f1": 0.32947182407225606}}
|
| 4 |
+
{"seed": 1491, "dna_protein_pair_full": {"accuracy": 0.55, "f1": 0.25}, "dna_protein_pair_rand_full": {"accuracy": 0.551625, "f1": 0.3258785942492013}}
|
| 5 |
+
{"seed": 7840, "dna_protein_pair_full": {"accuracy": 0.5, "f1": 0.02912621359223301}, "dna_protein_pair_rand_full": {"accuracy": 0.504625, "f1": 0.017356806347632037}}
|
| 6 |
+
{"seed": 9385, "dna_protein_pair_full": {"accuracy": 0.545, "f1": 0.19469026548672566}, "dna_protein_pair_rand_full": {"accuracy": 0.644, "f1": 0.5427103403982017}}
|
| 7 |
+
{"seed": 4471, "dna_protein_pair_full": {"accuracy": 0.5425, "f1": 0.24066390041493776}, "dna_protein_pair_rand_full": {"accuracy": 0.69375, "f1": 0.6466685895586963}}
|
| 8 |
+
{"seed": 3747, "dna_protein_pair_full": {"accuracy": 0.4875, "f1": 0.019138755980861243}, "dna_protein_pair_rand_full": {"accuracy": 0.535125, "f1": 0.1733718604134252}}
|
| 9 |
+
{"seed": 7580, "dna_protein_pair_full": {"accuracy": 0.505, "f1": 0.05714285714285714}, "dna_protein_pair_rand_full": {"accuracy": 0.5785, "f1": 0.30128470783257355}}
|
| 10 |
+
{"seed": 4841, "dna_protein_pair_full": {"accuracy": 0.51, "f1": 0.125}, "dna_protein_pair_rand_full": {"accuracy": 0.504375, "f1": 0.0755420844019585}}
|
| 11 |
+
{"seed": 3364, "dna_protein_pair_full": {"accuracy": 0.4875, "f1": 0.03755868544600939}, "dna_protein_pair_rand_full": {"accuracy": 0.49925, "f1": 0.01765571358509073}}
|
| 12 |
+
{"seed": 757, "dna_protein_pair_full": {"accuracy": 0.4225, "f1": 0.5792349726775956}, "dna_protein_pair_rand_full": {"accuracy": 0.53425, "f1": 0.6640822214208438}}
|
| 13 |
+
{"seed": 3390, "dna_protein_pair_full": {"accuracy": 0.535, "f1": 0.16964285714285715}, "dna_protein_pair_rand_full": {"accuracy": 0.557875, "f1": 0.27801592161665645}}
|
| 14 |
+
{"seed": 8608, "dna_protein_pair_full": {"accuracy": 0.5075, "f1": 0.0}, "dna_protein_pair_rand_full": {"accuracy": 0.508875, "f1": 0.05756776205325018}}
|
| 15 |
+
{"seed": 3639, "dna_protein_pair_full": {"accuracy": 0.5, "f1": 0.3630573248407643}, "dna_protein_pair_rand_full": {"accuracy": 0.69125, "f1": 0.6803830227743272}}
|
| 16 |
+
{"seed": 9238, "dna_protein_pair_full": {"accuracy": 0.4475, "f1": 0.12648221343873517}, "dna_protein_pair_rand_full": {"accuracy": 0.507375, "f1": 0.09630818619582665}}
|
| 17 |
+
{"seed": 2142, "dna_protein_pair_full": {"accuracy": 0.375, "f1": 0.34210526315789475}, "dna_protein_pair_rand_full": {"accuracy": 0.473, "f1": 0.2817717206132879}}
|
| 18 |
+
{"seed": 1568, "dna_protein_pair_full": {"accuracy": 0.4525, "f1": 0.14785992217898833}, "dna_protein_pair_rand_full": {"accuracy": 0.571625, "f1": 0.3382892450279977}}
|
| 19 |
+
{"seed": 7389, "dna_protein_pair_full": {"accuracy": 0.4675, "f1": 0.25263157894736843}, "dna_protein_pair_rand_full": {"accuracy": 0.4845, "f1": 0.28872024836150395}}
|
| 20 |
+
{"seed": 1632, "dna_protein_pair_full": {"accuracy": 0.455, "f1": 0.4293193717277487}, "dna_protein_pair_rand_full": {"accuracy": 0.748, "f1": 0.7583313354111724}}
|
| 21 |
+
{"seed": 2150, "dna_protein_pair_full": {"accuracy": 0.4225, "f1": 0.03347280334728033}, "dna_protein_pair_rand_full": {"accuracy": 0.504875, "f1": 0.046231639778473395}}
|
| 22 |
+
{"seed": 4923, "dna_protein_pair_full": {"accuracy": 0.4925, "f1": 0.22813688212927757}, "dna_protein_pair_rand_full": {"accuracy": 0.576125, "f1": 0.3910935536002873}}
|
| 23 |
+
{"seed": 7349, "dna_protein_pair_full": {"accuracy": 0.47, "f1": 0.04504504504504504}, "dna_protein_pair_rand_full": {"accuracy": 0.54125, "f1": 0.23060796645702306}}
|
| 24 |
+
{"seed": 8540, "dna_protein_pair_full": {"accuracy": 0.4425, "f1": 0.1890909090909091}, "dna_protein_pair_rand_full": {"accuracy": 0.591, "f1": 0.3854244928625094}}
|
| 25 |
+
{"seed": 6505, "dna_protein_pair_full": {"accuracy": 0.385, "f1": 0.22641509433962265}, "dna_protein_pair_rand_full": {"accuracy": 0.59225, "f1": 0.4062613760465963}}
|
| 26 |
+
{"seed": 7670, "dna_protein_pair_full": {"accuracy": 0.5025, "f1": 0.667779632721202}, "dna_protein_pair_rand_full": {"accuracy": 0.560375, "f1": 0.6925968009789354}}
|
| 27 |
+
{"seed": 1702, "dna_protein_pair_full": {"accuracy": 0.3325, "f1": 0.46060606060606063}, "dna_protein_pair_rand_full": {"accuracy": 0.530375, "f1": 0.5694969634467744}}
|
| 28 |
+
{"seed": 6106, "dna_protein_pair_full": {"accuracy": 0.3775, "f1": 0.5202312138728323}, "dna_protein_pair_rand_full": {"accuracy": 0.717375, "f1": 0.7624750499001997}}
|
| 29 |
+
{"seed": 8233, "dna_protein_pair_full": {"accuracy": 0.445, "f1": 0.26}, "dna_protein_pair_rand_full": {"accuracy": 0.54075, "f1": 0.2577777777777778}}
|
| 30 |
+
{"seed": 2369, "dna_protein_pair_full": {"accuracy": 0.4975, "f1": 0.34951456310679613}, "dna_protein_pair_rand_full": {"accuracy": 0.68375, "f1": 0.6386746643816053}}
|
| 31 |
+
{"seed": 8519, "dna_protein_pair_full": {"accuracy": 0.53, "f1": 0.14545454545454545}, "dna_protein_pair_rand_full": {"accuracy": 0.549125, "f1": 0.2542898490800083}}
|
| 32 |
+
{"seed": 691, "dna_protein_pair_full": {"accuracy": 0.495, "f1": 0.06481481481481481}, "dna_protein_pair_rand_full": {"accuracy": 0.527125, "f1": 0.14120317820658343}}
|
| 33 |
+
{"seed": 1606, "dna_protein_pair_full": {"accuracy": 0.455, "f1": 0.09917355371900827}, "dna_protein_pair_rand_full": {"accuracy": 0.519375, "f1": 0.12394622920938711}}
|
| 34 |
+
{"seed": 1324, "dna_protein_pair_full": {"accuracy": 0.4925, "f1": 0.037914691943127965}, "dna_protein_pair_rand_full": {"accuracy": 0.52675, "f1": 0.10156620787850024}}
|
| 35 |
+
{"seed": 104, "dna_protein_pair_full": {"accuracy": 0.45, "f1": 0.3413173652694611}, "dna_protein_pair_rand_full": {"accuracy": 0.65025, "f1": 0.6153423150948584}}
|
| 36 |
+
{"seed": 8031, "dna_protein_pair_full": {"accuracy": 0.5, "f1": 0.5073891625615764}, "dna_protein_pair_rand_full": {"accuracy": 0.703125, "f1": 0.7563852702841317}}
|
| 37 |
+
{"seed": 1318, "dna_protein_pair_full": {"accuracy": 0.4, "f1": 0.23076923076923078}, "dna_protein_pair_rand_full": {"accuracy": 0.5535, "f1": 0.3250188964474679}}
|
| 38 |
+
{"seed": 2368, "dna_protein_pair_full": {"accuracy": 0.4275, "f1": 0.5278350515463918}, "dna_protein_pair_rand_full": {"accuracy": 0.612375, "f1": 0.6792179580014482}}
|
| 39 |
+
{"seed": 9727, "dna_protein_pair_full": {"accuracy": 0.5325, "f1": 0.010582010582010581}, "dna_protein_pair_rand_full": {"accuracy": 0.556125, "f1": 0.2241642997596679}}
|
| 40 |
+
{"seed": 8341, "dna_protein_pair_full": {"accuracy": 0.5475, "f1": 0.15023474178403756}, "dna_protein_pair_rand_full": {"accuracy": 0.575375, "f1": 0.3405164045816346}}
|
| 41 |
+
{"seed": 8485, "dna_protein_pair_full": {"accuracy": 0.525, "f1": 0.08653846153846154}, "dna_protein_pair_rand_full": {"accuracy": 0.528375, "f1": 0.14425039691540031}}
|
| 42 |
+
{"seed": 4596, "dna_protein_pair_full": {"accuracy": 0.49, "f1": 0.04672897196261682}, "dna_protein_pair_rand_full": {"accuracy": 0.51775, "f1": 0.14266666666666666}}
|
| 43 |
+
{"seed": 6157, "dna_protein_pair_full": {"accuracy": 0.4075, "f1": 0.4261501210653753}, "dna_protein_pair_rand_full": {"accuracy": 0.615625, "f1": 0.5785939427161847}}
|
| 44 |
+
{"seed": 9356, "dna_protein_pair_full": {"accuracy": 0.43, "f1": 0.14285714285714285}, "dna_protein_pair_rand_full": {"accuracy": 0.516125, "f1": 0.10248087178298168}}
|
| 45 |
+
{"seed": 6565, "dna_protein_pair_full": {"accuracy": 0.5, "f1": 0.0}, "dna_protein_pair_rand_full": {"accuracy": 0.506625, "f1": 0.012509382036527395}}
|
| 46 |
+
{"seed": 4526, "dna_protein_pair_full": {"accuracy": 0.4425, "f1": 0.6039076376554174}, "dna_protein_pair_rand_full": {"accuracy": 0.539125, "f1": 0.674666901967705}}
|
| 47 |
+
{"seed": 3552, "dna_protein_pair_full": {"accuracy": 0.53, "f1": 0.45348837209302323}, "dna_protein_pair_rand_full": {"accuracy": 0.748875, "f1": 0.7654955060114392}}
|
| 48 |
+
{"seed": 2017, "dna_protein_pair_full": {"accuracy": 0.5625, "f1": 0.5179063360881543}, "dna_protein_pair_rand_full": {"accuracy": 0.73175, "f1": 0.7463356973995272}}
|
| 49 |
+
{"seed": 2615, "dna_protein_pair_full": {"accuracy": 0.5, "f1": 0.4708994708994709}, "dna_protein_pair_rand_full": {"accuracy": 0.729875, "f1": 0.763126164638825}}
|
| 50 |
+
{"seed": 5587, "dna_protein_pair_full": {"accuracy": 0.525, "f1": 0.30656934306569344}, "dna_protein_pair_rand_full": {"accuracy": 0.701, "f1": 0.6584808680753855}}
|
| 51 |
+
{"seed": 3478, "dna_protein_pair_full": {"accuracy": 0.445, "f1": 0.40641711229946526}, "dna_protein_pair_rand_full": {"accuracy": 0.646875, "f1": 0.6221746689848869}}
|
| 52 |
+
{"seed": 2621, "dna_protein_pair_full": {"accuracy": 0.415, "f1": 0.4090909090909091}, "dna_protein_pair_rand_full": {"accuracy": 0.76025, "f1": 0.7766130910784999}}
|
| 53 |
+
{"seed": 7761, "dna_protein_pair_full": {"accuracy": 0.5025, "f1": 0.43304843304843305}, "dna_protein_pair_rand_full": {"accuracy": 0.691375, "f1": 0.6828516377649325}}
|
| 54 |
+
{"seed": 208, "dna_protein_pair_full": {"accuracy": 0.455, "f1": 0.4263157894736842}, "dna_protein_pair_rand_full": {"accuracy": 0.73275, "f1": 0.7495900679315999}}
|
| 55 |
+
{"seed": 3926, "dna_protein_pair_full": {"accuracy": 0.375, "f1": 0.4212962962962963}, "dna_protein_pair_rand_full": {"accuracy": 0.497375, "f1": 0.45961564305872865}}
|
| 56 |
+
{"seed": 1128, "dna_protein_pair_full": {"accuracy": 0.4475, "f1": 0.23529411764705882}, "dna_protein_pair_rand_full": {"accuracy": 0.571875, "f1": 0.3975373790677221}}
|
| 57 |
+
{"seed": 6132, "dna_protein_pair_full": {"accuracy": 0.43, "f1": 0.5714285714285714}, "dna_protein_pair_rand_full": {"accuracy": 0.59675, "f1": 0.697430125679985}}
|
| 58 |
+
{"seed": 5647, "dna_protein_pair_full": {"accuracy": 0.53, "f1": 0.4303030303030303}, "dna_protein_pair_rand_full": {"accuracy": 0.702375, "f1": 0.7012172167147698}}
|
| 59 |
+
{"seed": 7083, "dna_protein_pair_full": {"accuracy": 0.385, "f1": 0.5232558139534884}, "dna_protein_pair_rand_full": {"accuracy": 0.517625, "f1": 0.6022059581486444}}
|
| 60 |
+
{"seed": 2020, "dna_protein_pair_full": {"accuracy": 0.4175, "f1": 0.1824561403508772}, "dna_protein_pair_rand_full": {"accuracy": 0.5575, "f1": 0.3442015561319007}}
|
| 61 |
+
{"seed": 5151, "dna_protein_pair_full": {"accuracy": 0.4225, "f1": 0.21160409556313994}, "dna_protein_pair_rand_full": {"accuracy": 0.615, "f1": 0.42920681986656783}}
|
| 62 |
+
{"seed": 2881, "dna_protein_pair_full": {"accuracy": 0.4975, "f1": 0.009852216748768473}, "dna_protein_pair_rand_full": {"accuracy": 0.50275, "f1": 0.02975609756097561}}
|
| 63 |
+
{"seed": 6216, "dna_protein_pair_full": {"accuracy": 0.4575, "f1": 0.1422924901185771}, "dna_protein_pair_rand_full": {"accuracy": 0.548625, "f1": 0.23382134521536177}}
|
| 64 |
+
{"seed": 5399, "dna_protein_pair_full": {"accuracy": 0.535, "f1": 0.16216216216216217}, "dna_protein_pair_rand_full": {"accuracy": 0.53825, "f1": 0.18634361233480176}}
|
| 65 |
+
{"seed": 6209, "dna_protein_pair_full": {"accuracy": 0.4975, "f1": 0.6365280289330922}, "dna_protein_pair_rand_full": {"accuracy": 0.604, "f1": 0.7133031674208145}}
|
| 66 |
+
{"seed": 3661, "dna_protein_pair_full": {"accuracy": 0.46, "f1": 0.07692307692307693}, "dna_protein_pair_rand_full": {"accuracy": 0.527125, "f1": 0.11632796075683252}}
|
| 67 |
+
{"seed": 2554, "dna_protein_pair_full": {"accuracy": 0.4925, "f1": 0.00975609756097561}, "dna_protein_pair_rand_full": {"accuracy": 0.509, "f1": 0.022399203583872575}}
|
| 68 |
+
{"seed": 792, "dna_protein_pair_full": {"accuracy": 0.445, "f1": 0.6063829787234043}, "dna_protein_pair_rand_full": {"accuracy": 0.463625, "f1": 0.5924589229746414}}
|
| 69 |
+
{"seed": 9490, "dna_protein_pair_full": {"accuracy": 0.42, "f1": 0.2}, "dna_protein_pair_rand_full": {"accuracy": 0.548, "f1": 0.3700348432055749}}
|
| 70 |
+
{"seed": 8747, "dna_protein_pair_full": {"accuracy": 0.535, "f1": 0.44642857142857145}, "dna_protein_pair_rand_full": {"accuracy": 0.741, "f1": 0.7396984924623116}}
|
| 71 |
+
{"seed": 8740, "dna_protein_pair_full": {"accuracy": 0.46, "f1": 0.01818181818181818}, "dna_protein_pair_rand_full": {"accuracy": 0.526625, "f1": 0.10366863905325444}}
|
| 72 |
+
{"seed": 6640, "dna_protein_pair_full": {"accuracy": 0.46, "f1": 0.6142857142857143}, "dna_protein_pair_rand_full": {"accuracy": 0.565625, "f1": 0.6598139990210474}}
|
| 73 |
+
{"seed": 5929, "dna_protein_pair_full": {"accuracy": 0.48, "f1": 0.045871559633027525}, "dna_protein_pair_rand_full": {"accuracy": 0.531125, "f1": 0.17397049108125964}}
|
| 74 |
+
{"seed": 4092, "dna_protein_pair_full": {"accuracy": 0.48, "f1": 0.35}, "dna_protein_pair_rand_full": {"accuracy": 0.5145, "f1": 0.3636959370904325}}
|
| 75 |
+
{"seed": 5868, "dna_protein_pair_full": {"accuracy": 0.4975, "f1": 0.1518987341772152}, "dna_protein_pair_rand_full": {"accuracy": 0.595625, "f1": 0.3625615763546798}}
|
| 76 |
+
{"seed": 2852, "dna_protein_pair_full": {"accuracy": 0.4875, "f1": 0.07239819004524888}, "dna_protein_pair_rand_full": {"accuracy": 0.5245, "f1": 0.11534883720930232}}
|
| 77 |
+
{"seed": 3192, "dna_protein_pair_full": {"accuracy": 0.52, "f1": 0.06796116504854369}, "dna_protein_pair_rand_full": {"accuracy": 0.53, "f1": 0.14855072463768115}}
|
| 78 |
+
{"seed": 6069, "dna_protein_pair_full": {"accuracy": 0.4525, "f1": 0.6217616580310881}, "dna_protein_pair_rand_full": {"accuracy": 0.532, "f1": 0.6772413793103448}}
|
| 79 |
+
{"seed": 9895, "dna_protein_pair_full": {"accuracy": 0.43, "f1": 0.5854545454545454}, "dna_protein_pair_rand_full": {"accuracy": 0.3955, "f1": 0.48222698072805137}}
|
| 80 |
+
{"seed": 4622, "dna_protein_pair_full": {"accuracy": 0.505, "f1": 0.08333333333333333}, "dna_protein_pair_rand_full": {"accuracy": 0.512625, "f1": 0.1140649852306294}}
|
| 81 |
+
{"seed": 690, "dna_protein_pair_full": {"accuracy": 0.455, "f1": 0.043859649122807015}, "dna_protein_pair_rand_full": {"accuracy": 0.494, "f1": 0.04483246814535158}}
|
| 82 |
+
{"seed": 2905, "dna_protein_pair_full": {"accuracy": 0.42, "f1": 0.12781954887218044}, "dna_protein_pair_rand_full": {"accuracy": 0.533875, "f1": 0.21412012644889358}}
|
| 83 |
+
{"seed": 1038, "dna_protein_pair_full": {"accuracy": 0.445, "f1": 0.15267175572519084}, "dna_protein_pair_rand_full": {"accuracy": 0.638125, "f1": 0.4925503943908852}}
|
| 84 |
+
{"seed": 4612, "dna_protein_pair_full": {"accuracy": 0.5175, "f1": 0.4469914040114613}, "dna_protein_pair_rand_full": {"accuracy": 0.593375, "f1": 0.546114134226315}}
|
| 85 |
+
{"seed": 277, "dna_protein_pair_full": {"accuracy": 0.37, "f1": 0.411214953271028}, "dna_protein_pair_rand_full": {"accuracy": 0.5925, "f1": 0.5316091954022989}}
|
| 86 |
+
{"seed": 6277, "dna_protein_pair_full": {"accuracy": 0.5675, "f1": 0.2100456621004566}, "dna_protein_pair_rand_full": {"accuracy": 0.572125, "f1": 0.3481241668253666}}
|
| 87 |
+
{"seed": 2345, "dna_protein_pair_full": {"accuracy": 0.52, "f1": 0.02040816326530612}, "dna_protein_pair_rand_full": {"accuracy": 0.508125, "f1": 0.04141291108404385}}
|
| 88 |
+
{"seed": 8403, "dna_protein_pair_full": {"accuracy": 0.5025, "f1": 0.009950248756218905}, "dna_protein_pair_rand_full": {"accuracy": 0.501875, "f1": 0.0216056960471397}}
|
| 89 |
+
{"seed": 4328, "dna_protein_pair_full": {"accuracy": 0.4, "f1": 0.5020746887966805}, "dna_protein_pair_rand_full": {"accuracy": 0.583375, "f1": 0.6376780084791825}}
|
| 90 |
+
{"seed": 8255, "dna_protein_pair_full": {"accuracy": 0.4525, "f1": 0.13438735177865613}, "dna_protein_pair_rand_full": {"accuracy": 0.511125, "f1": 0.15801937567276642}}
|
| 91 |
+
{"seed": 7781, "dna_protein_pair_full": {"accuracy": 0.4675, "f1": 0.5125858123569794}, "dna_protein_pair_rand_full": {"accuracy": 0.655375, "f1": 0.6845898638599702}}
|
| 92 |
+
{"seed": 8414, "dna_protein_pair_full": {"accuracy": 0.5125, "f1": 0.10138248847926268}, "dna_protein_pair_rand_full": {"accuracy": 0.533625, "f1": 0.1772877618522602}}
|
| 93 |
+
{"seed": 7077, "dna_protein_pair_full": {"accuracy": 0.4825, "f1": 0.06334841628959276}, "dna_protein_pair_rand_full": {"accuracy": 0.511625, "f1": 0.08135433811427228}}
|
| 94 |
+
{"seed": 1349, "dna_protein_pair_full": {"accuracy": 0.44, "f1": 0.2631578947368421}, "dna_protein_pair_rand_full": {"accuracy": 0.5775, "f1": 0.38004402054292}}
|
| 95 |
+
{"seed": 2336, "dna_protein_pair_full": {"accuracy": 0.5125, "f1": 0.5894736842105263}, "dna_protein_pair_rand_full": {"accuracy": 0.68, "f1": 0.722161927501628}}
|
| 96 |
+
{"seed": 5417, "dna_protein_pair_full": {"accuracy": 0.395, "f1": 0.33879781420765026}, "dna_protein_pair_rand_full": {"accuracy": 0.4325, "f1": 0.18168709444844988}}
|
| 97 |
+
{"seed": 17, "dna_protein_pair_full": {"accuracy": 0.55, "f1": 0.6234309623430963}, "dna_protein_pair_rand_full": {"accuracy": 0.57725, "f1": 0.6246392896781354}}
|
| 98 |
+
{"seed": 3656, "dna_protein_pair_full": {"accuracy": 0.525, "f1": 0.020618556701030927}, "dna_protein_pair_rand_full": {"accuracy": 0.592875, "f1": 0.3698974656606694}}
|
| 99 |
+
{"seed": 8342, "dna_protein_pair_full": {"accuracy": 0.4875, "f1": 0.07239819004524888}, "dna_protein_pair_rand_full": {"accuracy": 0.5275, "f1": 0.16776750330250992}}
|
| 100 |
+
{"seed": 8521, "dna_protein_pair_full": {"accuracy": 0.44, "f1": 0.20567375886524822}, "dna_protein_pair_rand_full": {"accuracy": 0.612, "f1": 0.42497221193034457}}
|
| 101 |
+
{"seed": 7440, "dna_protein_pair_full": {"accuracy": 0.475, "f1": 0.3269230769230769}, "dna_protein_pair_rand_full": {"accuracy": 0.603875, "f1": 0.47663088356729977}}
|
| 102 |
+
{"seed": 5855, "dna_protein_pair_full": {"accuracy": 0.49, "f1": 0.4069767441860465}, "dna_protein_pair_rand_full": {"accuracy": 0.64875, "f1": 0.6561429270680372}}
|
| 103 |
+
{"seed": 6990, "dna_protein_pair_full": {"accuracy": 0.4125, "f1": 0.4418052256532066}, "dna_protein_pair_rand_full": {"accuracy": 0.62225, "f1": 0.640665873959572}}
|
| 104 |
+
{"seed": 354, "dna_protein_pair_full": {"accuracy": 0.49, "f1": 0.184}, "dna_protein_pair_rand_full": {"accuracy": 0.64475, "f1": 0.5163376446562288}}
|
| 105 |
+
{"seed": 6884, "dna_protein_pair_full": {"accuracy": 0.4075, "f1": 0.45011600928074247}, "dna_protein_pair_rand_full": {"accuracy": 0.59125, "f1": 0.5594179466451091}}
|
| 106 |
+
{"seed": 7496, "dna_protein_pair_full": {"accuracy": 0.46, "f1": 0.3532934131736527}, "dna_protein_pair_rand_full": {"accuracy": 0.582125, "f1": 0.45331152902698285}}
|
| 107 |
+
{"seed": 8078, "dna_protein_pair_full": {"accuracy": 0.5, "f1": 0.4350282485875706}, "dna_protein_pair_rand_full": {"accuracy": 0.651875, "f1": 0.6102169349195241}}
|
| 108 |
+
{"seed": 633, "dna_protein_pair_full": {"accuracy": 0.4825, "f1": 0.15510204081632653}, "dna_protein_pair_rand_full": {"accuracy": 0.5255, "f1": 0.17871051492860235}}
|
| 109 |
+
{"seed": 3696, "dna_protein_pair_full": {"accuracy": 0.475, "f1": 0.03669724770642202}, "dna_protein_pair_rand_full": {"accuracy": 0.495125, "f1": 0.02037351443123939}}
|
| 110 |
+
{"seed": 5461, "dna_protein_pair_full": {"accuracy": 0.49, "f1": 0.5233644859813084}, "dna_protein_pair_rand_full": {"accuracy": 0.687625, "f1": 0.7250522609748047}}
|
| 111 |
+
{"seed": 2135, "dna_protein_pair_full": {"accuracy": 0.495, "f1": 0.26277372262773724}, "dna_protein_pair_rand_full": {"accuracy": 0.667875, "f1": 0.5391153512575889}}
|
| 112 |
+
{"seed": 4135, "dna_protein_pair_full": {"accuracy": 0.3775, "f1": 0.15593220338983052}, "dna_protein_pair_rand_full": {"accuracy": 0.52925, "f1": 0.1932305055698372}}
|
| 113 |
+
{"seed": 5194, "dna_protein_pair_full": {"accuracy": 0.51, "f1": 0.10909090909090909}, "dna_protein_pair_rand_full": {"accuracy": 0.634625, "f1": 0.47437511238985797}}
|
| 114 |
+
{"seed": 1689, "dna_protein_pair_full": {"accuracy": 0.3925, "f1": 0.26586102719033233}, "dna_protein_pair_rand_full": {"accuracy": 0.727375, "f1": 0.7038696537678207}}
|
| 115 |
+
{"seed": 299, "dna_protein_pair_full": {"accuracy": 0.54, "f1": 0.34285714285714286}, "dna_protein_pair_rand_full": {"accuracy": 0.71375, "f1": 0.6547044632086851}}
|
| 116 |
+
{"seed": 5598, "dna_protein_pair_full": {"accuracy": 0.5075, "f1": 0.0}, "dna_protein_pair_rand_full": {"accuracy": 0.506375, "f1": 0.009530975670930524}}
|
| 117 |
+
{"seed": 7204, "dna_protein_pair_full": {"accuracy": 0.6675, "f1": 0.6683291770573566}, "dna_protein_pair_rand_full": {"accuracy": 0.6345, "f1": 0.7026037428803905}}
|
| 118 |
+
{"seed": 5368, "dna_protein_pair_full": {"accuracy": 0.38, "f1": 0.40669856459330145}, "dna_protein_pair_rand_full": {"accuracy": 0.46225, "f1": 0.281563126252505}}
|
| 119 |
+
{"seed": 2689, "dna_protein_pair_full": {"accuracy": 0.3875, "f1": 0.449438202247191}, "dna_protein_pair_rand_full": {"accuracy": 0.586625, "f1": 0.63128553907905}}
|
| 120 |
+
{"seed": 7058, "dna_protein_pair_full": {"accuracy": 0.435, "f1": 0.5568627450980392}, "dna_protein_pair_rand_full": {"accuracy": 0.6725, "f1": 0.7355672184093661}}
|
| 121 |
+
{"seed": 8135, "dna_protein_pair_full": {"accuracy": 0.5, "f1": 0.05660377358490566}, "dna_protein_pair_rand_full": {"accuracy": 0.54, "f1": 0.1704238052299369}}
|
| 122 |
+
{"seed": 9685, "dna_protein_pair_full": {"accuracy": 0.5125, "f1": 0.25287356321839083}, "dna_protein_pair_rand_full": {"accuracy": 0.619875, "f1": 0.5038342307064774}}
|
| 123 |
+
{"seed": 6034, "dna_protein_pair_full": {"accuracy": 0.51, "f1": 0.125}, "dna_protein_pair_rand_full": {"accuracy": 0.574625, "f1": 0.34494706448508183}}
|
| 124 |
+
{"seed": 311, "dna_protein_pair_full": {"accuracy": 0.5025, "f1": 0.17427385892116182}, "dna_protein_pair_rand_full": {"accuracy": 0.624875, "f1": 0.43452044469568496}}
|
finetune/gpt2_gene_multiv2_ft_en3.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
finetune/gpt2_gene_multiv2_ft_en_test_others.py
ADDED
|
@@ -0,0 +1,344 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# import os
|
| 2 |
+
|
| 3 |
+
# # 设置环境变量
|
| 4 |
+
# os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
|
| 5 |
+
|
| 6 |
+
# # 打印环境变量以确认设置成功
|
| 7 |
+
# print(os.environ.get('HF_ENDPOINT'))
|
| 8 |
+
|
| 9 |
+
# import subprocess
|
| 10 |
+
# import os
|
| 11 |
+
|
| 12 |
+
# result = subprocess.run('bash -c "source /etc/network_turbo && env | grep proxy"', shell=True, capture_output=True, text=True)
|
| 13 |
+
# output = result.stdout
|
| 14 |
+
# for line in output.splitlines():
|
| 15 |
+
# if '=' in line:
|
| 16 |
+
# var, value = line.split('=', 1)
|
| 17 |
+
# os.environ[var] = value
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
from datasets import load_dataset
|
| 21 |
+
from transformers import AutoTokenizer, DataCollatorWithPadding
|
| 22 |
+
from transformers import Trainer
|
| 23 |
+
import evaluate
|
| 24 |
+
import numpy as np
|
| 25 |
+
from transformers import TrainingArguments
|
| 26 |
+
from transformers import AutoModelForSequenceClassification
|
| 27 |
+
import json
|
| 28 |
+
from transformers import set_seed
|
| 29 |
+
import random
|
| 30 |
+
import numpy as np
|
| 31 |
+
import torch
|
| 32 |
+
import sys
|
| 33 |
+
|
| 34 |
+
# seed = 42
|
| 35 |
+
# random.seed(seed)
|
| 36 |
+
# np.random.seed(seed)
|
| 37 |
+
# torch.manual_seed(seed)
|
| 38 |
+
# torch.cuda.manual_seed_all(seed)
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
# Seed is read from the first command-line argument (random generation is disabled below)
|
| 42 |
+
import random
|
| 43 |
+
#seed = random.randint(0, 10000)
|
| 44 |
+
seed = int(sys.argv[1])
|
| 45 |
+
#print(f"Generated seed: {seed}")
|
| 46 |
+
set_seed(seed)
|
| 47 |
+
result = {}
|
| 48 |
+
result["seed"] = seed
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
# Load the English ('en') configuration of the PAWS-X sentence-pair dataset
|
| 53 |
+
raw_datasets = load_dataset('paws-x', 'en') # 或者指定特定语言如 'zh' 表示中文,https://huggingface.co/datasets/google-research-datasets/paws-x
|
| 54 |
+
|
| 55 |
+
#分词器
|
| 56 |
+
tokenizer = AutoTokenizer.from_pretrained("dnagpt/gpt2_gene_multi_v2")
|
| 57 |
+
tokenizer.pad_token = tokenizer.eos_token
|
| 58 |
+
|
| 59 |
+
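# Switch the tokenizer's padding side to the left; the default is right, and left is suggested for classification (currently disabled)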
|
| 60 |
+
#tokenizer.padding_side = "left"
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
#分词函数
|
| 64 |
+
def tokenize_function(example):
    """Tokenize ``sentence1``/``sentence2`` pairs to a fixed length of 256.

    Each pair is encoded together by the module-level ``tokenizer``,
    truncated to at most 256 tokens and padded up to exactly 256.

    Args:
        example: Mapping with ``"sentence1"`` and ``"sentence2"`` entries.
            This script applies the function through ``map(batched=True)``,
            so the entries are presumably lists of strings.

    Returns:
        Whatever the tokenizer call returns for the pair(s).
    """
    first_texts = example["sentence1"]
    second_texts = example["sentence2"]
    # Variant without fixed-length padding, kept for reference:
    # return tokenizer(first_texts, second_texts, truncation=True, max_length=256)
    return tokenizer(
        first_texts,
        second_texts,
        truncation=True,
        max_length=256,
        padding="max_length",
    )
|
| 67 |
+
#return tokenizer(example["sentence1"], example["sentence2"], truncation=True,max_length=1024) #padding="max_length")
|
| 68 |
+
|
| 69 |
+
#构建分词后的数据集
|
| 70 |
+
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
|
| 71 |
+
|
| 72 |
+
#训练数据构建
|
| 73 |
+
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
#指标函数定义
|
| 77 |
+
def compute_metrics(eval_pred):
    """Compute classification accuracy for an evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` holds one
            row of per-class scores per example; ``labels`` holds the
            reference class index of each example.

    Returns:
        dict: ``{'accuracy': fraction of rows whose arg-max equals the label}``.
    """
    scores, labels = eval_pred
    # The predicted class is the index of the highest score in each row.
    predicted_classes = np.argmax(scores, axis=1)
    n_correct = (predicted_classes == labels).sum()
    return {'accuracy': n_correct / len(labels)}
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
training_args = TrainingArguments(
|
| 85 |
+
output_dir="ds_job_dna_2222",
|
| 86 |
+
learning_rate=1e-5,
|
| 87 |
+
lr_scheduler_type="constant_with_warmup",
|
| 88 |
+
warmup_ratio=0.1,
|
| 89 |
+
optim='adamw_torch',
|
| 90 |
+
weight_decay=0.0,
|
| 91 |
+
seed=seed, # 使用动态生成的随机种子
|
| 92 |
+
per_device_train_batch_size=20,
|
| 93 |
+
per_device_eval_batch_size=20,
|
| 94 |
+
num_train_epochs=4, #训练多少轮
|
| 95 |
+
evaluation_strategy="epoch",
|
| 96 |
+
save_strategy="epoch",
|
| 97 |
+
logging_strategy="epoch",
|
| 98 |
+
load_best_model_at_end=True
|
| 99 |
+
)
|
| 100 |
+
|
| 101 |
+
#模型定义,文本分类模型
|
| 102 |
+
model = AutoModelForSequenceClassification.from_pretrained("dnagpt/gpt2_gene_multi_v2", num_labels=2)
|
| 103 |
+
model.config.pad_token_id = model.config.eos_token_id
|
| 104 |
+
|
| 105 |
+
trainer = Trainer(
|
| 106 |
+
model,
|
| 107 |
+
training_args,
|
| 108 |
+
train_dataset=tokenized_datasets["train"],
|
| 109 |
+
eval_dataset=tokenized_datasets["validation"],
|
| 110 |
+
data_collator=data_collator,
|
| 111 |
+
tokenizer=tokenizer,
|
| 112 |
+
compute_metrics=compute_metrics,
|
| 113 |
+
)
|
| 114 |
+
|
| 115 |
+
trainer.train() #模型训练
|
| 116 |
+
|
| 117 |
+
#模型测试,英文数据集
|
| 118 |
+
predictions = trainer.predict(tokenized_datasets["test"])
|
| 119 |
+
preds = np.argmax(predictions.predictions, axis=-1)
|
| 120 |
+
metric = evaluate.load("glue", "mrpc")
|
| 121 |
+
ret = metric.compute(predictions=preds, references=predictions.label_ids)
|
| 122 |
+
result["en"] = ret
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
#模型测试,法文数据集
|
| 126 |
+
raw_datasets_fr = load_dataset('paws-x', 'fr') # 或者指定特定语言如 'zh' 表示中文,https://huggingface.co/datasets/google-research-datasets/paws-x
|
| 127 |
+
tokenized_datasets_fr = raw_datasets_fr.map(tokenize_function, batched=True)
|
| 128 |
+
|
| 129 |
+
predictions = trainer.predict(tokenized_datasets_fr["test"])
|
| 130 |
+
preds = np.argmax(predictions.predictions, axis=-1)
|
| 131 |
+
metric = evaluate.load("glue", "mrpc")
|
| 132 |
+
ret = metric.compute(predictions=preds, references=predictions.label_ids)
|
| 133 |
+
result["fr"] = ret
|
| 134 |
+
|
| 135 |
+
#模型测试,德文数据集
|
| 136 |
+
raw_datasets_de = load_dataset('google-research-datasets/paws-x', 'de') # 或者指定特定语言如 'zh' 表示中文,https://huggingface.co/datasets/google-research-datasets/paws-
|
| 137 |
+
tokenized_datasets_de = raw_datasets_de.map(tokenize_function, batched=True)
|
| 138 |
+
predictions = trainer.predict(tokenized_datasets_de["test"])
|
| 139 |
+
preds = np.argmax(predictions.predictions, axis=-1)
|
| 140 |
+
metric = evaluate.load("glue", "mrpc")
|
| 141 |
+
ret = metric.compute(predictions=preds, references=predictions.label_ids)
|
| 142 |
+
result["de"] = ret
|
| 143 |
+
|
| 144 |
+
#模型测试,中文数据集
|
| 145 |
+
raw_datasets_zh = load_dataset('google-research-datasets/paws-x', 'zh') # 或者指定特定语言如 'zh' 表示中文,https://huggingface.co/datasets/google-research-datasets/paws-
|
| 146 |
+
tokenized_datasets_zh = raw_datasets_zh.map(tokenize_function, batched=True)
|
| 147 |
+
|
| 148 |
+
predictions = trainer.predict(tokenized_datasets_zh["test"])
|
| 149 |
+
preds = np.argmax(predictions.predictions, axis=-1)
|
| 150 |
+
metric = evaluate.load("glue", "mrpc")
|
| 151 |
+
ret = metric.compute(predictions=preds, references=predictions.label_ids)
|
| 152 |
+
result["zh"] = ret
|
| 153 |
+
|
| 154 |
+
#模型测试 dna数据集,150 bp长度 简单版本
|
| 155 |
+
raw_datasets_dna =load_dataset('dnagpt/gene_lan_transfer', 'dna_sim_pair_simple_150bp')['train'].train_test_split(test_size=0.2) #默认已经shuffle
|
| 156 |
+
tokenized_datasets_dna = raw_datasets_dna.map(tokenize_function, batched=True)
|
| 157 |
+
predictions = trainer.predict(tokenized_datasets_dna["test"])
|
| 158 |
+
preds = np.argmax(predictions.predictions, axis=-1)
|
| 159 |
+
metric = evaluate.load("glue", "mrpc")
|
| 160 |
+
ret = metric.compute(predictions=preds, references=predictions.label_ids)
|
| 161 |
+
result["dna_sim_pair_simple_150bp"] = ret
|
| 162 |
+
|
| 163 |
+
#模型测试 dna数据集,150长度,复杂版本 不相似
|
| 164 |
+
raw_datasets_dna = load_dataset('dnagpt/gene_lan_transfer', 'dna_sim_pair_150bp')['train'].train_test_split(test_size=0.2) #默认已经shuffle
|
| 165 |
+
tokenized_datasets_dna= raw_datasets_dna.map(tokenize_function, batched=True)
|
| 166 |
+
|
| 167 |
+
predictions = trainer.predict(tokenized_datasets_dna["test"])
|
| 168 |
+
preds = np.argmax(predictions.predictions, axis=-1)
|
| 169 |
+
metric = evaluate.load("glue", "mrpc")
|
| 170 |
+
ret = metric.compute(predictions=preds, references=predictions.label_ids)
|
| 171 |
+
result["dna_sim_pair_150bp"] = ret
|
| 172 |
+
|
| 173 |
+
#模型测试 dna数据集,50长度,复杂版本 不相似
|
| 174 |
+
raw_datasets_dna = load_dataset('dnagpt/gene_lan_transfer', 'dna_sim_pair_50bp')['train'].train_test_split(test_size=0.1) #默认已经shuffle
|
| 175 |
+
tokenized_datasets_dna = raw_datasets_dna.map(tokenize_function, batched=True)
|
| 176 |
+
predictions = trainer.predict(tokenized_datasets_dna["test"])
|
| 177 |
+
preds = np.argmax(predictions.predictions, axis=-1)
|
| 178 |
+
metric = evaluate.load("glue", "mrpc")
|
| 179 |
+
ret = metric.compute(predictions=preds, references=predictions.label_ids)
|
| 180 |
+
result["dna_sim_pair_50bp"] = ret
|
| 181 |
+
|
| 182 |
+
#模型测试 蛋白质数据集,50长度/150bp,复杂版本 不相似
|
| 183 |
+
raw_datasets_dna_protein = load_dataset('dnagpt/gene_lan_transfer', 'protein_sim_pair_150bp')['train'].train_test_split(test_size=0.1) #默认已经shuffle
|
| 184 |
+
tokenized_datasets_dna_protein = raw_datasets_dna_protein.map(tokenize_function, batched=True)
|
| 185 |
+
predictions = trainer.predict(tokenized_datasets_dna_protein["test"])
|
| 186 |
+
preds = np.argmax(predictions.predictions, axis=-1)
|
| 187 |
+
metric = evaluate.load("glue", "mrpc")
|
| 188 |
+
ret = metric.compute(predictions=preds, references=predictions.label_ids)
|
| 189 |
+
result["protein_sim_pair_150bp"] = ret
|
| 190 |
+
|
| 191 |
+
#模型测试 蛋白质数据集,150长度/450bp,复杂版本 不相似
|
| 192 |
+
|
| 193 |
+
raw_datasets_dna_protein = load_dataset('dnagpt/gene_lan_transfer', 'protein_sim_pair_450bp')['train'].train_test_split(test_size=0.1) #默认已经shuffle
|
| 194 |
+
tokenized_datasets_dna_protein = raw_datasets_dna_protein.map(tokenize_function, batched=True)
|
| 195 |
+
predictions = trainer.predict(tokenized_datasets_dna_protein["test"])
|
| 196 |
+
preds = np.argmax(predictions.predictions, axis=-1)
|
| 197 |
+
metric = evaluate.load("glue", "mrpc")
|
| 198 |
+
ret = metric.compute(predictions=preds, references=predictions.label_ids)
|
| 199 |
+
result["protein_sim_pair_450bp"] = ret
|
| 200 |
+
|
| 201 |
+
#Model test: dna_protein_pair dataset, sentence1 cut to 150 characters and sentence2 to 50, labels flipped
|
| 202 |
+
raw_datasets_dna_protein = load_dataset('dnagpt/gene_lan_transfer', 'dna_protein_pair')['train'].train_test_split(test_size=0.1) #默认已经shuffle
|
| 203 |
+
|
| 204 |
+
# 定义翻转标签的函数
|
| 205 |
+
def flip_labels(example):
    """Trim both sequences of one example and invert its label.

    ``sentence1`` is cut to its first 150 characters, ``sentence2`` to its
    first 50, and ``label`` becomes ``1 - label``. The mapping is modified in
    place and returned.

    NOTE(review): the original comment said over-long DNA sequences make a
    "bert" tokenizer emit a single unknown token; this script loads a GPT-2
    tokenizer, so confirm the truncation is still needed.
    """
    first_head = example["sentence1"][:150]
    second_head = example["sentence2"][:50]
    flipped = 1 - example['label']
    example["sentence1"] = first_head
    example["sentence2"] = second_head
    example['label'] = flipped
    return example
|
| 211 |
+
|
| 212 |
+
# 应用翻转标签函数
|
| 213 |
+
flipped_datasets_dna_protein = raw_datasets_dna_protein.map(flip_labels, batched=False)
|
| 214 |
+
|
| 215 |
+
tokenized_datasets_dna_protein = flipped_datasets_dna_protein.map(tokenize_function, batched=True)
|
| 216 |
+
predictions = trainer.predict(tokenized_datasets_dna_protein["test"])
|
| 217 |
+
preds = np.argmax(predictions.predictions, axis=-1)
|
| 218 |
+
metric = evaluate.load("glue", "mrpc")
|
| 219 |
+
ret = metric.compute(predictions=preds, references=predictions.label_ids)
|
| 220 |
+
result["dna_protein_pair"] = ret
|
| 221 |
+
|
| 222 |
+
|
| 223 |
+
#Model test: dna_protein_pair dataset, sentence1 cut to 300 characters and sentence2 to 100, labels flipped
|
| 224 |
+
raw_datasets_dna_protein = load_dataset('dnagpt/gene_lan_transfer', 'dna_protein_pair')['train'].train_test_split(test_size=0.1) #默认已经shuffle
|
| 225 |
+
|
| 226 |
+
# 定义翻转标签的函数
|
| 227 |
+
def flip_labels(example):
    """Trim both sequences of one example and invert its label.

    ``sentence1`` is cut to its first 300 characters, ``sentence2`` to its
    first 100, and ``label`` becomes ``1 - label``. The mapping is modified
    in place and returned.

    NOTE(review): the original comment said over-long DNA sequences make a
    "bert" tokenizer emit a single unknown token; this script loads a GPT-2
    tokenizer, so confirm the truncation is still needed.
    """
    first_head = example["sentence1"][:300]
    second_head = example["sentence2"][:100]
    flipped = 1 - example['label']
    example["sentence1"] = first_head
    example["sentence2"] = second_head
    example['label'] = flipped
    return example
|
| 233 |
+
|
| 234 |
+
# 应用翻转标签函数
|
| 235 |
+
flipped_datasets_dna_protein = raw_datasets_dna_protein.map(flip_labels, batched=False)
|
| 236 |
+
|
| 237 |
+
tokenized_datasets_dna_protein = flipped_datasets_dna_protein.map(tokenize_function, batched=True)
|
| 238 |
+
predictions = trainer.predict(tokenized_datasets_dna_protein["test"])
|
| 239 |
+
preds = np.argmax(predictions.predictions, axis=-1)
|
| 240 |
+
metric = evaluate.load("glue", "mrpc")
|
| 241 |
+
ret = metric.compute(predictions=preds, references=predictions.label_ids)
|
| 242 |
+
result["dna_protein_pair_100"] = ret
|
| 243 |
+
|
| 244 |
+
|
| 245 |
+
|
| 246 |
+
|
| 247 |
+
|
| 248 |
+
|
| 249 |
+
#Model test: dna_protein_pair dataset, full-length sequences (no character truncation), labels flipped
|
| 250 |
+
raw_datasets_dna_protein = load_dataset('dnagpt/gene_lan_transfer', 'dna_protein_pair')['train'].train_test_split(test_size=0.1) #默认已经shuffle
|
| 251 |
+
|
| 252 |
+
# 定义翻转标签的函数
|
| 253 |
+
def flip_labels(example):
    """Invert the label of one example, leaving both sequences untouched.

    This is the full-length variant: ``sentence1`` and ``sentence2`` are not
    truncated here. The original body re-assigned each field to itself, which
    had no effect, and carried a comment describing a truncation that was not
    performed; both were removed.

    Args:
        example: Mapping with a numeric ``'label'`` entry.

    Returns:
        The same mapping, modified in place, with ``label`` replaced by
        ``1 - label``.
    """
    example['label'] = 1 - example['label']
    return example
|
| 259 |
+
|
| 260 |
+
# 应用翻转标签函数
|
| 261 |
+
flipped_datasets_dna_protein = raw_datasets_dna_protein.map(flip_labels, batched=False)
|
| 262 |
+
|
| 263 |
+
tokenized_datasets_dna_protein = flipped_datasets_dna_protein.map(tokenize_function, batched=True)
|
| 264 |
+
predictions = trainer.predict(tokenized_datasets_dna_protein["test"])
|
| 265 |
+
preds = np.argmax(predictions.predictions, axis=-1)
|
| 266 |
+
metric = evaluate.load("glue", "mrpc")
|
| 267 |
+
ret = metric.compute(predictions=preds, references=predictions.label_ids)
|
| 268 |
+
result["dna_protein_pair_full"] = ret
|
| 269 |
+
|
| 270 |
+
|
| 271 |
+
|
| 272 |
+
|
| 273 |
+
|
| 274 |
+
#模型测试 蛋白质数据集,随机版本
|
| 275 |
+
raw_datasets_dna_protein = load_dataset('dnagpt/gene_lan_transfer', 'dna_protein_pair_rand')['train'].train_test_split(test_size=0.1) #默认已经shuffle
|
| 276 |
+
|
| 277 |
+
# 定义翻转标签的函数
|
| 278 |
+
def flip_labels(example):
    """Trim both sequences of one example and invert its label.

    ``sentence1`` is cut to its first 150 characters, ``sentence2`` to its
    first 50, and ``label`` becomes ``1 - label``. The mapping is modified in
    place and returned.

    NOTE(review): the original comment said over-long DNA sequences make a
    "bert" tokenizer emit a single unknown token; this script loads a GPT-2
    tokenizer, so confirm the truncation is still needed.
    """
    first_head = example["sentence1"][:150]
    second_head = example["sentence2"][:50]
    flipped = 1 - example['label']
    example["sentence1"] = first_head
    example["sentence2"] = second_head
    example['label'] = flipped
    return example
|
| 284 |
+
|
| 285 |
+
# 应用翻转标签函数
|
| 286 |
+
flipped_datasets_dna_protein = raw_datasets_dna_protein.map(flip_labels, batched=False)
|
| 287 |
+
|
| 288 |
+
tokenized_datasets_dna_protein = flipped_datasets_dna_protein.map(tokenize_function, batched=True)
|
| 289 |
+
predictions = trainer.predict(tokenized_datasets_dna_protein["test"])
|
| 290 |
+
preds = np.argmax(predictions.predictions, axis=-1)
|
| 291 |
+
metric = evaluate.load("glue", "mrpc")
|
| 292 |
+
ret = metric.compute(predictions=preds, references=predictions.label_ids)
|
| 293 |
+
result["dna_protein_pair_rand"] = ret
|
| 294 |
+
|
| 295 |
+
|
| 296 |
+
#模型测试 蛋白质数据集,随机版本
|
| 297 |
+
raw_datasets_dna_protein = load_dataset('dnagpt/gene_lan_transfer', 'dna_protein_pair_rand')['train'].train_test_split(test_size=0.1) #默认已经shuffle
|
| 298 |
+
|
| 299 |
+
# 定义翻转标签的函数
|
| 300 |
+
def flip_labels(example):
    """Trim both sequences of one example and invert its label.

    ``sentence1`` is cut to its first 300 characters, ``sentence2`` to its
    first 100, and ``label`` becomes ``1 - label``. The mapping is modified
    in place and returned.

    NOTE(review): the original comment said over-long DNA sequences make a
    "bert" tokenizer emit a single unknown token; this script loads a GPT-2
    tokenizer, so confirm the truncation is still needed.
    """
    first_head = example["sentence1"][:300]
    second_head = example["sentence2"][:100]
    flipped = 1 - example['label']
    example["sentence1"] = first_head
    example["sentence2"] = second_head
    example['label'] = flipped
    return example
|
| 306 |
+
|
| 307 |
+
# 应用翻转标签函数
|
| 308 |
+
flipped_datasets_dna_protein = raw_datasets_dna_protein.map(flip_labels, batched=False)
|
| 309 |
+
|
| 310 |
+
tokenized_datasets_dna_protein = flipped_datasets_dna_protein.map(tokenize_function, batched=True)
|
| 311 |
+
predictions = trainer.predict(tokenized_datasets_dna_protein["test"])
|
| 312 |
+
preds = np.argmax(predictions.predictions, axis=-1)
|
| 313 |
+
metric = evaluate.load("glue", "mrpc")
|
| 314 |
+
ret = metric.compute(predictions=preds, references=predictions.label_ids)
|
| 315 |
+
result["dna_protein_pair_rand_100"] = ret
|
| 316 |
+
|
| 317 |
+
|
| 318 |
+
|
| 319 |
+
|
| 320 |
+
#模型测试 蛋白质数据集,随机版本
|
| 321 |
+
raw_datasets_dna_protein = load_dataset('dnagpt/gene_lan_transfer', 'dna_protein_pair_rand')['train'].train_test_split(test_size=0.1) #默认已经shuffle
|
| 322 |
+
|
| 323 |
+
# 定义翻转标签的函数
|
| 324 |
+
def flip_labels(example):
    """Invert the label of one example, leaving both sequences untouched.

    This is the full-length variant: ``sentence1`` and ``sentence2`` are not
    truncated here. The original body re-assigned each field to itself, which
    had no effect, and carried a comment describing a truncation that was not
    performed; both were removed.

    Args:
        example: Mapping with a numeric ``'label'`` entry.

    Returns:
        The same mapping, modified in place, with ``label`` replaced by
        ``1 - label``.
    """
    example['label'] = 1 - example['label']
    return example
|
| 330 |
+
|
| 331 |
+
# 应用翻转标签函数
|
| 332 |
+
flipped_datasets_dna_protein = raw_datasets_dna_protein.map(flip_labels, batched=False)
|
| 333 |
+
|
| 334 |
+
tokenized_datasets_dna_protein = flipped_datasets_dna_protein.map(tokenize_function, batched=True)
|
| 335 |
+
predictions = trainer.predict(tokenized_datasets_dna_protein["test"])
|
| 336 |
+
preds = np.argmax(predictions.predictions, axis=-1)
|
| 337 |
+
metric = evaluate.load("glue", "mrpc")
|
| 338 |
+
ret = metric.compute(predictions=preds, references=predictions.label_ids)
|
| 339 |
+
result["dna_protein_pair_rand_full"] = ret
|
| 340 |
+
|
| 341 |
+
|
| 342 |
+
|
| 343 |
+
print(json.dumps(result))
|
| 344 |
+
|
finetune/gpt2_gene_multiv2_ft_en_test_others2.py
ADDED
|
@@ -0,0 +1,233 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# import os
|
| 2 |
+
|
| 3 |
+
# # 设置环境变量
|
| 4 |
+
# os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
|
| 5 |
+
|
| 6 |
+
# # 打印环境变量以确认设置成功
|
| 7 |
+
# print(os.environ.get('HF_ENDPOINT'))
|
| 8 |
+
|
| 9 |
+
# import subprocess
|
| 10 |
+
# import os
|
| 11 |
+
|
| 12 |
+
# result = subprocess.run('bash -c "source /etc/network_turbo && env | grep proxy"', shell=True, capture_output=True, text=True)
|
| 13 |
+
# output = result.stdout
|
| 14 |
+
# for line in output.splitlines():
|
| 15 |
+
# if '=' in line:
|
| 16 |
+
# var, value = line.split('=', 1)
|
| 17 |
+
# os.environ[var] = value
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
from datasets import load_dataset
|
| 21 |
+
from transformers import AutoTokenizer, DataCollatorWithPadding
|
| 22 |
+
from transformers import Trainer
|
| 23 |
+
import evaluate
|
| 24 |
+
import numpy as np
|
| 25 |
+
from transformers import TrainingArguments
|
| 26 |
+
from transformers import AutoModelForSequenceClassification
|
| 27 |
+
import json
|
| 28 |
+
from transformers import set_seed
|
| 29 |
+
import random
|
| 30 |
+
import numpy as np
|
| 31 |
+
import torch
|
| 32 |
+
from tqdm import tqdm
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
# seed = 42
|
| 36 |
+
# random.seed(seed)
|
| 37 |
+
# np.random.seed(seed)
|
| 38 |
+
# torch.manual_seed(seed)
|
| 39 |
+
# torch.cuda.manual_seed_all(seed)
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
# 动态生成随机种子
|
| 43 |
+
import random
|
| 44 |
+
seed = random.randint(0, 10000)
|
| 45 |
+
#print(f"Generated seed: {seed}")
|
| 46 |
+
set_seed(seed)
|
| 47 |
+
result = {}
|
| 48 |
+
result["seed"] = seed
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
# Load the English ('en') configuration of the PAWS-X sentence-pair dataset
|
| 53 |
+
raw_datasets = load_dataset('paws-x', 'en') # 或者指定特定语言如 'zh' 表示中文,https://huggingface.co/datasets/google-research-datasets/paws-x
|
| 54 |
+
|
| 55 |
+
#分词器
|
| 56 |
+
tokenizer = AutoTokenizer.from_pretrained("dnagpt/gpt2_gene_multi_v2")
|
| 57 |
+
tokenizer.pad_token = tokenizer.eos_token
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
#分词函数
|
| 61 |
+
def tokenize_function(example):
    """Tokenize ``sentence1``/``sentence2`` pairs to a fixed length of 256.

    Each pair is encoded together by the module-level ``tokenizer``,
    truncated to at most 256 tokens and padded up to exactly 256.

    Args:
        example: Mapping with ``"sentence1"`` and ``"sentence2"`` entries.
            This script applies the function through ``map(batched=True)``,
            so the entries are presumably lists of strings.

    Returns:
        Whatever the tokenizer call returns for the pair(s).
    """
    first_texts = example["sentence1"]
    second_texts = example["sentence2"]
    return tokenizer(
        first_texts,
        second_texts,
        truncation=True,
        max_length=256,
        padding="max_length",
    )
|
| 63 |
+
|
| 64 |
+
#构建分词后的数据集
|
| 65 |
+
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
|
| 66 |
+
|
| 67 |
+
#训练数据构建
|
| 68 |
+
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
#指标函数定义
|
| 72 |
+
def compute_metrics(eval_pred):
    """Return {'accuracy': ...} for a (scores, labels) evaluation pair.

    The predicted class of each row is the index of its largest score
    along axis 1.
    """
    scores, labels = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    num_correct = (predicted_classes == labels).sum()
    return {'accuracy': num_correct / len(labels)}
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
# Fine-tuning hyper-parameters. Evaluation, checkpointing and logging all
# happen once per epoch, and the best checkpoint is reloaded after training.
training_args = TrainingArguments(
    output_dir="ds_job_dna_2222",
    learning_rate=1e-5,
    lr_scheduler_type="constant_with_warmup",
    warmup_ratio=0.1,
    optim='adamw_torch',
    weight_decay=0.0,
    seed=seed,  # use the randomly generated seed from above
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=4,  # number of training epochs
    # NOTE(review): newer transformers releases name this argument
    # `eval_strategy` -- confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    load_best_model_at_end=True
)

# Model definition: sequence-classification model with two labels
model = AutoModelForSequenceClassification.from_pretrained("dnagpt/gpt2_gene_multi_v2", num_labels=2)
# Mirror the tokenizer setup: the EOS token id doubles as the padding id.
model.config.pad_token_id = model.config.eos_token_id

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()  # train the model
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
def flip_labels(example):
    """Invert the binary label of one example (0 <-> 1) and return the example.

    The sentences are left untouched; the example dict is modified in place.
    NOTE(review): presumably the label convention of the DNA/protein pair data
    is the opposite of the PAWS-X labels the model was trained on -- confirm.
    """
    example['label'] = 1 - example['label']
    return example


def evaluate_dna_protein_pairs(config_name, test_size, batch_size=64):
    """Score the fine-tuned model on one 'dnagpt/gene_lan_transfer' configuration.

    Args:
        config_name: dataset configuration to load (its 'train' split is used).
        test_size: fraction of that split held out as the evaluation set;
            train_test_split shuffles by default.
        batch_size: number of examples per inference batch (64, 128 or 256 are
            reasonable depending on GPU memory).

    Returns:
        The dict produced by the GLUE/MRPC metric for the held-out examples.

    Uses the module-level `model` and `tokenize_function`.
    """
    pair_splits = load_dataset('dnagpt/gene_lan_transfer', config_name)['train'].train_test_split(test_size=test_size)

    # Flip the labels, then tokenize with the same function used for training.
    flipped_splits = pair_splits.map(flip_labels, batched=False)
    tokenized_splits = flipped_splits.map(tokenize_function, batched=True, num_proc=4)

    # Make sure the model is on the GPU when one is available.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()  # inference mode

    test_dataset = tokenized_splits["test"]

    preds = []
    labels = []

    # Walk over the test set in fixed-size slices.
    for start in tqdm(range(0, len(test_dataset), batch_size), desc="Predicting"):
        batch = test_dataset[start : start + batch_size]

        # Convert to tensors and move them to the target device.
        inputs = {
            "input_ids": torch.tensor(batch["input_ids"]).to(device),
            "attention_mask": torch.tensor(batch["attention_mask"]).to(device),
        }

        with torch.no_grad():  # no gradients needed; saves memory
            outputs = model(**inputs)
        # Class with the highest logit for every example.
        batch_preds = torch.argmax(outputs.logits, dim=-1).cpu().numpy()

        preds.extend(batch_preds)
        labels.extend(batch["label"])  # (flipped) reference labels

    metric = evaluate.load("glue", "mrpc")
    return metric.compute(predictions=preds, references=labels)


# Model test: protein dataset, 150 length / 450 bp, complex version, dissimilar
result["dna_protein_pair_full"] = evaluate_dna_protein_pairs('dna_protein_pair', test_size=0.1)

#############################################################
# Model test: protein dataset, random version
result["dna_protein_pair_rand_full"] = evaluate_dna_protein_pairs('dna_protein_pair_rand', test_size=0.5)

print(json.dumps(result))
|
| 233 |
+
|
finetune/run_ft_all_2.sh
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
#source /etc/network_turbo

# Point Hugging Face Hub requests at the hf-mirror.com endpoint.
export HF_ENDPOINT=https://hf-mirror.com

# Run the fine-tune/evaluate script 1000 times, appending the stdout of every
# run to the same results file.
i=0
while [ "$i" -lt 1000 ]
do
    echo "----------------------------------------------$i"
    python gpt2_gene_multiv1_ft_en_test_others2.py >> gpt2_gene_multiv1_ft_en2.json
    i=$((i + 1))
done
|
pretrain/gpt2_gene_multi_v1/ds_zero2_no_offload.json
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"fp16": {
|
| 3 |
+
"enabled": "auto",
|
| 4 |
+
"loss_scale": 0,
|
| 5 |
+
"loss_scale_window": 100,
|
| 6 |
+
"initial_scale_power": 16,
|
| 7 |
+
"hysteresis": 2,
|
| 8 |
+
"min_loss_scale": 1e-10
|
| 9 |
+
},
|
| 10 |
+
|
| 11 |
+
"zero_optimization": {
|
| 12 |
+
"stage": 2,
|
| 13 |
+
"allgather_partitions": true,
|
| 14 |
+
"allgather_bucket_size": 1e8,
|
| 15 |
+
"overlap_comm": true,
|
| 16 |
+
"reduce_scatter": true,
|
| 17 |
+
"reduce_bucket_size": 1e8,
|
| 18 |
+
"contiguous_gradients": true
|
| 19 |
+
},
|
| 20 |
+
|
| 21 |
+
"gradient_accumulation_steps": "auto",
|
| 22 |
+
"gradient_clipping": "auto",
|
| 23 |
+
"steps_per_print": 2000,
|
| 24 |
+
"train_batch_size": "auto",
|
| 25 |
+
"train_micro_batch_size_per_gpu": "auto",
|
| 26 |
+
"wall_clock_breakdown": false
|
| 27 |
+
}
|
pretrain/gpt2_gene_multi_v1/gpt2_gene_multi_tokenizer/.ipynb_checkpoints/special_tokens_map-checkpoint.json
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"eos_token": "<|endoftext|>",
|
| 3 |
+
"pad_token": "<pad>",
|
| 4 |
+
"unk_token": "<unk>"
|
| 5 |
+
}
|
pretrain/gpt2_gene_multi_v1/gpt2_gene_multi_tokenizer/.ipynb_checkpoints/tokenizer_config-checkpoint.json
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"added_tokens_decoder": {
|
| 3 |
+
"0": {
|
| 4 |
+
"content": "<|endoftext|>",
|
| 5 |
+
"lstrip": false,
|
| 6 |
+
"normalized": false,
|
| 7 |
+
"rstrip": false,
|
| 8 |
+
"single_word": false,
|
| 9 |
+
"special": true
|
| 10 |
+
},
|
| 11 |
+
"1": {
|
| 12 |
+
"content": "<pad>",
|
| 13 |
+
"lstrip": false,
|
| 14 |
+
"normalized": false,
|
| 15 |
+
"rstrip": false,
|
| 16 |
+
"single_word": false,
|
| 17 |
+
"special": true
|
| 18 |
+
},
|
| 19 |
+
"2": {
|
| 20 |
+
"content": "<unk>",
|
| 21 |
+
"lstrip": false,
|
| 22 |
+
"normalized": false,
|
| 23 |
+
"rstrip": false,
|
| 24 |
+
"single_word": false,
|
| 25 |
+
"special": true
|
| 26 |
+
}
|
| 27 |
+
},
|
| 28 |
+
"clean_up_tokenization_spaces": false,
|
| 29 |
+
"eos_token": "<|endoftext|>",
|
| 30 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 31 |
+
"pad_token": "<pad>",
|
| 32 |
+
"tokenizer_class": "PreTrainedTokenizerFast",
|
| 33 |
+
"unk_token": "<unk>"
|
| 34 |
+
}
|
pretrain/gpt2_gene_multi_v1/gpt2_gene_multi_tokenizer/special_tokens_map.json
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"eos_token": "<|endoftext|>",
|
| 3 |
+
"pad_token": "<pad>",
|
| 4 |
+
"unk_token": "<unk>"
|
| 5 |
+
}
|
pretrain/gpt2_gene_multi_v1/gpt2_gene_multi_tokenizer/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
pretrain/gpt2_gene_multi_v1/gpt2_gene_multi_tokenizer/tokenizer_config.json
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"added_tokens_decoder": {
|
| 3 |
+
"0": {
|
| 4 |
+
"content": "<|endoftext|>",
|
| 5 |
+
"lstrip": false,
|
| 6 |
+
"normalized": false,
|
| 7 |
+
"rstrip": false,
|
| 8 |
+
"single_word": false,
|
| 9 |
+
"special": true
|
| 10 |
+
},
|
| 11 |
+
"1": {
|
| 12 |
+
"content": "<pad>",
|
| 13 |
+
"lstrip": false,
|
| 14 |
+
"normalized": false,
|
| 15 |
+
"rstrip": false,
|
| 16 |
+
"single_word": false,
|
| 17 |
+
"special": true
|
| 18 |
+
},
|
| 19 |
+
"2": {
|
| 20 |
+
"content": "<unk>",
|
| 21 |
+
"lstrip": false,
|
| 22 |
+
"normalized": false,
|
| 23 |
+
"rstrip": false,
|
| 24 |
+
"single_word": false,
|
| 25 |
+
"special": true
|
| 26 |
+
}
|
| 27 |
+
},
|
| 28 |
+
"clean_up_tokenization_spaces": false,
|
| 29 |
+
"eos_token": "<|endoftext|>",
|
| 30 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 31 |
+
"pad_token": "<pad>",
|
| 32 |
+
"tokenizer_class": "PreTrainedTokenizerFast",
|
| 33 |
+
"unk_token": "<unk>"
|
| 34 |
+
}
|
pretrain/gpt2_gene_multi_v1/run_clm_pt.py
ADDED
|
@@ -0,0 +1,646 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python
|
| 2 |
+
# coding=utf-8
|
| 3 |
+
# Copyright 2020 The HuggingFace Inc. team. All rights reserved.
|
| 4 |
+
#
|
| 5 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 6 |
+
# you may not use this file except in compliance with the License.
|
| 7 |
+
# You may obtain a copy of the License at
|
| 8 |
+
#
|
| 9 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 10 |
+
#
|
| 11 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 12 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 13 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 14 |
+
# See the License for the specific language governing permissions and
|
| 15 |
+
# limitations under the License.
|
| 16 |
+
"""
|
| 17 |
+
Fine-tuning the library models for causal language modeling (GPT, GPT-2, CTRL, ...) on a text file or a dataset.
|
| 18 |
+
|
| 19 |
+
Here is the full list of checkpoints on the hub that can be fine-tuned by this script:
|
| 20 |
+
https://huggingface.co/models?filter=text-generation
|
| 21 |
+
"""
|
| 22 |
+
# You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments.
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
import subprocess
|
| 26 |
+
import os
|
| 27 |
+
|
| 28 |
+
# Source /etc/network_turbo in a bash subshell and copy every environment line
# that contains "proxy" into this process's environment. If sourcing fails the
# `&&` chain stops, stdout stays empty and the loop below does nothing.
result = subprocess.run('bash -c "source /etc/network_turbo && env | grep proxy"', shell=True, capture_output=True, text=True)
output = result.stdout
for line in output.splitlines():
    if '=' in line:
        # Split on the first '=' only so values that contain '=' stay intact.
        var, value = line.split('=', 1)
        os.environ[var] = value
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
import logging
|
| 38 |
+
import numpy as np
|
| 39 |
+
import math
|
| 40 |
+
import os
|
| 41 |
+
import sys
|
| 42 |
+
from dataclasses import dataclass, field
|
| 43 |
+
from itertools import chain
|
| 44 |
+
from typing import Optional, List, Dict, Any, Mapping
|
| 45 |
+
from pathlib import Path
|
| 46 |
+
import datasets
|
| 47 |
+
import torch
|
| 48 |
+
from datasets import load_dataset, concatenate_datasets
|
| 49 |
+
|
| 50 |
+
import transformers
|
| 51 |
+
from transformers import (
|
| 52 |
+
CONFIG_MAPPING,
|
| 53 |
+
MODEL_FOR_CAUSAL_LM_MAPPING,
|
| 54 |
+
AutoConfig,
|
| 55 |
+
AutoModelForCausalLM,
|
| 56 |
+
LlamaForCausalLM,
|
| 57 |
+
LlamaTokenizer,
|
| 58 |
+
AutoTokenizer,
|
| 59 |
+
HfArgumentParser,
|
| 60 |
+
Trainer,
|
| 61 |
+
TrainingArguments,
|
| 62 |
+
is_torch_tpu_available,
|
| 63 |
+
set_seed,
|
| 64 |
+
)
|
| 65 |
+
from transformers.testing_utils import CaptureLogger
|
| 66 |
+
from transformers.trainer_utils import get_last_checkpoint
|
| 67 |
+
from transformers.utils import send_example_telemetry
|
| 68 |
+
from transformers.utils.versions import require_version
|
| 69 |
+
|
| 70 |
+
from sklearn.metrics import accuracy_score
|
| 71 |
+
from peft import LoraConfig, TaskType, get_peft_model, PeftModel, get_peft_model_state_dict
|
| 72 |
+
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
class SavePeftModelCallback(transformers.TrainerCallback):
    """Trainer callback that stores the model and tokenizer in a "pt_lora_model" folder.

    On every checkpoint save the pair is written to
    ``<checkpoint dir>/pt_lora_model``; at the end of training it is written to
    ``<output_dir>/pt_lora_model``.
    """

    def save_model(self, args, state, kwargs):
        """Save ``kwargs["model"]`` and ``kwargs["tokenizer"]`` under the checkpoint folder.

        The folder is ``state.best_model_checkpoint`` when it is set, otherwise
        ``<args.output_dir>/<PREFIX_CHECKPOINT_DIR>-<state.global_step>``.
        """
        if state.best_model_checkpoint is not None:
            # "pt_lora_model" is appended exactly once below; appending it here
            # as well produced <ckpt>/pt_lora_model/pt_lora_model.
            checkpoint_folder = state.best_model_checkpoint
        else:
            checkpoint_folder = os.path.join(args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}")

        peft_model_path = os.path.join(checkpoint_folder, "pt_lora_model")
        kwargs["model"].save_pretrained(peft_model_path)
        kwargs["tokenizer"].save_pretrained(peft_model_path)

    def on_save(self, args, state, control, **kwargs):
        """Checkpoint-save hook: delegate to save_model and return `control` unchanged."""
        self.save_model(args, state, kwargs)
        return control

    def on_train_end(self, args, state, control, **kwargs):
        """End-of-training hook: write the final model/tokenizer to <output_dir>/pt_lora_model."""
        peft_model_path = os.path.join(args.output_dir, "pt_lora_model")
        kwargs["model"].save_pretrained(peft_model_path)
        kwargs["tokenizer"].save_pretrained(peft_model_path)
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def accuracy(predictions, references, normalize=True, sample_weight=None):
    """Wrap accuracy_score and return its result as {"accuracy": <float>}.

    `references` are passed as the true labels and `predictions` as the
    predicted labels; `normalize` and `sample_weight` are forwarded unchanged.
    """
    score = accuracy_score(
        references,
        predictions,
        normalize=normalize,
        sample_weight=sample_weight,
    )
    return {"accuracy": float(score)}
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def compute_metrics(eval_preds):
    """Compute next-token accuracy from an (predictions, labels) pair.

    The predictions already hold argmax token ids (see
    preprocess_logits_for_metrics) and have the same shape as the labels.
    """
    predicted_ids, target_ids = eval_preds
    # Shift by one position: the prediction at step t is compared with the
    # label at step t + 1, so drop the first label and the last prediction.
    shifted_targets = target_ids[:, 1:].reshape(-1)
    shifted_predictions = predicted_ids[:, :-1].reshape(-1)
    return accuracy(predictions=shifted_predictions, references=shifted_targets)
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
def preprocess_logits_for_metrics(logits, labels):
    """Reduce logits to argmax indices over the last dimension.

    `labels` is accepted for interface compatibility and is not used.
    """
    # Depending on the model and config the output may be a tuple carrying
    # extra tensors (e.g. past_key_values); the logits always come first.
    scores = logits[0] if isinstance(logits, tuple) else logits
    return scores.argmax(dim=-1)
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
def fault_tolerance_data_collator(features: List) -> Dict[str, Any]:
    """Collate a list of feature records into a dict of batched tensors.

    Records that are not mappings are converted with ``vars()``. A "label" or
    "label_ids" entry of the first record becomes ``batch["labels"]``; every
    other entry of the first record that is neither ``None`` nor a string is
    batched under its own key.

    If batching raises ``ValueError``, the first record is repeated
    ``len(features)`` times instead, so a batch of the expected size is still
    returned (the original "quick fix").
    """
    if not isinstance(features[0], Mapping):
        features = [vars(item) for item in features]
    head = features[0]
    collated = {}

    # Labels: choose the dtype from the first record so integer labels become
    # torch.long and anything else becomes torch.float.
    if "label" in head and head["label"] is not None:
        sample_label = head["label"]
        if isinstance(sample_label, torch.Tensor):
            sample_label = sample_label.item()
        label_dtype = torch.long if isinstance(sample_label, int) else torch.float
        collated["labels"] = torch.tensor([item["label"] for item in features], dtype=label_dtype)
    elif "label_ids" in head and head["label_ids"] is not None:
        sample_ids = head["label_ids"]
        if isinstance(sample_ids, torch.Tensor):
            collated["labels"] = torch.stack([item["label_ids"] for item in features])
        else:
            ids_dtype = torch.long if isinstance(sample_ids[0], int) else torch.float
            collated["labels"] = torch.tensor([item["label_ids"] for item in features], dtype=ids_dtype)

    def _to_tensor(sample, values):
        # Batch `values` according to the type of the first record's value.
        if isinstance(sample, torch.Tensor):
            return torch.stack(values)
        if isinstance(sample, np.ndarray):
            return torch.tensor(np.stack(values))
        return torch.tensor(values)

    def _collate(rows):
        # The first record decides which keys are batched.
        return {
            key: _to_tensor(value, [row[key] for row in rows])
            for key, value in head.items()
            if key not in ("label", "label_ids") and value is not None and not isinstance(value, str)
        }

    try:
        collated.update(_collate(features))
    except ValueError:  # quick fix: fall back to repeating the first example
        collated.update(_collate([head] * len(features)))

    return collated
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
MODEL_CONFIG_CLASSES = list(MODEL_FOR_CAUSAL_LM_MAPPING.keys())
|
| 167 |
+
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.

    Each field's "help" metadata is its user-facing description.
    """

    # Checkpoint to initialise weights from; None means train from scratch.
    model_name_or_path: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "The model checkpoint for weights initialization.Don't set if you want to train a model from scratch."
            )
        },
    )
    # NOTE(review): both `tokenizer_name_or_path` and `tokenizer_name` are
    # declared; which one the training code reads is not visible here.
    tokenizer_name_or_path: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "The tokenizer for weights initialization.Don't set if you want to train a model from scratch."
            )
        },
    )
    # The help text lists the model types collected in MODEL_TYPES.
    model_type: Optional[str] = field(
        default=None,
        metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)},
    )
    # Comma-separated key=value overrides; rejected in __post_init__ when a
    # config name or a model checkpoint is also given.
    config_overrides: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "Override some existing default config settings when a model is trained from scratch. Example: "
                "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index"
            )
        },
    )
    config_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
    )
    tokenizer_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
    )
    use_fast_tokenizer: bool = field(
        default=True,
        metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
    )
    model_revision: str = field(
        default="main",
        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
    )
    use_auth_token: bool = field(
        default=False,
        metadata={
            "help": (
                "Will use the token generated when running `huggingface-cli login` (necessary to use this script "
                "with private models)."
            )
        },
    )
    # Restricted to the values listed under "choices".
    torch_dtype: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "Override the default `torch.dtype` and load the model under this dtype. If `auto` is passed, the "
                "dtype will be automatically derived from the model's weights."
            ),
            "choices": ["auto", "bfloat16", "float16", "float32"],
        },
    )

    def __post_init__(self):
        # Raises ValueError when config_overrides is combined with an existing
        # config name or model checkpoint.
        if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None):
            raise ValueError(
                "--config_overrides can't be used in combination with --config_name or --model_name_or_path"
            )
|
| 248 |
+
|
| 249 |
+
|
| 250 |
+
@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.

    Each field's "help" metadata is its user-facing description.
    """

    # NOTE(review): the field is named `dataset_dir` while its help text speaks
    # of a dataset name -- confirm against the code that reads it.
    dataset_dir: Optional[str] = field(
        default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
    )
    dataset_config_name: Optional[str] = field(
        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
    )
    train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."})
    validation_file: Optional[str] = field(
        default=None,
        metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."},
    )
    max_train_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of training examples to this "
                "value if set."
            )
        },
    )
    max_eval_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
                "value if set."
            )
        },
    )
    # Enabling streaming triggers the datasets version check in __post_init__.
    streaming: bool = field(default=False, metadata={"help": "Enable streaming mode"})
    block_size: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "Optional input sequence length after tokenization. "
                "The training dataset will be truncated in block of this size for training. "
                "Default to the model max input length for single sentence inputs (take into account special tokens)."
            )
        },
    )
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
    )
    # NOTE(review): the default 0.05 looks like a fraction rather than a
    # percentage -- confirm how the value is consumed.
    validation_split_percentage: Optional[float] = field(
        default=0.05,
        metadata={
            "help": "The percentage of the train set used as validation set in case there's no validation split"
        },
    )
    preprocessing_num_workers: Optional[int] = field(
        default=None,
        metadata={"help": "The number of processes to use for the preprocessing."},
    )
    keep_linebreaks: bool = field(
        default=True, metadata={"help": "Whether to keep line breaks when using TXT files or not."}
    )
    # Defaults to the current working directory.
    data_cache_dir: Optional[str] = field(default="./", metadata={"help": "The datasets processed stored"})

    def __post_init__(self):
        # Streaming requires datasets>=2.0.0; require_version enforces that.
        if self.streaming:
            require_version("datasets>=2.0.0", "The streaming feature requires `datasets>=2.0.0`")
|
| 317 |
+
|
| 318 |
+
|
| 319 |
+
@dataclass
class MyTrainingArguments(TrainingArguments):
    # Extends TrainingArguments with extra options whose names refer to
    # LoRA/PEFT settings. NOTE(review): how each value is consumed is not
    # visible in this part of the file -- confirm against main().
    # presumably a comma-separated list of module names -- TODO confirm
    trainable : Optional[str] = field(default="q_proj,v_proj")
    lora_rank : Optional[int] = field(default=8)
    lora_dropout : Optional[float] = field(default=0.1)
    lora_alpha : Optional[float] = field(default=32.)
    modules_to_save : Optional[str] = field(default=None)
    debug_mode : Optional[bool] = field(default=False)
    peft_path : Optional[str] = field(default=None)
|
| 328 |
+
|
| 329 |
+
|
| 330 |
+
logger = logging.getLogger(__name__)
|
| 331 |
+
|
| 332 |
+
|
| 333 |
+
def main():
    """Run causal-LM pre-training (and optional evaluation) on a directory of text files.

    Steps, in order:
      1. Parse `ModelArguments`, `DataTrainingArguments` and `MyTrainingArguments` from the
         command line, or from a single `.json` file passed as the only argument.
      2. Configure logging and detect a resumable checkpoint in `output_dir`.
      3. Load the model config and the tokenizer.
      4. Tokenize every `*.txt` file in `data_args.dataset_dir`, pack the tokens into
         fixed-size blocks, cache the result on disk and split it into train/test.
      5. Build the model (from a checkpoint or from scratch), then train and/or evaluate
         with the Hugging Face `Trainer`.

    Raises:
        ValueError: if `output_dir` is non-empty without a checkpoint and
            `--overwrite_output_dir` is not set, if no tokenizer is specified, or if
            `dataset_dir` contains no `*.txt` files.
    """
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, MyTrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
    # information sent is the one passed as arguments along with your Python/PyTorch versions.
    send_example_telemetry("run_clm", model_args, data_args)

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,  # if training_args.local_rank in [-1, 0] else logging.WARN,
        handlers=[logging.StreamHandler(sys.stdout)],
    )

    if training_args.should_log:
        # The default of training_args.log_level is passive, so we set log level at info here to have that default.
        transformers.utils.logging.set_verbosity_info()

    log_level = training_args.get_process_log_level()
    logger.setLevel(log_level)
    datasets.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.enable_default_handler()
    transformers.utils.logging.enable_explicit_format()
    # transformers.tokenization_utils.logging.set_verbosity_warning()

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )

    # Detecting last checkpoint.
    last_checkpoint = None
    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome."
            )
        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Set seed before initializing model.
    set_seed(training_args.seed)

    config_kwargs = {
        "cache_dir": model_args.cache_dir,
        "revision": model_args.model_revision,
        "use_auth_token": True if model_args.use_auth_token else None,
    }
    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning("You are instantiating a new config instance from scratch.")
        if model_args.config_overrides is not None:
            logger.info(f"Overriding config: {model_args.config_overrides}")
            config.update_from_string(model_args.config_overrides)
            logger.info(f"New config: {config}")

    tokenizer_kwargs = {
        "cache_dir": model_args.cache_dir,
        "use_fast": model_args.use_fast_tokenizer,
        "revision": model_args.model_revision,
        "use_auth_token": True if model_args.use_auth_token else None,
    }
    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs)
    elif model_args.tokenizer_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name_or_path, **tokenizer_kwargs)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script. "
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    # Preprocessing the datasets.
    # First we tokenize all the texts.
    # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function
    tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base")

    def tokenize_function(examples):
        """Tokenize the `text` column of a batch, downgrading the 'sequence too long' warning."""
        with CaptureLogger(tok_logger) as cl:
            output = tokenizer(examples["text"])
        # clm input could be much much longer than block_size
        if "Token indices sequence length is longer than the" in cl.out:
            tok_logger.warning(
                "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits"
                " before being passed to the model."
            )
        return output

    if data_args.block_size is None:
        block_size = tokenizer.model_max_length
        if block_size > 1024:
            logger.warning(
                "The chosen tokenizer supports a `model_max_length` that is longer than the default `block_size` value"
                " of 1024. If you would like to use a longer `block_size` up to `tokenizer.model_max_length` you can"
                " override this default with `--block_size xxx`."
            )
            block_size = 1024
    else:
        if data_args.block_size > tokenizer.model_max_length:
            logger.warning(
                f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model"
                f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}."
            )
        block_size = min(data_args.block_size, tokenizer.model_max_length)

    # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
    def group_texts(examples):
        """Concatenate a batch of tokenized texts and cut it into `block_size`-long chunks."""
        # Concatenate all texts.
        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
        total_length = len(concatenated_examples[list(examples.keys())[0]])
        # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
        # customize this part to your needs.
        if total_length >= block_size:
            total_length = (total_length // block_size) * block_size
        # Split by chunks of max_len.
        result = {
            k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
            for k, t in concatenated_examples.items()
        }
        # Causal LM: the labels are the inputs themselves.
        result["labels"] = result["input_ids"].copy()
        return result

    with training_args.main_process_first(desc="dataset map tokenization and grouping"):
        lm_datasets = None
        path = Path(data_args.dataset_dir)
        # Sort so the concatenation order (and therefore the seeded split below) does not
        # depend on the order in which the filesystem happens to list the files.
        files = sorted(file.name for file in path.glob("*.txt"))
        if not files:
            raise ValueError(f"No .txt files found in dataset_dir ({data_args.dataset_dir}).")
        if training_args.debug_mode is True:
            files = [files[0]]
        for idx, file in enumerate(files):
            data_file = os.path.join(path, file)
            filename = ''.join(file.split(".")[:-1])
            # NOTE: the on-disk cache is keyed by file name only, so it is reused even if the
            # tokenizer or block_size changed; clear `data_cache_dir` after changing either.
            cache_path = os.path.join(data_args.data_cache_dir, filename)
            os.makedirs(cache_path, exist_ok=True)
            try:
                processed_dataset = datasets.load_from_disk(cache_path, keep_in_memory=False)
                logger.info(f'training datasets-{filename} has been loaded from disk')
            except Exception:
                # Best effort: any failure to load the cache means we (re)build it from the raw text.
                cache_dir = os.path.join(data_args.data_cache_dir, filename + "_text")
                os.makedirs(cache_dir, exist_ok=True)
                raw_dataset = load_dataset("text", data_files=data_file, cache_dir=cache_dir, keep_in_memory=False)
                logger.info(f"{file} has been loaded")
                tokenized_dataset = raw_dataset.map(
                    tokenize_function,
                    batched=True,
                    num_proc=data_args.preprocessing_num_workers,
                    remove_columns="text",
                    load_from_cache_file=True,
                    keep_in_memory=False,
                    cache_file_names={k: os.path.join(cache_dir, 'tokenized.arrow') for k in raw_dataset},
                    desc="Running tokenizer on dataset",
                )
                grouped_datasets = tokenized_dataset.map(
                    group_texts,
                    batched=True,
                    num_proc=data_args.preprocessing_num_workers,
                    load_from_cache_file=True,
                    keep_in_memory=False,
                    cache_file_names={k: os.path.join(cache_dir, 'grouped.arrow') for k in tokenized_dataset},
                    desc=f"Grouping texts in chunks of {block_size}",
                )
                processed_dataset = grouped_datasets
                processed_dataset.save_to_disk(cache_path)
            if idx == 0:
                lm_datasets = processed_dataset['train']
            else:
                assert lm_datasets.features.type == processed_dataset["train"].features.type
                lm_datasets = concatenate_datasets([lm_datasets, processed_dataset["train"]])

        # Seed the split explicitly so every process ends up with the same train/test partition.
        lm_datasets = lm_datasets.train_test_split(
            test_size=data_args.validation_split_percentage, seed=training_args.seed
        )

    if training_args.do_train:
        train_dataset = lm_datasets['train']
        if data_args.max_train_samples is not None:
            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
            train_dataset = train_dataset.select(range(max_train_samples))
        logger.info(f"Num train_samples {len(train_dataset)}")
        logger.info("training example:")
        logger.info(tokenizer.decode(train_dataset[0]['input_ids']))
    if training_args.do_eval:
        eval_dataset = lm_datasets["test"]
        if data_args.max_eval_samples is not None:
            max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
            eval_dataset = eval_dataset.select(range(max_eval_samples))
        logger.info(f"Num eval_samples {len(eval_dataset)}")
        logger.info("eval example:")
        logger.info(tokenizer.decode(eval_dataset[0]['input_ids']))

    if model_args.model_name_or_path:
        torch_dtype = (
            model_args.torch_dtype
            if model_args.torch_dtype in ["auto", None]
            else getattr(torch, model_args.torch_dtype)
        )
        # Use the Auto class so the architecture follows `config` (GPT-2, LLaMA, ...) instead
        # of being hard-wired to LLaMA; this matches the from-scratch branch below.
        model = AutoModelForCausalLM.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
            revision=model_args.model_revision,
            use_auth_token=True if model_args.use_auth_token else None,
            torch_dtype=torch_dtype,
            low_cpu_mem_usage=True,
        )
    else:
        model = AutoModelForCausalLM.from_config(config)
        # Key by data_ptr so parameters that share storage are only counted once.
        n_params = sum({p.data_ptr(): p.numel() for p in model.parameters()}.values())
        logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params")

    # Make the embedding matrix match the tokenizer's vocabulary size.
    model.resize_token_embeddings(len(tokenizer))

    # --- PEFT / LoRA setup, currently disabled (full-parameter training) ---
    # if training_args.peft_path is not None:
    #     logger.info("Peft from pre-trained model")
    #     model = PeftModel.from_pretrained(model, training_args.peft_path)
    # else:
    #     logger.info("Init new peft model")
    #     target_modules = training_args.trainable.split(',')
    #     modules_to_save = training_args.modules_to_save
    #     if modules_to_save is not None:
    #         modules_to_save = modules_to_save.split(',')
    #     lora_rank = training_args.lora_rank
    #     lora_dropout = training_args.lora_dropout
    #     lora_alpha = training_args.lora_alpha
    #     logger.info(f"target_modules: {target_modules}")
    #     logger.info(f"lora_rank: {lora_rank}")
    #     peft_config = LoraConfig(
    #         task_type=TaskType.CAUSAL_LM,
    #         target_modules=target_modules,
    #         inference_mode=False,
    #         r=lora_rank, lora_alpha=lora_alpha,
    #         lora_dropout=lora_dropout,
    #         modules_to_save=modules_to_save)
    #     model = get_peft_model(model, peft_config)
    # model.print_trainable_parameters()
    # old_state_dict = model.state_dict
    # model.state_dict = (
    #     lambda self, *_, **__: get_peft_model_state_dict(self, old_state_dict())
    # ).__get__(model, type(model))

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=fault_tolerance_data_collator,
        compute_metrics=compute_metrics if training_args.do_eval and not is_torch_tpu_available() else None,
        preprocess_logits_for_metrics=preprocess_logits_for_metrics
        if training_args.do_eval and not is_torch_tpu_available()
        else None,
    )
    # NOTE(review): PEFT wrapping above is disabled, yet the PEFT save callback is still
    # registered — confirm it behaves as intended for a plain (non-PEFT) model.
    trainer.add_callback(SavePeftModelCallback)

    # Training
    if training_args.do_train:
        checkpoint = None
        if training_args.resume_from_checkpoint is not None:
            checkpoint = training_args.resume_from_checkpoint
        elif last_checkpoint is not None:
            checkpoint = last_checkpoint
        train_result = trainer.train(resume_from_checkpoint=checkpoint)

        metrics = train_result.metrics

        max_train_samples = (
            data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
        )
        metrics["train_samples"] = min(max_train_samples, len(train_dataset))

        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        metrics = trainer.evaluate()

        max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
        metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
        try:
            perplexity = math.exp(metrics["eval_loss"])
        except OverflowError:
            # exp() overflows for very large losses; report that as infinite perplexity.
            perplexity = float("inf")
        metrics["perplexity"] = perplexity

        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)


if __name__ == "__main__":
    main()
|
pretrain/gpt2_gene_multi_v1/run_pt_gpt2.sh
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# Launch GPT-2 gene/English pre-training with torchrun + DeepSpeed ZeRO-2 on one node, 6 processes.
# `$RANDOM` below is a bash feature, hence the explicit bash shebang.

lr=2e-4
# lora_rank=8
# lora_alpha=32
# lora_trainable="q_proj,v_proj,k_proj,o_proj,gate_proj,down_proj,up_proj"
# modules_to_save="embed_tokens,lm_head"
# lora_dropout=0.05

dna_eng_tokenizer_path=./gpt2_gene_multi_tokenizer
dataset_dir=./train_data
data_cache=temp_data_cache_dir
# Sequences per optimizer step = 32 (per device) x 8 (accumulation) x 6 (processes) = 1536.
per_device_train_batch_size=32
per_device_eval_batch_size=32
gradient_accumulation_steps=8
output_dir=gpt2_gene_multi_v1

deepspeed_config_file=ds_zero2_no_offload.json

# Random by default (as before), but can be pinned with `SEED=1234 ./run_pt_gpt2.sh`.
# The value is printed so that a run can be reproduced later.
seed=${SEED:-$RANDOM}
echo "Using seed: ${seed}"

torchrun --nnodes 1 --nproc_per_node 6 run_clm_pt.py \
    --config_name gene_eng_gpt2_v1 \
    --deepspeed "${deepspeed_config_file}" \
    --tokenizer_name_or_path "${dna_eng_tokenizer_path}" \
    --dataset_dir "${dataset_dir}" \
    --data_cache_dir "${data_cache}" \
    --validation_split_percentage 0.001 \
    --per_device_train_batch_size "${per_device_train_batch_size}" \
    --per_device_eval_batch_size "${per_device_eval_batch_size}" \
    --do_train \
    --seed "${seed}" \
    --fp16 \
    --num_train_epochs 1 \
    --lr_scheduler_type cosine \
    --learning_rate "${lr}" \
    --warmup_ratio 0.05 \
    --weight_decay 0.01 \
    --logging_strategy steps \
    --logging_steps 10 \
    --save_strategy steps \
    --save_total_limit 3 \
    --save_steps 1000 \
    --gradient_accumulation_steps "${gradient_accumulation_steps}" \
    --preprocessing_num_workers 128 \
    --block_size 256 \
    --output_dir "${output_dir}" \
    --overwrite_output_dir \
    --ddp_timeout 30000 \
    --logging_first_step True \
    --torch_dtype float16 \
    --gradient_checkpointing \
    --ddp_find_unused_parameters False
|
pretrain/gpt2_gene_multi_v2/ds_zero2_no_offload.json
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"fp16": {
|
| 3 |
+
"enabled": "auto",
|
| 4 |
+
"loss_scale": 0,
|
| 5 |
+
"loss_scale_window": 100,
|
| 6 |
+
"initial_scale_power": 16,
|
| 7 |
+
"hysteresis": 2,
|
| 8 |
+
"min_loss_scale": 1e-10
|
| 9 |
+
},
|
| 10 |
+
|
| 11 |
+
"zero_optimization": {
|
| 12 |
+
"stage": 2,
|
| 13 |
+
"allgather_partitions": true,
|
| 14 |
+
"allgather_bucket_size": 1e8,
|
| 15 |
+
"overlap_comm": true,
|
| 16 |
+
"reduce_scatter": true,
|
| 17 |
+
"reduce_bucket_size": 1e8,
|
| 18 |
+
"contiguous_gradients": true
|
| 19 |
+
},
|
| 20 |
+
|
| 21 |
+
"gradient_accumulation_steps": "auto",
|
| 22 |
+
"gradient_clipping": "auto",
|
| 23 |
+
"steps_per_print": 2000,
|
| 24 |
+
"train_batch_size": "auto",
|
| 25 |
+
"train_micro_batch_size_per_gpu": "auto",
|
| 26 |
+
"wall_clock_breakdown": false
|
| 27 |
+
}
|
pretrain/gpt2_gene_multi_v2/gpt2_gene_multi_tokenizer/.ipynb_checkpoints/special_tokens_map-checkpoint.json
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"eos_token": "<|endoftext|>",
|
| 3 |
+
"pad_token": "<pad>",
|
| 4 |
+
"unk_token": "<unk>"
|
| 5 |
+
}
|
pretrain/gpt2_gene_multi_v2/gpt2_gene_multi_tokenizer/.ipynb_checkpoints/tokenizer_config-checkpoint.json
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"added_tokens_decoder": {
|
| 3 |
+
"0": {
|
| 4 |
+
"content": "<|endoftext|>",
|
| 5 |
+
"lstrip": false,
|
| 6 |
+
"normalized": false,
|
| 7 |
+
"rstrip": false,
|
| 8 |
+
"single_word": false,
|
| 9 |
+
"special": true
|
| 10 |
+
},
|
| 11 |
+
"1": {
|
| 12 |
+
"content": "<pad>",
|
| 13 |
+
"lstrip": false,
|
| 14 |
+
"normalized": false,
|
| 15 |
+
"rstrip": false,
|
| 16 |
+
"single_word": false,
|
| 17 |
+
"special": true
|
| 18 |
+
},
|
| 19 |
+
"2": {
|
| 20 |
+
"content": "<unk>",
|
| 21 |
+
"lstrip": false,
|
| 22 |
+
"normalized": false,
|
| 23 |
+
"rstrip": false,
|
| 24 |
+
"single_word": false,
|
| 25 |
+
"special": true
|
| 26 |
+
}
|
| 27 |
+
},
|
| 28 |
+
"clean_up_tokenization_spaces": false,
|
| 29 |
+
"eos_token": "<|endoftext|>",
|
| 30 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 31 |
+
"pad_token": "<pad>",
|
| 32 |
+
"tokenizer_class": "PreTrainedTokenizerFast",
|
| 33 |
+
"unk_token": "<unk>"
|
| 34 |
+
}
|
pretrain/gpt2_gene_multi_v2/gpt2_gene_multi_tokenizer/special_tokens_map.json
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"eos_token": "<|endoftext|>",
|
| 3 |
+
"pad_token": "<pad>",
|
| 4 |
+
"unk_token": "<unk>"
|
| 5 |
+
}
|
pretrain/gpt2_gene_multi_v2/gpt2_gene_multi_tokenizer/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
pretrain/gpt2_gene_multi_v2/gpt2_gene_multi_tokenizer/tokenizer_config.json
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"added_tokens_decoder": {
|
| 3 |
+
"0": {
|
| 4 |
+
"content": "<|endoftext|>",
|
| 5 |
+
"lstrip": false,
|
| 6 |
+
"normalized": false,
|
| 7 |
+
"rstrip": false,
|
| 8 |
+
"single_word": false,
|
| 9 |
+
"special": true
|
| 10 |
+
},
|
| 11 |
+
"1": {
|
| 12 |
+
"content": "<pad>",
|
| 13 |
+
"lstrip": false,
|
| 14 |
+
"normalized": false,
|
| 15 |
+
"rstrip": false,
|
| 16 |
+
"single_word": false,
|
| 17 |
+
"special": true
|
| 18 |
+
},
|
| 19 |
+
"2": {
|
| 20 |
+
"content": "<unk>",
|
| 21 |
+
"lstrip": false,
|
| 22 |
+
"normalized": false,
|
| 23 |
+
"rstrip": false,
|
| 24 |
+
"single_word": false,
|
| 25 |
+
"special": true
|
| 26 |
+
}
|
| 27 |
+
},
|
| 28 |
+
"clean_up_tokenization_spaces": false,
|
| 29 |
+
"eos_token": "<|endoftext|>",
|
| 30 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 31 |
+
"pad_token": "<pad>",
|
| 32 |
+
"tokenizer_class": "PreTrainedTokenizerFast",
|
| 33 |
+
"unk_token": "<unk>"
|
| 34 |
+
}
|
pretrain/gpt2_gene_multi_v2/run_clm_formal.py
ADDED
|
@@ -0,0 +1,657 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python
|
| 2 |
+
# coding=utf-8
|
| 3 |
+
# Copyright 2020 The HuggingFace Inc. team. All rights reserved.
|
| 4 |
+
#
|
| 5 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 6 |
+
# you may not use this file except in compliance with the License.
|
| 7 |
+
# You may obtain a copy of the License at
|
| 8 |
+
#
|
| 9 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 10 |
+
#
|
| 11 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 12 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 13 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 14 |
+
# See the License for the specific language governing permissions and
|
| 15 |
+
# limitations under the License.
|
| 16 |
+
"""
|
| 17 |
+
Fine-tuning the library models for causal language modeling (GPT, GPT-2, CTRL, ...) on a text file or a dataset.
|
| 18 |
+
|
| 19 |
+
Here is the full list of checkpoints on the hub that can be fine-tuned by this script:
|
| 20 |
+
https://huggingface.co/models?filter=text-generation
|
| 21 |
+
"""
|
| 22 |
+
# You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments.
|
| 23 |
+
|
| 24 |
+
import logging
|
| 25 |
+
import math
|
| 26 |
+
import os
|
| 27 |
+
import sys
|
| 28 |
+
from dataclasses import dataclass, field
|
| 29 |
+
from itertools import chain
|
| 30 |
+
from typing import Optional
|
| 31 |
+
|
| 32 |
+
import datasets
|
| 33 |
+
import evaluate
|
| 34 |
+
import torch
|
| 35 |
+
from datasets import load_dataset
|
| 36 |
+
|
| 37 |
+
import transformers
|
| 38 |
+
from transformers import (
|
| 39 |
+
CONFIG_MAPPING,
|
| 40 |
+
MODEL_FOR_CAUSAL_LM_MAPPING,
|
| 41 |
+
AutoConfig,
|
| 42 |
+
AutoModelForCausalLM,
|
| 43 |
+
AutoTokenizer,
|
| 44 |
+
HfArgumentParser,
|
| 45 |
+
Trainer,
|
| 46 |
+
TrainingArguments,
|
| 47 |
+
default_data_collator,
|
| 48 |
+
is_torch_xla_available,
|
| 49 |
+
set_seed,
|
| 50 |
+
)
|
| 51 |
+
from transformers.testing_utils import CaptureLogger
|
| 52 |
+
from transformers.trainer_utils import get_last_checkpoint
|
| 53 |
+
from transformers.utils import check_min_version, send_example_telemetry
|
| 54 |
+
from transformers.utils.versions import require_version
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
| 58 |
+
#check_min_version("4.49.0.dev0")
|
| 59 |
+
|
| 60 |
+
require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
|
| 61 |
+
|
| 62 |
+
logger = logging.getLogger(__name__)
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
MODEL_CONFIG_CLASSES = list(MODEL_FOR_CAUSAL_LM_MAPPING.keys())
|
| 66 |
+
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.

    `--config_overrides` is only valid when training from scratch: combining it with
    `--config_name` or `--model_name_or_path` raises `ValueError` in `__post_init__`.
    """

    model_name_or_path: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "The model checkpoint for weights initialization. Don't set if you want to train a model from scratch."
            )
        },
    )
    model_type: Optional[str] = field(
        default=None,
        metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)},
    )
    config_overrides: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "Override some existing default config settings when a model is trained from scratch. Example: "
                "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index"
            )
        },
    )
    config_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
    )
    tokenizer_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
    )
    use_fast_tokenizer: bool = field(
        default=True,
        metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
    )
    model_revision: str = field(
        default="main",
        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
    )
    # Optional because the default is None (no explicit token supplied).
    token: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token "
                "generated when running `huggingface-cli login` (stored in `~/.huggingface`)."
            )
        },
    )
    trust_remote_code: bool = field(
        default=False,
        metadata={
            "help": (
                "Whether to trust the execution of code from datasets/models defined on the Hub."
                " This option should only be set to `True` for repositories you trust and in which you have read the"
                " code, as it will execute code present on the Hub on your local machine."
            )
        },
    )
    torch_dtype: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "Override the default `torch.dtype` and load the model under this dtype. If `auto` is passed, the "
                "dtype will be automatically derived from the model's weights."
            ),
            "choices": ["auto", "bfloat16", "float16", "float32"],
        },
    )
    low_cpu_mem_usage: bool = field(
        default=False,
        metadata={
            "help": (
                "It is an option to create the model as an empty shell, then only materialize its parameters when the pretrained weights are loaded. "
                "set True will benefit LLM loading time and RAM consumption."
            )
        },
    )

    def __post_init__(self):
        """Reject `config_overrides` when an existing config or checkpoint is also given."""
        if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None):
            raise ValueError(
                "--config_overrides can't be used in combination with --config_name or --model_name_or_path"
            )
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.

    A data source is mandatory: either `dataset_name` or at least one of
    `train_file` / `validation_file` must be given. Local files must end in
    `.csv`, `.json` or `.txt`; this is checked in `__post_init__`.
    """

    dataset_name: Optional[str] = field(
        default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
    )
    dataset_config_name: Optional[str] = field(
        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
    )
    train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."})
    validation_file: Optional[str] = field(
        default=None,
        metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."},
    )
    max_train_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of training examples to this "
                "value if set."
            )
        },
    )
    max_eval_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
                "value if set."
            )
        },
    )
    streaming: bool = field(default=False, metadata={"help": "Enable streaming mode"})
    block_size: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "Optional input sequence length after tokenization. "
                "The training dataset will be truncated in block of this size for training. "
                "Default to the model max input length for single sentence inputs (take into account special tokens)."
            )
        },
    )
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
    )
    validation_split_percentage: Optional[int] = field(
        default=5,
        metadata={
            "help": "The percentage of the train set used as validation set in case there's no validation split"
        },
    )
    preprocessing_num_workers: Optional[int] = field(
        default=None,
        metadata={"help": "The number of processes to use for the preprocessing."},
    )
    keep_linebreaks: bool = field(
        default=True, metadata={"help": "Whether to keep line breaks when using TXT files or not."}
    )

    def __post_init__(self):
        """Validate the argument combination right after the dataclass is built.

        Raises:
            ValueError: if no data source is given at all, or if `train_file` /
                `validation_file` does not end in `csv`, `json` or `txt`.
        """
        if self.streaming:
            require_version("datasets>=2.0.0", "The streaming feature requires `datasets>=2.0.0`")

        if self.dataset_name is None and self.train_file is None and self.validation_file is None:
            raise ValueError("Need either a dataset name or a training/validation file.")

        # Explicit raise instead of `assert`: assertions are removed under `python -O`,
        # which would let an unsupported extension slip through unvalidated.
        for arg_name in ("train_file", "validation_file"):
            path = getattr(self, arg_name)
            if path is None:
                continue
            extension = path.split(".")[-1]
            if extension not in ("csv", "json", "txt"):
                raise ValueError(f"`{arg_name}` should be a csv, a json or a txt file.")
|
| 236 |
+
|
| 237 |
+
|
| 238 |
+
def main():
    """Run causal-LM training and/or evaluation end to end.

    Steps, in order: parse `ModelArguments` / `DataTrainingArguments` /
    `TrainingArguments` (from the command line or a single `.json` file),
    configure logging, detect a resumable checkpoint, load the raw dataset,
    build config / tokenizer / model, tokenize and regroup the text into
    fixed-size blocks, then train and/or evaluate with `Trainer` and finally
    push to the Hub or write a model card.

    Raises:
        ValueError: if the output directory is non-empty without a checkpoint and
            `--overwrite_output_dir` is not set, if no tokenizer source is given,
            or if `--do_train` / `--do_eval` is requested without the matching split.
    """
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
    # information sent is the one passed as arguments along with your Python/PyTorch versions.
    send_example_telemetry("run_clm", model_args, data_args)

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )

    if training_args.should_log:
        # The default of training_args.log_level is passive, so we set log level at info here to have that default.
        transformers.utils.logging.set_verbosity_info()

    log_level = training_args.get_process_log_level()
    logger.setLevel(log_level)
    datasets.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.enable_default_handler()
    transformers.utils.logging.enable_explicit_format()

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}"
    )
    logger.info(f"Training/evaluation parameters {training_args}")

    # Detecting last checkpoint.
    last_checkpoint = None
    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome."
            )
        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        raw_datasets = load_dataset(
            data_args.dataset_name,
            data_args.dataset_config_name,
            cache_dir=model_args.cache_dir,
            token=model_args.token,
            streaming=data_args.streaming,
            trust_remote_code=model_args.trust_remote_code,
        )
        if "validation" not in raw_datasets.keys():
            # No validation split shipped with the dataset: carve the first N% of train out for it.
            raw_datasets["validation"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[:{data_args.validation_split_percentage}%]",
                cache_dir=model_args.cache_dir,
                token=model_args.token,
                streaming=data_args.streaming,
                trust_remote_code=model_args.trust_remote_code,
            )
            raw_datasets["train"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[{data_args.validation_split_percentage}%:]",
                cache_dir=model_args.cache_dir,
                token=model_args.token,
                streaming=data_args.streaming,
                trust_remote_code=model_args.trust_remote_code,
            )
    else:
        data_files = {}
        dataset_args = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        # The file extension doubles as the name of the `datasets` loader to use.
        extension = (
            data_args.train_file.split(".")[-1]
            if data_args.train_file is not None
            else data_args.validation_file.split(".")[-1]
        )
        if extension == "txt":
            extension = "text"
            dataset_args["keep_linebreaks"] = data_args.keep_linebreaks
        raw_datasets = load_dataset(
            extension,
            data_files=data_files,
            cache_dir=model_args.cache_dir,
            token=model_args.token,
            **dataset_args,
        )
        # If no validation data is there, validation_split_percentage will be used to divide the dataset.
        if "validation" not in raw_datasets.keys():
            raw_datasets["validation"] = load_dataset(
                extension,
                data_files=data_files,
                split=f"train[:{data_args.validation_split_percentage}%]",
                cache_dir=model_args.cache_dir,
                token=model_args.token,
                **dataset_args,
            )
            raw_datasets["train"] = load_dataset(
                extension,
                data_files=data_files,
                split=f"train[{data_args.validation_split_percentage}%:]",
                cache_dir=model_args.cache_dir,
                token=model_args.token,
                **dataset_args,
            )

    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    config_kwargs = {
        "cache_dir": model_args.cache_dir,
        "revision": model_args.model_revision,
        "token": model_args.token,
        "trust_remote_code": model_args.trust_remote_code,
    }
    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning("You are instantiating a new config instance from scratch.")
        if model_args.config_overrides is not None:
            logger.info(f"Overriding config: {model_args.config_overrides}")
            config.update_from_string(model_args.config_overrides)
            logger.info(f"New config: {config}")

    tokenizer_kwargs = {
        "cache_dir": model_args.cache_dir,
        "use_fast": model_args.use_fast_tokenizer,
        "revision": model_args.model_revision,
        "token": model_args.token,
        "trust_remote_code": model_args.trust_remote_code,
    }
    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs)
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script. "
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    if model_args.model_name_or_path:
        # "auto" and None are passed through untouched; any other value names a torch dtype attribute.
        torch_dtype = (
            model_args.torch_dtype
            if model_args.torch_dtype in ["auto", None]
            else getattr(torch, model_args.torch_dtype)
        )
        model = AutoModelForCausalLM.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
            revision=model_args.model_revision,
            token=model_args.token,
            trust_remote_code=model_args.trust_remote_code,
            torch_dtype=torch_dtype,
            low_cpu_mem_usage=model_args.low_cpu_mem_usage,
        )
    else:
        # No checkpoint given: build a freshly initialised model from the config alone.
        model = AutoModelForCausalLM.from_config(config, trust_remote_code=model_args.trust_remote_code)
        # Keyed by data_ptr so parameters sharing storage are only counted once.
        n_params = sum({p.data_ptr(): p.numel() for p in model.parameters()}.values())
        logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params")

    # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch
    # on a small vocab and want a smaller embedding size, remove this test.
    embedding_size = model.get_input_embeddings().weight.shape[0]
    if len(tokenizer) > embedding_size:
        model.resize_token_embeddings(len(tokenizer))

    # Preprocessing the datasets.
    # First we tokenize all the texts.
    if training_args.do_train:
        column_names = list(raw_datasets["train"].features)
    else:
        column_names = list(raw_datasets["validation"].features)
    text_column_name = "text" if "text" in column_names else column_names[0]

    # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function
    tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base")

    def tokenize_function(examples):
        with CaptureLogger(tok_logger) as cl:
            output = tokenizer(examples[text_column_name])
        # clm input could be much much longer than block_size
        if "Token indices sequence length is longer than the" in cl.out:
            tok_logger.warning(
                "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits"
                " before being passed to the model."
            )
        return output

    with training_args.main_process_first(desc="dataset map tokenization"):
        if not data_args.streaming:
            tokenized_datasets = raw_datasets.map(
                tokenize_function,
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                remove_columns=column_names,
                load_from_cache_file=not data_args.overwrite_cache,
                desc="Running tokenizer on dataset",
            )
        else:
            tokenized_datasets = raw_datasets.map(
                tokenize_function,
                batched=True,
                remove_columns=column_names,
            )

    if hasattr(config, "max_position_embeddings"):
        max_pos_embeddings = config.max_position_embeddings
    else:
        # Define a default value if the attribute is missing in the config.
        max_pos_embeddings = 1024

    if data_args.block_size is None:
        block_size = tokenizer.model_max_length
        if block_size > max_pos_embeddings:
            # Resolve the fallback first so the warning reports the value that is actually used
            # (a non-positive max_pos_embeddings falls back to 1024, not min(1024, max_pos_embeddings)).
            if max_pos_embeddings > 0:
                block_size = min(1024, max_pos_embeddings)
            else:
                block_size = 1024
            logger.warning(
                f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
                f"Using block_size={block_size} instead. You can change that default value by passing --block_size xxx."
            )
    else:
        if data_args.block_size > tokenizer.model_max_length:
            logger.warning(
                f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model "
                f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}."
            )
        block_size = min(data_args.block_size, tokenizer.model_max_length)

    # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
    def group_texts(examples):
        # Concatenate all texts.
        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
        total_length = len(concatenated_examples[list(examples.keys())[0]])
        # We drop the small remainder, and if the total_length < block_size we exclude this batch and return an empty dict.
        # We could add padding if the model supported it instead of this drop, you can customize this part to your needs.
        total_length = (total_length // block_size) * block_size
        # Split by chunks of max_len.
        result = {
            k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
            for k, t in concatenated_examples.items()
        }
        result["labels"] = result["input_ids"].copy()
        return result

    # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder
    # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower
    # to preprocess.
    #
    # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
    # https://huggingface.co/docs/datasets/process#map

    with training_args.main_process_first(desc="grouping texts together"):
        if not data_args.streaming:
            lm_datasets = tokenized_datasets.map(
                group_texts,
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                load_from_cache_file=not data_args.overwrite_cache,
                desc=f"Grouping texts in chunks of {block_size}",
            )
        else:
            lm_datasets = tokenized_datasets.map(
                group_texts,
                batched=True,
            )

    if training_args.do_train:
        if "train" not in tokenized_datasets:
            raise ValueError("--do_train requires a train dataset")
        train_dataset = lm_datasets["train"]
        if data_args.max_train_samples is not None:
            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
            train_dataset = train_dataset.select(range(max_train_samples))

    if training_args.do_eval:
        if "validation" not in tokenized_datasets:
            raise ValueError("--do_eval requires a validation dataset")
        eval_dataset = lm_datasets["validation"]
        if data_args.max_eval_samples is not None:
            max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
            eval_dataset = eval_dataset.select(range(max_eval_samples))

        def preprocess_logits_for_metrics(logits, labels):
            if isinstance(logits, tuple):
                # Depending on the model and config, logits may contain extra tensors,
                # like past_key_values, but logits always come first
                logits = logits[0]
            return logits.argmax(dim=-1)

        metric = evaluate.load("accuracy", cache_dir=model_args.cache_dir)

        def compute_metrics(eval_preds):
            preds, labels = eval_preds
            # preds have the same shape as the labels, after the argmax(-1) has been calculated
            # by preprocess_logits_for_metrics but we need to shift the labels
            labels = labels[:, 1:].reshape(-1)
            preds = preds[:, :-1].reshape(-1)
            return metric.compute(predictions=preds, references=labels)

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        tokenizer=tokenizer,
        # Data collator will default to DataCollatorWithPadding, so we change it.
        data_collator=default_data_collator,
        compute_metrics=compute_metrics if training_args.do_eval and not is_torch_xla_available() else None,
        preprocess_logits_for_metrics=preprocess_logits_for_metrics
        if training_args.do_eval and not is_torch_xla_available()
        else None,
    )

    # Training
    if training_args.do_train:
        # An explicit --resume_from_checkpoint wins over an auto-detected checkpoint.
        checkpoint = None
        if training_args.resume_from_checkpoint is not None:
            checkpoint = training_args.resume_from_checkpoint
        elif last_checkpoint is not None:
            checkpoint = last_checkpoint
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model()  # Saves the tokenizer too for easy upload

        metrics = train_result.metrics

        max_train_samples = (
            data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
        )
        metrics["train_samples"] = min(max_train_samples, len(train_dataset))

        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        metrics = trainer.evaluate()

        max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
        metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
        try:
            perplexity = math.exp(metrics["eval_loss"])
        except OverflowError:
            perplexity = float("inf")
        metrics["perplexity"] = perplexity

        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

    kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-generation"}
    if data_args.dataset_name is not None:
        kwargs["dataset_tags"] = data_args.dataset_name
        if data_args.dataset_config_name is not None:
            kwargs["dataset_args"] = data_args.dataset_config_name
            kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
        else:
            kwargs["dataset"] = data_args.dataset_name

    if training_args.push_to_hub:
        trainer.push_to_hub(**kwargs)
    else:
        trainer.create_model_card(**kwargs)
|
| 649 |
+
|
| 650 |
+
|
| 651 |
+
def _mp_fn(index):
    """Per-process entry point for xla_spawn (TPUs).

    `index` is accepted to satisfy the spawner's call signature and is not used;
    each spawned process simply runs `main()`.
    """
    main()
|
| 654 |
+
|
| 655 |
+
|
| 656 |
+
if __name__ == "__main__":
    # Executed as a script (not imported): run the full pipeline.
    main()
|
pretrain/gpt2_gene_multi_v2/run_pt_gpt2_formal.sh
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# Pre-training launcher: starts run_clm_formal.py under torchrun on a single
# node with 6 processes, using the DeepSpeed config referenced below.
#
# Sequences per optimizer step =
#   per_device_train_batch_size (32) * gradient_accumulation_steps (8) * processes (6) = 1536

# --- Hyper-parameters and paths -------------------------------------------
lr=2e-4
dna_eng_tokenizer_path=./gpt2_gene_multi_tokenizer
train_file=train_data/gene_eng_zh_de_es.txt
data_cache=temp_data_cache_dir
per_device_train_batch_size=32
per_device_eval_batch_size=32
gradient_accumulation_steps=8
output_dir=gene_eng_zh_de_es

deepspeed_config_file=ds_zero2_no_offload.json

# $RANDOM is a bash feature; under a plain POSIX sh it expands to nothing, which
# would make --seed swallow the next flag. Fall back to a fixed seed in that case.
seed="${RANDOM:-42}"

# --- Launch ----------------------------------------------------------------
# All expansions are quoted so that paths containing spaces are passed as one argument.
torchrun --nnodes 1 --nproc_per_node 6 run_clm_formal.py \
    --config_name gene_eng_gpt2_v1 \
    --deepspeed "${deepspeed_config_file}" \
    --tokenizer_name "${dna_eng_tokenizer_path}" \
    --train_file "${train_file}" \
    --cache_dir "${data_cache}" \
    --per_device_train_batch_size "${per_device_train_batch_size}" \
    --per_device_eval_batch_size "${per_device_eval_batch_size}" \
    --do_train \
    --validation_split_percentage 1 \
    --seed "${seed}" \
    --fp16 \
    --num_train_epochs 1 \
    --lr_scheduler_type cosine \
    --learning_rate "${lr}" \
    --warmup_ratio 0.05 \
    --weight_decay 0.01 \
    --logging_strategy steps \
    --logging_steps 10 \
    --save_strategy steps \
    --save_total_limit 3 \
    --save_steps 1000 \
    --gradient_accumulation_steps "${gradient_accumulation_steps}" \
    --preprocessing_num_workers 128 \
    --block_size 256 \
    --output_dir "${output_dir}" \
    --overwrite_output_dir \
    --ddp_timeout 30000 \
    --logging_first_step True \
    --torch_dtype float16 \
    --gradient_checkpointing \
    --ddp_find_unused_parameters False
|