magilogi
committed on
Commit
·
be36629
1
Parent(s):
49b4a14
add api-results
Browse files
data/api-results/__pycache__/api_results.cpython-311.pyc
ADDED
|
Binary file (647 Bytes). View file
|
|
|
data/api-results/api_results.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Corrected and cleaned data
|
| 2 |
+
gpt4 = {
|
| 3 |
+
'b4bqa': 0.94921875,
|
| 4 |
+
'medqa_og': 0.9232804232804233,
|
| 5 |
+
'medqa_g2b': 0.8994708994708994,
|
| 6 |
+
'medmcqa_og': 0.9166666666666666,
|
| 7 |
+
'medmcqa_g2b': 0.8879310344827587
|
| 8 |
+
}
|
| 9 |
+
|
| 10 |
+
gpt4o = {
|
| 11 |
+
'b4bqa': 0.96484375,
|
| 12 |
+
'medqa_og': 0.9021164021164021,
|
| 13 |
+
'medqa_g2b': 0.8835978835978836,
|
| 14 |
+
'medmcqa_og': 0.9051724137931034,
|
| 15 |
+
'medmcqa_g2b': 0.8649425287356322
|
| 16 |
+
}
|
| 17 |
+
|
| 18 |
+
gpt35turbo = {
|
| 19 |
+
'b4bqa': 0.9174107142857143,
|
| 20 |
+
'medmcqa_og': 0.9827586206896551,
|
| 21 |
+
'medmcqa_g2b': 0.9770114942528736,
|
| 22 |
+
'medqa_og': 0.9629629629629629,
|
| 23 |
+
'medqa_g2b': 0.9603174603174603
|
| 24 |
+
}
|
| 25 |
+
|
| 26 |
+
claude_opus = {
|
| 27 |
+
'b4bqa': 0.921875,
|
| 28 |
+
'medqa_og': 0.8571428571428571,
|
| 29 |
+
'medqa_g2b': 0.8333333333333334,
|
| 30 |
+
'medmcqa_og': 0.8649425287356322,
|
| 31 |
+
'medmcqa_g2b': 0.7988505747126436
|
| 32 |
+
}
|
data/csv/models_data.csv
CHANGED
|
@@ -12,9 +12,13 @@ T,Model,b4bqa,b4b,medmcqa_g2b,medmcqa_orig_filtered,medmcqa_diff,medqa_4options_
|
|
| 12 |
π’,"<a target=""_blank"" href=""https://huggingface.co/microsoft/phi-1"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">microsoft-phi-1</a>",19.64,21.18,24.14,25.86,-1.72,21.69,20.9,0.79
|
| 13 |
π’,"<a target=""_blank"" href=""https://huggingface.co/microsoft/phi-2"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">microsoft-phi-2</a>",47.49,44.79,37.64,42.24,-4.6,41.8,43.92,-2.12
|
| 14 |
π¬,"<a target=""_blank"" href=""https://huggingface.co/microsoft/Phi-3-medium-4k-instruct"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">microsoft-Phi-3-medium-4k-instruct</a>",69.98,65.94,60.34,72.41,-12.07,53.44,58.47,-5.03
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
πΆ,"<a target=""_blank"" href=""https://huggingface.co/ProbeMedicalYonseiMAILab/medllama3-v20"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ProbeMedicalYonseiMAILab-medllama3-v20</a>",71.93,74.75,65.23,80.17,-14.94,76.46,90.21,-13.75
|
| 19 |
π’,"<a target=""_blank"" href=""https://huggingface.co/Qwen/Qwen2-72B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Qwen-Qwen2-72B</a>",91.02,83.72,71.55,77.87,-6.32,74.07,75.4,-1.33
|
| 20 |
π’,"<a target=""_blank"" href=""https://huggingface.co/Qwen/Qwen2-7B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Qwen-Qwen2-7B</a>",80.41,70.28,55.17,63.51,-8.34,53.7,58.99,-5.29
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
π’,"<a target=""_blank"" href=""https://huggingface.co/microsoft/phi-1"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">microsoft-phi-1</a>",19.64,21.18,24.14,25.86,-1.72,21.69,20.9,0.79
|
| 13 |
π’,"<a target=""_blank"" href=""https://huggingface.co/microsoft/phi-2"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">microsoft-phi-2</a>",47.49,44.79,37.64,42.24,-4.6,41.8,43.92,-2.12
|
| 14 |
π¬,"<a target=""_blank"" href=""https://huggingface.co/microsoft/Phi-3-medium-4k-instruct"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">microsoft-Phi-3-medium-4k-instruct</a>",69.98,65.94,60.34,72.41,-12.07,53.44,58.47,-5.03
|
| 15 |
+
π’,"<a target=""_blank"" href=""https://huggingface.co/mistralai/Mistral-7B-v0.3"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">mistralai-Mistral-7B-v0.3</a>",70.31,61.99,48.28,56.9,-8.62,48.68,53.17,-4.49
|
| 16 |
+
π’,"<a target=""_blank"" href=""https://huggingface.co/mistralai/Mixtral-8x22B-v0.1"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">mistralai-Mixtral-8x22B-v0.1</a>",87.72,78.82,61.78,70.4,-8.62,67.46,71.43,-3.97
|
| 17 |
+
π’,"<a target=""_blank"" href=""https://huggingface.co/mistralai/Mixtral-8x7B-v0.1"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">mistralai-Mixtral-8x7B-v0.1</a>",86.1,74.75,55.46,64.94,-9.48,60.05,62.43,-2.38
|
| 18 |
πΆ,"<a target=""_blank"" href=""https://huggingface.co/ProbeMedicalYonseiMAILab/medllama3-v20"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ProbeMedicalYonseiMAILab-medllama3-v20</a>",71.93,74.75,65.23,80.17,-14.94,76.46,90.21,-13.75
|
| 19 |
π’,"<a target=""_blank"" href=""https://huggingface.co/Qwen/Qwen2-72B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Qwen-Qwen2-72B</a>",91.02,83.72,71.55,77.87,-6.32,74.07,75.4,-1.33
|
| 20 |
π’,"<a target=""_blank"" href=""https://huggingface.co/Qwen/Qwen2-7B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Qwen-Qwen2-7B</a>",80.41,70.28,55.17,63.51,-8.34,53.7,58.99,-5.29
|
| 21 |
+
π¬,"<a target=""_blank"" href=""https://huggingface.co/Qwen/Qwen2-7B-v2"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">GPT-4</a>",94.92,,88.79,91.67,-2.88,89.95,92.33,-2.38
|
| 22 |
+
π¬,"<a target=""_blank"" href=""https://huggingface.co/Qwen/Qwen2-7B-v2"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">GPT-4o</a>",96.48,,86.49,90.52,-4.03,88.36,90.21,-1.85
|
| 23 |
+
π¬,"<a target=""_blank"" href=""https://huggingface.co/Qwen/Qwen2-7B-v2"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">GPT-3.5 Turbo</a>",91.74,,97.7,98.28,-0.58,96.03,96.3,-0.27
|
| 24 |
+
π¬,"<a target=""_blank"" href=""https://huggingface.co/Qwen/Qwen2-7B-v2"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Claude Opus</a>",92.19,,79.89,86.49,-6.6,83.33,85.71,-2.38
|
src/__pycache__/models_info.cpython-311.pyc
CHANGED
|
Binary files a/src/__pycache__/models_info.cpython-311.pyc and b/src/__pycache__/models_info.cpython-311.pyc differ
|
|
|
src/json2df.py
CHANGED
|
@@ -1,14 +1,23 @@
|
|
| 1 |
import os
|
| 2 |
import json
|
| 3 |
import pandas as pd
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
from models_info import model_info
|
| 5 |
|
| 6 |
directory = 'data/raw-eval-outputs'
|
| 7 |
data = []
|
| 8 |
|
|
|
|
| 9 |
def model_hyperlink(link, model_name):
|
| 10 |
return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
|
| 11 |
|
|
|
|
| 12 |
def make_clickable_names(df):
|
| 13 |
df["Model"] = df.apply(
|
| 14 |
lambda row: model_hyperlink(row["Link"], row["Model"]), axis=1
|
|
@@ -35,11 +44,33 @@ for filename in os.listdir(directory):
|
|
| 35 |
|
| 36 |
data.append(row)
|
| 37 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
df = pd.DataFrame(data)
|
| 40 |
df = make_clickable_names(df)
|
| 41 |
df.drop(columns=["Link"], inplace=True)
|
| 42 |
|
|
|
|
| 43 |
df['medmcqa_diff'] = (df['medmcqa_g2b'] - df['medmcqa_orig_filtered']).round(2)
|
| 44 |
df['medqa_diff'] = (df['medqa_4options_g2b'] - df['medqa_4options_orig_filtered']).round(2)
|
| 45 |
|
|
@@ -60,7 +91,7 @@ cols = [
|
|
| 60 |
]]
|
| 61 |
df = df[cols]
|
| 62 |
|
| 63 |
-
|
| 64 |
output_csv = 'data/csv/models_data.csv'
|
| 65 |
df.to_csv(output_csv, index=False)
|
| 66 |
|
|
|
|
| 1 |
import os
|
| 2 |
import json
|
| 3 |
import pandas as pd
|
| 4 |
+
import sys
|
| 5 |
+
|
| 6 |
+
# Add the data/api-results directory (which contains api_results.py) to the module search path
|
| 7 |
+
sys.path.append(os.path.abspath('data/api-results'))
|
| 8 |
+
|
| 9 |
+
# Now import the API results
|
| 10 |
+
from api_results import gpt4, gpt4o, gpt35turbo, claude_opus
|
| 11 |
from models_info import model_info
|
| 12 |
|
| 13 |
directory = 'data/raw-eval-outputs'
|
| 14 |
data = []
|
| 15 |
|
| 16 |
+
# Function to create a clickable hyperlink for the model name
|
| 17 |
def model_hyperlink(link, model_name):
|
| 18 |
return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
|
| 19 |
|
| 20 |
+
# Replace each row's "Model" value with a clickable hyperlink built from its "Link" and "Model" columns
|
| 21 |
def make_clickable_names(df):
|
| 22 |
df["Model"] = df.apply(
|
| 23 |
lambda row: model_hyperlink(row["Link"], row["Model"]), axis=1
|
|
|
|
| 44 |
|
| 45 |
data.append(row)
|
| 46 |
|
| 47 |
+
# Map each API model's display name to its results dict imported from api_results
|
| 48 |
+
api_models = {
|
| 49 |
+
'GPT-4': gpt4,
|
| 50 |
+
'GPT-4o': gpt4o,
|
| 51 |
+
'GPT-3.5 Turbo': gpt35turbo,
|
| 52 |
+
'Claude Opus': claude_opus
|
| 53 |
+
}
|
| 54 |
|
| 55 |
+
for model_name, results in api_models.items():
|
| 56 |
+
row = {
|
| 57 |
+
'Model': model_name,
|
| 58 |
+
'b4bqa': round(results.get('b4bqa', 0) * 100, 2),
|
| 59 |
+
'medmcqa_g2b': round(results['medmcqa_g2b'] * 100, 2),
|
| 60 |
+
'medmcqa_orig_filtered': round(results['medmcqa_og'] * 100, 2),
|
| 61 |
+
'medqa_4options_g2b': round(results['medqa_g2b'] * 100, 2),
|
| 62 |
+
'medqa_4options_orig_filtered': round(results['medqa_og'] * 100, 2),
|
| 63 |
+
'T': model_info[model_name]['tuning'],
|
| 64 |
+
'Link': model_info[model_name]['link']
|
| 65 |
+
}
|
| 66 |
+
data.append(row)
|
| 67 |
+
|
| 68 |
+
# Create DataFrame from the collected data
|
| 69 |
df = pd.DataFrame(data)
|
| 70 |
df = make_clickable_names(df)
|
| 71 |
df.drop(columns=["Link"], inplace=True)
|
| 72 |
|
| 73 |
+
# Compute the g2b-minus-original score differences for MedMCQA and MedQA, rounded to 2 decimals
|
| 74 |
df['medmcqa_diff'] = (df['medmcqa_g2b'] - df['medmcqa_orig_filtered']).round(2)
|
| 75 |
df['medqa_diff'] = (df['medqa_4options_g2b'] - df['medqa_4options_orig_filtered']).round(2)
|
| 76 |
|
|
|
|
| 91 |
]]
|
| 92 |
df = df[cols]
|
| 93 |
|
| 94 |
+
# Save DataFrame to CSV
|
| 95 |
output_csv = 'data/csv/models_data.csv'
|
| 96 |
df.to_csv(output_csv, index=False)
|
| 97 |
|
src/models_info.py
CHANGED
|
@@ -76,4 +76,20 @@ model_info = {
|
|
| 76 |
"link": "https://huggingface.co/Qwen/Qwen2-7B",
|
| 77 |
"tuning": "π’" # Pre-trained
|
| 78 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
}
|
|
|
|
| 76 |
"link": "https://huggingface.co/Qwen/Qwen2-7B",
|
| 77 |
"tuning": "π’" # Pre-trained
|
| 78 |
},
|
| 79 |
+
"GPT-4": {
|
| 80 |
+
"link": "https://huggingface.co/Qwen/Qwen2-7B-v2",
|
| 81 |
+
"tuning": "π¬"
|
| 82 |
+
},
|
| 83 |
+
"GPT-4o": {
|
| 84 |
+
"link": "https://huggingface.co/Qwen/Qwen2-7B-v2",
|
| 85 |
+
"tuning": "π¬"
|
| 86 |
+
},
|
| 87 |
+
"GPT-3.5 Turbo": {
|
| 88 |
+
"link": "https://huggingface.co/Qwen/Qwen2-7B-v2",
|
| 89 |
+
"tuning": "π¬"
|
| 90 |
+
},
|
| 91 |
+
"Claude Opus": {
|
| 92 |
+
"link": "https://huggingface.co/Qwen/Qwen2-7B-v2",
|
| 93 |
+
"tuning": "π¬"
|
| 94 |
+
}
|
| 95 |
}
|