Spaces:
Sleeping
Sleeping
updated values and added ranking, new fields
Browse files- app.py +133 -27
- tiered_models_data.csv +23 -0
app.py
CHANGED
|
@@ -5,7 +5,7 @@ from PIL import Image
|
|
| 5 |
# Set up page config
|
| 6 |
st.set_page_config(
|
| 7 |
page_title="FactBench Leaderboard",
|
| 8 |
-
|
| 9 |
)
|
| 10 |
|
| 11 |
# Load the image
|
|
@@ -81,17 +81,82 @@ st.markdown('<div class="description">Benchmark for LM Factuality Evaluation</di
|
|
| 81 |
st.markdown('</div>', unsafe_allow_html=True)
|
| 82 |
|
| 83 |
# Load the data
|
| 84 |
-
data_path = "factbench_data.csv"
|
|
|
|
| 85 |
df = pd.read_csv(data_path)
|
| 86 |
|
| 87 |
# Create tabs
|
| 88 |
tab1, tab2, tab3 = st.tabs(
|
| 89 |
-
["Leaderboard", "Benchmark Details", "Submit
|
| 90 |
|
| 91 |
# Tab 1: Leaderboard
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
with tab1:
|
| 93 |
-
st.markdown('<div class="title">Leaderboard</div>',
|
| 94 |
-
unsafe_allow_html=True)
|
| 95 |
st.markdown('<div class="tab-content">', unsafe_allow_html=True)
|
| 96 |
|
| 97 |
# Dropdown menu to filter tiers
|
|
@@ -100,21 +165,51 @@ with tab1:
|
|
| 100 |
|
| 101 |
# Filter the data based on the selected tier
|
| 102 |
if selected_tier != 'All Tiers':
|
| 103 |
-
filtered_df = df[df['
|
| 104 |
else:
|
| 105 |
filtered_df = df
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
|
| 107 |
# Create HTML for the table
|
| 108 |
html = '''
|
| 109 |
<table>
|
| 110 |
<thead>
|
| 111 |
<tr>
|
|
|
|
| 112 |
<th>Tier</th>
|
| 113 |
<th>Model</th>
|
| 114 |
-
<th>
|
| 115 |
-
<th>
|
| 116 |
-
<th>
|
| 117 |
-
<th>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
</tr>
|
| 119 |
</thead>
|
| 120 |
<tbody>
|
|
@@ -122,27 +217,39 @@ with tab1:
|
|
| 122 |
|
| 123 |
# Generate the rows of the table
|
| 124 |
current_tier = None
|
| 125 |
-
for i, row in
|
| 126 |
-
if row['
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
else:
|
| 133 |
-
|
| 134 |
-
|
|
|
|
| 135 |
# Fill in model and scores
|
| 136 |
html += f'''
|
| 137 |
-
<td>{row['
|
| 138 |
-
<td>{row['
|
| 139 |
-
<td>{row['
|
| 140 |
-
<td>{row['
|
| 141 |
-
<td>{row['
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
</tr>
|
| 143 |
'''
|
| 144 |
|
| 145 |
-
# Close the
|
| 146 |
html += '''
|
| 147 |
</table>
|
| 148 |
'''
|
|
@@ -151,7 +258,6 @@ with tab1:
|
|
| 151 |
st.markdown(html, unsafe_allow_html=True)
|
| 152 |
|
| 153 |
st.markdown('</div>', unsafe_allow_html=True)
|
| 154 |
-
|
| 155 |
# Tab 2: Details
|
| 156 |
with tab2:
|
| 157 |
st.markdown('<div class="tab-content">', unsafe_allow_html=True)
|
|
|
|
| 5 |
# Set up page config
|
| 6 |
st.set_page_config(
|
| 7 |
page_title="FactBench Leaderboard",
|
| 8 |
+
layout="wide", # Layout remains wide, but content will be centered
|
| 9 |
)
|
| 10 |
|
| 11 |
# Load the image
|
|
|
|
| 81 |
st.markdown('</div>', unsafe_allow_html=True)
|
| 82 |
|
| 83 |
# Load the data
|
| 84 |
+
# data_path = "factbench_data.csv"
|
| 85 |
+
data_path = "tiered_models_data.csv"
|
| 86 |
df = pd.read_csv(data_path)
|
| 87 |
|
| 88 |
# Create tabs
|
| 89 |
tab1, tab2, tab3 = st.tabs(
|
| 90 |
+
["Leaderboard", "Benchmark Details", "Submit your models"])
|
| 91 |
|
| 92 |
# Tab 1: Leaderboard
|
| 93 |
+
# with tab1:
|
| 94 |
+
# st.markdown('<div class="title">Leaderboard</div>',
|
| 95 |
+
# unsafe_allow_html=True)
|
| 96 |
+
# st.markdown('<div class="tab-content">', unsafe_allow_html=True)
|
| 97 |
+
|
| 98 |
+
# # Dropdown menu to filter tiers
|
| 99 |
+
# tiers = ['All Tiers', 'Tier 1: Easy', 'Tier 2: Moderate', 'Tier 3: Hard']
|
| 100 |
+
# selected_tier = st.selectbox('Select Tier:', tiers)
|
| 101 |
+
|
| 102 |
+
# # Filter the data based on the selected tier
|
| 103 |
+
# if selected_tier != 'All Tiers':
|
| 104 |
+
# filtered_df = df[df['Tier'] == selected_tier]
|
| 105 |
+
# else:
|
| 106 |
+
# filtered_df = df
|
| 107 |
+
|
| 108 |
+
# # Create HTML for the table
|
| 109 |
+
# html = '''
|
| 110 |
+
# <table>
|
| 111 |
+
# <thead>
|
| 112 |
+
# <tr>
|
| 113 |
+
# <th>Tier</th>
|
| 114 |
+
# <th>Model</th>
|
| 115 |
+
# <th>FactScore</th>
|
| 116 |
+
# <th>SAFE</th>
|
| 117 |
+
# <th>Factcheck-GPT</th>
|
| 118 |
+
# <th>VERIFY</th>
|
| 119 |
+
# </tr>
|
| 120 |
+
# </thead>
|
| 121 |
+
# <tbody>
|
| 122 |
+
# '''
|
| 123 |
+
|
| 124 |
+
# # Generate the rows of the table
|
| 125 |
+
# current_tier = None
|
| 126 |
+
# for i, row in filtered_df.iterrows():
|
| 127 |
+
# if row['Tier'] != current_tier:
|
| 128 |
+
# if current_tier is not None:
|
| 129 |
+
# # Close the previous tier row
|
| 130 |
+
# html += ' </tr>'
|
| 131 |
+
# current_tier = row['Tier']
|
| 132 |
+
# html += f' <tr><td rowspan="4" style="vertical-align: middle;">{current_tier}</td>'
|
| 133 |
+
# else:
|
| 134 |
+
# html += ' <tr>'
|
| 135 |
+
|
| 136 |
+
# # Fill in model and scores
|
| 137 |
+
# html += f'''
|
| 138 |
+
# <td>{row['Model']}</td>
|
| 139 |
+
# <td>{row['FactScore']:.2f}</td>
|
| 140 |
+
# <td>{row['SAFE']:.2f}</td>
|
| 141 |
+
# <td>{row['Factcheck-GPT']:.2f}</td>
|
| 142 |
+
# <td>{row['VERIFY']:.2f}</td>
|
| 143 |
+
# </tr>
|
| 144 |
+
# '''
|
| 145 |
+
|
| 146 |
+
# # Close the last row and table tags
|
| 147 |
+
# html += '''
|
| 148 |
+
# </table>
|
| 149 |
+
# '''
|
| 150 |
+
|
| 151 |
+
# # Display the table
|
| 152 |
+
# st.markdown(html, unsafe_allow_html=True)
|
| 153 |
+
|
| 154 |
+
# st.markdown('</div>', unsafe_allow_html=True)
|
| 155 |
+
df['rank'] = df['factuality_score'].rank(
|
| 156 |
+
ascending=False, method='min').astype(int)
|
| 157 |
+
|
| 158 |
with tab1:
|
| 159 |
+
st.markdown('<div class="title">Leaderboard</div>', unsafe_allow_html=True)
|
|
|
|
| 160 |
st.markdown('<div class="tab-content">', unsafe_allow_html=True)
|
| 161 |
|
| 162 |
# Dropdown menu to filter tiers
|
|
|
|
| 165 |
|
| 166 |
# Filter the data based on the selected tier
|
| 167 |
if selected_tier != 'All Tiers':
|
| 168 |
+
filtered_df = df[df['tier'] == selected_tier]
|
| 169 |
else:
|
| 170 |
filtered_df = df
|
| 171 |
+
# Add sorting functionality for Factuality Score
|
| 172 |
+
# sort_order = st.radio('Sort by Factuality Score:',
|
| 173 |
+
# ('Ascending', 'Descending'))
|
| 174 |
+
|
| 175 |
+
# # Sort the dataframe based on Factuality Score
|
| 176 |
+
# if sort_order == 'Ascending':
|
| 177 |
+
# filtered_df = filtered_df.sort_values(
|
| 178 |
+
# by='factuality_score', ascending=True)
|
| 179 |
+
# else:
|
| 180 |
+
# filtered_df = filtered_df.sort_values(
|
| 181 |
+
# by='factuality_score', ascending=False)
|
| 182 |
+
# Option to sort by Factuality Score in descending order
|
| 183 |
+
sort_by_factuality = st.checkbox('Sort by Factuality Score')
|
| 184 |
+
|
| 185 |
+
# Sort the dataframe based on Factuality Score if the checkbox is selected
|
| 186 |
+
if sort_by_factuality:
|
| 187 |
+
updated_filtered_df = filtered_df.sort_values(
|
| 188 |
+
by='factuality_score', ascending=False)
|
| 189 |
+
else:
|
| 190 |
+
updated_filtered_df = filtered_df
|
| 191 |
|
| 192 |
# Create HTML for the table
|
| 193 |
html = '''
|
| 194 |
<table>
|
| 195 |
<thead>
|
| 196 |
<tr>
|
| 197 |
+
<th>Rank</th>
|
| 198 |
<th>Tier</th>
|
| 199 |
<th>Model</th>
|
| 200 |
+
<th>Factuality Score</th>
|
| 201 |
+
<th>Hallucination Score</th>
|
| 202 |
+
<th>Avg Tokens</th>
|
| 203 |
+
<th>Avg Factual Units</th>
|
| 204 |
+
<th>Avg Undecidable Units</th>
|
| 205 |
+
<th>Avg Unsupported Units</th>
|
| 206 |
+
<th>Factual Recall</th>
|
| 207 |
+
<th>Conceptual Understanding</th>
|
| 208 |
+
<th>Procedural Execution</th>
|
| 209 |
+
<th>Comparative Analysis</th>
|
| 210 |
+
<th>Recommendations and Insights</th>
|
| 211 |
+
<th>Domain-Specific Knowledge</th>
|
| 212 |
+
<th>Temporal Context</th>
|
| 213 |
</tr>
|
| 214 |
</thead>
|
| 215 |
<tbody>
|
|
|
|
| 217 |
|
| 218 |
# Generate the rows of the table
|
| 219 |
current_tier = None
|
| 220 |
+
for i, row in updated_filtered_df.iterrows():
|
| 221 |
+
# if row['tier'] != current_tier:
|
| 222 |
+
# if current_tier is not None:
|
| 223 |
+
# html += ' </tr>'
|
| 224 |
+
# current_tier = row['tier']
|
| 225 |
+
# # 7 models, change this number when more models
|
| 226 |
+
# html += f' <tr><td rowspan="7" style="vertical-align: middle;">{current_tier}</td>'
|
| 227 |
+
# else:
|
| 228 |
+
# html += ' <tr>'
|
| 229 |
+
|
| 230 |
+
html += ' <tr>'
|
| 231 |
# Fill in model and scores
|
| 232 |
html += f'''
|
| 233 |
+
<td>{row['rank']}</td>
|
| 234 |
+
<td>{row['tier']}</td>
|
| 235 |
+
<td>{row['model']}</td>
|
| 236 |
+
<td>{row['factuality_score']:.2f}</td>
|
| 237 |
+
<td>{row['hallucination_score']:.2f}</td>
|
| 238 |
+
<td>{row['avg_tokens']:.2f}</td>
|
| 239 |
+
<td>{row['avg_factual_units']:.2f}</td>
|
| 240 |
+
<td>{row['avg_undecidable_units']:.2f}</td>
|
| 241 |
+
<td>{row['avg_unsupported_units']:.2f}</td>
|
| 242 |
+
<td>{row['prompt_categories.Factual Recall']:.2f}</td>
|
| 243 |
+
<td>{row['prompt_categories.Conceptual Understanding']:.2f}</td>
|
| 244 |
+
<td>{row['prompt_categories.Procedural Execution']:.2f}</td>
|
| 245 |
+
<td>{row['prompt_categories.Comparative Analysis']:.2f}</td>
|
| 246 |
+
<td>{row['prompt_categories.Recommendations and Insights']:.2f}</td>
|
| 247 |
+
<td>{row['prompt_categories.Domain-Specific Knowledge']:.2f}</td>
|
| 248 |
+
<td>{row['prompt_categories.Temporal Context']:.2f}</td>
|
| 249 |
</tr>
|
| 250 |
'''
|
| 251 |
|
| 252 |
+
# Close the table
|
| 253 |
html += '''
|
| 254 |
</table>
|
| 255 |
'''
|
|
|
|
| 258 |
st.markdown(html, unsafe_allow_html=True)
|
| 259 |
|
| 260 |
st.markdown('</div>', unsafe_allow_html=True)
|
|
|
|
| 261 |
# Tab 2: Details
|
| 262 |
with tab2:
|
| 263 |
st.markdown('<div class="tab-content">', unsafe_allow_html=True)
|
tiered_models_data.csv
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
tier,model,factuality_score,hallucination_score,avg_tokens,avg_factual_units,avg_undecidable_units,avg_unsupported_units,prompt_categories.Factual Recall,prompt_categories.Conceptual Understanding,prompt_categories.Procedural Execution,prompt_categories.Comparative Analysis,prompt_categories.Recommendations and Insights,prompt_categories.Domain-Specific Knowledge,prompt_categories.Temporal Context
|
| 2 |
+
Tier 1: Easy,gpt4-o,75.69,0.64,561.72,23.91,4.61,1.01,76.49,78.49,66.14,76.13,76.3,75.91,69.52
|
| 3 |
+
Tier 1: Easy,gemini,73.81,0.68,516.41,22.23,4.47,1.12,73.35,79.39,66.7,72.44,73.64,74.31,71.42
|
| 4 |
+
Tier 1: Easy,llama3.1_70B_instruct,70.01,0.89,531.35,27.09,5.67,2.13,68.99,75.38,64.73,70.34,70.03,70.64,56.61
|
| 5 |
+
Tier 1: Easy,llama3.1_405B_instruct,68.64,0.93,550.74,26.6,6.15,2.19,66.07,74.67,65.88,70.18,68.29,70.91,49.97
|
| 6 |
+
Tier 1: Easy,claude-3.5-sonnet,74.95,0.65,395.77,22.64,4.03,1.19,74.84,77.74,69.55,74.87,75.3,76.4,64.19
|
| 7 |
+
Tier 1: Easy,commandR+,73.15,0.71,440.93,23.55,4.51,1.4,69.41,80.24,68.98,74.36,73.53,73.02,66.43
|
| 8 |
+
Tier 1: Easy,mistral-large-2,75.19,0.67,485.58,23.21,4.09,1.36,75.87,78.32,63.98,77.17,75.5,76.38,65.8
|
| 9 |
+
Tier 2: Moderate,gpt4-o,80.72,0.5,624.67,24.42,3.59,0.89,80.06,84.33,72.83,79.75,81.5,81.1,70.02
|
| 10 |
+
Tier 2: Moderate,gemini,78.02,0.57,565.97,22.16,3.71,0.97,74.13,81.74,73.13,77.32,78.37,80.04,68.03
|
| 11 |
+
Tier 2: Moderate,llama3.1_70B_instruct,75.76,0.71,607.44,25.35,4.33,1.76,63.87,77.92,72.94,78.67,79.56,76.83,47.71
|
| 12 |
+
Tier 2: Moderate,llama3.1_405B_instruct,75.05,0.7,599.3,25.24,4.74,1.41,67.96,78.09,68.51,76.16,77.31,76.25,65.43
|
| 13 |
+
Tier 2: Moderate,claude-3.5-sonnet,79.92,0.54,414.32,22.15,3.32,1.09,75.88,83.52,77.39,79.31,81.06,78.81,72.47
|
| 14 |
+
Tier 2: Moderate,commandR+,80.71,0.52,483.32,24.1,3.17,1.09,73.49,85.46,75.6,82.97,82.12,81.61,58.49
|
| 15 |
+
Tier 2: Moderate,mistral-large-2,79.97,0.52,528.44,22.65,3.21,1.02,77.21,81.23,75.2,81.24,80.86,82.03,63.63
|
| 16 |
+
Tier 3: Hard,gpt4-o,91.63,0.26,640.84,29.29,2.01,0.53,94.31,93.62,82.98,89.19,91.86,94.12
|
| 17 |
+
Tier 3: Hard,gemini,89.86,0.31,551.81,25.6,1.88,0.71,92.61,90.34,83.32,87.39,90.93,95.23
|
| 18 |
+
Tier 3: Hard,llama3.1_70B_instruct,89.3,0.33,607.75,31.38,2.08,0.83,75.5,91.75,83.61,87.11,93.03,93.08
|
| 19 |
+
Tier 3: Hard,llama3.1_405B_instruct,86.57,0.4,599.87,30.12,2.88,0.85,79.58,88.92,75.23,85.11,89.2,90.21,100.0
|
| 20 |
+
Tier 3: Hard,claude-3.5-sonnet,89.61,0.3,411.2,26.72,1.49,0.81,89.85,92.45,75.13,86.48,91.46,91.97,100.0
|
| 21 |
+
Tier 3: Hard,commandR+,91.65,0.25,499.06,27.95,1.57,0.54,87.71,91.8,87.16,89.79,94.12,93.85,100.0
|
| 22 |
+
Tier 3: Hard,mistral-large-2,92.0,0.25,523.57,27.8,1.8,0.55,92.96,92.33,90.58,89.41,92.81,92.41,100.0
|
| 23 |
+
|