Update app.py
app.py
CHANGED
```diff
@@ -171,18 +171,18 @@ with tab1:
     # st.markdown('<div class="title">Leaderboard</div>', unsafe_allow_html=True)
     st.markdown('<div class="tab-content">', unsafe_allow_html=True)

-    st.markdown('Metrics Explanation')
-    st.markdown(
+    st.markdown('# Metrics Explanation')
+    st.markdown("""
         <div class="metric">
         <br/>
         <p style="font-size:16px;">
-        <strong> Factual Precision </strong> measures the
+        <strong> Factual Precision </strong> measures the ratio of supported units divided by all units averaged over model responses. <strong> Hallucination Score </strong> quantifies the incorrect or inconclusive contents within a model response, as described in the paper. We also provide statistics on the average number of units labelled as unsupported (<strong>Avg. # Unsupported</strong>), the average number of units labelled as undecidable (<strong>Avg. # Undecided</strong>), the average length of the response in terms of the number of tokens, and the average verifiable units existing in the model responses (<strong>Avg. # Units</strong>).
         </p>
         <p style="font-size:16px;">
         π for closed LLMs; π for open-weights LLMs; π¨ for newly added models"
         </p>
         </div>
-
+        """,
         unsafe_allow_html=True
     )
```
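The metric text added in this change can be read as simple per-response averages. Below is a hypothetical sketch of that reading, not the leaderboard's actual implementation: it assumes each model response has already been decomposed into units labelled "supported", "unsupported", or "undecided", and the helper names (`factual_precision`, `avg_count`) are made up for this illustration.

```python
# Hypothetical sketch of the metrics described above; function names and
# label strings are assumptions, not the leaderboard's real code.
from statistics import mean

def factual_precision(responses):
    # Per response: (# supported units) / (# all units), then averaged over responses.
    ratios = [
        sum(1 for u in units if u == "supported") / len(units)
        for units in responses
        if units  # skip responses with no verifiable units
    ]
    return mean(ratios) if ratios else 0.0

def avg_count(responses, label):
    # Average number of units with a given label per response,
    # e.g. "Avg. # Unsupported" or "Avg. # Undecided" in the table.
    return mean(sum(1 for u in units if u == label) for units in responses)

# Toy example: three responses decomposed into labelled units.
responses = [
    ["supported", "supported", "unsupported"],
    ["supported", "undecided"],
    ["supported", "supported", "supported", "unsupported"],
]
print(round(factual_precision(responses), 3))         # 0.639
print(round(avg_count(responses, "unsupported"), 3))  # 0.667
print(round(avg_count(responses, "undecided"), 3))    # 0.333
```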