anakin87 committed · commit 4c41de2 · parent: 08d96b7

refactor EntailmentChecker: only relevant documents are used

Files changed:
- Rock_fact_checker.py (+3 -4)
- app_utils/backend_utils.py (+6 -28)
- app_utils/entailment_checker.py (+42 -8)
Rock_fact_checker.py CHANGED

@@ -97,8 +97,8 @@ def main():
 
     # Display results
     if st.session_state.results:
-
-
+        docs = st.session_state.results["documents"]
+        agg_entailment_info = st.session_state.results["aggregate_entailment_info"]
 
         # show different messages depending on entailment results
        max_key = max(agg_entailment_info, key=agg_entailment_info.get)

@@ -107,12 +107,11 @@ def main():
 
         st.markdown(f"###### Aggregate entailment information:")
         col1, col2 = st.columns([2, 1])
-        agg_entailment_info = results["agg_entailment_info"]
         fig = create_ternary_plot(agg_entailment_info)
         with col1:
             st.plotly_chart(fig, use_container_width=True)
         with col2:
-            st.write(
+            st.write(agg_entailment_info)
 
         st.markdown(f"###### Most Relevant snippets:")
         df, urls = create_df_for_relevant_snippets(docs)
app_utils/backend_utils.py CHANGED

@@ -44,7 +44,11 @@ def start_haystack():
         embedding_model=RETRIEVER_MODEL,
         model_format=RETRIEVER_MODEL_FORMAT,
     )
-    entailment_checker = EntailmentChecker(
+    entailment_checker = EntailmentChecker(
+        model_name_or_path=NLI_MODEL,
+        use_gpu=False,
+        entailment_contradiction_threshold=0.5,
+    )
 
     pipe = Pipeline()
     pipe.add_node(component=retriever, name="retriever", inputs=["Query"])

@@ -60,30 +64,4 @@ pipe = start_haystack()
 def query(statement: str, retriever_top_k: int = 5):
     """Run query and verify statement"""
     params = {"retriever": {"top_k": retriever_top_k}}
-    results = pipe.run(statement, params=params)
-
-    scores, agg_con, agg_neu, agg_ent = 0, 0, 0, 0
-    for i, doc in enumerate(results["documents"]):
-        scores += doc.score
-        ent_info = doc.meta["entailment_info"]
-        con, neu, ent = (
-            ent_info["contradiction"],
-            ent_info["neutral"],
-            ent_info["entailment"],
-        )
-        agg_con += con * doc.score
-        agg_neu += neu * doc.score
-        agg_ent += ent * doc.score
-
-        # if in the first documents there is strong evidence of entailment/contradiction,
-        # there is no need to consider less relevant documents
-        if max(agg_con, agg_ent) / scores > 0.5:
-            results["documents"] = results["documents"][: i + 1]
-            break
-
-    results["agg_entailment_info"] = {
-        "contradiction": round(agg_con / scores, 2),
-        "neutral": round(agg_neu / scores, 2),
-        "entailment": round(agg_ent / scores, 2),
-    }
-    return results
+    return pipe.run(statement, params=params)
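With the aggregation moved into the node, query() collapses to a single pipe.run call. A usage sketch under the wiring above; the statement is a made-up example, and the output keys are the ones EntailmentChecker now returns:

# Hypothetical usage; assumes start_haystack() built `pipe` as shown above.
results = query("Freddie Mercury was the lead singer of Queen", retriever_top_k=5)
print(results["aggregate_entailment_info"])  # e.g. {"contradiction": ..., "neutral": ..., "entailment": ...}
for doc in results["documents"]:             # only the documents the checker actually used
    print(doc.score, doc.meta["entailment_info"])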
app_utils/entailment_checker.py CHANGED

@@ -4,13 +4,14 @@ from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
 import torch
 from haystack.nodes.base import BaseComponent
 from haystack.modeling.utils import initialize_device_settings
-from haystack.schema import Document
+from haystack.schema import Document
 
 
 class EntailmentChecker(BaseComponent):
     """
     This node checks the entailment between every document content and the query.
-    It enrichs the documents metadata with
+    It enriches the documents' metadata with entailment information.
+    It also returns aggregate entailment information.
     """
 
     outgoing_edges = 1

@@ -22,6 +23,7 @@ class EntailmentChecker(BaseComponent):
         tokenizer: Optional[str] = None,
         use_gpu: bool = True,
         batch_size: int = 16,
+        entailment_contradiction_threshold: float = 0.5,
     ):
         """
         Load a Natural Language Inference model from Transformers.

@@ -31,7 +33,9 @@ class EntailmentChecker(BaseComponent):
         :param model_version: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash.
         :param tokenizer: Name of the tokenizer (usually the same as model)
         :param use_gpu: Whether to use GPU (if available).
-
+        :param batch_size: Number of Documents to be processed at a time.
+        :param entailment_contradiction_threshold: if, in the first N documents, there is strong evidence of entailment/contradiction
+            (aggregate entailment or contradiction greater than the threshold), the less relevant documents are not taken into account.
         """
         super().__init__()
 

@@ -43,6 +47,7 @@ class EntailmentChecker(BaseComponent):
             pretrained_model_name_or_path=model_name_or_path, revision=model_version
         )
         self.batch_size = batch_size
+        self.entailment_contradiction_threshold = entailment_contradiction_threshold
         self.model.to(str(self.devices[0]))
 
         id2label = AutoConfig.from_pretrained(model_name_or_path).id2label

@@ -53,12 +58,41 @@ class EntailmentChecker(BaseComponent):
         )
 
     def run(self, query: str, documents: List[Document]):
-        for doc in documents:
-            entailment_dict = self.get_entailment(premise=doc.content, hypotesis=query)
-            doc.meta["entailment_info"] = entailment_dict
-        return {"documents": documents}, "output_1"
 
-
+        scores, agg_con, agg_neu, agg_ent = 0, 0, 0, 0
+        for i, doc in enumerate(documents):
+            entailment_info = self.get_entailment(premise=doc.content, hypotesis=query)
+            doc.meta["entailment_info"] = entailment_info
+
+            scores += doc.score
+            con, neu, ent = (
+                entailment_info["contradiction"],
+                entailment_info["neutral"],
+                entailment_info["entailment"],
+            )
+            agg_con += con * doc.score
+            agg_neu += neu * doc.score
+            agg_ent += ent * doc.score
+
+            # if in the first documents there is strong evidence of entailment/contradiction,
+            # there is no need to consider less relevant documents
+            if max(agg_con, agg_ent) / scores > self.entailment_contradiction_threshold:
+                break
+
+        aggregate_entailment_info = {
+            "contradiction": round(agg_con / scores, 2),
+            "neutral": round(agg_neu / scores, 2),
+            "entailment": round(agg_ent / scores, 2),
+        }
+
+        entailment_checker_result = {
+            "documents": documents[: i + 1],
+            "aggregate_entailment_info": aggregate_entailment_info,
+        }
+
+        return entailment_checker_result, "output_1"
+
+    def run_batch(self, queries: List[str], documents: List[Document]):
         pass
 
     def get_entailment(self, premise, hypotesis):
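
The core of the refactor is the score-weighted aggregation with early stopping in run(): each document's entailment probabilities are weighted by its retrieval score, running averages are kept, and the loop exits as soon as aggregate entailment or contradiction clears the threshold, so less relevant documents are never evaluated. A standalone re-implementation with plain tuples, for illustration only; the document scores and probabilities are invented:

# Plain-Python sketch of the score-weighted aggregation with early stopping.
# Documents are (retrieval_score, entailment_probs) pairs; values are invented.
docs = [
    (0.92, {"contradiction": 0.02, "neutral": 0.08, "entailment": 0.90}),
    (0.85, {"contradiction": 0.05, "neutral": 0.15, "entailment": 0.80}),
    (0.40, {"contradiction": 0.60, "neutral": 0.30, "entailment": 0.10}),
]
threshold = 0.5

scores = agg_con = agg_neu = agg_ent = 0.0
used = 0
for i, (score, probs) in enumerate(docs):
    scores += score
    agg_con += probs["contradiction"] * score
    agg_neu += probs["neutral"] * score
    agg_ent += probs["entailment"] * score
    used = i + 1
    # Stop as soon as the running weighted average is decisive:
    # here already after the first document (0.90 > 0.5).
    if max(agg_con, agg_ent) / scores > threshold:
        break

aggregate = {
    "contradiction": round(agg_con / scores, 2),
    "neutral": round(agg_neu / scores, 2),
    "entailment": round(agg_ent / scores, 2),
}
print(used, aggregate)  # -> 1 {'contradiction': 0.02, 'neutral': 0.08, 'entailment': 0.9}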