Create app.py
app.py ADDED
@@ -0,0 +1,101 @@
+import gradio as gr
+from transformers import pipeline
+import pandas as pd
+import requests
+from bs4 import BeautifulSoup
+
+# Assume 'pipe' is already defined as your NER pipeline
+# pipe = pipeline("ner", model="NbAiLab/nb-bert-base-ner", aggregation_strategy="simple") # Make sure your pipeline is initialized before launching the interface
+
+# Initialize the NER pipeline here so it's accessible to the function
+pipe = pipeline("ner", model="NbAiLab/nb-bert-base-ner", aggregation_strategy="simple")
+print("✅ NER pipeline ready for Gradio")
+
+def chunk_text(text, max_len=512):
+    """Splits text into chunks of a maximum length."""
+    chunks = []
+    current_chunk = ""
+    words = text.split()
+    for word in words:
+        if len(current_chunk) + len(word) + 1 > max_len:
+            chunks.append(current_chunk.strip())
+            current_chunk = word + " "
+        else:
+            current_chunk += word + " "
+    if current_chunk:
+        chunks.append(current_chunk.strip())
+    return chunks
+
+def clean_entity_word(word):
+    """Removes '##' from subword tokens and handles potential spacing issues."""
+    if word.startswith("##"):
+        return word[2:]
+    return word
+
+def scrape_and_recognize_entities(url):
+    """
+    Scrapes text from a given URL and performs entity recognition.
+    Handles long text by chunking and cleans up subword tokens.
+    """
+    try:
+        response = requests.get(url)
+        response.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)
+
+        soup = BeautifulSoup(response.content, 'html.parser')
+        # Find the main article content (adjust based on common website structures)
+        article_text = ""
+        article_body = soup.find('article') or soup.find('main') or soup.find('div', class_='article-body')
+
+        if article_body:
+            paragraphs = article_body.find_all('p')
+            article_text = '\n'.join([p.get_text() for p in paragraphs])
+        else:
+            # Fallback: get all paragraph text if specific article body not found
+            paragraphs = soup.find_all('p')
+            article_text = '\n'.join([p.get_text() for p in paragraphs])
+
+        if not article_text:
+            return pd.DataFrame({"Error": ["Could not extract article text from the provided URL."]})
+
+        # Chunk the text
+        chunks = chunk_text(article_text)
+
+        all_results = []
+        for chunk in chunks:
+            # Use the existing NER pipeline on each chunk
+            chunk_results = pipe(chunk)
+            all_results.extend(chunk_results)
+
+        # Clean up subword tokens in the results
+        cleaned_results = []
+        for entity in all_results:
+            entity['word'] = clean_entity_word(entity['word'])
+            cleaned_results.append(entity)
+
+
+        # Convert combined results to a pandas DataFrame
+        if cleaned_results:
+            df_results = pd.DataFrame(cleaned_results)
+        else:
+            df_results = pd.DataFrame({"Info": ["No entities found in the article."]})
+
+
+        return df_results
+
+    except requests.exceptions.RequestException as e:
+        # Return a DataFrame with the scraping error message
+        return pd.DataFrame({"Scraping Error": [f"Error fetching the webpage: {e}"]})
+    except Exception as e:
+        # Return a DataFrame with the processing error message
+        return pd.DataFrame({"Processing Error": [f"An error occurred during entity recognition: {e}"]})
+
+# Create Gradio interface
+iface = gr.Interface(
+    fn=scrape_and_recognize_entities,
+    inputs=gr.Textbox(label="Enter Article URL"),
+    outputs=gr.DataFrame(label="Extracted Entities"),
+    title="Article Scraper and Entity Recognizer"
+)
+
+# Launch the interface
+iface.launch(debug=True)
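
For reference, below is a minimal standalone sketch of the same scrape-then-tag flow, runnable from a terminal to sanity-check the model before the Gradio UI is involved. Everything in it beyond what app.py already does is an assumption: the placeholder URL, the 30-second timeout, and the variable names are illustrative choices, not part of the committed file.

# Standalone smoke test (assumption, not part of the committed app.py).
# The URL is a placeholder and the request timeout is an illustrative choice.
import requests
from bs4 import BeautifulSoup
from transformers import pipeline

ner = pipeline("ner", model="NbAiLab/nb-bert-base-ner", aggregation_strategy="simple")

html = requests.get("https://example.com/article", timeout=30).content  # placeholder URL
paragraphs = BeautifulSoup(html, "html.parser").find_all("p")
text = "\n".join(p.get_text() for p in paragraphs)

# Same character-based chunking rule as chunk_text() in app.py (~512 chars per chunk).
chunks, current = [], ""
for word in text.split():
    if len(current) + len(word) + 1 > 512:
        chunks.append(current.strip())
        current = word + " "
    else:
        current += word + " "
if current:
    chunks.append(current.strip())

# Run NER on each chunk and print a few aggregated entities.
entities = [e for chunk in chunks for e in ner(chunk)]
print(f"Found {len(entities)} entities; first few:")
for e in entities[:5]:
    print(e["entity_group"], e["word"], round(float(e["score"]), 3))

The sketch only replaces the UI layer; the model, the paragraph extraction, and the chunking rule are the same as in app.py, so its output should match what the Space's DataFrame shows for the same page.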