Create app.py
app.py ADDED
@@ -0,0 +1,101 @@
+import gradio as gr
+from transformers import pipeline
+import pandas as pd
+import requests
+from bs4 import BeautifulSoup
+
+# Assume 'pipe' is already defined as your NER pipeline
+# pipe = pipeline("ner", model="NbAiLab/nb-bert-base-ner", aggregation_strategy="simple") # Make sure your pipeline is initialized before launching the interface
+
+# Initialize the NER pipeline here so it's accessible to the function
+pipe = pipeline("ner", model="NbAiLab/nb-bert-base-ner", aggregation_strategy="simple")
+print("✅ NER pipeline ready for Gradio")
+
+def chunk_text(text, max_len=512):
+    """Splits text into chunks of a maximum length."""
+    chunks = []
+    current_chunk = ""
+    words = text.split()
+    for word in words:
+        if len(current_chunk) + len(word) + 1 > max_len:
+            chunks.append(current_chunk.strip())
+            current_chunk = word + " "
+        else:
+            current_chunk += word + " "
+    if current_chunk:
+        chunks.append(current_chunk.strip())
+    return chunks
+
+def clean_entity_word(word):
+    """Removes '##' from subword tokens and handles potential spacing issues."""
+    if word.startswith("##"):
+        return word[2:]
+    return word
+
+def scrape_and_recognize_entities(url):
+    """
+    Scrapes text from a given URL and performs entity recognition.
+    Handles long text by chunking and cleans up subword tokens.
+    """
+    try:
+        response = requests.get(url)
+        response.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)
+
+        soup = BeautifulSoup(response.content, 'html.parser')
+        # Find the main article content (adjust based on common website structures)
+        article_text = ""
+        article_body = soup.find('article') or soup.find('main') or soup.find('div', class_='article-body')
+
+        if article_body:
+            paragraphs = article_body.find_all('p')
+            article_text = '\n'.join([p.get_text() for p in paragraphs])
+        else:
+            # Fallback: get all paragraph text if specific article body not found
+            paragraphs = soup.find_all('p')
+            article_text = '\n'.join([p.get_text() for p in paragraphs])
+
+        if not article_text:
+            return pd.DataFrame({"Error": ["Could not extract article text from the provided URL."]})
+
+        # Chunk the text
+        chunks = chunk_text(article_text)
+
+        all_results = []
+        for chunk in chunks:
+            # Use the existing NER pipeline on each chunk
+            chunk_results = pipe(chunk)
+            all_results.extend(chunk_results)
+
+        # Clean up subword tokens in the results
+        cleaned_results = []
+        for entity in all_results:
+            entity['word'] = clean_entity_word(entity['word'])
+            cleaned_results.append(entity)
+
+
+        # Convert combined results to a pandas DataFrame
+        if cleaned_results:
+            df_results = pd.DataFrame(cleaned_results)
+        else:
+            df_results = pd.DataFrame({"Info": ["No entities found in the article."]})
+
+
+        return df_results
+
+    except requests.exceptions.RequestException as e:
+        # Return a DataFrame with the scraping error message
+        return pd.DataFrame({"Scraping Error": [f"Error fetching the webpage: {e}"]})
+    except Exception as e:
+        # Return a DataFrame with the processing error message
+        return pd.DataFrame({"Processing Error": [f"An error occurred during entity recognition: {e}"]})
+
+# Create Gradio interface
+iface = gr.Interface(
+    fn=scrape_and_recognize_entities,
+    inputs=gr.Textbox(label="Enter Article URL"),
+    outputs=gr.DataFrame(label="Extracted Entities"),
+    title="Article Scraper and Entity Recognizer"
+)
+
+# Launch the interface
+iface.launch(debug=True)
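
For reference, below is a minimal standalone sketch of the same scrape-then-tag flow, runnable from a terminal to sanity-check the model before the Gradio UI is involved. Everything in it beyond what app.py already does is an assumption: the placeholder URL, the 30-second timeout, and the variable names are illustrative choices, not part of the committed file.

# Standalone smoke test (assumption, not part of the committed app.py).
# The URL is a placeholder and the request timeout is an illustrative choice.
import requests
from bs4 import BeautifulSoup
from transformers import pipeline

ner = pipeline("ner", model="NbAiLab/nb-bert-base-ner", aggregation_strategy="simple")

html = requests.get("https://example.com/article", timeout=30).content  # placeholder URL
paragraphs = BeautifulSoup(html, "html.parser").find_all("p")
text = "\n".join(p.get_text() for p in paragraphs)

# Same character-based chunking rule as chunk_text() in app.py (~512 chars per chunk).
chunks, current = [], ""
for word in text.split():
    if len(current) + len(word) + 1 > 512:
        chunks.append(current.strip())
        current = word + " "
    else:
        current += word + " "
if current:
    chunks.append(current.strip())

# Run NER on each chunk and print a few aggregated entities.
entities = [e for chunk in chunks for e in ner(chunk)]
print(f"Found {len(entities)} entities; first few:")
for e in entities[:5]:
    print(e["entity_group"], e["word"], round(float(e["score"]), 3))

The sketch only replaces the UI layer; the model, the paragraph extraction, and the chunking rule are the same as in app.py, so its output should match what the Space's DataFrame shows for the same page.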