fosse committed
Commit 1a9e878 · verified · 1 Parent(s): b79b577

Create app.py

Files changed (1): app.py +101 -0
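
The new file imports gradio, transformers, pandas, requests, and BeautifulSoup. A plausible requirements.txt for the Space is sketched below; it is an assumption, since the commit adds only app.py (torch is included as the usual transformers backend):

gradio
transformers
torch
pandas
requests
beautifulsoup4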
app.py ADDED
import gradio as gr
from transformers import pipeline
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Initialize the NER pipeline at module level so it is accessible to the
# Gradio handler below.
pipe = pipeline("ner", model="NbAiLab/nb-bert-base-ner", aggregation_strategy="simple")
print("✅ NER pipeline ready for Gradio")

def chunk_text(text, max_len=512):
    """Split text into chunks of at most max_len characters.

    Note: max_len counts characters, which only approximates the model's
    512-token input limit; a token-aware variant is sketched after this
    function.
    """
    chunks = []
    current_chunk = ""
    for word in text.split():
        if len(current_chunk) + len(word) + 1 > max_len:
            chunks.append(current_chunk.strip())
            current_chunk = word + " "
        else:
            current_chunk += word + " "
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks
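
# The character-based chunking above only approximates the tokenizer's view
# of the text. The sketch below is an optional, tokenizer-aware alternative,
# not used by the app as written: it reuses the pipeline's own tokenizer so
# the 512-token model limit is counted exactly. `chunk_text_by_tokens` and
# `max_tokens` are illustrative names; max_tokens leaves headroom for the
# [CLS]/[SEP] special tokens.
def chunk_text_by_tokens(text, max_tokens=500):
    tokenizer = pipe.tokenizer  # reuse the NER pipeline's tokenizer
    chunks, current, current_len = [], [], 0
    for word in text.split():
        n = len(tokenizer.tokenize(word))
        if current and current_len + n > max_tokens:
            chunks.append(" ".join(current))
            current, current_len = [], 0
        current.append(word)
        current_len += n
    if current:
        chunks.append(" ".join(current))
    return chunks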

def clean_entity_word(word):
    """Remove the '##' prefix that marks WordPiece subword tokens.

    With aggregation_strategy="simple" most subwords are already merged,
    so this acts as a safety net for any stray fragments.
    """
    if word.startswith("##"):
        return word[2:]
    return word
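
# Illustration with hypothetical tokens:
#   clean_entity_word("##sen") -> "sen"
#   clean_entity_word("Oslo")  -> "Oslo" (unchanged)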

def scrape_and_recognize_entities(url):
    """Scrape article text from a URL and run named-entity recognition.

    Handles long text by chunking and cleans up stray subword tokens.
    """
    try:
        response = requests.get(url, timeout=30)  # timeout avoids hanging on slow hosts
        response.raise_for_status()  # raise an HTTPError for bad responses (4xx or 5xx)

        soup = BeautifulSoup(response.content, 'html.parser')
        # Find the main article content (adjust based on common website structures)
        article_body = soup.find('article') or soup.find('main') or soup.find('div', class_='article-body')
        if article_body:
            paragraphs = article_body.find_all('p')
        else:
            # Fallback: use every paragraph if no recognizable article container is found
            paragraphs = soup.find_all('p')
        article_text = '\n'.join(p.get_text() for p in paragraphs)

        if not article_text:
            return pd.DataFrame({"Error": ["Could not extract article text from the provided URL."]})

        # Run the NER pipeline on each chunk and collect the results
        all_results = []
        for chunk in chunk_text(article_text):
            all_results.extend(pipe(chunk))

        # Clean up subword tokens in the results
        for entity in all_results:
            entity['word'] = clean_entity_word(entity['word'])

        if all_results:
            return pd.DataFrame(all_results)
        return pd.DataFrame({"Info": ["No entities found in the article."]})

    except requests.exceptions.RequestException as e:
        # Report scraping errors in the output table
        return pd.DataFrame({"Scraping Error": [f"Error fetching the webpage: {e}"]})
    except Exception as e:
        # Report processing errors in the output table
        return pd.DataFrame({"Processing Error": [f"An error occurred during entity recognition: {e}"]})

# Create the Gradio interface
iface = gr.Interface(
    fn=scrape_and_recognize_entities,
    inputs=gr.Textbox(label="Enter Article URL"),
    outputs=gr.DataFrame(label="Extracted Entities"),
    title="Article Scraper and Entity Recognizer",
)

# Launch the interface
iface.launch(debug=True)
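
For a quick check outside the UI, the handler can be called directly. A minimal smoke test, assuming the dependencies above are installed; the URL is a placeholder, not one taken from the commit:

df = scrape_and_recognize_entities("https://example.com/some-article")  # placeholder URL
print(df.head())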