import gradio as gr
from transformers import pipeline
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Initialize the NER pipeline once at module level so it is accessible to the
# Gradio function below. Make sure this runs before the interface is launched.
pipe = pipeline("ner", model="NbAiLab/nb-bert-base-ner", aggregation_strategy="simple")
print("✅ NER pipeline ready for Gradio")


def chunk_text(text, max_len=512):
    """Splits text into chunks of at most max_len characters, on word boundaries."""
    chunks = []
    current_chunk = ""
    words = text.split()
    for word in words:
        if len(current_chunk) + len(word) + 1 > max_len:
            chunks.append(current_chunk.strip())
            current_chunk = word + " "
        else:
            current_chunk += word + " "
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks


def clean_entity_word(word):
    """Removes the '##' prefix that marks subword tokens."""
    if word.startswith("##"):
        return word[2:]
    return word


def scrape_and_recognize_entities(url):
    """
    Scrapes text from a given URL and performs named entity recognition.
    Handles long text by chunking and cleans up subword tokens.
    """
    try:
        response = requests.get(url, timeout=20)
        response.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)

        soup = BeautifulSoup(response.content, 'html.parser')

        # Find the main article content (adjust based on common website structures)
        article_text = ""
        article_body = soup.find('article') or soup.find('main') or soup.find('div', class_='article-body')

        if article_body:
            paragraphs = article_body.find_all('p')
            article_text = '\n'.join([p.get_text() for p in paragraphs])
        else:
            # Fallback: use all paragraph text if no specific article body is found
            paragraphs = soup.find_all('p')
            article_text = '\n'.join([p.get_text() for p in paragraphs])

        if not article_text:
            return pd.DataFrame({"Error": ["Could not extract article text from the provided URL."]})

        # Chunk the text so each piece stays within the model's input limit
        chunks = chunk_text(article_text)

        all_results = []
        for chunk in chunks:
            # Run the NER pipeline on each chunk
            chunk_results = pipe(chunk)
            all_results.extend(chunk_results)

        # Clean up subword tokens in the results
        cleaned_results = []
        for entity in all_results:
            entity['word'] = clean_entity_word(entity['word'])
            cleaned_results.append(entity)

        # Convert combined results to a pandas DataFrame
        if cleaned_results:
            df_results = pd.DataFrame(cleaned_results)
        else:
            df_results = pd.DataFrame({"Info": ["No entities found in the article."]})

        return df_results

    except requests.exceptions.RequestException as e:
        # Return a DataFrame with the scraping error message
        return pd.DataFrame({"Scraping Error": [f"Error fetching the webpage: {e}"]})
    except Exception as e:
        # Return a DataFrame with the processing error message
        return pd.DataFrame({"Processing Error": [f"An error occurred during entity recognition: {e}"]})


# Create the Gradio interface
iface = gr.Interface(
    fn=scrape_and_recognize_entities,
    inputs=gr.Textbox(label="Enter Article URL"),
    outputs=gr.Dataframe(label="Extracted Entities"),
    title="Article Scraper and Entity Recognizer",
)

# Launch the interface
iface.launch(debug=True)
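
# --- Optional smoke test (not part of the original app) ---
# A minimal sketch for trying scrape_and_recognize_entities without the Gradio UI.
# The URL below is a placeholder assumption, not a value from the original code;
# replace it with a real article page. Uncomment and run before iface.launch()
# (the launch call blocks) to inspect the DataFrame directly:
#
# test_df = scrape_and_recognize_entities("https://example.com/some-article")
# print(test_df.head())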