import gradio as gr
from transformers import pipeline
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Initialize the NER pipeline once at module level so it is accessible to the
# Gradio function below. Make sure this runs before the interface is launched.
pipe = pipeline("ner", model="NbAiLab/nb-bert-base-ner", aggregation_strategy="simple")
print("✅ NER pipeline ready for Gradio")


def chunk_text(text, max_len=512):
    """Splits text into chunks of at most max_len characters, on word boundaries."""
    chunks = []
    current_chunk = ""
    words = text.split()
    for word in words:
        if len(current_chunk) + len(word) + 1 > max_len:
            chunks.append(current_chunk.strip())
            current_chunk = word + " "
        else:
            current_chunk += word + " "
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks


def clean_entity_word(word):
    """Removes the '##' prefix that marks subword tokens."""
    if word.startswith("##"):
        return word[2:]
    return word


def scrape_and_recognize_entities(url):
    """
    Scrapes text from a given URL and performs named entity recognition.
    Handles long text by chunking and cleans up subword tokens.
    """
    try:
        response = requests.get(url, timeout=20)
        response.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)

        soup = BeautifulSoup(response.content, 'html.parser')

        # Find the main article content (adjust based on common website structures)
        article_text = ""
        article_body = soup.find('article') or soup.find('main') or soup.find('div', class_='article-body')

        if article_body:
            paragraphs = article_body.find_all('p')
            article_text = '\n'.join([p.get_text() for p in paragraphs])
        else:
            # Fallback: use all paragraph text if no specific article body is found
            paragraphs = soup.find_all('p')
            article_text = '\n'.join([p.get_text() for p in paragraphs])

        if not article_text:
            return pd.DataFrame({"Error": ["Could not extract article text from the provided URL."]})

        # Chunk the text so each piece stays within the model's input limit
        chunks = chunk_text(article_text)

        all_results = []
        for chunk in chunks:
            # Run the NER pipeline on each chunk
            chunk_results = pipe(chunk)
            all_results.extend(chunk_results)

        # Clean up subword tokens in the results
        cleaned_results = []
        for entity in all_results:
            entity['word'] = clean_entity_word(entity['word'])
            cleaned_results.append(entity)

        # Convert combined results to a pandas DataFrame
        if cleaned_results:
            df_results = pd.DataFrame(cleaned_results)
        else:
            df_results = pd.DataFrame({"Info": ["No entities found in the article."]})

        return df_results

    except requests.exceptions.RequestException as e:
        # Return a DataFrame with the scraping error message
        return pd.DataFrame({"Scraping Error": [f"Error fetching the webpage: {e}"]})
    except Exception as e:
        # Return a DataFrame with the processing error message
        return pd.DataFrame({"Processing Error": [f"An error occurred during entity recognition: {e}"]})


# Create the Gradio interface
iface = gr.Interface(
    fn=scrape_and_recognize_entities,
    inputs=gr.Textbox(label="Enter Article URL"),
    outputs=gr.Dataframe(label="Extracted Entities"),
    title="Article Scraper and Entity Recognizer",
)

# Launch the interface
iface.launch(debug=True)
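
# --- Optional smoke test (not part of the original app) ---
# A minimal sketch for trying scrape_and_recognize_entities without the Gradio UI.
# The URL below is a placeholder assumption, not a value from the original code;
# replace it with a real article page. Uncomment and run before iface.launch()
# (the launch call blocks) to inspect the DataFrame directly:
#
# test_df = scrape_and_recognize_entities("https://example.com/some-article")
# print(test_df.head())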