# pesquisa.ai / app.py
# (Hugging Face Space page residue: uploaded by fschwartzer, "Update app.py",
#  commit 1e66579 verified — kept as a comment so the file remains valid Python.)
import subprocess
import sys
import os
def ensure_playwright_chromium():
    """Install the Playwright Chromium browser if it is not already present.

    Spawns ``python -m playwright install chromium`` in a subprocess and
    reports the outcome on stdout; installation failures are logged but
    never propagated to the caller.
    """
    print("Checking and installing Playwright Chromium browser if needed...")
    install_cmd = [sys.executable, "-m", "playwright", "install", "chromium"]
    try:
        subprocess.run(install_cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        # Non-zero exit from the installer: surface its captured output.
        print(f"Error during Playwright Chromium installation: {e}")
        print(f"Stdout: {e.stdout}")
        print(f"Stderr: {e.stderr}")
    except FileNotFoundError:
        print("Error: Python executable or Playwright module not found. Ensure your environment is set up correctly.")
    else:
        print("Playwright Chromium browser is ready.")
# Run at import time so Chromium is available before any Playwright usage below.
ensure_playwright_chromium()
import gradio as gr
import requests # Still used for other things potentially, or could be removed if not
from bs4 import BeautifulSoup
from bs4 import Comment
import re
import pandas as pd
import validators
# from selenium import webdriver # No longer used in the primary fetch path
# from selenium.webdriver.chrome.service import Service # No longer used
# from selenium.webdriver.chrome.options import Options # No longer used
# from selenium.webdriver.common.by import By # No longer used
import json
import time
import random
from playwright.sync_api import sync_playwright
from playwright_stealth import stealth_sync
# Constants
# Two-letter abbreviations of the 27 Brazilian federative units; used to
# anchor the address-matching regexes in PATTERNS below.
ESTADOS_BR = ["AC", "AL", "AM", "AP", "BA", "CE", "DF", "ES", "GO", "MA", "MG", "MS", "MT", "PA", "PB", "PE", "PI", "PR", "RJ", "RN", "RO", "RR", "RS", "SC", "SE", "SP", "TO"]
# Desktop browser user-agent strings; one is chosen at random per browser
# context so automated requests look less uniform.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36",
]
# Lookup tables mapping property-characteristic labels to numeric factors.
# In this file only the KEYS are used (as choices for the "Características
# Adicionais" dropdowns in the UI); '-' is the "not informed" sentinel and
# maps to itself.
# NOTE(review): the numeric values look like real-estate homogenization
# coefficients — confirm their intended use against the appraisal methodology.
# Topography: slope of the lot.
dict_topo = {
    'plano <5%': 1,
    'aclive_leve 5% e 30%': 0.95,
    'declive_leve 5% e 30%': 0.90,
    'aclive_acentuado >30%': 0.85,
    'declive_acentuado >30%': 0.80,
    '-': '-'
}
# Relief of the terrain.
dict_rel = {
    'plana': 1.1,
    'ondulada': 1.00,
    'montanhosa/acidentada': 0.80,
    '-': '-'
}
# Surface / flooding condition.
dict_sup = {
    'Seca': 1.00,
    'Região inundável mas não atingida': 0.90,
    'Região inundável mas atingida periodicamente': 0.70,
    'Alagada': 0.60,
    '-': '-'
}
# Land use / exploitation.
dict_apr = {
    'Loteamento': 1.00,
    'Indústria': 0.90,
    'Culturas': 0.80,
    '-': '-'
}
# Accessibility quality.
dict_ace = {
    'Ótima': 1.00,
    'Muito boa': 0.95,
    'Boa': 0.90,
    'Desfavorável': 0.80,
    'Má': 0.75,
    'Péssima': 0.70,
    '-': '-'
}
# Age ("id", presumably in years) combined with conservation state.
dict_ic = {
    'id<5_novo': 1.00,
    'id<5_bom': 0.95,
    'id<5_reparos simples': 0.80,
    'id<5_reparos importantes': 0.45,
    'id entre 6 e 10_novo': 0.95,
    'id entre 6 e 10_bom': 0.90,
    'id entre 6 e 10_reparos simples': 0.75,
    'id entre 6 e 10_reparos importantes': 0.40,
    'id entre 11 e 30_novo': 0.85,
    'id entre 11 e 30_bom': 0.80,
    'id entre 11 e 30_reparos simples': 0.65,
    'id entre 11 e 30_reparos importantes': 0.35,
    'id entre 31 e 50_novo': 0.55,
    'id entre 31 e 50_bom': 0.50,
    'id entre 31 e 50_reparos simples': 0.45,
    'id entre 31 e 50_reparos importantes': 0.25,
    'id>50_novo': 0.30,
    'id>50_bom': 0.20,
    'id>50_reparos simples': 0.15,
    'id>50_reparos importantes': 0.10,
    '-': '-'
}
# Construction standard.
dict_pad = {
    'Mínimo': 1.00,
    'Baixo': 1.15,
    'Normal c/ aspecto de baixo ': 1.30,
    'Normal forte predominância': 1.45,
    'Normal com aspecto de alto': 1.60,
    'Alto': 1.75,
    'Luxo': 1.90,
    '-': '-'
}
# Regex patterns for scraping listing attributes from free-form page text.
# For each field the patterns are tried in order and the first match wins
# (see extract_info); capture group 1, when present, is the returned value.
PATTERNS = {
    # Address: street-prefixed lines, labeled fields, or "Bairro - Cidade/UF".
    "endereco": [
        r'(?:rua|avenida|estrada|alameda|praça|travessa)\s+[\w\s\d\-,.]+?\b(?:' + "|".join(ESTADOS_BR) + r')\b',
        r'(?:endereço|localização|address)\s*:\s*[\w\s\d\-,.]+?\b(?:' + "|".join(ESTADOS_BR) + r')\b',
        r'[\w\s\d\-,.]+\b(?:' + "|".join(ESTADOS_BR) + r')\b',
        r'(?:rua|avenida|estrada|alameda|praça|travessa)\s+[\w\s\d\-,.]+',
        r'([A-Za-z\s\-.À-ú]+)\s*-\s*([A-Za-z\s\-.À-ú]+)/([A-Z]{2})', # Improved for accented chars
    ],
    # Lot frontage ("testada") in meters, e.g. "12m x 30" or "12 metros de frente".
    "testada": [
        r'(\d{1,3}(?:[.,]\d{1,2})?)m?\s*[xX]\s*\d',
        r'(\d{1,3}(?:[.,]\d{1,2})?)\s*metros?\s*de\s*frente',
        r'front\s*:\s*(\d{1,3}(?:[.,]\d{1,2})?)',
        r"Metragem\s*de\s*frente\s*:\s*(\d+,\d+|\d+)\s*m",
        r"(\d+,\d+|\d+)\s*m\s*[xX]\s*\d+"
    ],
    # Price in BRL (and one USD-labeled variant).
    "valor": [
        r'R\$[\s]*<strong>[\s]*([\d.]+,[\d]{2})[\s]*<\/strong>',
        r"Valor\s*do\s*imóvel\s*R\$\s*([\d.,]+)",
        r'valor\s*:\s*R\$\s*(\d[\d\.,]*)',
        r'preço\s*:\s*R\$\s*(\d[\d\.,]*)',
        r'value\s*:\s*\$(\d[\d\.,]*)',
        r'R\$\s*(\d[\d\.,]*)',
        # r's*(\d[\d\.,]*)', # This pattern is too broad and can cause issues, commented out
    ],
    # Total / built / private area in m² (most specific patterns first).
    "area": [
        r"(\d[\d.,]*)\s*m²\s*de\s*área\s*total",
        r"(\d[\d.,]*)\s*m²\s*de\s*área\s*construída",
        r"(\d[\d.,]*)\s*m²\s*de\s*área\s*privativa",
        r'área\s*(?:do\s*terreno|total)\s*[:–-]?\s*(\d[\d.,]*)\s*m²',
        r'área\s*construída\s*[:–-]?\s*(\d[\d.,]*)\s*m²',
        r'área\s*privativa\s*[:–-]?\s*(\d[\d.,]*)\s*m²',
        r'area\s*:\s*(\d[\d.,]+)\s*m²',
        r'size\s*:\s*(\d[\d.,]+)\s*sqft',
        r"(\d+,\d+|\d+)\s*m²\s*área total",
        r"(\d+,\d+|\d+)\s*m²\s*área privativa",
        r"(\d{1,3}(?:\.\d{3})*(?:,\d{2})?)(m²|ha)\s+área total do terreno",
        r'(\d[\d.,]+)\s*m²',
    ],
    # Bedrooms.
    "dormitorios": [
        r'(\d+)\s*(?:quarto|quartos|dormit[oó]rio|dormit[oó]rios|dorm\.|dorms\.)',
        r'bedroom[s]?\s*:\s*(\d+)',
        r'dormit[oó]rio[s]?\s*[:–-]?\s*(\d+)',
        r'quarto[s]?\s*[:–-]?\s*(\d+)',
    ],
    # Bathrooms.
    "banheiros": [
        r'(\d+)\s*(?:banheiro|banheiros|wc|banho|banhos)',
        r'bathroom[s]?\s*:\s*(\d+)',
    ],
    # Parking spots.
    "vagas": [
        r'(\d+)\s*(?:vaga|vagas)(?:\s*de\s*garagem)?',
        r'parking\s*:\s*(\d+)',
        r'garagem\s*[:–-]?\s*(\d+)\s*vaga',
    ],
    # Suites (en-suite bedrooms).
    "suites": [
        r'(\d+)\s*(?:su[ií]te|su[ií]tes)',
        r'suite[s]?\s*:\s*(\d+)',
    ],
}
def extract_info(text, patterns):
for pattern in patterns:
match = re.search(pattern, text, re.IGNORECASE)
if match:
if match.groups():
if len(match.groups()) == 3 and "/" in match.group(0) and pattern == r'([A-Za-z\s\-.À-ú]+)\s*-\s*([A-Za-z\s\-.À-ú]+)/([A-Z]{2})':
bairro = match.group(1).strip()
cidade = match.group(2).strip()
estado = match.group(3).strip()
return f"{bairro} - {cidade}/{estado}"
return match.group(1).strip()
return match.group(0).strip()
return '-'
def clean_text_for_testada(page_text):
    """Drop markdown image lines (``![Image...``) from *page_text*.

    Those lines often contain dimensions that would otherwise be picked up
    by the frontage ("testada") regexes.
    """
    image_line_re = r'^\s*\*?\s*\!\[Image[^\n]*\n?'
    return re.sub(image_line_re, '', page_text, flags=re.MULTILINE)
def extract_metadata(soup):
    """Collect all <meta> tags of *soup* into a dict.

    Keys are the lower-cased ``property`` (preferred) or ``name`` attribute;
    values are the tag's ``content``. Tags missing either part are skipped.
    """
    collected = {}
    for tag in soup.find_all("meta"):
        key = tag.get("property") or tag.get("name")
        if key and tag.get("content"):
            collected[key.lower()] = tag["content"]
    return collected
def extract_json_scripts(soup):
    """Parse every ``<script type="application/ld+json">`` into Python data.

    HTML comments embedded inside a script tag are filtered out before
    parsing; scripts whose remaining payload is not valid JSON are skipped.
    Returns a list of decoded objects (dicts or lists).
    """
    parsed = []
    for script in soup.find_all("script", type="application/ld+json"):
        try:
            pieces = [str(part) for part in script.contents
                      if not isinstance(part, Comment)]
            parsed.append(json.loads("".join(pieces)))
        except (json.JSONDecodeError, TypeError):
            continue
    return parsed
def get_main_page_text(soup_obj):
    """Extract readable text from the most content-rich region of the page.

    Preference order: semantic containers (<article>, <main>, role=main),
    then common "content-ish" <div>s that carry enough text, then <body>,
    and finally the whole document.
    """
    semantic_selectors = ["article", "main", "[role='main']"]
    div_selectors = [
        "div[class*='content']", "div[id*='content']",
        "div[class*='main']", "div[id*='main']",
        "div[class*='body']", "div[id*='body']",
        "div[class*='post']", "div[id*='post']",
        "div[class*='listing-details']",  # listing-page specific
    ]
    target = None
    for selector in semantic_selectors:
        candidate = soup_obj.select_one(selector)
        if candidate:
            target = candidate
            break
    if target is None:
        for selector in div_selectors:
            candidate = soup_obj.select_one(selector)
            # 200+ characters of stripped text is the heuristic for "real" content.
            if candidate and len(candidate.get_text(strip=True)) > 200:
                target = candidate
                break
    if target is not None:
        return target.get_text(separator='\n', strip=True)
    # No recognizable container — fall back to <body>, then the whole soup.
    body_tag = soup_obj.body
    if body_tag:
        return body_tag.get_text(separator='\n', strip=True)
    return soup_obj.get_text(separator='\n', strip=True)
def smart_fetch_url_info(user_input_url):
    """Fetch a listing page with headless Chromium and scrape key attributes.

    Returns a 4-tuple:
      * a one-row DataFrame with the extracted fields (empty on failure),
      * a markdown preview of the page text (or a Portuguese error message),
      * the extracted address string (or None on fetch failure),
      * the parsed numeric price (float, '-' when unparsable, None on failure).
    """
    if not validators.url(user_input_url):
        return pd.DataFrame(), "URL inválida. Verifique e tente novamente.", None, None
    html_content = None
    page_title = ""
    final_url = user_input_url  # To store the URL after potential redirects
    with sync_playwright() as p:
        browser = None
        context = None
        page = None
        try:
            # Flags chosen for containerized/headless environments plus
            # basic automation-detection evasion.
            browser = p.chromium.launch(
                headless=True,
                args=[
                    '--no-sandbox',
                    '--disable-setuid-sandbox',
                    '--disable-dev-shm-usage',
                    '--disable-accelerated-2d-canvas',
                    '--no-first-run',
                    '--no-zygote',
                    # '--single-process', # Potentially for very constrained environments, but can be less stable
                    '--disable-gpu',
                    '--disable-blink-features=AutomationControlled'
                ]
            )
            context = browser.new_context(
                user_agent=random.choice(USER_AGENTS),
                viewport={'width': 1920, 'height': 1080},
                locale='pt-BR',
                timezone_id='America/Sao_Paulo',
                # ignore_https_errors=True # Use with caution if SSL certs are an issue
            )
            context.set_default_navigation_timeout(60000)  # 60 seconds for navigation
            context.set_default_timeout(45000)  # 45 seconds for other operations
            page = context.new_page()
            stealth_sync(page)  # Apply stealth patches
            print(f"Fetching {user_input_url} with Playwright...")
            # Try to block common resource types that slow down loading and are not needed for text
            # page.route("**/*", lambda route: route.abort() if route.request.resource_type in {"image", "stylesheet", "font", "media"} else route.continue_())
            response = page.goto(user_input_url, wait_until="domcontentloaded", timeout=60000)
            # Wait for potential dynamic content loading, or specific selectors if known
            # Example: page.wait_for_selector("body", timeout=10000)
            time.sleep(random.uniform(4, 8))  # Allow time for JS execution
            html_content = page.content()
            page_title = page.title()
            final_url = page.url  # Get the URL after any redirects
        except Exception as e:
            print(f"Playwright fetching error for {user_input_url}: {e}")
            return pd.DataFrame(), f"Erro ao buscar com Playwright: {e}", None, None
        finally:
            # Close each resource independently; a failed close must not
            # prevent the others from being released.
            if page:
                try: page.close()
                except Exception as e_page: print(f"Error closing page: {e_page}")
            if context:
                try: context.close()
                except Exception as e_ctx: print(f"Error closing context: {e_ctx}")
            if browser:
                try: browser.close()
                except Exception as e_browser: print(f"Error closing browser: {e_browser}")
    if not html_content:
        return pd.DataFrame(), "Não foi possível obter o conteúdo da página com Playwright.", None, None
    soup = BeautifulSoup(html_content, "html.parser")
    metadata = extract_metadata(soup)
    json_scripts = extract_json_scripts(soup)
    page_text = get_main_page_text(soup)
    cleaned_text_for_testada_val = clean_text_for_testada(page_text)
    # Enhanced address extraction: prefer Schema.org JSON-LD data when present.
    endereco_json_parts = []
    if json_scripts:
        for script_data_list in json_scripts:  # json_scripts is a list of dicts/lists
            # Handle if script_data_list is a list itself (e.g., graph of entities)
            items_to_check = script_data_list if isinstance(script_data_list, list) else [script_data_list]
            for script_data in items_to_check:
                if isinstance(script_data, dict):
                    # Common Schema.org types for properties
                    if script_data.get("@type") in ["RealEstateListing", "Residence", "Place", "Apartment", "House", "SingleFamilyResidence"]:
                        addr_obj = script_data.get("address")
                        if isinstance(addr_obj, dict):
                            street = addr_obj.get("streetAddress", "")
                            locality = addr_obj.get("addressLocality", "")
                            region = addr_obj.get("addressRegion", "")
                            postal_code = addr_obj.get("postalCode", "")
                            country = addr_obj.get("addressCountry", "")
                            # Construct address string, prefer more specific parts
                            current_addr_parts = [p for p in [street, locality, region, postal_code, country] if p]
                            if current_addr_parts:
                                endereco_json_parts.append(", ".join(current_addr_parts))
                                break  # Found one, assume it's the primary
            if endereco_json_parts: break
    endereco_json_val = endereco_json_parts[0] if endereco_json_parts else None
    # NOTE(review): extract_info returns '-' (truthy) on failure, so the
    # fallbacks in this `or` chain are only reached via the '-' re-check below.
    endereco = (
        extract_info(page_text, PATTERNS["endereco"]) or
        metadata.get("og:street-address") or  # More specific OG tags
        metadata.get("og:locality") or
        metadata.get("og:region") or
        metadata.get("twitter:data1") or  # Sometimes address is here
        metadata.get("place:location:street_address") or  # Facebook Places
        endereco_json_val or
        metadata.get("og:address")  # Generic fallback
    )
    if not endereco or endereco == '-':  # If regex fails, try a broader search in metadata description
        description_text = metadata.get("description", "") + " " + metadata.get("og:description", "")
        endereco = extract_info(description_text, PATTERNS["endereco"])
    valor_str = extract_info(page_text, PATTERNS["valor"])
    area_str = extract_info(page_text, PATTERNS["area"])
    dorm_str = extract_info(page_text, PATTERNS["dormitorios"])
    banheiros_str = extract_info(page_text, PATTERNS["banheiros"])
    vagas_str = extract_info(page_text, PATTERNS["vagas"])
    suites_str = extract_info(page_text, PATTERNS["suites"])
    testada_str = extract_info(cleaned_text_for_testada_val, PATTERNS["testada"])
    # Data cleaning: normalize Brazilian "1.234,56" notation to float.
    try:
        valor_cleaned = str(valor_str).replace('R$', '').replace('.', '').replace(',', '.').strip()
        valor_float = float(valor_cleaned) if valor_cleaned and valor_cleaned.replace('.', '', 1).replace('-', '', 1).isdigit() else '-'
    except (AttributeError, ValueError):
        valor_float = '-'
    try:
        area_match = re.search(r'(\d[\d,.]*)', str(area_str))  # Get first number sequence
        area_cleaned = area_match.group(1).replace('.', '').replace(',', '.') if area_match else str(area_str)
        area_float = float(area_cleaned) if area_cleaned and area_cleaned.replace('.', '', 1).isdigit() else '-'
    except (AttributeError, ValueError):
        area_float = '-'
    def to_int_or_dash(val_str):
        # Coerce captures like "3 quartos" to int; '-' when no digits remain.
        if isinstance(val_str, (int, float)): return int(val_str)
        if isinstance(val_str, str):
            cleaned_val = re.sub(r'\D', '', val_str)  # Remove non-digits
            if cleaned_val.isdigit(): return int(cleaned_val)
        return '-'
    dorm_int = to_int_or_dash(dorm_str)
    banheiros_int = to_int_or_dash(banheiros_str)
    vagas_int = to_int_or_dash(vagas_str)
    suites_int = to_int_or_dash(suites_str)
    try:
        testada_cleaned = str(testada_str).replace(',', '.')
        testada_float = float(testada_cleaned) if testada_cleaned and testada_cleaned.replace('.', '', 1).isdigit() else '-'
    except (AttributeError, ValueError):
        testada_float = '-'
    # Preview capped at 10k chars to keep the markdown component responsive.
    result_text = f"**{page_title}**\n\nURL: {final_url}\n\n{page_text[:10000]}..."
    df = pd.DataFrame([{
        "Endereço": endereco if endereco and endereco != '-' else 'Não encontrado',
        "Área": area_float,
        "Testada": testada_float,
        "Valor": valor_float,
        "Dorm": dorm_int,
        "Banheiros": banheiros_int,
        "Vagas": vagas_int,
        "Suítes": suites_int,
        "URL": final_url,  # Use final URL after redirects
        "Topografia": '-', "Relevo": '-', "Superfície": '-', "Aproveitamento": '-',
        "Acessibilidade": '-', "Idade e conservação": '-', "Padrão construtivo": '-', "Outra característica": '-'
    }])
    return df, result_text, endereco, valor_float
# Acumulador (omitted for brevity, unchanged)
def _format_brl(value):
    """Format a number as Brazilian currency ('R$ 1.234,56'); strings ('-')
    pass through unchanged."""
    if isinstance(value, str):
        return value
    # Swap US-style separators into Brazilian style via a temporary token.
    return f'R$ {value:,.2f}'.replace(',', 'X').replace('.', ',').replace('X', '.')


def _row_unit_value(row):
    """Valor / Área rounded to 2 decimals, or '-' when either is unusable."""
    valor, area = row["Valor"], row["Área"]
    if isinstance(valor, (int, float)) and isinstance(area, (int, float)) and area != 0:
        return round(valor / area, 2)
    return '-'


def _series_stats(series):
    """(max, min, mean, median) of a numeric series; all '-' when empty/NaN."""
    if series.isna().all():
        return '-', '-', '-', '-'
    return series.max(), series.min(), series.mean(), series.median()


def adicionar_ao_acumulado(df_atual, df_acumulado, topo, rel, sup, apr, ace, ic, pad, var):
    """Append the current listing row to the accumulated database.

    Adds the dropdown characteristics, computes VU (unit value = Valor/Área),
    and rebuilds the summary statistics text.

    Returns (new accumulated df, same df for the table component, stats markdown).
    An empty *df_atual* returns the accumulator unchanged with empty stats.
    """
    if df_atual.empty:
        return df_acumulado, df_acumulado, ""
    df_novo = df_atual.copy()
    df_novo.insert(0, "Dado", f"Dado {len(df_acumulado)+1}")
    # Attach the dropdown-selected characteristics.
    df_novo["Topografia"] = topo
    df_novo["Relevo"] = rel
    df_novo["Superfície"] = sup
    df_novo["Aproveitamento"] = apr
    df_novo["Acessibilidade"] = ace
    df_novo["Idade e conservação"] = ic
    df_novo["Padrão construtivo"] = pad
    df_novo["Outra característica"] = var
    # Unit value; guards against '-' placeholders and zero area.
    df_novo["VU"] = df_novo.apply(_row_unit_value, axis=1)
    # Reorder columns so VU sits right after Valor.
    cols = df_novo.columns.tolist()
    valor_index = cols.index("Valor")
    cols.insert(valor_index + 1, cols.pop(cols.index("VU")))
    df_novo = df_novo[cols]
    df_acumulado = pd.concat([df_acumulado, df_novo], ignore_index=True)
    # Non-numeric entries ('-') become NaN and are excluded from the stats.
    valor_numeric = pd.to_numeric(df_acumulado["Valor"], errors="coerce")
    vu_numeric = pd.to_numeric(df_acumulado["VU"], errors="coerce")
    valor_max_val, valor_min_val, valor_medio_val, valor_mediana_val = _series_stats(valor_numeric)
    vu_max_val, vu_min_val, vu_medio_val, vu_mediana_val = _series_stats(vu_numeric)
    stats_text = (
        f"**Quantidade de dados:** {len(df_acumulado)}\n\n"
        f"**Valor:**\n"
        f"- Máximo: {_format_brl(valor_max_val)}\n"
        f"- Mínimo: {_format_brl(valor_min_val)}\n"
        f"- Média: {_format_brl(valor_medio_val)}\n"
        f"- Mediana: {_format_brl(valor_mediana_val)}\n\n"
        f"**VU (Valor Unitário - R$/m²):**\n"
        f"- Máximo: {_format_brl(vu_max_val)}\n"
        f"- Mínimo: {_format_brl(vu_min_val)}\n"
        f"- Média: {_format_brl(vu_medio_val)}\n"
        f"- Mediana: {_format_brl(vu_mediana_val)}"
    )
    return df_acumulado, df_acumulado, stats_text
# Limpeza do anúncio atual (omitted for brevity, unchanged)
def clear_fields():
    """Reset the "current listing" widgets to their blank defaults.

    Returns values for, in order: the URL textbox, the current-listing
    table, the extracted-text preview, the show-text checkbox, the
    screenshot image, and the eight characteristic inputs.
    """
    column_names = [
        "Endereço", "Área", "Testada", "Valor", "VU",
        "Dorm", "Banheiros", "Vagas", "Suítes", "URL",
        "Topografia", "Relevo", "Superfície", "Aproveitamento",
        "Acessibilidade", "Idade e conservação", "Padrão construtivo",
        "Outra característica",
    ]
    blank_table = pd.DataFrame({name: ['-'] for name in column_names})
    return ("", blank_table, "", False, None) + ('-',) * 8
# Função para excluir linhas com "Dado" vazio (omitted for brevity, unchanged)
def excluir_dados_vazios(df_acumulado):
    """Drop rows whose "Dado" cell is blank and renumber the survivors.

    Rows are kept when the stringified "Dado" value is non-empty after
    stripping; the remaining rows are re-labelled "Dado 1", "Dado 2", ....
    Returns the same frame twice (state value and table value).
    """
    if "Dado" not in df_acumulado.columns or df_acumulado.empty:
        return df_acumulado, df_acumulado
    non_blank = df_acumulado["Dado"].astype(str).str.strip() != ""
    kept = df_acumulado[non_blank].reset_index(drop=True)
    kept["Dado"] = [f"Dado {i+1}" for i in range(len(kept))]
    return kept, kept
def toggle_output_text(show_text, result_text):
    """Show or hide the extracted-text panel while preserving its content."""
    visibility_update = gr.update(visible=show_text)
    return visibility_update, result_text
# take_screenshot (omitted for brevity, unchanged from your last version)
def take_screenshot(url, endereco, valor, filename="screenshot.png"):
    """Capture a full-page PNG of *url* and return the saved file path.

    When both address and numeric price were extracted, the file is named
    from them; otherwise a timestamped default is used. Returns None for an
    invalid URL or on any Playwright failure.
    NOTE(review): the *filename* parameter is always overwritten inside the
    function, so its default is effectively unused.
    """
    if not url or not validators.url(url):
        print("URL inválida para screenshot.")
        return None  # Or a placeholder image path
    with sync_playwright() as p:
        browser = None
        context = None
        page = None
        try:
            browser = p.chromium.launch(
                headless=True,
                args=[
                    '--disable-blink-features=AutomationControlled',
                    '--no-sandbox',
                    '--disable-dev-shm-usage',
                    '--disable-gpu'
                ]
            )
            context = browser.new_context(
                user_agent=random.choice(USER_AGENTS),
                viewport={'width': 1920, 'height': 1080},
                locale='pt-BR',
                timezone_id='America/Sao_Paulo',
            )
            page = context.new_page()
            stealth_sync(page)
            print(f"Taking screenshot of {url}")
            page.goto(url, timeout=60000, wait_until="networkidle")  # networkidle might be better for screenshots
            # time.sleep(random.uniform(3,6)) # Extra wait if networkidle isn't enough
            if endereco and valor and str(endereco).strip() != '-' and isinstance(valor, (int, float)):
                # Strip filesystem-hostile characters and cap the length.
                safe_endereco = re.sub(r'[\\/*?:"<>|]', "", str(endereco)).replace(' ', '_')[:100]  # Limit length
                filename = f"{safe_endereco}_R${valor:.2f}.png".replace(",", "_")  # Use underscore for comma
            else:
                # Generate a more unique default name
                timestamp = time.strftime("%Y%m%d-%H%M%S")
                filename = f"screenshot_{timestamp}.png"
            # Ensure the directory for screenshots exists if not saving to current dir
            # os.makedirs("screenshots", exist_ok=True)
            # screenshot_path = os.path.join("screenshots", filename)
            # For Gradio, relative path is fine if it's served correctly
            screenshot_path = filename
            page.screenshot(path=screenshot_path, full_page=True)
            print(f"Screenshot salvo como {screenshot_path}")
            return screenshot_path
        except Exception as e:
            print(f"Error during screenshot for {url}: {e}")
            return None
        finally:
            # Best-effort teardown; failures here must not mask the result.
            if page:
                try: page.close()
                except Exception: pass
            if context:
                try: context.close()
                except Exception: pass
            if browser:
                try: browser.close()
                except Exception: pass
# Theme
theme = gr.themes.Default(primary_hue=gr.themes.colors.yellow, secondary_hue=gr.themes.colors.blue)  # Using Default for broader compatibility
# Main application: left column holds the action buttons, right column the
# current-listing table; the accumulated database and stats sit below.
with gr.Blocks(theme=theme, css="""
/* CSS (omitted for brevity, unchanged) */
@import url('https://fonts.googleapis.com/css2?family=Quicksand:wght@400;700&display=swap');
.small-file-upload { height: 65px; text-align: center; color: black; border: 2px solid black !important; box-sizing: border-box; }
.small-file-upload span { display: none; }
.small-file-upload input[type="file"] { color: black; }
.small-file-upload label { color: black; }
.small span { font-size: 1.0em; white-space: nowrap; width: auto; display: inline-block; } /* Adjusted font size */
/* .small span dados { font-size: 0.8em; white-space: nowrap; width: auto; display: inline-block; } */ /* This specific selector might not work as intended, keep it general */
h1 { text-align: center; font-family: 'Quicksand', sans-serif; font-weight: 700; margin: 20px 0; color: black; }
.map-container { height: 600px !important; margin: 0; padding: 0; }
""") as app:
    # Brand header.
    gr.Markdown(
        "<div style='font-size: 1.5em; text-align: center;'>"  # Centered title
        "<span style='color: gray;'>Pesquisa.AI - </span>"
        "<span style='color: gray;'>aval</span>"
        "<span style='color: #FFD700;'>ia</span>"  # Gold color
        "<span style='color: gray;'>.se</span>"
        "</div>"
    )
    # Session-scoped accumulated database backing the "BANCO DE DADOS" table.
    df_acumulado_state = gr.State(pd.DataFrame(columns=["Dado", "Endereço", "Área", "Testada", "Valor", "VU", "Dorm", "Banheiros", "Vagas", "Suítes", "URL", "Topografia", "Relevo", "Superfície", "Aproveitamento", "Acessibilidade", "Idade e conservação", "Padrão construtivo", "Outra característica"]))
    with gr.Row():
        with gr.Column(scale=1, min_width=250):  # Added min_width
            user_input = gr.Textbox(label="Cole a URL do anúncio aqui")
            gr.Markdown("**ANÚNCIO ATUAL**")
            submit_button = gr.Button("1. Carregar Dados do Anúncio", variant="primary", elem_id="load_button")
            screenshot_button = gr.Button("2. Print da Página do Anúncio", elem_id="print_button")
            clear_button = gr.Button("Limpar Campos Atuais", variant="stop", elem_id="clear_button")  # Changed variant
            gr.Markdown("**BANCO DE DADOS**")
            add_data = gr.Button("3. Adicionar ao Banco", variant="primary", elem_id="add_db_button")
            delete_data = gr.Button("Excluir Linha Selecionada do Banco", elem_id="delete_db_button")
            # infos = gr.Button("Geolocalização", elem_id="geo_button") # If you implement this
        with gr.Column(scale=4):  # Adjusted scale
            # One-row table showing the listing just scraped.
            output_table = gr.Dataframe(
                headers=["Endereço", "Área", "Testada", "Valor", "VU", "Dorm", "Banheiros", "Vagas", "Suítes", "URL", "Topografia", "Relevo", "Superfície", "Aproveitamento", "Acessibilidade", "Idade e conservação", "Padrão construtivo", "Outra característica"],
                datatype=["str", "number", "number", "number", "number", "number", "number", "number", "number", "str", "str", "str", "str", "str", "str", "str", "str", "str"],
                interactive=True,
                row_count=(1, "fixed"),  # Let row_count determine height for single row table
                wrap=True
            )
            with gr.Accordion("Características Adicionais (para adicionar ao banco)", open=False):
                with gr.Row():
                    topo_drop = gr.Dropdown(label="Topografia", choices=list(dict_topo.keys()), value='-', interactive=True)
                    rel_drop = gr.Dropdown(label="Relevo", choices=list(dict_rel.keys()), value='-', interactive=True)
                    sup_drop = gr.Dropdown(label="Superfície", choices=list(dict_sup.keys()), value='-', interactive=True)
                    apr_drop = gr.Dropdown(label="Aproveitamento", choices=list(dict_apr.keys()), value='-', interactive=True)
                with gr.Row():
                    ace_drop = gr.Dropdown(label="Acessibilidade", choices=list(dict_ace.keys()), value='-', interactive=True)
                    ic_drop = gr.Dropdown(label="Idade e conservação", choices=list(dict_ic.keys()), value='-', interactive=True)
                    pad_drop = gr.Dropdown(label="Padrão construtivo", choices=list(dict_pad.keys()), value='-', interactive=True)
                    var_drop = gr.Textbox(label="Outra característica", value='-', interactive=True)
            with gr.Row():
                show_text_checkbox = gr.Checkbox(label="Mostrar Texto Extraído do Anúncio", value=False, scale=4)
                screenshot_output = gr.Image(label="Print do Anúncio", type="filepath", height=200, scale=1, interactive=False)  # Made non-interactive
            output_text = gr.Markdown(label="Texto Extraído (Pré-visualização)", visible=False)
    gr.Markdown("---")  # Separator
    gr.Markdown("**BANCO DE DADOS ACUMULADOS**")
    acumulado_table = gr.Dataframe(
        headers=["Dado", "Endereço", "Área", "Testada", "Valor", "VU", "Dorm", "Banheiros", "Vagas", "Suítes", "URL", "Topografia", "Relevo", "Superfície", "Aproveitamento", "Acessibilidade", "Idade e conservação", "Padrão construtivo", "Outra característica"],
        datatype=["str", "str", "number", "number", "number", "number", "number", "number", "number", "number", "str", "str", "str", "str", "str", "str", "str", "str", "str"],
        interactive=True,  # Set to true if you want to edit or select rows for deletion
        # height=400, # Removed height parameter
        wrap=True
    )
    # If 'acumulado_table' needs a specific height, try CSS or check Gradio docs for your version's method.
    # For Gradio 3.x: acumulado_table.style(height=400) might work if needed after the definition.
    stats_output = gr.Markdown(label="Estatísticas dos Dados Acumulados")
    # Button wiring
    extracted_endereco_state = gr.State()
    extracted_valor_state = gr.State()  # This will store the numerical value for screenshot naming
    submit_button.click(
        smart_fetch_url_info,  # Use the new Playwright-based function
        inputs=user_input,
        outputs=[output_table, output_text, extracted_endereco_state, extracted_valor_state]
    )
    clear_button.click(
        clear_fields,
        outputs=[user_input, output_table, output_text, show_text_checkbox, screenshot_output, topo_drop, rel_drop, sup_drop, apr_drop, ace_drop, ic_drop, pad_drop, var_drop]
    )
    show_text_checkbox.change(
        toggle_output_text,
        inputs=[show_text_checkbox, output_text],  # Pass current output_text to preserve it
        outputs=[output_text, output_text]  # Update visibility and content
    )
    screenshot_button.click(
        take_screenshot,
        inputs=[user_input, extracted_endereco_state, extracted_valor_state],  # user_input is the original URL
        outputs=screenshot_output
    )
    add_data.click(
        adicionar_ao_acumulado,
        inputs=[output_table, df_acumulado_state, topo_drop, rel_drop, sup_drop, apr_drop, ace_drop, ic_drop, pad_drop, var_drop],
        outputs=[df_acumulado_state, acumulado_table, stats_output]
    )
    delete_data.click(
        excluir_dados_vazios,  # This function needs to know which row to delete
        inputs=acumulado_table,  # Pass the editable dataframe
        outputs=[df_acumulado_state, acumulado_table]  # Update state and table
    )
    # Modify excluir_dados_vazios to work with selected row from editable DataFrame
    # For row deletion, you'd typically make 'acumulado_table' interactive and get selected index
    # The current 'excluir_dados_vazios' removes ALL rows with empty "Dado" field, not selected ones.
    # To delete a selected row, 'acumulado_table' needs to be interactive and you'd get the selection event.
    # For simplicity, the current `excluir_dados_vazios` is kept, but it's not ideal for targeted deletion.
    # A more advanced deletion would require handling selection events from the DataFrame.
app.launch(share=True, debug=True)  # Added debug=True for more detailed logs during development