import subprocess import sys import os def ensure_playwright_chromium(): """Ensures Playwright Chromium browser is installed.""" try: print("Checking and installing Playwright Chromium browser if needed...") subprocess.run( [sys.executable, "-m", "playwright", "install", "chromium"], check=True, capture_output=True, text=True ) print("Playwright Chromium browser is ready.") except subprocess.CalledProcessError as e: print(f"Error during Playwright Chromium installation: {e}") print(f"Stdout: {e.stdout}") print(f"Stderr: {e.stderr}") except FileNotFoundError: print("Error: Python executable or Playwright module not found. Ensure your environment is set up correctly.") ensure_playwright_chromium() import gradio as gr import requests # Still used for other things potentially, or could be removed if not from bs4 import BeautifulSoup from bs4 import Comment import re import pandas as pd import validators # from selenium import webdriver # No longer used in the primary fetch path # from selenium.webdriver.chrome.service import Service # No longer used # from selenium.webdriver.chrome.options import Options # No longer used # from selenium.webdriver.common.by import By # No longer used import json import time import random from playwright.sync_api import sync_playwright from playwright_stealth import stealth_sync # Constantes ESTADOS_BR = ["AC", "AL", "AM", "AP", "BA", "CE", "DF", "ES", "GO", "MA", "MG", "MS", "MT", "PA", "PB", "PE", "PI", "PR", "RJ", "RN", "RO", "RR", "RS", "SC", "SE", "SP", "TO"] USER_AGENTS = [ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15", "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36", ] # Dicionários (omitted for brevity, they are unchanged) dict_topo = { 'plano <5%': 1, 'aclive_leve 5% e 30%': 0.95, 'declive_leve 5% e 30%': 0.90, 'aclive_acentuado >30%': 0.85, 'declive_acentuado >30%': 0.80, '-': '-' } dict_rel = { 'plana': 1.1, 'ondulada': 1.00, 'montanhosa/acidentada': 0.80, '-': '-' } dict_sup = { 'Seca': 1.00, 'Região inundável mas não atingida': 0.90, 'Região inundável mas atingida periodicamente': 0.70, 'Alagada': 0.60, '-': '-' } dict_apr = { 'Loteamento': 1.00, 'Indústria': 0.90, 'Culturas': 0.80, '-': '-' } dict_ace = { 'Ótima': 1.00, 'Muito boa': 0.95, 'Boa': 0.90, 'Desfavorável': 0.80, 'Má': 0.75, 'Péssima': 0.70, '-': '-' } dict_ic = { 'id<5_novo': 1.00, 'id<5_bom': 0.95, 'id<5_reparos simples': 0.80, 'id<5_reparos importantes': 0.45, 'id entre 6 e 10_novo': 0.95, 'id entre 6 e 10_bom': 0.90, 'id entre 6 e 10_reparos simples': 0.75, 'id entre 6 e 10_reparos importantes': 0.40, 'id entre 11 e 30_novo': 0.85, 'id entre 11 e 30_bom': 0.80, 'id entre 11 e 30_reparos simples': 0.65, 'id entre 11 e 30_reparos importantes': 0.35, 'id entre 31 e 50_novo': 0.55, 'id entre 31 e 50_bom': 0.50, 'id entre 31 e 50_reparos simples': 0.45, 'id entre 31 e 50_reparos importantes': 0.25, 'id>50_novo': 0.30, 'id>50_bom': 0.20, 'id>50_reparos simples': 0.15, 'id>50_reparos importantes': 0.10, '-': '-' } dict_pad = { 'Mínimo': 1.00, 'Baixo': 1.15, 'Normal c/ aspecto de baixo ': 1.30, 'Normal forte predominância': 1.45, 'Normal com aspecto de alto': 1.60, 'Alto': 1.75, 'Luxo': 1.90, '-': '-' } PATTERNS = { "endereco": [ r'(?:rua|avenida|estrada|alameda|praça|travessa)\s+[\w\s\d\-,.]+?\b(?:' + "|".join(ESTADOS_BR) + r')\b', r'(?:endereço|localização|address)\s*:\s*[\w\s\d\-,.]+?\b(?:' + "|".join(ESTADOS_BR) + r')\b', r'[\w\s\d\-,.]+\b(?:' + "|".join(ESTADOS_BR) + r')\b', r'(?:rua|avenida|estrada|alameda|praça|travessa)\s+[\w\s\d\-,.]+', r'([A-Za-z\s\-.À-ú]+)\s*-\s*([A-Za-z\s\-.À-ú]+)/([A-Z]{2})', # Improved for accented chars ], "testada": [ r'(\d{1,3}(?:[.,]\d{1,2})?)m?\s*[xX]\s*\d', r'(\d{1,3}(?:[.,]\d{1,2})?)\s*metros?\s*de\s*frente', r'front\s*:\s*(\d{1,3}(?:[.,]\d{1,2})?)', r"Metragem\s*de\s*frente\s*:\s*(\d+,\d+|\d+)\s*m", r"(\d+,\d+|\d+)\s*m\s*[xX]\s*\d+" ], "valor": [ r'R\$[\s]*[\s]*([\d.]+,[\d]{2})[\s]*<\/strong>', r"Valor\s*do\s*imóvel\s*R\$\s*([\d.,]+)", r'valor\s*:\s*R\$\s*(\d[\d\.,]*)', r'preço\s*:\s*R\$\s*(\d[\d\.,]*)', r'value\s*:\s*\$(\d[\d\.,]*)', r'R\$\s*(\d[\d\.,]*)', # r's*(\d[\d\.,]*)', # This pattern is too broad and can cause issues, commented out ], "area": [ r"(\d[\d.,]*)\s*m²\s*de\s*área\s*total", r"(\d[\d.,]*)\s*m²\s*de\s*área\s*construída", r"(\d[\d.,]*)\s*m²\s*de\s*área\s*privativa", r'área\s*(?:do\s*terreno|total)\s*[:–-]?\s*(\d[\d.,]*)\s*m²', r'área\s*construída\s*[:–-]?\s*(\d[\d.,]*)\s*m²', r'área\s*privativa\s*[:–-]?\s*(\d[\d.,]*)\s*m²', r'area\s*:\s*(\d[\d.,]+)\s*m²', r'size\s*:\s*(\d[\d.,]+)\s*sqft', r"(\d+,\d+|\d+)\s*m²\s*área total", r"(\d+,\d+|\d+)\s*m²\s*área privativa", r"(\d{1,3}(?:\.\d{3})*(?:,\d{2})?)(m²|ha)\s+área total do terreno", r'(\d[\d.,]+)\s*m²', ], "dormitorios": [ r'(\d+)\s*(?:quarto|quartos|dormit[oó]rio|dormit[oó]rios|dorm\.|dorms\.)', r'bedroom[s]?\s*:\s*(\d+)', r'dormit[oó]rio[s]?\s*[:–-]?\s*(\d+)', r'quarto[s]?\s*[:–-]?\s*(\d+)', ], "banheiros": [ r'(\d+)\s*(?:banheiro|banheiros|wc|banho|banhos)', r'bathroom[s]?\s*:\s*(\d+)', ], "vagas": [ r'(\d+)\s*(?:vaga|vagas)(?:\s*de\s*garagem)?', r'parking\s*:\s*(\d+)', r'garagem\s*[:–-]?\s*(\d+)\s*vaga', ], "suites": [ r'(\d+)\s*(?:su[ií]te|su[ií]tes)', r'suite[s]?\s*:\s*(\d+)', ], } def extract_info(text, patterns): for pattern in patterns: match = re.search(pattern, text, re.IGNORECASE) if match: if match.groups(): if len(match.groups()) == 3 and "/" in match.group(0) and pattern == r'([A-Za-z\s\-.À-ú]+)\s*-\s*([A-Za-z\s\-.À-ú]+)/([A-Z]{2})': bairro = match.group(1).strip() cidade = match.group(2).strip() estado = match.group(3).strip() return f"{bairro} - {cidade}/{estado}" return match.group(1).strip() return match.group(0).strip() return '-' def clean_text_for_testada(page_text): return re.sub(r'^\s*\*?\s*\!\[Image[^\n]*\n?', '', page_text, flags=re.MULTILINE) def extract_metadata(soup): metadata = {} for meta in soup.find_all("meta"): name_prop = meta.get("property") or meta.get("name") if name_prop and meta.get("content"): metadata[name_prop.lower()] = meta["content"] return metadata def extract_json_scripts(soup): scripts = soup.find_all("script", type="application/ld+json") data = [] for script in scripts: try: # Remove comments within the script tag if any script_content = "" for content_part in script.contents: if isinstance(content_part, Comment): continue script_content += str(content_part) json_data = json.loads(script_content) data.append(json_data) except (json.JSONDecodeError, TypeError): continue return data def get_main_page_text(soup_obj): # Try to find common main content containers main_containers = ["article", "main", "[role='main']"] # Common semantic tags/attributes # Less specific, but common for content blocks div_selectors = [ "div[class*='content']", "div[id*='content']", "div[class*='main']", "div[id*='main']", "div[class*='body']", "div[id*='body']", "div[class*='post']", "div[id*='post']", "div[class*='listing-details']" # Specific to listings ] content_element = None for selector in main_containers: element = soup_obj.select_one(selector) if element: content_element = element break if not content_element: for selector in div_selectors: element = soup_obj.select_one(selector) if element and len(element.get_text(strip=True)) > 200: # Heuristic for meaningful content content_element = element break if content_element: return content_element.get_text(separator='\n', strip=True) # Fallback to body if no specific main content found body_tag = soup_obj.body if body_tag: return body_tag.get_text(separator='\n', strip=True) return soup_obj.get_text(separator='\n', strip=True) # Ultimate fallback def smart_fetch_url_info(user_input_url): if not validators.url(user_input_url): return pd.DataFrame(), "URL inválida. Verifique e tente novamente.", None, None html_content = None page_title = "" final_url = user_input_url # To store the URL after potential redirects with sync_playwright() as p: browser = None context = None page = None try: browser = p.chromium.launch( headless=True, args=[ '--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage', '--disable-accelerated-2d-canvas', '--no-first-run', '--no-zygote', # '--single-process', # Potentially for very constrained environments, but can be less stable '--disable-gpu', '--disable-blink-features=AutomationControlled' ] ) context = browser.new_context( user_agent=random.choice(USER_AGENTS), viewport={'width': 1920, 'height': 1080}, locale='pt-BR', timezone_id='America/Sao_Paulo', # ignore_https_errors=True # Use with caution if SSL certs are an issue ) context.set_default_navigation_timeout(60000) # 60 seconds for navigation context.set_default_timeout(45000) # 45 seconds for other operations page = context.new_page() stealth_sync(page) # Apply stealth patches print(f"Fetching {user_input_url} with Playwright...") # Try to block common resource types that slow down loading and are not needed for text # page.route("**/*", lambda route: route.abort() if route.request.resource_type in {"image", "stylesheet", "font", "media"} else route.continue_()) response = page.goto(user_input_url, wait_until="domcontentloaded", timeout=60000) # Wait for potential dynamic content loading, or specific selectors if known # Example: page.wait_for_selector("body", timeout=10000) time.sleep(random.uniform(4, 8)) # Allow time for JS execution html_content = page.content() page_title = page.title() final_url = page.url # Get the URL after any redirects except Exception as e: print(f"Playwright fetching error for {user_input_url}: {e}") return pd.DataFrame(), f"Erro ao buscar com Playwright: {e}", None, None finally: if page: try: page.close() except Exception as e_page: print(f"Error closing page: {e_page}") if context: try: context.close() except Exception as e_ctx: print(f"Error closing context: {e_ctx}") if browser: try: browser.close() except Exception as e_browser: print(f"Error closing browser: {e_browser}") if not html_content: return pd.DataFrame(), "Não foi possível obter o conteúdo da página com Playwright.", None, None soup = BeautifulSoup(html_content, "html.parser") metadata = extract_metadata(soup) json_scripts = extract_json_scripts(soup) page_text = get_main_page_text(soup) cleaned_text_for_testada_val = clean_text_for_testada(page_text) # Renamed variable # Enhanced Endereço Extraction endereco_json_parts = [] if json_scripts: for script_data_list in json_scripts: # json_scripts is a list of dicts/lists # Handle if script_data_list is a list itself (e.g., graph of entities) items_to_check = script_data_list if isinstance(script_data_list, list) else [script_data_list] for script_data in items_to_check: if isinstance(script_data, dict): # Common Schema.org types for properties if script_data.get("@type") in ["RealEstateListing", "Residence", "Place", "Apartment", "House", "SingleFamilyResidence"]: addr_obj = script_data.get("address") if isinstance(addr_obj, dict): street = addr_obj.get("streetAddress", "") locality = addr_obj.get("addressLocality", "") region = addr_obj.get("addressRegion", "") postal_code = addr_obj.get("postalCode", "") country = addr_obj.get("addressCountry", "") # Construct address string, prefer more specific parts current_addr_parts = [p for p in [street, locality, region, postal_code, country] if p] if current_addr_parts: endereco_json_parts.append(", ".join(current_addr_parts)) break # Found one, assume it's the primary if endereco_json_parts: break endereco_json_val = endereco_json_parts[0] if endereco_json_parts else None endereco = ( extract_info(page_text, PATTERNS["endereco"]) or metadata.get("og:street-address") or # More specific OG tags metadata.get("og:locality") or metadata.get("og:region") or metadata.get("twitter:data1") or # Sometimes address is here metadata.get("place:location:street_address") or # Facebook Places endereco_json_val or metadata.get("og:address") # Generic fallback ) if not endereco or endereco == '-': # If regex fails, try a broader search in metadata description description_text = metadata.get("description", "") + " " + metadata.get("og:description", "") endereco = extract_info(description_text, PATTERNS["endereco"]) valor_str = extract_info(page_text, PATTERNS["valor"]) area_str = extract_info(page_text, PATTERNS["area"]) dorm_str = extract_info(page_text, PATTERNS["dormitorios"]) banheiros_str = extract_info(page_text, PATTERNS["banheiros"]) vagas_str = extract_info(page_text, PATTERNS["vagas"]) suites_str = extract_info(page_text, PATTERNS["suites"]) testada_str = extract_info(cleaned_text_for_testada_val, PATTERNS["testada"]) # Data Cleaning try: valor_cleaned = str(valor_str).replace('R$', '').replace('.', '').replace(',', '.').strip() valor_float = float(valor_cleaned) if valor_cleaned and valor_cleaned.replace('.', '', 1).replace('-', '', 1).isdigit() else '-' except (AttributeError, ValueError): valor_float = '-' try: area_match = re.search(r'(\d[\d,.]*)', str(area_str)) # Get first number sequence area_cleaned = area_match.group(1).replace('.', '').replace(',', '.') if area_match else str(area_str) area_float = float(area_cleaned) if area_cleaned and area_cleaned.replace('.', '', 1).isdigit() else '-' except (AttributeError, ValueError): area_float = '-' def to_int_or_dash(val_str): if isinstance(val_str, (int, float)): return int(val_str) if isinstance(val_str, str): cleaned_val = re.sub(r'\D', '', val_str) # Remove non-digits if cleaned_val.isdigit(): return int(cleaned_val) return '-' dorm_int = to_int_or_dash(dorm_str) banheiros_int = to_int_or_dash(banheiros_str) vagas_int = to_int_or_dash(vagas_str) suites_int = to_int_or_dash(suites_str) try: testada_cleaned = str(testada_str).replace(',', '.') testada_float = float(testada_cleaned) if testada_cleaned and testada_cleaned.replace('.', '', 1).isdigit() else '-' except (AttributeError, ValueError): testada_float = '-' result_text = f"**{page_title}**\n\nURL: {final_url}\n\n{page_text[:10000]}..." df = pd.DataFrame([{ "Endereço": endereco if endereco and endereco != '-' else 'Não encontrado', "Área": area_float, "Testada": testada_float, "Valor": valor_float, "Dorm": dorm_int, "Banheiros": banheiros_int, "Vagas": vagas_int, "Suítes": suites_int, "URL": final_url, # Use final URL after redirects "Topografia": '-', "Relevo": '-', "Superfície": '-', "Aproveitamento": '-', "Acessibilidade": '-', "Idade e conservação": '-', "Padrão construtivo": '-', "Outra característica": '-' }]) return df, result_text, endereco, valor_float # Acumulador (omitted for brevity, unchanged) def adicionar_ao_acumulado(df_atual, df_acumulado, topo, rel, sup, apr, ace, ic, pad, var): if df_atual.empty: return df_acumulado, df_acumulado, "" df_novo = df_atual.copy() df_novo.insert(0, "Dado", f"Dado {len(df_acumulado)+1}") # Adicionar valores dos dropdowns df_novo["Topografia"] = topo df_novo["Relevo"] = rel df_novo["Superfície"] = sup df_novo["Aproveitamento"] = apr df_novo["Acessibilidade"] = ace df_novo["Idade e conservação"] = ic df_novo["Padrão construtivo"] = pad df_novo["Outra característica"] = var # Calcular VU (Valor / Área), evitando divisão por zero ou nulos df_novo["VU"] = df_novo.apply( lambda row: round(row["Valor"] / row["Área"], 2) if isinstance(row["Valor"], (int, float)) and isinstance(row["Área"], (int, float)) and row["Área"] != 0 and row["Área"] != '-' and row["Valor"] != '-' else '-', axis=1 ) # Reordenar colunas para colocar VU depois de Valor cols = df_novo.columns.tolist() valor_index = cols.index("Valor") vu_index = cols.index("VU") cols.insert(valor_index + 1, cols.pop(vu_index)) df_novo = df_novo[cols] df_acumulado = pd.concat([df_acumulado, df_novo], ignore_index=True) # Filtrar apenas valores numéricos para cálculos estatísticos valor_numeric = pd.to_numeric(df_acumulado["Valor"], errors="coerce") vu_numeric = pd.to_numeric(df_acumulado["VU"], errors="coerce") # Calcular estatísticas quantidade_dados = len(df_acumulado) # Estatísticas para "Valor" valor_max_val = valor_numeric.max() if not valor_numeric.isna().all() else '-' valor_min_val = valor_numeric.min() if not valor_numeric.isna().all() else '-' valor_medio_val = valor_numeric.mean() if not valor_numeric.isna().all() else '-' valor_mediana_val = valor_numeric.median() if not valor_numeric.isna().all() else '-' # Estatísticas para "VU" vu_max_val = vu_numeric.max() if not vu_numeric.isna().all() else '-' vu_min_val = vu_numeric.min() if not vu_numeric.isna().all() else '-' vu_medio_val = vu_numeric.mean() if not vu_numeric.isna().all() else '-' vu_mediana_val = vu_numeric.median() if not vu_numeric.isna().all() else '-' # Criar texto com estatísticas stats_text = ( f"**Quantidade de dados:** {quantidade_dados}\n\n" f"**Valor:**\n" f"- Máximo: {valor_max_val if isinstance(valor_max_val, str) else f'R$ {valor_max_val:,.2f}'.replace(',', 'X').replace('.', ',').replace('X', '.')}\n" f"- Mínimo: {valor_min_val if isinstance(valor_min_val, str) else f'R$ {valor_min_val:,.2f}'.replace(',', 'X').replace('.', ',').replace('X', '.')}\n" f"- Média: {valor_medio_val if isinstance(valor_medio_val, str) else f'R$ {valor_medio_val:,.2f}'.replace(',', 'X').replace('.', ',').replace('X', '.')}\n" f"- Mediana: {valor_mediana_val if isinstance(valor_mediana_val, str) else f'R$ {valor_mediana_val:,.2f}'.replace(',', 'X').replace('.', ',').replace('X', '.')}\n\n" f"**VU (Valor Unitário - R$/m²):**\n" f"- Máximo: {vu_max_val if isinstance(vu_max_val, str) else f'R$ {vu_max_val:,.2f}'.replace(',', 'X').replace('.', ',').replace('X', '.')}\n" f"- Mínimo: {vu_min_val if isinstance(vu_min_val, str) else f'R$ {vu_min_val:,.2f}'.replace(',', 'X').replace('.', ',').replace('X', '.')}\n" f"- Média: {vu_medio_val if isinstance(vu_medio_val, str) else f'R$ {vu_medio_val:,.2f}'.replace(',', 'X').replace('.', ',').replace('X', '.')}\n" f"- Mediana: {vu_mediana_val if isinstance(vu_mediana_val, str) else f'R$ {vu_mediana_val:,.2f}'.replace(',', 'X').replace('.', ',').replace('X', '.')}" ) return df_acumulado, df_acumulado, stats_text # Limpeza do anúncio atual (omitted for brevity, unchanged) def clear_fields(): empty_df = pd.DataFrame({ "Endereço": ['-'], "Área": ['-'], "Testada": ['-'], "Valor": ['-'], "VU": ['-'], "Dorm": ['-'], "Banheiros": ['-'], "Vagas": ['-'], "Suítes": ['-'], "URL": ['-'], "Topografia": ['-'], "Relevo": ['-'], "Superfície": ['-'], "Aproveitamento": ['-'], "Acessibilidade": ['-'], "Idade e conservação": ['-'], "Padrão construtivo": ['-'], "Outra característica": ['-'] }) return "", empty_df, "", False, None, '-', '-', '-', '-', '-', '-', '-', '-' # Função para excluir linhas com "Dado" vazio (omitted for brevity, unchanged) def excluir_dados_vazios(df_acumulado): if "Dado" not in df_acumulado.columns or df_acumulado.empty: return df_acumulado, df_acumulado df_acumulado = df_acumulado[df_acumulado["Dado"].astype(str).str.strip() != ""] df_acumulado = df_acumulado.reset_index(drop=True) # Re-numerar a coluna "Dado" for i in range(len(df_acumulado)): df_acumulado.loc[i, "Dado"] = f"Dado {i+1}" return df_acumulado, df_acumulado def toggle_output_text(show_text, result_text): return gr.update(visible=show_text), result_text # take_screenshot (omitted for brevity, unchanged from your last version) def take_screenshot(url, endereco, valor, filename="screenshot.png"): if not url or not validators.url(url): print("URL inválida para screenshot.") return None # Or a placeholder image path with sync_playwright() as p: browser = None context = None page = None try: browser = p.chromium.launch( headless=True, args=[ '--disable-blink-features=AutomationControlled', '--no-sandbox', '--disable-dev-shm-usage', '--disable-gpu' ] ) context = browser.new_context( user_agent=random.choice(USER_AGENTS), viewport={'width': 1920, 'height': 1080}, locale='pt-BR', timezone_id='America/Sao_Paulo', ) page = context.new_page() stealth_sync(page) print(f"Taking screenshot of {url}") page.goto(url, timeout=60000, wait_until="networkidle") # networkidle might be better for screenshots # time.sleep(random.uniform(3,6)) # Extra wait if networkidle isn't enough if endereco and valor and str(endereco).strip() != '-' and isinstance(valor, (int, float)): safe_endereco = re.sub(r'[\\/*?:"<>|]', "", str(endereco)).replace(' ', '_')[:100] # Limit length filename = f"{safe_endereco}_R${valor:.2f}.png".replace(",", "_") # Use underscore for comma else: # Generate a more unique default name timestamp = time.strftime("%Y%m%d-%H%M%S") filename = f"screenshot_{timestamp}.png" # Ensure the directory for screenshots exists if not saving to current dir # os.makedirs("screenshots", exist_ok=True) # screenshot_path = os.path.join("screenshots", filename) # For Gradio, relative path is fine if it's served correctly screenshot_path = filename page.screenshot(path=screenshot_path, full_page=True) print(f"Screenshot salvo como {screenshot_path}") return screenshot_path except Exception as e: print(f"Error during screenshot for {url}: {e}") return None finally: if page: try: page.close() except Exception: pass if context: try: context.close() except Exception: pass if browser: try: browser.close() except Exception: pass # Tema theme = gr.themes.Default(primary_hue=gr.themes.colors.yellow, secondary_hue=gr.themes.colors.blue) # Using Default for broader compatibility # App principal with gr.Blocks(theme=theme, css=""" /* CSS (omitted for brevity, unchanged) */ @import url('https://fonts.googleapis.com/css2?family=Quicksand:wght@400;700&display=swap'); .small-file-upload { height: 65px; text-align: center; color: black; border: 2px solid black !important; box-sizing: border-box; } .small-file-upload span { display: none; } .small-file-upload input[type="file"] { color: black; } .small-file-upload label { color: black; } .small span { font-size: 1.0em; white-space: nowrap; width: auto; display: inline-block; } /* Adjusted font size */ /* .small span dados { font-size: 0.8em; white-space: nowrap; width: auto; display: inline-block; } */ /* This specific selector might not work as intended, keep it general */ h1 { text-align: center; font-family: 'Quicksand', sans-serif; font-weight: 700; margin: 20px 0; color: black; } .map-container { height: 600px !important; margin: 0; padding: 0; } """) as app: gr.Markdown( "
" # Centered title "Pesquisa.AI - " "aval" "ia" # Gold color ".se" "
" ) df_acumulado_state = gr.State(pd.DataFrame(columns=["Dado", "Endereço", "Área", "Testada", "Valor", "VU", "Dorm", "Banheiros", "Vagas", "Suítes", "URL", "Topografia", "Relevo", "Superfície", "Aproveitamento", "Acessibilidade", "Idade e conservação", "Padrão construtivo", "Outra característica"])) with gr.Row(): with gr.Column(scale=1, min_width=250): # Added min_width user_input = gr.Textbox(label="Cole a URL do anúncio aqui") gr.Markdown("**ANÚNCIO ATUAL**") submit_button = gr.Button("1. Carregar Dados do Anúncio", variant="primary", elem_id="load_button") screenshot_button = gr.Button("2. Print da Página do Anúncio", elem_id="print_button") clear_button = gr.Button("Limpar Campos Atuais", variant="stop", elem_id="clear_button") # Changed variant gr.Markdown("**BANCO DE DADOS**") add_data = gr.Button("3. Adicionar ao Banco", variant="primary", elem_id="add_db_button") delete_data = gr.Button("Excluir Linha Selecionada do Banco", elem_id="delete_db_button") # infos = gr.Button("Geolocalização", elem_id="geo_button") # If you implement this with gr.Column(scale=4): # Adjusted scale output_table = gr.Dataframe( headers=["Endereço", "Área", "Testada", "Valor", "VU", "Dorm", "Banheiros", "Vagas", "Suítes", "URL", "Topografia", "Relevo", "Superfície", "Aproveitamento", "Acessibilidade", "Idade e conservação", "Padrão construtivo", "Outra característica"], datatype=["str", "number", "number", "number", "number", "number", "number", "number", "number", "str", "str", "str", "str", "str", "str", "str", "str", "str"], interactive=True, row_count=(1, "fixed"), # Let row_count determine height for single row table wrap=True ) with gr.Accordion("Características Adicionais (para adicionar ao banco)", open=False): with gr.Row(): topo_drop = gr.Dropdown(label="Topografia", choices=list(dict_topo.keys()), value='-', interactive=True) rel_drop = gr.Dropdown(label="Relevo", choices=list(dict_rel.keys()), value='-', interactive=True) sup_drop = gr.Dropdown(label="Superfície", choices=list(dict_sup.keys()), value='-', interactive=True) apr_drop = gr.Dropdown(label="Aproveitamento", choices=list(dict_apr.keys()), value='-', interactive=True) with gr.Row(): ace_drop = gr.Dropdown(label="Acessibilidade", choices=list(dict_ace.keys()), value='-', interactive=True) ic_drop = gr.Dropdown(label="Idade e conservação", choices=list(dict_ic.keys()), value='-', interactive=True) pad_drop = gr.Dropdown(label="Padrão construtivo", choices=list(dict_pad.keys()), value='-', interactive=True) var_drop = gr.Textbox(label="Outra característica", value='-', interactive=True) with gr.Row(): show_text_checkbox = gr.Checkbox(label="Mostrar Texto Extraído do Anúncio", value=False, scale=4) screenshot_output = gr.Image(label="Print do Anúncio", type="filepath", height=200, scale=1, interactive=False) # Made non-interactive output_text = gr.Markdown(label="Texto Extraído (Pré-visualização)", visible=False) gr.Markdown("---") # Separator gr.Markdown("**BANCO DE DADOS ACUMULADOS**") acumulado_table = gr.Dataframe( headers=["Dado", "Endereço", "Área", "Testada", "Valor", "VU", "Dorm", "Banheiros", "Vagas", "Suítes", "URL", "Topografia", "Relevo", "Superfície", "Aproveitamento", "Acessibilidade", "Idade e conservação", "Padrão construtivo", "Outra característica"], datatype=["str", "str", "number", "number", "number", "number", "number", "number", "number", "number", "str", "str", "str", "str", "str", "str", "str", "str", "str"], interactive=True, # Set to true if you want to edit or select rows for deletion # height=400, # Removed height parameter wrap=True ) # If 'acumulado_table' needs a specific height, try CSS or check Gradio docs for your version's method. # For Gradio 3.x: acumulado_table.style(height=400) might work if needed after the definition. stats_output = gr.Markdown(label="Estatísticas dos Dados Acumulados") # Lógica dos botões extracted_endereco_state = gr.State() extracted_valor_state = gr.State() # This will store the numerical value for screenshot naming submit_button.click( smart_fetch_url_info, # Use the new Playwright-based function inputs=user_input, outputs=[output_table, output_text, extracted_endereco_state, extracted_valor_state] ) clear_button.click( clear_fields, outputs=[user_input, output_table, output_text, show_text_checkbox, screenshot_output, topo_drop, rel_drop, sup_drop, apr_drop, ace_drop, ic_drop, pad_drop, var_drop] ) show_text_checkbox.change( toggle_output_text, inputs=[show_text_checkbox, output_text], # Pass current output_text to preserve it outputs=[output_text, output_text] # Update visibility and content ) screenshot_button.click( take_screenshot, inputs=[user_input, extracted_endereco_state, extracted_valor_state], # user_input is the original URL outputs=screenshot_output ) add_data.click( adicionar_ao_acumulado, inputs=[output_table, df_acumulado_state, topo_drop, rel_drop, sup_drop, apr_drop, ace_drop, ic_drop, pad_drop, var_drop], outputs=[df_acumulado_state, acumulado_table, stats_output] ) delete_data.click( excluir_dados_vazios, # This function needs to know which row to delete inputs=acumulado_table, # Pass the editable dataframe outputs=[df_acumulado_state, acumulado_table] # Update state and table ) # Modify excluir_dados_vazios to work with selected row from editable DataFrame # For row deletion, you'd typically make 'acumulado_table' interactive and get selected index # The current 'excluir_dados_vazios' removes ALL rows with empty "Dado" field, not selected ones. # To delete a selected row, 'acumulado_table' needs to be interactive and you'd get the selection event. # For simplicity, the current `excluir_dados_vazios` is kept, but it's not ideal for targeted deletion. # A more advanced deletion would require handling selection events from the DataFrame. app.launch(share=True, debug=True) # Added debug=True for more detailed logs during development