# NOTE(review): the lines below were Hugging Face Spaces status-banner residue
# ("Spaces: Running Running") captured into the file; kept as a comment so the
# module stays valid Python.
| import subprocess | |
| import sys | |
| import os | |
def ensure_playwright_chromium():
    """Ensure the Playwright Chromium browser binary is installed.

    Runs ``python -m playwright install chromium`` in a subprocess; the
    command is a no-op when the browser is already present.

    Returns:
        bool: True when the install command completed successfully,
        False when it failed (details are printed to stdout).
    """
    try:
        print("Checking and installing Playwright Chromium browser if needed...")
        subprocess.run(
            [sys.executable, "-m", "playwright", "install", "chromium"],
            check=True,           # raise CalledProcessError on non-zero exit
            capture_output=True,  # keep installer noise out of the app log
            text=True,
        )
        print("Playwright Chromium browser is ready.")
        return True
    except subprocess.CalledProcessError as e:
        print(f"Error during Playwright Chromium installation: {e}")
        print(f"Stdout: {e.stdout}")
        print(f"Stderr: {e.stderr}")
    except FileNotFoundError:
        print("Error: Python executable or Playwright module not found. Ensure your environment is set up correctly.")
    return False
| ensure_playwright_chromium() | |
| import gradio as gr | |
| import requests # Still used for other things potentially, or could be removed if not | |
| from bs4 import BeautifulSoup | |
| from bs4 import Comment | |
| import re | |
| import pandas as pd | |
| import validators | |
| # from selenium import webdriver # No longer used in the primary fetch path | |
| # from selenium.webdriver.chrome.service import Service # No longer used | |
| # from selenium.webdriver.chrome.options import Options # No longer used | |
| # from selenium.webdriver.common.by import By # No longer used | |
| import json | |
| import time | |
| import random | |
| from playwright.sync_api import sync_playwright | |
| from playwright_stealth import stealth_sync | |
# Constants
# Two-letter abbreviations of the Brazilian states; used to anchor the
# address-matching regexes in PATTERNS below.
ESTADOS_BR = ["AC", "AL", "AM", "AP", "BA", "CE", "DF", "ES", "GO", "MA", "MG", "MS", "MT", "PA", "PB", "PE", "PI", "PR", "RJ", "RN", "RO", "RR", "RS", "SC", "SE", "SP", "TO"]
# Desktop user-agent strings; one is picked at random per browser context to
# make the automated requests look less uniform.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36",
]
# Characteristic tables: the keys populate the dropdown choices in the UI
# below.  The numeric values look like appraisal adjustment coefficients,
# but nothing in this file reads them yet — NOTE(review): confirm intended
# use before relying on the numbers.  Every table maps '-' to '-' so the
# "unset" dropdown choice passes through unchanged.

# Terrain topography (slope bands).
dict_topo = {
    'plano <5%': 1,
    'aclive_leve 5% e 30%': 0.95,
    'declive_leve 5% e 30%': 0.90,
    'aclive_acentuado >30%': 0.85,
    'declive_acentuado >30%': 0.80,
    '-': '-'
}
# Regional relief.
dict_rel = {
    'plana': 1.1,
    'ondulada': 1.00,
    'montanhosa/acidentada': 0.80,
    '-': '-'
}
# Surface / flooding condition.
dict_sup = {
    'Seca': 1.00,
    'Região inundável mas não atingida': 0.90,
    'Região inundável mas atingida periodicamente': 0.70,
    'Alagada': 0.60,
    '-': '-'
}
# Best land use ("aproveitamento").
dict_apr = {
    'Loteamento': 1.00,
    'Indústria': 0.90,
    'Culturas': 0.80,
    '-': '-'
}
# Accessibility.
dict_ace = {
    'Ótima': 1.00,
    'Muito boa': 0.95,
    'Boa': 0.90,
    'Desfavorável': 0.80,
    'Má': 0.75,
    'Péssima': 0.70,
    '-': '-'
}
# Combined building age ("id", in years) and conservation state.
dict_ic = {
    'id<5_novo': 1.00,
    'id<5_bom': 0.95,
    'id<5_reparos simples': 0.80,
    'id<5_reparos importantes': 0.45,
    'id entre 6 e 10_novo': 0.95,
    'id entre 6 e 10_bom': 0.90,
    'id entre 6 e 10_reparos simples': 0.75,
    'id entre 6 e 10_reparos importantes': 0.40,
    'id entre 11 e 30_novo': 0.85,
    'id entre 11 e 30_bom': 0.80,
    'id entre 11 e 30_reparos simples': 0.65,
    'id entre 11 e 30_reparos importantes': 0.35,
    'id entre 31 e 50_novo': 0.55,
    'id entre 31 e 50_bom': 0.50,
    'id entre 31 e 50_reparos simples': 0.45,
    'id entre 31 e 50_reparos importantes': 0.25,
    'id>50_novo': 0.30,
    'id>50_bom': 0.20,
    'id>50_reparos simples': 0.15,
    'id>50_reparos importantes': 0.10,
    '-': '-'
}
# Construction standard.
dict_pad = {
    'Mínimo': 1.00,
    'Baixo': 1.15,
    'Normal c/ aspecto de baixo ': 1.30,  # trailing space kept — the key is used verbatim as a dropdown choice
    'Normal forte predominância': 1.45,
    'Normal com aspecto de alto': 1.60,
    'Alto': 1.75,
    'Luxo': 1.90,
    '-': '-'
}
# Regex patterns for mining listing pages, keyed by field name.  Within each
# list the patterns are ordered most-specific first; extract_info() returns
# the first hit.  All matching is done case-insensitively.
PATTERNS = {
    # Street address; most patterns anchor on a trailing BR state abbreviation.
    "endereco": [
        r'(?:rua|avenida|estrada|alameda|praça|travessa)\s+[\w\s\d\-,.]+?\b(?:' + "|".join(ESTADOS_BR) + r')\b',
        r'(?:endereço|localização|address)\s*:\s*[\w\s\d\-,.]+?\b(?:' + "|".join(ESTADOS_BR) + r')\b',
        r'[\w\s\d\-,.]+\b(?:' + "|".join(ESTADOS_BR) + r')\b',
        r'(?:rua|avenida|estrada|alameda|praça|travessa)\s+[\w\s\d\-,.]+',
        # "bairro - cidade/UF"; the three groups are reassembled by extract_info().
        r'([A-Za-z\s\-.À-ú]+)\s*-\s*([A-Za-z\s\-.À-ú]+)/([A-Z]{2})',  # Improved for accented chars
    ],
    # Lot frontage in meters, e.g. "10m x 25" or "10 metros de frente".
    "testada": [
        r'(\d{1,3}(?:[.,]\d{1,2})?)m?\s*[xX]\s*\d',
        r'(\d{1,3}(?:[.,]\d{1,2})?)\s*metros?\s*de\s*frente',
        r'front\s*:\s*(\d{1,3}(?:[.,]\d{1,2})?)',
        r"Metragem\s*de\s*frente\s*:\s*(\d+,\d+|\d+)\s*m",
        r"(\d+,\d+|\d+)\s*m\s*[xX]\s*\d+"
    ],
    # Asking price in R$ (Brazilian "1.234.567,89" number format).
    "valor": [
        r'R\$[\s]*<strong>[\s]*([\d.]+,[\d]{2})[\s]*<\/strong>',
        r"Valor\s*do\s*imóvel\s*R\$\s*([\d.,]+)",
        r'valor\s*:\s*R\$\s*(\d[\d\.,]*)',
        r'preço\s*:\s*R\$\s*(\d[\d\.,]*)',
        r'value\s*:\s*\$(\d[\d\.,]*)',
        r'R\$\s*(\d[\d\.,]*)',
        # r's*(\d[\d\.,]*)', # This pattern is too broad and can cause issues, commented out
    ],
    # Areas in m² (total / built / private), most to least specific.
    "area": [
        r"(\d[\d.,]*)\s*m²\s*de\s*área\s*total",
        r"(\d[\d.,]*)\s*m²\s*de\s*área\s*construída",
        r"(\d[\d.,]*)\s*m²\s*de\s*área\s*privativa",
        r'área\s*(?:do\s*terreno|total)\s*[:–-]?\s*(\d[\d.,]*)\s*m²',
        r'área\s*construída\s*[:–-]?\s*(\d[\d.,]*)\s*m²',
        r'área\s*privativa\s*[:–-]?\s*(\d[\d.,]*)\s*m²',
        r'area\s*:\s*(\d[\d.,]+)\s*m²',
        r'size\s*:\s*(\d[\d.,]+)\s*sqft',
        r"(\d+,\d+|\d+)\s*m²\s*área total",
        r"(\d+,\d+|\d+)\s*m²\s*área privativa",
        r"(\d{1,3}(?:\.\d{3})*(?:,\d{2})?)(m²|ha)\s+área total do terreno",
        r'(\d[\d.,]+)\s*m²',
    ],
    # Bedroom count.
    "dormitorios": [
        r'(\d+)\s*(?:quarto|quartos|dormit[oó]rio|dormit[oó]rios|dorm\.|dorms\.)',
        r'bedroom[s]?\s*:\s*(\d+)',
        r'dormit[oó]rio[s]?\s*[:–-]?\s*(\d+)',
        r'quarto[s]?\s*[:–-]?\s*(\d+)',
    ],
    # Bathroom count.
    "banheiros": [
        r'(\d+)\s*(?:banheiro|banheiros|wc|banho|banhos)',
        r'bathroom[s]?\s*:\s*(\d+)',
    ],
    # Parking spots.
    "vagas": [
        r'(\d+)\s*(?:vaga|vagas)(?:\s*de\s*garagem)?',
        r'parking\s*:\s*(\d+)',
        r'garagem\s*[:–-]?\s*(\d+)\s*vaga',
    ],
    # Suite count.
    "suites": [
        r'(\d+)\s*(?:su[ií]te|su[ií]tes)',
        r'suite[s]?\s*:\s*(\d+)',
    ],
}
| def extract_info(text, patterns): | |
| for pattern in patterns: | |
| match = re.search(pattern, text, re.IGNORECASE) | |
| if match: | |
| if match.groups(): | |
| if len(match.groups()) == 3 and "/" in match.group(0) and pattern == r'([A-Za-z\s\-.À-ú]+)\s*-\s*([A-Za-z\s\-.À-ú]+)/([A-Z]{2})': | |
| bairro = match.group(1).strip() | |
| cidade = match.group(2).strip() | |
| estado = match.group(3).strip() | |
| return f"{bairro} - {cidade}/{estado}" | |
| return match.group(1).strip() | |
| return match.group(0).strip() | |
| return '-' | |
def clean_text_for_testada(page_text):
    """Strip markdown-style "![Image ..." lines from *page_text*.

    Image-alt lines often carry dimension-like strings (e.g. "800x600")
    that the "testada" regexes would mistake for a lot's frontage.
    """
    image_line = r'^\s*\*?\s*\!\[Image[^\n]*\n?'
    return re.sub(image_line, '', page_text, flags=re.MULTILINE)
def extract_metadata(soup):
    """Collect <meta> tags into a dict of {lowercased property/name: content}.

    Tags missing both identifying attributes, or with an empty content
    attribute, are skipped.
    """
    collected = {}
    for tag in soup.find_all("meta"):
        key = tag.get("property") or tag.get("name")
        if key and tag.get("content"):
            collected[key.lower()] = tag["content"]
    return collected
def extract_json_scripts(soup):
    """Parse every <script type="application/ld+json"> block into Python data.

    Each script body is rebuilt without HTML comment nodes (which would
    break json.loads); blocks that still fail to parse are skipped silently.
    """
    parsed = []
    for script in soup.find_all("script", type="application/ld+json"):
        try:
            body = "".join(
                str(part) for part in script.contents
                if not isinstance(part, Comment)
            )
            parsed.append(json.loads(body))
        except (json.JSONDecodeError, TypeError):
            continue
    return parsed
def get_main_page_text(soup_obj):
    """Extract the page's main textual content as newline-separated text.

    Tries semantic containers first (<article>, <main>, role=main), then a
    set of common content-div selectors (accepted only when they hold a
    meaningful amount of text), and finally falls back to <body> or the
    whole document.
    """
    semantic_selectors = ["article", "main", "[role='main']"]  # semantic tags/attributes
    # Less specific, but common wrappers for content blocks.
    div_selectors = [
        "div[class*='content']", "div[id*='content']",
        "div[class*='main']", "div[id*='main']",
        "div[class*='body']", "div[id*='body']",
        "div[class*='post']", "div[id*='post']",
        "div[class*='listing-details']",  # listing-page specific
    ]

    target = None
    for selector in semantic_selectors:
        candidate = soup_obj.select_one(selector)
        if candidate:
            target = candidate
            break

    if target is None:
        for selector in div_selectors:
            candidate = soup_obj.select_one(selector)
            # Heuristic: require >200 chars so empty wrapper divs are skipped.
            if candidate and len(candidate.get_text(strip=True)) > 200:
                target = candidate
                break

    if target is not None:
        return target.get_text(separator='\n', strip=True)

    # No recognizable main container: fall back to <body>, then the document.
    body_tag = soup_obj.body
    if body_tag:
        return body_tag.get_text(separator='\n', strip=True)
    return soup_obj.get_text(separator='\n', strip=True)
def smart_fetch_url_info(user_input_url):
    """Fetch a listing page with headless Chromium and extract its key fields.

    Renders the page with Playwright (stealth-patched), then mines the HTML
    for address, price, areas, room counts etc. via meta tags, JSON-LD
    scripts and the regex PATTERNS.

    Args:
        user_input_url: URL typed/pasted by the user.

    Returns:
        tuple: (one-row DataFrame with the extracted listing fields,
        markdown preview text, extracted address or None, numeric price or
        '-'/None).  On failure the DataFrame is empty and the second item
        is an error message.
    """
    if not validators.url(user_input_url):
        return pd.DataFrame(), "URL inválida. Verifique e tente novamente.", None, None
    html_content = None
    page_title = ""
    final_url = user_input_url  # to store the URL after potential redirects
    with sync_playwright() as p:
        browser = None
        context = None
        page = None
        try:
            browser = p.chromium.launch(
                headless=True,
                args=[
                    '--no-sandbox',
                    '--disable-setuid-sandbox',
                    '--disable-dev-shm-usage',
                    '--disable-accelerated-2d-canvas',
                    '--no-first-run',
                    '--no-zygote',
                    # '--single-process',  # potentially for very constrained environments, but can be less stable
                    '--disable-gpu',
                    '--disable-blink-features=AutomationControlled'
                ]
            )
            # Brazilian locale/timezone plus a random desktop UA so the
            # rendered page matches what the target sites serve to locals.
            context = browser.new_context(
                user_agent=random.choice(USER_AGENTS),
                viewport={'width': 1920, 'height': 1080},
                locale='pt-BR',
                timezone_id='America/Sao_Paulo',
                # ignore_https_errors=True  # use with caution if SSL certs are an issue
            )
            context.set_default_navigation_timeout(60000)  # 60 seconds for navigation
            context.set_default_timeout(45000)  # 45 seconds for other operations
            page = context.new_page()
            stealth_sync(page)  # apply stealth patches
            print(f"Fetching {user_input_url} with Playwright...")
            # Try to block common resource types that slow down loading and are not needed for text
            # page.route("**/*", lambda route: route.abort() if route.request.resource_type in {"image", "stylesheet", "font", "media"} else route.continue_())
            response = page.goto(user_input_url, wait_until="domcontentloaded", timeout=60000)
            # Wait for potential dynamic content loading, or specific selectors if known
            # Example: page.wait_for_selector("body", timeout=10000)
            time.sleep(random.uniform(4, 8))  # allow time for JS execution
            html_content = page.content()
            page_title = page.title()
            final_url = page.url  # get the URL after any redirects
        except Exception as e:
            print(f"Playwright fetching error for {user_input_url}: {e}")
            return pd.DataFrame(), f"Erro ao buscar com Playwright: {e}", None, None
        finally:
            # Close page -> context -> browser in order; a failure to close
            # one must not prevent closing the others.
            if page:
                try: page.close()
                except Exception as e_page: print(f"Error closing page: {e_page}")
            if context:
                try: context.close()
                except Exception as e_ctx: print(f"Error closing context: {e_ctx}")
            if browser:
                try: browser.close()
                except Exception as e_browser: print(f"Error closing browser: {e_browser}")
    if not html_content:
        return pd.DataFrame(), "Não foi possível obter o conteúdo da página com Playwright.", None, None
    soup = BeautifulSoup(html_content, "html.parser")
    metadata = extract_metadata(soup)
    json_scripts = extract_json_scripts(soup)
    page_text = get_main_page_text(soup)
    cleaned_text_for_testada_val = clean_text_for_testada(page_text)
    # Enhanced address extraction: look for structured Schema.org address
    # objects inside the JSON-LD scripts first.
    endereco_json_parts = []
    if json_scripts:
        for script_data_list in json_scripts:  # json_scripts is a list of dicts/lists
            # Handle if script_data_list is a list itself (e.g. a graph of entities).
            items_to_check = script_data_list if isinstance(script_data_list, list) else [script_data_list]
            for script_data in items_to_check:
                if isinstance(script_data, dict):
                    # Common Schema.org types for properties.
                    if script_data.get("@type") in ["RealEstateListing", "Residence", "Place", "Apartment", "House", "SingleFamilyResidence"]:
                        addr_obj = script_data.get("address")
                        if isinstance(addr_obj, dict):
                            street = addr_obj.get("streetAddress", "")
                            locality = addr_obj.get("addressLocality", "")
                            region = addr_obj.get("addressRegion", "")
                            postal_code = addr_obj.get("postalCode", "")
                            country = addr_obj.get("addressCountry", "")
                            # Construct address string from whichever parts are present.
                            current_addr_parts = [p for p in [street, locality, region, postal_code, country] if p]
                            if current_addr_parts:
                                endereco_json_parts.append(", ".join(current_addr_parts))
                                break  # found one, assume it's the primary
            if endereco_json_parts: break
    endereco_json_val = endereco_json_parts[0] if endereco_json_parts else None
    # Address: regex over page text first, then metadata tags, then JSON-LD.
    endereco = (
        extract_info(page_text, PATTERNS["endereco"]) or
        metadata.get("og:street-address") or  # more specific OG tags
        metadata.get("og:locality") or
        metadata.get("og:region") or
        metadata.get("twitter:data1") or  # sometimes the address is here
        metadata.get("place:location:street_address") or  # Facebook Places
        endereco_json_val or
        metadata.get("og:address")  # generic fallback
    )
    if not endereco or endereco == '-':  # if regex fails, try a broader search in metadata descriptions
        description_text = metadata.get("description", "") + " " + metadata.get("og:description", "")
        endereco = extract_info(description_text, PATTERNS["endereco"])
    valor_str = extract_info(page_text, PATTERNS["valor"])
    area_str = extract_info(page_text, PATTERNS["area"])
    dorm_str = extract_info(page_text, PATTERNS["dormitorios"])
    banheiros_str = extract_info(page_text, PATTERNS["banheiros"])
    vagas_str = extract_info(page_text, PATTERNS["vagas"])
    suites_str = extract_info(page_text, PATTERNS["suites"])
    testada_str = extract_info(cleaned_text_for_testada_val, PATTERNS["testada"])
    # Data cleaning: convert Brazilian-formatted strings ("1.234,56") to
    # floats/ints, falling back to '-' when the string is not numeric.
    try:
        valor_cleaned = str(valor_str).replace('R$', '').replace('.', '').replace(',', '.').strip()
        valor_float = float(valor_cleaned) if valor_cleaned and valor_cleaned.replace('.', '', 1).replace('-', '', 1).isdigit() else '-'
    except (AttributeError, ValueError):
        valor_float = '-'
    try:
        area_match = re.search(r'(\d[\d,.]*)', str(area_str))  # get first number sequence
        area_cleaned = area_match.group(1).replace('.', '').replace(',', '.') if area_match else str(area_str)
        area_float = float(area_cleaned) if area_cleaned and area_cleaned.replace('.', '', 1).isdigit() else '-'
    except (AttributeError, ValueError):
        area_float = '-'
    def to_int_or_dash(val_str):
        # Coerce "3 vagas" / 3 / 3.0 to int; '-' when no digits are present.
        if isinstance(val_str, (int, float)): return int(val_str)
        if isinstance(val_str, str):
            cleaned_val = re.sub(r'\D', '', val_str)  # remove non-digits
            if cleaned_val.isdigit(): return int(cleaned_val)
        return '-'
    dorm_int = to_int_or_dash(dorm_str)
    banheiros_int = to_int_or_dash(banheiros_str)
    vagas_int = to_int_or_dash(vagas_str)
    suites_int = to_int_or_dash(suites_str)
    try:
        testada_cleaned = str(testada_str).replace(',', '.')
        testada_float = float(testada_cleaned) if testada_cleaned and testada_cleaned.replace('.', '', 1).isdigit() else '-'
    except (AttributeError, ValueError):
        testada_float = '-'
    result_text = f"**{page_title}**\n\nURL: {final_url}\n\n{page_text[:10000]}..."
    df = pd.DataFrame([{
        "Endereço": endereco if endereco and endereco != '-' else 'Não encontrado',
        "Área": area_float,
        "Testada": testada_float,
        "Valor": valor_float,
        "Dorm": dorm_int,
        "Banheiros": banheiros_int,
        "Vagas": vagas_int,
        "Suítes": suites_int,
        "URL": final_url,  # use final URL after redirects
        "Topografia": '-', "Relevo": '-', "Superfície": '-', "Aproveitamento": '-',
        "Acessibilidade": '-', "Idade e conservação": '-', "Padrão construtivo": '-', "Outra característica": '-'
    }])
    return df, result_text, endereco, valor_float
# Accumulator: builds up the comparison database row by row.
def _format_brl(value):
    """Format *value* as Brazilian currency ("R$ 1.234,56"); pass '-' through."""
    if isinstance(value, str):
        return value
    # The f-string yields "1,234.56"; swap separators to the Brazilian convention.
    return f'R$ {value:,.2f}'.replace(',', 'X').replace('.', ',').replace('X', '.')


def _stat_or_dash(series, stat_name):
    """Return series.<stat_name>() ('max'/'min'/'mean'/'median'), or '-' if all-NaN."""
    return getattr(series, stat_name)() if not series.isna().all() else '-'


def adicionar_ao_acumulado(df_atual, df_acumulado, topo, rel, sup, apr, ace, ic, pad, var):
    """Append the currently loaded listing to the accumulated database.

    Copies *df_atual*, tags it with a sequential "Dado N" label and the
    user-selected qualitative characteristics, computes the unit value
    VU = Valor / Área, concatenates it onto *df_acumulado* and rebuilds the
    summary-statistics markdown.

    Args:
        df_atual: One-row DataFrame with the listing just extracted.
        df_acumulado: DataFrame accumulated so far (the gr.State value).
        topo, rel, sup, apr, ace, ic, pad: Dropdown selections.
        var: Free-text "other characteristic".

    Returns:
        tuple: (accumulated DataFrame for the State, the same DataFrame for
        the visible table, markdown string with the statistics — empty when
        *df_atual* is empty).
    """
    if df_atual.empty:
        return df_acumulado, df_acumulado, ""

    df_novo = df_atual.copy()
    df_novo.insert(0, "Dado", f"Dado {len(df_acumulado)+1}")

    # Qualitative characteristics chosen in the dropdowns.
    df_novo["Topografia"] = topo
    df_novo["Relevo"] = rel
    df_novo["Superfície"] = sup
    df_novo["Aproveitamento"] = apr
    df_novo["Acessibilidade"] = ace
    df_novo["Idade e conservação"] = ic
    df_novo["Padrão construtivo"] = pad
    df_novo["Outra característica"] = var

    # Unit value (Valor / Área); '-' whenever either side is non-numeric
    # (the '-' placeholder) or Área is zero.
    df_novo["VU"] = df_novo.apply(
        lambda row: round(row["Valor"] / row["Área"], 2)
        if isinstance(row["Valor"], (int, float)) and isinstance(row["Área"], (int, float)) and row["Área"] != 0
        else '-',
        axis=1
    )

    # Reorder columns so VU sits right after Valor.
    cols = df_novo.columns.tolist()
    cols.insert(cols.index("Valor") + 1, cols.pop(cols.index("VU")))
    df_novo = df_novo[cols]

    df_acumulado = pd.concat([df_acumulado, df_novo], ignore_index=True)

    # Coerce to numeric so '-' placeholders become NaN and drop out of the stats.
    valor_numeric = pd.to_numeric(df_acumulado["Valor"], errors="coerce")
    vu_numeric = pd.to_numeric(df_acumulado["VU"], errors="coerce")

    stats_text = (
        f"**Quantidade de dados:** {len(df_acumulado)}\n\n"
        f"**Valor:**\n"
        f"- Máximo: {_format_brl(_stat_or_dash(valor_numeric, 'max'))}\n"
        f"- Mínimo: {_format_brl(_stat_or_dash(valor_numeric, 'min'))}\n"
        f"- Média: {_format_brl(_stat_or_dash(valor_numeric, 'mean'))}\n"
        f"- Mediana: {_format_brl(_stat_or_dash(valor_numeric, 'median'))}\n\n"
        f"**VU (Valor Unitário - R$/m²):**\n"
        f"- Máximo: {_format_brl(_stat_or_dash(vu_numeric, 'max'))}\n"
        f"- Mínimo: {_format_brl(_stat_or_dash(vu_numeric, 'min'))}\n"
        f"- Média: {_format_brl(_stat_or_dash(vu_numeric, 'mean'))}\n"
        f"- Mediana: {_format_brl(_stat_or_dash(vu_numeric, 'median'))}"
    )
    return df_acumulado, df_acumulado, stats_text
# Reset of the "current listing" section.
def clear_fields():
    """Reset the current-listing widgets to their initial state.

    Returns values for, in order: the URL textbox, the single-row listing
    table (all cells '-'), the extracted-text preview, the "show text"
    checkbox, the screenshot image, and the eight characteristic inputs.
    """
    columns = [
        "Endereço", "Área", "Testada", "Valor", "VU",
        "Dorm", "Banheiros", "Vagas", "Suítes", "URL",
        "Topografia", "Relevo", "Superfície", "Aproveitamento",
        "Acessibilidade", "Idade e conservação", "Padrão construtivo",
        "Outra característica",
    ]
    empty_df = pd.DataFrame({name: ['-'] for name in columns})
    characteristic_resets = ('-',) * 8
    return ("", empty_df, "", False, None) + characteristic_resets
# Remove rows whose "Dado" label was blanked out by the user.
def excluir_dados_vazios(df_acumulado):
    """Drop accumulated rows with an empty "Dado" label, then renumber.

    The user marks a row for deletion by clearing its "Dado" cell in the
    editable table; this removes those rows and relabels the remaining
    ones "Dado 1..N" so the numbering has no gaps.

    Returns the cleaned DataFrame twice (State value and visible table).
    """
    if "Dado" not in df_acumulado.columns or df_acumulado.empty:
        return df_acumulado, df_acumulado
    keep_mask = df_acumulado["Dado"].astype(str).str.strip() != ""
    cleaned = df_acumulado[keep_mask].reset_index(drop=True)
    cleaned["Dado"] = [f"Dado {i+1}" for i in range(len(cleaned))]
    return cleaned, cleaned
def toggle_output_text(show_text, result_text):
    """Toggle visibility of the extracted-text panel without losing its content."""
    visibility = gr.update(visible=show_text)
    return visibility, result_text
def take_screenshot(url, endereco, valor, filename="screenshot.png"):
    """Capture a full-page PNG of *url* with headless Chromium.

    The file is named "<sanitized address>_R$<value>.png" when both pieces
    are known, otherwise a timestamped default name is used.

    Args:
        url: Listing URL (validated before any browser work starts).
        endereco: Extracted address used to build the file name.
        valor: Extracted numeric price used to build the file name.
        filename: Fallback name; always overwritten by the logic below.

    Returns:
        str | None: Relative path of the saved screenshot, or None on failure.
    """
    if not url or not validators.url(url):
        print("URL inválida para screenshot.")
        return None  # or a placeholder image path
    with sync_playwright() as p:
        browser = None
        context = None
        page = None
        try:
            browser = p.chromium.launch(
                headless=True,
                args=[
                    '--disable-blink-features=AutomationControlled',
                    '--no-sandbox',
                    '--disable-dev-shm-usage',
                    '--disable-gpu'
                ]
            )
            context = browser.new_context(
                user_agent=random.choice(USER_AGENTS),
                viewport={'width': 1920, 'height': 1080},
                locale='pt-BR',
                timezone_id='America/Sao_Paulo',
            )
            page = context.new_page()
            stealth_sync(page)
            print(f"Taking screenshot of {url}")
            page.goto(url, timeout=60000, wait_until="networkidle")  # networkidle might be better for screenshots
            # time.sleep(random.uniform(3,6))  # extra wait if networkidle isn't enough
            if endereco and valor and str(endereco).strip() != '-' and isinstance(valor, (int, float)):
                # Strip filesystem-hostile characters from the address.
                safe_endereco = re.sub(r'[\\/*?:"<>|]', "", str(endereco)).replace(' ', '_')[:100]  # limit length
                filename = f"{safe_endereco}_R${valor:.2f}.png".replace(",", "_")  # use underscore for comma
            else:
                # Generate a more unique default name.
                timestamp = time.strftime("%Y%m%d-%H%M%S")
                filename = f"screenshot_{timestamp}.png"
            # Ensure the directory for screenshots exists if not saving to current dir
            # os.makedirs("screenshots", exist_ok=True)
            # screenshot_path = os.path.join("screenshots", filename)
            # For Gradio, a relative path is fine if it's served correctly.
            screenshot_path = filename
            page.screenshot(path=screenshot_path, full_page=True)
            print(f"Screenshot salvo como {screenshot_path}")
            return screenshot_path
        except Exception as e:
            print(f"Error during screenshot for {url}: {e}")
            return None
        finally:
            # Best-effort cleanup; a close failure must not mask the result.
            if page:
                try: page.close()
                except Exception: pass
            if context:
                try: context.close()
                except Exception: pass
            if browser:
                try: browser.close()
                except Exception: pass
# Theme
theme = gr.themes.Default(primary_hue=gr.themes.colors.yellow, secondary_hue=gr.themes.colors.blue)  # Using Default for broader compatibility
# Main app: left column of action buttons, right column with the current
# listing table, extra characteristics, and the accumulated database below.
with gr.Blocks(theme=theme, css="""
@import url('https://fonts.googleapis.com/css2?family=Quicksand:wght@400;700&display=swap');
.small-file-upload { height: 65px; text-align: center; color: black; border: 2px solid black !important; box-sizing: border-box; }
.small-file-upload span { display: none; }
.small-file-upload input[type="file"] { color: black; }
.small-file-upload label { color: black; }
.small span { font-size: 1.0em; white-space: nowrap; width: auto; display: inline-block; } /* Adjusted font size */
/* .small span dados { font-size: 0.8em; white-space: nowrap; width: auto; display: inline-block; } */ /* This specific selector might not work as intended, keep it general */
h1 { text-align: center; font-family: 'Quicksand', sans-serif; font-weight: 700; margin: 20px 0; color: black; }
.map-container { height: 600px !important; margin: 0; padding: 0; }
""") as app:
    gr.Markdown(
        "<div style='font-size: 1.5em; text-align: center;'>"  # centered title
        "<span style='color: gray;'>Pesquisa.AI - </span>"
        "<span style='color: gray;'>aval</span>"
        "<span style='color: #FFD700;'>ia</span>"  # gold color
        "<span style='color: gray;'>.se</span>"
        "</div>"
    )
    # Accumulated database lives in a gr.State so it survives interactions.
    df_acumulado_state = gr.State(pd.DataFrame(columns=["Dado", "Endereço", "Área", "Testada", "Valor", "VU", "Dorm", "Banheiros", "Vagas", "Suítes", "URL", "Topografia", "Relevo", "Superfície", "Aproveitamento", "Acessibilidade", "Idade e conservação", "Padrão construtivo", "Outra característica"]))
    with gr.Row():
        with gr.Column(scale=1, min_width=250):  # added min_width
            user_input = gr.Textbox(label="Cole a URL do anúncio aqui")
            gr.Markdown("**ANÚNCIO ATUAL**")
            submit_button = gr.Button("1. Carregar Dados do Anúncio", variant="primary", elem_id="load_button")
            screenshot_button = gr.Button("2. Print da Página do Anúncio", elem_id="print_button")
            clear_button = gr.Button("Limpar Campos Atuais", variant="stop", elem_id="clear_button")  # changed variant
            gr.Markdown("**BANCO DE DADOS**")
            add_data = gr.Button("3. Adicionar ao Banco", variant="primary", elem_id="add_db_button")
            delete_data = gr.Button("Excluir Linha Selecionada do Banco", elem_id="delete_db_button")
            # infos = gr.Button("Geolocalização", elem_id="geo_button")  # if you implement this
        with gr.Column(scale=4):  # adjusted scale
            # Single-row table showing the listing just extracted.
            output_table = gr.Dataframe(
                headers=["Endereço", "Área", "Testada", "Valor", "VU", "Dorm", "Banheiros", "Vagas", "Suítes", "URL", "Topografia", "Relevo", "Superfície", "Aproveitamento", "Acessibilidade", "Idade e conservação", "Padrão construtivo", "Outra característica"],
                datatype=["str", "number", "number", "number", "number", "number", "number", "number", "number", "str", "str", "str", "str", "str", "str", "str", "str", "str"],
                interactive=True,
                row_count=(1, "fixed"),  # let row_count determine height for single row table
                wrap=True
            )
            with gr.Accordion("Características Adicionais (para adicionar ao banco)", open=False):
                # Dropdown choices come from the coefficient tables above.
                with gr.Row():
                    topo_drop = gr.Dropdown(label="Topografia", choices=list(dict_topo.keys()), value='-', interactive=True)
                    rel_drop = gr.Dropdown(label="Relevo", choices=list(dict_rel.keys()), value='-', interactive=True)
                    sup_drop = gr.Dropdown(label="Superfície", choices=list(dict_sup.keys()), value='-', interactive=True)
                    apr_drop = gr.Dropdown(label="Aproveitamento", choices=list(dict_apr.keys()), value='-', interactive=True)
                with gr.Row():
                    ace_drop = gr.Dropdown(label="Acessibilidade", choices=list(dict_ace.keys()), value='-', interactive=True)
                    ic_drop = gr.Dropdown(label="Idade e conservação", choices=list(dict_ic.keys()), value='-', interactive=True)
                    pad_drop = gr.Dropdown(label="Padrão construtivo", choices=list(dict_pad.keys()), value='-', interactive=True)
                    var_drop = gr.Textbox(label="Outra característica", value='-', interactive=True)
            with gr.Row():
                show_text_checkbox = gr.Checkbox(label="Mostrar Texto Extraído do Anúncio", value=False, scale=4)
                screenshot_output = gr.Image(label="Print do Anúncio", type="filepath", height=200, scale=1, interactive=False)  # made non-interactive
            output_text = gr.Markdown(label="Texto Extraído (Pré-visualização)", visible=False)
    gr.Markdown("---")  # separator
    gr.Markdown("**BANCO DE DADOS ACUMULADOS**")
    acumulado_table = gr.Dataframe(
        headers=["Dado", "Endereço", "Área", "Testada", "Valor", "VU", "Dorm", "Banheiros", "Vagas", "Suítes", "URL", "Topografia", "Relevo", "Superfície", "Aproveitamento", "Acessibilidade", "Idade e conservação", "Padrão construtivo", "Outra característica"],
        datatype=["str", "str", "number", "number", "number", "number", "number", "number", "number", "number", "str", "str", "str", "str", "str", "str", "str", "str", "str"],
        interactive=True,  # set to true if you want to edit or select rows for deletion
        # height=400,  # removed height parameter
        wrap=True
    )
    # If 'acumulado_table' needs a specific height, try CSS or check Gradio docs for your version's method.
    # For Gradio 3.x: acumulado_table.style(height=400) might work if needed after the definition.
    stats_output = gr.Markdown(label="Estatísticas dos Dados Acumulados")
    # Button wiring
    extracted_endereco_state = gr.State()
    extracted_valor_state = gr.State()  # stores the numerical price for screenshot naming
    submit_button.click(
        smart_fetch_url_info,  # use the Playwright-based function
        inputs=user_input,
        outputs=[output_table, output_text, extracted_endereco_state, extracted_valor_state]
    )
    clear_button.click(
        clear_fields,
        outputs=[user_input, output_table, output_text, show_text_checkbox, screenshot_output, topo_drop, rel_drop, sup_drop, apr_drop, ace_drop, ic_drop, pad_drop, var_drop]
    )
    show_text_checkbox.change(
        toggle_output_text,
        inputs=[show_text_checkbox, output_text],  # pass current output_text to preserve it
        outputs=[output_text, output_text]  # update visibility and content
    )
    screenshot_button.click(
        take_screenshot,
        inputs=[user_input, extracted_endereco_state, extracted_valor_state],  # user_input is the original URL
        outputs=screenshot_output
    )
    add_data.click(
        adicionar_ao_acumulado,
        inputs=[output_table, df_acumulado_state, topo_drop, rel_drop, sup_drop, apr_drop, ace_drop, ic_drop, pad_drop, var_drop],
        outputs=[df_acumulado_state, acumulado_table, stats_output]
    )
    delete_data.click(
        excluir_dados_vazios,  # this function needs to know which row to delete
        inputs=acumulado_table,  # pass the editable dataframe
        outputs=[df_acumulado_state, acumulado_table]  # update state and table
    )
    # Modify excluir_dados_vazios to work with selected row from editable DataFrame.
    # For row deletion, you'd typically make 'acumulado_table' interactive and get selected index.
    # The current 'excluir_dados_vazios' removes ALL rows with empty "Dado" field, not selected ones.
    # To delete a selected row, 'acumulado_table' needs to be interactive and you'd get the selection event.
    # For simplicity, the current `excluir_dados_vazios` is kept, but it's not ideal for targeted deletion.
    # A more advanced deletion would require handling selection events from the DataFrame.
app.launch(share=True, debug=True)  # debug=True for more detailed logs during development