import requests
from bs4 import BeautifulSoup
import json
import re
import time
from urllib.parse import urljoin, quote
import logging
import urllib3
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Disable SSL warnings
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class PhoneDBScraper:
    def __init__(self):
        self.base_url = "https://phonedb.net"
        self.session = requests.Session()
        
        # Configure session with better headers and SSL handling
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        })
        
        # Set up retry strategy
        retry_strategy = Retry(
            total=3,
            status_forcelist=[429, 500, 502, 503, 504],
            allowed_methods=["HEAD", "GET", "OPTIONS"],  # Updated parameter name
            backoff_factor=1
        )
        
        adapter = HTTPAdapter(max_retries=retry_strategy)
        self.session.mount("http://", adapter)
        self.session.mount("https://", adapter)
        
        # Disable SSL verification (use with caution)
        self.session.verify = False
        
    def search_phone(self, phone_name):
        """Search for a phone by name and return search results"""
        # Try different search approaches
        search_urls = [
            f"{self.base_url}/index.php?m=device&s=query&q={quote(phone_name)}",
            f"{self.base_url}/search?q={quote(phone_name)}",
            f"{self.base_url}/index.php?m=device&s=list&q={quote(phone_name)}"
        ]
        
        for search_url in search_urls:
            try:
                logger.info(f"Trying search URL: {search_url}")
                response = self.session.get(search_url, timeout=30)
                response.raise_for_status()
                
                soup = BeautifulSoup(response.content, 'html.parser')
                
                # Find search results with multiple selectors
                results = []
                
                # Look for various possible result containers
                selectors = [
                    'div.device-item',
                    'div.device',
                    'div.phone-item', 
                    'tr[onclick*="device"]',
                    'a[href*="device"]',
                    'a[href*="phone"]',
                    'td a[href*="index.php"]'
                ]
                
                search_results = []
                for selector in selectors:
                    found = soup.select(selector)
                    if found:
                        search_results.extend(found)
                        break
                
                # Also try finding links with device IDs
                if not search_results:
                    search_results = soup.find_all('a', href=re.compile(r'(device|phone|id=\d+)'))
                
                for result in search_results[:10]:  # Limit to first 10 results
                    title = ""
                    link = ""
                    
                    if result.name == 'a':
                        link = result.get('href', '')
                        title = result.get_text(strip=True) or result.get('title', '')
                    elif result.name in ['div', 'tr']:
                        link_elem = result.find('a')
                        if link_elem:
                            link = link_elem.get('href', '')
                            title = link_elem.get_text(strip=True) or result.get_text(strip=True)
                        else:
                            # Check for onclick events with device info
                            onclick = result.get('onclick', '')
                            if 'device' in onclick:
                                # Extract device ID from onclick
                                device_match = re.search(r'id=(\d+)', onclick)
                                if device_match:
                                    link = f"/index.php?m=device&id={device_match.group(1)}"
                                    title = result.get_text(strip=True)
                    
                    # Clean up the link and title
                    if link and title:
                        # Clean title
                        title = re.sub(r'\s+', ' ', title).strip()
                        
                        # Ensure absolute URL
                        if link.startswith('/'):
                            link = self.base_url + link
                        elif not link.startswith('http'):
                            link = f"{self.base_url}/{link}"
                        
                        # Filter relevant results
                        if any(word.lower() in title.lower() for word in phone_name.split()):
                            results.append({
                                'title': title,
                                'url': link
                            })
                
                if results:
                    logger.info(f"Found {len(results)} results using URL: {search_url}")
                    return results
                    
            except Exception as e:
                logger.warning(f"Search URL failed {search_url}: {e}")
                continue
        
        logger.error(f"All search methods failed for: {phone_name}")
        return []
    
    def get_phone_specs(self, phone_url):
        """Extract detailed specifications from a phone page"""
        try:
            logger.info(f"Fetching specs from: {phone_url}")
            response = self.session.get(phone_url, timeout=30)
            response.raise_for_status()
            
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # Extract phone data
            phone_data = {
                'name': '',
                'brand': '',
                'images': [],
                'specifications': {},
                'source_url': phone_url
            }
            
            # Get phone name from multiple possible locations
            title_candidates = [
                soup.find('h1'),
                soup.find('h2'), 
                soup.find('title'),
                soup.find('div', class_=re.compile(r'title|name|header')),
                soup.find('td', string=re.compile(r'Model|Name', re.I))
            ]
            
            for candidate in title_candidates:
                if candidate:
                    title = candidate.get_text(strip=True)
                    if title and len(title) > 3:
                        phone_data['name'] = title
                        break
            
            # Extract brand from title or URL
            if phone_data['name']:
                phone_data['brand'] = phone_data['name'].split()[0]
            
            # Get images with multiple approaches
            images = []
            
            # Look for images in various containers
            img_selectors = [
                'img[src*="phone"]',
                'img[src*="device"]', 
                'img[src*="mobile"]',
                'img[alt*="phone"]',
                'img[alt*="device"]',
                '.device-image img',
                '.phone-image img',
                'td img',
                'div img'
            ]
            
            for selector in img_selectors:
                imgs = soup.select(selector)
                for img in imgs:
                    src = img.get('src', '')
                    if src:
                        # Convert relative URLs to absolute
                        if src.startswith('/'):
                            img_url = self.base_url + src
                        elif not src.startswith('http'):
                            img_url = f"{self.base_url}/{src}"
                        else:
                            img_url = src
                        
                        # Avoid duplicates and filter out tiny images
                        if img_url not in images and not any(x in src.lower() for x in ['icon', 'logo', 'button', 'spacer']):
                            images.append(img_url)
            
            phone_data['images'] = images[:5]  # Limit to 5 images
            
            # Extract specifications using multiple methods
            specs = {}
            
            # Method 1: PhoneDB specific table structure
            spec_tables = soup.find_all('table')
            for table in spec_tables:
                rows = table.find_all('tr')
                for row in rows:
                    cells = row.find_all(['td', 'th'])
                    if len(cells) >= 2:
                        key = cells[0].get_text(strip=True)
                        value = cells[1].get_text(strip=True)
                        
                        # Clean up key and value
                        key = re.sub(r'[^\w\s]', '', key).strip()
                        value = re.sub(r'\s+', ' ', value).strip()
                        
                        if key and value and len(key) < 100 and len(value) < 500:
                            specs[key] = value
            
            # Method 2: Look for labeled specifications
            labeled_specs = soup.find_all(['dt', 'label', 'b', 'strong'])
            for label in labeled_specs:
                label_text = label.get_text(strip=True)
                if ':' in label_text:
                    key, value = label_text.split(':', 1)
                    specs[key.strip()] = value.strip()
                else:
                    # Look for value in next sibling
                    sibling = label.find_next_sibling()
                    if sibling:
                        value = sibling.get_text(strip=True)
                        if value:
                            specs[label_text] = value
            
            # Method 3: Extract common phone specifications from text
            text_content = soup.get_text()
            
            # Updated patterns for better matching
            spec_patterns = {
                'Display Size': r'(\d+\.?\d*)\s*(?:inch|"|″)',
                'Display Resolution': r'(\d+)\s*[x×]\s*(\d+)',
                'RAM': r'(\d+)\s*GB\s*(?:RAM|Memory)',
                'Storage': r'(\d+)\s*GB\s*(?:storage|internal|ROM)',
                'Battery': r'(\d+)\s*mAh',
                'Main Camera': r'(\d+(?:\.\d+)?)\s*MP(?:\s+main|\s+primary|\s+rear)?',
                'Front Camera': r'(\d+(?:\.\d+)?)\s*MP\s*(?:front|selfie|secondary)',
                'Operating System': r'(Android|iOS)\s*[\d\.]*',
                'Processor': r'(Snapdragon|Exynos|A\d+|Kirin|MediaTek|Dimensity)\s*[\w\d\s]*',
                'Network': r'(2G|3G|4G|5G|LTE)',
                'Weight': r'(\d+)\s*(?:g|gram)',
                'Dimensions': r'(\d+\.?\d*)\s*[x×]\s*(\d+\.?\d*)\s*[x×]\s*(\d+\.?\d*)\s*mm'
            }
            
            for spec_name, pattern in spec_patterns.items():
                if spec_name not in specs:  # Don't override existing specs
                    matches = re.findall(pattern, text_content, re.IGNORECASE)
                    if matches:
                        if spec_name == 'Display Resolution':
                            specs[spec_name] = f"{matches[0][0]}x{matches[0][1]}"
                        elif spec_name == 'Dimensions':
                            specs[spec_name] = f"{matches[0][0]}×{matches[0][1]}×{matches[0][2]} mm"
                        else:
                            specs[spec_name] = matches[0] if isinstance(matches[0], str) else str(matches[0])
            
            phone_data['specifications'] = specs
            
            logger.info(f"Extracted {len(specs)} specifications for {phone_data.get('name', 'Unknown')}")
            return phone_data
            
        except Exception as e:
            logger.error(f"Error extracting specs from {phone_url}: {e}")
            return None
    
    def scrape_phone_by_name(self, phone_name, get_first_result=True):
        """Main method to scrape phone specs by name"""
        logger.info(f"Searching for: {phone_name}")
        
        # Search for the phone
        search_results = self.search_phone(phone_name)
        
        if not search_results:
            logger.warning(f"No results found for: {phone_name}")
            return None
        
        results = []
        
        # Process results
        targets = [search_results[0]] if get_first_result else search_results
        
        for result in targets:
            logger.info(f"Scraping: {result['title']}")
            
            phone_data = self.get_phone_specs(result['url'])
            if phone_data:
                results.append(phone_data)
                
            # Be respectful with requests
            time.sleep(1)
        
        return results[0] if get_first_result and results else results

    def scrape_multiple_phones(self, phone_names):
        """Scrape multiple phones and return structured JSON"""
        all_phones = []
        
        for phone_name in phone_names:
            try:
                phone_data = self.scrape_phone_by_name(phone_name)
                if phone_data:
                    all_phones.append(phone_data)
                time.sleep(2)  # Be respectful between requests
            except Exception as e:
                logger.error(f"Error scraping {phone_name}: {e}")
                continue
        
        return all_phones

    def save_to_json(self, data, filename):
        """Save data to JSON file"""
        try:
            with open(filename, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=2, ensure_ascii=False)
            logger.info(f"Data saved to {filename}")
        except Exception as e:
            logger.error(f"Error saving to JSON: {e}")

# Example usage with error handling and alternative sites
def main():
    scraper = PhoneDBScraper()
    
    # Example 1: Scrape a single phone
    phone_name = "iPhone 15 Pro"
    print(f"Attempting to scrape: {phone_name}")
    
    result = scraper.scrape_phone_by_name(phone_name)
    
    if result:
        print(f"✅ Successfully scraped {result['name']}")
        print(f"Found {len(result['specifications'])} specifications")
        print(f"Found {len(result['images'])} images")
        print(json.dumps(result, indent=2))
        scraper.save_to_json(result, f"{phone_name.replace(' ', '_')}_specs.json")
    else:
        print(f"❌ Failed to scrape {phone_name}")
        print("This might be due to:")
        print("1. PhoneDB.net blocking automated requests")
        print("2. Phone not found in their database")
        print("3. Site structure changes")
        print("\nAlternative solutions:")
        print("- Try with a different phone name")
        print("- Use a VPN if blocked by IP")
        print("- Consider using alternative sites like GSMArena")
    
    # Example 2: Test with multiple phones
    phone_list = [
        "Samsung Galaxy S24",
        "Google Pixel 8", 
        "OnePlus 12"
    ]
    
    print(f"\nTesting multiple phones: {phone_list}")
    results = scraper.scrape_multiple_phones(phone_list)
    
    if results:
        scraper.save_to_json(results, "multiple_phones_specs.json")
        print(f"✅ Successfully scraped {len(results)}/{len(phone_list)} phones")
        
        for phone in results:
            print(f"- {phone['name']}: {len(phone['specifications'])} specs, {len(phone['images'])} images")
    else:
        print("❌ No phones were successfully scraped")

# Enhanced GSMArena scraper as main alternative
class GSMArenaScraperAlternative:
    """Enhanced GSMArena scraper with full functionality"""
    
    def __init__(self):
        self.base_url = "https://www.gsmarena.com"
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
        })
    
    def search_phone(self, phone_name):
        """Search GSMArena for phone"""
        search_url = f"{self.base_url}/results.php3"
        params = {'sQuickSearch': 'yes', 'sName': phone_name}
        
        try:
            logger.info(f"Searching GSMArena for: {phone_name}")
            response = self.session.get(search_url, params=params, timeout=30)
            response.raise_for_status()
            
            soup = BeautifulSoup(response.content, 'html.parser')
            results = []
            
            # Find search results in makers section
            makers = soup.find_all('div', class_='makers')
            for maker in makers:
                links = maker.find_all('a')
                for link in links[:5]:  # Limit results
                    href = link.get('href', '')
                    title = link.get_text(strip=True)
                    
                    if href and title and phone_name.lower().replace(' ', '') in title.lower().replace(' ', ''):
                        full_url = self.base_url + '/' + href if not href.startswith('http') else href
                        results.append({
                            'title': title,
                            'url': full_url
                        })
            
            logger.info(f"Found {len(results)} results on GSMArena")
            return results
            
        except Exception as e:
            logger.error(f"GSMArena search failed: {e}")
            return []
    
    def get_phone_specs(self, phone_url):
        """Extract detailed specifications from GSMArena phone page"""
        try:
            logger.info(f"Fetching specs from GSMArena: {phone_url}")
            response = self.session.get(phone_url, timeout=30)
            response.raise_for_status()
            
            soup = BeautifulSoup(response.content, 'html.parser')
            
            phone_data = {
                'name': '',
                'brand': '',
                'images': [],
                'specifications': {},
                'source_url': phone_url
            }
            
            # Get phone name
            title_elem = soup.find('h1', class_='specs-phone-name-title')
            if not title_elem:
                title_elem = soup.find('h1') or soup.find('title')
            
            if title_elem:
                phone_data['name'] = title_elem.get_text(strip=True)
                phone_data['brand'] = phone_data['name'].split()[0] if phone_data['name'] else ''
            
            # Get images
            images = []
            
            # Main phone image
            main_img_container = soup.find('div', class_='specs-photo-main')
            if main_img_container:
                img = main_img_container.find('img')
                if img and img.get('src'):
                    img_url = urljoin(phone_url, img['src'])
                    images.append(img_url)
            
            # Additional images from carousel or gallery
            carousel = soup.find('div', class_='carousel-item') or soup.find('div', class_='specs-photos')
            if carousel:
                for img in carousel.find_all('img'):
                    src = img.get('src', '')
                    if src:
                        img_url = urljoin(phone_url, src)
                        if img_url not in images:
                            images.append(img_url)
            
            phone_data['images'] = images[:5]
            
            # Extract specifications from GSMArena's table structure
            specs = {}
            
            # GSMArena uses specific table structure
            spec_tables = soup.find_all('table', cellspacing='0')
            
            for table in spec_tables:
                # Get category header
                category = ''
                category_elem = table.find_previous('th') or table.find_previous('h2')
                if category_elem:
                    category = category_elem.get_text(strip=True)
                
                rows = table.find_all('tr')
                for row in rows:
                    cells = row.find_all(['td', 'th'])
                    if len(cells) >= 2:
                        key = cells[0].get_text(strip=True)
                        value = cells[1].get_text(strip=True)
                        
                        # Clean up the key and value
                        key = re.sub(r'[^\w\s]', '', key).strip()
                        value = re.sub(r'\s+', ' ', value).strip()
                        
                        if key and value and len(key) < 100:
                            # Add category prefix if available
                            final_key = f"{category} - {key}" if category and len(category) < 30 else key
                            specs[final_key] = value
            
            # Also extract from the detailed specs list structure
            detail_lists = soup.find_all(['ul', 'li'], class_=re.compile(r'spec|detail'))
            for detail_list in detail_lists:
                items = detail_list.find_all('li') if detail_list.name == 'ul' else [detail_list]
                for item in items:
                    text = item.get_text(strip=True)
                    if ':' in text:
                        parts = text.split(':', 1)
                        if len(parts) == 2:
                            key, value = parts
                            specs[key.strip()] = value.strip()
            
            # Extract key specs using patterns from page text
            page_text = soup.get_text()
            
            key_patterns = {
                'Display Size': r'(\d+\.?\d*)\s*(?:inch|")\s*display',
                'Display Resolution': r'(\d+)\s*[x×]\s*(\d+)\s*pixels',
                'RAM': r'(\d+)\s*GB\s*RAM',
                'Storage': r'(\d+)\s*GB\s*(?:storage|internal)',
                'Battery Capacity': r'(\d+)\s*mAh',
                'Main Camera': r'(\d+(?:\.\d+)?)\s*MP\s*(?:main|primary|rear)',
                'Front Camera': r'(\d+(?:\.\d+)?)\s*MP\s*front',
                'Operating System': r'(Android|iOS)\s*([\d\.]+)?',
                'Chipset': r'(Snapdragon|Exynos|A\d+|Kirin|MediaTek|Dimensity)\s*([\w\d\s]+)?',
                'Weight': r'(\d+)\s*g\s*weight',
                'Launch Date': r'(January|February|March|April|May|June|July|August|September|October|November|December)\s*(\d{4})'
            }
            
            for spec_name, pattern in key_patterns.items():
                if spec_name not in specs:
                    match = re.search(pattern, page_text, re.IGNORECASE)
                    if match:
                        if spec_name == 'Display Resolution':
                            specs[spec_name] = f"{match.group(1)}×{match.group(2)}"
                        elif spec_name == 'Launch Date':
                            specs[spec_name] = f"{match.group(1)} {match.group(2)}"
                        else:
                            specs[spec_name] = match.group(0)
            
            phone_data['specifications'] = specs
            logger.info(f"Extracted {len(specs)} specifications for {phone_data.get('name', 'Unknown')}")
            
            return phone_data
            
        except Exception as e:
            logger.error(f"Error extracting GSMArena specs from {phone_url}: {e}")
            return None
    
    def scrape_phone_by_name(self, phone_name, get_first_result=True):
        """Main method to scrape phone specs by name from GSMArena"""
        search_results = self.search_phone(phone_name)
        
        if not search_results:
            logger.warning(f"No results found for: {phone_name}")
            return None
        
        results = []
        targets = [search_results[0]] if get_first_result else search_results
        
        for result in targets:
            logger.info(f"Scraping: {result['title']}")
            phone_data = self.get_phone_specs(result['url'])
            if phone_data:
                results.append(phone_data)
            time.sleep(2)  # Be respectful
        
        return results[0] if get_first_result and results else results
    
    def scrape_multiple_phones(self, phone_names):
        """Scrape multiple phones from GSMArena"""
        all_phones = []
        
        for phone_name in phone_names:
            try:
                phone_data = self.scrape_phone_by_name(phone_name)
                if phone_data:
                    all_phones.append(phone_data)
                time.sleep(3)  # Be respectful between requests
            except Exception as e:
                logger.error(f"Error scraping {phone_name}: {e}")
                continue
        
        return all_phones
    
    def save_to_json(self, data, filename):
        """Save data to JSON file"""
        try:
            with open(filename, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=2, ensure_ascii=False)
            logger.info(f"Data saved to {filename}")
        except Exception as e:
            logger.error(f"Error saving to JSON: {e}")

def test_alternative_scraper():
    """Test the enhanced GSMArena scraper"""
    print("\n" + "="*50)
    print("Testing Enhanced GSMArena Scraper")
    print("="*50)
    
    gsm_scraper = GSMArenaScraperAlternative()
    
    # Test single phone
    phone_name = "iPhone 15 Pro"
    print(f"Testing single phone: {phone_name}")
    
    result = gsm_scraper.scrape_phone_by_name(phone_name)
    
    if result:
        print(f"✅ Successfully scraped: {result['name']}")
        print(f"📱 Found {len(result['specifications'])} specifications")
        print(f"🖼️ Found {len(result['images'])} images")
        
        # Show some key specs
        key_specs = ['Display Size', 'RAM', 'Storage', 'Battery Capacity', 'Main Camera']
        print("\n📋 Key Specifications:")
        for spec in key_specs:
            for key, value in result['specifications'].items():
                if spec.lower() in key.lower():
                    print(f"  • {key}: {value}")
                    break
        
        # Save result
        gsm_scraper.save_to_json(result, f"{phone_name.replace(' ', '_')}_gsmarena_specs.json")
        
    else:
        print(f"❌ Failed to scrape {phone_name}")
    
    # Test multiple phones
    print(f"\n" + "-"*40)
    print("Testing Multiple Phones")
    print("-"*40)
    
    phone_list = ["Samsung Galaxy S24", "Google Pixel 8"]
    results = gsm_scraper.scrape_multiple_phones(phone_list)
    
    if results:
        print(f"✅ Successfully scraped {len(results)}/{len(phone_list)} phones")
        gsm_scraper.save_to_json(results, "multiple_phones_gsmarena_specs.json")
        
        for phone in results:
            print(f"📱 {phone['name']}: {len(phone['specifications'])} specs, {len(phone['images'])} images")
    else:
        print("❌ No phones were successfully scraped")

# Main function with both scrapers
def main():
    print("🚀 Phone Specifications Scraper")
    print("="*50)
    
    # Try PhoneDB first
    try:
        print("Attempting PhoneDB scraper...")
        scraper = PhoneDBScraper()
        phone_name = "iPhone 15 Pro"
        result = scraper.scrape_phone_by_name(phone_name)
        
        if result:
            print(f"✅ PhoneDB: Successfully scraped {result['name']}")
            scraper.save_to_json(result, f"{phone_name.replace(' ', '_')}_phonedb_specs.json")
            return
        else:
            print("❌ PhoneDB scraper failed, trying GSMArena...")
            
    except Exception as e:
        print(f"❌ PhoneDB initialization failed: {str(e)}")
        print("🔄 Switching to GSMArena scraper...")
    
    # Use GSMArena as fallback
    test_alternative_scraper()

if __name__ == "__main__":
#    main()
    # Uncomment the line below to test GSMArena alternative
     test_alternative_scraper()