import requests from bs4 import BeautifulSoup import json import re import time from urllib.parse import urljoin, quote import logging import urllib3 from requests.adapters import HTTPAdapter from urllib3.util.retry import Retry # Disable SSL warnings urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) # Set up logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) class PhoneDBScraper: def __init__(self): self.base_url = "https://phonedb.net" self.session = requests.Session() # Configure session with better headers and SSL handling self.session.headers.update({ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 'Accept-Language': 'en-US,en;q=0.9', 'Accept-Encoding': 'gzip, deflate, br', 'DNT': '1', 'Connection': 'keep-alive', 'Upgrade-Insecure-Requests': '1', }) # Set up retry strategy retry_strategy = Retry( total=3, status_forcelist=[429, 500, 502, 503, 504], allowed_methods=["HEAD", "GET", "OPTIONS"], # Updated parameter name backoff_factor=1 ) adapter = HTTPAdapter(max_retries=retry_strategy) self.session.mount("http://", adapter) self.session.mount("https://", adapter) # Disable SSL verification (use with caution) self.session.verify = False def search_phone(self, phone_name): """Search for a phone by name and return search results""" # Try different search approaches search_urls = [ f"{self.base_url}/index.php?m=device&s=query&q={quote(phone_name)}", f"{self.base_url}/search?q={quote(phone_name)}", f"{self.base_url}/index.php?m=device&s=list&q={quote(phone_name)}" ] for search_url in search_urls: try: logger.info(f"Trying search URL: {search_url}") response = self.session.get(search_url, timeout=30) response.raise_for_status() soup = BeautifulSoup(response.content, 'html.parser') # Find search results with multiple selectors results = [] # Look for various possible result containers selectors = [ 'div.device-item', 'div.device', 'div.phone-item', 'tr[onclick*="device"]', 'a[href*="device"]', 'a[href*="phone"]', 'td a[href*="index.php"]' ] search_results = [] for selector in selectors: found = soup.select(selector) if found: search_results.extend(found) break # Also try finding links with device IDs if not search_results: search_results = soup.find_all('a', href=re.compile(r'(device|phone|id=\d+)')) for result in search_results[:10]: # Limit to first 10 results title = "" link = "" if result.name == 'a': link = result.get('href', '') title = result.get_text(strip=True) or result.get('title', '') elif result.name in ['div', 'tr']: link_elem = result.find('a') if link_elem: link = link_elem.get('href', '') title = link_elem.get_text(strip=True) or result.get_text(strip=True) else: # Check for onclick events with device info onclick = result.get('onclick', '') if 'device' in onclick: # Extract device ID from onclick device_match = re.search(r'id=(\d+)', onclick) if device_match: link = f"/index.php?m=device&id={device_match.group(1)}" title = result.get_text(strip=True) # Clean up the link and title if link and title: # Clean title title = re.sub(r'\s+', ' ', title).strip() # Ensure absolute URL if link.startswith('/'): link = self.base_url + link elif not link.startswith('http'): link = f"{self.base_url}/{link}" # Filter relevant results if any(word.lower() in title.lower() for word in phone_name.split()): results.append({ 'title': title, 'url': link }) if results: logger.info(f"Found {len(results)} results using URL: {search_url}") return results except Exception as e: logger.warning(f"Search URL failed {search_url}: {e}") continue logger.error(f"All search methods failed for: {phone_name}") return [] def get_phone_specs(self, phone_url): """Extract detailed specifications from a phone page""" try: logger.info(f"Fetching specs from: {phone_url}") response = self.session.get(phone_url, timeout=30) response.raise_for_status() soup = BeautifulSoup(response.content, 'html.parser') # Extract phone data phone_data = { 'name': '', 'brand': '', 'images': [], 'specifications': {}, 'source_url': phone_url } # Get phone name from multiple possible locations title_candidates = [ soup.find('h1'), soup.find('h2'), soup.find('title'), soup.find('div', class_=re.compile(r'title|name|header')), soup.find('td', string=re.compile(r'Model|Name', re.I)) ] for candidate in title_candidates: if candidate: title = candidate.get_text(strip=True) if title and len(title) > 3: phone_data['name'] = title break # Extract brand from title or URL if phone_data['name']: phone_data['brand'] = phone_data['name'].split()[0] # Get images with multiple approaches images = [] # Look for images in various containers img_selectors = [ 'img[src*="phone"]', 'img[src*="device"]', 'img[src*="mobile"]', 'img[alt*="phone"]', 'img[alt*="device"]', '.device-image img', '.phone-image img', 'td img', 'div img' ] for selector in img_selectors: imgs = soup.select(selector) for img in imgs: src = img.get('src', '') if src: # Convert relative URLs to absolute if src.startswith('/'): img_url = self.base_url + src elif not src.startswith('http'): img_url = f"{self.base_url}/{src}" else: img_url = src # Avoid duplicates and filter out tiny images if img_url not in images and not any(x in src.lower() for x in ['icon', 'logo', 'button', 'spacer']): images.append(img_url) phone_data['images'] = images[:5] # Limit to 5 images # Extract specifications using multiple methods specs = {} # Method 1: PhoneDB specific table structure spec_tables = soup.find_all('table') for table in spec_tables: rows = table.find_all('tr') for row in rows: cells = row.find_all(['td', 'th']) if len(cells) >= 2: key = cells[0].get_text(strip=True) value = cells[1].get_text(strip=True) # Clean up key and value key = re.sub(r'[^\w\s]', '', key).strip() value = re.sub(r'\s+', ' ', value).strip() if key and value and len(key) < 100 and len(value) < 500: specs[key] = value # Method 2: Look for labeled specifications labeled_specs = soup.find_all(['dt', 'label', 'b', 'strong']) for label in labeled_specs: label_text = label.get_text(strip=True) if ':' in label_text: key, value = label_text.split(':', 1) specs[key.strip()] = value.strip() else: # Look for value in next sibling sibling = label.find_next_sibling() if sibling: value = sibling.get_text(strip=True) if value: specs[label_text] = value # Method 3: Extract common phone specifications from text text_content = soup.get_text() # Updated patterns for better matching spec_patterns = { 'Display Size': r'(\d+\.?\d*)\s*(?:inch|"|″)', 'Display Resolution': r'(\d+)\s*[x×]\s*(\d+)', 'RAM': r'(\d+)\s*GB\s*(?:RAM|Memory)', 'Storage': r'(\d+)\s*GB\s*(?:storage|internal|ROM)', 'Battery': r'(\d+)\s*mAh', 'Main Camera': r'(\d+(?:\.\d+)?)\s*MP(?:\s+main|\s+primary|\s+rear)?', 'Front Camera': r'(\d+(?:\.\d+)?)\s*MP\s*(?:front|selfie|secondary)', 'Operating System': r'(Android|iOS)\s*[\d\.]*', 'Processor': r'(Snapdragon|Exynos|A\d+|Kirin|MediaTek|Dimensity)\s*[\w\d\s]*', 'Network': r'(2G|3G|4G|5G|LTE)', 'Weight': r'(\d+)\s*(?:g|gram)', 'Dimensions': r'(\d+\.?\d*)\s*[x×]\s*(\d+\.?\d*)\s*[x×]\s*(\d+\.?\d*)\s*mm' } for spec_name, pattern in spec_patterns.items(): if spec_name not in specs: # Don't override existing specs matches = re.findall(pattern, text_content, re.IGNORECASE) if matches: if spec_name == 'Display Resolution': specs[spec_name] = f"{matches[0][0]}x{matches[0][1]}" elif spec_name == 'Dimensions': specs[spec_name] = f"{matches[0][0]}×{matches[0][1]}×{matches[0][2]} mm" else: specs[spec_name] = matches[0] if isinstance(matches[0], str) else str(matches[0]) phone_data['specifications'] = specs logger.info(f"Extracted {len(specs)} specifications for {phone_data.get('name', 'Unknown')}") return phone_data except Exception as e: logger.error(f"Error extracting specs from {phone_url}: {e}") return None def scrape_phone_by_name(self, phone_name, get_first_result=True): """Main method to scrape phone specs by name""" logger.info(f"Searching for: {phone_name}") # Search for the phone search_results = self.search_phone(phone_name) if not search_results: logger.warning(f"No results found for: {phone_name}") return None results = [] # Process results targets = [search_results[0]] if get_first_result else search_results for result in targets: logger.info(f"Scraping: {result['title']}") phone_data = self.get_phone_specs(result['url']) if phone_data: results.append(phone_data) # Be respectful with requests time.sleep(1) return results[0] if get_first_result and results else results def scrape_multiple_phones(self, phone_names): """Scrape multiple phones and return structured JSON""" all_phones = [] for phone_name in phone_names: try: phone_data = self.scrape_phone_by_name(phone_name) if phone_data: all_phones.append(phone_data) time.sleep(2) # Be respectful between requests except Exception as e: logger.error(f"Error scraping {phone_name}: {e}") continue return all_phones def save_to_json(self, data, filename): """Save data to JSON file""" try: with open(filename, 'w', encoding='utf-8') as f: json.dump(data, f, indent=2, ensure_ascii=False) logger.info(f"Data saved to {filename}") except Exception as e: logger.error(f"Error saving to JSON: {e}") # Example usage with error handling and alternative sites def main(): scraper = PhoneDBScraper() # Example 1: Scrape a single phone phone_name = "iPhone 15 Pro" print(f"Attempting to scrape: {phone_name}") result = scraper.scrape_phone_by_name(phone_name) if result: print(f"✅ Successfully scraped {result['name']}") print(f"Found {len(result['specifications'])} specifications") print(f"Found {len(result['images'])} images") print(json.dumps(result, indent=2)) scraper.save_to_json(result, f"{phone_name.replace(' ', '_')}_specs.json") else: print(f"❌ Failed to scrape {phone_name}") print("This might be due to:") print("1. PhoneDB.net blocking automated requests") print("2. Phone not found in their database") print("3. Site structure changes") print("\nAlternative solutions:") print("- Try with a different phone name") print("- Use a VPN if blocked by IP") print("- Consider using alternative sites like GSMArena") # Example 2: Test with multiple phones phone_list = [ "Samsung Galaxy S24", "Google Pixel 8", "OnePlus 12" ] print(f"\nTesting multiple phones: {phone_list}") results = scraper.scrape_multiple_phones(phone_list) if results: scraper.save_to_json(results, "multiple_phones_specs.json") print(f"✅ Successfully scraped {len(results)}/{len(phone_list)} phones") for phone in results: print(f"- {phone['name']}: {len(phone['specifications'])} specs, {len(phone['images'])} images") else: print("❌ No phones were successfully scraped") # Enhanced GSMArena scraper as main alternative class GSMArenaScraperAlternative: """Enhanced GSMArena scraper with full functionality""" def __init__(self): self.base_url = "https://www.gsmarena.com" self.session = requests.Session() self.session.headers.update({ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Language': 'en-US,en;q=0.5', 'Accept-Encoding': 'gzip, deflate', 'Connection': 'keep-alive', }) def search_phone(self, phone_name): """Search GSMArena for phone""" search_url = f"{self.base_url}/results.php3" params = {'sQuickSearch': 'yes', 'sName': phone_name} try: logger.info(f"Searching GSMArena for: {phone_name}") response = self.session.get(search_url, params=params, timeout=30) response.raise_for_status() soup = BeautifulSoup(response.content, 'html.parser') results = [] # Find search results in makers section makers = soup.find_all('div', class_='makers') for maker in makers: links = maker.find_all('a') for link in links[:5]: # Limit results href = link.get('href', '') title = link.get_text(strip=True) if href and title and phone_name.lower().replace(' ', '') in title.lower().replace(' ', ''): full_url = self.base_url + '/' + href if not href.startswith('http') else href results.append({ 'title': title, 'url': full_url }) logger.info(f"Found {len(results)} results on GSMArena") return results except Exception as e: logger.error(f"GSMArena search failed: {e}") return [] def get_phone_specs(self, phone_url): """Extract detailed specifications from GSMArena phone page""" try: logger.info(f"Fetching specs from GSMArena: {phone_url}") response = self.session.get(phone_url, timeout=30) response.raise_for_status() soup = BeautifulSoup(response.content, 'html.parser') phone_data = { 'name': '', 'brand': '', 'images': [], 'specifications': {}, 'source_url': phone_url } # Get phone name title_elem = soup.find('h1', class_='specs-phone-name-title') if not title_elem: title_elem = soup.find('h1') or soup.find('title') if title_elem: phone_data['name'] = title_elem.get_text(strip=True) phone_data['brand'] = phone_data['name'].split()[0] if phone_data['name'] else '' # Get images images = [] # Main phone image main_img_container = soup.find('div', class_='specs-photo-main') if main_img_container: img = main_img_container.find('img') if img and img.get('src'): img_url = urljoin(phone_url, img['src']) images.append(img_url) # Additional images from carousel or gallery carousel = soup.find('div', class_='carousel-item') or soup.find('div', class_='specs-photos') if carousel: for img in carousel.find_all('img'): src = img.get('src', '') if src: img_url = urljoin(phone_url, src) if img_url not in images: images.append(img_url) phone_data['images'] = images[:5] # Extract specifications from GSMArena's table structure specs = {} # GSMArena uses specific table structure spec_tables = soup.find_all('table', cellspacing='0') for table in spec_tables: # Get category header category = '' category_elem = table.find_previous('th') or table.find_previous('h2') if category_elem: category = category_elem.get_text(strip=True) rows = table.find_all('tr') for row in rows: cells = row.find_all(['td', 'th']) if len(cells) >= 2: key = cells[0].get_text(strip=True) value = cells[1].get_text(strip=True) # Clean up the key and value key = re.sub(r'[^\w\s]', '', key).strip() value = re.sub(r'\s+', ' ', value).strip() if key and value and len(key) < 100: # Add category prefix if available final_key = f"{category} - {key}" if category and len(category) < 30 else key specs[final_key] = value # Also extract from the detailed specs list structure detail_lists = soup.find_all(['ul', 'li'], class_=re.compile(r'spec|detail')) for detail_list in detail_lists: items = detail_list.find_all('li') if detail_list.name == 'ul' else [detail_list] for item in items: text = item.get_text(strip=True) if ':' in text: parts = text.split(':', 1) if len(parts) == 2: key, value = parts specs[key.strip()] = value.strip() # Extract key specs using patterns from page text page_text = soup.get_text() key_patterns = { 'Display Size': r'(\d+\.?\d*)\s*(?:inch|")\s*display', 'Display Resolution': r'(\d+)\s*[x×]\s*(\d+)\s*pixels', 'RAM': r'(\d+)\s*GB\s*RAM', 'Storage': r'(\d+)\s*GB\s*(?:storage|internal)', 'Battery Capacity': r'(\d+)\s*mAh', 'Main Camera': r'(\d+(?:\.\d+)?)\s*MP\s*(?:main|primary|rear)', 'Front Camera': r'(\d+(?:\.\d+)?)\s*MP\s*front', 'Operating System': r'(Android|iOS)\s*([\d\.]+)?', 'Chipset': r'(Snapdragon|Exynos|A\d+|Kirin|MediaTek|Dimensity)\s*([\w\d\s]+)?', 'Weight': r'(\d+)\s*g\s*weight', 'Launch Date': r'(January|February|March|April|May|June|July|August|September|October|November|December)\s*(\d{4})' } for spec_name, pattern in key_patterns.items(): if spec_name not in specs: match = re.search(pattern, page_text, re.IGNORECASE) if match: if spec_name == 'Display Resolution': specs[spec_name] = f"{match.group(1)}×{match.group(2)}" elif spec_name == 'Launch Date': specs[spec_name] = f"{match.group(1)} {match.group(2)}" else: specs[spec_name] = match.group(0) phone_data['specifications'] = specs logger.info(f"Extracted {len(specs)} specifications for {phone_data.get('name', 'Unknown')}") return phone_data except Exception as e: logger.error(f"Error extracting GSMArena specs from {phone_url}: {e}") return None def scrape_phone_by_name(self, phone_name, get_first_result=True): """Main method to scrape phone specs by name from GSMArena""" search_results = self.search_phone(phone_name) if not search_results: logger.warning(f"No results found for: {phone_name}") return None results = [] targets = [search_results[0]] if get_first_result else search_results for result in targets: logger.info(f"Scraping: {result['title']}") phone_data = self.get_phone_specs(result['url']) if phone_data: results.append(phone_data) time.sleep(2) # Be respectful return results[0] if get_first_result and results else results def scrape_multiple_phones(self, phone_names): """Scrape multiple phones from GSMArena""" all_phones = [] for phone_name in phone_names: try: phone_data = self.scrape_phone_by_name(phone_name) if phone_data: all_phones.append(phone_data) time.sleep(3) # Be respectful between requests except Exception as e: logger.error(f"Error scraping {phone_name}: {e}") continue return all_phones def save_to_json(self, data, filename): """Save data to JSON file""" try: with open(filename, 'w', encoding='utf-8') as f: json.dump(data, f, indent=2, ensure_ascii=False) logger.info(f"Data saved to {filename}") except Exception as e: logger.error(f"Error saving to JSON: {e}") def test_alternative_scraper(): """Test the enhanced GSMArena scraper""" print("\n" + "="*50) print("Testing Enhanced GSMArena Scraper") print("="*50) gsm_scraper = GSMArenaScraperAlternative() # Test single phone phone_name = "iPhone 15 Pro" print(f"Testing single phone: {phone_name}") result = gsm_scraper.scrape_phone_by_name(phone_name) if result: print(f"✅ Successfully scraped: {result['name']}") print(f"📱 Found {len(result['specifications'])} specifications") print(f"🖼️ Found {len(result['images'])} images") # Show some key specs key_specs = ['Display Size', 'RAM', 'Storage', 'Battery Capacity', 'Main Camera'] print("\n📋 Key Specifications:") for spec in key_specs: for key, value in result['specifications'].items(): if spec.lower() in key.lower(): print(f" • {key}: {value}") break # Save result gsm_scraper.save_to_json(result, f"{phone_name.replace(' ', '_')}_gsmarena_specs.json") else: print(f"❌ Failed to scrape {phone_name}") # Test multiple phones print(f"\n" + "-"*40) print("Testing Multiple Phones") print("-"*40) phone_list = ["Samsung Galaxy S24", "Google Pixel 8"] results = gsm_scraper.scrape_multiple_phones(phone_list) if results: print(f"✅ Successfully scraped {len(results)}/{len(phone_list)} phones") gsm_scraper.save_to_json(results, "multiple_phones_gsmarena_specs.json") for phone in results: print(f"📱 {phone['name']}: {len(phone['specifications'])} specs, {len(phone['images'])} images") else: print("❌ No phones were successfully scraped") # Main function with both scrapers def main(): print("🚀 Phone Specifications Scraper") print("="*50) # Try PhoneDB first try: print("Attempting PhoneDB scraper...") scraper = PhoneDBScraper() phone_name = "iPhone 15 Pro" result = scraper.scrape_phone_by_name(phone_name) if result: print(f"✅ PhoneDB: Successfully scraped {result['name']}") scraper.save_to_json(result, f"{phone_name.replace(' ', '_')}_phonedb_specs.json") return else: print("❌ PhoneDB scraper failed, trying GSMArena...") except Exception as e: print(f"❌ PhoneDB initialization failed: {str(e)}") print("🔄 Switching to GSMArena scraper...") # Use GSMArena as fallback test_alternative_scraper() if __name__ == "__main__": # main() # Uncomment the line below to test GSMArena alternative test_alternative_scraper()