"""PhoneArena / app3.py: scrapes phone specifications from PhoneDB.net, with GSMArena as a fallback."""
import requests
from bs4 import BeautifulSoup
import json
import re
import time
from urllib.parse import urljoin, quote
import logging
import urllib3
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
# Disable SSL warnings
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class PhoneDBScraper:
def __init__(self):
self.base_url = "https://phonedb.net"
self.session = requests.Session()
# Configure session with better headers and SSL handling
self.session.headers.update({
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.9',
'Accept-Encoding': 'gzip, deflate, br',
'DNT': '1',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1',
})
# Set up retry strategy
retry_strategy = Retry(
total=3,
status_forcelist=[429, 500, 502, 503, 504],
            allowed_methods=["HEAD", "GET", "OPTIONS"],  # 'allowed_methods' supersedes the deprecated 'method_whitelist'
backoff_factor=1
)
adapter = HTTPAdapter(max_retries=retry_strategy)
self.session.mount("http://", adapter)
self.session.mount("https://", adapter)
# Disable SSL verification (use with caution)
self.session.verify = False
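        # A gentler alternative (path hypothetical, for illustration only):
        # point the session at a trusted CA bundle instead of disabling
        # verification entirely, e.g.
        #   self.session.verify = "/path/to/ca-bundle.pem"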
def search_phone(self, phone_name):
"""Search for a phone by name and return search results"""
# Try different search approaches
search_urls = [
f"{self.base_url}/index.php?m=device&s=query&q={quote(phone_name)}",
f"{self.base_url}/search?q={quote(phone_name)}",
f"{self.base_url}/index.php?m=device&s=list&q={quote(phone_name)}"
]
for search_url in search_urls:
try:
logger.info(f"Trying search URL: {search_url}")
response = self.session.get(search_url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
# Find search results with multiple selectors
results = []
# Look for various possible result containers
selectors = [
'div.device-item',
'div.device',
'div.phone-item',
'tr[onclick*="device"]',
'a[href*="device"]',
'a[href*="phone"]',
'td a[href*="index.php"]'
]
search_results = []
for selector in selectors:
found = soup.select(selector)
if found:
search_results.extend(found)
break
# Also try finding links with device IDs
if not search_results:
search_results = soup.find_all('a', href=re.compile(r'(device|phone|id=\d+)'))
for result in search_results[:10]: # Limit to first 10 results
title = ""
link = ""
if result.name == 'a':
link = result.get('href', '')
title = result.get_text(strip=True) or result.get('title', '')
elif result.name in ['div', 'tr']:
link_elem = result.find('a')
if link_elem:
link = link_elem.get('href', '')
title = link_elem.get_text(strip=True) or result.get_text(strip=True)
else:
# Check for onclick events with device info
onclick = result.get('onclick', '')
if 'device' in onclick:
# Extract device ID from onclick
device_match = re.search(r'id=(\d+)', onclick)
if device_match:
link = f"/index.php?m=device&id={device_match.group(1)}"
title = result.get_text(strip=True)
# Clean up the link and title
if link and title:
# Clean title
title = re.sub(r'\s+', ' ', title).strip()
# Ensure absolute URL
if link.startswith('/'):
link = self.base_url + link
elif not link.startswith('http'):
link = f"{self.base_url}/{link}"
# Filter relevant results
if any(word.lower() in title.lower() for word in phone_name.split()):
results.append({
'title': title,
'url': link
})
if results:
logger.info(f"Found {len(results)} results using URL: {search_url}")
return results
except Exception as e:
logger.warning(f"Search URL failed {search_url}: {e}")
continue
logger.error(f"All search methods failed for: {phone_name}")
return []
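    # Illustrative return shape for search_phone (the device id in the URL is
    # hypothetical, not taken from a live response):
    #   [{'title': 'Apple iPhone 15 Pro',
    #     'url': 'https://phonedb.net/index.php?m=device&id=12345'}]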
def get_phone_specs(self, phone_url):
"""Extract detailed specifications from a phone page"""
try:
logger.info(f"Fetching specs from: {phone_url}")
response = self.session.get(phone_url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
# Extract phone data
phone_data = {
'name': '',
'brand': '',
'images': [],
'specifications': {},
'source_url': phone_url
}
# Get phone name from multiple possible locations
title_candidates = [
soup.find('h1'),
soup.find('h2'),
soup.find('title'),
soup.find('div', class_=re.compile(r'title|name|header')),
soup.find('td', string=re.compile(r'Model|Name', re.I))
]
for candidate in title_candidates:
if candidate:
title = candidate.get_text(strip=True)
if title and len(title) > 3:
phone_data['name'] = title
break
# Extract brand from title or URL
if phone_data['name']:
phone_data['brand'] = phone_data['name'].split()[0]
# Get images with multiple approaches
images = []
# Look for images in various containers
img_selectors = [
'img[src*="phone"]',
'img[src*="device"]',
'img[src*="mobile"]',
'img[alt*="phone"]',
'img[alt*="device"]',
'.device-image img',
'.phone-image img',
'td img',
'div img'
]
for selector in img_selectors:
imgs = soup.select(selector)
for img in imgs:
src = img.get('src', '')
if src:
# Convert relative URLs to absolute
if src.startswith('/'):
img_url = self.base_url + src
elif not src.startswith('http'):
img_url = f"{self.base_url}/{src}"
else:
img_url = src
# Avoid duplicates and filter out tiny images
if img_url not in images and not any(x in src.lower() for x in ['icon', 'logo', 'button', 'spacer']):
images.append(img_url)
phone_data['images'] = images[:5] # Limit to 5 images
# Extract specifications using multiple methods
specs = {}
# Method 1: PhoneDB specific table structure
spec_tables = soup.find_all('table')
for table in spec_tables:
rows = table.find_all('tr')
for row in rows:
cells = row.find_all(['td', 'th'])
if len(cells) >= 2:
key = cells[0].get_text(strip=True)
value = cells[1].get_text(strip=True)
# Clean up key and value
key = re.sub(r'[^\w\s]', '', key).strip()
value = re.sub(r'\s+', ' ', value).strip()
if key and value and len(key) < 100 and len(value) < 500:
specs[key] = value
# Method 2: Look for labeled specifications
labeled_specs = soup.find_all(['dt', 'label', 'b', 'strong'])
for label in labeled_specs:
label_text = label.get_text(strip=True)
if ':' in label_text:
key, value = label_text.split(':', 1)
specs[key.strip()] = value.strip()
else:
# Look for value in next sibling
sibling = label.find_next_sibling()
if sibling:
value = sibling.get_text(strip=True)
if value:
specs[label_text] = value
# Method 3: Extract common phone specifications from text
text_content = soup.get_text()
            # Regex fallbacks for common spec values found in the page's free text
            spec_patterns = {
                'Display Size': r'(\d+\.?\d*)\s*(?:inch|"|″)',
                'Display Resolution': r'(\d+)\s*[x×]\s*(\d+)',
                'RAM': r'(\d+)\s*GB\s*(?:RAM|Memory)',
                'Storage': r'(\d+)\s*GB\s*(?:storage|internal|ROM)',
                'Battery': r'(\d+)\s*mAh',
                'Main Camera': r'(\d+(?:\.\d+)?)\s*MP(?:\s+main|\s+primary|\s+rear)?',
                'Front Camera': r'(\d+(?:\.\d+)?)\s*MP\s*(?:front|selfie|secondary)',
                'Operating System': r'(Android|iOS)\s*[\d\.]*',
                'Processor': r'(Snapdragon|Exynos|A\d+|Kirin|MediaTek|Dimensity)\s*[\w\d\s]*',
                'Network': r'(2G|3G|4G|5G|LTE)',
                'Weight': r'(\d+)\s*(?:g|gram)',
                'Dimensions': r'(\d+\.?\d*)\s*[x×]\s*(\d+\.?\d*)\s*[x×]\s*(\d+\.?\d*)\s*mm'
            }
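            # Single-group patterns yield plain strings from re.findall, e.g.
            #   re.findall(r'(\d+)\s*mAh', 'Li-Ion 4422 mAh battery') -> ['4422']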
for spec_name, pattern in spec_patterns.items():
if spec_name not in specs: # Don't override existing specs
matches = re.findall(pattern, text_content, re.IGNORECASE)
if matches:
if spec_name == 'Display Resolution':
specs[spec_name] = f"{matches[0][0]}x{matches[0][1]}"
elif spec_name == 'Dimensions':
                            specs[spec_name] = f"{matches[0][0]}×{matches[0][1]}×{matches[0][2]} mm"
else:
specs[spec_name] = matches[0] if isinstance(matches[0], str) else str(matches[0])
phone_data['specifications'] = specs
logger.info(f"Extracted {len(specs)} specifications for {phone_data.get('name', 'Unknown')}")
return phone_data
except Exception as e:
logger.error(f"Error extracting specs from {phone_url}: {e}")
return None
def scrape_phone_by_name(self, phone_name, get_first_result=True):
"""Main method to scrape phone specs by name"""
logger.info(f"Searching for: {phone_name}")
# Search for the phone
search_results = self.search_phone(phone_name)
if not search_results:
logger.warning(f"No results found for: {phone_name}")
return None
results = []
# Process results
targets = [search_results[0]] if get_first_result else search_results
for result in targets:
logger.info(f"Scraping: {result['title']}")
phone_data = self.get_phone_specs(result['url'])
if phone_data:
results.append(phone_data)
# Be respectful with requests
time.sleep(1)
return results[0] if get_first_result and results else results
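    # Note: returns a single dict when get_first_result is True and a scrape
    # succeeded; otherwise a (possibly empty) list of dicts.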
def scrape_multiple_phones(self, phone_names):
"""Scrape multiple phones and return structured JSON"""
all_phones = []
for phone_name in phone_names:
try:
phone_data = self.scrape_phone_by_name(phone_name)
if phone_data:
all_phones.append(phone_data)
time.sleep(2) # Be respectful between requests
except Exception as e:
logger.error(f"Error scraping {phone_name}: {e}")
continue
return all_phones
def save_to_json(self, data, filename):
"""Save data to JSON file"""
try:
with open(filename, 'w', encoding='utf-8') as f:
json.dump(data, f, indent=2, ensure_ascii=False)
logger.info(f"Data saved to {filename}")
except Exception as e:
logger.error(f"Error saving to JSON: {e}")
# Example usage: PhoneDB-only demo (named phonedb_demo so that the combined
# main() defined near the end of the file remains the single entry point)
def phonedb_demo():
scraper = PhoneDBScraper()
# Example 1: Scrape a single phone
phone_name = "iPhone 15 Pro"
print(f"Attempting to scrape: {phone_name}")
result = scraper.scrape_phone_by_name(phone_name)
if result:
print(f"βœ… Successfully scraped {result['name']}")
print(f"Found {len(result['specifications'])} specifications")
print(f"Found {len(result['images'])} images")
print(json.dumps(result, indent=2))
scraper.save_to_json(result, f"{phone_name.replace(' ', '_')}_specs.json")
else:
print(f"❌ Failed to scrape {phone_name}")
print("This might be due to:")
print("1. PhoneDB.net blocking automated requests")
print("2. Phone not found in their database")
print("3. Site structure changes")
print("\nAlternative solutions:")
print("- Try with a different phone name")
print("- Use a VPN if blocked by IP")
print("- Consider using alternative sites like GSMArena")
# Example 2: Test with multiple phones
phone_list = [
"Samsung Galaxy S24",
"Google Pixel 8",
"OnePlus 12"
]
print(f"\nTesting multiple phones: {phone_list}")
results = scraper.scrape_multiple_phones(phone_list)
if results:
scraper.save_to_json(results, "multiple_phones_specs.json")
print(f"βœ… Successfully scraped {len(results)}/{len(phone_list)} phones")
for phone in results:
print(f"- {phone['name']}: {len(phone['specifications'])} specs, {len(phone['images'])} images")
else:
print("❌ No phones were successfully scraped")
# Enhanced GSMArena scraper as main alternative
class GSMArenaScraperAlternative:
"""Enhanced GSMArena scraper with full functionality"""
def __init__(self):
self.base_url = "https://www.gsmarena.com"
self.session = requests.Session()
self.session.headers.update({
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5',
'Accept-Encoding': 'gzip, deflate',
'Connection': 'keep-alive',
})
def search_phone(self, phone_name):
"""Search GSMArena for phone"""
search_url = f"{self.base_url}/results.php3"
params = {'sQuickSearch': 'yes', 'sName': phone_name}
try:
logger.info(f"Searching GSMArena for: {phone_name}")
response = self.session.get(search_url, params=params, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
results = []
# Find search results in makers section
makers = soup.find_all('div', class_='makers')
for maker in makers:
links = maker.find_all('a')
for link in links[:5]: # Limit results
href = link.get('href', '')
title = link.get_text(strip=True)
if href and title and phone_name.lower().replace(' ', '') in title.lower().replace(' ', ''):
full_url = self.base_url + '/' + href if not href.startswith('http') else href
results.append({
'title': title,
'url': full_url
})
logger.info(f"Found {len(results)} results on GSMArena")
return results
except Exception as e:
logger.error(f"GSMArena search failed: {e}")
return []
def get_phone_specs(self, phone_url):
"""Extract detailed specifications from GSMArena phone page"""
try:
logger.info(f"Fetching specs from GSMArena: {phone_url}")
response = self.session.get(phone_url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
phone_data = {
'name': '',
'brand': '',
'images': [],
'specifications': {},
'source_url': phone_url
}
# Get phone name
title_elem = soup.find('h1', class_='specs-phone-name-title')
if not title_elem:
title_elem = soup.find('h1') or soup.find('title')
if title_elem:
phone_data['name'] = title_elem.get_text(strip=True)
phone_data['brand'] = phone_data['name'].split()[0] if phone_data['name'] else ''
# Get images
images = []
# Main phone image
main_img_container = soup.find('div', class_='specs-photo-main')
if main_img_container:
img = main_img_container.find('img')
if img and img.get('src'):
img_url = urljoin(phone_url, img['src'])
images.append(img_url)
# Additional images from carousel or gallery
carousel = soup.find('div', class_='carousel-item') or soup.find('div', class_='specs-photos')
if carousel:
for img in carousel.find_all('img'):
src = img.get('src', '')
if src:
img_url = urljoin(phone_url, src)
if img_url not in images:
images.append(img_url)
phone_data['images'] = images[:5]
# Extract specifications from GSMArena's table structure
specs = {}
# GSMArena uses specific table structure
spec_tables = soup.find_all('table', cellspacing='0')
for table in spec_tables:
                # Each GSMArena spec table opens with a <th> holding the category
                # name (Display, Platform, Battery, ...), so read it from the
                # table itself rather than from a preceding element
                category = ''
                category_elem = table.find('th') or table.find_previous('h2')
                if category_elem:
                    category = category_elem.get_text(strip=True)
rows = table.find_all('tr')
for row in rows:
cells = row.find_all(['td', 'th'])
if len(cells) >= 2:
key = cells[0].get_text(strip=True)
value = cells[1].get_text(strip=True)
# Clean up the key and value
key = re.sub(r'[^\w\s]', '', key).strip()
value = re.sub(r'\s+', ' ', value).strip()
if key and value and len(key) < 100:
# Add category prefix if available
final_key = f"{category} - {key}" if category and len(category) < 30 else key
specs[final_key] = value
# Also extract from the detailed specs list structure
detail_lists = soup.find_all(['ul', 'li'], class_=re.compile(r'spec|detail'))
for detail_list in detail_lists:
items = detail_list.find_all('li') if detail_list.name == 'ul' else [detail_list]
for item in items:
text = item.get_text(strip=True)
if ':' in text:
parts = text.split(':', 1)
if len(parts) == 2:
key, value = parts
specs[key.strip()] = value.strip()
# Extract key specs using patterns from page text
page_text = soup.get_text()
key_patterns = {
'Display Size': r'(\d+\.?\d*)\s*(?:inch|")\s*display',
                'Display Resolution': r'(\d+)\s*[x×]\s*(\d+)\s*pixels',
'RAM': r'(\d+)\s*GB\s*RAM',
'Storage': r'(\d+)\s*GB\s*(?:storage|internal)',
'Battery Capacity': r'(\d+)\s*mAh',
'Main Camera': r'(\d+(?:\.\d+)?)\s*MP\s*(?:main|primary|rear)',
'Front Camera': r'(\d+(?:\.\d+)?)\s*MP\s*front',
'Operating System': r'(Android|iOS)\s*([\d\.]+)?',
'Chipset': r'(Snapdragon|Exynos|A\d+|Kirin|MediaTek|Dimensity)\s*([\w\d\s]+)?',
'Weight': r'(\d+)\s*g\s*weight',
'Launch Date': r'(January|February|March|April|May|June|July|August|September|October|November|December)\s*(\d{4})'
}
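            # Multi-group patterns are read back via match.group(n); on a
            # hypothetical '2556 x 1179 pixels' snippet the Display Resolution
            # pattern yields group(1)='2556' and group(2)='1179'.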
for spec_name, pattern in key_patterns.items():
if spec_name not in specs:
match = re.search(pattern, page_text, re.IGNORECASE)
if match:
if spec_name == 'Display Resolution':
                        specs[spec_name] = f"{match.group(1)}×{match.group(2)}"
elif spec_name == 'Launch Date':
specs[spec_name] = f"{match.group(1)} {match.group(2)}"
else:
specs[spec_name] = match.group(0)
phone_data['specifications'] = specs
logger.info(f"Extracted {len(specs)} specifications for {phone_data.get('name', 'Unknown')}")
return phone_data
except Exception as e:
logger.error(f"Error extracting GSMArena specs from {phone_url}: {e}")
return None
def scrape_phone_by_name(self, phone_name, get_first_result=True):
"""Main method to scrape phone specs by name from GSMArena"""
search_results = self.search_phone(phone_name)
if not search_results:
logger.warning(f"No results found for: {phone_name}")
return None
results = []
targets = [search_results[0]] if get_first_result else search_results
for result in targets:
logger.info(f"Scraping: {result['title']}")
phone_data = self.get_phone_specs(result['url'])
if phone_data:
results.append(phone_data)
time.sleep(2) # Be respectful
return results[0] if get_first_result and results else results
def scrape_multiple_phones(self, phone_names):
"""Scrape multiple phones from GSMArena"""
all_phones = []
for phone_name in phone_names:
try:
phone_data = self.scrape_phone_by_name(phone_name)
if phone_data:
all_phones.append(phone_data)
time.sleep(3) # Be respectful between requests
except Exception as e:
logger.error(f"Error scraping {phone_name}: {e}")
continue
return all_phones
def save_to_json(self, data, filename):
"""Save data to JSON file"""
try:
with open(filename, 'w', encoding='utf-8') as f:
json.dump(data, f, indent=2, ensure_ascii=False)
logger.info(f"Data saved to {filename}")
except Exception as e:
logger.error(f"Error saving to JSON: {e}")
def test_alternative_scraper():
"""Test the enhanced GSMArena scraper"""
print("\n" + "="*50)
print("Testing Enhanced GSMArena Scraper")
print("="*50)
gsm_scraper = GSMArenaScraperAlternative()
# Test single phone
phone_name = "iPhone 15 Pro"
print(f"Testing single phone: {phone_name}")
result = gsm_scraper.scrape_phone_by_name(phone_name)
if result:
print(f"βœ… Successfully scraped: {result['name']}")
print(f"πŸ“± Found {len(result['specifications'])} specifications")
print(f"πŸ–ΌοΈ Found {len(result['images'])} images")
# Show some key specs
key_specs = ['Display Size', 'RAM', 'Storage', 'Battery Capacity', 'Main Camera']
print("\nπŸ“‹ Key Specifications:")
for spec in key_specs:
for key, value in result['specifications'].items():
if spec.lower() in key.lower():
print(f" β€’ {key}: {value}")
break
# Save result
gsm_scraper.save_to_json(result, f"{phone_name.replace(' ', '_')}_gsmarena_specs.json")
else:
print(f"❌ Failed to scrape {phone_name}")
# Test multiple phones
print(f"\n" + "-"*40)
print("Testing Multiple Phones")
print("-"*40)
phone_list = ["Samsung Galaxy S24", "Google Pixel 8"]
results = gsm_scraper.scrape_multiple_phones(phone_list)
if results:
print(f"βœ… Successfully scraped {len(results)}/{len(phone_list)} phones")
gsm_scraper.save_to_json(results, "multiple_phones_gsmarena_specs.json")
for phone in results:
print(f"πŸ“± {phone['name']}: {len(phone['specifications'])} specs, {len(phone['images'])} images")
else:
print("❌ No phones were successfully scraped")
# Main function with both scrapers
def main():
print("πŸš€ Phone Specifications Scraper")
print("="*50)
# Try PhoneDB first
try:
print("Attempting PhoneDB scraper...")
scraper = PhoneDBScraper()
phone_name = "iPhone 15 Pro"
result = scraper.scrape_phone_by_name(phone_name)
if result:
print(f"βœ… PhoneDB: Successfully scraped {result['name']}")
scraper.save_to_json(result, f"{phone_name.replace(' ', '_')}_phonedb_specs.json")
return
else:
print("❌ PhoneDB scraper failed, trying GSMArena...")
except Exception as e:
print(f"❌ PhoneDB initialization failed: {str(e)}")
print("πŸ”„ Switching to GSMArena scraper...")
# Use GSMArena as fallback
test_alternative_scraper()
if __name__ == "__main__":
    # main() tries PhoneDB first and falls back to GSMArena; by default this
    # script exercises only the GSMArena scraper:
    # main()
    test_alternative_scraper()