"""PhoneArena / app3.py: scrapes phone specifications from PhoneDB.net, with GSMArena as a fallback."""
import requests
from bs4 import BeautifulSoup
import json
import re
import time
from urllib.parse import urljoin, quote
import logging
import urllib3
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
# Disable SSL warnings
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class PhoneDBScraper:
def __init__(self):
self.base_url = "https://phonedb.net"
self.session = requests.Session()
# Configure session with better headers and SSL handling
self.session.headers.update({
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.9',
'Accept-Encoding': 'gzip, deflate, br',
'DNT': '1',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1',
})
# Set up retry strategy
retry_strategy = Retry(
total=3,
status_forcelist=[429, 500, 502, 503, 504],
            allowed_methods=["HEAD", "GET", "OPTIONS"],  # 'allowed_methods' supersedes the deprecated 'method_whitelist'
backoff_factor=1
)
adapter = HTTPAdapter(max_retries=retry_strategy)
self.session.mount("http://", adapter)
self.session.mount("https://", adapter)
# Disable SSL verification (use with caution)
self.session.verify = False
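        # A gentler alternative (path hypothetical, for illustration only):
        # point the session at a trusted CA bundle instead of disabling
        # verification entirely, e.g.
        #   self.session.verify = "/path/to/ca-bundle.pem"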
def search_phone(self, phone_name):
"""Search for a phone by name and return search results"""
# Try different search approaches
search_urls = [
f"{self.base_url}/index.php?m=device&s=query&q={quote(phone_name)}",
f"{self.base_url}/search?q={quote(phone_name)}",
f"{self.base_url}/index.php?m=device&s=list&q={quote(phone_name)}"
]
for search_url in search_urls:
try:
logger.info(f"Trying search URL: {search_url}")
response = self.session.get(search_url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
# Find search results with multiple selectors
results = []
# Look for various possible result containers
selectors = [
'div.device-item',
'div.device',
'div.phone-item',
'tr[onclick*="device"]',
'a[href*="device"]',
'a[href*="phone"]',
'td a[href*="index.php"]'
]
search_results = []
for selector in selectors:
found = soup.select(selector)
if found:
search_results.extend(found)
break
# Also try finding links with device IDs
if not search_results:
search_results = soup.find_all('a', href=re.compile(r'(device|phone|id=\d+)'))
for result in search_results[:10]: # Limit to first 10 results
title = ""
link = ""
if result.name == 'a':
link = result.get('href', '')
title = result.get_text(strip=True) or result.get('title', '')
elif result.name in ['div', 'tr']:
link_elem = result.find('a')
if link_elem:
link = link_elem.get('href', '')
title = link_elem.get_text(strip=True) or result.get_text(strip=True)
else:
# Check for onclick events with device info
onclick = result.get('onclick', '')
if 'device' in onclick:
# Extract device ID from onclick
device_match = re.search(r'id=(\d+)', onclick)
if device_match:
link = f"/index.php?m=device&id={device_match.group(1)}"
title = result.get_text(strip=True)
# Clean up the link and title
if link and title:
# Clean title
title = re.sub(r'\s+', ' ', title).strip()
# Ensure absolute URL
if link.startswith('/'):
link = self.base_url + link
elif not link.startswith('http'):
link = f"{self.base_url}/{link}"
# Filter relevant results
if any(word.lower() in title.lower() for word in phone_name.split()):
results.append({
'title': title,
'url': link
})
if results:
logger.info(f"Found {len(results)} results using URL: {search_url}")
return results
except Exception as e:
logger.warning(f"Search URL failed {search_url}: {e}")
continue
logger.error(f"All search methods failed for: {phone_name}")
return []
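    # Illustrative return shape for search_phone (the device id in the URL is
    # hypothetical, not taken from a live response):
    #   [{'title': 'Apple iPhone 15 Pro',
    #     'url': 'https://phonedb.net/index.php?m=device&id=12345'}]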
def get_phone_specs(self, phone_url):
"""Extract detailed specifications from a phone page"""
try:
logger.info(f"Fetching specs from: {phone_url}")
response = self.session.get(phone_url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
# Extract phone data
phone_data = {
'name': '',
'brand': '',
'images': [],
'specifications': {},
'source_url': phone_url
}
# Get phone name from multiple possible locations
title_candidates = [
soup.find('h1'),
soup.find('h2'),
soup.find('title'),
soup.find('div', class_=re.compile(r'title|name|header')),
soup.find('td', string=re.compile(r'Model|Name', re.I))
]
for candidate in title_candidates:
if candidate:
title = candidate.get_text(strip=True)
if title and len(title) > 3:
phone_data['name'] = title
break
# Extract brand from title or URL
if phone_data['name']:
phone_data['brand'] = phone_data['name'].split()[0]
# Get images with multiple approaches
images = []
# Look for images in various containers
img_selectors = [
'img[src*="phone"]',
'img[src*="device"]',
'img[src*="mobile"]',
'img[alt*="phone"]',
'img[alt*="device"]',
'.device-image img',
'.phone-image img',
'td img',
'div img'
]
for selector in img_selectors:
imgs = soup.select(selector)
for img in imgs:
src = img.get('src', '')
if src:
# Convert relative URLs to absolute
if src.startswith('/'):
img_url = self.base_url + src
elif not src.startswith('http'):
img_url = f"{self.base_url}/{src}"
else:
img_url = src
# Avoid duplicates and filter out tiny images
if img_url not in images and not any(x in src.lower() for x in ['icon', 'logo', 'button', 'spacer']):
images.append(img_url)
phone_data['images'] = images[:5] # Limit to 5 images
# Extract specifications using multiple methods
specs = {}
# Method 1: PhoneDB specific table structure
spec_tables = soup.find_all('table')
for table in spec_tables:
rows = table.find_all('tr')
for row in rows:
cells = row.find_all(['td', 'th'])
if len(cells) >= 2:
key = cells[0].get_text(strip=True)
value = cells[1].get_text(strip=True)
# Clean up key and value
key = re.sub(r'[^\w\s]', '', key).strip()
value = re.sub(r'\s+', ' ', value).strip()
if key and value and len(key) < 100 and len(value) < 500:
specs[key] = value
# Method 2: Look for labeled specifications
labeled_specs = soup.find_all(['dt', 'label', 'b', 'strong'])
for label in labeled_specs:
label_text = label.get_text(strip=True)
if ':' in label_text:
key, value = label_text.split(':', 1)
specs[key.strip()] = value.strip()
else:
# Look for value in next sibling
sibling = label.find_next_sibling()
if sibling:
value = sibling.get_text(strip=True)
if value:
specs[label_text] = value
# Method 3: Extract common phone specifications from text
text_content = soup.get_text()
            # Regex fallbacks for common spec values found in the page's free text
            spec_patterns = {
                'Display Size': r'(\d+\.?\d*)\s*(?:inch|"|″)',
                'Display Resolution': r'(\d+)\s*[x×]\s*(\d+)',
                'RAM': r'(\d+)\s*GB\s*(?:RAM|Memory)',
                'Storage': r'(\d+)\s*GB\s*(?:storage|internal|ROM)',
                'Battery': r'(\d+)\s*mAh',
                'Main Camera': r'(\d+(?:\.\d+)?)\s*MP(?:\s+main|\s+primary|\s+rear)?',
                'Front Camera': r'(\d+(?:\.\d+)?)\s*MP\s*(?:front|selfie|secondary)',
                'Operating System': r'(Android|iOS)\s*[\d\.]*',
                'Processor': r'(Snapdragon|Exynos|A\d+|Kirin|MediaTek|Dimensity)\s*[\w\d\s]*',
                'Network': r'(2G|3G|4G|5G|LTE)',
                'Weight': r'(\d+)\s*(?:g|gram)',
                'Dimensions': r'(\d+\.?\d*)\s*[x×]\s*(\d+\.?\d*)\s*[x×]\s*(\d+\.?\d*)\s*mm'
            }
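            # Single-group patterns yield plain strings from re.findall, e.g.
            #   re.findall(r'(\d+)\s*mAh', 'Li-Ion 4422 mAh battery') -> ['4422']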
for spec_name, pattern in spec_patterns.items():
if spec_name not in specs: # Don't override existing specs
matches = re.findall(pattern, text_content, re.IGNORECASE)
if matches:
if spec_name == 'Display Resolution':
specs[spec_name] = f"{matches[0][0]}x{matches[0][1]}"
elif spec_name == 'Dimensions':
                            specs[spec_name] = f"{matches[0][0]}×{matches[0][1]}×{matches[0][2]} mm"
else:
specs[spec_name] = matches[0] if isinstance(matches[0], str) else str(matches[0])
phone_data['specifications'] = specs
logger.info(f"Extracted {len(specs)} specifications for {phone_data.get('name', 'Unknown')}")
return phone_data
except Exception as e:
logger.error(f"Error extracting specs from {phone_url}: {e}")
return None
def scrape_phone_by_name(self, phone_name, get_first_result=True):
"""Main method to scrape phone specs by name"""
logger.info(f"Searching for: {phone_name}")
# Search for the phone
search_results = self.search_phone(phone_name)
if not search_results:
logger.warning(f"No results found for: {phone_name}")
return None
results = []
# Process results
targets = [search_results[0]] if get_first_result else search_results
for result in targets:
logger.info(f"Scraping: {result['title']}")
phone_data = self.get_phone_specs(result['url'])
if phone_data:
results.append(phone_data)
# Be respectful with requests
time.sleep(1)
return results[0] if get_first_result and results else results
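    # Note: returns a single dict when get_first_result is True and a scrape
    # succeeded; otherwise a (possibly empty) list of dicts.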
def scrape_multiple_phones(self, phone_names):
"""Scrape multiple phones and return structured JSON"""
all_phones = []
for phone_name in phone_names:
try:
phone_data = self.scrape_phone_by_name(phone_name)
if phone_data:
all_phones.append(phone_data)
time.sleep(2) # Be respectful between requests
except Exception as e:
logger.error(f"Error scraping {phone_name}: {e}")
continue
return all_phones
def save_to_json(self, data, filename):
"""Save data to JSON file"""
try:
with open(filename, 'w', encoding='utf-8') as f:
json.dump(data, f, indent=2, ensure_ascii=False)
logger.info(f"Data saved to {filename}")
except Exception as e:
logger.error(f"Error saving to JSON: {e}")
# Example usage: PhoneDB-only demo (named phonedb_demo so that the combined
# main() defined near the end of the file remains the single entry point)
def phonedb_demo():
scraper = PhoneDBScraper()
# Example 1: Scrape a single phone
phone_name = "iPhone 15 Pro"
print(f"Attempting to scrape: {phone_name}")
result = scraper.scrape_phone_by_name(phone_name)
if result:
print(f"βœ… Successfully scraped {result['name']}")
print(f"Found {len(result['specifications'])} specifications")
print(f"Found {len(result['images'])} images")
print(json.dumps(result, indent=2))
scraper.save_to_json(result, f"{phone_name.replace(' ', '_')}_specs.json")
else:
print(f"❌ Failed to scrape {phone_name}")
print("This might be due to:")
print("1. PhoneDB.net blocking automated requests")
print("2. Phone not found in their database")
print("3. Site structure changes")
print("\nAlternative solutions:")
print("- Try with a different phone name")
print("- Use a VPN if blocked by IP")
print("- Consider using alternative sites like GSMArena")
# Example 2: Test with multiple phones
phone_list = [
"Samsung Galaxy S24",
"Google Pixel 8",
"OnePlus 12"
]
print(f"\nTesting multiple phones: {phone_list}")
results = scraper.scrape_multiple_phones(phone_list)
if results:
scraper.save_to_json(results, "multiple_phones_specs.json")
print(f"βœ… Successfully scraped {len(results)}/{len(phone_list)} phones")
for phone in results:
print(f"- {phone['name']}: {len(phone['specifications'])} specs, {len(phone['images'])} images")
else:
print("❌ No phones were successfully scraped")
# Enhanced GSMArena scraper as main alternative
class GSMArenaScraperAlternative:
"""Enhanced GSMArena scraper with full functionality"""
def __init__(self):
self.base_url = "https://www.gsmarena.com"
self.session = requests.Session()
self.session.headers.update({
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5',
'Accept-Encoding': 'gzip, deflate',
'Connection': 'keep-alive',
})
def search_phone(self, phone_name):
"""Search GSMArena for phone"""
search_url = f"{self.base_url}/results.php3"
params = {'sQuickSearch': 'yes', 'sName': phone_name}
try:
logger.info(f"Searching GSMArena for: {phone_name}")
response = self.session.get(search_url, params=params, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
results = []
# Find search results in makers section
makers = soup.find_all('div', class_='makers')
for maker in makers:
links = maker.find_all('a')
for link in links[:5]: # Limit results
href = link.get('href', '')
title = link.get_text(strip=True)
if href and title and phone_name.lower().replace(' ', '') in title.lower().replace(' ', ''):
full_url = self.base_url + '/' + href if not href.startswith('http') else href
results.append({
'title': title,
'url': full_url
})
logger.info(f"Found {len(results)} results on GSMArena")
return results
except Exception as e:
logger.error(f"GSMArena search failed: {e}")
return []
def get_phone_specs(self, phone_url):
"""Extract detailed specifications from GSMArena phone page"""
try:
logger.info(f"Fetching specs from GSMArena: {phone_url}")
response = self.session.get(phone_url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
phone_data = {
'name': '',
'brand': '',
'images': [],
'specifications': {},
'source_url': phone_url
}
# Get phone name
title_elem = soup.find('h1', class_='specs-phone-name-title')
if not title_elem:
title_elem = soup.find('h1') or soup.find('title')
if title_elem:
phone_data['name'] = title_elem.get_text(strip=True)
phone_data['brand'] = phone_data['name'].split()[0] if phone_data['name'] else ''
# Get images
images = []
# Main phone image
main_img_container = soup.find('div', class_='specs-photo-main')
if main_img_container:
img = main_img_container.find('img')
if img and img.get('src'):
img_url = urljoin(phone_url, img['src'])
images.append(img_url)
# Additional images from carousel or gallery
carousel = soup.find('div', class_='carousel-item') or soup.find('div', class_='specs-photos')
if carousel:
for img in carousel.find_all('img'):
src = img.get('src', '')
if src:
img_url = urljoin(phone_url, src)
if img_url not in images:
images.append(img_url)
phone_data['images'] = images[:5]
# Extract specifications from GSMArena's table structure
specs = {}
# GSMArena uses specific table structure
spec_tables = soup.find_all('table', cellspacing='0')
for table in spec_tables:
                # Each GSMArena spec table opens with a <th> holding the category
                # name (Display, Platform, Battery, ...), so read it from the
                # table itself rather than from a preceding element
                category = ''
                category_elem = table.find('th') or table.find_previous('h2')
                if category_elem:
                    category = category_elem.get_text(strip=True)
rows = table.find_all('tr')
for row in rows:
cells = row.find_all(['td', 'th'])
if len(cells) >= 2:
key = cells[0].get_text(strip=True)
value = cells[1].get_text(strip=True)
# Clean up the key and value
key = re.sub(r'[^\w\s]', '', key).strip()
value = re.sub(r'\s+', ' ', value).strip()
if key and value and len(key) < 100:
# Add category prefix if available
final_key = f"{category} - {key}" if category and len(category) < 30 else key
specs[final_key] = value
# Also extract from the detailed specs list structure
detail_lists = soup.find_all(['ul', 'li'], class_=re.compile(r'spec|detail'))
for detail_list in detail_lists:
items = detail_list.find_all('li') if detail_list.name == 'ul' else [detail_list]
for item in items:
text = item.get_text(strip=True)
if ':' in text:
parts = text.split(':', 1)
if len(parts) == 2:
key, value = parts
specs[key.strip()] = value.strip()
# Extract key specs using patterns from page text
page_text = soup.get_text()
key_patterns = {
'Display Size': r'(\d+\.?\d*)\s*(?:inch|")\s*display',
                'Display Resolution': r'(\d+)\s*[x×]\s*(\d+)\s*pixels',
'RAM': r'(\d+)\s*GB\s*RAM',
'Storage': r'(\d+)\s*GB\s*(?:storage|internal)',
'Battery Capacity': r'(\d+)\s*mAh',
'Main Camera': r'(\d+(?:\.\d+)?)\s*MP\s*(?:main|primary|rear)',
'Front Camera': r'(\d+(?:\.\d+)?)\s*MP\s*front',
'Operating System': r'(Android|iOS)\s*([\d\.]+)?',
'Chipset': r'(Snapdragon|Exynos|A\d+|Kirin|MediaTek|Dimensity)\s*([\w\d\s]+)?',
'Weight': r'(\d+)\s*g\s*weight',
'Launch Date': r'(January|February|March|April|May|June|July|August|September|October|November|December)\s*(\d{4})'
}
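            # Multi-group patterns are read back via match.group(n); on a
            # hypothetical '2556 x 1179 pixels' snippet the Display Resolution
            # pattern yields group(1)='2556' and group(2)='1179'.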
for spec_name, pattern in key_patterns.items():
if spec_name not in specs:
match = re.search(pattern, page_text, re.IGNORECASE)
if match:
if spec_name == 'Display Resolution':
                        specs[spec_name] = f"{match.group(1)}×{match.group(2)}"
elif spec_name == 'Launch Date':
specs[spec_name] = f"{match.group(1)} {match.group(2)}"
else:
specs[spec_name] = match.group(0)
phone_data['specifications'] = specs
logger.info(f"Extracted {len(specs)} specifications for {phone_data.get('name', 'Unknown')}")
return phone_data
except Exception as e:
logger.error(f"Error extracting GSMArena specs from {phone_url}: {e}")
return None
def scrape_phone_by_name(self, phone_name, get_first_result=True):
"""Main method to scrape phone specs by name from GSMArena"""
search_results = self.search_phone(phone_name)
if not search_results:
logger.warning(f"No results found for: {phone_name}")
return None
results = []
targets = [search_results[0]] if get_first_result else search_results
for result in targets:
logger.info(f"Scraping: {result['title']}")
phone_data = self.get_phone_specs(result['url'])
if phone_data:
results.append(phone_data)
time.sleep(2) # Be respectful
return results[0] if get_first_result and results else results
def scrape_multiple_phones(self, phone_names):
"""Scrape multiple phones from GSMArena"""
all_phones = []
for phone_name in phone_names:
try:
phone_data = self.scrape_phone_by_name(phone_name)
if phone_data:
all_phones.append(phone_data)
time.sleep(3) # Be respectful between requests
except Exception as e:
logger.error(f"Error scraping {phone_name}: {e}")
continue
return all_phones
def save_to_json(self, data, filename):
"""Save data to JSON file"""
try:
with open(filename, 'w', encoding='utf-8') as f:
json.dump(data, f, indent=2, ensure_ascii=False)
logger.info(f"Data saved to {filename}")
except Exception as e:
logger.error(f"Error saving to JSON: {e}")
def test_alternative_scraper():
"""Test the enhanced GSMArena scraper"""
print("\n" + "="*50)
print("Testing Enhanced GSMArena Scraper")
print("="*50)
gsm_scraper = GSMArenaScraperAlternative()
# Test single phone
phone_name = "iPhone 15 Pro"
print(f"Testing single phone: {phone_name}")
result = gsm_scraper.scrape_phone_by_name(phone_name)
if result:
print(f"βœ… Successfully scraped: {result['name']}")
print(f"πŸ“± Found {len(result['specifications'])} specifications")
print(f"πŸ–ΌοΈ Found {len(result['images'])} images")
# Show some key specs
key_specs = ['Display Size', 'RAM', 'Storage', 'Battery Capacity', 'Main Camera']
print("\nπŸ“‹ Key Specifications:")
for spec in key_specs:
for key, value in result['specifications'].items():
if spec.lower() in key.lower():
print(f" β€’ {key}: {value}")
break
# Save result
gsm_scraper.save_to_json(result, f"{phone_name.replace(' ', '_')}_gsmarena_specs.json")
else:
print(f"❌ Failed to scrape {phone_name}")
# Test multiple phones
print(f"\n" + "-"*40)
print("Testing Multiple Phones")
print("-"*40)
phone_list = ["Samsung Galaxy S24", "Google Pixel 8"]
results = gsm_scraper.scrape_multiple_phones(phone_list)
if results:
print(f"βœ… Successfully scraped {len(results)}/{len(phone_list)} phones")
gsm_scraper.save_to_json(results, "multiple_phones_gsmarena_specs.json")
for phone in results:
print(f"πŸ“± {phone['name']}: {len(phone['specifications'])} specs, {len(phone['images'])} images")
else:
print("❌ No phones were successfully scraped")
# Main function with both scrapers
def main():
print("πŸš€ Phone Specifications Scraper")
print("="*50)
# Try PhoneDB first
try:
print("Attempting PhoneDB scraper...")
scraper = PhoneDBScraper()
phone_name = "iPhone 15 Pro"
result = scraper.scrape_phone_by_name(phone_name)
if result:
print(f"βœ… PhoneDB: Successfully scraped {result['name']}")
scraper.save_to_json(result, f"{phone_name.replace(' ', '_')}_phonedb_specs.json")
return
else:
print("❌ PhoneDB scraper failed, trying GSMArena...")
except Exception as e:
print(f"❌ PhoneDB initialization failed: {str(e)}")
print("πŸ”„ Switching to GSMArena scraper...")
# Use GSMArena as fallback
test_alternative_scraper()
if __name__ == "__main__":
    # main() tries PhoneDB first and falls back to GSMArena; by default this
    # script exercises only the GSMArena scraper:
    # main()
    test_alternative_scraper()