Spaces:

NitinBot001
/

PhoneArena

Sleeping

App Files Files Community

NitinBot001 committed on Jun 26

Commit

a6c126b

verified ·

1 Parent(s): 1f2ebfc

Upload app3.py

Browse files

Files changed (1) hide show

app3.py +691 -0

app3.py ADDED Viewed

	@@ -0,0 +1,691 @@

+import requests
+from bs4 import BeautifulSoup
+import json
+import re
+import time
+from urllib.parse import urljoin, quote
+import logging
+import urllib3
+from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry
+# Silence urllib3's InsecureRequestWarning process-wide (the PhoneDBScraper session skips certificate verification)
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+# Configure root logging at INFO level and create the module-level logger used by both scraper classes
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+class PhoneDBScraper:
+    def __init__(self, verify_ssl=False):
+        """Create a retrying HTTP session configured for phonedb.net.
+        Args:
+            verify_ssl: Whether the session verifies TLS certificates.
+                Defaults to False, which keeps the historical behaviour of
+                this scraper; pass True to enable certificate verification.
+        """
+        self.base_url = "https://phonedb.net"
+        self.session = requests.Session()
+        # Browser-like headers. 'br' is deliberately not advertised in
+        # Accept-Encoding: requests only decodes Brotli when an optional
+        # brotli package is installed, and an undecoded body cannot be parsed.
+        self.session.headers.update({
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
+            'Accept-Language': 'en-US,en;q=0.9',
+            'Accept-Encoding': 'gzip, deflate',
+            'DNT': '1',
+            'Connection': 'keep-alive',
+            'Upgrade-Insecure-Requests': '1',
+        })
+        # Retry idempotent requests up to three times on rate limiting (429)
+        # and server errors, with a growing delay between attempts.
+        retry_strategy = Retry(
+            total=3,
+            status_forcelist=[429, 500, 502, 503, 504],
+            allowed_methods=["HEAD", "GET", "OPTIONS"],
+            backoff_factor=1
+        )
+        adapter = HTTPAdapter(max_retries=retry_strategy)
+        self.session.mount("http://", adapter)
+        self.session.mount("https://", adapter)
+        # With verification off, responses can be tampered with in transit;
+        # callers that need integrity should pass verify_ssl=True.
+        self.session.verify = verify_ssl
+    def search_phone(self, phone_name):
+        """Search phonedb.net for a phone by name.
+        Several candidate search URLs are tried in order; the first one that
+        yields at least one relevant link wins.
+        Args:
+            phone_name: Free-text phone name, e.g. "iPhone 15 Pro".
+        Returns:
+            A list of {'title': str, 'url': str} dicts with absolute,
+            de-duplicated URLs, or an empty list when every search URL fails
+            or nothing relevant is found.
+        """
+        query = quote(phone_name)
+        search_urls = [
+            f"{self.base_url}/index.php?m=device&s=query&q={query}",
+            f"{self.base_url}/search?q={query}",
+            f"{self.base_url}/index.php?m=device&s=list&q={query}"
+        ]
+        # Candidate result containers; the first selector that matches
+        # anything on the page is used.
+        selectors = [
+            'div.device-item',
+            'div.device',
+            'div.phone-item',
+            'tr[onclick*="device"]',
+            'a[href*="device"]',
+            'a[href*="phone"]',
+            'td a[href*="index.php"]'
+        ]
+        # Lower-cased query words, computed once for the relevance filter.
+        wanted_words = [word.lower() for word in phone_name.split()]
+        for search_url in search_urls:
+            # Broad catch on purpose: any failure of one search URL (network,
+            # HTTP status, parsing) should just move on to the next URL.
+            try:
+                logger.info(f"Trying search URL: {search_url}")
+                response = self.session.get(search_url, timeout=30)
+                response.raise_for_status()
+                soup = BeautifulSoup(response.content, 'html.parser')
+                search_results = []
+                for selector in selectors:
+                    search_results = soup.select(selector)
+                    if search_results:
+                        break
+                # Also try finding links with device IDs
+                if not search_results:
+                    search_results = soup.find_all('a', href=re.compile(r'(device|phone|id=\d+)'))
+                results = []
+                seen_urls = set()
+                for result in search_results[:10]:  # Limit to first 10 candidates
+                    link, title = self._extract_link_and_title(result)
+                    if not link or not title:
+                        continue
+                    # Pseudo-links never lead to a device page.
+                    if link.lower().startswith(('#', 'javascript:', 'mailto:')):
+                        continue
+                    title = re.sub(r'\s+', ' ', title).strip()
+                    # Resolve relative, root-relative and protocol-relative
+                    # hrefs against the page they were found on.
+                    url = urljoin(search_url, link)
+                    if url in seen_urls:
+                        continue
+                    # Keep only hits whose title mentions a query word.
+                    lowered_title = title.lower()
+                    if any(word in lowered_title for word in wanted_words):
+                        seen_urls.add(url)
+                        results.append({
+                            'title': title,
+                            'url': url
+                        })
+                if results:
+                    logger.info(f"Found {len(results)} results using URL: {search_url}")
+                    return results
+            except Exception as e:
+                logger.warning(f"Search URL failed {search_url}: {e}")
+                continue
+        logger.error(f"All search methods failed for: {phone_name}")
+        return []
+    @staticmethod
+    def _extract_link_and_title(result):
+        """Return (href, title) for one search hit; empty strings when absent."""
+        link = ""
+        title = ""
+        if result.name == 'a':
+            link = result.get('href', '')
+            title = result.get_text(strip=True) or result.get('title', '')
+        elif result.name in ['div', 'tr']:
+            link_elem = result.find('a')
+            if link_elem:
+                link = link_elem.get('href', '')
+                title = link_elem.get_text(strip=True) or result.get_text(strip=True)
+            else:
+                # No anchor inside: fall back to a device id in the onclick handler.
+                onclick = result.get('onclick', '')
+                if 'device' in onclick:
+                    device_match = re.search(r'id=(\d+)', onclick)
+                    if device_match:
+                        link = f"/index.php?m=device&id={device_match.group(1)}"
+                        title = result.get_text(strip=True)
+        return link, title
+    def get_phone_specs(self, phone_url):
+        """Download a phonedb.net device page and extract its details.
+        Args:
+            phone_url: Absolute URL of the device page.
+        Returns:
+            A dict with the keys 'name', 'brand', 'images' (at most five
+            absolute http(s) URLs), 'specifications' (str -> str) and
+            'source_url', or None when fetching or parsing fails.
+        """
+        try:
+            logger.info(f"Fetching specs from: {phone_url}")
+            response = self.session.get(phone_url, timeout=30)
+            response.raise_for_status()
+            soup = BeautifulSoup(response.content, 'html.parser')
+            # Extract phone data
+            phone_data = {
+                'name': '',
+                'brand': '',
+                'images': [],
+                'specifications': {},
+                'source_url': phone_url
+            }
+            # The first candidate element with more than three characters of
+            # text becomes the phone name.
+            title_candidates = [
+                soup.find('h1'),
+                soup.find('h2'),
+                soup.find('title'),
+                soup.find('div', class_=re.compile(r'title|name|header')),
+                soup.find('td', string=re.compile(r'Model|Name', re.I))
+            ]
+            for candidate in title_candidates:
+                if candidate:
+                    title = candidate.get_text(strip=True)
+                    if title and len(title) > 3:
+                        phone_data['name'] = title
+                        break
+            # The brand is taken to be the first word of the name.
+            if phone_data['name']:
+                phone_data['brand'] = phone_data['name'].split()[0]
+            # Collect images, trying the most specific selectors first.
+            images = []
+            img_selectors = [
+                'img[src*="phone"]',
+                'img[src*="device"]',
+                'img[src*="mobile"]',
+                'img[alt*="phone"]',
+                'img[alt*="device"]',
+                '.device-image img',
+                '.phone-image img',
+                'td img',
+                'div img'
+            ]
+            for selector in img_selectors:
+                for img in soup.select(selector):
+                    src = img.get('src', '')
+                    if not src:
+                        continue
+                    # Resolve against the page URL so relative, root-relative
+                    # and protocol-relative sources all become absolute.
+                    img_url = urljoin(phone_url, src)
+                    # Inline data: URIs and other non-web schemes are not
+                    # downloadable image links.
+                    if not img_url.startswith(('http://', 'https://')):
+                        continue
+                    # Avoid duplicates and skip decorative assets by file name.
+                    if img_url not in images and not any(x in src.lower() for x in ['icon', 'logo', 'button', 'spacer']):
+                        images.append(img_url)
+            phone_data['images'] = images[:5]  # Limit to 5 images
+            # Extract specifications using multiple methods
+            specs = {}
+            # Method 1: two-column table rows (label cell, value cell)
+            for table in soup.find_all('table'):
+                for row in table.find_all('tr'):
+                    cells = row.find_all(['td', 'th'])
+                    if len(cells) >= 2:
+                        key = cells[0].get_text(strip=True)
+                        value = cells[1].get_text(strip=True)
+                        # Clean up key and value
+                        key = re.sub(r'[^\w\s]', '', key).strip()
+                        value = re.sub(r'\s+', ' ', value).strip()
+                        if key and value and len(key) < 100 and len(value) < 500:
+                            specs[key] = value
+            # Method 2: labelled elements, either "Key: value" in one element
+            # or a label followed by a sibling holding the value.
+            for label in soup.find_all(['dt', 'label', 'b', 'strong']):
+                label_text = label.get_text(strip=True)
+                if ':' in label_text:
+                    key, value = (part.strip() for part in label_text.split(':', 1))
+                else:
+                    sibling = label.find_next_sibling()
+                    key = label_text
+                    value = sibling.get_text(strip=True) if sibling else ''
+                # Blank keys/values carry no information (e.g. a bare
+                # "Brand:" label), and this looser heuristic must not
+                # overwrite values already read from the tables.
+                if key and value and key not in specs and len(key) < 100 and len(value) < 500:
+                    specs[key] = value
+            # Method 3: extract common phone specifications from the page text
+            text_content = soup.get_text()
+            spec_patterns = {
+                'Display Size': r'(\d+\.?\d*)\s*(?:inch|"|″)',
+                'Display Resolution': r'(\d+)\s*[x×]\s*(\d+)',
+                'RAM': r'(\d+)\s*GB\s*(?:RAM|Memory)',
+                'Storage': r'(\d+)\s*GB\s*(?:storage|internal|ROM)',
+                'Battery': r'(\d+)\s*mAh',
+                'Main Camera': r'(\d+(?:\.\d+)?)\s*MP(?:\s+main|\s+primary|\s+rear)?',
+                'Front Camera': r'(\d+(?:\.\d+)?)\s*MP\s*(?:front|selfie|secondary)',
+                'Operating System': r'(Android|iOS)\s*[\d\.]*',
+                'Processor': r'(Snapdragon|Exynos|A\d+|Kirin|MediaTek|Dimensity)\s*[\w\d\s]*',
+                'Network': r'(2G|3G|4G|5G|LTE)',
+                # The bare unit "g" is matched case-sensitively and as a whole
+                # word so that "5G" and "64 GB" are not read as a weight.
+                'Weight': r'\b(\d+(?:\.\d+)?)\s*(?:grams?\b|(?-i:g)\b)',
+                'Dimensions': r'(\d+\.?\d*)\s*[x×]\s*(\d+\.?\d*)\s*[x×]\s*(\d+\.?\d*)\s*mm'
+            }
+            for spec_name, pattern in spec_patterns.items():
+                if spec_name not in specs:  # Don't override existing specs
+                    matches = re.findall(pattern, text_content, re.IGNORECASE)
+                    if matches:
+                        if spec_name == 'Display Resolution':
+                            specs[spec_name] = f"{matches[0][0]}x{matches[0][1]}"
+                        elif spec_name == 'Dimensions':
+                            specs[spec_name] = f"{matches[0][0]}×{matches[0][1]}×{matches[0][2]} mm"
+                        else:
+                            specs[spec_name] = matches[0] if isinstance(matches[0], str) else str(matches[0])
+            phone_data['specifications'] = specs
+            logger.info(f"Extracted {len(specs)} specifications for {phone_data.get('name', 'Unknown')}")
+            return phone_data
+        except Exception as e:
+            # Best effort: any fetch or parse failure is logged and reported
+            # to the caller as None.
+            logger.error(f"Error extracting specs from {phone_url}: {e}")
+            return None
+    def scrape_phone_by_name(self, phone_name, get_first_result=True):
+        """Search for a phone by name and scrape the matching page(s).
+        Args:
+            phone_name: Free-text phone name to search for.
+            get_first_result: When True only the top search hit is scraped.
+        Returns:
+            With get_first_result=True: the spec dict of the top hit, or None
+            when the search found nothing or the page could not be scraped.
+            With get_first_result=False: a list of spec dicts (possibly
+            empty), or None when the search found nothing.
+        """
+        logger.info(f"Searching for: {phone_name}")
+        # Search for the phone
+        search_results = self.search_phone(phone_name)
+        if not search_results:
+            logger.warning(f"No results found for: {phone_name}")
+            return None
+        results = []
+        # Process results
+        targets = search_results[:1] if get_first_result else search_results
+        for result in targets:
+            logger.info(f"Scraping: {result['title']}")
+            phone_data = self.get_phone_specs(result['url'])
+            if phone_data:
+                results.append(phone_data)
+            # Be respectful with requests
+            time.sleep(1)
+        if get_first_result:
+            # A failed page scrape yields None rather than an empty list, so
+            # first-result mode only ever returns a dict or None.
+            return results[0] if results else None
+        return results
+    def scrape_multiple_phones(self, phone_names):
+        """Scrape each named phone in turn and collect the successful results.
+        Phones that raise or yield nothing are skipped; a 2 second pause
+        follows every scrape that did not raise.
+        """
+        collected = []
+        for name in phone_names:
+            try:
+                scraped = self.scrape_phone_by_name(name)
+            except Exception as exc:
+                logger.error(f"Error scraping {name}: {exc}")
+                continue
+            if scraped:
+                collected.append(scraped)
+            # Pause between phones to keep the request rate low.
+            time.sleep(2)
+        return collected
+    def save_to_json(self, data, filename):
+        """Write data to filename as indented UTF-8 JSON.
+        Failures are logged and swallowed; nothing is returned.
+        """
+        try:
+            with open(filename, mode='w', encoding='utf-8') as out_file:
+                json.dump(data, out_file, ensure_ascii=False, indent=2)
+        except Exception as err:
+            logger.error(f"Error saving to JSON: {err}")
+        else:
+            logger.info(f"Data saved to {filename}")
+# Example usage with error handling and alternative sites
+def run_phonedb_examples():
+    """Run the PhoneDB-only examples: one phone, then a small batch.
+    This function used to be a first definition of main() that a later
+    definition of main() in this module silently replaced, so it could never
+    be called. It has its own name now; main() still refers to the later
+    definition.
+    """
+    scraper = PhoneDBScraper()
+    # Example 1: Scrape a single phone
+    phone_name = "iPhone 15 Pro"
+    print(f"Attempting to scrape: {phone_name}")
+    result = scraper.scrape_phone_by_name(phone_name)
+    if result:
+        print(f"✅ Successfully scraped {result['name']}")
+        print(f"Found {len(result['specifications'])} specifications")
+        print(f"Found {len(result['images'])} images")
+        print(json.dumps(result, indent=2))
+        scraper.save_to_json(result, f"{phone_name.replace(' ', '_')}_specs.json")
+    else:
+        print(f"❌ Failed to scrape {phone_name}")
+        print("This might be due to:")
+        print("1. PhoneDB.net blocking automated requests")
+        print("2. Phone not found in their database")
+        print("3. Site structure changes")
+        print("\nAlternative solutions:")
+        print("- Try with a different phone name")
+        print("- Use a VPN if blocked by IP")
+        print("- Consider using alternative sites like GSMArena")
+    # Example 2: Test with multiple phones
+    phone_list = [
+        "Samsung Galaxy S24",
+        "Google Pixel 8",
+        "OnePlus 12"
+    ]
+    print(f"\nTesting multiple phones: {phone_list}")
+    results = scraper.scrape_multiple_phones(phone_list)
+    if results:
+        scraper.save_to_json(results, "multiple_phones_specs.json")
+        print(f"✅ Successfully scraped {len(results)}/{len(phone_list)} phones")
+        for phone in results:
+            print(f"- {phone['name']}: {len(phone['specifications'])} specs, {len(phone['images'])} images")
+    else:
+        print("❌ No phones were successfully scraped")
+# Enhanced GSMArena scraper as main alternative
+class GSMArenaScraperAlternative:
+    """Enhanced GSMArena scraper with full functionality"""
+    def __init__(self):
+        """Set the GSMArena base URL and open a session with browser-like headers."""
+        browser_headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+            'Accept-Language': 'en-US,en;q=0.5',
+            'Accept-Encoding': 'gzip, deflate',
+            'Connection': 'keep-alive',
+        }
+        session = requests.Session()
+        session.headers.update(browser_headers)
+        self.session = session
+        self.base_url = "https://www.gsmarena.com"
+    def search_phone(self, phone_name):
+        """Search GSMArena's quick search for a phone.
+        Args:
+            phone_name: Free-text phone name, e.g. "Google Pixel 8".
+        Returns:
+            A list of {'title': str, 'url': str} dicts with absolute,
+            de-duplicated URLs (at most five per 'makers' container), or an
+            empty list when the request fails or nothing matches.
+        """
+        search_url = f"{self.base_url}/results.php3"
+        params = {'sQuickSearch': 'yes', 'sName': phone_name}
+        # Titles are matched case-insensitively with spaces removed.
+        wanted = phone_name.lower().replace(' ', '')
+        try:
+            logger.info(f"Searching GSMArena for: {phone_name}")
+            response = self.session.get(search_url, params=params, timeout=30)
+            response.raise_for_status()
+            soup = BeautifulSoup(response.content, 'html.parser')
+            results = []
+            seen_urls = set()
+            # Find search results in makers section
+            for maker in soup.find_all('div', class_='makers'):
+                matched = 0
+                for link in maker.find_all('a'):
+                    href = link.get('href', '')
+                    title = link.get_text(strip=True)
+                    if not href or not title:
+                        continue
+                    if wanted not in title.lower().replace(' ', ''):
+                        continue
+                    # urljoin copes with relative, root-relative and
+                    # already-absolute hrefs without doubling slashes.
+                    full_url = urljoin(self.base_url + '/', href)
+                    if full_url in seen_urls:
+                        continue
+                    seen_urls.add(full_url)
+                    results.append({
+                        'title': title,
+                        'url': full_url
+                    })
+                    # The cap counts matching links, so a relevant hit is not
+                    # lost just because unrelated links come before it.
+                    matched += 1
+                    if matched == 5:
+                        break
+            logger.info(f"Found {len(results)} results on GSMArena")
+            return results
+        except Exception as e:
+            # Best effort: a failed search is logged and reported as no results.
+            logger.error(f"GSMArena search failed: {e}")
+            return []
+    def get_phone_specs(self, phone_url):
+        """Download a GSMArena phone page and extract its details.
+        Args:
+            phone_url: Absolute URL of the phone page.
+        Returns:
+            A dict with the keys 'name', 'brand', 'images' (at most five
+            absolute URLs), 'specifications' (str -> str) and 'source_url',
+            or None when fetching or parsing fails.
+        """
+        try:
+            logger.info(f"Fetching specs from GSMArena: {phone_url}")
+            response = self.session.get(phone_url, timeout=30)
+            response.raise_for_status()
+            soup = BeautifulSoup(response.content, 'html.parser')
+            phone_data = {
+                'name': '',
+                'brand': '',
+                'images': [],
+                'specifications': {},
+                'source_url': phone_url
+            }
+            # Get phone name; the brand is taken to be its first word.
+            title_elem = soup.find('h1', class_='specs-phone-name-title')
+            if not title_elem:
+                title_elem = soup.find('h1') or soup.find('title')
+            if title_elem:
+                phone_data['name'] = title_elem.get_text(strip=True)
+                phone_data['brand'] = phone_data['name'].split()[0] if phone_data['name'] else ''
+            # Get images
+            images = []
+            # Main phone image
+            main_img_container = soup.find('div', class_='specs-photo-main')
+            if main_img_container:
+                img = main_img_container.find('img')
+                if img and img.get('src'):
+                    images.append(urljoin(phone_url, img['src']))
+            # Additional images from carousel or gallery
+            carousel = soup.find('div', class_='carousel-item') or soup.find('div', class_='specs-photos')
+            if carousel:
+                for img in carousel.find_all('img'):
+                    src = img.get('src', '')
+                    if src:
+                        img_url = urljoin(phone_url, src)
+                        if img_url not in images:
+                            images.append(img_url)
+            phone_data['images'] = images[:5]
+            # Extract specifications from the spec tables
+            specs = {}
+            for table in soup.find_all('table', cellspacing='0'):
+                # Until a header cell is seen inside the table, fall back to
+                # the nearest preceding <h2> as the category.
+                heading = table.find_previous('h2')
+                category = heading.get_text(strip=True) if heading else ''
+                last_key = None
+                for row in table.find_all('tr'):
+                    cells = row.find_all(['td', 'th'])
+                    # A leading <th> in front of a label/value pair is the
+                    # section header, not a spec label: use it as the
+                    # category and drop it from the cells.
+                    if len(cells) >= 3 and cells[0].name == 'th':
+                        category = cells[0].get_text(strip=True)
+                        cells = cells[1:]
+                    if len(cells) < 2:
+                        continue
+                    # Clean up the key and value
+                    key = re.sub(r'[^\w\s]', '', cells[0].get_text(strip=True)).strip()
+                    value = re.sub(r'\s+', ' ', cells[1].get_text(strip=True)).strip()
+                    if not value:
+                        continue
+                    if not key:
+                        # A blank label is treated as a continuation of the
+                        # previous row, so its value is appended, not dropped.
+                        if last_key is not None:
+                            specs[last_key] = f"{specs[last_key]}; {value}"
+                        continue
+                    if len(key) >= 100:
+                        continue
+                    # Add category prefix if available
+                    final_key = f"{category} - {key}" if category and len(category) < 30 else key
+                    specs[final_key] = value
+                    last_key = final_key
+            # Also extract "Key: value" items from spec/detail lists
+            detail_lists = soup.find_all(['ul', 'li'], class_=re.compile(r'spec|detail'))
+            for detail_list in detail_lists:
+                items = detail_list.find_all('li') if detail_list.name == 'ul' else [detail_list]
+                for item in items:
+                    text = item.get_text(strip=True)
+                    if ':' not in text:
+                        continue
+                    key, value = (part.strip() for part in text.split(':', 1))
+                    # Skip blank keys/values and keep what the tables found.
+                    if key and value and key not in specs:
+                        specs[key] = value
+            # Extract key specs using patterns from page text
+            page_text = soup.get_text()
+            key_patterns = {
+                'Display Size': r'(\d+\.?\d*)\s*(?:inch|")\s*display',
+                'Display Resolution': r'(\d+)\s*[x×]\s*(\d+)\s*pixels',
+                'RAM': r'(\d+)\s*GB\s*RAM',
+                'Storage': r'(\d+)\s*GB\s*(?:storage|internal)',
+                'Battery Capacity': r'(\d+)\s*mAh',
+                'Main Camera': r'(\d+(?:\.\d+)?)\s*MP\s*(?:main|primary|rear)',
+                'Front Camera': r'(\d+(?:\.\d+)?)\s*MP\s*front',
+                'Operating System': r'(Android|iOS)\s*([\d\.]+)?',
+                'Chipset': r'(Snapdragon|Exynos|A\d+|Kirin|MediaTek|Dimensity)\s*([\w\d\s]+)?',
+                'Weight': r'(\d+)\s*g\s*weight',
+                'Launch Date': r'(January|February|March|April|May|June|July|August|September|October|November|December)\s*(\d{4})'
+            }
+            for spec_name, pattern in key_patterns.items():
+                if spec_name not in specs:
+                    match = re.search(pattern, page_text, re.IGNORECASE)
+                    if match:
+                        if spec_name == 'Display Resolution':
+                            specs[spec_name] = f"{match.group(1)}×{match.group(2)}"
+                        elif spec_name == 'Launch Date':
+                            specs[spec_name] = f"{match.group(1)} {match.group(2)}"
+                        else:
+                            # Several patterns end in optional whitespace;
+                            # strip it from the stored value.
+                            specs[spec_name] = match.group(0).strip()
+            phone_data['specifications'] = specs
+            logger.info(f"Extracted {len(specs)} specifications for {phone_data.get('name', 'Unknown')}")
+            return phone_data
+        except Exception as e:
+            # Best effort: any fetch or parse failure is logged and reported
+            # to the caller as None.
+            logger.error(f"Error extracting GSMArena specs from {phone_url}: {e}")
+            return None
+    def scrape_phone_by_name(self, phone_name, get_first_result=True):
+        """Search GSMArena for a phone by name and scrape the matching page(s).
+        Args:
+            phone_name: Free-text phone name to search for.
+            get_first_result: When True only the top search hit is scraped.
+        Returns:
+            With get_first_result=True: the spec dict of the top hit, or None
+            when the search found nothing or the page could not be scraped.
+            With get_first_result=False: a list of spec dicts (possibly
+            empty), or None when the search found nothing.
+        """
+        search_results = self.search_phone(phone_name)
+        if not search_results:
+            logger.warning(f"No results found for: {phone_name}")
+            return None
+        results = []
+        targets = search_results[:1] if get_first_result else search_results
+        for result in targets:
+            logger.info(f"Scraping: {result['title']}")
+            phone_data = self.get_phone_specs(result['url'])
+            if phone_data:
+                results.append(phone_data)
+            time.sleep(2)  # Be respectful
+        if get_first_result:
+            # A failed page scrape yields None rather than an empty list, so
+            # first-result mode only ever returns a dict or None.
+            return results[0] if results else None
+        return results
+    def scrape_multiple_phones(self, phone_names):
+        """Scrape each named phone from GSMArena and collect the successes.
+        Phones that raise or yield nothing are skipped; a 3 second pause
+        follows every scrape that did not raise.
+        """
+        collected = []
+        for name in phone_names:
+            try:
+                scraped = self.scrape_phone_by_name(name)
+            except Exception as exc:
+                logger.error(f"Error scraping {name}: {exc}")
+                continue
+            if scraped:
+                collected.append(scraped)
+            # Pause between phones to keep the request rate low.
+            time.sleep(3)
+        return collected
+    def save_to_json(self, data, filename):
+        """Write data to filename as indented UTF-8 JSON.
+        Failures are logged and swallowed; nothing is returned.
+        """
+        try:
+            with open(filename, mode='w', encoding='utf-8') as out_file:
+                json.dump(data, out_file, ensure_ascii=False, indent=2)
+        except Exception as err:
+            logger.error(f"Error saving to JSON: {err}")
+        else:
+            logger.info(f"Data saved to {filename}")
+def test_alternative_scraper():
+    """Demo run of the GSMArena scraper: one phone, then a two-phone batch.
+    Prints progress to stdout and saves each successful result as JSON.
+    """
+    banner = "=" * 50
+    print("\n" + banner)
+    print("Testing Enhanced GSMArena Scraper")
+    print(banner)
+    gsm_scraper = GSMArenaScraperAlternative()
+    # Single-phone run
+    phone_name = "iPhone 15 Pro"
+    print(f"Testing single phone: {phone_name}")
+    single = gsm_scraper.scrape_phone_by_name(phone_name)
+    if not single:
+        print(f"❌ Failed to scrape {phone_name}")
+    else:
+        print(f"✅ Successfully scraped: {single['name']}")
+        print(f"📱 Found {len(single['specifications'])} specifications")
+        print(f"🖼️ Found {len(single['images'])} images")
+        # For each headline spec, show the first stored key that mentions it.
+        print("\n📋 Key Specifications:")
+        for wanted in ['Display Size', 'RAM', 'Storage', 'Battery Capacity', 'Main Camera']:
+            needle = wanted.lower()
+            hit = next(
+                ((label, text) for label, text in single['specifications'].items() if needle in label.lower()),
+                None,
+            )
+            if hit is not None:
+                print(f"  • {hit[0]}: {hit[1]}")
+        # Save result
+        gsm_scraper.save_to_json(single, f"{phone_name.replace(' ', '_')}_gsmarena_specs.json")
+    # Batch run
+    rule = "-" * 40
+    print("\n" + rule)
+    print("Testing Multiple Phones")
+    print(rule)
+    phone_list = ["Samsung Galaxy S24", "Google Pixel 8"]
+    batch = gsm_scraper.scrape_multiple_phones(phone_list)
+    if not batch:
+        print("❌ No phones were successfully scraped")
+        return
+    print(f"✅ Successfully scraped {len(batch)}/{len(phone_list)} phones")
+    gsm_scraper.save_to_json(batch, "multiple_phones_gsmarena_specs.json")
+    for phone in batch:
+        print(f"📱 {phone['name']}: {len(phone['specifications'])} specs, {len(phone['images'])} images")
+# Main function with both scrapers
+def main():
+    """Try the PhoneDB scraper for one phone; fall back to the GSMArena demo.
+    Returns right after saving when PhoneDB succeeds; otherwise (no result or
+    any exception) test_alternative_scraper() is run.
+    """
+    print("🚀 Phone Specifications Scraper")
+    print("=" * 50)
+    phone_name = "iPhone 15 Pro"
+    # Try PhoneDB first
+    try:
+        print("Attempting PhoneDB scraper...")
+        phonedb = PhoneDBScraper()
+        scraped = phonedb.scrape_phone_by_name(phone_name)
+        if not scraped:
+            print("❌ PhoneDB scraper failed, trying GSMArena...")
+        else:
+            print(f"✅ PhoneDB: Successfully scraped {scraped['name']}")
+            phonedb.save_to_json(scraped, f"{phone_name.replace(' ', '_')}_phonedb_specs.json")
+            return
+    except Exception as error:
+        print(f"❌ PhoneDB initialization failed: {error!s}")
+        print("🔄 Switching to GSMArena scraper...")
+    # Use GSMArena as fallback
+    test_alternative_scraper()
+if __name__ == "__main__":
+    # Runs the GSMArena demo directly. Call main() here instead to try
+    # PhoneDB first and fall back to GSMArena.
+    test_alternative_scraper()