电商爬虫练习-手机数据库爬虫

电商爬虫练习-手机数据库爬虫

技术教程 | gslnedu | 2025-07-03 15:07:00 | 3 | A+ A-

今天给群里一位小伙伴写了个手机数据爬虫。数据量很大,单个品牌就有上千条产品数据。数据来源网站:https://phonedb.net/ 。本文仅作学习交流使用,请勿对目标网站造成损害。正在学习电商爬虫的朋友可以参考。

以下是代码:

import requests, re
from lxml import etree
import warnings
from multiprocessing.dummy import Pool

# Silence every warning for the rest of the run. The requests below are made
# with verify=False; NOTE(review): this blanket filter hides all other
# warnings as well, not only the ones caused by unverified HTTPS.
warnings.filterwarnings('ignore')

# Two independent thread pools of three workers each: one is used for the
# listing pages, the other for the product pages.
pool1, pool2 = Pool(processes=3), Pool(processes=3)

# Shared accumulator of product links; crawl1 appends to it, crawl2 consumes it.
List = list()

def crawl1(link):
    """Fetch one listing page and queue every product link found on it.

    Args:
        link: Listing-page path relative to ``https://phonedb.net/``, as
            extracted from a pagination ``href``.

    Side effects:
        Appends each product ``href`` found on the page to the module-level
        ``List`` and prints it. A failed request is reported and skipped
        rather than raised, so one bad page does not abort the surrounding
        ``Pool.map`` run.
    """
    # NOTE(review): ``params`` describes the first listing page; requests
    # appends it to whatever query string ``link`` already carries. Confirm
    # the site tolerates the duplicated keys.
    try:
        # A timeout keeps a stalled connection from blocking a worker forever.
        page_source = requests.get('https://phonedb.net/' + link, params=params, cookies=cookies,
                                   headers=headers, verify=False, timeout=30).text
    except requests.RequestException as exc:
        print('请求失败:' + link + ' ' + str(exc))
        return

    html2 = etree.HTML(page_source)
    prolinks = html2.xpath('//div[@class="content_block_title"]/a/@href')

    for prolink in prolinks:
        List.append(prolink)
        print('已添加链接:' + prolink)


# XPath template for a datasheet row whose value is the link that follows the
# row's anchor element; ``{}`` is the numeric datasheet item id.
_DATASHEET_LINK_XPATH = '//a[@id="datasheet_item_id{}"]/following-sibling::a/text()'


def _first_text(tree, expression, default=''):
    """Return the first match of *expression* in *tree*, or *default* if none."""
    matches = tree.xpath(expression)
    return matches[0] if matches else default


def crawl2(prolink):
    """Fetch one product page and print its extracted fields.

    Args:
        prolink: Product-page path relative to ``https://phonedb.net/``.

    Side effects:
        Prints title, brand, brief, manufacturer, lens, inch, display type,
        battery, country and regions on one line. A field that is absent from
        the page is printed as an empty string instead of raising
        ``IndexError``. A failed request is reported and skipped, so one bad
        page does not abort the surrounding ``Pool.map`` run.
    """
    # NOTE(review): ``params`` holds the listing-page query and is appended to
    # the query string already present in ``prolink`` - confirm this is
    # intended for product pages.
    try:
        # A timeout keeps a stalled connection from blocking a worker forever.
        response3 = requests.get('https://phonedb.net/' + prolink, params=params, cookies=cookies,
                                 headers=headers, verify=False, timeout=30)
    except requests.RequestException as exc:
        print('请求失败:' + prolink + ' ' + str(exc))
        return

    html3 = etree.HTML(response3.text)
    title = _first_text(html3, '//h1/text()')
    brand = _first_text(html3, _DATASHEET_LINK_XPATH.format(1))
    brief = _first_text(html3, '//table/tr[4]/td/text()')
    Manufacturer = _first_text(html3, _DATASHEET_LINK_XPATH.format(3))
    # This row's value is plain text inside the anchor's parent, not a link.
    lens = _first_text(html3, '//a[@id="datasheet_item_id85"]/parent::*/text()')
    inch = _first_text(html3, _DATASHEET_LINK_XPATH.format(86))
    Display_type = _first_text(html3, _DATASHEET_LINK_XPATH.format(106))
    battery = _first_text(html3, _DATASHEET_LINK_XPATH.format(442))
    country = _first_text(html3, _DATASHEET_LINK_XPATH.format(494))
    Regions = _first_text(html3, _DATASHEET_LINK_XPATH.format(496))
    print(title, brand, brief, Manufacturer, lens, inch, Display_type, battery, country, Regions)


# Cookie jar sent with every request. The values are fixed strings captured
# once; nothing in this script refreshes them.
cookies = dict(
    _ga='GA1.2.739326458.1749462647',
    _gid='GA1.2.2008628745.1749462647',
    viewed_device='.23843.',
    __gads='ID=4cb82f90b464eb87:T=1749462801:RT=1749462801:S=ALNI_MZwm84OZ_v0LsJqQz_YuYJP7UaH_A',
    __gpi='UID=0000112736146a3d:T=1749462801:RT=1749462801:S=ALNI_MZXdKeRHBRck7e-RcPtgL3O9--0ng',
    __eoi='ID=d969f88626211e75:T=1749462801:RT=1749462801:S=AA-AfjZ40jrvbgc8ngFRb1sLINIP',
    _gat='1',
    _ga_3LYF7SMBX2='GS2.2.s1749462647$o1$g1$t1749463415$j60$l0$h0',
)

# Request headers sent with every request, listed in the order they are
# inserted. Cookies are not set here; they are passed separately through the
# ``cookies`` mapping.
headers = dict([
    ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7'),
    ('Accept-Language', 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7'),
    ('Cache-Control', 'no-cache'),
    ('Connection', 'keep-alive'),
    ('DNT', '1'),
    ('Pragma', 'no-cache'),
    ('Referer', 'https://phonedb.net/index.php?m=device&s=list&first=huawei&filter=1769'),
    ('Sec-Fetch-Dest', 'document'),
    ('Sec-Fetch-Mode', 'navigate'),
    ('Sec-Fetch-Site', 'same-origin'),
    ('Sec-Fetch-User', '?1'),
    ('Upgrade-Insecure-Requests', '1'),
    ('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36'),
    ('sec-ch-ua', '"Google Chrome";v="137", "Chromium";v="137", "Not/A)Brand";v="24"'),
    ('sec-ch-ua-mobile', '?0'),
    ('sec-ch-ua-platform', '"Windows"'),
])

# Query string of the first listing page that the crawl starts from.
params = dict(m='device', s='list', first='huawei', filter='0')

# Fetch the first listing page selected by ``params``. The timeout keeps a
# stalled connection from hanging the script before any crawling starts.
response = requests.get('https://phonedb.net/index.php', params=params, cookies=cookies, headers=headers,
                        verify=False, timeout=30).text

# Raw string: ``\d`` inside a plain literal is an invalid escape sequence.
links = re.findall(r'title="Jump to page \d+" href="(.*?)">', response)
# Drop repeated pagination links (order preserved) so that no listing page is
# fetched more than once.
links = list(dict.fromkeys(links))

try:
    # Stage 1: every listing page appends its product links to ``List``.
    pool1.map(crawl1, links)
    # Stage 2: visit each distinct product link once.
    pool2.map(crawl2, list(dict.fromkeys(List)))
finally:
    # Release the worker threads whether or not a stage raised.
    for _pool in (pool1, pool2):
        _pool.close()
        _pool.join()
点击这里复制本文地址 以上内容由朽木教程网整理呈现,请务必在转载分享时注明本文地址!如对内容有疑问,请联系我们,谢谢!
qrcode

朽木教程网 © All Rights Reserved.  蜀ICP备2024111239号-8