电商爬虫练习-手机数据库爬虫
今天给群里一位小伙伴写了个手机数据爬虫。数据量很大,单个品牌就有上千条产品数据。数据来源网站:https://phonedb.net/ 。本文仅供学习交流使用,请勿对目标网站造成损害;正在学习电商爬虫的朋友可以参考。
以下是代码:
import requests, re
from lxml import etree
import warnings
from multiprocessing.dummy import Pool
# Silence every warning category for the whole run (presumably to hide the
# notices caused by the verify=False requests made below -- TODO confirm).
warnings.filterwarnings('ignore')

# Shared accumulator: crawl1 appends product-page links here and the driver
# code at the bottom of the file feeds them to crawl2.
List = []

# Two independent pools of three workers each (multiprocessing.dummy pools are
# backed by threads): pool1 runs crawl1, pool2 runs crawl2.
pool1, pool2 = Pool(3), Pool(3)
def crawl1(link, timeout=30):
    """Collect product-page links from one listing page.

    Fetches the listing page that ``link`` points to (relative to
    ``https://phonedb.net/``), takes the ``href`` of every ``<a>`` directly
    under a ``<div class="content_block_title">`` and appends each one to the
    module-level ``List``, printing one progress line per link.

    Args:
        link: Relative URL of a listing page, as captured by the pager regex
            from raw HTML (so it may still contain entities such as
            ``&amp;``).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        None. A page that cannot be fetched or parsed is reported and
        skipped, so one bad page does not abort the whole ``Pool.map`` run.
    """
    # Imported locally so this function does not depend on edits elsewhere.
    from html import unescape
    from urllib.parse import parse_qsl, urljoin, urlsplit

    # The href was cut out of raw markup, so "&amp;" must become "&" before
    # the value is usable as a URL.
    url = urljoin('https://phonedb.net/', unescape(link))

    # Send only the listing defaults that the link does not set itself.
    # requests appends ``params`` after the link's own query string, so
    # passing all of them would repeat ``filter`` with the first page's value
    # after the page-specific one.
    own_keys = {
        key for key, _ in parse_qsl(urlsplit(url).query, keep_blank_values=True)
    }
    extra = {key: value for key, value in params.items() if key not in own_keys}

    try:
        reply = requests.get(url, params=extra, cookies=cookies, headers=headers,
                             verify=False, timeout=timeout)
        reply.raise_for_status()
    except requests.RequestException as exc:
        print('列表页请求失败,已跳过:', url, exc)
        return

    # An empty body gives lxml nothing to build a tree from.
    tree = etree.HTML(reply.text) if reply.text.strip() else None
    if tree is None:
        print('列表页内容为空,已跳过:', url)
        return

    for prolink in tree.xpath('//div[@class="content_block_title"]/a/@href'):
        List.append(prolink)
        print('已添加链接:' + prolink)
def crawl2(prolink, timeout=30):
    """Fetch one product page and print its key specifications.

    Prints, space-separated and in this order: title, brand, brief,
    manufacturer, lens, inch, display type, battery, country and regions.
    A field that is absent from the page is printed as an empty string
    instead of raising ``IndexError``.

    Args:
        prolink: Relative URL of a product page, as collected by ``crawl1``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        None. A page that cannot be fetched or parsed is reported and
        skipped, so one bad page does not abort the whole ``Pool.map`` run.
    """
    # Imported locally so this function does not depend on edits elsewhere.
    from urllib.parse import parse_qsl, urljoin, urlsplit

    url = urljoin('https://phonedb.net/', prolink)

    # Avoid sending a query key twice: keys already present in the link keep
    # the link's value. NOTE(review): the remaining listing parameters are
    # still sent, as before -- confirm the detail pages actually need them.
    own_keys = {
        key for key, _ in parse_qsl(urlsplit(url).query, keep_blank_values=True)
    }
    extra = {key: value for key, value in params.items() if key not in own_keys}

    try:
        reply = requests.get(url, params=extra, cookies=cookies, headers=headers,
                             verify=False, timeout=timeout)
        reply.raise_for_status()
    except requests.RequestException as exc:
        print('产品页请求失败,已跳过:', url, exc)
        return

    # An empty body gives lxml nothing to build a tree from.
    tree = etree.HTML(reply.text) if reply.text.strip() else None
    if tree is None:
        print('产品页内容为空,已跳过:', url)
        return

    def first(expression):
        """Return the first hit of an XPath query, or '' when there is none."""
        hits = tree.xpath(expression)
        return hits[0] if hits else ''

    def linked(item_id):
        """Return the text of the link following the label anchor with this id."""
        return first(
            '//a[@id="datasheet_item_id%d"]/following-sibling::a/text()' % item_id
        )

    title = first('//h1/text()')
    brand = linked(1)
    brief = first('//table/tr[4]/td/text()')
    manufacturer = linked(3)
    # This one value is plain text inside the label's parent element rather
    # than a sibling link.
    lens = first('//a[@id="datasheet_item_id85"]/parent::*/text()')
    inch = linked(86)
    display_type = linked(106)
    battery = linked(442)
    country = linked(494)
    regions = linked(496)

    print(title, brand, brief, manufacturer, lens, inch, display_type, battery,
          country, regions)
# Cookies attached to every request (passed as ``cookies=`` to requests.get).
# NOTE(review): these look like values copied from one browser session
# (several embed timestamps) -- confirm whether the site needs any of them.
cookies = {
    '_ga': 'GA1.2.739326458.1749462647',
    '_gid': 'GA1.2.2008628745.1749462647',
    'viewed_device': '.23843.',
    '__gads': 'ID=4cb82f90b464eb87:T=1749462801:RT=1749462801:S=ALNI_MZwm84OZ_v0LsJqQz_YuYJP7UaH_A',
    '__gpi': 'UID=0000112736146a3d:T=1749462801:RT=1749462801:S=ALNI_MZXdKeRHBRck7e-RcPtgL3O9--0ng',
    '__eoi': 'ID=d969f88626211e75:T=1749462801:RT=1749462801:S=AA-AfjZ40jrvbgc8ngFRb1sLINIP',
    '_gat': '1',
    '_ga_3LYF7SMBX2': 'GS2.2.s1749462647$o1$g1$t1749463415$j60$l0$h0',
}
# HTTP headers attached to every request (passed as ``headers=`` to
# requests.get). They describe a desktop Chrome 137 browser on Windows that
# arrived from a Huawei listing page on the same site.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'DNT': '1',
    'Pragma': 'no-cache',
    'Referer': 'https://phonedb.net/index.php?m=device&s=list&first=huawei&filter=1769',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36',
    'sec-ch-ua': '"Google Chrome";v="137", "Chromium";v="137", "Not/A)Brand";v="24"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    # No 'Cookie' header here on purpose: the same values are sent through
    # the separate ``cookies`` mapping given to requests.get.
}
# Query-string parameters of the listing request sent to index.php; the same
# mapping is also passed by crawl1 and crawl2 on their requests.
# NOTE(review): ``first`` presumably selects the brand and ``filter`` the
# paging position (the Referer in ``headers`` uses filter=1769) -- TODO confirm.
params = {
    'm': 'device',
    's': 'list',
    'first': 'huawei',
    'filter': '0',
}
# Step 1: download the first listing page for the brand selected in ``params``.
listing = requests.get('https://phonedb.net/index.php', params=params, cookies=cookies,
                       headers=headers, verify=False, timeout=30)
listing.raise_for_status()  # fail loudly instead of crawling an error page
response = listing.text

# Step 2: pull the pager targets out of the markup. The pattern is a raw
# string so that ``\d`` reaches the regex engine untouched. The same target
# may occur more than once in the page, so keep each one once, in order.
links = list(dict.fromkeys(
    re.findall(r'title="Jump to page \d+" href="(.*?)">', response)
))

# Step 3: harvest the products of the page that is already downloaded. The
# pager may not link to the page currently shown (TODO confirm); any overlap
# with what crawl1 finds is removed before step 5.
first_page = etree.HTML(response) if response.strip() else None
if first_page is not None:
    List.extend(first_page.xpath('//div[@class="content_block_title"]/a/@href'))

# Step 4: visit every other listing page; crawl1 appends product links to List.
pool1.map(crawl1, links)

# Step 5: visit every product page exactly once.
pool2.map(crawl2, list(dict.fromkeys(List)))

# Both maps have returned, so the worker threads can be released.
pool1.close()
pool2.close()
pool1.join()
pool2.join()