import json
import re
import time

import requests
import urllib3
import xlwt
import demjson  # the demjson3 fork exposes the same decode() API on modern Python
from lxml import etree

urllib3.disable_warnings()  # silence the InsecureRequestWarning noise from urllib3
class spiders():
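    """Scrape JD.com listing pages for sku ids, product names, prices and comment counts.

    Prices come from the p.3.cn batch endpoint and comment counts from club.jd.com;
    both are keyed by the data-sku ids pulled from the listing HTML.
    """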
    # Initialisation: request headers and per-run result buffers
def __init__(self):
self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'}
        self.data_sku = []           # data-sku ids from the listing, fed to the AJAX endpoints
        self.data_comment = []       # comment-count strings, one per sku
        self.data_money = []         # current prices
        self.data_money_before = []  # original (pre-discount) prices
        self.data_name = []          # product names
        self.data_url = []           # product detail-page URLs
    # Fetch the raw HTML of one page
    def get_html(self, url):  # request the page
try:
res = requests.get(url, headers=self.headers)
            res.encoding = res.apparent_encoding  # sniff the charset to avoid mojibake
if res.status_code == 200:
html = res.text
return html
            else:
                time.sleep(0.1)  # brief pause, then retry (note: recursion is unbounded)
                return self.get_html(url)
        except Exception as e:  # except BaseException also works; e carries the reason
            print("request failed:", e)
    def get_sku(self, html):  # parse data-sku ids, names and URLs out of the listing page
r = etree.HTML(html)
node_list = r.xpath('//ul[@class="gl-warp clearfix"]/li')
        self.data_sku.clear()  # the sku batch is rebuilt for every page
        for node in node_list:
            self.data_sku.append(node.xpath('./div/@data-sku'))
            # join the <em> text fragments so each name is a single string
            self.data_name.append(''.join(node.xpath('./div/div[@class="p-name"]/a/em/text()')))
            # xpath returns a one-element list; unwrap it instead of string-mangling
            href = node.xpath('./div/div[@class="p-img"]/a/@href')
            self.data_url.append(href[0] if href else '')
        self.data_sku = [i[0] for i in self.data_sku]  # flatten [['7621084'], ['6946605'], ...] into ['7621084', '6946605', ...]
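    # The comment endpoint returns plain JSON shaped roughly like this (a sketch
    # inferred from the parsing below; field names other than CommentsCount and
    # CommentCountStr are assumptions):
    #   {"CommentsCount": [{"SkuId": 7621084, "CommentCountStr": "5万+", ...}, ...]}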
    def parse_comment(self, html):  # parse the comment counts
        json_comment = json.loads(html)  # the response body is plain JSON
comment_list = json_comment["CommentsCount"]
for comment in comment_list:
self.data_comment.append(comment['CommentCountStr'])
    def join_url_comment(self):  # build the comment-count URL
url_comment_start = 'https://club.jd.com/comment/productCommentSummaries.action?my=pinglun&referenceIds='
comment_sku = ','.join(self.data_sku)
comment_end = url_comment_start + comment_sku
return comment_end
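    # Example of the resulting URL (hypothetical sku ids):
    #   https://club.jd.com/comment/productCommentSummaries.action?my=pinglun&referenceIds=7621084,6946605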
    def join_url_money_up(self):  # build the price URL for the first half of the batch
url_money_start_up = 'https://p.3.cn/prices/mgets?callback=jQuery873263&ext=11000000&pin=&type=1&area=6_303_304_36864&skuIds=J_'
        money_sku_up = ','.join(self.data_sku[:30])  # first 30 skus; the batch is requested in two halves
money_end_up = url_money_start_up + money_sku_up
return money_end_up
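    # Example result (hypothetical ids): ...&skuIds=J_7621084,6946605,...
    # Note that as written only the first sku carries the 'J_' prefix;
    # some clients prefix every id instead.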
    def join_url_money_down(self):  # build the price URL for the rest of the batch
        global num
        if num == 3:  # the counter hits 3 on the fourth page, telling the main loop to stop
            return 0
        num += 1
url_money_start_down = 'https://p.3.cn/prices/mgets?callback=jQuery873263&ext=11000000&pin=&type=1&area=6_303_304_36864&skuIds=J_'
money_sku_down = ','.join(self.data_sku[30:])
money_end_down = url_money_start_down + money_sku_down
return money_end_down
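    # p.3.cn answers with JSONP, roughly (a sketch matching the fields read below,
    # where 'p' is the current price and 'm' the original one; other fields assumed):
    #   jQuery873263([{"id": "J_7621084", "p": "299.00", "m": "399.00"}, ...]);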
    def parse_money(self, html):  # parse the prices
        # strip the JSONP wrapper: capture everything between 'jQuery873263(' and the closing ')'
        s = re.findall(r'873263\((.*)\)', html)[0]
        json_s = demjson.decode(s)  # demjson tolerates the loose JSON and yields a list of dicts
        for money in json_s:
            self.data_money.append(money['p'])         # current price
            self.data_money_before.append(money['m'])  # original price
class Excel(spiders):  # storage
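    """Persist the scraped columns (name, price, comment count) to an .xls file via xlwt."""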
def __init__(self):
spiders.__init__(self)
        # workbook that will hold the results
        self.f = xlwt.Workbook(encoding='utf-8')
        # a single sheet; cell_overwrite_ok allows cells to be rewritten
        self.sheet1 = self.f.add_sheet(u'sheet1', cell_overwrite_ok=True)
    def write_jd(self):
        # one row per product: column 0 = name, 1 = current price, 2 = comment count
        for j, name in enumerate(self.data_name):
            self.sheet1.write(j, 0, name)
        for m, money in enumerate(self.data_money):
            self.sheet1.write(m, 1, money)
        for n, comment in enumerate(self.data_comment):
            self.sheet1.write(n, 2, comment)
        self.f.save(r'jd_spider606.xls')
if __name__ == '__main__':
    sp = spiders()
    e = Excel()  # allocate the workbook
    num = 0      # global page counter consumed by join_url_money_down
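    # Four listing pages of the smart-speaker category (cat=652,828,841),
    # sorted by total sales (sort_totalsales15_desc).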
li_url = ['https://list.jd.com/list.html?cat=652,828,841&ev=1107_97252&sort=sort_totalsales15_desc&trans=1&JL=3_%E5%88%86%E7%B1%BB_%E6%99%BA%E8%83%BD%E9%9F%B3%E7%AE%B1#J_crumbsBar','https://list.jd.com/list.html?cat=652,828,841&ev=1107_97252&page=2&sort=sort_totalsales15_desc&trans=1&JL=6_0_0#J_main','https://list.jd.com/list.html?cat=652,828,841&ev=1107_97252&page=3&sort=sort_totalsales15_desc&trans=1&JL=6_0_0#J_main','https://list.jd.com/list.html?cat=652,828,841&ev=1107_97252&page=4&sort=sort_totalsales15_desc&trans=1&JL=6_0_0#J_main']
    for url in li_url:
        sp.get_sku(sp.get_html(url))               # collect the sku ids
        url_comment = sp.join_url_comment()        # build the comment-count URL
        comment_html = sp.get_html(url_comment)    # request the comment counts
        sp.parse_comment(comment_html)             # parse them
        url_money_up = sp.join_url_money_up()      # price URL, first half of the batch
        money_html_up = sp.get_html(url_money_up)  # request
        sp.parse_money(money_html_up)              # parse
        url_money_down = sp.join_url_money_down()  # price URL, second half of the batch
        if url_money_down == 0:                    # page limit reached
            break
        money_html_down = sp.get_html(url_money_down)  # request
        sp.parse_money(money_html_down)                # parse
    e.write_jd()  # dump everything collected so far to jd_spider606.xls