zoukankan      html  css  js  c++  java
  • 爬取京东——智能音箱

    import requests
    import lxml
    import re
    import json
    from lxml import etree
    import urllib3
    urllib3.disable_warnings()
    import time
    import xlwt
    import demjson

    class spiders():
    #初始化
    def __init__(self):
    self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'}
    self.data_sku = [] #用来存储ajax需要的data_pid
    self.data_comment = []
    self.data_money = [] #现在的价格
    self.data_money_before = []
    self.data_name = []
    self.data_url = []

    # 得到每一页的网页源码
    def get_html(self,url): #请求页面
    try:
    res = requests.get(url, headers=self.headers)
    res.encoding = res.apparent_encoding
    if res.status_code == 200:
    html = res.text
    return html
    else:
    time.sleep(0.1)
    return self.get_html(url)
    except Exception as e: # except BaseException 这个也可以 e是打印出错误的原因
    print("问题是",e)
    pass



    def get_sku(self,html): #在原始页面解析 data_sku 和品牌机型
    r = etree.HTML(html)
    node_list = r.xpath('//ul[@class="gl-warp clearfix"]/li')
    self.data_sku.clear()
    for node in node_list:
    self.data_sku.append(node.xpath('./div/@data-sku'))
    self.data_url.append(node.xpath('./div/div[@class="p-img"]/a/@href'))
    self.data_name.append(node.xpath('./div/div[@class="p-name"]/a/em/text()'))
    s = str(node.xpath('./div/div[@class="p-img"]/a/@href'))
    s1 = s.replace("['", "")
    s2 = s1.replace("']", "")
    self.data_url.append(s2)

    self.data_sku = [i[0] for i in self.data_sku] #把这样的[[7621084],[6946605],[7357933]]的数据变成['7624081', '6946605', '7357933']


    def parse_comment(self,html): #评论数解析

    json_comment = json.loads(html) #loads转成字典
    comment_list = json_comment["CommentsCount"]

    for comment in comment_list:
    self.data_comment.append(comment['CommentCountStr'])

    def join_url_comment(self): #拼接评论数代码
    url_comment_start = 'https://club.jd.com/comment/productCommentSummaries.action?my=pinglun&referenceIds='
    comment_sku = ','.join(self.data_sku)
    comment_end = url_comment_start + comment_sku
    return comment_end
    # pass

    def join_url_money_up(self): #拼接上半部分代码
    url_money_start_up = 'https://p.3.cn/prices/mgets?callback=jQuery873263&ext=11000000&pin=&type=1&area=6_303_304_36864&skuIds=J_'
    money_sku_up = ','.join(self.data_sku[:30])
    money_end_up = url_money_start_up + money_sku_up
    return money_end_up

    def join_url_money_down(self): #拼接下半部分url
    global num
    if num == 3:
    return 0
    num += 1
    url_money_start_down = 'https://p.3.cn/prices/mgets?callback=jQuery873263&ext=11000000&pin=&type=1&area=6_303_304_36864&skuIds=J_'
    money_sku_down = ','.join(self.data_sku[30:])
    money_end_down = url_money_start_down + money_sku_down
    return money_end_down

    def parse_money(self,html): #解析价钱
    #print(html)
    s = re.findall(r'873263((.*?))', html)[0]
    json_s = demjson.decode(s) #把字符串转成list
    for money in json_s:
    self.data_money.append(money['p'])
    self.data_money_before.append((money['m']))

    class Excel(spiders): #存储
    def __init__(self):
    spiders.__init__(self)
    # 创建一个xlwt对象。
    self.f = xlwt.Workbook(encoding='utf-8')

    # 创建一个单表 sheet1, 在单表里面插入
    self.sheet1 = self.f.add_sheet(u'sheet1', cell_overwrite_ok=True)

    def write_jd(self):
    j = 0
    for name in self.data_name:
    self.sheet1.write(j,0,name)
    j += 1

    m = 0
    for money in self.data_money:
    self.sheet1.write(m,1,money)
    m += 1

    n = 0
    for comment in self.data_comment:
    self.sheet1.write(n,2,comment)
    n += 1

    self.f.save(r'jd_spider606.xls')


    if __name__ == '__main__':
    sp = spiders()
    e = Excel()#开辟内存
    num = 0

    li_url = ['https://list.jd.com/list.html?cat=652,828,841&ev=1107_97252&sort=sort_totalsales15_desc&trans=1&JL=3_%E5%88%86%E7%B1%BB_%E6%99%BA%E8%83%BD%E9%9F%B3%E7%AE%B1#J_crumbsBar','https://list.jd.com/list.html?cat=652,828,841&ev=1107_97252&page=2&sort=sort_totalsales15_desc&trans=1&JL=6_0_0#J_main','https://list.jd.com/list.html?cat=652,828,841&ev=1107_97252&page=3&sort=sort_totalsales15_desc&trans=1&JL=6_0_0#J_main','https://list.jd.com/list.html?cat=652,828,841&ev=1107_97252&page=4&sort=sort_totalsales15_desc&trans=1&JL=6_0_0#J_main']
    for url in li_url:
    sp.get_sku(sp.get_html(url)) # 获得sku
    url_comment = sp.join_url_comment() # 拼接评论url
    comment_html = sp.get_html(url_comment) # 请求评论
    sp.parse_comment(comment_html) # 解析评论

    url_money_up = sp.join_url_money_up() # 拼接价钱上半部分 钱
    money_html_up = sp.get_html(url_money_up) # 请求
    sp.parse_money(money_html_up) # 解析

    url_money_dowm = sp.join_url_money_down() # 拼接价钱下半部分 钱
    if url_money_dowm == 0:
    break
    money_html_down = sp.get_html(url_money_dowm) # 请求
    sp.parse_money(money_html_down) # 解析

    e.write_jd()
  • 相关阅读:
    ReentrantLock-公平锁、非公平锁、互斥锁、自旋锁
    行动的阻碍
    AQS-等待队列
    AQS-volatile、CAS
    UML类图符号
    最小堆
    红黑树
    Java面试题-Collection框架
    Java面试题-Java特性
    Qt QString中文 char* UTF-8 QByteArray QTextCodec unicode gb2312 GBK 乱码和转码问题
  • 原文地址:https://www.cnblogs.com/yuanjia8888/p/9145151.html
Copyright © 2011-2022 走看看