zoukankan      html  css  js  c++  java
  • JD 评论晒图爬虫

    JD 评论晒图爬虫

    #coding=utf-8
    import requests
    import re
    import os
    
    __author__ = 'depy'
    
    """
    jd 评论晒图爬虫
    @productId 商品id
    @startpage 开始页数
    @endpage 结束页数
    """
    
    class JDPIC(object):
        """Crawler for the customer-uploaded review photos of one JD product.

        sendReq() fetches one page of the image-comment list and returns the
        image URLs found in it; downloadImageFile() saves one such image into
        a local directory.

        @productId  id of the product whose review photos are requested
        @startpage  first page number; only stored here, for the caller's loop
        @endpage    last page number (default 20); only stored here as well
        """

        # Finds the protocol-relative image URLs in the response text and
        # captures each one without its leading '//'.
        _IMAGE_URL_RE = re.compile(r'"imageUrl":"//(.*?)"')

        def __init__(self,productId,startpage,endpage=20):
            self.headers = {
                'User-Agent':'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
                'Accept':'*/*',
                # Advertise only the encodings requests decodes out of the box.
                # Offering 'sdch'/'br' lets the server answer with a body that
                # cannot be decoded, which leaves r.text unusable for the regex.
                'Accept-Encoding':'gzip, deflate',
                'Accept-Language':'zh-CN,zh;q=0.8',
                'Cookie':''
            }
            self.url = 'https://club.jd.com/discussion/getProductPageImageCommentList.action'
            self.startpage = startpage
            self.productId = productId
            self.endpage = endpage

        def sendReq(self,page):
            """Fetch one page of image comments and return its image URLs.

            @page  page number passed to the endpoint as the 'page' parameter

            Returns a list of URL strings without scheme and without the
            leading '//' (the form downloadImageFile() expects). The list is
            empty when the response text contains no "imageUrl" entries.
            Network errors raised by requests.get() propagate to the caller.
            """
            params = {
                'productId':self.productId,
                'isShadowSku':'0',
                'callback':'jQuery219465',
                'page':page,
                'pageSize':20
            }
            r = requests.get(self.url,params=params,headers=self.headers,timeout=10)
            return self._IMAGE_URL_RE.findall(r.text)

        def downloadImageFile(self,imgUrl,dirName='JDPIC1'):
            """Download one image and store it under dirName.

            @imgUrl   image URL without scheme, e.g. 'host/path/name.jpg'
            @dirName  target directory, created when missing (default 'JDPIC1')

            The file is named after the last path segment of imgUrl. Responses
            with a status other than 200 are reported and skipped so that an
            error page is never saved as an image. Network errors raised by
            requests propagate to the caller.
            """
            local_filename = imgUrl.split('/')[-1]
            # Single-argument print() behaves the same on Python 2 and 3.
            print("Download Image File= %s" % local_filename)
            r = requests.get('http://'+imgUrl, headers=self.headers, stream=True, timeout=20)
            try:
                if r.status_code != 200:
                    print("Skip Image File= %s (HTTP %s)" % (local_filename, r.status_code))
                    return
                if not os.path.exists(dirName):
                    os.makedirs(dirName)
                # The with-block closes (and thereby flushes) the file itself.
                with open(os.path.join(dirName, local_filename), 'wb') as f:
                    for chunk in r.iter_content(chunk_size=1024):
                        if chunk:
                            f.write(chunk)
            finally:
                # A streamed response keeps its connection until it is fully
                # read or closed; close it even when the transfer fails.
                r.close()
    
    if __name__ == '__main__':
        # Crawl every page from startpage to endpage (both included) and
        # download each review image found on those pages.
        J = JDPIC(1111,51,100)  # product id: change it to the product you want
        failed = 0
        for page in range(int(J.startpage), int(J.endpage) + 1):
            try:
                picurls = J.sendReq(page)
            except requests.RequestException as err:
                # One unreachable page should not abort the remaining pages.
                print("page %s failed: %s" % (page, err))
                failed += 1
                continue
            for picurl in picurls:
                try:
                    J.downloadImageFile(picurl)
                except requests.RequestException as err:
                    # Likewise, skip a single broken image and keep going.
                    print("download failed for %s: %s" % (picurl, err))
                    failed += 1

        if failed:
            print("downpic finished, %d failed" % failed)
        else:
            print("downpic success")
  • 相关阅读:
    C#处理json实战
    HDU3994(Floyd + 期望概率)
    POJ1270 Following Orders (拓扑排序)
    HDU 3634 City Planning (离散化)
    HDU4762(JAVA大数)
    POJ3026(BFS + prim)
    POJ1679(次小生成树)
    UVA10487(二分)
    ZOJ 2048(Prim 或者 Kruskal)
    FZU 1856 The Troop (JAVA高精度)
  • 原文地址:https://www.cnblogs.com/depycode/p/6933960.html
Copyright © 2011-2022 走看看