zoukankan      html  css  js  c++  java
  • JD 评论晒图爬虫

    JD 评论晒图爬虫

    #coding=utf-8
    import requests
    import re
    import os
    
    __author__ = 'depy'
    
    """
    jd 评论晒图爬虫
    @productId 商品id
    @startpage 开始页数
    @endpage 结束页数
    """
    
    class JDPIC(object):
        """Crawler for the review images ("comment show pictures") of a JD product.

        Parameters:
            productId: id of the product whose review images are fetched.
            startpage: first result page to request.
            endpage:   last result page to request (defaults to 20).

        The constructor only stores these values; callers decide how the page
        range is iterated and call sendReq / downloadImageFile themselves.
        """

        # Directory (relative to the current working directory) that receives
        # the downloaded images. Override on an instance or subclass to change it.
        download_dir = 'JDPIC1'

        def __init__(self,productId,startpage,endpage=20):
            self.headers = {
                'User-Agent':'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
                'Accept':'*/*',
                # Only advertise encodings that requests decodes out of the box;
                # announcing "sdch"/"br" invites a body that r.text cannot decode,
                # which would make the regex in sendReq silently match nothing.
                'Accept-Encoding':'gzip, deflate',
                'Accept-Language':'zh-CN,zh;q=0.8',
                'Cookie':''
            }
            self.url = 'https://club.jd.com/discussion/getProductPageImageCommentList.action'
            self.startpage = startpage
            self.productId = productId
            self.endpage = endpage

        def sendReq(self,page):
            """Request one page of image comments and return the image URLs found.

            Returns a list of strings, one per '"imageUrl":"//..."' occurrence in
            the response text, with the leading '//' stripped (no scheme). The
            list is empty when the response contains no such field.
            """
            params = {
                'productId':self.productId,
                'isShadowSku':'0',
                # NOTE(review): presumably a JSONP callback name; the response is
                # scanned with a regex rather than parsed, so the wrapper is ignored.
                'callback':'jQuery219465',
                'page':page,
                'pageSize':20
            }
            r = requests.get(self.url,params=params,headers=self.headers,timeout=10)
            return re.findall(r'"imageUrl":"//(.*?)"',r.text)

        def downloadImageFile(self,imgUrl):
            """Download one image into download_dir, named after the URL's last segment.

            imgUrl is a scheme-less URL as returned by sendReq; 'http://' is
            prepended before the request. Responses with a status other than 200
            are reported and skipped so that an error page is never saved under
            an image file name. Returns None.
            """
            local_filename = imgUrl.split('/')[-1]
            # Single-argument parenthesised print behaves the same on Python 2 and 3.
            print("Download Image File= %s" % local_filename)
            imgUrl = 'http://'+imgUrl
            r = requests.get(imgUrl, headers=self.headers, stream=True, timeout=20)
            try:
                if r.status_code != 200:
                    print("Skip Image File= %s (HTTP %s)" % (local_filename, r.status_code))
                    return
                dirName = self.download_dir
                if not os.path.exists(dirName):
                    os.makedirs(dirName)
                # The with-statement flushes and closes the file on exit.
                with open(os.path.join(dirName, local_filename), 'wb') as f:
                    for chunk in r.iter_content(chunk_size=1024):
                        if chunk:
                            f.write(chunk)
            finally:
                # A streamed response keeps its connection checked out until it
                # is fully consumed or closed; always release it.
                r.close()
    
    if __name__ == '__main__':
        # Crawl pages 51..100 for product 1111; change the product id as needed.
        J = JDPIC(1111,51,100)
        # endpage is inclusive, hence the +1. Iterate the range directly instead
        # of binding it to a name that shadows the builtin `list`.
        for page in range(int(J.startpage), int(J.endpage) + 1):
            # Each page yields zero or more scheme-less image URLs.
            for picurl in J.sendReq(page):
                J.downloadImageFile(picurl)

        # Single-argument parenthesised print works on both Python 2 and 3.
        print("downpic success")
  • 相关阅读:
    修复 Visual Studio Error “No exports were found that match the constraint”
    RabbitMQ Config
    Entity Framework Extended Library
    Navisworks API 简单二次开发 (自定义工具条)
    NavisWorks Api 简单使用与Gantt
    SQL SERVER 竖表变成横表
    SQL SERVER 多数据导入
    Devexpress GridControl.Export
    mongo DB for C#
    Devexress XPO xpPageSelector 使用
  • 原文地址:https://www.cnblogs.com/depycode/p/6933960.html
Copyright © 2011-2022 走看看