zoukankan      html  css  js  c++  java
  • 淘宝产品抓取实战

    #!coding=utf-8
    import requests
    import re
    import time
    import json
    from requests.packages.urllib3.exceptions import InsecureRequestWarning
    import pandas as pd
    requests.packages.urllib3.disable_warnings(InsecureRequestWarning)  # suppress urllib3's InsecureRequestWarning (SSL warning) messages
     
    class tb(object):
        """Scraper for the Taobao mobile (H5) search endpoint.

        Pages through the search results for one keyword and stores the
        extracted product fields in ``out.csv`` inside ``path``.
        """

        # Search endpoint; the two ``{}`` slots are the keyword and the page index.
        _SEARCH_URL = 'https://s.m.taobao.com/search?event_submit_do_new_search_auction=1&_input_charset=utf-8&topSearch=1&atype=b&searchfrom=1&action=home%3Aredirect_app_action&from=1&q={}&sst=1&n=20&buying=buyitnow&m=api4h5&abtest=18&wlsort=18&style=list&closeModues=nav%2Cselecthot%2Conesearch&page={}'

        # (CSV column name, key in a ``listItem`` entry, required?)
        # Optional keys fall back to '' when the entry does not carry them.
        _COLUMNS = (
            ('title_名称', 'title', True),
            ('sold_月销量', 'sold', True),
            ('commentCount_评论量', 'commentCount', False),
            ('item_id_商品ID', 'item_id', True),
            ('userId_商家ID', 'userId', True),
            ('nick_商家名称', 'nick', True),
            ('location_商家地址', 'location', True),
            ('pic_path_图片', 'pic_path', True),
            ('itemNumId_商品NID', 'itemNumId', True),
            ('originalPrice_原价', 'originalPrice', True),
            ('price_售价', 'price', True),
            ('category_类别ID', 'category', False),
            ('itemurl_商品链接', 'url', True),
        )

        def __init__(self, path, seach):
            """Create the HTTP session used for all search requests.

            Args:
                path: Directory in which ``out.csv`` is written.
                seach: Search keyword sent as the ``q`` query parameter.
            """
            self.path = path
            self.seach = seach
            self.s = requests.session()
            headers = {
                'Host': 's.m.taobao.com',
                'Accept-Encoding': 'br, gzip, deflate',
                'Connection': 'keep-alive',
                'Accept': 'application/json',
                'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 12_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) FxiOS/10.6b8836 Mobile/16A366 Safari/605.1.15',
                'Accept-Language': 'zh-cn',
                'X-Requested-With': 'XMLHttpRequest',
            }
            # Every request made through the session carries these headers.
            self.s.headers.update(headers)

        @classmethod
        def _item_to_row(cls, item):
            """Map one ``listItem`` entry to a dict keyed by CSV column name.

            Raises:
                KeyError: If a required key is missing from ``item``.
            """
            return {
                column: (item[key] if required else item.get(key, ''))
                for column, key, required in cls._COLUMNS
            }

        def seachdata(self, pages=100, delay=1.25):
            """Fetch result pages and write their items to ``out.csv``.

            Each page is written exactly once: the first page that yields
            items creates the file with a header row, later pages are
            appended. Paging stops at the first page with no items.

            Args:
                pages: Maximum number of pages to request (page indices
                    ``0 .. pages - 1``).
                delay: Seconds to sleep before each request.
            """
            import os  # local import: the module header does not import os

            # Join with a separator so the file lands inside ``self.path``
            # rather than next to it under a fused name.
            out_file = os.path.join(self.path, 'out.csv')
            columns = [column for column, _, _ in self._COLUMNS]
            wrote_header = False

            for i in range(0, pages):
                time.sleep(delay)
                url = self._SEARCH_URL.format(self.seach, i)
                print(i)
                # NOTE(review): verify=False turns off TLS certificate
                # validation; kept from the original, but confirm it is needed.
                req = self.s.get(url=url, verify=False).text
                try:
                    js = json.loads(req)
                except ValueError:
                    # Not JSON: skip this page instead of reusing stale data.
                    print('err')
                    continue
                print(js)

                list_items = js.get('listItem') if isinstance(js, dict) else None
                if not list_items:
                    break

                rows = [self._item_to_row(j) for j in list_items]
                df = pd.DataFrame(rows, columns=columns)
                if wrote_header:
                    df.to_csv(out_file, index=False, header=False, mode='a', encoding="GB18030")
                else:
                    # First successful page: (re)create the file with a header.
                    df.to_csv(out_file, index=False, header=True, encoding="GB18030")
                    wrote_header = True
     
     
    if __name__ == '__main__':
        # Output path and search keyword. The backslash is restored here: the
        # literal had collapsed "\t" into a TAB character, giving an invalid path.
        t = tb(r'E:\taobao', '手机')
        t.seachdata()
  • 相关阅读:
    MVC1
    Linux中的软连接与硬连接
    python之multiprocessing(二):multiprocessing.Pool
    python之multiprocessing(一)
    python之paramiko(一)
    python_Exception之:TypeError: 'int' object is not iterable
    python之socket(一)
    python之pymysql(一)
    生成树协议--STP
    路由协议--RIP
  • 原文地址:https://www.cnblogs.com/chenxi188/p/10524190.html
Copyright © 2011-2022 走看看