  • Scraping Tmall product information with Scrapy

    The spider below searches Taobao/Tmall for a keyword, pages through the result list, follows each product's detail page, and fetches up to five reviews per product from the Tmall rate API; the pipeline then downloads the product images, writes a CSV file, and stores each item in MongoDB and MySQL.

    spider
    # -*- coding: utf-8 -*-
    from urllib.parse import urlencode
    import requests
    import scrapy
    import re
    import json
    from ..items import TmallItem
    
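    # Session cookies copied from a logged-in Taobao browser session; replace them with your own.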
    cookie = {'thw': 'cn', 'hng': 'CN%7Czh-CN%7CCNY%7C156', 'tracknick': 'yzhy1372', 'tg': '0', 'miid': '813697773983481206', 'x': 'e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0%26__ll%3D-1%26_ato%3D0', '_cc_': 'UIHiLt3xSw%3D%3D', 'enc': '52fRsc7qpI96LDqf%2FkMA7AfWwN0%2BYmGMXsa4AdC3He4jEbrP%2BRbmYwz%2Bn3xwMrIk4fqBuRCR6BYtQvI%2FP7UBRw%3D%3D', 'UM_distinctid': '165c600d3903a8-0dc9190eb920d3-c343567-100200-165c600d39319', 'cna': 'iSbqEnsQrkoCAXM7KlL0pQWu', 't': '8489c373deedc2a297ebe4c4ad6debb5', '_uab_collina': '153991002330679083015734', '_umdata': '6AF5B463492A874D05644EF9A3CE888C0BB3EC8395620198BCCF71C40733CB6AAB98C444C566382ECD43AD3E795C914C010C8EDA083E64FAFA9E46E3CF4DEA41', '_m_h5_tk': 'bf46d22c8564ad537f01664eb002112c_1539921942514', '_m_h5_tk_enc': 'f2a1bff4b69d2c036314c66504744070', 'v': '0', 'cookie2': '2b9488dea40dbe840f20ea5f14836ef7', '_tb_token_': 'fb83ee7ebeed7', 'alitrackid': 'www.taobao.com', 'lastalitrackid': 'www.taobao.com', 'JSESSIONID': '9787B4CF4D2812E2BA1E407B224AE53A', 'isg': 'BOfnzJhvcDexNPXcxwaGYkk8dhtxxJBNn5b9BrlUMnacqAVqyz-ynoHpzuiTQJPG', 'Hm_lvt_dde6ba2851f3db0ddc415ce0f895822e': '1539912803,1539913323,1539944839,1539944853', 'Hm_lpvt_dde6ba2851f3db0ddc415ce0f895822e': '1539944853', 'unb': '624984624', 'uc1': 'cookie16=VFC%2FuZ9az08KUQ56dCrZDlbNdA%3D%3D&cookie21=WqG3DMC9FxUx&cookie15=VT5L2FSpMGV7TQ%3D%3D&existShop=false&pas=0&cookie14=UoTfItnW5e2f1g%3D%3D&tag=8&lng=zh_CN', 'sg': '244', '_l_g_': 'Ug%3D%3D', 'skt': '5c93ad4f47f0c1ca', 'cookie1': 'U%2BTs5qAQHjB1CoYPMJcEQ4UfC6zh%2FdhqLG66mPjcz38%3D', 'csg': 'e312c3a6', 'uc3': 'vt3=F8dByRmq%2Bp63ob4wR7I%3D&id2=VW3j%2BbmcVcIV&nk2=GhETDBFSx%2Fs%3D&lg2=VT5L2FSpMGV7TQ%3D%3D', 'existShop': 'MTUzOTk0NTUzNw%3D%3D', 'lgc': 'yzhy1372', 'dnk': 'yzhy1372', '_nk_': 'yzhy1372', 'cookie17': 'VW3j%2BbmcVcIV', 'mt': 'np='}
    
    class MianbaoSpider(scrapy.Spider):
        name = "mianbao"
        # allowed_domains = ["https://www.taobao.com"]
        def start_requests(self):
            url = 'https://s.taobao.com/search'
            pars = {
                'q': '女士上衣',     # search keyword ("women's tops")
                'initiative_id': 'staobaoz_20181019',
                'ie': 'utf8',
                'tab': 'mall',       # search scope: all = Taobao + Tmall, mall = Tmall only, old = second-hand
                # 's': '0',          # result offset; increases by 44 per page
                'sort': 'sale-desc'  # sort by sales, descending; other values:
                                     # default (relevance), credit-desc (seller rating),
                                     # price-asc (price ascending), price-desc (price descending)
            }
            data = urlencode(pars)
            urls = [url+'?'+data+'&s='+str(page) for page in range(0,450,44)]  # page through the results (offset step 44)
            for u in urls:
                yield scrapy.Request(u,self.mianbao,cookies=cookie)
    
    
        def mianbao(self, response):
            # The search results are embedded in the page as a JS variable, g_page_config.
            res = re.compile(r'g_page_config = (.*?);\s*g_srp_loadCss', re.S)
            datas = json.loads(res.findall(response.text)[0])['mods']['itemlist']['data']['auctions']
            for i in datas:
                title = i['raw_title']                   # product title
                pic_url = 'http:' + i['pic_url']         # list-page thumbnail URL
                # view_price = i['view_price']           # list-page price
                detail_url = 'https:' + i['detail_url']  # product detail URL
                nick = i['nick']                         # shop name
                view_sales = i['view_sales']             # number of payments
                item_loc = i['item_loc']                 # item location
                comment_count = i['comment_count']       # number of comments
                user_id = i['user_id']                   # seller id, needed later to fetch reviews
                yield scrapy.Request(detail_url, self.detail_info,
                                     meta={'title': title, 'nick': nick, 'view_sales': view_sales,
                                           'item_loc': item_loc, 'comment_count': comment_count,
                                           'pic_url': pic_url, 'user_id': user_id})
    
    
        def detail_info(self, response):
            item = TmallItem()
            res = re.compile(r'"defaultItemPrice":"(.*?)",', re.S)
            price = res.findall(response.text)[0]  # unit price
            good_imgs = response.xpath('//*[@id="J_UlThumb"]/li/a/img/@src').extract()  # gallery thumbnails
            good_info = response.xpath('//*[@id="J_AttrUL"]/li/text()').extract()  # product attribute list
            if len(good_info) == 0:
                good_infos = '暂无'  # "not available"
            else:
                good_infos = good_info
            item_id = re.findall(r'id=(.*?)&', response.url)[0]  # product id from the detail URL
            user_id = response.meta['user_id']  # seller id, passed along from the list page
    
            url = 'https://rate.tmall.com/list_detail_rate.htm'
            data = {
                'itemId': item_id,    # product id
                'sellerId': user_id   # seller id
            }
            headers = {
                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
            }
    
            try:
                rote_response = requests.get(url=url, params=data, headers=headers)  # fetch reviews
                rote_json = json.loads(re.findall(r'jsonp128\((.*?)\)', rote_response.text)[0])['rateDetail']['rateList']
                rote_list = []  # review list
                for i in rote_json[:5]:  # keep at most 5 reviews per product
                    rote_dict = {}
                    rote_dict['auctionSku'] = i['auctionSku']    # purchased SKU
                    rote_dict['rateContent'] = i['rateContent']  # review text
                    rote_dict['pics'] = i['pics']                # review images
                    rote_list.append(rote_dict)
            except Exception:
                print('Reviews for this product could not be fetched')
                rote_list = []
    
            item['title'] = response.meta['title']
            item['nick'] = response.meta['nick']
            item['price'] = price
            item['view_sales'] = response.meta['view_sales']
            item['item_loc'] = response.meta['item_loc']
            item['comment_count'] = response.meta['comment_count']
            item['pic_url'] = response.meta['pic_url']
            item['good_infos'] = good_infos
            item['good_imgs'] = good_imgs
            item['rote_list'] = rote_list
            return item
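
    The spider imports TmallItem from the project's items.py, which the post does not show. A minimal sketch, with the field names inferred from what the spider and pipeline actually fill:

    items
    # -*- coding: utf-8 -*-
    import scrapy


    class TmallItem(scrapy.Item):
        title = scrapy.Field()          # product title
        nick = scrapy.Field()           # shop name
        price = scrapy.Field()          # unit price
        view_sales = scrapy.Field()     # number of payments
        item_loc = scrapy.Field()       # item location
        comment_count = scrapy.Field()  # number of comments
        pic_url = scrapy.Field()        # list-page thumbnail URL
        good_infos = scrapy.Field()     # product attributes
        good_imgs = scrapy.Field()      # gallery image URLs
        rote_list = scrapy.Field()      # up to five reviews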
    pipelines
    # -*- coding: utf-8 -*-
    import csv
    import os

    import pymongo
    import pymysql
    import requests

    # MongoDB connection: database "tmall", collection "good_info"
    mongo = pymongo.MongoClient('127.0.0.1', 27017)
    mongodb = mongo['tmall']
    mongocoll = mongodb['good_info']

    # MySQL connection
    db = pymysql.connect(
        db='test',
        user='root',
        port=3306,
        host='localhost',
        password='mysql',
        charset='utf8'
    )
    cursor = db.cursor()
    
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
    
    
    class TmallPipeline(object):
        def process_item(self, item, spider):
            good_imgs = item['good_imgs']
            title = item['title']

            path = os.path.join('tmalls', title)  # one folder per product
            if not os.path.exists(path):
                os.makedirs(path)

            img = []  # rewritten full-size image links
            count = 0
            for i in good_imgs:
                count += 1
                url = 'https:' + i[:-13]  # drop the thumbnail size suffix to get the full-size image
                img.append(url)
                with open(os.path.join(path, str(count) + '.jpg'), 'wb') as f:  # save the image
                    response = requests.get(url)
                    f.write(response.content)

            item['good_imgs'] = img
            with open(os.path.join(path, '商品信息.csv'), 'w+', encoding='utf-8', newline='') as f:  # "product info" CSV
                writer = csv.writer(f)
                for k, j in dict(item).items():
                    writer.writerow([k, j])  # one key/value pair per row

            mongocoll.insert_one(dict(item))
    
            title = item['title']
            price = item['price']
            good_infos = item['good_infos']
            view_sales = item['view_sales']
            comment_count = item['comment_count']
            item_loc = item['item_loc']
            nick = item['nick']
            sql = 'insert into tmall values (0,%s,%s,%s,%s,%s,%s,%s)'
            cursor.execute(sql,[title,price,str(good_infos),view_sales,comment_count,item_loc,nick])
            db.commit()
    
            return item
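
    For the pipeline to run it must be registered in the project settings, and the MySQL insert assumes a tmall table with an auto-increment id followed by seven text columns. A minimal sketch of both; the Scrapy project name and the column names/types are assumptions, only the column order and count follow from the insert statement.

    settings
    # Register the pipeline (assuming the Scrapy project is named "tmall"):
    ITEM_PIPELINES = {
        'tmall.pipelines.TmallPipeline': 300,
    }

    A possible one-time table setup, runnable with the cursor defined at the top of the pipeline module (inserting 0 into the auto-increment id column still generates a fresh id):

    cursor.execute('''
        CREATE TABLE IF NOT EXISTS tmall (
            id            INT PRIMARY KEY AUTO_INCREMENT,
            title         VARCHAR(255),
            price         VARCHAR(64),
            good_infos    TEXT,
            view_sales    VARCHAR(64),
            comment_count VARCHAR(64),
            item_loc      VARCHAR(64),
            nick          VARCHAR(255)
        ) DEFAULT CHARSET = utf8
    ''')

    With items.py, the spider, the pipeline and the setting in place, the crawl runs as scrapy crawl mianbao.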
  • Related reading:
    Socket-based TCP and UDP programming
    (Repost) Socket-based TCP and UDP programming
    The essence of MFC
    How a Windows program runs internally (from http://www.cnblogs.com/zhili/p/WinMain.html)
    MySQL master-slave switchover
    Fixing MySQL master-slave inconsistency
    PostgreSQL + Slony-I master-slave installation and configuration
    InnoDB storage engine tuning
    MySQL lock optimization
    Physical backup of the NDB Cluster storage engine
  • Original post: https://www.cnblogs.com/lvye001/p/9821027.html