zoukankan      html  css  js  c++  java
  • scrapy项目suningbook

    # -*- coding: utf-8 -*-
    import scrapy
    from copy import deepcopy
    
    class SuningSpider(scrapy.Spider):
        """Crawl book names from Suning, category by category.

        Flow:
            1. ``parse`` reads the category menu on the start page and issues
               one request per sub-category list page.
            2. ``parse_list`` yields one item per book found on a list page
               and follows the pagination of that sub-category.

        Every yielded item is a plain dict with the keys ``title_1``,
        ``href_1`` (lists, top-level category), ``title_2``, ``href_2``
        (sub-category) and ``book_name``.
        """

        name = 'suning'
        allowed_domains = ['suning.com']
        start_urls = ['https://book.suning.com/']

        def parse(self, response):
            """Walk the category menu and request each sub-category list page.

            Menu markup looks like
            ``<div><p>cat A</p><ul><li/>...</ul><p>cat B</p><ul>...</ul></div>``:
            each ``<p>`` (top-level category) is a sibling of the ``<ul>`` that
            holds its sub-categories, so both sequences are walked in lockstep
            with ``zip``.

            :param response: response for the start page
            :return: generator of ``scrapy.Request`` objects handled by
                ``parse_list``; the category data travels in ``meta["item"]``
            """
            menu_list = response.xpath("//div[@class='menu-list']/div[@class='menu-sub']")
            self.logger.debug("found %d menu blocks", len(menu_list))

            for menu in menu_list:
                for div_lr in menu.xpath("./div"):
                    p_list = div_lr.xpath("./p")
                    ul_list = div_lr.xpath("./ul")
                    for p, ul in zip(p_list, ul_list):
                        title_1 = p.xpath("./a/text()").extract()
                        href_1 = p.xpath("./a/@href").extract()

                        for li in ul.xpath("./li"):
                            href = li.xpath("./a/@href").extract_first()
                            if not href:
                                # An <li> without a link cannot be followed;
                                # previously this raised AttributeError.
                                continue

                            # Menu link:  https://list.suning.com/1-502688-0.html
                            # List page:  https://list.suning.com/1-502688-0-0-0-0-0-14-0-4.html
                            list_url = href.rsplit('.', 1)[0] + "-0-0-0-0-14-0-4.html"

                            item = {
                                "title_1": title_1,
                                "href_1": href_1,
                                "title_2": li.xpath("./a/text()").extract_first(),
                                # urljoin leaves absolute URLs untouched and
                                # resolves relative / protocol-relative ones.
                                "href_2": response.urljoin(list_url),
                            }

                            yield scrapy.Request(
                                item["href_2"],  # list page
                                callback=self.parse_list,
                                # deepcopy: title_1/href_1 lists are shared by
                                # all sub-categories of the same category.
                                meta={"item": deepcopy(item)},
                            )

        def parse_list(self, response):
            """Yield one item per book on a list page and follow pagination.

            :param response: response for a sub-category list page; the
                category data is expected in ``response.meta["item"]``
            :return: generator of item dicts, optionally followed by one
                ``scrapy.Request`` for the next page
            """
            self.logger.debug("list page: %s", response.request.url)
            base_item = response.meta["item"]

            for li in response.xpath("//li[@name='']"):
                # A fresh copy per book: yielding one shared dict and mutating
                # it afterwards lets later books overwrite earlier ones before
                # the pipelines have processed them.
                item = deepcopy(base_item)
                item["book_name"] = li.xpath(".//p[@class='sell-point']/a/text()").extract_first()
                yield item

            page_count = response.xpath(
                "//a[@id='nextPage']/preceding-sibling::*[1]/text()"
            ).extract_first()
            current_page = response.xpath("//link[@rel='canonical']/@href").extract_first()
            if not page_count or not current_page:
                return

            try:
                total_pages = int(page_count)
                # Canonical URL: https://list.suning.com/1-<category>-<page>[-...].html
                # The page token may carry the ".html" suffix when it is the
                # last segment, so cut at the first dot before converting.
                current_page_num = int(current_page.split('-')[2].split('.')[0])
                category_id = base_item["href_2"].split('-')[1]
            except (ValueError, IndexError):
                self.logger.warning(
                    "cannot work out pagination for %s", response.request.url
                )
                return

            # NOTE(review): the page index in the URL appears to start at 0
            # while page_count looks like a 1-based total - confirm whether the
            # last request produced here points past the final page.
            if current_page_num < total_pages:
                next_url = 'https://list.suning.com/1-{}-{}-0-0-0-0-14-0-4.html'.format(
                    category_id, current_page_num + 1
                )
                yield scrapy.Request(
                    next_url,
                    callback=self.parse_list,
                    meta={"item": base_item},
                )

    pipelines.py

    import json,codecs
    from scrapy.exceptions import DropItem
    #第一种要在settings中配置保存路径  SUNING_FILE_PATH="suningdata.log"
    #两个piplines的执行顺序,根据权重,先打开第一个,后打开第二个,先执行第一个,后执行第二个,先关闭第二个,后关闭第一个
    class SuningbookPipeline(object):
        """Item pipeline that appends every item to a text file as JSON lines.

        The output path is read from the ``SUNING_FILE_PATH`` setting in
        ``from_crawler``. The file is opened when the spider starts and
        closed when it finishes.
        """

        def __init__(self, path):
            """
            :param path: path of the output file (opened later in append mode)
            """
            self.f = None
            self.path = path

        @classmethod
        def from_crawler(cls, crawler):
            """
            Build the pipeline object from the crawler's settings.

            :param crawler: object exposing ``settings.get``
            :return: pipeline instance configured with ``SUNING_FILE_PATH``
            """
            print('File.from_crawler')
            # Look the output path up in the project settings.
            path = crawler.settings.get('SUNING_FILE_PATH')
            return cls(path)

        def open_spider(self, spider):
            """
            Called when the spider starts; opens the output file for appending.

            :param spider: the running spider (unused)
            :return: None
            """
            print('File.open_spider')
            self.f = open(self.path, 'a+', encoding='utf-8')

        def process_item(self, item, spider):
            """
            Write one item as a single JSON line.

            :param item: mapping-like item; converted with ``dict(item)``
            :param spider: the running spider (unused)
            :return: the item, unchanged, so that later pipelines receive it
            """
            # ensure_ascii=False keeps non-ASCII text readable in the file
            # instead of \uXXXX escapes.
            lines = json.dumps(dict(item), ensure_ascii=False) + "\n"
            self.f.write(lines)
            # Returning the item hands it to the next pipeline; raise DropItem
            # instead to stop further processing of this item.
            return item

        def close_spider(self, spider):
            """
            Called when the spider closes; closes the output file.

            :param spider: the running spider (unused)
            :return: None
            """
            print('File.close_spider')
            # Guard against close_spider running without a successful open_spider.
            if self.f is not None:
                self.f.close()
                self.f = None
    #可以设置两个pipelines ,一个保存到文件,一个保存到数据库
    # class DbSuningbookPipeline(object):
    #     def __init__(self,path):
    #         self.f = None
    #         self.path = path
    #
    #     @classmethod
    #     def from_crawler(cls, crawler):
    #         """
    #         初始化时候,用于创建pipeline对象
    #         :param crawler:
    #         :return:
    #         """
    #         print('File.from_crawler')
    #         #去所有的配置文件中找SUNING_FILE_PATH
    #         path = crawler.settings.get('SUNING_FILE_PATH')
    #         return cls(path)
    #
    #     def open_spider(self,spider):
    #         """
    #         爬虫开始执行时,调用
    #         :param spider:
    #         :return:
    #         """
    #         # if spider.name == 'chouti':
    #         print('File.open_spider')
    #         self.f = open(self.path,'a+',encoding='utf-8')
    #
    #     def process_item(self, item, spider):
    #         # f = open('xx.log','a+')
    #         # f.write(item['href']+'\n')
    #         # f.close()
    #         lines = json.dumps(dict(item), ensure_ascii=False) + "\n"
    #         self.f.write(lines)
    #         return item
    #
    #
    #     def close_spider(self,spider):
    #         """
    #         爬虫关闭时,被调用
    #         :param spider:
    #         :return:
    #         """
    #         print('File.close_spider')
    #         self.f.close()
    #第二种
    # class SuningbookPipeline(object):
    #     """
    #     将数据保存到json文件,由于文件编码问题太多,这里用codecs打开,可以避免很多编码异常问题
    #         在类加载时候自动打开文件,制定名称、打开类型(只读),编码
    #         重载process_item,将item写入json文件,由于json.dumps处理的是dict,所以这里要把item转为dict
    #         为了避免编码问题,这里还要把ensure_ascii设置为false,最后将item返回回去,因为其他类可能要用到
    #         调用spider_closed信号量,当爬虫关闭时候,关闭文件
    #     """
    #     def __init__(self):
    #         self.file = codecs.open('suning.json', 'w', encoding="utf-8")
    #
    #     def process_item(self, item, spider):
    #         lines = json.dumps(dict(item), ensure_ascii=False) + "\n"
    #         ## 注意需要有一个参数ensure_ascii=False ,不然数据会直接为utf编码的方式存入比如:“/xe15”
    #         self.file.write(lines)
    #         return item
    #
    #     def spider_closed(self, spider):
    #         self.file.close()
    
    
    #第三种
    # class SuningbookPipeline(object):
    #     def open_spider(self,spider):
    #         self.f = open('xxx.text','a+',encoding='utf-8')
    #
    #     def process_item(self, item, spider):
    #         # print(item)
    #         line = json.dumps(dict(item), ensure_ascii=False) + "\n"
    #         self.f.write(line)
    #         return item
    #
    #     def close_spider(self,spider):
    #         self.f.close()
    
    # #第四种
    # class SuningbookPipeline(object):
    #     def process_item(self, item, spider):
    #
    #         with open('data.txt', 'a') as f:
    #             f.write(item['title_1'])
    #             f.write(item['href_1'])
    #             f.write(item['book_name'] + '\n')
    #         return item

    settings

    # Output file path; SuningbookPipeline.from_crawler reads this setting.
    SUNING_FILE_PATH = 'suningdata.log'

    # Override the default duplicate-request filter.
    # DUPEFILTER_CLASS = 'scrapy.dupefilters.RFPDupeFilter'
    # DUPEFILTER_CLASS = 'xdb.dupefilters.XdbDupeFilter'

    # Limit the crawl depth.
    # DEPTH_LIMIT = 3

     项目地址:https://github.com/CH-chen/suningbook

  • 相关阅读:
    树型表的设计 上海
    FTP通讯封装 上海
    线程淡写 上海
    TCP通讯故障 上海
    设计模式引导 上海
    初试Delegate 上海
    c# 扫描端口 上海
    攻读计算机研究生的看法(转载) 上海
    挖掘表字段中的汉字 上海
    新生活运动 上海
  • 原文地址:https://www.cnblogs.com/chvv/p/10332465.html
Copyright © 2011-2022 走看看