  • JD Books distributed crawler (scrapy-redis)

      1. jd.py

    # -*- coding: utf-8 -*-
    import scrapy
    from copy import deepcopy
    import json
    import urllib.parse
    from scrapy_redis.spiders import RedisSpider
    from JingDong.items import JingdongItem
    class JdSpider(RedisSpider):
        name = 'jd'
        # allowed_domains = ['jd.com']
        # start_urls = ['https://book.jd.com/booksort.html/']
        redis_key = 'jingdong'
        # Parse the top-level and sub categories on the book sort page
        def parse(self, response):
            dt_list=response.xpath('//div[@class="mc"]/dl/dt')  # top-level category nodes
            for dt in dt_list:
                item=JingdongItem()
                item['b_cate']=dt.xpath('./a/text()').extract_first()  # top-level category title
                em_list=dt.xpath('./following-sibling::dd[1]/em')  # sub-category nodes under this dt
                for em in em_list:
                    item['s_href']=em.xpath('./a/@href').extract_first()  # sub-category url
                    item['s_cate']=em.xpath('./a/text()').extract_first()  # sub-category title
                    if item['s_href'] is not None:
                        item['s_href']='https:'+item['s_href']
                        # deep copy so later loop iterations do not overwrite the item
                        # carried by requests that are still waiting in the queue
                        yield scrapy.Request(item['s_href'],callback=self.parse_book_list,meta={'item':deepcopy(item)})
    
        # Parse all books listed under one sub-category
        def parse_book_list(self,response):
            item=response.meta['item']
            li_list=response.xpath('//div[@id="plist"]/ul/li')
            for li in li_list:
                item['book_name']=li.xpath('.//div[@class="p-name"]/a/em/text()').extract_first().strip()
                item['book_img']=li.xpath('.//div[@class="p-img"]//img/@src').extract_first()  # cover image link
                if item['book_img'] is None:
                    # lazily loaded covers keep the url in data-lazy-img instead of src
                    item['book_img']=li.xpath('.//div[@class="p-img"]//img/@data-lazy-img').extract_first()
                item['book_img']='https:'+item['book_img'] if item['book_img'] is not None else None
                item['book_author']=li.xpath('.//span[@class="author_type_1"]/a/text()').extract()  # author(s)
                item['book_publish']=li.xpath('.//span[@class="p-bi-store"]/a/@title').extract_first()  # publisher
                item['book_publish_date']=li.xpath('.//span[@class="p-bi-date"]/text()').extract_first().strip()  # publication date
                item['book_sku']=li.xpath('./div/@data-sku').extract_first()  # book SKU (product id)

                # The price is not in the page; it comes from a separate request to the price API.
                # item is still used below for pagination, so pass a deep copy -- in Scrapy spiders
                # with multi-level categories, always check whether a deep copy is needed.
                yield scrapy.Request('https://p.3.cn/prices/mgets?skuIds={}'.format(item['book_sku']),callback=self.parse_book_price,meta={'item':deepcopy(item)})
    
            # Follow the "next page" link of the book list
            next_url=response.xpath('//a[@class="pn-next"]/@href').extract_first()
            if next_url is not None:
                next_url=urllib.parse.urljoin(response.url,next_url)
                yield scrapy.Request(next_url,callback=self.parse_book_list,meta={'item':item})
    
    
    
        # Parse the price API response
        def parse_book_price(self,response):
            item=response.meta['item']
            # The response body is a JSON array with one entry per requested SKU
            book_info_dict=json.loads(response.body.decode())
            item['book_price']=book_info_dict[0]['op']
            yield item  # hand the finished item to the item pipelines
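
Because JdSpider is a RedisSpider, the crawl does not start on its own: it idles until a start URL is pushed onto the Redis list named by redis_key ('jingdong'). Below is a minimal seeding sketch using redis-py; the script name is illustrative and the Redis address is taken from REDIS_URL in settings. The redis-cli equivalent is: lpush jingdong https://book.jd.com/booksort.html

    # seed_start_url.py -- illustrative sketch, assumes Redis at 127.0.0.1:6379 (see REDIS_URL)
    import redis

    r = redis.StrictRedis(host='127.0.0.1', port=6379)
    # scrapy-redis spiders wait until a start URL appears in their redis_key list
    r.lpush('jingdong', 'https://book.jd.com/booksort.html')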

      2. items.py

    import scrapy
    
    
    class JingdongItem(scrapy.Item):
        # define the fields for your item here like:
        # name = scrapy.Field()
        b_cate=scrapy.Field()
        s_href=scrapy.Field()
        s_cate=scrapy.Field()
        book_name=scrapy.Field()
        book_img=scrapy.Field()
        book_author=scrapy.Field()
        book_publish=scrapy.Field()
        book_publish_date=scrapy.Field()
        book_sku=scrapy.Field()
        book_price=scrapy.Field()

      3. settings.py

    # -*- coding: utf-8 -*-
    
    # Scrapy settings for JingDong project
    #
    # For simplicity, this file contains only settings considered important or
    # commonly used. You can find more settings consulting the documentation:
    #
    #     https://doc.scrapy.org/en/latest/topics/settings.html
    #     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
    #     https://doc.scrapy.org/en/latest/topics/spider-middleware.html
    
    BOT_NAME = 'JingDong'
    
    SPIDER_MODULES = ['JingDong.spiders']
    NEWSPIDER_MODULE = 'JingDong.spiders'
    
    
    # Crawl responsibly by identifying yourself (and your website) on the user-agent
    USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36'
    
    # Obey robots.txt rules
    ROBOTSTXT_OBEY = False
    
    # Configure maximum concurrent requests performed by Scrapy (default: 16)
    #CONCURRENT_REQUESTS = 32
    
    # Configure a delay for requests for the same website (default: 0)
    # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
    # See also autothrottle settings and docs
    #DOWNLOAD_DELAY = 3
    # The download delay setting will honor only one of:
    #CONCURRENT_REQUESTS_PER_DOMAIN = 16
    #CONCURRENT_REQUESTS_PER_IP = 16
    
    # Disable cookies (enabled by default)
    #COOKIES_ENABLED = False
    
    # Disable Telnet Console (enabled by default)
    #TELNETCONSOLE_ENABLED = False
    
    # Override the default request headers:
    #DEFAULT_REQUEST_HEADERS = {
    #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    #   'Accept-Language': 'en',
    #}
    
    # Enable or disable spider middlewares
    # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
    #SPIDER_MIDDLEWARES = {
    #    'JingDong.middlewares.JingdongSpiderMiddleware': 543,
    #}
    
    # Enable or disable downloader middlewares
    # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
    #DOWNLOADER_MIDDLEWARES = {
    #    'JingDong.middlewares.JingdongDownloaderMiddleware': 543,
    #}
    
    # Enable or disable extensions
    # See https://doc.scrapy.org/en/latest/topics/extensions.html
    #EXTENSIONS = {
    #    'scrapy.extensions.telnet.TelnetConsole': None,
    #}
    
    # Configure item pipelines
    # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
    ITEM_PIPELINES = {
       # 'JingDong.pipelines.JingdongPipeline': 300,
        'scrapy_redis.pipelines.RedisPipeline': 400
    }
    
    # Enable and configure the AutoThrottle extension (disabled by default)
    # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
    #AUTOTHROTTLE_ENABLED = True
    # The initial download delay
    #AUTOTHROTTLE_START_DELAY = 5
    # The maximum download delay to be set in case of high latencies
    #AUTOTHROTTLE_MAX_DELAY = 60
    # The average number of requests Scrapy should be sending in parallel to
    # each remote server
    #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
    # Enable showing throttling stats for every response received:
    #AUTOTHROTTLE_DEBUG = False
    
    # Enable and configure HTTP caching (disabled by default)
    # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
    #HTTPCACHE_ENABLED = True
    #HTTPCACHE_EXPIRATION_SECS = 0
    #HTTPCACHE_DIR = 'httpcache'
    #HTTPCACHE_IGNORE_HTTP_CODES = []
    #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
    # scrapy-redis: shared request dedup filter and scheduler with a persistent queue,
    # plus the Redis connection shared by every crawler instance
    DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
    SCHEDULER = "scrapy_redis.scheduler.Scheduler"
    SCHEDULER_PERSIST = True
    REDIS_URL = 'redis://127.0.0.1:6379'
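
With the RedisPipeline enabled above, every scraped item is serialized to JSON and pushed onto a Redis list (the scrapy-redis default key is "<spider name>:items", i.e. "jd:items" here), so any number of spider processes can share one request queue while a separate consumer writes items to storage. A minimal consumer sketch follows, assuming that default key and the book_price field from the spider:

    # consume_items.py -- illustrative sketch, assumes RedisPipeline's default key 'jd:items'
    import json
    import redis

    r = redis.StrictRedis(host='127.0.0.1', port=6379)
    while True:
        # blpop blocks until an item is available and returns (key, raw_json)
        _, raw = r.blpop('jd:items')
        item = json.loads(raw)
        print(item.get('book_name'), item.get('book_price'))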