  • Using Scrapy's CrawlSpider

    1. Create the project

    The project here is named scrapyuniversal, and I create it in the root of the D drive. The steps are as follows.

    Open cmd, switch to the root of the D drive, and run the following command:

    scrapy startproject scrapyuniversal

    If it succeeds, a folder named scrapyuniversal will be created in the root of the D drive.

    2. Generate the crawl template

    Open a command-line window, change into the scrapyuniversal folder just created on the D drive, and run the following command:

    scrapy genspider -t crawl china tech.china.com
    

    If it succeeds, a new spider file appears in the spiders directory under scrapyuniversal. We will look at that spider file below; the code is commented.
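
    For reference, the file that genspider creates before any editing looks roughly like this (the exact stub varies between Scrapy versions); section 4 below replaces its placeholder rule and parse_item with the real logic:

    # -*- coding: utf-8 -*-
    import scrapy
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule


    class ChinaSpider(CrawlSpider):
        name = 'china'
        allowed_domains = ['tech.china.com']
        start_urls = ['http://tech.china.com/']

        rules = (
            Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
        )

        def parse_item(self, response):
            # Placeholder callback generated by the template; extraction code goes here.
            return {}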

    3. Directory structure

    scrapyuniversal
    │  scrapy.cfg
    │  spider.sql
    │  start.py
    │  
    └─scrapyuniversal
        │  items.py
        │  loaders.py
        │  middlewares.py
        │  pipelines.py
        │  settings.py
        │  __init__.py
        │  
        ├─spiders
        │  │  china.py
        │  │  __init__.py
        │  │  
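
    The tree also shows a start.py and a spider.sql at the project root; these are files added by the author and are not shown in the post. start.py is commonly a small runner script so the spider can be launched from an IDE; a sketch of what it might contain (hypothetical, assuming that is its purpose here):

    # start.py - hypothetical content; the original file is not included in the post.
    from scrapy import cmdline

    # Equivalent to running "scrapy crawl china" from the project root.
    cmdline.execute("scrapy crawl china".split())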
    
    
    

      

    4. china.py

    # -*- coding: utf-8 -*-
    import scrapy
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule
    from ..items import NewsItem
    from ..loaders import ChinaLoader
    class ChinaSpider(CrawlSpider):
        name = 'china'
        allowed_domains = ['tech.china.com']
    
        start_urls = ['http://tech.china.com/articles/']
        # When a link is followed, the rules are applied again to the response returned
        # by that link, so matching links keep being extracted page after page.
        # The second rule restricts the pagination links so that only the first two pages are crawled.
        rules = (
            Rule(LinkExtractor(allow=r'article/.*\.html', restrict_xpaths='//div[@id="left_side"]//div[@class="con_item"]'),
                 callback='parse_item'),
            Rule(LinkExtractor(restrict_xpaths="//div[@id='pageStyle']//span[text()<3]")),
        )
    
        def parse_item(self, response):
           # Plain Item storage; the saved format was messy, so ItemLoader is used instead
           # item=NewsItem()
           # item['title']=response.xpath("//h1[@id='chan_newsTitle']/text()").extract_first()
           # item['url']=response.url
           # item['text']=''.join(response.xpath("//div[@id='chan_newsDetail']//text()").extract()).strip()
           # # re_first extracts the datetime with a regular expression
           # item['datetime']=response.xpath("//div[@id='chan_newsInfo']/text()").re_first(r'(\d+-\d+-\d+\s\d+:\d+:\d+)')
           # item['source']=response.xpath('//div[@id="chan_newsInfo"]/text()').re_first("来源: (.*)").strip()
           # item['website']="中华网"
           # yield item
    
           loader = ChinaLoader(item=NewsItem(), response=response)
           loader.add_xpath('title', '//h1[@id="chan_newsTitle"]/text()')
           loader.add_value('url', response.url)
           loader.add_xpath('text', '//div[@id="chan_newsDetail"]//text()')
           loader.add_xpath('datetime', '//div[@id="chan_newsInfo"]/text()', re=r'(\d+-\d+-\d+\s\d+:\d+:\d+)')
           loader.add_xpath('source', '//div[@id="chan_newsInfo"]/text()', re='来源:(.*)')
           loader.add_value('website', '中华网')
           yield loader.load_item()
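
    Once the remaining files below are in place, the spider is run from the project root; adding -o exports the scraped items, for example to a JSON lines file (the output filename is just an illustration):

    scrapy crawl china -o china.jl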
    

    5. loaders.py

    #!/usr/bin/env python  
    # encoding: utf-8  
    from scrapy.loader import ItemLoader
    from scrapy.loader.processors import TakeFirst,Join,Compose
    
    class NewsLoader(ItemLoader):
        """
        Define a common output processor: TakeFirst.
        TakeFirst takes the first non-empty element of an iterable,
        equivalent to the extract_first() used with the plain item earlier.
        """
        default_output_processor = TakeFirst()
    class ChinaLoader(NewsLoader):
        """
        Compose's first argument is Join, which joins the extracted list into a single string.
        Compose's second argument is a lambda that further processes that string by stripping whitespace.
        """
        text_out = Compose(Join(), lambda s: s.strip())
        source_out = Compose(Join(), lambda s: s.strip())
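
    To make the processors concrete, here is a small stand-alone sketch (not part of the project) of what Join, Compose and TakeFirst do to extracted values:

    # Stand-alone illustration of the processors used in ChinaLoader.
    # Note: in newer Scrapy releases these processors live in the itemloaders
    # package (from itemloaders.processors import TakeFirst, Join, Compose).
    from scrapy.loader.processors import TakeFirst, Join, Compose

    fragments = ['  First paragraph ', '\n', ' second paragraph  ']

    # Join concatenates the fragments into one string,
    # then Compose feeds that string to the lambda, which strips it.
    text_out = Compose(Join(), lambda s: s.strip())
    print(text_out(fragments))

    # TakeFirst returns the first non-empty value, like extract_first().
    print(TakeFirst()(['', None, 'headline']))  # prints: headline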
    

    6. items.py

    # -*- coding: utf-8 -*-
    
    # Define here the models for your scraped items
    #
    # See documentation in:
    # https://doc.scrapy.org/en/latest/topics/items.html
    
    from scrapy import Field,Item
    
    
    class NewsItem(Item):
        # headline
        title = Field()
        # article URL
        url = Field()
        # body text
        text = Field()
        # publication time
        datetime = Field()
        # source
        source = Field()
        # site name, hard-coded to "中华网"
        website = Field()
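
    pipelines.py is not shown in this post, but the settings in step 8 enable scrapyuniversal.pipelines.ScrapyuniversalPipeline. As generated by scrapy startproject it simply passes items through; a minimal sketch:

    class ScrapyuniversalPipeline(object):
        def process_item(self, item, spider):
            # Cleaning, validation or database writes for NewsItem would go here.
            return item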
    

    7. Middleware changes: random User-Agent logic

    # -*- coding: utf-8 -*-
    
    # Define here the models for your spider middleware
    #
    # See documentation in:
    # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
    
    from scrapy import signals
    import random
    class ProcessHeaderMidware():
        """process request add request info"""
        def __init__(self):
           self.USER_AGENT_LIST= ["Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"
             "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
             "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
             "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
             "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
             "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
             "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
             "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
             "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
             "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
             "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
             "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
             "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
             "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
             "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
             "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
             "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
             "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
             ]
        def process_request(self, request, spider):
            """
            随机从列表中获得header, 并传给user_agent进行使用
            """
            ua = random.choice(self.USER_AGENT_LIST)
            spider.logger.info(msg='now entring download midware')
            if ua:
                request.headers['User-Agent'] = ua
                # Add desired logging message here.
                spider.logger.info(u'User-Agent is : {} {}'.format(request.headers.get('User-Agent'), request))
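
    As a variation (not in the original post), the User-Agent list could live in settings.py instead of being hard-coded; Scrapy's standard from_crawler hook lets the middleware read it from there. A sketch, where USER_AGENT_LIST is a hypothetical custom setting name:

    import random

    class RandomUserAgentMiddleware(object):
        """Hypothetical variant that reads the list from settings instead of hard-coding it."""
        def __init__(self, user_agents):
            self.user_agents = user_agents

        @classmethod
        def from_crawler(cls, crawler):
            # USER_AGENT_LIST is a custom setting; getlist() returns [] if it is absent.
            return cls(crawler.settings.getlist('USER_AGENT_LIST'))

        def process_request(self, request, spider):
            if self.user_agents:
                request.headers['User-Agent'] = random.choice(self.user_agents)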
    

    8. settings.py

    # -*- coding: utf-8 -*-
    
    # Scrapy settings for scrapyuniversal project
    #
    # For simplicity, this file contains only settings considered important or
    # commonly used. You can find more settings consulting the documentation:
    #
    #     https://doc.scrapy.org/en/latest/topics/settings.html
    #     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
    #     https://doc.scrapy.org/en/latest/topics/spider-middleware.html
    
    BOT_NAME = 'scrapyuniversal'
    
    SPIDER_MODULES = ['scrapyuniversal.spiders']
    NEWSPIDER_MODULE = 'scrapyuniversal.spiders'
    
    
    # Crawl responsibly by identifying yourself (and your website) on the user-agent
    #USER_AGENT = 'scrapyuniversal (+http://www.yourdomain.com)'
    
    # Obey robots.txt rules
    ROBOTSTXT_OBEY = False
    
    # Configure maximum concurrent requests performed by Scrapy (default: 16)
    #CONCURRENT_REQUESTS = 32
    
    # Configure a delay for requests for the same website (default: 0)
    # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
    # See also autothrottle settings and docs
    #DOWNLOAD_DELAY = 3
    # The download delay setting will honor only one of:
    #CONCURRENT_REQUESTS_PER_DOMAIN = 16
    #CONCURRENT_REQUESTS_PER_IP = 16
    
    # Disable cookies (enabled by default)
    #COOKIES_ENABLED = False
    
    # Disable Telnet Console (enabled by default)
    #TELNETCONSOLE_ENABLED = False
    
    # Override the default request headers:
    #DEFAULT_REQUEST_HEADERS = {
    #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    #   'Accept-Language': 'en',
    #}
    
    # Enable or disable spider middlewares
    # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
    #SPIDER_MIDDLEWARES = {
    #    'scrapyuniversal.middlewares.ScrapyuniversalSpiderMiddleware': 543,
    #}
    
    # Enable or disable downloader middlewares
    # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
    #DOWNLOADER_MIDDLEWARES = {
    #    'scrapyuniversal.middlewares.ScrapyuniversalDownloaderMiddleware': 543,
    #}
    DOWNLOADER_MIDDLEWARES = {
        'scrapyuniversal.middlewares.ProcessHeaderMidware': 543,
    
    }
    # Enable or disable extensions
    # See https://doc.scrapy.org/en/latest/topics/extensions.html
    #EXTENSIONS = {
    #    'scrapy.extensions.telnet.TelnetConsole': None,
    #}
    
    # Configure item pipelines
    # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
    ITEM_PIPELINES = {
       'scrapyuniversal.pipelines.ScrapyuniversalPipeline': 300,
    }
    
    # Enable and configure the AutoThrottle extension (disabled by default)
    # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
    #AUTOTHROTTLE_ENABLED = True
    # The initial download delay
    #AUTOTHROTTLE_START_DELAY = 5
    # The maximum download delay to be set in case of high latencies
    #AUTOTHROTTLE_MAX_DELAY = 60
    # The average number of requests Scrapy should be sending in parallel to
    # each remote server
    #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
    # Enable showing throttling stats for every response received:
    #AUTOTHROTTLE_DEBUG = False
    
    # Enable and configure HTTP caching (disabled by default)
    # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
    #HTTPCACHE_ENABLED = True
    #HTTPCACHE_EXPIRATION_SECS = 0
    #HTTPCACHE_DIR = 'httpcache'
    #HTTPCACHE_IGNORE_HTTP_CODES = []
    #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
    
    HTTP_PROXY = "127.0.0.1:5000"  # replace with the proxy you actually need
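
    Note that HTTP_PROXY is a custom setting: Scrapy does not read it by itself, so it only takes effect if a downloader middleware picks it up and puts it into request.meta['proxy']. A minimal sketch of such a middleware (a hypothetical addition, not shown in the original post):

    class ProxyMiddleware(object):
        """Hypothetical middleware that applies the HTTP_PROXY setting to every request."""
        def __init__(self, proxy):
            self.proxy = proxy

        @classmethod
        def from_crawler(cls, crawler):
            return cls(crawler.settings.get('HTTP_PROXY'))

        def process_request(self, request, spider):
            if self.proxy:
                # Scrapy's built-in HttpProxyMiddleware honours request.meta['proxy'].
                request.meta['proxy'] = 'http://{}'.format(self.proxy)

    Like the User-Agent middleware, it would also need an entry in DOWNLOADER_MIDDLEWARES to be enabled.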
    

      

     

  • Original article: https://www.cnblogs.com/c-x-a/p/9040548.html