  • Scrapy project 1

    # -*- coding: utf-8 -*-
    import scrapy
    
    
    class ItcastSpider(scrapy.Spider):
        name = 'itcast'  # spider name
        allowed_domains = ['itcast.cn']  # domains the spider is allowed to crawl
        start_urls = ['http://www.itcast.cn/channel/teacher.shtml']  # initial URL(s) to request

        def parse(self, response):
            # # handle the response for start_urls
            # ret1 = response.xpath("//div[@class='tea_con']//h3/text()").extract()
            # print(ret1)

            list_li = response.xpath("//div[@class='tea_con']//li")
            # list1 = []
            for li in list_li:
                item = {}
                item["name"] = li.xpath(".//h3/text()").extract_first()
                item["title"] = li.xpath(".//h4/text()").extract_first()
                # list_li.append(item)
                # print(item)
                # the yielded value must be a Request object, a BaseItem, a dict, or None; yielding a list raises an error
                yield item  # the item is passed on to the pipelines

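    As the comment above notes, parse may only yield Request objects, BaseItem instances, dicts, or None. Besides item dicts, parse can also yield a scrapy.Request to follow further pages. A minimal sketch (the next-page selector below is an assumption for illustration, not part of the original page):

        def parse(self, response):
            for li in response.xpath("//div[@class='tea_con']//li"):
                yield {
                    "name": li.xpath(".//h3/text()").extract_first(),
                    "title": li.xpath(".//h4/text()").extract_first(),
                }
            # hypothetical "next page" link; response.urljoin resolves relative URLs
            next_url = response.xpath("//a[@class='next']/@href").extract_first()
            if next_url:
                yield scrapy.Request(response.urljoin(next_url), callback=self.parse)

    The spider is run from the project root with "scrapy crawl itcast"; every yielded item is handed to the enabled pipelines in priority order.
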
    pipelines.py

    # To use pipelines, enable ITEM_PIPELINES in settings.py.
    # Multiple pipelines let different pipelines handle different item contents: one project can hold several spiders,
    # and a single spider may need different handling, e.g. saving into different databases.
    # The smaller a pipeline's value, the higher its priority; the process_item method must keep that exact name.

    class Myspider02Pipeline(object):
        def process_item(self, item, spider):
            item["hello"] = "world"
            return item  # must return the item, otherwise the next pipeline receives None and its print shows None

    class Myspider02Pipeline1(object):
        def process_item(self, item, spider):
            # items pass through the lower-valued pipeline first, so "hello": "world" has already been added here;
            # if this pipeline had the lower value instead, the print would not show it
            print(item)
            return item


    # a storage pipeline: append each item to a file as JSON
    import json

    class Myspider02Pipeline2(object):
        def process_item(self, item, spider):
            with open("xxxx.text", 'a', encoding='utf-8') as f:
                f.write(json.dumps(item, ensure_ascii=False, indent=2) + "\n")
            return item

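    Reopening the file for every item works but is wasteful. A common pattern is to open the file once in open_spider and close it in close_spider; a sketch, assuming a hypothetical pipeline name and output file (it would still need its own ITEM_PIPELINES entry to run):

        import json

        class JsonWriterPipeline(object):  # hypothetical name, not registered in the settings.py below
            def open_spider(self, spider):
                # called once when the spider starts
                self.f = open("items.jsonl", "a", encoding="utf-8")

            def close_spider(self, spider):
                # called once when the spider finishes
                self.f.close()

            def process_item(self, item, spider):
                self.f.write(json.dumps(item, ensure_ascii=False) + "\n")
                return item  # keep returning the item so later pipelines still receive it
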
    settings.py

    # -*- coding: utf-8 -*-
    
    # Scrapy settings for myspider02 project
    #
    # For simplicity, this file contains only settings considered important or
    # commonly used. You can find more settings consulting the documentation:
    #
    #     https://doc.scrapy.org/en/latest/topics/settings.html
    #     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
    #     https://doc.scrapy.org/en/latest/topics/spider-middleware.html
    
    BOT_NAME = 'myspider02'
    
    SPIDER_MODULES = ['myspider02.spiders']
    NEWSPIDER_MODULE = 'myspider02.spiders'
    
    LOG_LEVEL = "WARNING"  # log messages below WARNING are not shown
    
    
    # Crawl responsibly by identifying yourself (and your website) on the user-agent
    #USER_AGENT = 'myspider02 (+http://www.yourdomain.com)'
    
    # Obey robots.txt rules
    ROBOTSTXT_OBEY = True
    
    # Configure maximum concurrent requests performed by Scrapy (default: 16)
    #CONCURRENT_REQUESTS = 32
    
    # Configure a delay for requests for the same website (default: 0)
    # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
    # See also autothrottle settings and docs
    #DOWNLOAD_DELAY = 3
    # The download delay setting will honor only one of:
    #CONCURRENT_REQUESTS_PER_DOMAIN = 16
    #CONCURRENT_REQUESTS_PER_IP = 16
    
    # Disable cookies (enabled by default)
    #COOKIES_ENABLED = False
    
    # Disable Telnet Console (enabled by default)
    #TELNETCONSOLE_ENABLED = False
    
    # Override the default request headers:
    #DEFAULT_REQUEST_HEADERS = {
    #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    #   'Accept-Language': 'en',
    #}
    
    # Enable or disable spider middlewares
    # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
    #SPIDER_MIDDLEWARES = {
    #    'myspider02.middlewares.Myspider02SpiderMiddleware': 543,
    #}
    
    # Enable or disable downloader middlewares
    # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
    #DOWNLOADER_MIDDLEWARES = {
    #    'myspider02.middlewares.Myspider02DownloaderMiddleware': 543,
    #}
    
    # Enable or disable extensions
    # See https://doc.scrapy.org/en/latest/topics/extensions.html
    #EXTENSIONS = {
    #    'scrapy.extensions.telnet.TelnetConsole': None,
    #}
    
    # Configure item pipelines
    # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
    
    # multiple pipelines can be defined; the number controls the order, and items pass through lower-valued pipelines first
    ITEM_PIPELINES = {
       'myspider02.pipelines.Myspider02Pipeline': 300,
       'myspider02.pipelines.Myspider02Pipeline1': 301,
    }
    
    # Enable and configure the AutoThrottle extension (disabled by default)
    # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
    #AUTOTHROTTLE_ENABLED = True
    # The initial download delay
    #AUTOTHROTTLE_START_DELAY = 5
    # The maximum download delay to be set in case of high latencies
    #AUTOTHROTTLE_MAX_DELAY = 60
    # The average number of requests Scrapy should be sending in parallel to
    # each remote server
    #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
    # Enable showing throttling stats for every response received:
    #AUTOTHROTTLE_DEBUG = False
    
    # Enable and configure HTTP caching (disabled by default)
    # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
    #HTTPCACHE_ENABLED = True
    #HTTPCACHE_EXPIRATION_SECS = 0
    #HTTPCACHE_DIR = 'httpcache'
    #HTTPCACHE_IGNORE_HTTP_CODES = []
    #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
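
    Note that Myspider02Pipeline2 defined in pipelines.py above is not registered in ITEM_PIPELINES, so it never runs. As a sketch (the value 302 is an arbitrary choice placing it last), it would need its own entry:

        ITEM_PIPELINES = {
           'myspider02.pipelines.Myspider02Pipeline': 300,
           'myspider02.pipelines.Myspider02Pipeline1': 301,
           'myspider02.pipelines.Myspider02Pipeline2': 302,
        }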
  • Original post: https://www.cnblogs.com/chvv/p/10332457.html