  • Scrapy crawler example: storing scraped data in MongoDB

    The spider file

    # -*- coding: utf-8 -*-
    import scrapy
    from ..items import RtysItem
    
    
    class RtSpider(scrapy.Spider):
        name = 'rt'      # spider name, used when launching the crawl
        # allowed_domains = ['www.baidu.com']     # limits the crawl scope; leave it commented out to lift the limit
        start_urls = ['https://www.woyaogexing.com/touxiang/']    # start URLs; Scrapy requests them automatically on startup

        def parse(self, response):  # response is the downloaded page
            div_list = response.xpath('//div[@class="list-left z"]/div[2]/div')  # select the blocks to parse
            for i in div_list:
                name = i.xpath('./a/text()').extract_first()  # keys must match the fields declared in items.py
                img_url = i.xpath('./a/img/@src').extract_first()
                lianjie_url = i.xpath('./a/@href').extract_first()
                items = RtysItem()  # instantiate the item
                items['name'] = name    # store the scraped values in the item's fields
                items['img_url'] = img_url
                items['lianjie_url'] = lianjie_url
                yield items  # hand the item off to the pipeline
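
    The crawl is launched with the command scrapy crawl rt (rt is the spider's name attribute). If you prefer starting it from a plain Python script, a minimal sketch using Scrapy's standard CrawlerProcess looks like this (assuming it is run from inside the rtys project directory):

    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    # Load the project's settings.py so the pipeline and user agent apply
    process = CrawlerProcess(get_project_settings())
    process.crawl('rt')  # the spider's name attribute
    process.start()      # blocks until the crawl finishes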
    

    The pipelines.py file

    # -*- coding: utf-8 -*-
    
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
    import pymongo
    
    class RtysPipeline(object):
        def process_item(self, item, spider):
            conn = pymongo.MongoClient('localhost', 27017)  # connect to MongoDB
            db = conn.rtys  # the database is used if it exists, otherwise created on first write
            table = db.rt   # likewise for the collection
            table.insert_one(dict(item))  # convert the item to a dict and insert one document
            return item
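
    Note that this opens a new MongoClient for every single item. A common refinement (a sketch, reusing the same database and collection names as above) opens one connection when the spider starts and closes it when the spider finishes:

    import pymongo

    class RtysPipeline(object):
        def open_spider(self, spider):
            # Called once when the spider opens: create a single shared client
            self.conn = pymongo.MongoClient('localhost', 27017)
            self.table = self.conn.rtys.rt

        def close_spider(self, spider):
            # Called once when the spider closes
            self.conn.close()

        def process_item(self, item, spider):
            self.table.insert_one(dict(item))  # insert the item as a document
            return item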
    

    When storing into MongoDB, pay attention to the settings.py configuration: the commented-out sections noted below must be uncommented.

    The settings.py file

    
    # -*- coding: utf-8 -*-
    
    # Scrapy settings for rtys project
    #
    # For simplicity, this file contains only settings considered important or
    # commonly used. You can find more settings consulting the documentation:
    #
    #     https://docs.scrapy.org/en/latest/topics/settings.html
    #     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
    #     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
    
    BOT_NAME = 'rtys'
    
    SPIDER_MODULES = ['rtys.spiders']
    NEWSPIDER_MODULE = 'rtys.spiders'
    
    
    # Crawl responsibly by identifying yourself (and your website) on the user-agent
    
    USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'
    
    # Obey robots.txt rules
    ROBOTSTXT_OBEY = False   # False: crawl without obeying robots.txt; True: obey it
    
    # Configure maximum concurrent requests performed by Scrapy (default: 16)
    #CONCURRENT_REQUESTS = 32
    
    # Configure a delay for requests for the same website (default: 0)
    # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
    # See also autothrottle settings and docs
    #DOWNLOAD_DELAY = 3
    # The download delay setting will honor only one of:
    #CONCURRENT_REQUESTS_PER_DOMAIN = 16
    #CONCURRENT_REQUESTS_PER_IP = 16
    
    # Disable cookies (enabled by default)
    #COOKIES_ENABLED = False
    
    # Disable Telnet Console (enabled by default)
    #TELNETCONSOLE_ENABLED = False
    
    # Override the default request headers:
    #DEFAULT_REQUEST_HEADERS = {
    #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    #   'Accept-Language': 'en',
    #}
    
    # Enable or disable spider middlewares
    # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
    #SPIDER_MIDDLEWARES = {
    #    'rtys.middlewares.RtysSpiderMiddleware': 543,
    #}
    
    # Enable or disable downloader middlewares
    # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
    
    
    Uncomment the downloader middleware when you need to rotate or spoof IPs (e.g. via proxies):
    #DOWNLOADER_MIDDLEWARES = {
    #    'rtys.middlewares.RtysDownloaderMiddleware': 543,
    #}
    
    # Enable or disable extensions
    # See https://docs.scrapy.org/en/latest/topics/extensions.html
    #EXTENSIONS = {
    #    'scrapy.extensions.telnet.TelnetConsole': None,
    #}
    
    # Configure item pipelines
    # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
    ITEM_PIPELINES = {     # must be uncommented so the pipeline runs
       'rtys.pipelines.RtysPipeline': 300,
    }
    
    # Enable and configure the AutoThrottle extension (disabled by default)
    # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
    #AUTOTHROTTLE_ENABLED = True
    # The initial download delay
    #AUTOTHROTTLE_START_DELAY = 5
    # The maximum download delay to be set in case of high latencies
    #AUTOTHROTTLE_MAX_DELAY = 60
    # The average number of requests Scrapy should be sending in parallel to
    # each remote server
    #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
    # Enable showing throttling stats for every response received:
    #AUTOTHROTTLE_DEBUG = False
    
    # Enable and configure HTTP caching (disabled by default)
    # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
    #HTTPCACHE_ENABLED = True
    #HTTPCACHE_EXPIRATION_SECS = 0
    #HTTPCACHE_DIR = 'httpcache'
    #HTTPCACHE_IGNORE_HTTP_CODES = []
    #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
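
    Condensed, the only changes to the generated settings.py that this example needs are:

    USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'
    ROBOTSTXT_OBEY = False
    ITEM_PIPELINES = {'rtys.pipelines.RtysPipeline': 300}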
    

    items.py

    # -*- coding: utf-8 -*-
    
    # Define here the models for your scraped items
    #
    # See documentation in:
    # https://docs.scrapy.org/en/latest/topics/items.html
    
    import scrapy
    
    
    class RtysItem(scrapy.Item):
        # define the fields for your item here like:
        name = scrapy.Field()  # declare one Field per attribute you scrape
        img_url = scrapy.Field()
        lianjie_url = scrapy.Field()
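
    A scrapy.Item only accepts fields that were declared, so a typo in a key raises KeyError immediately; that is why the names used in the spider must match these exactly. A quick check (assuming the project layout above):

    from rtys.items import RtysItem

    item = RtysItem()
    item['name'] = 'demo'    # OK: 'name' is a declared field
    # item['Name'] = 'demo'  # would raise KeyError: RtysItem does not support field: Name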
    

    Problem encountered: raise NameError("Module '%s' doesn't define any object named '%s'" % (module, name))

    Solution

    This is usually a misspelled class name in the ITEM_PIPELINES setting in settings.py; I ran into the same problem.
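
    Scrapy raises this exact NameError from its load_object helper while resolving the dotted path, so you can reproduce and debug it directly (a sketch, run from the project root):

    from scrapy.utils.misc import load_object

    # Raises the same NameError if the class name is misspelled,
    # or ImportError if the module path itself is wrong
    pipeline_cls = load_object('rtys.pipelines.RtysPipeline')
    print(pipeline_cls)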
    
  • Original post: https://www.cnblogs.com/pp8080/p/12191213.html