  • Saving Scrapy data to MySQL

    When I first started crawling data with Scrapy I wrote to the database synchronously, and the inserts kept failing with MySQL error 1064 because the generated SQL did not match the fields being inserted.

    After quite a few attempts I switched to writing the data asynchronously; errors still popped up along the way, but the data did get saved into the MySQL database.
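    For contrast with the asynchronous pipeline later in this post, here is a minimal synchronous pipeline sketch (this is not the original failing code; the class name MysqlPipelineSync is made up for illustration). Error 1064 is MySQL's generic syntax error; with scraped text it usually means a value was spliced into the SQL string or an empty/odd value broke the statement, which parameterized queries avoid:

    import pymysql


    class MysqlPipelineSync(object):  # hypothetical class, for illustration only
        def open_spider(self, spider):
            # one blocking connection for the whole crawl
            self.conn = pymysql.connect(host='127.0.0.1', user='root', password='root',
                                        db='mu_ke', charset='utf8mb4')
            self.cursor = self.conn.cursor()

        def process_item(self, item, spider):
            # parameterized SQL: pymysql escapes the values, so characters in the
            # scraped text cannot break the statement
            sql = "insert into login_person(name,password,is_active,comment,time) VALUES(%s,%s,%s,%s,%s)"
            self.cursor.execute(sql, (item['name'], item['password'], item['is_active'],
                                      item['comment'], item['time']))
            self.conn.commit()  # each insert blocks the crawl until MySQL replies
            return item

        def close_spider(self, spider):
            self.cursor.close()
            self.conn.close()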

    The spider:

    # -*- coding: utf-8 -*-
    import scrapy
    from urllib import parse
    import re
    from copy import deepcopy
    from ..items import MyspiderItem
    
    
    class TbSpider(scrapy.Spider):
        name = 'tb'
    
        allowed_domains = []
        start_urls = ['http://tieba.baidu.com/mo/q----,sz@320_240-1-3---2/m?kw=%E6%A1%82%E6%9E%97%E7%94%B5%E5%AD%90%E7%A7%91%E6%8A%80%E5%A4%A7%E5%AD%A6%E5%8C%97%E6%B5%B7%E6%A0%A1%E5%8C%BA&pn=26140',
                      ]
    
        def parse(self, response):  # the post-list page
            item = MyspiderItem()
    
            all_elements = response.xpath(".//div[@class='i']")
            # print(all_elements)
    
            for all_element in all_elements:
                content = all_element.xpath("./a/text()").extract_first()
                content = "".join(content.split())
                # strip the leading post number, e.g. "12."
                change = re.compile(r'\d+\.')
                content = change.sub('', content)
                item['comment'] = content
    
                person = all_element.xpath("./p/text()").extract_first()
                person = "".join(person.split())
                # drop the like/reply counts
                change2 = re.compile(r'点\d+回\d+')
                person = change2.sub('', person)
                # pick out the date (e.g. "3-25")
                change3 = re.compile(r'\d?\d?-\d\d')
                date = change3.findall(person)
    
                # posts made today only show a time (e.g. "11:25"), so pick that out too
                change4 = re.compile(r'\d?\d?:\d\d')
                time = change4.findall(person)
    
                person = change3.sub('',person)
                person = change4.sub('',person)
    
                # keep the time for today's posts, otherwise the date; store a plain
                # string, since an empty list here would break the MySQL insert
                if time:
                    item['time'] = time[0]
                else:
                    item['time'] = date[0] if date else ''
    
                item['name'] = person
    
                # extra fields: active flag and a default password
                item['is_active'] = '1'
                item['password'] = '123456'
    
    
                print(item)
                yield item
    
    
    
            # next page: the href is URL-encoded, so decode it before building the request
            next_href = response.xpath(".//div[@class='bc p']/a/@href").extract_first()
            if next_href:
                next_url = 'http://tieba.baidu.com/mo/q----,sz@320_240-1-3---2/' + parse.unquote(next_href)
                print(next_url)
                yield scrapy.Request(
                    next_url,
                    callback=self.parse,
                )
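
    The cleanup above is easier to follow outside the spider. Below is a standalone run against a made-up sample of the stripped <p> text (the sample string and its field order are assumptions, not real page data):

    import re

    # hypothetical sample: date, poster name, then like/reply counts
    person = "3-25某同学点3回15"

    person = re.sub(r'点\d+回\d+', '', person)    # drop like/reply counts -> "3-25某同学"
    date = re.findall(r'\d?\d?-\d\d', person)     # -> ['3-25']
    time = re.findall(r'\d?\d?:\d\d', person)     # -> [] (would match e.g. '11:25' for today's posts)
    person = re.sub(r'\d?\d?-\d\d', '', person)   # strip the date
    person = re.sub(r'\d?\d?:\d\d', '', person)   # strip the time, if any
    print(person, time[0] if time else date[0])   # -> 某同学 3-25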

    The item:

    # -*- coding: utf-8 -*-
    
    # Define here the models for your scraped items
    #
    # See documentation in:
    # https://doc.scrapy.org/en/latest/topics/items.html
    
    import scrapy
    
    
    class MyspiderItem(scrapy.Item):
        # define the fields for your item here like:
        # name = scrapy.Field()
        comment = scrapy.Field()
        time = scrapy.Field()
        name = scrapy.Field()
        password = scrapy.Field()
        is_active = scrapy.Field()

    The settings:

    # -*- coding: utf-8 -*-
    
    # Scrapy settings for mySpider project
    #
    # For simplicity, this file contains only settings considered important or
    # commonly used. You can find more settings consulting the documentation:
    #
    #     https://doc.scrapy.org/en/latest/topics/settings.html
    #     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
    #     https://doc.scrapy.org/en/latest/topics/spider-middleware.html
    
    BOT_NAME = 'mySpider'
    
    SPIDER_MODULES = ['mySpider.spiders']
    NEWSPIDER_MODULE = 'mySpider.spiders'
    
    
    # Crawl responsibly by identifying yourself (and your website) on the user-agent
    USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
    
    
    MYSQL_HOST = 'localhost'
    MYSQL_DBNAME = 'mu_ke'
    MYSQL_USER = 'root'
    MYSQL_PASSWD = 'root'
    
    
    
    # Obey robots.txt rules
    ROBOTSTXT_OBEY = False
    
    # Configure maximum concurrent requests performed by Scrapy (default: 16)
    #CONCURRENT_REQUESTS = 32
    
    # Configure a delay for requests for the same website (default: 0)
    # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
    # See also autothrottle settings and docs
    #DOWNLOAD_DELAY = 3
    # The download delay setting will honor only one of:
    #CONCURRENT_REQUESTS_PER_DOMAIN = 16
    #CONCURRENT_REQUESTS_PER_IP = 16
    
    # Disable cookies (enabled by default)
    #COOKIES_ENABLED = False
    
    # Disable Telnet Console (enabled by default)
    #TELNETCONSOLE_ENABLED = False
    
    # Override the default request headers:
    #DEFAULT_REQUEST_HEADERS = {
    #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    #   'Accept-Language': 'en',
    #}
    
    # Enable or disable spider middlewares
    # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
    #SPIDER_MIDDLEWARES = {
    #    'mySpider.middlewares.MyspiderSpiderMiddleware': 543,
    #}
    
    # Enable or disable downloader middlewares
    # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
    #DOWNLOADER_MIDDLEWARES = {
    #    'mySpider.middlewares.MyspiderDownloaderMiddleware': 543,
    #}
    
    # Enable or disable extensions
    # See https://doc.scrapy.org/en/latest/topics/extensions.html
    #EXTENSIONS = {
    #    'scrapy.extensions.telnet.TelnetConsole': None,
    #}
    
    # Configure item pipelines
    # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
    ITEM_PIPELINES = {
        'mySpider.pipelines.MysqlPipelineTwo': 200,
    }
    
    
    # Enable and configure the AutoThrottle extension (disabled by default)
    # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
    #AUTOTHROTTLE_ENABLED = True
    # The initial download delay
    #AUTOTHROTTLE_START_DELAY = 5
    # The maximum download delay to be set in case of high latencies
    #AUTOTHROTTLE_MAX_DELAY = 60
    # The average number of requests Scrapy should be sending in parallel to
    # each remote server
    #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
    # Enable showing throttling stats for every response received:
    #AUTOTHROTTLE_DEBUG = False
    
    # Enable and configure HTTP caching (disabled by default)
    # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
    #HTTPCACHE_ENABLED = True
    #HTTPCACHE_EXPIRATION_SECS = 0
    #HTTPCACHE_DIR = 'httpcache'
    #HTTPCACHE_IGNORE_HTTP_CODES = []
    #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
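
    Note that the MYSQL_* values above are not picked up automatically: the pipeline in the next section hard-codes its connection parameters. If you want it to read this file instead, its from_settings could look roughly like the sketch below (a drop-in replacement for the method shown later; it relies on the same pymysql/adbapi imports):

    # sketch only: build the pool from the MYSQL_* settings defined above
    @classmethod
    def from_settings(cls, settings):
        adbparams = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            password=settings['MYSQL_PASSWD'],
            cursorclass=pymysql.cursors.DictCursor,
        )
        dbpool = adbapi.ConnectionPool('pymysql', **adbparams)
        return cls(dbpool)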

    The asynchronous pipeline (the key part):

    import pymysql
    from twisted.enterprise import adbapi
    
    
    class MysqlPipelineTwo(object):
        def __init__(self, dbpool):
            self.dbpool = dbpool
    
        @classmethod
        def from_settings(cls, settings):  # fixed method name, called by Scrapy with the project settings
            """
            Set up the database connection pool.
            :param settings: the Scrapy settings
            :return: a pipeline instance that holds the pool
            """
            adbparams = dict(
                host='127.0.0.1',
                db='mu_ke',
                user='root',
                password='root',
                cursorclass=pymysql.cursors.DictCursor  # return rows as dicts
            )
            # create the twisted ConnectionPool, backed by pymysql (MySQLdb works too)
            dbpool = adbapi.ConnectionPool('pymysql', **adbparams)
            # hand the pool to the pipeline instance
            return cls(dbpool)
    
        def process_item(self, item, spider):
            """
            Run the MySQL insert asynchronously via twisted: the connection pool
            executes the actual SQL and returns a Deferred.
            """
            query = self.dbpool.runInteraction(self.do_insert, item)  # which method to run, and with which item
            # attach error handling: addErrback (not addCallback), so insert failures reach handle_error
            query.addErrback(self.handle_error)
            return item
    
        def do_insert(self, cursor, item):
            # perform the insert; no explicit commit is needed, runInteraction commits for us
            insert_sql = """
            insert into login_person(name,password,is_active,comment,time) VALUES(%s,%s,%s,%s,%s)
                        """
            cursor.execute(insert_sql, (item['name'], item['password'], item['is_active'], item['comment'],
                                        item['time']))
    
        def handle_error(self, failure):
            if failure:
                # print the failure so broken inserts show up in the crawl log
                print(failure)
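
    The insert assumes a login_person table already exists in the mu_ke database. The original post does not show the schema, so the one below is an assumption: the column names come from the insert statement and the types are guesses sized for tieba post data. A one-off script to create it:

    import pymysql

    # assumed schema; adjust types and lengths to taste
    ddl = """
    CREATE TABLE IF NOT EXISTS login_person (
        id INT AUTO_INCREMENT PRIMARY KEY,
        name VARCHAR(255),
        password VARCHAR(128),
        is_active VARCHAR(8),
        comment TEXT,
        time VARCHAR(32)
    ) DEFAULT CHARSET = utf8mb4
    """

    conn = pymysql.connect(host='127.0.0.1', user='root', password='root', db='mu_ke')
    with conn.cursor() as cursor:
        cursor.execute(ddl)
    conn.commit()
    conn.close()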
  • Original article: https://www.cnblogs.com/fromlantianwei/p/10607956.html