zoukankan      html  css  js  c++  java
  • scrapy 数据存储mysql

    scrapy 数据存储mysql 处理:

    user-agent 池:

    UserAgent = ["Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
    ]
    

    数据库同步插入数据模板:

    同步
    class MysqlPipeline(object):
        def open_spider(self, spider):
            self.db = pymysql.connect(host=MYSQL_HOST, port=MYSQL_PORT, user=MYSQL_USER, password=MYSQL_PWD,
                                      database=MYSQL_DB)
            self.cursor = self.db.cursor()
            print('数据库连接success')
    
        def process_item(self, item, spider):
    
            info_list = (item['company_name'], item['company_person'], item['company_money'],
                         item['company_establish'], item['company_phone'], item['company_address'],
                         item['company_status'])
    
            ins = """
                insert into test1(company_name,company_person,company_money,company_establish,company_phone,company_address' 
                  'company_status,) VALUES (%s,%s,%s,%s,%s,%s,%s)"""
            try:
                self.cursor.execute(ins, info_list)
                self.db.commit()
                print('插入重构')
            except Exception as e:
                print(e)
    
            return item
    
        def close_spider(self, spider):
            self.cursor.close()
            self.db.close()
    

    csv 存储:

    csv 存储
    
    import csv
    class WangyinPipeline_a(object):
        def __init__(self):
            self.f = open("qichacha_sh.csv", "w")
            self.writer = csv.writer(self.f)
            self.writer.writerow(
                ['company_name', 'company_city', 'company_person', 'company_money', 'company_establish', 'company_email',
                 'company_phone', 'company_address'' 
                  ''company_status', 'company_link'])
    
        def process_item(self, item, spider):
            wangyiyun_list = item['company_name'], item['company_city'], item['company_person'], item['company_money'], 
                             item['company_establish'], item['company_email'], item['company_phone'], item[
                                 'company_address'], 
                             item['company_status'], item['company_link']
    
            self.writer.writerow(wangyiyun_list)
            return item
    
        def close_spider(self, spider):  # 关闭
            self.f.close()
    

    插入模板:

    import pymysql
    
    
    # 同步
    class QccPipeline(object):
        def __init__(self):
            dbparams = {
                'host': '127.0.0.1',
                'port': 3306,
                'user': 'root',
                'password': '155345',
                'database': 'qcc',
                'charset': 'utf8'
    
            }
            self.conn = pymysql.connect(**dbparams)
            self.cursor = self.conn.cursor()
            self._sql = None
    
        def process_item(self, item, spider):
            self.cursor.execute(self.sql, (item['title'], ...))
            self.conn.commit()
    
            return item
    
        @property
        def sql(self):
            if not self._sql:
                # 创建表 sql  语句
                self._sql = """
                文本: longtext  长度不写
                时间   datetime
                注意:
                    id 不为空
                    item 内的company_name = scrapy.Field() 
                            company 对应表子段
                 id title text content author touxiang time x-id 
                insert into art( id,....) values(null,%s,%s,%s,%s。。。。) 
    
                """
                return self._sql
            return self._sql
    
    

    异步插入模板:

    # 异步
    from twisted.enterprise import adbapi  # 数据库处理
    from pymysql import cursors
    
    
    class ty(object):
        def __init__(self):
            dbparams = {
                'host': '127.0.0.1',
                'port': 3306,
                'user': 'root',
                'password': '155345',
                'database': 'jianshu',
                'charset': 'utf8',
                'cursorclass': cursors.DictCursor
    
            }
            self.dbpool = adbapi.ConnectionPool('pymysql', **dbparams)
            self._sql = None
    
        @property
        def sql(self):
            if not self._sql:
                # 创建表 sql  语句
                self._sql = """
                文本: longtext  长度不写
                时间   datetime
                注意:
                    id 不为空
                    item 内的company_name = scrapy.Field() 
                            company 对应表子段
                 id title text content author touxiang time x-id 
                insert into art( id,....) values(null,%s,%s,%s,%s。。。。) 
    
                """
                return self._sql
            return self._sql
    
        def process_item(self, item, spider):
            defer = self.dbpool.runInteraction((self.insert_item, item))
            defer.addErrback(self.handle_error, item, spider)
    
        def insert_item(self, cursor, item):
            cursor.execute(self.sql, (item['title'], ...))
    
        def handler_error(self, error, item, spider):
            print('=' * 10 + 'error' + '=' * 10)
            print(error)
    
    
    

    数据库插入包错: 1064

    class MysqlPipeline(object):
    
        def __init__(self):
            self.host = settings.DB_HOST
            self.port = settings.DB_PORT
            self.user = settings.DB_USER
            self.pwd = settings.DB_PWD
            self.name = settings.DB_NAME
            self.charset = settings.DB_CHARSET
    
            self.connect()
    
        def connect(self):
            self.conn = pymysql.connect(host=self.host,
                                        port=self.port,
                                        user=self.user,
                                        password=self.pwd,
                                        db=self.name,
                                        charset=self.charset)
            self.cursor = self.conn.cursor()
    
        def close_spider(self, spider):
            self.conn.close()
            self.cursor.close()
    
        def process_item(self, item, spider):
            try:
                print('在插入 ***************************************          ')
                # sql = 'insert into zb_contest_data(craw_date, cd_filename, cd_content, create_time) values("%s", "%s", "%s", "%s")' % (
                #     item['craw_date'], item['file_name'], str(item['file_content']), item['create_time'])
                file_count = pymysql.escape_string(item['file_content'])
                sql = 'insert into zb_contest_data(craw_date, cd_filename, cd_content, create_time) values("%s", "%s", "%s", "%s")' % (
                    item['craw_date'], item['file_name'], file_count, item['create_time'])
    
                # sql = 'insert into zb_contest_data(craw_date, cd_filename, cd_content, create_time) values("%s", "%s", "%s", "%s")' % (
                #     item['craw_date'], item['file_name'], item['file_content'], item['create_time'])
    
                # 执行sql语句
                self.cursor.execute(sql)
                self.conn.commit()
               
            except Exception as e:
                print(e)
                pass
            return item
    

    翻页乱码:

    乱码:
    	settings.py
    
     1 FEED_EXPORT_ENCODING ='utf-8' #插入这两行,解码成中文,不然出现的是乱码
    
    2 FEED_EXPORT_ENCODING = 'gb18030'
    
    import scrapy
    
    from movie.items import MovieItem
    
    class MovieproSpider(scrapy.Spider):
        name = 'moviePro'
        allowed_domains = ['4567tv.tv']
        start_urls = ['https://www.4567tv.tv/frim/index1.html']
        page = 1
        page_url = 'https://www.4567tv.tv/frim/index1-%s.html'
    
    
        def parse(self, response):
            li_list = response.xpath('//li[@class="col-md-6 col-sm-4 col-xs-3"]')
            for li in li_list:
                item = MovieItem()
                name = li.xpath('./div/div/h4/a/text()').extract_first()
                detail_url = 'https://www.4567tv.tv' + li.xpath('./div/div/h4/a/@href').extract_first()
                item['name'] = name
                yield scrapy.Request(url = detail_url, callback = self.parse_detail, meta ={'item':item})
    
            if self.page <= 10:
                self.page += 1
                new_page_url = self.page_url % self.page
                yield scrapy.Request(url = new_page_url, callback = (self.parse))
    
        def parse_detail(self, response):
            item = response.meta['item'] #注意这里实例化的是meta的,是parse函数传递过来的第二层内容
            actor = response.xpath('/html/body/div[1]/div/div/div/div[2]/p[3]/a/text()').extract_first() 
            link = response.xpath('/html/body/div[1]/div/div/div/div[1]/a/@href').extract_first()
            item['actor'] = actor
            item['link'] = 'https://www.4567tv.tv' + link #这个连接是相对路径,重新构造地址
            yield item
    

    数据库事务处理:

    自定义的管道,将完整的爬取数据,保存到MySql数据库中
    class DingdianPipeline(object):
        def process_item(self, item, spider):
            dbu = dbutil.MYSQLdbUtil()
            dbu.getConnection()  # 开启事物
    
            # 1.添加
            try:
                #sql = "insert into movies (电影排名,电影名称,电影短评,评价分数,评价人数)values(%s,%s,%s,%s,%s)"
                sql = "insert into ebook (novel_name,author,novelurl,serialstatus,serialsize,ebookdate,newchapter)values(%s,%s,%s,%s,%s,%s,%s)"
                #date = [item['rank'],item['title'],item['quote'],item['star']]
                #dbu.execute(sql, date, True)
                dbu.execute(sql, (item['novel_name'],item['author'],item['novelurl'],item['serialstatus'],item['serialsize'],item['date'],item['newchapter']),True)
                #dbu.execute(sql,True)
                dbu.commit()
                print('插入数据库成功!!')
            except:
                dbu.rollback()
                dbu.commit()  # 回滚后要提交
            finally:
                dbu.close()
            return item
    
  • 相关阅读:
    联赛前第五阶段总结
    陶陶摘苹果 —— 线段树维护单调栈
    联赛前第三阶段总结
    联赛前第四阶段总结
    [NOIP
    超级跳马 —— 矩阵快速幂优化DP
    我的博客园美化
    Wedding —— 2-SAT
    C++运算符优先级
    water——小根堆+BFS
  • 原文地址:https://www.cnblogs.com/shaozheng/p/12781104.html
Copyright © 2011-2022 走看看