使用MySQL数据库存储
安装mysql模块包(mysqlclient 提供 MySQLdb 接口;下文示例代码使用的是 pymysql,需一并安装)
pip install mysqlclient pymysql
相关库文件(编译安装 mysqlclient 所需的开发包)
Ubuntu/Debian:
sudo apt-get install libmysqlclient-dev
CentOS/RHEL:
sudo yum install python-devel mysql-devel
阻塞型的数据写入操作
# NOTE: this snippet needs ``import pymysql`` at the top of pipelines.py.
class MysqlPipeline(object):
    """Scrapy item pipeline that writes each item to MySQL synchronously.

    Every ``process_item`` call executes one INSERT and commits it on a
    single shared connection, so the caller blocks until the database
    has answered.
    """

    def __init__(self):
        # Keyword arguments are required: PyMySQL 1.0+ no longer accepts
        # positional connection parameters.  MySQL spells the charset
        # "utf8mb4" (or "utf8"); "utf-8" is not a valid charset name and
        # makes the connection fail.
        self.conn = pymysql.connect(
            host='192.168.1.1',
            user='root',
            password='123456',
            database='titlespider',
            charset='utf8mb4',
            use_unicode=True,
        )
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        """Insert ``item`` into the ``article`` table and return it.

        The transaction is rolled back and the exception re-raised if the
        INSERT or the commit fails, so a failed row does not leave the
        shared connection in a half-finished transaction.
        """
        # 'cteate_time' (sic) is kept as-is: it has to match both the
        # table column and the Item field name used by the spider.
        insert_sql = """
            insert into article(title, cteate_time, url, content)
            VALUES (%s, %s, %s, %s)
        """
        params = (
            item['title'],
            item['cteate_time'],
            item['url'],
            item['content'],
        )
        try:
            self.cursor.execute(insert_sql, params)
            self.conn.commit()
        except Exception:
            self.conn.rollback()
            raise
        return item

    def close_spider(self, spider):
        """Release the cursor and the connection when the spider closes."""
        self.cursor.close()
        self.conn.close()
使用twisted提供的数据库连接池,将写入操作异步化,缓解写数据操作造成的阻塞
# First, define the database connection values in settings.py
MYSQL_HOST = '192.168.1.1'
MYSQL_USER = 'root'
MYSQL_PASSWD = '123456'
MYSQL_DB = 'articlespider'

# Then write the pipeline class (pipelines.py)
import logging

import pymysql
import pymysql.cursors
from twisted.enterprise import adbapi

logger = logging.getLogger(__name__)


class MysqlTwistedPipeline(object):
    """Scrapy item pipeline that inserts items through a Twisted DB pool.

    Inserts are handed to ``adbapi.ConnectionPool.runInteraction`` so the
    write is performed by the pool instead of inside ``process_item``.
    """

    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        """Build the pipeline from the ``MYSQL_*`` values in ``settings``."""
        conn_dict = dict(
            host=settings['MYSQL_HOST'],
            user=settings['MYSQL_USER'],
            password=settings['MYSQL_PASSWD'],
            database=settings['MYSQL_DB'],
            # MySQL spells the charset "utf8mb4"/"utf8"; "utf-8" is not a
            # valid charset name and makes the connection fail.
            charset='utf8mb4',
            cursorclass=pymysql.cursors.DictCursor,
            use_unicode=True,
        )
        # The first argument is the *importable module name* of the DB-API
        # driver, i.e. "pymysql" (the distribution name "PyMySQL" cannot be
        # imported on case-sensitive file systems).
        dbpool = adbapi.ConnectionPool('pymysql', **conn_dict)
        return cls(dbpool)

    def process_item(self, item, spider):
        """Schedule the insert of ``item`` on the pool and return ``item``."""
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrorback(self.handle_error)
        return item

    def do_insert(self, cursor, item):
        """Run the INSERT for ``item`` on the cursor supplied by the pool.

        ``runInteraction`` calls this with its transaction cursor as the
        first argument and commits on success (rolling back on error), so
        no explicit commit is issued here.
        """
        # 'cteate_time' (sic) is kept as-is: it has to match both the
        # table column and the Item field name used by the spider.
        insert_sql = """
            insert into article(title, cteate_time, url, content)
            VALUES (%s, %s, %s, %s)
        """
        cursor.execute(
            insert_sql,
            (item['title'], item['cteate_time'], item['url'], item['content']),
        )

    def handle_error(self, failure):
        """Log a failed insert reported by the pool's Deferred."""
        logger.error("MySQL insert failed: %s", failure)

    def close_spider(self, spider):
        """Shut down the connection pool when the spider closes."""
        self.dbpool.close()
使用类似django-model的方式写入数据库
https://github.com/scrapy-plugins/scrapy-djangoitem