1. MongoDB interaction pipeline class, standard pattern
# MongoDB interaction
import pymongo

# Pipeline class
class MongoDBPipeline(object):
    def __init__(self, mongo_uri, mongo_db):
        # __init__ is the initializer and receives the settings;
        # __new__ is the constructor that actually allocates the object in memory
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB')
        )

    def open_spider(self, spider):
        # 27017 is MongoDB's default port
        self.client = pymongo.MongoClient(self.mongo_uri, 27017)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        # Insert the data into the 'goods' collection
        self.db['goods'].insert_one(dict(item))
        # A project may contain several pipeline classes; if a later pipeline
        # still needs to store the data, this method must return item
        return item

    def close_spider(self, spider):
        self.client.close()
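For reference, both pipelines expect the spider to yield items that can be turned into a plain dict with dict(item). A minimal Item sketch, assuming the field names name, price and imgname that the MySQL insert further down uses (the class name and file are hypothetical):

# items.py -- minimal sketch; GoodsItem and its fields are assumptions
import scrapy

class GoodsItem(scrapy.Item):
    name = scrapy.Field()
    price = scrapy.Field()
    imgname = scrapy.Field()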
2. MySQL interaction pipeline class, standard pattern
# 1. Connect to the database
# 2. Get a cursor
# 3. Prepare the SQL statement
# 4. Execute the SQL statement
# 5. Commit
import pymysql

class MysqlPipeline(object):
    def __init__(self, host, port, user, password, db):
        self.host = host
        self.port = port
        self.user = user
        self.password = password
        self.db = db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            host=crawler.settings.get('HOST'),
            port=crawler.settings.get('PORT'),
            user=crawler.settings.get('USER'),
            password=crawler.settings.get('PASSWORD'),
            db=crawler.settings.get('DB'),
        )

    def open_spider(self, spider):
        self.conn = pymysql.connect(host=self.host, port=self.port, user=self.user,
                                    password=self.password, db=self.db, charset='utf8')
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        data = dict(item)
        # Parameterized query: pymysql handles quoting and escaping of the values
        sql = "insert into zhegoods values (%s, %s, %s)"
        try:
            self.cursor.execute(sql, (data['name'], data['price'], data['imgname']))
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()
3. settings configuration
# Extra settings to add
MONGO_URI = 'localhost'
MONGO_DB = 'your_db_name'
# MySQL settings
HOST = 'localhost'
PORT = 3306
USER = 'root'
PASSWORD = ''
DB = 'your_db_name'
ps: With MongoDB you can simply name the database and collection in code; they are created automatically the first time data is stored. MySQL, by contrast, requires the database and table to exist before the pipeline runs.
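Since the MySQL table has to exist beforehand, here is a minimal preparation sketch; the database name, column names and varchar types are assumptions derived from the insert statement above:

# prepare_db.py -- minimal sketch; names and column types are assumptions
import pymysql

conn = pymysql.connect(host='localhost', port=3306, user='root', password='', charset='utf8')
cursor = conn.cursor()
# Create the database and the zhegoods table if they do not exist yet
cursor.execute("create database if not exists your_db_name charset utf8")
cursor.execute(
    "create table if not exists your_db_name.zhegoods ("
    "name varchar(255), price varchar(64), imgname varchar(255))"
)
conn.commit()
cursor.close()
conn.close()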
ITEM_PIPELINES = {
    # 300 is the priority; the smaller the number, the earlier the engine runs the pipeline
    # 'z8b.pipelines.Z8BPipeline': 300,   ps: project name.pipelines module.class name: priority
    'z8b.pipelines.MysqlPipeline': 301,
}
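To store into both databases, register both classes; the smaller number runs first, and because each process_item returns item, the next pipeline still receives the data. A sketch, assuming both classes live in z8b/pipelines.py:

ITEM_PIPELINES = {
    'z8b.pipelines.MongoDBPipeline': 300,   # runs first, stores into MongoDB
    'z8b.pipelines.MysqlPipeline': 301,     # runs second, stores into MySQL
}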