Scrapy: storing scraped data in MySQL
User-Agent pool:
UserAgent = ["Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
"Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]
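To actually use the pool, Scrapy normally needs a downloader middleware that picks a random entry for every outgoing request. A minimal sketch, assuming the UserAgent list above is importable from the module where the middleware lives; the module path and priority in the registration comment are placeholders:

import random

class RandomUserAgentMiddleware(object):
    def process_request(self, request, spider):
        # overwrite the outgoing request's User-Agent with a random one from the pool;
        # returning None lets Scrapy keep processing the request normally
        request.headers['User-Agent'] = random.choice(UserAgent)

# settings.py (placeholder path and priority):
# DOWNLOADER_MIDDLEWARES = {'myproject.middlewares.RandomUserAgentMiddleware': 543}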
Synchronous MySQL insert template:
import pymysql
# MYSQL_HOST, MYSQL_PORT, MYSQL_USER, MYSQL_PWD, MYSQL_DB come from the project's settings.py
# (import them at the top of pipelines.py)

class MysqlPipeline(object):
    def open_spider(self, spider):
        self.db = pymysql.connect(host=MYSQL_HOST, port=MYSQL_PORT, user=MYSQL_USER,
                                  password=MYSQL_PWD, database=MYSQL_DB)
        self.cursor = self.db.cursor()
        print('database connection succeeded')

    def process_item(self, item, spider):
        info_list = (item['company_name'], item['company_person'], item['company_money'],
                     item['company_establish'], item['company_phone'], item['company_address'],
                     item['company_status'])
        ins = """
            insert into test1(company_name, company_person, company_money, company_establish,
                              company_phone, company_address, company_status)
            values (%s, %s, %s, %s, %s, %s, %s)"""
        try:
            self.cursor.execute(ins, info_list)
            self.db.commit()
            print('insert succeeded')
        except Exception as e:
            print(e)
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.db.close()
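The pipeline above expects MYSQL_HOST, MYSQL_PORT, MYSQL_USER, MYSQL_PWD and MYSQL_DB to exist in settings.py and has to be enabled under ITEM_PIPELINES. A minimal sketch of the settings side; all values and the 'myproject.pipelines...' path are placeholders:

# settings.py (placeholder values)
MYSQL_HOST = '127.0.0.1'
MYSQL_PORT = 3306
MYSQL_USER = 'root'
MYSQL_PWD = '155345'
MYSQL_DB = 'qcc'

ITEM_PIPELINES = {
    'myproject.pipelines.MysqlPipeline': 300,  # hypothetical project path
}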
CSV storage:
import csv

class WangyinPipeline_a(object):
    def __init__(self):
        # newline='' keeps csv.writer from emitting blank rows on Windows
        self.f = open("qichacha_sh.csv", "w", newline='', encoding='utf-8')
        self.writer = csv.writer(self.f)
        self.writer.writerow(
            ['company_name', 'company_city', 'company_person', 'company_money', 'company_establish',
             'company_email', 'company_phone', 'company_address', 'company_status', 'company_link'])

    def process_item(self, item, spider):
        wangyiyun_list = (item['company_name'], item['company_city'], item['company_person'],
                          item['company_money'], item['company_establish'], item['company_email'],
                          item['company_phone'], item['company_address'], item['company_status'],
                          item['company_link'])
        self.writer.writerow(wangyiyun_list)
        return item

    def close_spider(self, spider):  # close the file
        self.f.close()
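Both the MySQL and CSV pipelines above assume the item declares the company_* fields. A sketch of the matching items.py; the field names are taken from the CSV header row, while the class name CompanyItem is only illustrative:

import scrapy

class CompanyItem(scrapy.Item):
    company_name = scrapy.Field()
    company_city = scrapy.Field()
    company_person = scrapy.Field()
    company_money = scrapy.Field()
    company_establish = scrapy.Field()
    company_email = scrapy.Field()
    company_phone = scrapy.Field()
    company_address = scrapy.Field()
    company_status = scrapy.Field()
    company_link = scrapy.Field()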
Insert template:
import pymysql

# synchronous
class QccPipeline(object):
    def __init__(self):
        dbparams = {
            'host': '127.0.0.1',
            'port': 3306,
            'user': 'root',
            'password': '155345',
            'database': 'qcc',
            'charset': 'utf8'
        }
        self.conn = pymysql.connect(**dbparams)
        self.cursor = self.conn.cursor()
        self._sql = None

    def process_item(self, item, spider):
        self.cursor.execute(self.sql, (item['title'], ...))
        self.conn.commit()
        return item

    @property
    def sql(self):
        if not self._sql:
            # Notes for filling in the statement:
            #   text columns: longtext (no length needed); time columns: datetime
            #   id must not be null (use auto_increment and insert null for it)
            #   each scrapy.Field() on the item (e.g. company_name = scrapy.Field()) maps to a table column
            #   example columns: id title text content author touxiang time x-id
            # placeholder below, replace with the real insert statement:
            self._sql = """
                insert into art(id, ...) values (null, %s, %s, %s, %s, ...)
            """
        return self._sql
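To make the notes in the template concrete, a hedged sketch of creating a matching table with pymysql: longtext for text columns, datetime for time, and a non-null auto-increment id. The table and column definitions below are only illustrative, not the author's actual schema:

import pymysql

conn = pymysql.connect(host='127.0.0.1', user='root', password='155345',
                       database='qcc', charset='utf8')
cursor = conn.cursor()
cursor.execute("""
    create table if not exists art (
        id      int not null auto_increment primary key,  -- id must not be null
        title   varchar(255),
        content longtext,      -- long text: longtext, no length needed
        author  varchar(64),
        `time`  datetime       -- time column: datetime
    )
""")
conn.commit()
cursor.close()
conn.close()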
Asynchronous insert template:
# asynchronous
from twisted.enterprise import adbapi  # Twisted database connection pool
from pymysql import cursors

class ty(object):
    def __init__(self):
        dbparams = {
            'host': '127.0.0.1',
            'port': 3306,
            'user': 'root',
            'password': '155345',
            'database': 'jianshu',
            'charset': 'utf8',
            'cursorclass': cursors.DictCursor
        }
        self.dbpool = adbapi.ConnectionPool('pymysql', **dbparams)
        self._sql = None

    @property
    def sql(self):
        if not self._sql:
            # Same notes as in the synchronous template:
            #   text columns: longtext (no length needed); time columns: datetime
            #   id must not be null; each scrapy.Field() on the item maps to a table column
            # placeholder below, replace with the real insert statement:
            self._sql = """
                insert into art(id, ...) values (null, %s, %s, %s, %s, ...)
            """
        return self._sql

    def process_item(self, item, spider):
        # runInteraction runs insert_item in a thread with a cursor from the pool
        defer = self.dbpool.runInteraction(self.insert_item, item)
        defer.addErrback(self.handle_error, item, spider)
        return item

    def insert_item(self, cursor, item):
        cursor.execute(self.sql, (item['title'], ...))

    def handle_error(self, error, item, spider):
        print('=' * 10 + 'error' + '=' * 10)
        print(error)
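A hedged variant of the template above that pulls the connection parameters out of settings.py via Scrapy's standard from_crawler hook instead of hard-coding them. The MYSQL_* setting names are assumptions; sql, process_item, insert_item and handle_error stay exactly as in the template:

from twisted.enterprise import adbapi
from pymysql import cursors

class AsyncMysqlPipeline(object):
    def __init__(self, dbpool):
        self.dbpool = dbpool
        self._sql = None

    @classmethod
    def from_crawler(cls, crawler):
        s = crawler.settings
        dbpool = adbapi.ConnectionPool(
            'pymysql',                              # DB-API module name
            host=s.get('MYSQL_HOST', '127.0.0.1'),  # assumed setting names and defaults
            port=s.getint('MYSQL_PORT', 3306),
            user=s.get('MYSQL_USER', 'root'),
            password=s.get('MYSQL_PWD', ''),
            database=s.get('MYSQL_DB', 'jianshu'),
            charset='utf8',
            cursorclass=cursors.DictCursor,
        )
        return cls(dbpool)

    # sql property, process_item, insert_item and handle_error as in ty above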
MySQL insert error 1064 (usually caused by unescaped quotes or special characters in the values):
import pymysql
# assumes `settings` is the imported project settings module defining DB_HOST, DB_PORT, etc.

class MysqlPipeline(object):
    def __init__(self):
        self.host = settings.DB_HOST
        self.port = settings.DB_PORT
        self.user = settings.DB_USER
        self.pwd = settings.DB_PWD
        self.name = settings.DB_NAME
        self.charset = settings.DB_CHARSET
        self.connect()

    def connect(self):
        self.conn = pymysql.connect(host=self.host,
                                    port=self.port,
                                    user=self.user,
                                    password=self.pwd,
                                    db=self.name,
                                    charset=self.charset)
        self.cursor = self.conn.cursor()

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()

    def process_item(self, item, spider):
        try:
            print('inserting *************************************** ')
            # Interpolating the raw item['file_content'] into the SQL string raised 1064,
            # because the content contains quotes; escape it before building the statement.
            file_count = pymysql.escape_string(item['file_content'])
            sql = 'insert into zb_contest_data(craw_date, cd_filename, cd_content, create_time) values("%s", "%s", "%s", "%s")' % (
                item['craw_date'], item['file_name'], file_count, item['create_time'])
            # execute the statement
            self.cursor.execute(sql)
            self.conn.commit()
        except Exception as e:
            print(e)
        return item
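An alternative that sidesteps most 1064 errors entirely: let PyMySQL parameterize the values instead of escaping and interpolating them by hand. A sketch of a drop-in replacement for the process_item above, using the same table and item keys:

    def process_item(self, item, spider):
        sql = ('insert into zb_contest_data'
               '(craw_date, cd_filename, cd_content, create_time) '
               'values (%s, %s, %s, %s)')
        try:
            # execute() quotes and escapes each value itself, so quotes in
            # file_content no longer break the statement
            self.cursor.execute(sql, (item['craw_date'], item['file_name'],
                                      item['file_content'], item['create_time']))
            self.conn.commit()
        except Exception as e:
            self.conn.rollback()
            print(e)
        return item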
Pagination and garbled export output:
Garbled output: add ONE of the following lines to settings.py so exported Chinese text is decoded correctly (a second assignment would simply override the first, so pick the encoding you need):
FEED_EXPORT_ENCODING = 'utf-8'
FEED_EXPORT_ENCODING = 'gb18030'
Pagination: the spider below walks pages 1-10 of the list and follows each detail page:
import scrapy
from movie.items import MovieItem

class MovieproSpider(scrapy.Spider):
    name = 'moviePro'
    allowed_domains = ['4567tv.tv']
    start_urls = ['https://www.4567tv.tv/frim/index1.html']
    page = 1
    page_url = 'https://www.4567tv.tv/frim/index1-%s.html'

    def parse(self, response):
        li_list = response.xpath('//li[@class="col-md-6 col-sm-4 col-xs-3"]')
        for li in li_list:
            item = MovieItem()
            name = li.xpath('./div/div/h4/a/text()').extract_first()
            detail_url = 'https://www.4567tv.tv' + li.xpath('./div/div/h4/a/@href').extract_first()
            item['name'] = name
            yield scrapy.Request(url=detail_url, callback=self.parse_detail, meta={'item': item})
        if self.page <= 10:
            self.page += 1
            new_page_url = self.page_url % self.page
            yield scrapy.Request(url=new_page_url, callback=self.parse)

    def parse_detail(self, response):
        item = response.meta['item']  # the item handed over from parse via meta
        actor = response.xpath('/html/body/div[1]/div/div/div/div[2]/p[3]/a/text()').extract_first()
        link = response.xpath('/html/body/div[1]/div/div/div/div[1]/a/@href').extract_first()
        item['actor'] = actor
        item['link'] = 'https://www.4567tv.tv' + link  # the href is relative, rebuild the absolute URL
        yield item
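The spider fills item['name'], item['actor'] and item['link'], so movie/items.py needs roughly the following (the class name matches the import above):

import scrapy

class MovieItem(scrapy.Item):
    name = scrapy.Field()
    actor = scrapy.Field()
    link = scrapy.Field()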
Database transaction handling:
A custom pipeline that saves the complete scraped data into MySQL (dbutil is a project-local helper module):
class DingdianPipeline(object):
    def process_item(self, item, spider):
        dbu = dbutil.MYSQLdbUtil()
        dbu.getConnection()  # open the connection / start the transaction
        # 1. insert
        try:
            # sql = "insert into movies (电影排名,电影名称,电影短评,评价分数,评价人数) values (%s,%s,%s,%s,%s)"
            sql = "insert into ebook (novel_name,author,novelurl,serialstatus,serialsize,ebookdate,newchapter) values (%s,%s,%s,%s,%s,%s,%s)"
            # date = [item['rank'], item['title'], item['quote'], item['star']]
            # dbu.execute(sql, date, True)
            dbu.execute(sql, (item['novel_name'], item['author'], item['novelurl'], item['serialstatus'],
                              item['serialsize'], item['date'], item['newchapter']), True)
            # dbu.execute(sql, True)
            dbu.commit()
            print('insert into the database succeeded!')
        except Exception:
            dbu.rollback()
            dbu.commit()  # commit after the rollback
        finally:
            dbu.close()
        return item
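dbutil.MYSQLdbUtil is a project-local helper that is not shown here; a minimal sketch of the same try/commit/rollback pattern with plain pymysql, where the connection parameters and table name are placeholders taken from the examples above:

import pymysql

class DingdianPymysqlPipeline(object):
    def open_spider(self, spider):
        self.conn = pymysql.connect(host='127.0.0.1', user='root', password='155345',
                                    database='dingdian', charset='utf8')
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        sql = ("insert into ebook (novel_name,author,novelurl,serialstatus,"
               "serialsize,ebookdate,newchapter) values (%s,%s,%s,%s,%s,%s,%s)")
        try:
            self.cursor.execute(sql, (item['novel_name'], item['author'], item['novelurl'],
                                      item['serialstatus'], item['serialsize'],
                                      item['date'], item['newchapter']))
            self.conn.commit()
        except Exception:
            self.conn.rollback()  # undo the failed insert
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()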