面向对象保存数据。
1,CSV
代码:
"""
豆瓣 Top250 scraper, variant 1: save to CSV (DictWriter and plain writer).

Fixes vs. the original:
- ``saveToCsv`` fed list rows to ``csv.DictWriter`` (TypeError at runtime);
  rows are now zipped with the field names into dicts before writing.
- Files are opened with ``with`` so they are closed even on error.
- Bare ``except:`` removed; ``.get()`` already yields None for a missing node.
- The header row is written only when the file is empty (append mode used to
  duplicate it on every run).
- ``range(0, 250+1, 25)`` requested an 11th, empty page; Top250 has 10 pages.
"""
import csv
import random
import time

import parsel
import requests

# Column order shared by both save methods.
FIELD_NAMES = ['电影名称', '导演', '演员', '年份', '国家', '类型', '评分', '评论总数', '一句话描述']


class douBanSpider():
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
        'Cookie': 'cookie'  # replace with a real cookie if Douban blocks anonymous requests
    }

    def __init__(self, url, headers=headers):
        self.url = url
        self.headers = headers

    def getHtml(self):
        """Fetch the listing page and return its text decoded as UTF-8."""
        response = requests.get(url=self.url, headers=self.headers)
        response.encoding = 'utf-8'  # Douban serves UTF-8; no need to sniff with apparent_encoding
        return response.text

    def parseHtmlByXpath(self):
        """Parse one page with XPath; return a list of 9-element rows
        (title, director, actors, year, country, type, score, review count, quote)."""
        movieListDatas = []
        selector = parsel.Selector(self.getHtml())
        for item in selector.xpath('//div/ol/li'):
            title = item.xpath('.//div[@class="hd"]/a/span[1]/text()').get()
            movieInfo = item.xpath('.//div[@class="bd"]/p/text()').getall()
            # Director and actors are separated by three non-breaking spaces
            # (the same '\xa0\xa0\xa0' separator the CSS variants split on).
            credits = movieInfo[0].split('\xa0\xa0\xa0')
            director = credits[0].strip()
            # Long actor lists are truncated on the listing page.
            actors = credits[1].strip() if len(credits) > 1 else '请从详情页获取!'
            meta = movieInfo[1].split('\xa0/\xa0')
            releaseYear = meta[0].strip()
            country = meta[1].strip()
            movieType = meta[2].strip()
            movieStar = item.xpath('.//div[@class="star"]/span[@class="rating_num"]/text()').get()
            reviewCount = item.xpath('.//div[@class="star"]/span[last()]/text()').get()
            # .get() returns None when the quote is absent; no try/except needed.
            oneWordDes = item.xpath('.//p[@class="quote"]/span/text()').get()
            row = [title, director, actors, releaseYear, country, movieType,
                   movieStar, reviewCount, oneWordDes]
            print(dict(zip(FIELD_NAMES, row)))
            movieListDatas.append(row)
        return movieListDatas

    def saveToCsv(self):
        """Append the parsed rows with csv.DictWriter."""
        with open('20211229豆瓣top250.csv', mode='a', encoding='utf-8-sig', newline='') as f:
            csvWriter = csv.DictWriter(f, fieldnames=FIELD_NAMES)
            if f.tell() == 0:  # write the header only into a fresh file
                csvWriter.writeheader()
            for data in self.parseHtmlByXpath():
                # DictWriter needs dict rows; the original passed raw lists (TypeError).
                csvWriter.writerow(dict(zip(FIELD_NAMES, data)))

    def saveTocsv2(self):
        """Append the parsed rows with the plain csv.writer."""
        with open('20211229豆瓣250.csv', mode='a', encoding='utf-8', newline='') as f:
            csvWriter = csv.writer(f)
            if f.tell() == 0:
                csvWriter.writerow(FIELD_NAMES)
            for data in self.parseHtmlByXpath():
                csvWriter.writerow(data)

    def run(self):
        self.saveTocsv2()


if __name__ == "__main__":
    # Top250 has exactly 10 pages: start = 0, 25, ..., 225.
    for start in range(0, 250, 25):
        print(f'************************正在爬取{start // 25 + 1}页内容************************')
        time.sleep(random.uniform(2, 5))
        url = f'https://movie.douban.com/top250?start={start}&filter='
        app = douBanSpider(url=url)
        app.run()
        break  # kept from the original: only the first page is crawled; remove to crawl all
第二种,excel。
这种保存方式起初只能保存到最后一组数据。原因并不是 IO 流的问题,而是循环里每爬一页都重新调用了 create_workbook 重建文档(并且保存时写到了另一个文件名),前面写入的数据被覆盖了。改进思路是:先创建一次 excel 文档,之后每次保存前打开同一个文档,逐条写入后再保存关闭,这样循环就能保留所有数据。
"""
豆瓣 Top250 scraper, variant 2: save to an .xlsx workbook (openpyxl).

Fixes vs. the original:
- ``__init__`` silently dropped its ``headers`` argument (only the class
  attribute was ever used); it is now stored on the instance.
- ``save_to_excel`` loaded one workbook but saved under a different filename
  ('afafafafafafaf.xlsx'), and the driver recreated the workbook on every
  loop iteration — together these are why only the last page of data
  survived (not an IO-stream problem). The workbook is now created once and
  every save writes back to the same file.
- Removed the duplicated Selector construction (it fetched the page twice).
- Bare ``except:`` narrowed to the exception that actually occurs.
"""
import random
import time

import openpyxl as op
import parsel
import requests

# Single workbook used by create/save so rows accumulate in one file.
WORKBOOK_FILE = '202201014豆瓣top250.xlsx'
HEADER_ROW = ['电影名称', '导演', '演员', '年份', '国家', '类型', '评分', '评论总数', '一句话描述']


class douBanSpider():
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
        'Cookie': 'cookie'  # replace with a real cookie if Douban blocks anonymous requests
    }

    def __init__(self, url, headers=headers):
        self.url = url
        self.headers = headers  # the original forgot this assignment

    def getHtml(self):
        """Fetch the listing page and return its text decoded as UTF-8."""
        response = requests.get(url=self.url, headers=self.headers)
        response.encoding = 'utf-8'  # Douban serves UTF-8
        return response.text

    def parseHtmlByXpath(self):
        """Parse one page with XPath; return a list of 9-element rows.

        Kept for API compatibility; ``run`` uses the CSS variant below.
        """
        movieListDatas = []
        selector = parsel.Selector(self.getHtml())
        for item in selector.xpath('//div/ol/li'):
            title = item.xpath('.//div[@class="hd"]/a/span[1]/text()').get()
            movieInfo = item.xpath('.//div[@class="bd"]/p/text()').getall()
            credits = movieInfo[0].split('\xa0\xa0\xa0')
            director = credits[0].strip()
            # Long actor lists are truncated on the listing page.
            actors = credits[1].strip() if len(credits) > 1 else '请从详情页获取!'
            meta = movieInfo[1].split('\xa0/\xa0')
            releaseYear = meta[0].strip()
            country = meta[1].strip()
            movieType = meta[2].strip()
            movieStar = item.xpath('.//div[@class="star"]/span[@class="rating_num"]/text()').get()
            reviewCount = item.xpath('.//div[@class="star"]/span[last()]/text()').get()
            oneWordDes = item.xpath('.//p[@class="quote"]/span/text()').get()
            row = [title, director, actors, releaseYear, country, movieType,
                   movieStar, reviewCount, oneWordDes]
            print(dict(zip(HEADER_ROW, row)))
            movieListDatas.append(row)
        return movieListDatas

    def parseHtmlByCss(self):
        """Parse one page with CSS selectors; return a list of 9-element rows."""
        movieListDatas = []
        selector = parsel.Selector(self.getHtml())
        for item in selector.css('div ol li'):
            title = item.css('.hd a span:nth-child(1)::text').get()
            movieInfo = item.css('.bd p::text').getall()

            def piece(text, sep, index):
                """Stripped index-th part of text split on sep, or None if absent."""
                try:
                    return text.split(sep)[index].strip()
                except IndexError:
                    return None

            # Director / actors share one line, separated by '\xa0\xa0\xa0'.
            director = piece(movieInfo[0], '\xa0\xa0\xa0', 0)
            actors = piece(movieInfo[0], '\xa0\xa0\xa0', 1)
            releaseYear = piece(movieInfo[1], '/', 0)
            country = piece(movieInfo[1], '/', 1)
            movieType = piece(movieInfo[1], '/', 2)
            movieStar = item.css('.star .rating_num::text').get()
            rawCount = item.css('.star span:nth-child(4)::text').get()
            reviewCount = rawCount.strip('人评价') if rawCount else None
            oneWordDes = item.css('.quote .inq::text').get()
            row = [title, director, actors, releaseYear, country, movieType,
                   movieStar, reviewCount, oneWordDes]
            print(*row, sep=' | ')
            movieListDatas.append(row)
        return movieListDatas

    def create_workbook(self):
        """Create the workbook with a single 'summary' sheet plus header row.

        Call this ONCE before crawling; calling it per page wipes earlier rows.
        """
        wb = op.Workbook()
        ws = wb.create_sheet(title='summary', index=0)
        wb.remove(wb['Sheet'])  # drop the default empty sheet
        ws.append(HEADER_ROW)
        wb.save(WORKBOOK_FILE)  # save BEFORE close; the original closed first
        wb.close()

    def save_to_excel(self):
        """Append this page's rows to the existing workbook and save it back."""
        wbook = op.load_workbook(WORKBOOK_FILE)
        summary = wbook['summary']
        for data in self.parseHtmlByCss():
            summary.append(data)
        wbook.save(WORKBOOK_FILE)  # original saved to 'afafafafafafaf.xlsx'
        wbook.close()

    def run(self):
        self.save_to_excel()


if __name__ == "__main__":
    # Create the workbook exactly once, then let each page append to it.
    douBanSpider(url='').create_workbook()
    for start in range(0, 250, 25):  # 10 pages: start = 0, 25, ..., 225
        print(f'************************************正在爬取{start // 25 + 1}页内容************************************************')
        time.sleep(random.uniform(2, 5))
        url = f'https://movie.douban.com/top250?start={start}&filter='
        douBanSpider(url=url).run()
第三种,保存到MySql
MySQL 最重要的是注意语法,特别是与 Python 结合的时候:写 SQL 语句时首先要符合 SQL 语法,然后才是 Python 语法。也就是说,VALUES() 里用 Python 格式化进去的变量,在 MySQL 语法里是一个字符串,必须带引号——更安全的做法是使用参数化查询(%s 占位符),让驱动自动处理引号和转义,也能避免 SQL 注入。
"""
豆瓣 Top250 scraper, variant 3: save to MySQL (pymysql).

Fixes vs. the original:
- INSERT now uses parameterized queries (%s placeholders) instead of
  ``str.format`` — a title containing a quote used to break the statement,
  and interpolating scraped text straight into SQL is an injection risk.
- Missing values are mapped to '' because every column is declared NOT NULL
  (the old format-based code silently inserted the string 'None').
- Bare ``except:`` narrowed; the loop variable no longer shadows the
  builtin ``filter``.
"""
import parsel
import pymysql
import requests


class douBanSpider():
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'
    }

    def __init__(self, url, headers=headers):
        self.url = url
        self.headers = headers

    def get_html(self):
        """Fetch the listing page and return its text decoded as UTF-8."""
        response = requests.get(url=self.url, headers=self.headers)
        response.raise_for_status()
        response.encoding = 'utf-8'  # Douban serves UTF-8
        return response.text

    def parse_html_by_css(self):
        """Parse one page; return a list of 9-tuples
        (title, directors, actors, year, country, movieType,
         reviewScore, reviewCount, oneWordDes)."""
        rows = []
        selector = parsel.Selector(self.get_html())
        for item in selector.css('div ol li'):
            title = item.css('.hd a span:nth-child(1)::text').get()
            movieInfo = item.css('.bd p::text').getall()

            def piece(text, sep, index):
                """Stripped index-th part of text split on sep, or None if absent."""
                try:
                    return text.split(sep)[index].strip()
                except IndexError:
                    return None

            # Director / actors share one line, separated by '\xa0\xa0\xa0'.
            directors = piece(movieInfo[0], '\xa0\xa0\xa0', 0)
            actors = piece(movieInfo[0], '\xa0\xa0\xa0', 1)
            year = piece(movieInfo[1], '/', 0)
            country = piece(movieInfo[1], '/', 1)
            movieType = piece(movieInfo[1], '/', 2)
            reviewScore = item.css('.star .rating_num::text').get()
            rawCount = item.css('.star span:nth-child(4)::text').get()
            reviewCount = rawCount.strip('人评价') if rawCount else None
            oneWordDes = item.css('.quote span::text').get()
            row = (title, directors, actors, year, country, movieType,
                   reviewScore, reviewCount, oneWordDes)
            print(*row, sep=' | ')
            rows.append(row)
        return rows

    def database_connect(self):
        """Open a connection to the local `douban` database."""
        return pymysql.Connect(host='127.0.0.1', user='root', password='',
                               database='douban', charset='utf8')

    def create_table(self):
        """Create the `top250` table if it does not yet exist (run once)."""
        database = self.database_connect()
        try:
            with database.cursor() as cursor:
                sql = """
                CREATE TABLE IF NOT EXISTS `top250`
                (
                `id` INT NOT NULL AUTO_INCREMENT COMMENT "自增id",
                `movieTitle` VARCHAR(255) NOT NULL COMMENT "电影名",
                `directors` VARCHAR(255) NOT NULL COMMENT "导演",
                `actors` VARCHAR(255) NOT NULL COMMENT "演员",
                `year` INT(10) NOT NULL COMMENT "发布年份",
                `country` VARCHAR(255) NOT NULL COMMENT "国家",
                `movieType` VARCHAR(255) NOT NULL COMMENT "电影类型",
                `reviewScore` FLOAT(5) NOT NULL COMMENT "评分",
                `reviewCount` INT(10) NOT NULL COMMENT "评论人数",
                `description` VARCHAR(255) NOT NULL COMMENT "一句话描述",
                PRIMARY KEY (`id`)
                ) ENGINE = MyISAM
                """
                cursor.execute(sql)
            database.commit()
        finally:
            database.close()

    def update_database(self):
        """One-off migration: store the score as DECIMAL(2,1) instead of FLOAT."""
        database = self.database_connect()
        try:
            with database.cursor() as cursor:
                sql = """
                ALTER TABLE `top250` CHANGE `reviewScore` `reviewScore` DECIMAL(2,1) NOT NULL COMMENT '评分';
                """
                cursor.execute(sql)
            database.commit()
        finally:
            database.close()

    def insert_to_table(self):
        """Insert the parsed rows with a parameterized INSERT statement."""
        # %s placeholders let the driver quote/escape values — the old
        # .format-built SQL broke on quotes and allowed SQL injection.
        sql = ("INSERT INTO top250 (movieTitle, directors, actors, year, country, "
               "movieType, reviewScore, reviewCount, description) "
               "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)")
        database = self.database_connect()
        try:
            with database.cursor() as cursor:
                for row in self.parse_html_by_css():
                    # All columns are NOT NULL, so map missing values to ''.
                    cursor.execute(sql, tuple('' if v is None else v for v in row))
            database.commit()
        finally:
            database.close()

    def run(self):
        self.insert_to_table()


if __name__ == "__main__":
    # 10 pages: start = 0, 25, ..., 225 (`start` replaces the old `filter`,
    # which shadowed the builtin).
    for start in range(0, 225 + 1, 25):
        url = f'https://movie.douban.com/top250?start={start}&filter='
        app = douBanSpider(url=url)
        # app.create_table()     # run once to create the table
        # app.update_database()  # run once to switch reviewScore to DECIMAL
        app.run()
暂时没尝试服务器数据库的写入速度,本地数据库的写入速度相当于秒入。