zoukankan      html  css  js  c++  java
  • 面向对象保存爬虫数据 Python

    面向对象保存数据。

    1,CSV

    代码:

      1 """
      2     豆瓣top250四种保存方式
      3 """
      4 import csv
      5 import random
      6 import time
      7 import parsel
      8 import requests
      9 
     10 class douBanSpider():
     11     # url = 'https://movie.douban.com/top250'
     12     headers = {
     13         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
     14         'Cookie': 'cookie'
     15     }
     16     def __init__(self, url, headers=headers):
     17         self.url = url
     18         self.headers = headers
     19 
     20     def getHtml(self):
     21         response = requests.get(url=self.url, headers=self.headers)
     22         response.encoding = response.apparent_encoding
     23         response.encoding = 'utf-8'
     24         return response.text
     25 
     26     def parseHtmlByXpath(self):
     27         movieListDatas = []
     28         movieDictDatas = []
     29         selector = parsel.Selector(self.getHtml())
     30         results = selector.xpath('//div/ol/li')
     31         for item in results:
     32             title = item.xpath('.//div[@class="hd"]/a/span[1]/text()').get()
     33             movieInfo = item.xpath('.//div[@class="bd"]/p/text()').getall()
     34             director = movieInfo[0].split('   ')[0].strip()
     35             try:
     36                 actors = movieInfo[0].split('   ')[1].strip()
     37             except:
     38                 actors = '请从详情页获取!'
     39             releaseYear = movieInfo[1].split('\xa0/\xa0')[0].strip()
     40             country = movieInfo[1].split('\xa0/\xa0')[1].strip()
     41             movieType = movieInfo[1].split('\xa0/\xa0')[2].strip()
     42             movieStar = item.xpath('.//div[@class="star"]/span[@class="rating_num"]/text()').get()
     43             reviewCount = item.xpath('.//div[@class="star"]/span[last()]/text()').get()
     44             try:
     45                 oneWordDes = item.xpath('.//p[@class="quote"]/span/text()').get()
     46             except:
     47                 oneWordDes = None
     48             movieListDatas.append([title, director, actors, releaseYear, country, movieType, movieStar, reviewCount, oneWordDes])
     49             dit = {
     50                 '电影名称':title,
     51                 '导演':director,
     52                 '演员':actors,
     53                 '年份':releaseYear,
     54                 '国家':country,
     55                 '类型':movieType,
     56                 '评分':movieStar,
     57                 '评论总数':reviewCount,
     58                 '一句话描述':oneWordDes,
     59             }
     60             print(dit)
     61             movieDictDatas.append(dit)
     62             # print(title, director, actors, releaseYear, country, movieType, movieStar, reviewCount, oneWordDes, sep=' | ')
     63             # print(movieDictDatas)
     64             # print(movieListDatas)
     65 
     66         return movieListDatas
     67     def saveToCsv(self):
     68         f = open('20211229豆瓣top250.csv', mode='a', encoding='utf-8-sig', newline='')
     69         csvWriter = csv.DictWriter(f, fieldnames=[
     70             '电影名称',
     71             '导演',
     72             '演员',
     73             '年份',
     74             '国家',
     75             '类型',
     76             '评分',
     77             '评论总数',
     78             '一句话描述',
     79         ])
     80         csvWriter.writeheader() # 写入头
     81         datas = self.parseHtmlByXpath()
     82         for data in datas:
     83             csvWriter.writerow(data)
     84         f.close()
     85 
     86     def saveTocsv2(self):
     87         f = open('20211229豆瓣250.csv', mode='a', encoding='utf-8', newline='')
     88         lis = ['电影名称',
     89             '导演',
     90             '演员',
     91             '年份',
     92             '国家',
     93             '类型',
     94             '评分',
     95             '评论总数',
     96             '一句话描述',]
     97         csvWriter = csv.writer(f)
     98         csvWriter.writerow(lis)
     99         datas = self.parseHtmlByXpath()
    100         for data in datas:
    101             csvWriter.writerow(data)
    102         f.close()
    103 
    104     def run(self):
    105         self.saveTocsv2()
    106 
    107 if __name__ == "__main__":
    108     for start in range(0, 250+1, 25):
    109         print(f'************************正在爬取{int(start/25 + 1)}页内容************************')
    110         time.sleep(random.uniform(2,5))
    111         url = f'https://movie.douban.com/top250?start={start}&filter='
    112         app = douBanSpider(url=url)
    113         app.run()
    114         break

     第二种,excel。

    这种保存方式有个坑:如果直接在类里定义一个 save_to_excel 之类的方法、每页都新建工作簿再保存,最终文件里只会留下最后一组数据(每次保存都覆盖了前面的内容)。于是换了个思路:先在类里创建好 Excel 文档,每次写数据前再打开该文档,逐条写入后保存并关闭,这样循环下来就能保留所有数据。猜测与文件被反复覆盖(IO 流)有关,待进一步确认。

      1 """
      2     第二种方式,xlsx
      3 """
      4 import parsel
      5 import requests
      6 import time
      7 import random
      8 import openpyxl as op
      9 
     10 class douBanSpider():
     11     headers = {
     12         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
     13         'Cookie': 'cookie'
     14     }
     15     def __init__(self, url, headers=headers):
     16         self.url = url
     17 
     18     def getHtml(self):
     19         response = requests.get(url=self.url, headers=self.headers)
     20         response.encoding = response.apparent_encoding
     21         response.encoding = 'utf-8'
     22         return response.text
     23 
     24     def parseHtmlByXpath(self):
     25         selector = parsel.Selector(self.getHtml())
     26         movieListDatas = []
     27         movieDictDatas = []
     28         selector = parsel.Selector(self.getHtml())
     29         results = selector.xpath('//div/ol/li')
     30         for item in results:
     31             title = item.xpath('.//div[@class="hd"]/a/span[1]/text()').get()
     32             movieInfo = item.xpath('.//div[@class="bd"]/p/text()').getall()
     33             director = movieInfo[0].split('   ')[0].strip()
     34             try:
     35                 actors = movieInfo[0].split('   ')[1].strip()
     36             except:
     37                 actors = '请从详情页获取!'
     38             releaseYear = movieInfo[1].split('\xa0/\xa0')[0].strip()
     39             country = movieInfo[1].split('\xa0/\xa0')[1].strip()
     40             movieType = movieInfo[1].split('\xa0/\xa0')[2].strip()
     41             movieStar = item.xpath('.//div[@class="star"]/span[@class="rating_num"]/text()').get()
     42             reviewCount = item.xpath('.//div[@class="star"]/span[last()]/text()').get()
     43             try:
     44                 oneWordDes = item.xpath('.//p[@class="quote"]/span/text()').get()
     45             except:
     46                 oneWordDes = None
     47             movieListDatas.append(
     48                 [title, director, actors, releaseYear, country, movieType, movieStar, reviewCount, oneWordDes])
     49             dit = {
     50                 '电影名称': title,
     51                 '导演': director,
     52                 '演员': actors,
     53                 '年份': releaseYear,
     54                 '国家': country,
     55                 '类型': movieType,
     56                 '评分': movieStar,
     57                 '评论总数': reviewCount,
     58                 '一句话描述': oneWordDes,
     59             }
     60             print(dit)
     61             movieDictDatas.append(dit)
     62             # print(title, director, actors, releaseYear, country, movieType, movieStar, reviewCount, oneWordDes, sep=' | ')
     63             # print(movieDictDatas)
     64             # print(movieListDatas)
     65 
     66         return movieListDatas
     67 
     68     def parseHtmlByCss(self):
     69         movieListDatas = []
     70         selector = parsel.Selector(self.getHtml())
     71         results = selector.css('div ol li')
     72         for item in results:
     73             title = item.css('.hd a span:nth-child(1)::text').get() # 获取电影中文名
     74             def doMovieInfo():
     75                 movieInfos = []
     76                 movieInfo = item.css('.bd p::text').getall() # 获取导演和演员信息
     77                 # print(movieInfo)
     78                 try:
     79                     director = movieInfo[0].split('\xa0\xa0\xa0')[0].strip()
     80                 except:
     81                     director = None
     82                 movieInfos.append(director)
     83                 try:
     84                     actors = movieInfo[0].split('\xa0\xa0\xa0')[1].strip()
     85                 except:
     86                     actors = None
     87                 movieInfos.append(actors)
     88                 try:
     89                     releaseYear = movieInfo[1].split('/')[0].strip()
     90                 except:
     91                     releaseYear = None
     92                 movieInfos.append(releaseYear)
     93                 try:
     94                     releaseCountry = movieInfo[1].split('/')[1].strip()
     95                 except:
     96                     releaseCountry = None
     97                 movieInfos.append(releaseCountry)
     98                 try:
     99                     movieType = movieInfo[1].split('/')[2].strip()
    100                 except:
    101                     movieType = None
    102                 movieInfos.append(movieType)
    103                 return movieInfos
    104             movifInfos = doMovieInfo()
    105             # 开始获取演员等信息
    106             director = movifInfos[0]
    107             actors = movifInfos[1]
    108             releaseYear = movifInfos[2]
    109             country = movifInfos[3]
    110             movieType = movifInfos[4]
    111 
    112             # 获取评论分数和评论人数
    113             try:
    114                 movieStar = item.css('.star .rating_num::text').get()
    115             except:
    116                 movieStar = None
    117             try:
    118                 reviewCount = item.css('.star span:nth-child(4)::text').get().strip('人评价')
    119             except:
    120                 reviewCount = None
    121             # 获取一句话描述
    122             try:
    123                 oneWordDes = item.css('.quote .inq::text').get()
    124             except:
    125                 oneWordDes = None
    126 
    127             print(title, director, actors, releaseYear, country, movieType, movieStar, reviewCount, oneWordDes, sep=' | ')
    128             movieListDatas.append([title, director, actors, releaseYear, country, movieType, movieStar, reviewCount, oneWordDes])
    129         # print(movieListDatas)
    130         return movieListDatas
    131     def create_workbook(self):
    132         """
    133             创建一个xlsx文档用来存储数据
    134         """
    135         wb = op.Workbook() # 创建工作簿
    136         ws = wb.create_sheet(title='summary', index=0) # 在工作簿中新建一个标题为summary的表
    137         wb.remove(wb['Sheet']) # 删除默认表
    138         # 添加头
    139         ws.append(['电影名称', '导演', '演员', '年份', '国家', '类型', '评分', '评论总数', '一句话描述'])
    140         wb.close() # 关闭工作簿
    141         wb.save('202201014豆瓣top250.xlsx') # 保存工作簿
    142 
    143     def save_to_excel(self):
    144         # 打开工作簿
    145         wbook = op.load_workbook('202201014豆瓣top250.xlsx') # 打开工作簿
    146         summary = wbook['summary'] # 选中表
    147 
    148         # 要保存的数据
    149         for data in self.parseHtmlByCss():
    150             summary.append(data)
    151             wbook.close()
    152             wbook.save('afafafafafafaf.xlsx')
    153 
    154     def run(self):
    155         self.save_to_excel()
    156 if __name__ == "__main__":
    157     wbook = op.load_workbook('202201014豆瓣top250.xlsx')  # 打开工作簿
    158     summary = wbook['summary']  # 选中表
    159     for start in range(0, 250+1, 25):
    160         print(f'************************************正在爬取{int(start/25 + 1)}页内容************************************************')
    161         # time.sleep(random.uniform(2,5))
    162         url = f'https://movie.douban.com/top250?start={start}&filter='
    163         app = douBanSpider(url=url)
    164         app.create_workbook()
    165         datas =  app.parseHtmlByCss()
    166         for data in datas:
    167             summary.append(data)
    168             wbook.close()
    169             wbook.save('afafafafafafaf.xlsx')
    170         # app.create_workbook()
    171         # app.run()

     第三种,保存到MySql

    MySQL 最重要的是注意语法,特别是与 Python 结合的时候:写 SQL 语句时先遵循 SQL 语法,再考虑 Python 语法——VALUES() 里用 Python 格式化进去的变量,在 MySQL 语法中是字符串,必须带引号(更稳妥的做法是使用驱动的参数化占位符)。

      1 """
      2     保存到数据库
      3 """
      4 import requests
      5 import parsel
      6 import pymysql
      7 
      8 class douBanSpider():
      9     headers = {
     10         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'
     11     }
     12 
     13     def __init__(self, url, headers=headers):
     14         self.url = url
     15         self.headers = headers
     16 
     17     def get_html(self):
     18         response = requests.get(url=self.url, headers=self.headers)
     19         response.raise_for_status()
     20         response.encoding = response.apparent_encoding
     21         response.encoding = 'utf-8'
     22         return response.text
     23 
     24     def parse_html_by_css(self):
     25         titles = []
     26         daoyan = []
     27         yanyuan = []
     28         nianfen = []
     29         guojia = []
     30         dianyingleixing = []
     31         pingfen = []
     32         pinglunrenshu = []
     33         jianduanmiaoshu = []
     34         html_data = self.get_html()
     35         selector = parsel.Selector(html_data)
     36         results = selector.css('div ol li')
     37         # 开始提取数据
     38         for item in results:
     39             alldata = []
     40             # 标题
     41             title = item.css('.hd a span:nth-child(1)::text').get()
     42             titles.append(title)
     43             def getMovieInfo():
     44                 movieInfos = []
     45                 movieInfo = item.css('.bd p::text').getall()
     46                 # print(movieInfo)
     47                 try:
     48                     directors = movieInfo[0].split('\xa0\xa0\xa0')[0].strip()
     49                 except:
     50                     directors = None
     51                 movieInfos.append(directors)
     52                 try:
     53                     actors = movieInfo[0].split('\xa0\xa0\xa0')[1].strip()
     54                 except:
     55                     actors = None
     56                 movieInfos.append(actors)
     57                 try:
     58                     year = movieInfo[1].split('/')[0].strip()
     59                 except:
     60                     year = None
     61                 movieInfos.append(year)
     62                 try:
     63                     country = movieInfo[1].split('/')[1].strip()
     64                 except:
     65                     country = None
     66                 movieInfos.append(country)
     67                 try:
     68                     movieType = movieInfo[1].split('/')[2].strip()
     69                 except:
     70                     movieType = None
     71                 movieInfos.append(movieType)
     72 
     73                 return movieInfos
     74             # 导演以及演员信息
     75             movieInfos = getMovieInfo()
     76             # 提取
     77             directors = movieInfos[0]
     78             daoyan.append(directors)
     79             actors = movieInfos[1]
     80             yanyuan.append(actors)
     81             year = movieInfos[2]
     82             nianfen.append(year)
     83             country = movieInfos[3]
     84             guojia.append(country)
     85             movieType = movieInfos[4]
     86             dianyingleixing.append(movieType)
     87             # 评分
     88             reviewScore = item.css('.star .rating_num::text').get()
     89             pingfen.append(reviewScore)
     90             try:
     91                 reviewCount = item.css('.star span:nth-child(4)::text').get().strip('人评价')
     92             except:
     93                 reviewCount = None
     94             pinglunrenshu.append(reviewCount)
     95             try:
     96                 oneWordDes = item.css('.quote span::text').get()
     97             except:
     98                 oneWordDes = None
     99             jianduanmiaoshu.append(oneWordDes)
    100             print(title, directors, actors, year, country, movieType, reviewScore, reviewCount, oneWordDes, sep=' | ')
    101             # alldata.append([title, directors, actors, year, country, movieType, reviewScore, reviewCount, oneWordDes])
    102         zipdata = zip(titles, daoyan, yanyuan, nianfen, guojia, dianyingleixing, pingfen, pinglunrenshu, jianduanmiaoshu)
    103         return zipdata
    104 
    105     # def parse_html_by_xpath(self):
    106     def database_connect(self):
    107         database = pymysql.Connect(host='127.0.0.1', user='root', password='', database='douban', charset='utf8')
    108         return database
    109 
    110     def create_table(self):
    111         database = self.database_connect()
    112         cursor = database.cursor()
    113         sql = """
    114                 CREATE TABLE IF NOT EXISTS `top250`
    115                     (
    116                         `id` INT NOT NULL AUTO_INCREMENT COMMENT "自增id",
    117                         `movieTitle` VARCHAR(255) NOT NULL COMMENT "电影名",
    118                         `directors` VARCHAR(255) NOT NULL COMMENT "导演",
    119                         `actors` VARCHAR(255) NOT NULL COMMENT "演员",
    120                         `year` INT(10) NOT NULL COMMENT "发布年份",
    121                         `country` VARCHAR(255) NOT NULL COMMENT "国家",
    122                         `movieType` VARCHAR(255) NOT NULL COMMENT "电影类型",
    123                         `reviewScore` FLOAT(5) NOT NULL COMMENT "评分",
    124                         `reviewCount` INT(10) NOT NULL COMMENT "评论人数",
    125                         `description` VARCHAR(255) NOT NULL COMMENT "一句话描述",
    126                         PRIMARY KEY (`id`)
    127                     ) ENGINE = MyISAM
    128             """
    129         cursor.execute(sql)
    130         database.commit()
    131         database.close()
    132 
    133     def update_database(self):
    134         database = self.database_connect()
    135         cursor = database.cursor()
    136         # 后面我感觉这个电影评分用decimal类型比较好,这种数据类型能精确到小数位几位,所以要更改数据库
    137         sql = """
    138              ALTER TABLE `top250` CHANGE `reviewScore` `reviewScore` DECIMAL(2,1) NOT NULL COMMENT '评分';
    139         """
    140         cursor.execute(sql)  # 再次运行sql语句
    141         database.commit()  # 提交数据库
    142         database.close()
    143 
    144     def insert_to_table(self):
    145         with self.database_connect() as dbconnect:
    146             datas = self.parse_html_by_css()
    147             cursor = dbconnect.cursor()
    148             for title, directors, actors, year, country, movieType, reviewScore, reviewCount, oneWordDes in datas:
    149                 # sql语句一定要注意语法,特别是{}两边的引号
    150                 sql = "INSERT INTO top250 (movieTitle, directors, actors, year, country, movieType, reviewScore, reviewCount, description) " \
    151                   "VALUES ('{}','{}','{}','{}','{}','{}','{}','{}','{}');".format(title, directors, actors, year, country, movieType, reviewScore, reviewCount, oneWordDes)
    152                 # insertSql = "INSERT INTO top250 (movieTitle, directors, actors, year, country, movieType, reviewScore, reviewCount, description) VALUES('肖申克的救赎', '导演: 弗兰克·德拉邦特 Frank Darabont', '主演: 蒂姆·罗宾斯 Tim Robbins /...', '1994', '美国', '犯罪 剧情', '9.7', '2518046', '希望让人自由。');"
    153                 cursor.execute(sql)
    154                 dbconnect.commit()
    155 
    156     def run(self):
    157         self.insert_to_table()
    158 if __name__ == "__main__":
    159     for filter in range(0, 225+1, 25):
    160         url = f'https://movie.douban.com/top250?start={filter}&filter='
    161         app = douBanSpider(url=url)
    162         # app.database_connect() # 连接数据
    163         # app.create_table() # 创建数据表
    164         # app.update_database() # 更新数据库,将评分类型改为decimal
    165         # app.insert_to_table() # 写入数据
    166         app.run()

    暂时没尝试服务器数据库的写入速度,本地数据库的写入速度相当于秒入。

  • 相关阅读:
    洛谷 P1443 马的遍历 BFS
    洛谷 P1583 魔法照片 快排
    洛谷 P1093 奖学金 冒泡排序
    洛谷 P3811 【模板】乘法逆元 如题
    洛谷 P3384 【模板】树链剖分 如题
    洛谷 P3379 【模板】最近公共祖先(LCA) 如题
    vijos 信息传递 tarjan找环
    洛谷 P3373 【模板】线段树 2 如题(区间加法+区间乘法+区间求和)
    酒厂选址
    ⑨要写信
  • 原文地址:https://www.cnblogs.com/mafu/p/15745751.html
Copyright © 2011-2022 走看看