  • Data persistence storage

    I. Data persistence - CSV files

    1. Purpose

      Save the scraped data to a local CSV file.

    2. Workflow

    1. Import the module
    2. Open the CSV file
    3. Initialize the writer object
    4. Write the data (the argument is a list, or a list of rows)
    import csv
    
    with open('film.csv','w') as f:
        writer = csv.writer(f)
        writer.writerow([])
        writer.writerows([(),(),()])
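
    To sanity-check the write, here is a minimal read-back sketch (not in the original) using the standard-library csv.reader on the film.csv file assumed above:

    import csv
    
    # read the rows back to verify the write
    with open('film.csv', 'r', newline='') as f:
        reader = csv.reader(f)
        for row in reader:
            print(row)  # each row comes back as a list of strings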

    3. Sample code

    Create a test.csv file and write data into it.

    # Single-row writes (writerow([]))
    import csv
    with open('test.csv','w',newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['步惊云','36'])
        writer.writerow(['聂风','36'])
    
    # Multi-row write (writerows([(),(),()]))
    import csv
    with open('test.csv','w',newline='') as f:
        writer = csv.writer(f)
        writer.writerows([('聂风','36'),('秦霜','25'),('孔慈','30')])
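
    As an alternative sketch (not part of the original), the standard-library csv.DictWriter writes dict rows plus a header row; the field names below are only illustrative:

    import csv
    with open('test.csv', 'w', newline='') as f:
        # the fieldnames become the header row (the names here are assumptions)
        writer = csv.DictWriter(f, fieldnames=['name', 'age'])
        writer.writeheader()
        writer.writerows([
            {'name': '聂风', 'age': '36'},
            {'name': '秦霜', 'age': '25'},
        ])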

    Exercise: save the Maoyan movie data to a local maoyanfilm.csv file using the writerow() method.

    Think about it: how would you implement the same thing with writerows()?

    import csv
    from urllib import request, parse
    import re
    import time
    import random
    from useragents import ua_list
    
    
    class MaoyanSpider(object):
      def __init__(self):
        self.url = 'https://maoyan.com/board/4?offset={}'
        # counter
        self.num = 0
    
    
      def get_html(self, url):
        headers = {
          'User-Agent': random.choice(ua_list)
        }
        req = request.Request(url=url, headers=headers)
        res = request.urlopen(req)
        html = res.read().decode('utf-8')
        # call the parsing function directly
        self.parse_html(html)
    
      def parse_html(self, html):
        # compile the regular expression
        re_ = '<div class="movie-item-info">.*?title="(.*?)".*?<p class="star">(.*?)</p>.*?<p class="releasetime">(.*?)</p> '
        pattern = re.compile(re_, re.S)
        # film_list:[('霸王别姬','张国荣','1993')]
        film_list = pattern.findall(html)
        self.write_html(film_list)
    
      # write to the CSV file - writerows
      def write_html(self, film_list):
        L = []
        with open('maoyanfilm.csv', 'a',newline='') as f:
          # initialize the writer object (don't forget to pass f)
          writer = csv.writer(f)
          for film in film_list:
            t = (
              film[0].strip(),
              film[1].strip(),
              film[2].strip()[5:15]
            )
            self.num += 1
            L.append(t)
          # writerows() takes a list of rows
          writer.writerows(L)
          print(L)
    
      # def write_html(self,film_list):
      #   with open('maoyanfilm.csv','a') as f:
      #     # initialize the writer object (don't forget to pass f)
      #     writer=csv.writer(f)
      #     for film in film_list:
      #       L=[
      #         film[0].strip(),
      #         film[1].strip(),
      #         film[2].strip()[5:15]
      #       ]
      #       self.num+=1
      #       # writerow() takes a list
      #       writer.writerow(L)
    
      def main(self):
        for offset in range(0, 91, 10):
          url = self.url.format(offset)
          self.get_html(url)
          time.sleep(random.randint(1, 2))
        print('Total items scraped:', self.num)
    
    
    if __name__ == '__main__':
      start = time.time()
      spider = MaoyanSpider()
      spider.main()
      end = time.time()
      print('Execution time: %.2f' % (end - start))
    writerow() / writerows() methods
    ua_list = [
      'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
      'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
      'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201',
      'Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50',
    
    ]
    useragents.py configuration file

    II. Data persistence - MySQL database

    1. Create the database and table

    # connect to the MySQL server
    mysql -h127.0.0.1 -uroot -p123456
    # create the database and the table
    create database maoyandb charset utf8;
    use maoyandb;
    create table filmtab(
    name varchar(100),
    star varchar(300),
    time varchar(50)
    )charset=utf8;

    2. A quick review of basic pymysql usage

    import pymysql
    
    # create the two objects: connection and cursor
    db = pymysql.connect(host='localhost', user='root', password='123456',
                         db='maoyandb', charset='utf8')
    cursor = db.cursor()
    
    # execute the SQL statement and commit it to the database
    # the second argument of execute() is a list that fills the %s placeholders
    cursor.execute('insert into filmtab values(%s,%s,%s)',['霸王别姬','张国荣','1993'])
    db.commit()
    
    # close
    cursor.close()
    db.close()

    3. Try the more efficient executemany() method

    import pymysql
    
    # create the two objects: connection and cursor
    db = pymysql.connect(host='192.168.153.137', user='tiger', password='123456',
                         db='maoyandb', charset='utf8')
    cursor = db.cursor()
    
    # the scraped data
    film_list = [('月光宝盒','周星驰','1994'),('大圣娶亲','周星驰','1994')]
    
    # execute the SQL statement once per row and commit to the database
    # the second argument of executemany() is a list of tuples, one per row
    cursor.executemany('insert into filmtab values(%s,%s,%s)',film_list)
    db.commit()
    
    # close
    cursor.close()
    db.close()
    import pymysql
    
    # __init__(self):
        self.db = pymysql.connect('IP',... ...)
        self.cursor = self.db.cursor()
    # write_data(self):
        self.cursor.execute('sql',[data1])
        self.cursor.executemany('sql',[(data1),(data2),(data3)])
        self.db.commit()
    # main(self):
        self.cursor.close()
        self.db.close()

    4. Store the movie data in the MySQL database (prefer the executemany() method)

    import pymysql
    from urllib import request, parse
    import re
    import time
    import random
    from useragents import ua_list
    
    
    class MaoyanSpider(object):
      def __init__(self):
        self.url = 'https://maoyan.com/board/4?offset={}'
        # counter
        self.num = 0
        self.db = pymysql.connect(
          host='localhost', user='root', password='123456',
          db='maoyandb', charset='utf8'
        )
        self.cursor = self.db.cursor()
    
      def get_html(self, url):
        headers = {
          'User-Agent': random.choice(ua_list)
        }
        req = request.Request(url=url, headers=headers)
        res = request.urlopen(req)
        html = res.read().decode('utf-8')
        # call the parsing function directly
        self.parse_html(html)
    
      def parse_html(self, html):
        # compile the regular expression
        re_ = '<div class="movie-item-info">.*?title="(.*?)".*?<p class="star">(.*?)</p>.*?<p class="releasetime">(.*?)</p> '
        pattern = re.compile(re_, re.S)
        # film_list:[('霸王别姬','张国荣','1993')]
        film_list = pattern.findall(html)
        self.write_html(film_list)
    
      def write_html(self, film_list):
        ins = 'insert into filmtab values(%s,%s,%s)'
        for film in film_list:
          L = [
            film[0].strip(),
            film[1].strip(),
            film[2].strip()[5:15]
          ]
          self.num+=1
          print('Scraped record', self.num)
          self.cursor.execute(ins, L)
          # commit to the database
          self.db.commit()
    
      def main(self):
        for offset in range(0, 91, 10):
          url = self.url.format(offset)
    
          self.get_html(url)
          time.sleep(random.randint(1, 2))
        print('Total items scraped:', self.num)
        # disconnect from the database
        self.cursor.close()
        self.db.close()
    
    
    if __name__ == '__main__':
      start = time.time()
      spider = MaoyanSpider()
      spider.main()
      end = time.time()
      print('Execution time: %.2f' % (end - start))
    execute() example
    import pymysql
    from urllib import request, parse
    import re
    import time
    import random
    from useragents import ua_list
    
    
    class MaoyanSpider(object):
      def __init__(self):
        self.url = 'https://maoyan.com/board/4?offset={}'
        # counter
        self.num = 0
        self.db = pymysql.connect(
          host='localhost', user='root', password='123456',
          db='maoyandb', charset='utf8'
        )
        self.cursor = self.db.cursor()
    
      def get_html(self, url):
        headers = {
          'User-Agent': random.choice(ua_list)
        }
        req = request.Request(url=url, headers=headers)
        res = request.urlopen(req)
        html = res.read().decode('utf-8')
        # call the parsing function directly
        self.parse_html(html)
    
      def parse_html(self, html):
        # compile the regular expression
        re_ = '<div class="movie-item-info">.*?title="(.*?)".*?<p class="star">(.*?)</p>.*?<p class="releasetime">(.*?)</p> '
        pattern = re.compile(re_, re.S)
        # film_list:[('霸王别姬','张国荣','1993')]
        film_list = pattern.findall(html)
        self.write_html(film_list)
      #mysql-executemany
      def write_html(self, film_list):
        L=[]
        ins = 'insert into filmtab values(%s,%s,%s)'
        for film in film_list:
          t = [
            film[0].strip(),
            film[1].strip(),
            film[2].strip()[5:15]
          ]
          self.num+=1
          L.append(t)
          print('Scraped record', self.num)
        self.cursor.executemany(ins, L)
        # commit to the database
        self.db.commit()
    
      def main(self):
        for offset in range(0, 91, 10):
          url = self.url.format(offset)
    
          self.get_html(url)
          time.sleep(random.randint(1, 2))
        print('Total items scraped:', self.num)
        # disconnect from the database
        self.cursor.close()
        self.db.close()
    
    
    if __name__ == '__main__':
      start = time.time()
      spider = MaoyanSpider()
      spider.main()
      end = time.time()
      print('Execution time: %.2f' % (end - start))
    executemany() method

    5. Run a couple of SQL queries

    1. Name and release date of movies released 20 or more years ago
       select name,time from filmtab where time<=(now()-interval 20 year);
    2. Name and release date of movies released between 1990 and 2000
       select name,time from filmtab where time>='1990-01-01' and time<='2000-12-31';
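
    The same queries can also be run from Python; a minimal sketch assuming the maoyandb database and filmtab table created above, with the same connection parameters as the earlier examples:

    import pymysql
    
    db = pymysql.connect(host='localhost', user='root', password='123456',
                         db='maoyandb', charset='utf8')
    cursor = db.cursor()
    
    # movies released between 1990 and 2000 (time is stored as a 'YYYY-MM-DD' string)
    cursor.execute(
        "select name,time from filmtab where time>=%s and time<=%s",
        ['1990-01-01', '2000-12-31']
    )
    for name, release_time in cursor.fetchall():
        print(name, release_time)
    
    cursor.close()
    db.close()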

    III. Data persistence - MongoDB database

    (1) Characteristics

      1. A non-relational database

      2. Databases / collections (the counterpart of a MySQL table) / documents (the counterpart of a MySQL table row)

      3. No need to create databases or collections by hand

      4. Basic commands

    # list all databases
        >show dbs
    # switch to a database
        >use <db_name>
    # list all collections in the current database
        >show collections
    # show every document in a collection
        >db.<collection>.find().pretty()
    # count the documents in a collection
        >db.<collection>.count()

    (2) Working with MongoDB through pymongo

    import pymongo
    # 1. connection object
    conn = pymongo.MongoClient(
      host='127.0.0.1',
      port=27017
    )
    # 2. database object
    db = conn['maoyandb'] # first way
    # db = conn.maoyandb  # second way
    # 3. collection object
    myset = db['filmtab']
    # 4. insert documents
    # insert a single document
    myset.insert_one({'name':'赵敏'})
    # insert several documents - insert_many() takes a list of dicts
    myset.insert_many([{'name':'赵敏'},{'name':'周芷若'}])
    import pymongo
    
    # __init__(self):
        self.conn = pymongo.MongoClient('IP',27017)
        self.db = self.conn['db_name']
        self.myset = self.db['set_name']
    # write_data(self):
        self.myset.insert_one(dict)
    
    # MongoDB - Commands
    >show dbs
    >use db_name
    >show collections
    >db.collection_name.find().pretty()
    >db.collection_name.count()
    >db.collection_name.drop()
    >db.dropDatabase()
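
    To read the data back from Python, a minimal sketch (not in the original) using pymongo's find() and count_documents() (the latter requires pymongo 3.7+), assuming the maoyandb database and filmtab collection above:

    import pymongo
    
    conn = pymongo.MongoClient(host='127.0.0.1', port=27017)
    db = conn['maoyandb']
    myset = db['filmtab']
    
    # count all documents in the collection
    print(myset.count_documents({}))
    # iterate over the documents
    for doc in myset.find():
        print(doc)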

    Exercise: store the movie data in the MongoDB database

    import pymongo
    from urllib import request
    import re
    import time
    import random
    from useragents import ua_list
    
    
    class MaoyanSpider(object):
      def __init__(self):
        self.url = 'https://maoyan.com/board/4?offset={}'
        # counter
        self.num = 0
        # create the three objects: client, database handle, collection
        self.conn = pymongo.MongoClient(host='127.0.0.1',port=27017)
        self.db = self.conn['maoyandb']
        self.myset = self.db['filmset']
    
      def get_html(self, url):
        headers = {
          'User-Agent': random.choice(ua_list)
        }
        req = request.Request(url=url, headers=headers)
        res = request.urlopen(req)
        html = res.read().decode('utf-8')
        # call the parsing function directly
        self.parse_html(html)
    
      def parse_html(self, html):
        # compile the regular expression
        re_ = '<div class="movie-item-info">.*?title="(.*?)".*?<p class="star">(.*?)</p>.*?<p class="releasetime">(.*?)</p> '
        pattern = re.compile(re_, re.S)
        # film_list:[('霸王别姬','张国荣','1993')]
        film_list = pattern.findall(html)
        self.write_html(film_list)
    
      # write to MongoDB
      def write_html(self, film_list):
        for film in film_list:
          film_dict = {
            'name':film[0].strip(),
            'star':film[1].strip(),
            'time':film[2].strip()[5:15]
          }
          self.num+=1
          # insert into the MongoDB collection
          self.myset.insert_one(film_dict)
          print('Scraped record', self.num)
    
    
      def main(self):
        for offset in range(0, 91, 10):
          url = self.url.format(offset)
    
          self.get_html(url)
          time.sleep(random.randint(1, 2))
        print('Total items scraped:', self.num)
    
    
    if __name__ == '__main__':
      start = time.time()
      spider = MaoyanSpider()
      spider.main()
      end = time.time()
      print('Execution time: %.2f' % (end - start))
    Code

    Multi-level page scraping

    # Overall idea
    1. Crawl the first-level page, extract the required data plus the links, and follow them.
    2. Crawl the second-level pages, extract the required data plus the links, and follow them.
    3. ... ...
    # How to structure the code (see the sketch below)
    1. All the data is ultimately collected in the first-level page's parsing function.
    2. To avoid duplicated code, define separate functions for requesting and parsing.
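
    A minimal, generic sketch of that structure (the class, method names, and regexes here are illustrative, not from the original; the real case study follows below):

    from urllib import request
    import re
    
    
    class TwoLevelSpider(object):
      """Illustrative skeleton: one shared request helper, one parse function per level."""
      def get_html(self, url):
        # shared request function used by every level
        req = request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        return request.urlopen(req).read().decode('utf-8', 'ignore')
    
      def parse_one_page(self, html):
        # extract (link, name) pairs, then follow each link into level two
        items = []
        for link, name in re.findall(r'<a href="(.*?)">(.*?)</a>', html, re.S):
          detail = self.parse_two_page(self.get_html(link))
          items.append((name, detail))  # all the data ends up here
        return items
    
      def parse_two_page(self, html):
        # extract the extra field that only exists on the detail page
        m = re.search(r'<td>(.*?)</td>', html, re.S)
        return m.group(1).strip() if m else ''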

    Case study: dytt8.net (Movie Heaven) - scraping second-level pages

    The task:

    # Site
    dytt8.net (Movie Heaven) - 2019 new releases - "More" listing page
    # Target fields
    Movie name, download link
    
    # Analysis
    ********* To extract from the first-level page *********
            1. Movie name
            2. Movie detail-page link
            
    ********* To extract from the second-level page *********
            1. Download link

    Implementation steps:

      1. Confirm that the data we want actually appears in the response content.

      2. Work out the URL pattern.

    Page 1: https://www.dytt8.net/html/gndy/dyzz/list_23_1.html
    Page 2: https://www.dytt8.net/html/gndy/dyzz/list_23_2.html
    Page n: https://www.dytt8.net/html/gndy/dyzz/list_23_n.html

    Write the regular expressions:

    1. First-level page regex
       <table width="100%".*?<td width="5%".*?<a href="(.*?)".*?ulink">(.*?)</a>.*?</table>
    2. Second-level page regex
       <td style="WORD-WRAP.*?>.*?>(.*?)</a>
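
    Both patterns are compiled with re.S in the code below so that '.' also matches newlines in the multi-line HTML; a minimal illustration with a made-up snippet:

    import re
    
    html = '<a href="/html/x.html"\n   class="ulink">幸福猎人</a>'
    p_plain  = re.compile(r'href="(.*?)".*?ulink">(.*?)</a>')
    p_dotall = re.compile(r'href="(.*?)".*?ulink">(.*?)</a>', re.S)
    print(p_plain.findall(html))   # [] - without re.S, '.' stops at the newline
    print(p_dotall.findall(html))  # [('/html/x.html', '幸福猎人')]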

    Implementation:

    import csv
    from urllib import request
    import re
    from useragents import ua_list
    import time
    import random
    
    
    class FilmSkySpider(object):
      def __init__(self):
        self.url = 'https://www.dytt8.net/html/gndy/dyzz/list_23_{}.html'
    
      # fetch the response content
      def get_html(self, url):
        headers = {'User-Agent': random.choice(ua_list)}
        req = request.Request(url=url, headers=headers)
        res = request.urlopen(req)
        # the page source declares charset=gb2312
        # pass 'ignore' so a few undecodable characters don't raise an error
        html = res.read().decode('gb2312','ignore')
        return html
    
      # helper: compile the regex and run findall
      def re_func(self, re_bds, html):
        pattern = re.compile(re_bds, re.S)
        r_list = pattern.findall(html)
        return r_list
      # extract the data; html is the first-level page's response content
      def parse_page(self, html):
        # get the movie name and the detail-page link
        re_bds = r'<table width="100%".*?<td width="5%".*?<a href="(.*?)".*?ulink">(.*?)</a>.*?</table>'
        # one_page_list: [('/html/xxx','幸福猎人'), ...]
        one_page_list = self.re_func(re_bds, html)
        with open('movefilm.csv', 'a', newline='') as f:
          # initialize the writer object (don't forget to pass f)
          writer = csv.writer(f)
          rows = []
          for film in one_page_list:
              name = film[1].strip()
              link = 'https://www.dytt8.net' + film[0]
              download = self.parse_two_page(link)
              time.sleep(random.uniform(1, 3))
              print(name, download)
              rows.append((name, download))
          # writerows() takes a list of rows
          writer.writerows(rows)
    
      # parse the second-level (detail) page
      def parse_two_page(self, link):
        html = self.get_html(link)
        re_bds = r'<td style="WORD-WRAP.*?>.*?>(.*?)</a>'
        # two_page_list:['ftp://xxxx.mkv']
        two_page_list = self.re_func(re_bds, html)
        download = two_page_list[0].strip() if two_page_list else ''
        return download
    
      def main(self):
        for page in range(1, 11):
          url = self.url.format(page)
          html = self.get_html(url)
          self.parse_page(html)
    
    
    
    if __name__ == '__main__':
      start = time.time()
      spider = FilmSkySpider()
      spider.main()
      end = time.time()
      print('Execution time: %.2f' % (end - start))
    Code

    Exercise: store the dytt8.net (Movie Heaven) data in a MySQL database
