zoukankan      html  css  js  c++  java
  • Python爬取猫眼电影案例

     1 from urllib import request
     2 from urllib import parse
     3 import time
     4 import re
     5 import pymysql
     6 
     7 class MaoyanSpider(object):
     8     def __init__(self):
     9         self.baseurl = 'https://maoyan.com/board/4?offset='
    10         self.headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'}
    11         # 爬取页数计数
    12         self.page = 1
    13         # 创建2个对象
    14         self.db = pymysql.connect(
    15             'localhost','root','123456','spider',
    16             charset='utf8'
    17         )
    18         self.cursor = self.db.cursor()
    19 
    20 
    21     # 获取页面
    22     def get_page(self,url):
    23         req = request.Request(url,headers=self.headers)
    24         res = request.urlopen(req)
    25         html = res.read().decode('utf-8')
    26         # 直接调用解析函数
    27         self.parse_page(html)
    28 
    29     # 解析页面
    30     def parse_page(self,html):
    31         # 正则解析
    32         p = re.compile('<div class="movie-item-info">.*?title="(.*?)".*?class="star">(.*?)</p>.*?releasetime">(.*?)</p>',re.S)
    33         r_list = p.findall(html)
    34         # r_list : [('霸王别姬','张国荣','1993'),(),()]
    35         self.write_page(r_list)
    36 
    37     # 保存数据(存到mysql数据库)
    38     def write_page(self,r_list):
    39         ins = 'insert into film(name,star,time) 
    40                values(%s,%s,%s)'
    41         for rt in r_list:
    42             film_list = [
    43                 rt[0].strip(),
    44                 rt[1].strip(),
    45                 rt[2].strip()[5:15]
    46              ]
    47 
    48             self.cursor.execute(ins,film_list)
    49             # 提交到数据库执行
    50             self.db.commit()
    51 
    52     # 主函数
    53     def main(self):
    54         # 用range函数可获取某些查询参数的值
    55         for offset in range(0,41,10):
    56             url = self.baseurl + str(offset)
    57             self.get_page(url)
    58             print('第%d页爬取成功' % self.page)
    59             self.page += 1
    60             time.sleep(1)
    61         # 等所有页面爬完后再关闭
    62         self.cursor.close()
    63         self.db.close()
    64 
    65 if __name__ == '__main__':
    66     spider = MaoyanSpider()
    67     spider.main()
  • 相关阅读:
    福大软工1816 · 第六次作业
    福大软工1816 · 第五次作业
    BETA 版冲刺前准备
    Alpha 事后诸葛亮(团队)
    Alpha 答辩总结
    Alpha 冲刺 (10/10)
    Alpha 冲刺 (9/10)
    Alpha 冲刺 (8/10)
    Alpha 冲刺 (7/10)
    Alpha 冲刺 (6/10)
  • 原文地址:https://www.cnblogs.com/OmySql/p/10796199.html
Copyright © 2011-2022 走看看