zoukankan      html  css  js  c++  java
  • 猫眼电影爬取

    '''爬取猫眼电影TOP100,并将其保存'''
    from urllib import request
    import re
    import csv
    import time
    import random
    import os
    
    
    class MaoyanSpider:
        """Scrape the Maoyan TOP100 movie board and store it as a CSV file.

        ``main`` walks the ten board pages (offsets 0, 10, ..., 90), and for each
        page the pipeline is ``get_page`` -> ``parse_page`` -> ``save_info``.
        Every film becomes one CSV row of (title, stars, release time) in
        ``./maoyan.csv``, relative to the current working directory.
        """

        # Output file shared by ``main`` (header) and ``save_info`` (data rows).
        CSV_PATH = './maoyan.csv'

        # Seconds to wait on the network before giving up; without a timeout a
        # stalled connection would block the whole run indefinitely.
        REQUEST_TIMEOUT = 10

        # Captures (title, stars, release time) for each film entry.  re.S lets
        # '.' span newlines because one entry covers several lines of markup.
        # Compiled once here instead of on every parsed page.
        _FILM_PATTERN = re.compile(
            r'<div class="movie-item-info">.*?title="(.*?)".*?class="star">(.*?)</p>.*?class="releasetime">(.*?)</p>',
            re.S)

        def __init__(self):
            # URL template; ``{}`` is filled with the paging offset.
            self.url = 'https://maoyan.com/board/4?offset={}'
            # Pool of User-Agent strings; one is picked at random per request.
            self.ua_list = [
                'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
                'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
                'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
                'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; GTB7.0)',
                'Mozilla/5.0 (Windows; U; Windows NT 6.1; ) AppleWebKit/534.12 (KHTML, like Gecko) Maxthon/3.0 Safari/534.12',
                'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.472.33 Safari/534.3 SE 2.X MetaSr 1.0',
                'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201'
            ]

        def get_page(self, url):
            """Download ``url``, decode it as UTF-8 and hand it to ``parse_page``.

            Errors raised by ``urlopen`` (including a timeout after
            ``REQUEST_TIMEOUT`` seconds) and ``UnicodeDecodeError`` for a body
            that is not valid UTF-8 propagate to the caller.
            """
            # Use a random User-Agent for every request.
            headers = {'User-Agent': random.choice(self.ua_list)}
            req = request.Request(url=url, headers=headers)
            # The context manager closes the response (and its socket) even if
            # reading or decoding fails.
            with request.urlopen(req, timeout=self.REQUEST_TIMEOUT) as res:
                html = res.read().decode('utf-8')
            self.parse_page(html)

        def parse_page(self, html):
            """Extract (title, stars, release time) tuples from ``html`` and save them.

            A page without any matching entry results in no rows being written.
            """
            self.save_info(self._FILM_PATTERN.findall(html))

        def save_info(self, r_list):
            """Append the films in ``r_list`` to the CSV file.

            ``r_list`` is an iterable of 3-item sequences (title, stars, release
            time); surrounding whitespace is stripped from each field.
            """
            film_list = [
                (rt[0].strip(), rt[1].strip(), rt[2].strip()) for rt in r_list
            ]
            with open(self.CSV_PATH, 'a', encoding='utf-8', newline='') as f:
                # Write all rows of the page in one call.
                csv.writer(f).writerows(film_list)

        def main(self):
            """Recreate the CSV file with its header row, then scrape all ten pages."""
            # Mode 'w' truncates any previous output in a single step, so no
            # separate exists-then-remove check is needed before writing.
            with open(self.CSV_PATH, 'w', encoding='utf-8', newline='') as f:
                csv.writer(f).writerow(['电影名称', '主演', '上映时间'])
            # NOTE(review): pages are requested back to back with no delay;
            # consider a short random pause here if the site throttles clients.
            for page_no, offset in enumerate(range(0, 91, 10), start=1):
                self.get_page(self.url.format(offset))
                print('第{}页成功下载'.format(page_no))
    
    
    if __name__ == '__main__':
        # Script entry point: run the full scrape and report how long it took.
        # perf_counter() is a monotonic, high-resolution clock, so the measured
        # duration is not distorted by system clock adjustments during the run.
        start = time.perf_counter()
        spider = MaoyanSpider()
        spider.main()
        elapsed = time.perf_counter() - start
        print('程序执行时间为: %.2f' % elapsed)
  • 相关阅读:
    17种正则表达式
    网页滚动条的处理
    My GIS 2012
    spring 的mvc项目启动报错:java.util.zip.ZipException
    ASP.NET读取XML文件的方法
    urlrewritingnet重写的几点
    ASP.NET 伪静态页面的实现
    URL重写组件UrlRewriter 在Windows XP下的运用
    我们搞web开发,总结一些常用功能源码
    图片二进制存取
  • 原文地址:https://www.cnblogs.com/yuxiangyang/p/11212544.html
Copyright © 2011-2022 走看看