zoukankan      html  css  js  c++  java
  • 多种方法爬取猫眼电影Top100排行榜,保存到csv文件,下载封面图

    参考链接:

    https://blog.csdn.net/BF02jgtRS00XKtCx/article/details/83663400

    https://www.makcyun.top/web_scraping_withpython1.html

    因猫眼网站有些更新,参考链接中的部分代码执行报错,特修改一下

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    
    import csv
    import re
    from multiprocessing.pool import Pool
    
    import requests
    from bs4 import BeautifulSoup
    from lxml import etree
    from requests.exceptions import RequestException
    
    
    def get_one_page(url):
        """Fetch *url* and return its HTML text, or None on any failure.

        Returns None both for non-200 responses and for network-level
        errors (DNS, connection, timeout), so callers get a single
        "no page" signal to handle.
        """
        try:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0'
            }
            # timeout keeps a stalled server from hanging the worker process forever
            response = requests.get(url, headers=headers, timeout=10)
            if response.status_code == 200:
                return response.text
            return None
        except RequestException:
            return None
    
    
    # Get the full-size cover image URL
    def get_thumb(url):
        """Strip the resize suffix from a Maoyan thumbnail URL.

        e.g. 'https://p0.meituan.net/movie/5420...jpg@160w_220h_1e_1c'
        becomes 'https://p0.meituan.net/movie/5420...jpg' — removing the
        '@160w_220h_1e_1c' part yields the full-size image.

        Returns *url* unchanged when it carries no '@' suffix (the
        original crashed with AttributeError on such URLs).
        """
        match = re.search(r'(.*?)@', url)
        if match is None:
            return url
        return match.group(1)
    
    
    # Extract the release date from a release string
    def get_release_time(data):
        """Return the date part of a release string.

        e.g. '2018-09-30（中国香港）' -> '2018-09-30'; a string with no
        region suffix (e.g. '1994-09-10') is returned whole.  Returns
        '未知' (unknown) if nothing matches.

        NOTE(review): the scraped original read r'(.*?)((|$)', an
        unbalanced pattern — the fullwidth '（' was mangled; reconstructed
        here.
        """
        # lazy match up to the first fullwidth '（' (region suffix) or end of string
        match = re.search(r'(.*?)(（|$)', data)
        if match is None:
            return '未知'
        return match.group(1)
    
    
    # Extract the country/region from a release string
    def get_release_area(data):
        """Return the region inside the fullwidth parentheses.

        e.g. '2018-09-30（中国香港）' -> '中国香港'.  Returns '未知'
        (unknown) when the string has no parenthesized region.

        NOTE(review): the scraped original read r'.*((.*))' — the
        fullwidth '（' '）' were mangled; reconstructed here.
        """
        match = re.search(r'.*（(.*)）', data)
        if match is None:
            return '未知'
        return match.group(1)
    
    
    # Regular-expression based parser
    def parse_one_page(html):
        """Parse one ranking page with a regex; yield one dict per movie.

        Each dict has keys: index, thumb, name, star, time, area, score —
        matching write_to_file3's CSV fieldnames.
        """
        pattern = re.compile(
            r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',
            re.S)  # re.S makes '.' match newlines too; the index group is \d+ (the scraped original lost the backslash)
        items = re.findall(pattern, html)
        for item in items:
            release = item[4].strip()[5:]  # drop the '上映时间:' label once
            yield {
                'index': item[0],
                'thumb': get_thumb(item[1]),  # strip the resize suffix for the full-size cover
                'name': item[2],
                'star': item[3].strip()[3:],  # drop the '主演:' label
                'time': get_release_time(release),
                'area': get_release_area(release),
                # the score is rendered as integer part + fraction part
                'score': item[5].strip() + item[6].strip()
            }
    
    
    # lxml + XPath based parser
    def parse_one_page2(html):
        """Parse one ranking page with lxml/XPath; yield one dict per movie.

        Keys match write_to_file3's fieldnames.  The original yielded a
        'realease_time' key, which csv.DictWriter rejects (ValueError:
        dict contains fields not in fieldnames); renamed to 'time'.
        """
        parse = etree.HTML(html)
        items = parse.xpath('/html/body/div[4]//div//dd')
        for item in items:
            # read the release string once; both time and area derive from it
            release = item.xpath('.//p[@class="releasetime"]/text()')[0].strip()[5:]
            yield {
                'index': item.xpath('./i/text()')[0],
                'thumb': get_thumb(str(item.xpath('./a/img[2]/@data-src')[0].strip())),
                'name': item.xpath('./div/div/div[1]/p[1]/a/@title')[0],
                'star': item.xpath('.//p[@class="star"]/text()')[0].strip()[3:],
                'time': get_release_time(release),
                'area': get_release_area(release),
                'score': item.xpath('./div/div/div[2]/p/i[1]/text()')[0]
                         + item.xpath('./div/div/div[2]/p/i[2]/text()')[0],
            }
    
    # BeautifulSoup + CSS-selector based parser
    def parse_one_page3(html):
        """Parse one ranking page via CSS selectors; yield one dict per movie."""
        soup = BeautifulSoup(html, 'lxml')
        indexes = soup.select('i.board-index')
        covers = soup.select('.board-img')
        names = soup.select('.name a')
        stars = soup.select('.star')
        releases = soup.select('.releasetime')
        integers = soup.select('.integer')
        fractions = soup.select('.fraction')
        for i in range(10):
            release = releases[i].string.strip()[5:]
            yield {
                'index': indexes[i].string,
                'thumb': get_thumb(covers[i]['data-src']),
                'name': names[i].string,
                'star': stars[i].string.strip()[3:],
                'time': get_release_time(release),
                'area': get_release_area(release),
                'score': integers[i].string + fractions[i].string
            }
    
    # BeautifulSoup + find_all based parser
    def parse_one_page4(html):
        """Parse one ranking page via find_all; yield one dict per movie."""
        soup = BeautifulSoup(html, 'lxml')
        for pos in range(10):
            release = soup.find_all(class_='releasetime')[pos].string.strip()[5:]
            yield {
                'index': soup.find_all(class_='board-index')[pos].string,
                'thumb': get_thumb(soup.find_all(class_='board-img')[pos].attrs['data-src']),
                'name': soup.find_all(name='p', attrs={'class': 'name'})[pos].string,
                'star': soup.find_all(name='p', attrs={'class': 'star'})[pos].string.strip()[3:],
                'time': get_release_time(release),
                'area': get_release_area(release),
                'score': soup.find_all(name='i', attrs={'class': 'integer'})[pos].string
                         + soup.find_all(name='i', attrs={'class': 'fraction'})[pos].string
            }
    
    
    # Persist one record to the CSV file
    def write_to_file3(item):
        """Append one movie dict as a row of 猫眼top100.csv.

        Opened in 'a' (append) mode so successive calls accumulate rows;
        utf_8_sig keeps the Chinese text from showing as mojibake when the
        CSV is opened in Excel.  No header row is written, matching the
        original behaviour.
        """
        fieldnames = ['index', 'thumb', 'name', 'star', 'time', 'area', 'score']
        with open('猫眼top100.csv', 'a', encoding='utf_8_sig', newline='') as csv_file:
            writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
            writer.writerow(item)
    
    
    # Download one cover image
    def download_thumb(name, url, num):
        """Download the cover at *url* into 封面图/<name>.jpg.

        *num* is only used in the progress message.  Network/HTTP errors
        are printed and swallowed so one bad image does not abort the run.
        """
        import os
        try:
            # the original assumed 封面图/ already existed and crashed otherwise
            os.makedirs('封面图', exist_ok=True)
            response = requests.get(url, timeout=10)
            # don't silently save an HTTP error page as a .jpg;
            # HTTPError subclasses RequestException so it is caught below
            response.raise_for_status()
            # must be 'wb', not 'w': the image is binary data
            with open('封面图/' + name + '.jpg', 'wb') as f:
                f.write(response.content)
            print('第%s部电影封面下载完毕' % num)
            print('------')
        except RequestException as e:
            print(e)
    
    
    def main(offset):
        """Scrape one ranking page (10 movies starting at *offset*),
        write each movie to the CSV, and download its cover image."""
        url = 'http://maoyan.com/board/4?offset=' + str(offset)
        html = get_one_page(url)
        if html is None:
            # get_one_page returns None on failure; the original passed it
            # straight to the parser and crashed the worker process
            return
        for item in parse_one_page4(html):
            write_to_file3(item)
            download_thumb(item['name'], item['thumb'], item['index'])
    
    
    if __name__ == '__main__':
        # scrape the 10 pages (offset 0, 10, ..., 90) in parallel workers
        pool = Pool()
        try:
            pool.map(main, [i * 10 for i in range(10)])
        finally:
            # the original leaked the pool; release the worker processes
            pool.close()
            pool.join()
  • 相关阅读:
    MVC之Servlet控制器(二)
    MVC之Servlet控制器(一)
    基于Java实现批量下载网络图片
    @ModelAttribute运用详解
    MyBatis
    理解RESTful架构
    并行计算结课论文边写边总结2
    并行计算结课论文边写边总结(1)
    CUDA笔记(六)
    ubuntu12.04
  • 原文地址:https://www.cnblogs.com/sanduzxcvbnm/p/10246159.html
Copyright © 2011-2022 走看看