zoukankan      html  css  js  c++  java
  • 爬取豆瓣top250电影的信息

    import requests
    import re
    
    # Browser-like User-Agent sent with every request made by get_html().
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; WOW64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/79.0.3945.88 Safari/537.36'
        ),
    }
    
    
    # 爬虫三部曲
    # 1.发送请求
    def get_html(url, timeout=10):
        """Send a GET request for ``url`` using the module-level ``headers``.

        Args:
            url: Address of the page to request.
            timeout: Seconds to wait for the server before giving up. It is
                passed straight to ``requests.get``; without a timeout a
                stalled connection would block the crawl indefinitely.

        Returns:
            The ``requests.Response`` object as returned by ``requests.get``.
            The HTTP status code is not checked here.

        Raises:
            requests.exceptions.Timeout: If the server does not respond within
                ``timeout`` seconds.
        """
        response = requests.get(url, headers=headers, timeout=timeout)
        return response
    
    
    # 2.解析数据
    def parse_html(response):
        """Extract the movie records from the HTML of one list page.

        Args:
            response: Object whose ``text`` attribute holds the page HTML.

        Returns:
            A list of 4-tuples of strings, in capture-group order:
            (detail-page URL, title, rating, number of raters). The list is
            empty when nothing in the page matches.
        """
        # re.S lets ``.`` match newlines, so one match can span several lines of HTML.
        item_pattern = re.compile(
            '<div class="item">.*?<a href="(.*?)">.*?<span class="title">(.*?)</span>.*?<span class="rating_num".*?>(.*?)</span>.*?<span>(.*?)人评价',
            re.S)
        return item_pattern.findall(response.text)
    
    
    # 3.保存数据
    def save_data(movie_data_list, num):
        """Print one movie record and append it to ``douban_top250.txt``.

        Args:
            movie_data_list: A 4-item sequence for a single movie, unpacked as
                (detail-page URL, title, rating, number of raters), e.g.
                ('https://movie.douban.com/subject/1292052/', '肖申克的救赎', '9.7', '1737867').
            num: Rank shown for this movie in the output.

        Raises:
            ValueError: If ``movie_data_list`` does not hold exactly four items.
        """
        url, name, point, commit = movie_data_list

        # Human-readable record; the template (including its leading
        # whitespace) is emitted exactly as written below.
        movie_data = f'''
        电影排名:{num}
        详情页url:{url}
        电影名字:{name}
        电影评分:{point}
        评价人数:{commit}
        '''

        print(movie_data)
        # Append mode keeps the records written by earlier calls.
        with open('douban_top250.txt', 'a', encoding='utf-8') as out_file:
            out_file.write(movie_data)
    
    
    if __name__ == '__main__':
        # Walk the ten list pages; the ``start`` query parameter takes the
        # offsets 0, 25, ..., 225.
        rank = 1
        for offset in range(0, 250, 25):
            page_url = f'https://movie.douban.com/top250?start={offset}&filter='
            page_movies = parse_html(get_html(page_url))

            # Each entry is a tuple such as
            # ('https://movie.douban.com/subject/1292052/', '肖申克的救赎', '9.7', '1737867').
            for movie_tuple in page_movies:
                save_data(movie_tuple, rank)
                # The rank keeps counting across pages.
                rank += 1
    
  • 相关阅读:
    Maven
    Maven
    Maven
    Maven
    Maven
    Maven
    Maven
    Maven
    Maven教程
    SASS
  • 原文地址:https://www.cnblogs.com/chanyuli/p/12134847.html
Copyright © 2011-2022 走看看