zoukankan      html  css  js  c++  java
  • 爬取豆瓣电影top250

    爬取豆瓣电影top250

    import requests
    import re
    import codecs
    from bs4 import BeautifulSoup
    from openpyxl import Workbook
    # Workbook that collects the scraped rows; main() saves it to dest_filename.
    wb = Workbook()
    dest_filename = '电影.xlsx'
    # Write into the workbook's active sheet and give it a descriptive title.
    ws1 = wb.active
    ws1.title = "电影top250"
    
    # First listing page; get_li() appends each "next page" href to this URL.
    DOWNLOAD_URL = 'http://movie.douban.com/top250/'
    
    
    def download_page(url, timeout=10):
        """Fetch *url* and return the raw response body.

        A desktop-browser User-Agent header is sent with the request.

        Args:
            url: Address of the page to download.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            The undecoded response body (``bytes``).

        Raises:
            requests.RequestException: On connection problems, timeouts, or
                an HTTP error status (4xx/5xx).
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36'
        }
        # Without a timeout, requests can block forever on a stalled connection.
        response = requests.get(url, headers=headers, timeout=timeout)
        # Fail loudly on an error status instead of handing an error page to
        # the HTML parser.
        response.raise_for_status()
        return response.content
    
    
    def get_li(doc):
        """Parse one listing page and extract the per-movie fields.

        Args:
            doc: HTML of a listing page, in any form BeautifulSoup accepts.

        Returns:
            A 5-tuple ``(name, star_con, score, info_list, next_url)``.
            The first four are parallel lists with one entry per ``<li>``
            inside the ``ol.grid_view`` element: the title, the text node
            containing '评价' from the star block, the rating text, and the
            short quote ('无' when the entry has none).  ``next_url`` is
            ``DOWNLOAD_URL`` plus the href of the "next" link, or ``None``
            when there is no such link.  If the page has no ``ol.grid_view``
            element, the lists are empty and ``next_url`` is ``None``.
        """
        soup = BeautifulSoup(doc, 'html.parser')
        name = []  # titles
        star_con = []  # rating-count text
        score = []  # rating values
        info_list = []  # short quotes

        ol = soup.find('ol', class_='grid_view')
        if ol is None:
            # Not a listing page (e.g. an error page): nothing to extract and
            # no next page to follow.
            return name, star_con, score, info_list, None

        for i in ol.find_all('li'):
            detail = i.find('div', attrs={'class': 'hd'})
            movie_name = detail.find(
                'span', attrs={'class': 'title'}).get_text()  # movie title
            level_star = i.find(
                'span', attrs={'class': 'rating_num'}).get_text()  # rating
            star = i.find('div', attrs={'class': 'star'})
            # `string=` is the current name of the deprecated `text=` filter.
            star_num = star.find(string=re.compile('评价'))  # rating count

            info = i.find('span', attrs={'class': 'inq'})  # short quote
            if info:  # not every entry has a short quote
                info_list.append(info.get_text())
            else:
                info_list.append('无')
            score.append(level_star)

            name.append(movie_name)
            star_con.append(star_num)

        # The "next" span may be missing entirely, or present without a link;
        # both cases mean there is no further page.
        next_span = soup.find('span', attrs={'class': 'next'})
        page = next_span.find('a') if next_span is not None else None
        if page is not None:
            return name, star_con, score, info_list, DOWNLOAD_URL + page['href']
        return name, star_con, score, info_list, None
    
    
    def main():
        """Scrape every listing page and save the results to the workbook.

        Starts at DOWNLOAD_URL and keeps following the next-page URL returned
        by get_li() until it is None.  Then writes one row per movie into ws1
        (columns A-D: title, rating-count text, score, short quote) and saves
        wb to dest_filename.
        """
        url = DOWNLOAD_URL
        name = []
        star_con = []
        score = []
        info = []
        while url:
            doc = download_page(url)
            movie, star, level_num, info_list, url = get_li(doc)
            name.extend(movie)
            star_con.extend(star)
            score.extend(level_num)
            info.extend(info_list)
        # Number rows by position rather than name.index(): index() returns
        # the first match, so a repeated title would overwrite the earlier
        # row and leave its own row blank.
        rows = zip(name, star_con, score, info)
        for row_num, values in enumerate(rows, start=1):
            for col_num, value in enumerate(values, start=1):
                ws1.cell(row=row_num, column=col_num, value=value)
        wb.save(filename=dest_filename)
    
    
    # Run the scraper only when this file is executed as a script.
    if __name__ == '__main__':
        main()
    

      

  • 相关阅读:
    MySQL中如何使用布尔类型【转】
    你所不知道的Android Studio调试技巧【转】
    设计模式之工厂模式(factory pattern)【转】
    layuiadmin+tp5后台内容管理系统【转】
    PHPStorm怎么修改选中的背景颜色呢?【转】
    PHP保留两位小数的几种方法【转】
    jquery的css()函数同时设置多个css属性值
    Flutter text设置行间距【转】
    Flutter入门-布局Container、Padding、Align、Center【转】
    redis下载地址
  • 原文地址:https://www.cnblogs.com/dengyanchuan/p/11086649.html
Copyright © 2011-2022 走看看