  • Python 3.8 practice crawler: scraping the Douban Movie Top 250

    #!/usr/bin/env python
    # encoding=utf-8
    import re

    import requests
    from bs4 import BeautifulSoup
    from openpyxl import Workbook

    wb = Workbook()
    dest_filename = '电影.xlsx'
    ws1 = wb.active
    ws1.title = "电影top250"

    DOWNLOAD_URL = 'http://movie.douban.com/top250/'


    def download_page(url):
        """Fetch the raw HTML of the given URL."""
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36'
        }
        data = requests.get(url, headers=headers).content
        return data


    def get_li(doc):
        """Parse one list page; return the extracted columns and the next-page URL."""
        soup = BeautifulSoup(doc, 'html.parser')
        ol = soup.find('ol', class_='grid_view')
        name = []       # movie titles
        star_con = []   # number of ratings
        score = []      # rating scores
        info_list = []  # one-line quotes
        desc_list = []  # descriptions
        for i in ol.find_all('li'):
            detail = i.find('div', attrs={'class': 'hd'})
            movie_name = detail.find(
                'span', attrs={'class': 'title'}).get_text()  # movie title
            level_star = i.find(
                'span', attrs={'class': 'rating_num'}).get_text()  # rating score
            star = i.find('div', attrs={'class': 'star'})
            star_num = star.find(string=re.compile('评价'))  # number of ratings
            info = i.find('span', attrs={'class': 'inq'})  # one-line quote
            desc = i.find('p', attrs={'class': ''})  # description
            desc_list.append(desc.get_text())

            if info:  # some entries have no one-line quote
                info_list.append(info.get_text())
            else:
                info_list.append('')
            score.append(level_star)
            name.append(movie_name)
            star_con.append(star_num)
        page = soup.find('span', attrs={'class': 'next'}).find('a')  # link to the next page
        if page:
            return name, star_con, score, info_list, desc_list, DOWNLOAD_URL + page['href']
        return name, star_con, score, info_list, desc_list, None


    def main():
        url = DOWNLOAD_URL
        name = []
        star_con = []
        score = []
        info = []
        desc = []
        while url:
            doc = download_page(url)
            movie, star, level_num, info_list, desc_list, url = get_li(doc)
            name += movie
            star_con += star
            score += level_num
            info += info_list
            desc += desc_list
        # Write one movie per row. enumerate() gives a stable row index;
        # a name.index() lookup would break when two titles are identical.
        for row, (i, m, o, p, d) in enumerate(zip(name, star_con, score, info, desc), start=1):
            ws1['A%s' % row] = i
            ws1['B%s' % row] = m
            ws1['C%s' % row] = o
            ws1['D%s' % row] = p
            ws1['E%s' % row] = d
        wb.save(filename=dest_filename)


    if __name__ == '__main__':
        main()
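
    To spot-check the result, openpyxl can read the workbook back. A minimal sketch, assuming the script above has already written 电影.xlsx to the current directory:

        from openpyxl import load_workbook

        wb = load_workbook('电影.xlsx')
        ws = wb['电影top250']
        # Print the first five rows: title, rating count, score, quote, description.
        for row in ws.iter_rows(min_row=1, max_row=5, values_only=True):
            print(row)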

    After writing a crawler in Python, I never want to write one in PHP again; it is so much more convenient. PHP is nothing but endless regex matching, inefficient and exhausting to write...

    Drop by drop, water turns to ice. No effort in this world is meaningless; time will give you the answer in the end.
  • Original post: https://www.cnblogs.com/soupig/p/15715242.html