zoukankan      html  css  js  c++  java
  • python 爬取豆瓣电影写入到excel中

     1 from bs4 import BeautifulSoup
     2 import requests
     3 import urllib.request as req
     4 import xlwt
     5 
     class Spider(object):
         """Scrape the Douban Top 250 movie list and export it to an Excel file.

         ``run`` downloads the listing pages and collects seven parallel lists
         (one per field); ``write_to_excel`` writes them to ``./spider.xls``.
         """

         # Number of listing pages fetched by ``run`` and the offset step
         # between consecutive pages.
         PAGE_COUNT = 10
         PAGE_SIZE = 25
         # Seconds to wait for each HTTP response; without a timeout
         # ``requests`` may block indefinitely on a stalled connection.
         REQUEST_TIMEOUT = 10

         def __init__(self, proxies=None):
             """Create a spider.

             Args:
                 proxies: Optional proxy mapping passed straight through to
                     ``requests.get(proxies=...)``. Defaults to ``None``
                     (no explicit proxies), so ``Spider()`` works without
                     any module-level ``proxies`` variable.
             """
             self.headers = {
                 'User-Agent': (
                     'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                     'AppleWebKit/537.36 (KHTML, like Gecko) '
                     'Chrome/87.0.4280.66 Safari/537.36'
                 )
             }
             self.proxies = proxies
             self.url = 'https://movie.douban.com/top250'

         def get_url(self, num):
             """Return the listing URL whose ``start`` offset is ``num``.

             Args:
                 num: Integer offset of the first item on the requested page.
             """
             return '%s?start=%d&filter=' % (self.url, num)

         @staticmethod
         def _parse_item(tag):
             """Extract the seven exported fields from one ``class="item"`` tag.

             Returns:
                 A tuple ``(index, name, hero_name, pic, link, score, content)``
                 in the same order as the columns written to the spreadsheet.
             """
             movie_index = tag.find('em').get_text()
             movie_name = tag.find(attrs={"class": "title"}).get_text()

             # Second space-separated token of the first <p>; fall back to an
             # empty string instead of raising IndexError on short text.
             # NOTE(review): which credit this token holds depends on the page
             # layout -- confirm against a live page.
             credit_tokens = tag.find('p').get_text().strip().split(' ')
             hero_name = credit_tokens[1] if len(credit_tokens) > 1 else ''

             movie_pic = tag.find('img').get('src')
             movie_link = tag.find('a').get("href")
             movie_score = tag.find(attrs={"class": "rating_num"}).get_text()

             # Second-to-last <span> of the item, guarded against items with
             # fewer than two spans.
             # NOTE(review): the meaning of that span depends on the page
             # layout -- confirm this is the intended field.
             spans = tag.find_all('span')
             movie_content = spans[-2].get_text() if len(spans) >= 2 else ''

             return (movie_index, movie_name, hero_name, movie_pic,
                     movie_link, movie_score, movie_content)

         def run(self):
             """Fetch every listing page and collect the parsed fields.

             Returns:
                 A tuple of seven lists, in order: indexes, names, hero names,
                 picture URLs, detail links, scores and contents. All lists
                 have the same length (one entry per parsed item).

             Raises:
                 requests.HTTPError: If a page responds with an error status.
                 requests.RequestException: On connection failure or timeout.
             """
             columns = ([], [], [], [], [], [], [])
             for page in range(self.PAGE_COUNT):
                 next_url = self.get_url(page * self.PAGE_SIZE)
                 response = requests.get(
                     next_url,
                     proxies=self.proxies,
                     headers=self.headers,
                     timeout=self.REQUEST_TIMEOUT,
                 )
                 # Fail loudly rather than silently parsing an error page and
                 # producing an empty or partial spreadsheet.
                 response.raise_for_status()
                 soup = BeautifulSoup(response.text, "html.parser")
                 for tag in soup.find_all(attrs={"class": "item"}):
                     for column, value in zip(columns, self._parse_item(tag)):
                         column.append(value)
             return columns

         def write_to_excel(self):
             """Scrape the list via ``run`` and save it as ``./spider.xls``.

             One spreadsheet row is written per scraped item and one column per
             field, in the order returned by ``run``. Only the rows actually
             scraped are written, so a short result no longer raises
             ``IndexError``.
             """
             columns = self.run()
             workbook = xlwt.Workbook(encoding='utf-8')
             worksheet = workbook.add_sheet('sheet1')

             # zip(*columns) turns the seven parallel lists into per-movie rows.
             for row_idx, row in enumerate(zip(*columns)):
                 for col_idx, value in enumerate(row):
                     worksheet.write(row_idx, col_idx, value)
             workbook.save('./spider.xls')
    68 
    if __name__ == '__main__':
        # Script entry point: build a spider and export the scraped list
        # to the Excel file in one step.
        Spider().write_to_excel()

    运行结果如下图:

  • 相关阅读:
    无标题
    OSI七层模型介绍
    Microsoft Visual Studio .NET 系统必备
    如何得到硬盘序列号[C#]
    session变量
    使用Installshield制作asp,asp.net应用的安装程序
    如何远程备份sql server数据库
    VS.NET打印思想与2003/5DataGrid、DataGridView及二维数据如ListView等终极打印实现(全部源码)
    6.22打包建立ISS虚拟目录,安装完运行你想运行的程序
    关于网关的精典描述通俗易懂
  • 原文地址:https://www.cnblogs.com/hello-python2020/p/14109829.html
Copyright © 2011-2022 走看看