zoukankan      html  css  js  c++  java
  • 爬取豆瓣电影信息保存到Excel

     1 from bs4 import BeautifulSoup
     2 import requests
     3 import html.parser
     4 from openpyxl import Workbook,load_workbook
     5 import os
     6 class DouBan(object):
     7 
     8     def __init__(self):
     9         self.url = 'https://movie.douban.com/'
    10         self.header = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'}
    11 
    12     def openUrl(self, url):
    13         response = requests.get(url,headers=self.header)
    14         return response
    15 
    16     def getUrl(self):
    17         response = self.openUrl(self.url)
    18         douban_html = response.text
    19         # print(douban_html)
    20         soup = BeautifulSoup(douban_html,'html.parser')
    21         hrefs = soup.select("li.poster > a")
    22         return hrefs
    23         # for href in hrefs:
    24         #     print(href['href']
    25     def getMsg(self):
    26         hrefs = self.getUrl()
    27         for num,href in enumerate(hrefs):
    28             msg_list = []
    29             print(href['href'])
    30             response = self.openUrl(href['href'])
    31             html_mover = response.text
    32             soup = BeautifulSoup(html_mover,'html.parser')
    33             all_info = soup.select('div#content')
    34             # print(all_info)
    35             title = all_info[0].select('h1')[0].text.replace('
    ','')
    36             msg_list.append(title)
    37             # print(title)
    38             info = all_info[0].select('#info')[0].text
    39             msg_list.append(info)
    40             # print(info)
    41             describe = all_info[0].select('div#link-report span')[0].text.replace(' ','')
    42             msg_list.append(describe)
    43             # print(describe)
    44             # return title,info,describe
    45             for col in range(3):
    46                 self.saveMsg(num+1, col+1,  msg_list[col])
    47 
    48     def saveMsg(self, row_, column_,msg):
    49         # msg = self.getMsg()
    50         # a = os.path.exists('//move_msg.xlsx')
    51         # if a=False:
    52         #     os.mkdir('move_msg.xlsx')
    53         
    54         wb = load_workbook('move_msg.xlsx')
    55         sheet = wb.active
    56         sheet.cell(row=row_, column=column_).value = msg
    57         wb.save('move_msg.xlsx')
    58 
    59 
    60 
    61 
    62 if __name__ == "__main__":
    63     db = DouBan()
    64     db.getMsg()
  • 相关阅读:
    ASP.NET Repeater的用法初探
    ADO.NET 数据查询和数据操作
    ASP.NET 一般处理程序基础1(Get Post 表单提交 Http协议 Nvelocity模板引擎)
    接口继承
    《需求工程》阅读笔记2
    《需求工程》阅读笔记1
    Python3.0中的strip方法失效问题以及re.sub方法无法执行问题
    使用Python爬取豆瓣电影详细数据
    《软件方法》阅读笔记——3
    基于layui实现了将查询出的数据分页显示
  • 原文地址:https://www.cnblogs.com/royfans/p/7474662.html
Copyright © 2011-2022 走看看