Target: https://movie.douban.com/top250
Development:
First, we chose the Python scripting language to build this project.
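Before looking at the full script, note how Douban paginates the Top 250 list: each page shows 25 movies and is addressed by a start offset (0, 25, ..., 225). A quick illustrative sketch of the ten list-page URLs the crawler will visit:

for start in range(0, 250, 25):
    print("https://movie.douban.com/top250?start={}".format(start))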
Code:
import os
import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException

"""
Author:
    Damon
Purpose:
    Crawl the Douban Top 250 movie list and save the details locally
"""

# Target URL template; {} is the paging offset
URL = "https://movie.douban.com/top250?start={}"
# Detail-page URL of every movie, in crawl order
entity_url = []
def save_data(result):
    """
    Save one movie's scraped info to a local text file
    :param result: dict of scraped fields
    :return: None
    """
    with open('movie.txt', "a", encoding="utf8") as f:
        f.write("=" * 100 + "\n")
        f.write("Rank: " + result['top'] + "\n")
        f.write("Rating: " + result['grade'] + "\n")
        f.write("Title: " + result['name'] + "\n")
        f.write("Director: " + result['director'] + "\n")
        f.write("Screenwriter: " + result['scriptwriter'] + "\n")
        f.write("Starring: " + result['protagonist'] + "\n")
        f.write("Synopsis: " + result['synopsis'] + "\n")
        f.write("Reviews:" + "\n")
        f.write("    " + result['film_review']['first_user'] + ": " + result['film_review']['first_discuss'] + "\n")
        f.write("    " + result['film_review']['second_user'] + ": " + result['film_review']['second_discuss'] + "\n")
        f.write("    " + result['film_review']['thirdly_user'] + ": " + result['film_review']['thirdly_discuss'] + "\n")
        f.write("    " + result['film_review']['fourthly_user'] + ": " + result['film_review']['fourthly_discuss'] + "\n")
        f.write("    " + result['film_review']['fifth_user'] + ": " + result['film_review']['fifth_discuss'] + "\n")
        f.write("URL: " + result['url'] + "\n")
    print("Processed: " + result['name'] + " " + result['top'])
def analysis_page(num, url):
    """
    Parse a movie's detail page and extract the fields we want
    :param num: Top-250 rank
    :param url: detail-page URL
    :return: -1 if the page does not exist, otherwise None
    """
    # Overall movie info
    result = {}
    # Hot-review info
    film_review = {}
    try:
        headers = {
            'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                          "(KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36"
        }
        res = requests.get(url, headers=headers)
        res.encoding = "utf-8"
    except RequestException as e:
        print("Detail-page request failed:", repr(e))
        print("URL:", url)
        # [TODO] Proper handling when the target site is unreachable. Workaround: exit the program
        os._exit(0)
    soup = BeautifulSoup(res.text, "html.parser")
    # If the page no longer exists, log it and move on to the next one
    title = soup.select("title")[0].text
    if title == "页面不存在":  # Douban's literal "page not found" title
        with open('movie.txt', "a", encoding="utf8") as f:
            f.write("=" * 100 + "\n")
            f.write("Rank: Top" + str(num) + "\n")
            f.write("ERROR: page not found\n")
            f.write("URL: " + url + "\n")
        return -1
    try:
        # Rank
        result['top'] = "Top" + str(num)
        # Rating
        result['grade'] = soup.select("#interest_sectl > div.rating_wrap.clearbox > div.rating_self.clearfix > strong")[0].text
        # Title
        result['name'] = soup.select("#content > h1")[0].text.replace("\n", "")
        # Director
        result['director'] = soup.select("#info > span > span.attrs")[0].text
        try:
            # Screenwriter
            result['scriptwriter'] = soup.select("#info > span > span.attrs")[1].text
            # Starring
            result['protagonist'] = soup.select("#info > span.actor > span.attrs")[0].text
        except IndexError:
            # Some entries have no screenwriter/cast block
            result['scriptwriter'] = ""
            result['protagonist'] = ""
        try:
            # Synopsis (collapsed "short" version)
            result['synopsis'] = soup.select("#link-report > span.short > span")[0].text.replace("\n", "").replace(" ", "")
        except IndexError:
            # Fall back to the plain synopsis span
            result['synopsis'] = soup.select("#link-report > span")[0].text.replace("\n", "").replace(" ", "")
        # The five hottest reviews: user names and comment texts appear in matching order
        users = soup.select("#hot-comments > div > div > h3 > span.comment-info > a")
        discusses = soup.select("#hot-comments > div > div > p")
        for i, key in enumerate(['first', 'second', 'thirdly', 'fourthly', 'fifth']):
            film_review[key + '_user'] = users[i].text
            film_review[key + '_discuss'] = discusses[i].text
        # Reviews
        result['film_review'] = film_review
        # URL
        result['url'] = url
    except Exception as e:
        print("Failed to parse:", url, repr(e), "------------------------------------")
        # [TODO] Proper handling when the target site is unreachable. Workaround: exit the program
        os._exit(0)
    # Save the record to the local txt file
    save_data(result)
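    # For reference, the result dict handed to save_data() has this shape
    # (values illustrative):
    #
    #     {'top': 'Top1', 'grade': '<rating>', 'name': '<title>',
    #      'director': '<name>', 'scriptwriter': '<names>', 'protagonist': '<names>',
    #      'synopsis': '<text>', 'url': '<detail-page url>',
    #      'film_review': {'first_user': '<user>', 'first_discuss': '<comment>', ...}}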
def get_entity_url(url):
    """
    Collect the detail-page URL of every movie on one list page
    :param url: list-page URL
    :return: None
    """
    try:
        headers = {
            'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                          "(KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36"
        }
        res = requests.get(url, headers=headers)
        res.encoding = "utf-8"
    except RequestException as e:
        print("List-page request failed:", repr(e))
        print("URL:", url)
        # [TODO] Proper handling when the target site is unreachable. Workaround: exit the program
        os._exit(0)
    soup = BeautifulSoup(res.text, "html.parser")
    entity = soup.select("#content > div > div.article > ol > li > div > div.info > div.hd > a")
    for a in entity:
        entity_url.append(a['href'])
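# For reference, each entry on a list page roughly follows this (simplified,
# assumed) structure, so the selector above yields the 25 detail-page links:
#
#     <li>
#       <div class="item">
#         <div class="info">
#           <div class="hd">
#             <a href="https://movie.douban.com/subject/...">...</a>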
def make_url(num):
    """
    Build the URL of one list page and crawl the movie links on it
    :param num: zero-based page index
    :return: None
    """
    url = URL.format(num * 25)
    get_entity_url(url)
if __name__ == '__main__':
    # Collect the URL of every movie (10 pages x 25 entries each)
    for i in range(10):
        make_url(i)
    print("All movie URLs collected!")
    # Parse each detail page and save the extracted data locally
    for i in range(len(entity_url)):
        state = analysis_page(i + 1, entity_url[i])
        if state == -1:
            continue
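Run the script directly (the file name is yours to choose); it appends everything to movie.txt in the working directory.

One obvious improvement: the script currently calls os._exit(0) on any request failure, which the comments mark as a pending TODO. A minimal retry sketch, assuming a back-off-and-retry policy is preferable to killing the process (the function fetch_with_retry and its parameters are illustrative, not part of the original):

import time
import requests
from requests.exceptions import RequestException

def fetch_with_retry(url, headers, retries=3, delay=2):
    # Try the request up to `retries` times, sleeping `delay` seconds between attempts
    for attempt in range(1, retries + 1):
        try:
            res = requests.get(url, headers=headers, timeout=10)
            res.encoding = "utf-8"
            return res
        except RequestException as e:
            print("Request failed (attempt {}): {}".format(attempt, repr(e)))
            time.sleep(delay)
    return None

With a helper like this, both analysis_page and get_entity_url could check for None and skip the page instead of exiting outright.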