python 爬虫爬取豆瓣Top250榜单
这是一个小作业。
requests模块
使用requests.get(url)可以爬取一个网址的信息
# Send a browser-like User-Agent header; without it the site's anti-crawler
# check rejects the request (418).
# Reference: https://www.kesci.com/home/project/5dd6003700b0b900365feaeb
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.44 Safari/537.36"
r = requests.get('http://movie.douban.com/top250?start=225', headers={'User-Agent': user_agent})
print(r.status_code)  # 418 means the request was rejected, 200 means success
html = r.text  # the page's HTML as text
# Save the page to 1.txt; the with-block closes (and flushes) the file.
with open("1.txt", "w", encoding='utf-8') as f:
    print(html, file=f)
BeautifulSoup 模块
详细参考官方文档
1、安装
pip install beautifulsoup4
pip list  # 查看已安装的python模块
2、建立BeautifulSoup4对象
bs = BeautifulSoup(html, "html.parser")  # build the BeautifulSoup object from the HTML text
print(bs.prettify())  # print the whole parsed document with indentation
3、访问一个标签内容
html = '''
<!DOCTYPE html>
<!--STATUS OK-->
<html>
<head>
<meta content="text/html;charset=utf-8" http-equiv="content-type"/>
<meta content="IE=Edge" http-equiv="X-UA-Compatible"/>
<meta content="always" name="referrer"/>
<link href="https://ss1.bdstatic.com/5eN1bjq8AAUYm2zgoY3K/r/www/cache/bdorz/baidu.min.css" rel="stylesheet" type="text/css"/>
<title>
百度一下,你就知道 </title>
</head>
<body link="#0000cc">
<div id="wrapper">
<div id="head">
<div class="head_wrapper">
<div id="u1">
<a class="mnav" href="http://news.baidu.com" name="tj_trnews">
新闻 </a>
<a class="mnav" href="https://www.hao123.com" name="tj_trhao123">
hao123 </a>
<a class="mnav" href="http://map.baidu.com" name="tj_trmap">
地图 </a>
<a class="mnav" href="http://v.baidu.com" name="tj_trvideo">
视频 </a>
<a class="mnav" href="http://tieba.baidu.com" name="tj_trtieba">
贴吧 </a>
<a class="bri" href="//www.baidu.com/more/" name="tj_briicon" style="display: block;">
更多产品 </a>
</div>
</div>
</div>
</div>
</body>
</html>
'''
# The calls below use the HTML page above as the example.
bs.title  # the <title> tag together with everything inside it
bs.div  # the first <div> tag in the document and everything inside it
bs.a  # the first <a> tag
bs.head  # the <head> tag
bs.title.name  # the tag's name, i.e. "title"
bs.title.string  # the text inside <title> (or inside its single child tag); not available when there are several pieces of text
bs.a.string  # the text inside the first <a> tag
bs.get_text()  # all the text of a tag and of its child tags
bs.find_all("a")  # every <a> tag, returned as a list
bs.find_all("div")  # every <div> tag, returned as a list
bs.find_all(id='u1')  # every tag with id='u1'
bs.find_all(id=True)  # every tag that has an id attribute
bs.find_all(class_='mnav')  # every tag whose class is mnav
bs.find_all(attrs={"class":"mnav"})  # the same query written with an attrs dict
bs.find_all(attrs={"name":"tj_trnews"})  # every tag with name=tj_trnews
# find_all accepts several of these filters at once.
# Accessing child tags
tmp = bs.find_all("head")
print(tmp[0].a)  # find returns a single bs4 object, find_all returns a list, hence the [0]
csv 模块
1、读入
import csv
# Read a.csv and print every row; csv.reader yields each row as a list.
# newline='' leaves line-ending handling to the csv module.
with open('a.csv', 'r', newline='') as myFile:
    lines = csv.reader(myFile)
    for line in lines:
        print(line)
# Alternative without a context manager (the file must then be closed by hand):
#     f = open("a.csv", "r", newline='')
#     lines = csv.reader(f)
#     ...
#     f.close()
# Note: csv.reader expects an open file (or another iterable of lines), not a
# file name, so csv.reader('a.csv', 'r') does not work.
2、写入
# Example data: one header row followed by four records.
headers = ['class', 'name', 'sex', 'height', 'year']
rows = [
    [1, 'xiaoming', 'male', 168, 23],
    [1, 'xiaohong', 'female', 162, 22],
    [2, 'xiaozhang', 'female', 163, 21],
    [2, 'xiaoli', 'male', 158, 21],
]
# Without newline='' an empty line shows up between the rows of the output.
with open('test.csv', 'w', newline='') as f:
    f_csv = csv.writer(f)
    f_csv.writerow(headers)  # writes one list as a single row
    f_csv.writerows(rows)  # writes several rows at once
完整代码
最后写入csv存在乱码的情况,原因为csv文件对于中文编码默认为ansi,输出时为utf-8,修改方式使用记事本打开csv,点击另存为,然后选择编码为ansi,记事本既支持utf-8也支持ansi。
# coding=utf-8
# html = '''
# <ol class="grid_view">
# <li>
# <div class="item">
# <div class="pic">
# <em class="">1</em>
# <a href="https://movie.douban.com/subject/1292052/">
# <img width="100" alt="肖申克的救赎" src="https://img2.doubanio.com/view/photo/s_ratio_poster/public/p480747492.webp" class="">
# </a>
# </div>
# <div class="info">
# <div class="hd">
# <a href="https://movie.douban.com/subject/1292052/" class="">
# <span class="title">肖申克的救赎</span>
# <span class="title"> / The Shawshank Redemption</span>
# <span class="other"> / 月黑高飞(港) / 刺激1995(台)</span>
# </a>
# <span class="playable">[可播放]</span>
# </div>
# <div class="bd">
# <p class="">
# 导演: 弗兰克·德拉邦特 Frank Darabont 主演: 蒂姆·罗宾斯 Tim Robbins /...<br>
# 1994 / 美国 / 犯罪 剧情
# </p>
# <div class="star">
# <span class="rating5-t"></span>
# <span class="rating_num" property="v:average">9.7</span>
# <span property="v:best" content="10.0"></span>
# <span>2476527人评价</span>
# </div>
# <p class="quote">
# <span class="inq">希望让人自由。</span>
# </p>
# </div>
# </div>
# </div>
# </li>
# </ol>
# '''
import requests
from bs4 import BeautifulSoup
import csv
# Module-level result lists. getHtml() appends one entry per movie to each of
# them, so index i refers to the same movie in all six lists.
Name = []  # first title of each movie
Name2 = []  # second title, '' when the entry has only one
Url = []  # link taken from the entry's "hd" block
Actor = []  # text of the first paragraph in the entry's "bd" block
Score = []  # rating text (rating_num span)
Number = []  # vote-count text with the "人评价" characters stripped off
def getHtml(html):
    """Parse one Top250 list page and append its movies to the result lists.

    For every ``<li>`` inside ``<ol class="grid_view">`` this extracts the
    first title, the second title ('' when absent), the detail link, the
    text of the description paragraph, the rating and the vote count, and
    appends them to the module-level lists ``Name``, ``Name2``, ``Url``,
    ``Actor``, ``Score`` and ``Number``.

    Args:
        html: HTML text of one list page.

    Returns:
        None. When the page holds no ``grid_view`` list (for example
        because the request was rejected), a message is printed and
        nothing is appended.
    """
    bs = BeautifulSoup(html, "html.parser")
    movie_list = bs.find('ol', class_='grid_view')
    if movie_list is None:
        # Indexing an empty find_all() result would raise a bare IndexError.
        print('grid_view list not found; page skipped')
        return

    vote_suffix = "人评价"
    for nowMovie in movie_list.find_all('li'):
        titles = nowMovie.find_all('span', class_='title')
        movie_name = titles[0].string
        # The second title starts with a 3-character separator that is cut off.
        movie_name2 = titles[1].string[3:] if len(titles) > 1 else ''

        movie_url = nowMovie.find('div', class_='hd').a.get('href')
        movie_actor = nowMovie.find('div', attrs={'class': 'bd'}).p.getText()
        movie_score = nowMovie.find('span', attrs={'class': 'rating_num'}).string

        # Spans without a class attribute; in the sample markup the second
        # one carries the vote count, e.g. "2476527人评价".
        plain_spans = nowMovie.find_all('span', class_=False)
        movie_number = plain_spans[1].string
        # Remove the exact suffix; str.strip() would treat it as a set of
        # characters instead.
        if movie_number.endswith(vote_suffix):
            movie_number = movie_number[:-len(vote_suffix)]

        Url.append(movie_url)
        Name.append(movie_name)
        Name2.append(movie_name2)
        Actor.append(movie_actor)
        Score.append(movie_score)
        Number.append(movie_number)
def printCsv():
    """Write the collected movie data to ``a.csv``.

    Reads the module-level lists ``Name``, ``Name2``, ``Score``, ``Number``,
    ``Actor`` and ``Url`` and writes a header row followed by one row per
    movie, with the columns in the same order as the header. The file is
    written as UTF-8 because the titles contain non-ASCII text.

    Returns:
        None.
    """
    header = ['电影名', '英文名', '评分', '评价人数', '演员', '电影链接']
    # The with-block guarantees the file is flushed and closed.
    with open('a.csv', 'w', newline='', encoding="utf-8") as File:
        Print = csv.writer(File)
        Print.writerow(header)
        # zip() walks the lists in lockstep and stops at the shortest, so a
        # partially failed crawl writes what was collected instead of raising
        # IndexError on a fixed count of 250.
        Print.writerows(zip(Name, Name2, Score, Number, Actor, Url))
def main():
    """Download the ten Top250 list pages, parse them and write the CSV.

    Each page is requested with a browser-like User-Agent header, its URL
    and HTTP status code are printed, successful responses are handed to
    getHtml(), and printCsv() is called once after all pages were visited.
    """
    # Send a browser-like User-Agent; without it the site's anti-crawler
    # check rejects the request (418).
    # Reference: https://www.kesci.com/home/project/5dd6003700b0b900365feaeb
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.44 Safari/537.36"
    targetUrl = 'https://movie.douban.com/top250?start='
    # start=0, 25, ..., 225: ten pages in total.
    for i in range(0, 226, 25):
        page_url = targetUrl + str(i)
        print(page_url)
        # Without a timeout a stalled connection would block forever.
        r = requests.get(page_url, headers={'User-Agent': user_agent}, timeout=10)
        print(r.status_code)  # 418 means the request was rejected, 200 means success
        if r.status_code != 200:
            # A rejected response is not a list page; do not parse it.
            continue
        getHtml(r.text)
    printCsv()


if __name__ == '__main__':
    main()