代码如下:
完成了从 HTML 代码中提取所需信息的操作。
import ssl
from bs4 import BeautifulSoup
import re
import urllib.request, urllib.error
import xlwt
import sqlite3
def main():
    """Entry point: crawl the Douban Top 250 listing via ``getData``."""
    top250Url = 'https://movie.douban.com/top250?start='
    # Replace the default HTTPS context factory with the unverified one, so
    # HTTPS requests made through urllib skip certificate verification.
    ssl._create_default_https_context = ssl._create_unverified_context
    getData(top250Url)
# Pre-compiled patterns applied to the HTML text of one movie entry.
# DOTALL lets "." match newlines, so a capture may span several lines.

# Target of an <a href="..."> tag.
findLink = re.compile(r'<a href="(.*?)">', re.DOTALL)
# Value of a src="..." attribute.
findImg = re.compile(r'src="(.*?)"', re.DOTALL)
# Text of a <span class="title"> element.
findTitle = re.compile(r'<span class="title">(.*?)</span>', re.DOTALL)
# Content of the <p class=""> element.
findIto = re.compile(r'<p class="">(.*?)</p>', re.DOTALL)
# Text of the rating_num span.
findRating = re.compile(
    r'<span class="rating_num" property="v:average">(.*?)</span>', re.DOTALL
)
# Text of a <span class="inq"> element.
findInq = re.compile(r'<span class="inq">(.*?)</span>', re.DOTALL)
# Parse a single movie entry.
def _parseItem(itemHtml):
    """Extract the fields of one movie entry from its HTML text.

    Args:
        itemHtml: HTML source of one ``<div class="item">`` element.

    Returns:
        ``[link, pics, title, other_title, intro, rating, quotes]``.
        ``pics`` and ``quotes`` are lists holding every match of ``findImg``
        and ``findInq``; the other fields are strings. A missing link, intro,
        rating or first title yields ``''``; a missing second title yields
        ``' '``.
    """
    def first(pattern):
        # First match of the pattern, or '' instead of an IndexError when
        # the entry lacks that field.
        found = pattern.findall(itemHtml)
        return found[0] if found else ''

    titles = findTitle.findall(itemHtml)
    title = titles[0] if titles else ''
    # ' ' is the placeholder used when an entry has only one title.
    otherTitle = titles[1] if len(titles) > 1 else ' '

    intro = first(findIto).replace('\n', '')
    # Drop the line-break tag together with a literal "..." right before it.
    # The dots are escaped: unescaped they match any three characters and
    # would delete real text when the line does not end with an ellipsis.
    intro = re.sub(r'(?:\.\.\.)?<br/>', '', intro)
    intro = intro.replace(' ', '')

    return [
        first(findLink),             # target link address
        findImg.findall(itemHtml),   # image addresses (list of all matches)
        title,
        otherTitle,
        intro,                       # introduction text
        first(findRating),           # rating
        findInq.findall(itemHtml),   # one-line quotes (list of all matches)
    ]


# Crawl the pages.
def getData(baseUrl, pageCount=10, pageSize=25):
    """Crawl the paginated listing and collect one record per movie entry.

    Each page URL is built as ``baseUrl + str(offset)`` with offsets
    ``0, pageSize, 2 * pageSize, ...``. The page is downloaded with
    ``askUrl`` and every ``<div class="item">`` element found in it is
    parsed into a record.

    Args:
        baseUrl: URL prefix to which the start offset is appended.
        pageCount: Number of pages to request (default 10).
        pageSize: Offset step between consecutive pages (default 25).

    Returns:
        A list of records, each of the form
        ``[link, pics, title, other_title, intro, rating, quotes]``.

    Side effects:
        Rebinds the module-level ``dlist`` to the returned list and prints
        its length.
    """
    global dlist
    dlist = []
    for page in range(pageCount):
        html = askUrl(baseUrl + str(page * pageSize))
        # Parse the entries one by one; a page with no matching elements
        # simply contributes no records.
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all('div', class_="item"):
            dlist.append(_parseItem(str(item)))
    print(len(dlist))
    return dlist
# Get the page content of one given URL.
def askUrl(url):
    """Fetch ``url`` and return the response body decoded as UTF-8.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded page text, or an empty string when the request fails
        with ``urllib.error.URLError``. In that case the error's ``code``
        and/or ``reason`` are printed.
    """
    # Request headers used to imitate a browser request.
    # NOTE(review): this User-Agent value has stray spaces around "/" and
    # lacks spaces between some tokens compared with a usual browser UA
    # string; it is kept unchanged here -- confirm the server accepts it.
    head = {
        "User-Agent": "Mozilla / 5.0(Macintosh;IntelMacOSX10_15_3) AppleWebKit / 537.36(KHTML, likeGecko) Chrome / 81.0.4044.122Safari / 537.36"
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        # The context manager closes the response even if read() or
        # decode() raises, so the connection is not leaked.
        with urllib.request.urlopen(request) as response:
            html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
# Start the crawl only when this file is executed as a script.
if __name__ == '__main__':
    main()