# Scrape images from the picture section of Qiushibaike (qiushibaike.com/pic).
import re
import urllib.request
import os
def handler_request(url: str, page: int) -> urllib.request.Request:
    """Build the HTTP request for one listing page.

    The page number is appended to ``url`` followed by a trailing slash, and
    a browser-like ``user-agent`` header is attached to the request.

    Args:
        url: Base URL of the paginated listing; the page number is appended
            directly, so it should end with ``/``.
        page: Page number to request.

    Returns:
        A ``urllib.request.Request`` for ``<url><page>/`` carrying the
        ``user-agent`` header. Nothing is sent over the network here.
    """
    page_url = url + str(page) + "/"
    headers = {
        # Adjacent literals are concatenated into one header value; the
        # string must not contain a raw line break.
        "user-agent": (
            "Mozilla/5.0 (Windows NT 10.0; WOW64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/75.0.3770.100 Safari/537.36"
        ),
    }
    return urllib.request.Request(page_url, headers=headers)
def download_image(page, html, timeout=30):
    """Download every image referenced in one listing page's HTML.

    Image URLs are taken from tags matching ``<img src="..." alt="..." />``.
    Each image is written to ``第<page>页<index>.jpg`` inside a ``糗图``
    directory under the current working directory; the directory is created
    if it does not exist. An image whose download fails is reported and
    skipped so the remaining images are still processed.

    Args:
        page: Page number; used only to build file names and messages.
        html: Decoded HTML text of the listing page.
        timeout: Seconds to wait on each image request before giving up.

    Returns:
        None. Results are the files written to disk and progress messages
        printed to stdout.
    """
    headers = {
        # Adjacent literals are concatenated into one header value; the
        # string must not contain a raw line break.
        "user-agent": (
            "Mozilla/5.0 (Windows NT 10.0; WOW64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/75.0.3770.100 Safari/537.36"
        ),
    }
    pattern = re.compile(r'<img src="(.*?)" alt=".*?" />')
    src_list = pattern.findall(html)

    dirs = os.path.join(os.getcwd(), "糗图")
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(dirs, exist_ok=True)

    for i, src in enumerate(src_list):
        # Only protocol-relative URLs ("//host/path") need a scheme added;
        # prefixing an already absolute URL would corrupt it.
        if src.startswith("//"):
            src = "https:" + src
        label = "%s页%s.jpg" % (page, i)
        file_name = os.path.join(dirs, "第" + label)
        print("图片%s开始下载..." % label)
        try:
            request = urllib.request.Request(src, headers=headers)
            # The context manager closes the connection even if read() fails.
            with urllib.request.urlopen(request, timeout=timeout) as response:
                image = response.read()
        except Exception as exc:
            # Deliberately broad: one bad image (network error, bad URL,
            # truncated body) must not abort the rest of the page. The cause
            # is printed so the failure is not silent.
            print("图片%s下载出错了" % label, exc)
            continue
        with open(file_name, "wb") as f:
            f.write(image)
        # Report success only after the bytes are actually on disk.
        print("图片%s已经下载完毕" % label)
if __name__ == '__main__':
    # Script entry point: ask for an inclusive page range, then fetch each
    # listing page and download the images it references.
    url = "https://www.qiushibaike.com/pic/page/"
    start_page = int(input("请输入你想要查询的起始页:"))
    end_page = int(input("请输入你想要查询的结束页:"))
    for page in range(start_page, end_page + 1):
        print("第%s页开始下载..." % page)
        request = handler_request(url, page)
        # The context manager closes the HTTP response once the body is read.
        with urllib.request.urlopen(request) as response:
            content = response.read().decode()
        download_image(page, content)
        print("第%s页已经下载完毕" % page)
        # Two blank lines separate the output of consecutive pages.
        print()
        print()