#抓取网页图片
#适用于html页面结构为:li>img
#抓取单个网页图片小程序
#version:V1.0
#author:yxmichael
#更新时间:20210511
import requests
from bs4 import BeautifulSoup
import os,shutil
import time
def getHtmlText(url, code='utf-8'):
    """Fetch ``url`` and return the response body decoded as ``code``.

    Request headers are read from the module-level ``my_headers`` global
    (assigned by ``main()``). If that global has not been set, the request
    is sent with the default headers of ``requests``.

    Args:
        url: Address of the page to download.
        code: Encoding applied to the response before it is decoded.

    Returns:
        The decoded page text, or an empty string when the request fails
        or the server answers with an HTTP error status.
    """
    # Defensive lookup: calling this helper before the global exists must
    # not raise NameError, because callers rely on "" as the failure value.
    headers = globals().get('my_headers')
    try:
        r = requests.get(url, timeout=30, headers=headers)
        # raise_for_status is a method; it has to be *called* for 4xx/5xx
        # responses to be treated as failures instead of page content.
        r.raise_for_status()
    except requests.RequestException:
        return ""
    r.encoding = code
    return r.text
def parseHtml(nlist, html):
    """Collect image links from ``html`` and append them to ``nlist``.

    Scans the ``<li>`` elements inside ``<div id="main">``. For every item
    that has an ``<a>`` whose ``href`` is not ``'#'`` and an ``<img>`` with
    a ``src``, the entry ``[img_name, a_href, img_src]`` is appended, where
    ``img_name`` is the last ``/``-separated segment of ``a_href``.

    Items missing any of those pieces are skipped individually, so one
    malformed ``<li>`` no longer discards the rest of the page.

    Args:
        nlist: List that is extended in place with the parsed entries.
        html: Markup of the page to parse; empty or ``None`` yields nothing.

    Returns:
        None. Results are delivered through ``nlist``.
    """
    soup = BeautifulSoup(html or "", 'html.parser')
    div_main = soup.find('div', attrs={'id': 'main'})
    if div_main is None:
        # Report the reason instead of printing an empty line.
        print("未找到 id 为 main 的 div,无法解析图片列表。")
        return
    for li in div_main.find_all('li'):
        link = li.find('a')
        image = li.find('img')
        if link is None or image is None:
            continue
        a_href = link.get('href')
        img_src = image.get('src')
        # '#' marks a placeholder link that has no image behind it.
        if not a_href or a_href == '#' or not img_src:
            continue
        img_name = a_href.split('/')[-1]
        nlist.append([img_name, a_href, img_src])
def delOldDir(dir_path):
    """Remove ``dir_path`` and everything beneath it, if the path exists."""
    if not os.path.exists(dir_path):
        # Nothing on disk at that location, so there is nothing to clean up.
        return
    shutil.rmtree(dir_path)
def downImg(nlist, nums, site_url, dir_path):
    """Download the full-size images listed in ``nlist`` into ``dir_path``.

    Args:
        nlist: Entries of the form ``[img_name, a_href, img_src]``. Each
            image is fetched from ``site_url + a_href`` and stored under
            the file name ``img_name``.
        nums: Number of leading entries of ``nlist`` to download. Values
            larger than ``len(nlist)`` are clamped; non-positive values
            download nothing.
        site_url: Prefix prepended to each ``a_href``.
        dir_path: Destination directory, created with parents if missing.

    Raises:
        requests.RequestException: If a request fails at the network level.
        OSError: If the directory or a file cannot be written.
    """
    # exist_ok avoids the exists()/mkdir() race and parents are created too.
    os.makedirs(dir_path, exist_ok=True)
    # The working directory is deliberately left alone: every file is
    # addressed through dir_path, and chdir() would break a relative one.
    print("\n正在获取原图……")
    batch = nlist[:max(nums, 0)]
    total = len(batch)
    for i, img in enumerate(batch):
        img_name = img[0]
        img_href = site_url + img[1]
        r = requests.get(img_href, timeout=30)
        if r.ok:
            with open(os.path.join(dir_path, img_name), 'wb') as f:
                f.write(r.content)
        else:
            # Never store an HTTP error page under an image file name.
            print("\n跳过 {}:HTTP {}".format(img_href, r.status_code))
        progressBar(i, total)
def downImgMicro(nlist, nums, site_url, dir_path):
    """Download the thumbnails listed in ``nlist`` into ``dir_path``.

    Args:
        nlist: Entries of the form ``[img_name, a_href, img_src]``. Each
            thumbnail is fetched from ``site_url + img_src`` and stored as
            ``'缩微图_' + img_name``.
        nums: Number of leading entries of ``nlist`` to download. Values
            larger than ``len(nlist)`` are clamped; non-positive values
            download nothing.
        site_url: Prefix prepended to each ``img_src``.
        dir_path: Destination directory, created with parents if missing.

    Raises:
        requests.RequestException: If a request fails at the network level.
        OSError: If the directory or a file cannot be written.
    """
    prefix = '缩微图_'
    # exist_ok avoids the exists()/mkdir() race and parents are created too.
    os.makedirs(dir_path, exist_ok=True)
    # The working directory is deliberately left alone: every file is
    # addressed through dir_path, and chdir() would break a relative one.
    print("\n正在获取缩微图……")
    batch = nlist[:max(nums, 0)]
    total = len(batch)
    for i, img in enumerate(batch):
        img_name = img[0]
        img_src = site_url + img[2]
        r = requests.get(img_src, timeout=30)
        if r.ok:
            with open(os.path.join(dir_path, prefix + img_name), 'wb') as f:
                f.write(r.content)
        else:
            # Never store an HTTP error page under an image file name.
            print("\n跳过 {}:HTTP {}".format(img_src, r.status_code))
        progressBar(i, total)
def progressBar(i, total):
    """Redraw the one-line progress indicator on stdout.

    Writes a carriage return, the label, ``i + 1`` block characters and the
    completed percentage rounded to a whole number, without a trailing
    newline so the next call overwrites the same line.

    Args:
        i: Zero-based index of the item that has just been completed.
        total: Total number of items. When it is not positive there is no
            meaningful percentage, so nothing is printed.
    """
    if total <= 0:
        # Avoids ZeroDivisionError for an empty batch.
        return
    done = i + 1
    percent = done / total * 100
    print('\r当前进度:{0}{1:.0f}%'.format('▉' * done, percent), end='', flush=True)
def printHead():
    """Print the start-up banner.

    Output order: a rule of 80 asterisks, the program description block,
    a second rule, and finally the "crawling in progress" notice.
    """
    rule = "*" * 80
    # Leading and trailing empty items reproduce the blank lines that frame
    # the description block.
    intro_lines = (
        "",
        "抓取单个网页图片小程序",
        "version:V1.0",
        "author:yxmichael",
        "更新时间:20210511",
        "",
    )
    print(rule)
    print("\n".join(intro_lines))
    print(rule)
    print("\n正在抓取……\n")
def main():
    """Scrape the gallery page and download every image it lists.

    Steps: print the banner, fetch and parse the page at ``site_url``,
    replace the two output directories under the current working directory
    with fresh downloads (full-size images and thumbnails), then report the
    count and elapsed time and wait for the user before returning.

    If no image could be parsed (for example because the page could not be
    fetched), the existing output directories are left untouched.
    """
    # getHtmlText() reads its request headers from this module-level global.
    global my_headers
    my_headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"}
    site_url = 'http://127.0.0.1/pg/'
    imgList = []
    start_time = time.time()
    dir_path = os.path.join(os.getcwd(), '老照片')
    dir_path_micro = dir_path + '_缩微图'

    printHead()
    html = getHtmlText(site_url)
    parseHtml(imgList, html)
    nums = len(imgList)
    if nums == 0:
        # Bail out before the cleanup below so that a failed fetch does not
        # wipe the results of an earlier successful run.
        print("\n未获取到任何图片,已保留原有目录。")
        input("请按任意键退出……")
        return

    delOldDir(dir_path)
    delOldDir(dir_path_micro)
    downImg(imgList, nums, site_url, dir_path)
    downImgMicro(imgList, nums, site_url, dir_path_micro)
    seconds = time.time() - start_time
    print("\n成功下载{}张图片,耗时:{:.1f}秒。\n保存路径{}".format(nums, seconds, dir_path))
    input("请按任意键退出……")
# Run the scraper only when executed as a script, so that importing this
# module (e.g. to reuse its helpers) does not start a download.
if __name__ == "__main__":
    main()