# 自学, 不足之处还请大佬不吝指导,在此谢过.
import os
import re
from urllib.parse import urljoin

import urllib3
from lxml import etree
from requests_html import HTMLSession
# Listing page whose ".img" elements are scraped.
url = "https://www.q.com/feature/travel/2527.html"

# Headers sent with every request (listing page and image downloads).
# NOTE(review): 'br' is advertised in Accept-Encoding; presumably that only
# decodes correctly when a brotli package is installed -- confirm.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36 Edg/90.0.818.56',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6'
}

# Directory the downloaded images are written to.
SAVE_DIR = "D:/Pictures/OP/"

# Seconds to wait on any single HTTP request before giving up.
REQUEST_TIMEOUT = 30

# Captures everything after "/imgbuy/" in a detail-page link.
_IMGBUY_PATTERN = re.compile(r"/imgbuy/(.+?)$")

# Characters that cannot appear in a Windows file name, plus control chars.
_UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')


def extract_page_name(link):
    """Return the part of *link* after ``/imgbuy/``, or ``None``.

    Example: ``/imgbuy/105-0128.html`` -> ``105-0128.html``.
    ``None`` is returned for a missing link or one without that segment.
    """
    if not link:
        return None
    match = _IMGBUY_PATTERN.search(link)
    return match.group(1) if match else None


def build_image_name(alt, page_name):
    """Combine the image's alt text with its page name.

    The page name is appended so that images sharing the same (or an empty)
    alt text do not overwrite each other on disk.
    """
    return (alt or "未取到标题") + page_name


def to_safe_filename(name):
    """Return ``name + ".jpg"`` with path-unsafe characters replaced by ``_``."""
    return _UNSAFE_FILENAME_CHARS.sub("_", name) + ".jpg"


def resolve_image_url(src, base=url):
    """Resolve a possibly relative or protocol-relative *src* against *base*."""
    return urljoin(base, src)


def download_image(session, item):
    """Download the image described by one ``.img`` element.

    Args:
        session: the HTTP session used for the download.
        item: a requests_html element expected to contain ``a > img``.

    Returns:
        ``True`` if the image was written to ``SAVE_DIR``, ``False`` if the
        element was skipped or the download failed (a message is printed).
    """
    anchors = item.xpath('.//a')
    pictures = item.xpath('.//a/img')
    if not anchors or not pictures:
        print("跳过:未找到链接或图片节点")
        return False

    # Detail-page link, e.g. /imgbuy/105-0128.html
    link = anchors[0].attrs.get('href')
    page_name = extract_page_name(link)
    if page_name is None:
        print("跳过:无法解析链接 %s" % link)
        return False

    img_attrs = pictures[0].attrs
    title = build_image_name(img_attrs.get('alt', "未获取到标题"), page_name)

    # The image address is read from the 'lowsrc' attribute.
    lowsrc = img_attrs.get('lowsrc')
    if not lowsrc:
        print("跳过:%s 没有 lowsrc 图片地址" % title)
        return False
    src = resolve_image_url(lowsrc)
    print(title, src)

    target = os.path.join(SAVE_DIR, to_safe_filename(title))
    try:
        response = session.get(src, headers=header, timeout=REQUEST_TIMEOUT)
        # Do not save an HTTP error page under a .jpg name.
        response.raise_for_status()
        with open(target, "wb") as fp:
            fp.write(response.content)
    except Exception as msg:
        # Deliberate best effort: one failed image must not stop the rest.
        print("下载中出现异常:%s" % str(msg))
        return False
    return True


def main():
    """Render the listing page and download every image found in ``.img``."""
    # TLS verification is switched off below, so silence the warnings urllib3
    # would otherwise emit.
    # NOTE(review): verify=False disables certificate checking -- confirm this
    # is really required for the target site.
    urllib3.disable_warnings()
    os.makedirs(SAVE_DIR, exist_ok=True)

    session = HTMLSession()
    session.verify = False
    try:
        r = session.get(url, headers=header, timeout=REQUEST_TIMEOUT)
        try:
            # render() is equivalent to opening the page in a browser; the
            # first call downloads Chromium automatically.
            r.html.render()
            # Same as $('.img') in the browser console.
            for item in r.html.find('.img'):
                download_image(session, item)
        finally:
            r.close()
    finally:
        session.close()


if __name__ == "__main__":
    main()