import requests
from lxml import etree
from furl import furl
# Page whose markup is fetched and parsed below.
url = 'https://dsd.com'
# A timeout keeps the script from blocking forever on an unresponsive host;
# 5 seconds matches the value download() already uses.
html = requests.get(url, timeout=5).text
# Regex alternative for pulling image links straight out of the raw text:
#   re.findall('"objURL":"(.*?)",', html, re.S)
element = etree.HTML(html)
# XPath cheat sheet (placeholder values translated to English):
#   //div/img/@src
#   li[contains(@title, 'province')]
#   [@href and @lmv='TV series']
#   [@href|@lmv]
#   item[@company_name='" + strArray[0] + "' and @already_sent='0']
#   xpath('//div[contains(@class,"a") and contains(@class,"b")]')
#   //div[contains(concat(' ', @class, ' '), 'demo')]
# One entry per <img> found under the "reader-container" div; each entry is
# the list of text nodes directly inside that <img>.
# NOTE(review): <img> elements normally hold no text, so these lists are
# probably empty - confirm whether an attribute such as ./@src was intended.
imgs = [
    img.xpath('./text()')
    for img in element.xpath('//div[@class="reader-container"]/div//img')
]
# Sample slide markup: the first image exposes its address through "src",
# the second through "data-src".
html = '''<div class="mod flow-ppt-mod">
<div class="page-1 ppt-page-item batch-50-1" id="pageNo-1">
<div class="ppt-image-wrap ppt-16-9">
<img src="https://sdsd.com?pn=1" alt="">
</div>
</div>
<div class="page-2 ppt-page-item batch-50-1" id="pageNo-2">
<div class="ppt-image-wrap ppt-16-9">
<img data-src="https://sdsd.com?pn=2">
</div>
</div>'''
element = etree.HTML(html)
# XPath cheat sheet (placeholder values translated to English):
#   //div/img/@src
#   li[contains(@title, 'province')]
#   [@href and @lmv='TV series']
#   [@href|@lmv]
#   item[@company_name='" + strArray[0] + "' and @already_sent='0']
#   xpath('//div[contains(@class,"a") and contains(@class,"b")]')
#   //div[contains(concat(' ', @class, ' '), 'demo')]


def _collect_image_urls(tree):
    """Return the src and data-src values of every <img> below a <div>.

    Images are visited in the order the XPath query returns them; for each
    image the "src" values come first, followed by the "data-src" values.
    """
    found = []
    for node in tree.xpath('//div//img'):
        found.extend(node.xpath('./@src'))
        found.extend(node.xpath('./@data-src'))
    return found


urls = _collect_image_urls(element)
def download(url, save_dir=r'C:\Users\Semi-Luy\Desktop\ppt'):
    """Download the image at *url* and save it as ``<pn>.jpg`` in *save_dir*.

    The file name is taken from the ``pn`` query parameter of *url*.

    Args:
        url: Address of the image; it must carry a ``pn`` query parameter.
        save_dir: Directory the image is written to. Defaults to the
            location this script originally hard-coded.

    Returns:
        None. When the request fails, a message is printed and nothing is
        written.

    Raises:
        OSError: If the target file cannot be opened or written.
        The lookup of ``pn`` also fails (presumably with KeyError - confirm
        against furl) when *url* has no such query parameter.
    """
    try:
        pic = requests.get(url, timeout=5)
    except requests.exceptions.RequestException:
        # Covers connection errors as well as the timeouts that timeout=5
        # can trigger. Return here: without a response there is nothing to
        # save, and falling through would hit an unbound `pic`.
        print('图片无法下载')
        return
    # Path the image is saved to. furl parses the query string, replacing a
    # manual split on '?', '&' and '='.
    f = furl(url)
    path = save_dir + '\\' + f.args['pn'] + '.jpg'
    # The context manager closes the file even if the write fails.
    with open(path, 'wb') as fp:
        fp.write(pic.content)
# Announce the start, then fetch every collected image address in order.
print("开始下载图片:\r\n")
for url in urls:
    # Echo the address before attempting the download.
    print(url)
    download(url)