"""Download image sets from two article-list sites into per-title folders.

Two scrapers are provided:

* ``download_doutula`` walks the paginated article list on www.doutula.com.
* ``download_adoutu`` reads a single article-list page on www.adoutu.com.

Each article entry on a list page yields a title and a list of image URLs;
the images are stored in a folder named after the cleaned-up title.
"""
import os
import re
from string import punctuation

import requests
# Importing etree directly raised an error (a version issue), so it is
# reached through lxml.html instead.
from lxml import html

etree = html.etree  # expose the etree API under its usual name

# Seconds to wait for any single HTTP response before giving up.
REQUEST_TIMEOUT = 10

# Browser-like request headers (copied from a real browser session; inspect
# the request headers in the browser's developer tools to obtain them).
DOUTULA_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36'
}
ADOUTU_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4356.6 Safari/537.36'
}


def _clean_title(raw_title: str) -> str:
    """Return *raw_title* with every non-word character removed.

    The result is used as a folder name, so punctuation and whitespace are
    dropped.
    """
    # NOTE(review): the source had the pattern as a bare 'W', which would only
    # delete the letter "W". Its inline note describes the argument as a
    # regular expression, so r'\W' is presumed to be the intent -- confirm.
    return re.sub(r'\W', '', raw_title)


def _save_images(image_urls, title, name_delimiter, target_root, headers):
    """Download every URL in *image_urls* into ``<target_root>/<title>/``.

    Args:
        image_urls: iterable of image URL strings (may be empty).
        title: already-cleaned folder name for this article.
        name_delimiter: the file name is the part of the URL after the last
            occurrence of this string.
        target_root: directory under which the per-title folder is created.
        headers: HTTP headers sent with each image request.
    """
    folder = os.path.join(target_root, title)
    # The folder is created even when there are no images. exist_ok replaces
    # the separate "does it exist?" check and missing parents are created too.
    os.makedirs(folder, exist_ok=True)

    for url in image_urls:
        # Keep only the last path segment so a '/' left over after splitting
        # on the delimiter cannot point into a folder that does not exist.
        pic_name = url.split(name_delimiter)[-1].rsplit('/', 1)[-1]
        if not pic_name:
            # The URL ended with the delimiter: there is nothing to name the
            # file after, so skip it rather than fail on open().
            continue
        # .content is the raw response body as bytes.
        pic_content = requests.get(
            url, headers=headers, timeout=REQUEST_TIMEOUT
        ).content
        with open(os.path.join(folder, pic_name), 'wb') as out_file:
            out_file.write(pic_content)


def download_doutula(first_page: int = 1, last_page: int = 3,
                     target_root: str = r'E:\img') -> None:
    """Download the images of every article listed on doutula.com.

    Args:
        first_page: first list page to fetch (inclusive).
        last_page: last list page to fetch (inclusive).
        target_root: directory that receives one sub-folder per article.
    """
    for page in range(first_page, last_page + 1):
        source = requests.get(
            'https://www.doutula.com/article/list/?page=' + str(page),
            headers=DOUTULA_HEADERS,
            timeout=REQUEST_TIMEOUT,
        ).text
        for entry in etree.HTML(source).xpath('//*[@id="home"]/div/div[2]/a'):
            pics = entry.xpath('div[2]/div/img/@data-original')
            titles = entry.xpath('div[1]/text()')
            if not titles:
                # No title text means no folder name; skip this entry.
                continue
            # File name: the part of the image URL after the last '_'.
            _save_images(pics, _clean_title(titles[0]), '_',
                         target_root, DOUTULA_HEADERS)


def download_adoutu(list_url: str = 'http://www.adoutu.com/article/list/2',
                    target_root: str = r'E:\img1') -> None:
    """Download the images of every article on one adoutu.com list page.

    Args:
        list_url: URL of the article-list page to scrape.
        target_root: directory that receives one sub-folder per article.
    """
    source = requests.get(
        list_url, headers=ADOUTU_HEADERS, timeout=REQUEST_TIMEOUT
    ).text
    # An earlier attempt selected '//div[@class="article-img-list row"]' and
    # read 'div/img/@src' from it; the selector below is the one that was kept.
    for entry in etree.HTML(source).xpath('//div[@class="item-content"]'):
        pics = entry.xpath('div//div/img/@src')
        titles = entry.xpath('div//div/span/text()')
        if not titles:
            # No title text means no folder name; skip this entry.
            continue
        # File name: the part of the image URL after the last '0'.
        _save_images(pics, _clean_title(titles[0]), '0',
                     target_root, ADOUTU_HEADERS)


if __name__ == '__main__':
    # NOTE(review): both scrapers were disabled (commented out) in the source.
    # Only the later one is run here; call download_doutula() as well if the
    # older site should still be scraped.
    download_adoutu()