# -*-coding:utf-8-*-
from lxml import etree
import requests
import sys, os, re
class Meizitu(object):
    """Download the images linked from one meizitu.com listing page.

    Workflow (see ``run``): fetch a listing page, extract the detail-page
    URLs from it, collect every ``<img src>`` found on those detail pages,
    then save each image into the local ``images`` directory using a running
    integer as the file name.
    """

    def __init__(self):
        # Running counter; its value is the file name of the next saved image.
        self.i = 1
        # Listing-page URL template; ``run`` replaces it with the concrete
        # URL of the page requested by the user.
        self.url = 'http://www.meizitu.com/a/more_{}.html'
        # Headers sent with every request (browser-like User-Agent).
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"
        }
        # Detail-page URL pattern. ``\d+`` matches the numeric page id and the
        # dots are escaped so they only match a literal '.'; the previous
        # pattern used a bare ``d+`` and therefore never matched numeric ids.
        self.pattern = re.compile(r'http://www\.meizitu\.com/a/\d+\.html')
        # Every image URL collected so far, in discovery order, no duplicates.
        self.img_url = []

    def get_page(self, url, timeout=10):
        """Fetch ``url`` and return the raw response body as bytes.

        :param url: absolute URL to request.
        :param timeout: seconds to wait for the server before giving up;
            without it ``requests`` may block indefinitely.
        :raises requests.RequestException: on connection problems, timeouts
            or a 4xx/5xx status code.
        """
        response = requests.get(url, headers=self.headers, timeout=timeout)
        # Fail loudly on HTTP errors so an error page is never mistaken for
        # real page or image content.
        response.raise_for_status()
        return response.content

    def parse_data(self, detail_data):
        """Extract the detail-page URLs from a listing page.

        :param detail_data: raw listing-page body (bytes); undecodable bytes
            are ignored.
        :returns: list of matching URLs in order of first appearance, with
            repeated URLs removed so each detail page is visited only once.
        """
        text = detail_data.decode('utf-8', 'ignore')
        seen = set()
        unique_urls = []
        for page_url in self.pattern.findall(text):
            if page_url not in seen:
                seen.add(page_url)
                unique_urls.append(page_url)
        return unique_urls

    def pic_info(self, image_list):
        """Visit each detail page and collect the image URLs found on it.

        New image URLs are appended to ``self.img_url`` (already known URLs
        are skipped). Detail pages that cannot be fetched are reported and
        skipped instead of aborting the whole run.

        :param image_list: iterable of detail-page URLs.
        :returns: list of the image URLs newly added by this call.
        """
        new_urls = []
        # Set mirror of self.img_url for constant-time duplicate checks.
        known = set(self.img_url)
        for url in image_list:
            try:
                img_page = self.get_page(url)
            except requests.RequestException as err:
                print('skipping page {}: {}'.format(url, err))
                continue
            html = etree.HTML(img_page)
            if html is None:
                continue
            for src in html.xpath('//img/@src'):
                if src in known:
                    continue
                known.add(src)
                self.img_url.append(src)
                new_urls.append(src)
        return new_urls

    def download(self):
        """Save every URL in ``self.img_url`` as ``images/<n>.jpg``.

        ``<n>`` is ``self.i``, which is advanced after each file written.
        Images that cannot be fetched are reported and skipped.
        """
        if not os.path.exists('images'):
            os.makedirs('images')
        for url in self.img_url:
            print(url)
            try:
                data = self.get_page(url)
            except requests.RequestException as err:
                print('skipping image {}: {}'.format(url, err))
                continue
            target = os.path.join('images', str(self.i) + '.jpg')
            with open(target, 'wb') as f:
                f.write(data)
            self.i = self.i + 1

    def run(self):
        """Ask for a listing-page number, then scrape and download it."""
        page = input('输入页码')
        self.url = 'http://www.meizitu.com/a/more_{}.html'.format(page)
        print(self.url)
        detail_data = self.get_page(self.url)
        # Extract the detail-page URLs, gather their images, then save them.
        image_list = self.parse_data(detail_data)
        self.pic_info(image_list)
        self.download()
# Script entry point: build the scraper and start the interactive run.
if __name__ == '__main__':
    scraper = Meizitu()
    scraper.run()