Scraping the images from http://www.mzitu.com/all
import urllib.request
import re
import os

# Both mzitu.com pages and the image host check these headers,
# so every request carries a Referer and a browser User-Agent.
HEADERS = {
    'Referer': 'http://www.mzitu.com/all/',
    'User-Agent': ('Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'),
}

def getHtml(url):  # fetch a page (or an image) and return the raw bytes
    req = urllib.request.Request(url, headers=HEADERS)
    res = urllib.request.urlopen(req)
    return res.read()

def getImgUrl(url):  # extract the image URLs from one gallery page
    html = getHtml(url).decode('utf-8')
    reg = re.compile(r'<img.+?src="(.+?\.jpg)"')
    imgUrl = re.findall(reg, html)
    return imgUrl  # list of .jpg URLs found on that page

def download(url):  # download one image, using the last path segment as the file name
    filename = url.split('/')[-1]
    with open(filename, 'wb') as f:
        f.write(getHtml(url))

def save_all(folder='mm'):  # crawl the entry page, then every linked page, saving all images
    os.makedirs(folder, exist_ok=True)
    os.chdir(folder)

    url = 'http://www.mzitu.com/all/'        # crawler entry point
    html = getHtml(url).decode('utf-8')      # HTML of the entry page

    # Every <a href="..."> on the entry page; this also picks up
    # navigation links, not only the gallery pages.
    reg = re.compile(r'<a.+?href="(.+?)"')
    hrlist = re.findall(reg, html)
    print(hrlist)

    for each in hrlist:
        imgUrl_list = getImgUrl(each)        # image URLs on that page
        for imgUrl in imgUrl_list:
            download(imgUrl)

if __name__ == '__main__':
    save_all()
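Before pointing the crawler at the live site, it may help to see what the two regular expressions actually capture. The HTML fragment below is invented purely for illustration (the real mzitu.com markup may differ), but it shows the extraction step in isolation:

import re

# A made-up fragment standing in for a crawled page; the real pages differ,
# but the regex extraction works the same way.
sample = '''
<a href="http://www.mzitu.com/12345">gallery</a>
<img class="lazy" src="http://img.example.com/2018/01/01a01.jpg" alt="demo">
'''

link_re = re.compile(r'<a.+?href="(.+?)"')
img_re = re.compile(r'<img.+?src="(.+?\.jpg)"')

print(re.findall(link_re, sample))  # ['http://www.mzitu.com/12345']
print(re.findall(img_re, sample))   # ['http://img.example.com/2018/01/01a01.jpg']

Regexes are fine for a throwaway script like this one; for anything more involved, an HTML parser (the standard library's html.parser, or BeautifulSoup) is usually less brittle than pattern matching on raw markup.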