边学边写代码,记录下来。这段代码用于批量抓取主站下所有子网页中符合特定尺寸要求的图片文件,支持中断。
原理很简单:使用BeautifulSoup4分析网页,获取网页<a/>和<img/>元素,对<a/>集合反复弹栈入栈,对<img/>集合进行筛选下载。
具体代码如下:

import os
import sys
import time
import urllib.request
from threading import Thread
from urllib.parse import urldefrag, urljoin, urlparse

from bs4 import BeautifulSoup

# Files (relative to the working directory) that hold the crawl state so an
# interrupted run can be resumed: one URL per line.
PENDING_FILE = 'tmpUrls'   # pages still waiting to be analysed
DONE_FILE = 'tmpUrld'      # pages already analysed

PAGE_RETRIES = 3           # extra attempts after a page fails to load
IMAGE_RETRIES = 3          # total attempts per image
MIN_WIDTH = 250            # declared <img width> threshold
MIN_HEIGHT = 350           # declared <img height> threshold
MIN_BYTES = 20 * 1000      # skip icons / link graphics at or below this size
MAX_NAME_LENGTH = 100      # keep alt-derived file names well below OS limits
_UNSAFE_NAME_CHARS = '/\\\0\r\n\t'

# ---------------------------------------------------------------------------
# Multi-threaded download alternative, part 1 (disabled).
# Gives every image its own download thread. Parts 2 and 3 are shown as
# comments inside _download_page_images().
#
# class Download(Thread):
#     def __init__(self, url, filepath):
#         Thread.__init__(self)
#         self.url = url
#         self.filepath = filepath
#
#     def run(self):
#         length = 0
#         try:
#             opener = urllib.request.build_opener()
#             opener.addheaders = [('User-agent', 'Mozilla/5.0')]
#             urlhandle = opener.open(self.url, timeout=30)
#             urlhead = urlhandle.info()
#             if 'Content-Length' in urlhead:
#                 length = int(urlhead['Content-Length'])
#             data = urlhandle.read(10 * 1024)
#             while data:
#                 with open(self.filepath, 'ab+') as wf:
#                     wf.write(data)
#                 data = urlhandle.read(10 * 1024)
#         except Exception as ex:
#             print(self.url + ' ' + '× ' + str(ex))
#             try:
#                 # Download did not finish in time: delete the partial file
#                 # and record the image URL in the not-downloaded list.
#                 os.remove(self.filepath)
#                 with open('/home/maple/Desktop/bad', 'a') as badFile:
#                     badFile.write(self.url + '\n')
#             except:
#                 pass
# ---------------------------------------------------------------------------


def _load_url_list(path, default):
    """Return the URLs stored one per line in *path*, or a copy of *default*.

    *default* is used only when the file does not exist; an existing but
    empty file yields an empty list. Blank lines are ignored.
    """
    if not os.path.exists(path):
        return list(default)
    with open(path, 'r') as state_file:
        stripped = (line.strip() for line in state_file)
        return [url for url in stripped if url]


def _save_url_list(path, urls):
    """Write *urls* to *path*, one per line, replacing previous content."""
    with open(path, 'w') as state_file:
        state_file.writelines(url + '\n' for url in urls)


def _parse_dimension(value):
    """Convert a width/height attribute to int; 0 means unknown.

    Missing attributes and non-integer values such as '100%' map to 0
    instead of raising and aborting the whole crawl.
    """
    try:
        return int(value)
    except (TypeError, ValueError):
        return 0


def _safe_name(text):
    """Turn alt text into a file-name stem that stays inside the image dir."""
    cleaned = ''.join('_' if ch in _UNSAFE_NAME_CHARS else ch for ch in text)
    return cleaned.strip()[:MAX_NAME_LENGTH]


def _unique_path(filepath):
    """Return *filepath*, or 'name(1).ext', 'name(2).ext', ... if taken."""
    stem, ext = os.path.splitext(filepath)
    candidate = filepath
    serial = 1
    while os.path.exists(candidate):
        candidate = '{}({}){}'.format(stem, serial, ext)
        serial += 1
    return candidate


def _fetch_page(page_url):
    """Download *page_url* and return it parsed as a BeautifulSoup tree.

    The raw bytes are handed to BeautifulSoup so it can detect the page
    encoding itself rather than assuming UTF-8. Network and HTTP errors
    propagate to the caller.
    """
    with urllib.request.urlopen(page_url, timeout=5) as response:
        html = response.read()
    return BeautifulSoup(html, 'html.parser')


def _download_image(image_url, filepath, width, height, bad_file):
    """Download one image to *filepath* if it passes the size filters.

    An image is considered when its declared width is unknown (0) or at
    least MIN_WIDTH, or its declared height is unknown (0) or at least
    MIN_HEIGHT. It is saved only when the Content-Length header reports
    more than MIN_BYTES. Failed requests are retried with a growing
    timeout; after the last failure the URL is appended to *bad_file*.

    Returns True when the image was saved, False otherwise.
    """
    if not ((width == 0 or width >= MIN_WIDTH)
            or (height == 0 or height >= MIN_HEIGHT)):
        return False

    for attempt in range(IMAGE_RETRIES):
        try:
            # The timeout grows with every retry: 5s, 15s, 25s.
            with urllib.request.urlopen(
                    image_url, timeout=5 + attempt * 10) as image_handle:
                length = 0
                image_head = image_handle.info()
                if 'Content-Length' in image_head:
                    length = int(image_head['Content-Length'])
                print(image_url + ' ==== SIZE:{}*{} -- {}KB'.format(
                    width, height, length / 1000))
                if length <= MIN_BYTES:
                    # Too small (or size unknown): a final decision, so do
                    # not request the same image again.
                    return False
                image_data = image_handle.read()
            with open(filepath, 'wb') as image_file:
                image_file.write(image_data)
            print('√')
            return True
        except Exception as ex:
            if attempt < IMAGE_RETRIES - 1:
                continue
            print('× ' + str(ex))
            # Remove a partial file if one was created, then record the URL.
            # The two steps are independent so that a missing file does not
            # prevent the URL from being logged.
            if os.path.exists(filepath):
                try:
                    os.remove(filepath)
                except OSError:
                    pass
            try:
                with open(bad_file, 'a') as bad_log:
                    bad_log.write(image_url + '\n')
            except OSError as log_error:
                print('× ' + str(log_error))
    return False


def _download_page_images(soup, page_url, image_dir, bad_file):
    """Download every qualifying <img> of the parsed page into *image_dir*.

    File names come from the 'alt' attribute when it is usable (this suits
    the example site; other sites may not provide it), otherwise from a
    per-page counter. Existing files are never overwritten.
    """
    counter = 1
    for tag in soup.find_all('img'):
        attrs = tag.attrs
        src = attrs.get('src')
        if not src:
            continue
        # Resolve relative sources against the page they appear on.
        image_url = urljoin(page_url, src.strip())
        parts = urlparse(image_url)
        if parts.scheme not in ('http', 'https'):
            continue  # e.g. inline data: URIs
        # Take the extension from the URL path only, ignoring any query.
        extension = os.path.splitext(parts.path)[1]

        name = _safe_name(str(attrs.get('alt', '')))
        if not name:
            name = str(counter)
            counter += 1
        target = _unique_path(os.path.join(image_dir, name + extension))

        _download_image(
            image_url,
            target,
            _parse_dimension(attrs.get('width')),
            _parse_dimension(attrs.get('height')),
            bad_file,
        )
        # Multi-threaded alternative, part 2 (disabled): instead of the
        # blocking call above, start one background thread per image and
        # collect the threads in a list named `tasks`.
        #     task = Download(image_url, target)
        #     task.daemon = True
        #     task.start()
        #     tasks.append(task)
        # Part 3 (disabled): after 10 threads have been started, wait for
        # all of them before continuing.
        #     if len(tasks) >= 10:
        #         while any(task.is_alive() for task in tasks):
        #             time.sleep(2)
        #         tasks = []


def _same_site_links(soup, page_url):
    """Yield absolute http(s) links on the page that stay on its host.

    Fragments are dropped so 'page#a' and 'page#b' count as one page.
    """
    host = urlparse(page_url).netloc
    for anchor in soup.find_all('a'):
        href = anchor.attrs.get('href')
        if not href:
            continue
        target = urldefrag(urljoin(page_url, href.strip()))[0]
        parts = urlparse(target)
        if parts.scheme in ('http', 'https') and parts.netloc == host:
            yield target


def maple(root, image_dir='/home/maple/Desktop/images/',
          bad_file='/home/maple/Desktop/bad'):
    """Crawl the site at *root* and download its larger images.

    Pages are taken from a stack of pending URLs. For each page the images
    are downloaded and every not-yet-seen link on the same host is pushed
    onto the stack. Pressing Ctrl-C stores the pending and finished page
    lists in PENDING_FILE and DONE_FILE; when those files exist the next
    call resumes from them instead of starting at *root*.

    Args:
        root: URL of the start page.
        image_dir: Directory that receives the images; created if missing.
        bad_file: Text file that collects the URLs of images which could
            not be downloaded, one per line.
    """
    pending = _load_url_list(PENDING_FILE, [root])
    done = _load_url_list(DONE_FILE, [])
    known = set(pending)       # O(1) "already queued or analysed?" test
    known.update(done)
    os.makedirs(image_dir, exist_ok=True)

    retries_left = PAGE_RETRIES
    in_progress = None         # page being processed, for a clean interrupt
    try:
        while pending:
            page_url = pending.pop()
            in_progress = page_url
            done.append(page_url)
            print('=================== Current Page: ' + page_url
                  + ' =======================')
            try:
                soup = _fetch_page(page_url)
            except Exception as ex:
                # Loading failed: retry the page, or give up on it once the
                # retry budget is used up.
                print(ex)
                done.pop()
                if retries_left > 0:
                    pending.append(page_url)
                    retries_left -= 1
                else:
                    # Forget the page so a later link may queue it again.
                    known.discard(page_url)
                    retries_left = PAGE_RETRIES
                in_progress = None
                continue

            retries_left = PAGE_RETRIES   # fresh budget for the next page
            _download_page_images(soup, page_url, image_dir, bad_file)

            for link in _same_site_links(soup, page_url):
                if link not in known:
                    known.add(link)
                    pending.append(link)
            in_progress = None
    except KeyboardInterrupt:
        # Ctrl-C: put the unfinished page back on the stack and persist both
        # lists so the next run can resume.
        if in_progress is not None:
            if done and done[-1] == in_progress:
                done.pop()
            if in_progress not in pending:
                pending.append(in_progress)
        _save_url_list(PENDING_FILE, pending)
        _save_url_list(DONE_FILE, done)


if __name__ == '__main__':
    print(""" +++++++++++++++++++++++ version: python3.4 +++++++++++++++++=++++ """)
    # Example site (it has a great many pages and images, so a complete run
    # takes a long time).
    url = 'http://www.msnzx.com/'
    maple(url)
这段代码某些细节部分是专门针对 http://www.msnzx.com/ 这个站点的,下载其他站点数据仅需要微调一下就行了。其中分析网页直接使用了强大的第三方模块BeautifulSoup4,方便快捷。下载图片的实现方式实在太多,上述代码中包含了2种下载方式:
1、直接使用urllib.request读写流一次性下载,下载任意文件时程序都是阻塞的。这种方式适合下载size较小的图片。图片要么完全下载,要么完全不下载(得到的本地文件size = 0),网络条件不佳的时候可以捕获超时异常记录未成功下载的图片url。
2、以多线程的方式下载,为每个图片资源分配一个下载线程。上述程序的注释部分即是多线程下载代码。这种方式下载迅速,就算网络不佳,也能下载到图片的部分内容。
另外还有很多下载方式,如单独调用其他模块(如urllib.request中的urlretrieve,之前文章中实现的文件多线程下载模块download)或者系统工具如wget,curl等。这种直接调用的方式能够为每一个图片分配多线程进行下载。实现方式也最简单。