zoukankan      html  css  js  c++  java
  • python爬虫 分页获取图片并下载

    --刚接触 Python 才两天，想快速上手，就写了个爬虫。写完之后成就感暴增，用起来也顺手多了。

    奋斗

    1.源代码

    #coding=utf-8
    import urllib
    import re
    class Page(object):
        """Fetch one page, plus optional numbered follow-up pages, and process the URLs found.

        Every fetched page is run through a ``Filter`` built from ``regex``.
        The resulting URL list can be downloaded, appended to ``list.txt``,
        and/or handed to a custom callback.

        Network access is imported inside the methods so this class works with
        both the Python 2 ``urllib`` layout the script was written for and the
        Python 3 ``urllib.request`` layout.
        """

        # Exactly the attributes assigned in __init__. The previous tuple listed
        # names that were never set and omitted the ones that are.
        __slots__ = ('url', 'filter', 'outpath', 'download', 'write', 'pagin')

        def __init__(self, url, regex, arg=None):
            """Configure the crawler.

            url   -- address of the first page; also used to resolve relative links.
            regex -- pattern whose first group captures a link on each page.
            arg   -- optional dict of options:
                     'custom'   callable(arr, prefix) run on every page's URL list
                     'outpath'  string prepended verbatim to every output file name
                     'download' truthy to download every matched URL
                     'write'    truthy to append matched URLs to list.txt
                     'pagin'    dict with 'start', 'end' (inclusive) and 'rule',
                                where 'rule' contains the '{page}' placeholder
            """
            # Copy so the caller's dict is never modified; None and {} both
            # mean "all defaults".
            options = dict(arg) if arg else {}
            self.filter = Filter(url, {
                'regex': regex,
                'custom': options.get('custom', ''),
            })
            self.url = url
            self.outpath = options.get('outpath', '')
            self.download = options.get('download', False)
            self.write = options.get('write', False)
            self.pagin = options.get('pagin', False)

        def start(self, *prefix):
            """Process the first page, then every page of the pagination range.

            The first positional argument, if given, is the prefix used for the
            first page (default '1'). Paginated pages use their page number as
            prefix.
            """
            first_prefix = prefix[0] if prefix else '1'
            self.getHtml(self.url, first_prefix)
            if not self.pagin:
                return
            rule = self.pagin['rule']
            for page in range(self.pagin['start'], self.pagin['end'] + 1):
                self.getHtml(rule.replace('{page}', str(page)), str(page))

        def down(self, url, prefix):
            """Download ``url`` to ``outpath + '<prefix>_<last path segment>'``.

            Failures are reported and skipped so one broken link does not stop
            the crawl.
            """
            try:
                from urllib.request import urlretrieve  # Python 3
            except ImportError:
                from urllib import urlretrieve  # Python 2
            try:
                filename = str(prefix) + '_' + url.rsplit('/', 1)[-1]
                urlretrieve(url, self.outpath + filename)
            except Exception as err:
                # Not a bare except: Ctrl-C must still be able to stop the crawl.
                print('download->failed\t%s (%s)' % (url, err))
                return
            print('download-succeed\t-> ' + filename)

        def downs(self, arr, prefix):
            """Download every URL in ``arr`` using the same prefix."""
            for item in arr:
                self.down(item, prefix)

        def writeFile(self, arr):
            """Append ``arr`` to ``outpath + 'list.txt'``, one URL per line.

            Each call starts its batch with two newlines so batches are separated
            by a blank line.
            """
            with open(self.outpath + 'list.txt', 'a') as out:
                out.write('\n\n' + '\n'.join(arr))

        def getHtml(self, url, prefix):
            """Fetch ``url``, extract its links and apply the configured actions.

            Any error while fetching or processing the page is reported and the
            page is skipped.
            """
            try:
                from urllib.request import urlopen  # Python 3
            except ImportError:
                from urllib import urlopen  # Python 2
            try:
                response = urlopen(url)
                try:
                    raw = response.read()
                finally:
                    # Close the connection even when read() fails.
                    response.close()
                # Pages are decoded as gb2312; undecodable bytes are replaced
                # so a single stray byte does not discard the whole page.
                html = raw.decode("gb2312", "replace")
                if not isinstance(html, str):
                    # Python 2: keep handing UTF-8 byte strings to the filter,
                    # as the original code did.
                    html = html.encode("utf8")
                arr = self.filter.execute(html, prefix)
                if self.download:
                    self.downs(arr, prefix)
                if self.write:
                    self.writeFile(arr)
            except Exception as err:
                print("catch finally exception. %s (%s)" % (url, err))
    
    class Filter(object):
        """Extract links from HTML with a regex and turn them into absolute URLs."""

        def __init__(self, url, arg):
            """Remember the base URL and the options.

            url -- page address; its scheme and host are used to resolve
                   relative links.
            arg -- dict with 'regex' (the first group captures the link) and an
                   optional 'custom' callable(arr, prefix).
            """
            self.arg = arg
            self.url = url

        def _getDomain(self):
            """Return 'scheme://host' of the base URL.

            Takes the first and third '/'-separated pieces, so the base URL must
            have the form 'scheme://host[/...]'.
            """
            parts = self.url.split('/')
            return parts[0] + '//' + parts[2]

        def _getRealUrl(self, domain, url):
            """Resolve ``url`` against ``domain`` ('scheme://host')."""
            # Prefix test, not substring test: a relative link that merely
            # contains 'http://' in its query string is still relative.
            if url.startswith(('http://', 'https://')):
                return url
            # Protocol-relative link: reuse only the scheme of the base URL.
            if url.startswith('//'):
                return domain.split('//', 1)[0] + url
            if url.startswith('/'):
                return domain + url
            # Anything else, including an empty match, is treated as a path
            # directly under the host.
            return domain + '/' + url

        def execute(self, html, prefix):
            """Return the absolute URLs matched in ``html``.

            If a 'custom' callable is configured, it is called with the URL list
            and ``prefix`` before the list is returned.
            """
            options = self.arg
            domain = self._getDomain()  # computed once, not once per match
            arr = [
                self._getRealUrl(domain, match.group(1))
                for match in re.finditer(options['regex'], html)
            ]
            custom = options.get('custom')
            if custom:
                custom(arr, prefix)
            return arr
    
    def paginList(arr, prefix, outpath='f:/temp/',
                  regex=r'<p><img\ssrc="(.*?)"\salt.*?</p>'):
        """Crawl every detail page in ``arr`` and download the images matched on it.

        Has the (arr, prefix) shape of a ``Page`` 'custom' callback.

        arr     -- detail-page URLs.
        prefix  -- prefix of the listing page; each detail page gets
                   '<prefix>_<n>' with n counting from 1.
        outpath -- string prepended verbatim to every downloaded file name.
        regex   -- pattern whose first group captures the image address. The
                   default restores the whitespace escapes that were missing
                   from the pattern (it read '<imgssrc=', which cannot match an
                   '<img src=...>' tag).
        """
        for num, detail_url in enumerate(arr, 1):
            Page(detail_url, regex, {
                'download': True,
                'outpath': outpath,
            }).start(prefix + '_' + str(num))
    
    # Entry point: crawl the landscape-wallpaper listing. The first listing page
    # is the base URL itself; pages 2..10 follow the index_{page}.htm rule.
    # Every link captured on a listing page is handed to paginList.
    # The pattern is a raw string on a single line: the whitespace escapes it
    # had lost are restored and the stray line break inside the literal removed.
    Page(
        "http://www.netbian.com/fengjing/",
        r'<li><a\shref="(.*?)"\s.*?\salt="(.*?)"\s.*?</li>',
        {
            'custom': paginList,
            'pagin': {
                'start': 2,
                'end': 10,
                'rule': 'http://www.netbian.com/fengjing/index_{page}.htm',
            },
        },
    ).start()



    2.执行如下

    $ python getjpg.py
    download-succeed        -> 1_1_1bdbc1d1628a1f0ebd5fc60055ee506e.jpg
    download-succeed        -> 1_2_01b5b45171979aace617ab79299d7515.jpg
    download-succeed        -> 1_3_5698c42371add40501a328ef2c753b4d.jpg
    download-succeed        -> 1_4_f7219087ce29c474a777867b8e4755ed.jpg
    download-succeed        -> 1_5_58bf8172ea8bbc4cee0a0f8240f2b289.jpg
    download-succeed        -> 1_6_b4700f4bd96f90039ed662ebbf6c1f7c.jpg
    download-succeed        -> 1_7_8a637b3362acddac4671d9ad02e4a93f.jpg
    download-succeed        -> 1_8_f28e22908b68d6fbe42a15c4fcd62613.jpg
    download-succeed        -> 1_9_03806c0b3d33cfc3a3eb4ea3bbe8ca9e.jpg
    download-succeed        -> 1_10_cf26fb246e9b57c06e328af94e60450b.jpg
    download-succeed        -> 1_11_7563610f39bd29b8381201b95eed2624.jpg
    download-succeed        -> 1_12_8ccaccede13d0f377d0d8822243f3b6a.jpg
    download-succeed        -> 1_13_c95a0207db67a334be4812cec25d7023.jpg
    download-succeed        -> 1_14_71ce070aef91660e8dad60a5919ec505.jpg
    download-succeed        -> 1_15_9a647a8f449cdb3208a561b4c9fe2ce6.jpg
    download-succeed        -> 1_16_45d9992e3d5080cf14ef73da14066283.jpg
    download-succeed        -> 1_17_7bd84ee7d6f5cb911a3b1dbc6e0775c4.jpg
    download-succeed        -> 1_18_8397b9d434a187444c389ebff48bcfb5.jpg
    download-succeed        -> 2_1_f14e658f2464769756039e1ff18d5693.jpg
    download-succeed        -> 2_2_ad051a669008969800ccd324de056465.jpg
    download-succeed        -> 2_3_6190ffe369199b95274100996b02359a.jpg
    download-succeed        -> 2_4_f14dce28d960941781a12a57123076df.jpg
    download-succeed        -> 2_5_c7fb3b6f700339e9f3c9ee02474211eb.jpg
    download-succeed        -> 2_6_327f1a33b8c5989a2d014ea41565caef.jpg

    ...


    3.结果如下


  • 相关阅读:
    linux打开window文件出现乱码 修正
    深入理解计算机系统 第2章 信息的表示和处理
    算法导论 第六章 堆排序 习题6.58 k路合并排序
    python 的二进制、八进制、十六进制数表示
    算法导论 第二章 合并排序-分治算法
    在Emacs中用ibus输入法切换输入法问题
    python round(x[, n])函数
    算法导论 第六章 堆排序
    普林斯顿算法课第四周作业_8Puzzle
    webshell 提权思路
  • 原文地址:https://www.cnblogs.com/yjbjingcha/p/7219607.html
Copyright © 2011-2022 走看看