zoukankan html css js c++ java

python多线程实现抓取网页

Python实现抓取网页

下面这个用 Python 抓取网页的程序比较基础，只能抓取起始页面中的 URL 所指向的页面；不过只要预先给定的 URL 足够多，就能保证可以一直抓取下去。以下是代码：

##coding:utf-8
'''
	无限抓取网页
	@author wangbingyu
	@date 2014-06-26
'''
import re
import sys
import threading
import time
import urllib

try:
    import thread  # Python 2 name
except ImportError:
    import _thread as thread  # Python 3 renamed the module

try:
    from urllib.request import urlopen, urlretrieve  # Python 3
except ImportError:
    from urllib import urlopen, urlretrieve  # Python 2

'''
创建下载线程类
'''
class download(threading.Thread):
    """Worker thread that scrapes one seed page and saves every page it links to.

    Until ``stop()`` is called, ``run()`` keeps re-reading the same seed URL,
    extracting the absolute links found in its ``<a>`` tags and downloading
    each of them to disk.
    """

    # Destination template for saved pages; ``%s`` receives a unique stamp.
    # NOTE(review): this path looks like it lost its directory separators
    # somewhere along the way -- confirm the intended folder. The runtime
    # value is unchanged; override the attribute to redirect the output.
    save_pattern = 'E:uploaddownload\\%s.html'

    # Captures the href value (double- or single-quoted) of an <a> tag.
    # Whitespace is required after "<a" so <abbr>/<article> do not match, and
    # before "href" so attributes such as data-href do not match.
    _anchor_href = re.compile(
        r'<a\s(?:[^>]*?\s)?href\s*=\s*(?:"([^"]*)"|\'([^\']*)\')', re.I)

    def __init__(self, url, threadName):
        """Create the worker.

        Args:
            url: Seed page whose links will be downloaded.
            threadName: Name given to the underlying thread.
        """
        threading.Thread.__init__(self, name=threadName)
        self.thread_stop = False
        self.url = url

    def run(self):
        """Scrape the seed page and download its links until stopped."""
        while not self.thread_stop:
            try:
                self.list = self.getUrl(self.url)
            except Exception as ex:
                # The seed page could not be fetched: report it and end the
                # thread cleanly instead of dying with a raw traceback.
                print('%s :getUrl %s' % (type(ex).__name__, ex))
                self.stop()
                break
            self.downloading(self.list)

    def stop(self):
        """Ask ``run()`` to exit once its current pass has finished."""
        self.thread_stop = True

    def downloading(self, list):
        """Save every URL in ``list`` to a file built from ``save_pattern``.

        Each URL is attempted independently, so one failure does not prevent
        the remaining URLs from being downloaded.

        Args:
            list: Sequence of absolute URLs to download.
        """
        for position, link in enumerate(list):
            # A timestamp alone is not unique (str() of a float is coarse on
            # Python 2), so add the thread name and the position in the batch
            # to keep files from overwriting each other.
            stamp = '%s_%s_%d' % (time.time(), self.name, position)
            try:
                urlretrieve(link, self.save_pattern % stamp)
            except Exception as ex:
                # Best-effort boundary: log the failure and carry on.
                print('%s _upload: %s' % (type(ex).__name__, ex))

    def getUrl(self, url):
        """Return the absolute http(s) links found in the ``<a>`` tags of ``url``.

        Args:
            url: Address of the page to scan.

        Returns:
            A list of link strings, in document order; relative links are
            skipped.

        Raises:
            Whatever ``urlopen`` raises when the page cannot be fetched.
        """
        response = urlopen(url)
        try:
            page = response.read()
        finally:
            response.close()
        if not isinstance(page, str):
            # Python 3 returns bytes; URLs are ASCII, so a lossy decode of
            # pages in other encodings still leaves the links intact.
            page = page.decode('utf-8', 'replace')

        result = []
        for double_quoted, single_quoted in self._anchor_href.findall(page):
            link = (double_quoted or single_quoted).strip()
            if link.startswith(('http://', 'https://')):
                result.append(link)
        return result

if __name__ == '__main__':
    # Start one downloader thread per seed page, wait for the user to press
    # Enter, then ask every thread to stop so the process can exit.
    seed_urls = ['http://www.baidu.com','http://www.qq.com','http://www.taobao.com','http://www.sina.com.cn']
    workers = []
    for index, seed in enumerate(seed_urls):
        worker = download(seed, 'thread%s' % index)
        worker.start()
        workers.append(worker)

    # Python 2's input() evaluates whatever is typed (and fails on a bare
    # Enter), so prefer raw_input() there; Python 3 only has input().
    try:
        wait_for_enter = raw_input
    except NameError:
        wait_for_enter = input

    try:
        wait_for_enter()
    finally:
        # The threads are not daemons: without this the interpreter would
        # never exit, because each worker loops until stop() is called.
        for worker in workers:
            worker.stop()

查看全文

相关阅读:
201671010410 冯婷秀实验三作业互评与改进
读《构建之法》感想
实验十四团队项目评审&课程学习总结
201671010412 郭佳实验四附加实验
201671010412 郭佳英文文本统计分析
201671010412 郭佳实验二软件工程个人项目
201671010412 郭佳实验三作业互评与改进
在阅读《现代软件工程—构建之法》后的思考问题
金生芳-实验十四团队项目评审&课程学习总结
实验四附加实验-项目互评

原文地址：https://www.cnblogs.com/ldxsuanfa/p/10951568.html