Python实现抓取网页
以下的Python抓取网页的程序比較0基础,仅仅能抓取第一页的url所属的页面,仅仅要预定URL足够多。保证你抓取的网页是无限级别的哈,以下是代码:
##coding:utf-8
'''
无限抓取网页
@author wangbingyu
@date 2014-06-26
'''
import sys,urllib,re,thread,time,threading
'''
创建下载线程类
'''
class download(threading.Thread):
def __init__(self,url,threadName):
threading.Thread.__init__(self,name=threadName)
self.thread_stop = False
self.url = url
def run(self):
while not self.thread_stop:
self.list = self.getUrl(self.url)
self.downloading(self.list)
def stop(self):
self.thread_stop = True
def downloading(self,list):
try:
for i in range(len(list) - 1):
urllib.urlretrieve(list[i],'E:uploaddownload\%s.html' % time.time())
except Exception,ex:
print Exception,'_upload:',ex
def getUrl(self,url):
result = []
s = urllib.urlopen(url).read();
ss = s.replace(' ','')
urls=re.findall('<a.*?href=.*?</a>',ss,re.I)
for i in urls:
tmp = i.split('"')
try:
if tmp[1]:
if re.match(r'http://.*',tmp[1]):
result.append(tmp[1])
except Exception,ex:
print Exception,":getUrl",ex
return result
if __name__ == '__main__':
list = ['http://www.baidu.com','http://www.qq.com','http://www.taobao.com','http://www.sina.com.cn']
for i in range(len(list)):
#print list[i]
download(list[i],'thread%s' % i).start()
#list = ['http://www.baidu.com','http://www.sina.com.cn']
#obj = download('http://www.baidu.com','threadName')
#obj.start();
input()