Something always feels wrong with this. The crawler never generalizes to other sites...
#coding:utf-8
import Queue
import re
import threading
import urllib2
from BeautifulSoup import BeautifulSoup

# Target to crawl
task_url = "http://www.baidu.com/"
# Result map: url -> 0 (not yet crawled) / 1 (crawled)
result_list = {task_url: 0}
result_lock = threading.Lock()   # guards result_list across worker threads
spider_list = Queue.Queue(10)

class ThreadPool(object):
    def __init__(self, size):
        self._queue = Queue.Queue()
        self._data_ready = threading.Condition()
        self._exit_flag = threading.Event()
        self._threads = []
        for i in range(size):
            t = threading.Thread(target=self._run, name=str(i))
            t.start()
            self._threads.append(t)

    def add_task(self, callback, *args):
        self._queue.put((callback, args))
        with self._data_ready:
            self._data_ready.notify()

    def join(self):
        self._queue.join()        # wait until every queued task is done
        self._exit_flag.set()     # then tell the workers to exit
        with self._data_ready:
            self._data_ready.notify_all()
        for t in self._threads:
            t.join()

    def _run(self):
        while True:
            with self._data_ready:
                while self._queue.empty() and not self._exit_flag.is_set():
                    self._data_ready.wait()
                if self._exit_flag.is_set():
                    break
                cb, args = self._queue.get_nowait()
            cb(*args)             # run the task outside the condition lock
            self._queue.task_done()

def spr_url(url):
    try:
        body_text = urllib2.urlopen(url).read()
        soup = BeautifulSoup(body_text)
        for link in soup.findAll('a'):
            _url = link.get('href')
            # Check for None BEFORE calling .encode(); <a> tags without
            # an href would otherwise crash the worker.
            if _url is None:
                continue
            _url = _url.encode('utf-8')
            # Skip pseudo-links and binary/static resources. The extension
            # test needs re.search, not re.match: match() only anchors at
            # the start of the string, so a $-anchored pattern never fires
            # on a normal URL.
            if re.match(r'^(javascript|:;|#|mailto)', _url) \
               or re.search(r'\.(jpg|png|bmp|mp3|wma|wmv|gz|zip|rar|iso|pdf|txt|db)$', _url):
                continue
            if re.match(r'^https?', _url):
                # Stay on the same site. startswith() avoids feeding the
                # base URL to the regex engine, where '.' and '?' are
                # metacharacters.
                if not _url.startswith(url):
                    continue
                rst = _url
            else:
                rst = url + _url
            with result_lock:
                # result_list is shared by all workers, so the
                # test-and-insert has to be atomic.
                if rst not in result_list:
                    print "[*][!] found link: " + rst
                    result_list[rst] = 0
    except Exception, error:
        print error

while True:
    # Queue every URL not yet crawled and mark it as taken. This was the
    # main bug: nothing ever flipped the 0 to 1, so the same URLs were
    # re-queued every pass and the loop never terminated.
    for url in result_list.keys():
        if result_list[url] == 0 and not spider_list.full():
            result_list[url] = 1
            spider_list.put(url)
    # If the queue stayed empty there is nothing left to crawl: report and stop.
    if spider_list.empty():
        print "Spider is finished!"
        print "-------- Spider Results ------------"
        for r_item in result_list:
            print "URL: " + r_item
        print "------------------------------------"
        break
    else:
        thr = ThreadPool(10)
        # Hand every queued URL to the pool; workers add new links to result_list.
        for num in range(spider_list.qsize()):
            thr.add_task(spr_url, spider_list.get())
        thr.join()
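The filter bug is easy to verify in isolation. Here is a minimal, self-contained check in the same Python 2 as above; the URL is made up purely for the demo:

import re

u = "http://example.com/pic.jpg"       # hypothetical URL, for illustration only
# re.match() only attempts a match at position 0, so a $-anchored
# extension pattern almost never fires on a normal URL:
print re.match('.(jpg|png)$', u)       # None
# re.search() scans the whole string and finds the suffix:
print re.search(r'\.(jpg|png)$', u)    # <_sre.SRE_Match object at 0x...>

That is why the original code happily queued .jpg and .zip links: the intended blacklist never matched anything, so the crawler's behavior depended entirely on what file types the target site happened to link to.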