zoukankan      html  css  js  c++  java
  • Python 爬虫修正

    总感觉有什么地方出错了。这爬虫总是不通用。。

    #coding:utf-8
    import Queue,re,urllib2
    import threading
    from BeautifulSoup import BeautifulSoup
    
    # 目标
    task_url="http://www.baidu.com/"
    # 结果队列
    result_list={}
    result_list.update({task_url:0})
    spider_list=Queue.Queue(10)
    
    class ThreadPool(object):
        """Fixed-size pool of worker threads fed from an unbounded Queue.

        Tasks are (callback, args) pairs.  Workers sleep on a Condition
        until add_task() enqueues work; join() drains the queue and then
        signals every worker to exit.
        """
        def __init__(self, size):
            # Unbounded task queue; each item is a (callback, args) tuple.
            self._queue = Queue.Queue()
            # Workers wait on this Condition until a task arrives or exit is requested.
            self._data_ready = threading.Condition()
            # Set once, by join(), to tell idle workers to terminate.
            self._exit_flag = threading.Event()
            self._threads = []
            for i in range(size):
                t = threading.Thread(target=self._run, name=str(i))
                t.start()
                self._threads.append(t)

        def add_task(self, callback, *args):
            """Enqueue callback(*args) and wake one idle worker."""
            self._queue.put((callback, args))
            with self._data_ready:
                self._data_ready.notify()

        def join(self):
            """Block until every queued task has run, then shut the pool down."""
            # Queue.join() returns only after task_done() was called once
            # per queued item, so no work is lost when the flag is set.
            self._queue.join()
            self._exit_flag.set()
            with self._data_ready:
                # Wake every worker so it observes the exit flag and returns.
                self._data_ready.notify_all()
            for t in self._threads:
                t.join()

        def _run(self):
            """Worker loop: wait for a task, run it, repeat until exit is flagged."""
            while True:
                with self._data_ready:
                    # Re-check the predicate after each wakeup: another
                    # worker may already have taken the task.
                    while self._queue.empty() and not self._exit_flag.is_set():
                        self._data_ready.wait()
                    if self._exit_flag.is_set():
                        break
                    # Safe: queue is non-empty and the Condition lock is held.
                    cb, args = self._queue.get_nowait()

                # Run the task outside the lock so other workers can proceed.
                cb(*args)
                self._queue.task_done()
    
    def spr_url(url):
        try:
            body_text=urllib2.urlopen(url).read()
            soup=BeautifulSoup(body_text)
            links=soup.findAll('a')
            for link in links:
                _url=link.get('href').encode('utf-8')
                if re.match('^(javascript|:;|#|mailto)',_url) or _url is None or re.match('.(jpg|png|bmp|mp3|wma|wmv|gz|zip|rar|iso|pdf|txt|db)$',_url):
                    continue
                if re.match('^(http|https)',_url):
                    if not re.match('^'+url,_url):
                        continue
                    else:
                        if result_list.has_key(_url):
                            continue
                        else:
                            rst=_url.encode('utf-8')
                            print "[*][!] 发现连接:"+rst
                            result_list.update({rst:0})
                else:
                    if result_list.has_key(url+_url):
                        continue
                    else:
                        rst=url+_url
                        print "[*][!] 发现新连接: "+rst.encode('utf-8')
                        result_list.update({rst.encode('utf-8'):0})
        except Exception,error:
            print error
    
    
    while True:
        # 查看目标有多少个待爬
        for url in result_list:
            if result_list[url]==0:
                # 每次放入10个任务到任务池,控制数量
                if not spider_list.full():
                    spider_list.put(url)
    
        # 判断队列是否还有任务,如果有则爬,没有则任务结束
        if spider_list.empty():
            print "Spider is Finish!"
            for r_item in result_list:
                print "-------- Sprider Results -----------"
                print "URl: " + r_item
                print "------------------------------------"
            break
        else:
            thr=ThreadPool(10)
            # 取出URL 并且爬虫,结果放入result_list
            for num in range(spider_list.qsize()):
                thr.add_task(spr_url,spider_list.get())
            thr.join()
  • 相关阅读:
    学点 C 语言(40): 函数 多参函数
    存取 ListBox 列表 回复 "徐强" 的问题
    博客园RSS订阅汇总
    博客园电子期刊2012年2月刊发布啦
    上周热点回顾(3.5–3.11)
    博客园电子期刊2012年3月刊发布啦
    上周热点回顾(3.26–4.1)
    上周热点回顾(3.19–3.25)
    上周热点回顾(4.2–4.8)
    上周热点回顾(2.27–3.4)
  • 原文地址:https://www.cnblogs.com/xiaoCon/p/3699229.html
Copyright © 2011-2022 走看看