zoukankan      html  css  js  c++  java
  • Python 爬虫修正

    总感觉有什么地方出错了。这爬虫总是不通用。。

    #coding:utf-8
    import Queue,re,urllib2
    import threading
    from BeautifulSoup import BeautifulSoup
    
    # 目标
    task_url="http://www.baidu.com/"
    # 结果队列
    result_list={}
    result_list.update({task_url:0})
    spider_list=Queue.Queue(10)
    
    class ThreadPool(object):
        """Fixed-size pool of worker threads fed from an unbounded Queue.

        Tasks are (callback, args) pairs.  Workers sleep on a Condition
        until add_task() enqueues work; join() drains the queue and then
        signals every worker to exit.
        """
        def __init__(self, size):
            # Unbounded task queue; each item is a (callback, args) tuple.
            self._queue = Queue.Queue()
            # Workers wait on this Condition until a task arrives or exit is requested.
            self._data_ready = threading.Condition()
            # Set once, by join(), to tell idle workers to terminate.
            self._exit_flag = threading.Event()
            self._threads = []
            for i in range(size):
                t = threading.Thread(target=self._run, name=str(i))
                t.start()
                self._threads.append(t)

        def add_task(self, callback, *args):
            """Enqueue callback(*args) and wake one idle worker."""
            self._queue.put((callback, args))
            with self._data_ready:
                self._data_ready.notify()

        def join(self):
            """Block until every queued task has run, then shut the pool down."""
            # Queue.join() returns only after task_done() was called once
            # per queued item, so no work is lost when the flag is set.
            self._queue.join()
            self._exit_flag.set()
            with self._data_ready:
                # Wake every worker so it observes the exit flag and returns.
                self._data_ready.notify_all()
            for t in self._threads:
                t.join()

        def _run(self):
            """Worker loop: wait for a task, run it, repeat until exit is flagged."""
            while True:
                with self._data_ready:
                    # Re-check the predicate after each wakeup: another
                    # worker may already have taken the task.
                    while self._queue.empty() and not self._exit_flag.is_set():
                        self._data_ready.wait()
                    if self._exit_flag.is_set():
                        break
                    # Safe: queue is non-empty and the Condition lock is held.
                    cb, args = self._queue.get_nowait()

                # Run the task outside the lock so other workers can proceed.
                cb(*args)
                self._queue.task_done()
    
    def spr_url(url):
        try:
            body_text=urllib2.urlopen(url).read()
            soup=BeautifulSoup(body_text)
            links=soup.findAll('a')
            for link in links:
                _url=link.get('href').encode('utf-8')
                if re.match('^(javascript|:;|#|mailto)',_url) or _url is None or re.match('.(jpg|png|bmp|mp3|wma|wmv|gz|zip|rar|iso|pdf|txt|db)$',_url):
                    continue
                if re.match('^(http|https)',_url):
                    if not re.match('^'+url,_url):
                        continue
                    else:
                        if result_list.has_key(_url):
                            continue
                        else:
                            rst=_url.encode('utf-8')
                            print "[*][!] 发现连接:"+rst
                            result_list.update({rst:0})
                else:
                    if result_list.has_key(url+_url):
                        continue
                    else:
                        rst=url+_url
                        print "[*][!] 发现新连接: "+rst.encode('utf-8')
                        result_list.update({rst.encode('utf-8'):0})
        except Exception,error:
            print error
    
    
    while True:
        # 查看目标有多少个待爬
        for url in result_list:
            if result_list[url]==0:
                # 每次放入10个任务到任务池,控制数量
                if not spider_list.full():
                    spider_list.put(url)
    
        # 判断队列是否还有任务,如果有则爬,没有则任务结束
        if spider_list.empty():
            print "Spider is Finish!"
            for r_item in result_list:
                print "-------- Sprider Results -----------"
                print "URl: " + r_item
                print "------------------------------------"
            break
        else:
            thr=ThreadPool(10)
            # 取出URL 并且爬虫,结果放入result_list
            for num in range(spider_list.qsize()):
                thr.add_task(spr_url,spider_list.get())
            thr.join()
  • 相关阅读:
    学点 C 语言(40): 函数 多参函数
    存取 ListBox 列表 回复 "徐强" 的问题
    博客园RSS订阅汇总
    博客园电子期刊2012年2月刊发布啦
    上周热点回顾(3.5–3.11)
    博客园电子期刊2012年3月刊发布啦
    上周热点回顾(3.26–4.1)
    上周热点回顾(3.19–3.25)
    上周热点回顾(4.2–4.8)
    上周热点回顾(2.27–3.4)
  • 原文地址:https://www.cnblogs.com/xiaoCon/p/3699229.html
Copyright © 2011-2022 走看看