  • Python crawler plugin

    #coding:utf-8
    import sys,urllib2,re,Queue
    sys.path.append("..")
    
    from lib.Http_Class import Http_Class
    from BeautifulSoup import BeautifulSoup
    
    ####################################
    #
    #        Spider crawler module
    #
    ####################################
    
    class Spider_module:
        def setW3AScan(self,w3ascan):
            self.w3ascan=w3ascan
            self.result_list={}          # url -> 0 (pending) / 1 (already queued)
            self.q_list=Queue.Queue()    # work queue of URLs to fetch
            self.tmp_list=Queue.Queue()
    
        def start(self,aa):    # the "aa" argument is not used
            base_url="http://lucifr.com/"
            print "[*] Crawl target: "+base_url
            self.result_list.update({base_url:0})
            try:
                while True:
                    # Queue every URL that has not been crawled yet
                    for crawled_url in self.result_list:
                        if self.result_list[crawled_url]==0:
                            self.q_list.put(crawled_url)
                            self.result_list[crawled_url]=1
    
                    # If the task queue is empty, the crawl is done;
                    # otherwise process every queued URL
                    if self.q_list.empty():
                        print "[*] Crawl finished."
                        break
                    else:
                        for tmp in range(self.q_list.qsize()):
                            spider_url=self.q_list.get()
                            obj=Http_Class()
                            try:
                                html=obj._do("get",spider_url)
                            except:
                                self.w3ascan.log_create("url: %s failed!" % spider_url,"Spider_module")
                                print "url: %s failed!" % spider_url
                                continue
                            soup=BeautifulSoup(html)
                            links=soup.findAll('a')
                            for link in links:
                                _url=link.get('href')
                                if _url is None:
                                    continue
                                _url=_url.encode('utf-8')
                                # Skip javascript/anchor/mailto links and static resources
                                if re.match('^(javascript|:;|#|mailto)',_url) or re.search(r'\.(jpg|png|bmp|mp3|wma|wmv|gz|zip|rar|iso|pdf|txt|db)$',_url):
                                    continue
                                if re.match('^(http|https)',_url):
                                    # Absolute link: only follow it if it stays on the target site
                                    if not _url.startswith(base_url):
                                        continue
                                    if _url in self.result_list:
                                        continue
                                    print "[*][!] Found new link: "+_url
                                    self.result_list.update({_url:0})
                                else:
                                    # Relative link: join it onto the base URL
                                    rst=base_url+_url
                                    if rst in self.result_list:
                                        continue
                                    print "[*][!] Found new link: "+rst
                                    self.result_list.update({rst:0})
    
            except Exception,error:
                print "[*] Exception caught; writing it to the log."
                self.w3ascan.log_create("Url: %s get Url Error! Source: %s" % (base_url,error),"Spider_module")
    
        def save(self):
            # Placeholder: results stay in self.result_list and are not persisted yet
            print "[*] Saving crawl results"
    
    def getPluginClass():
        return Spider_module
    
    if __name__=="__main__":
        t=Spider_module()
        t.start("aaa")
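
  A note on how this plugin would be wired up: getPluginClass() suggests the module is loaded dynamically by the w3ascan framework, but the host side is not shown in the source. The sketch below is a minimal, assumed harness: W3AScanStub is a hypothetical stand-in that only implements the log_create() method the plugin calls, and the file name Spider_module.py is an assumption.

    #coding:utf-8
    # Hypothetical host harness (Python 2) for the plugin above.
    import imp

    class W3AScanStub:
        # Stand-in for the real w3ascan object; only log_create() is needed here.
        def log_create(self,msg,module_name):
            print "[log][%s] %s" % (module_name,msg)

    if __name__=="__main__":
        module=imp.load_source("Spider_module","Spider_module.py")  # assumed file name
        plugin=module.getPluginClass()()      # getPluginClass() returns the plugin class
        plugin.setW3AScan(W3AScanStub())
        plugin.start("aaa")                   # start() ignores its argument
        plugin.save()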
  • Original article: https://www.cnblogs.com/xiaoCon/p/3699052.html