  • Python spider plugin

    #coding:utf-8
    import sys,urllib2,re,Queue
    sys.path.append("..")

    from lib.Http_Class import Http_Class
    from BeautifulSoup import BeautifulSoup

    ####################################
    #
    # Spider crawler module
    #
    ####################################

    class Spider_module:
        def setW3AScan(self,w3ascan):
            self.w3ascan=w3ascan
            self.result_list={}
            self.q_list=Queue.Queue()
            self.tmp_list=Queue.Queue()

        def start(self,aa):
            url="http://lucifr.com/"
            print "[*] Spider target: "+url
            self.result_list.update({url:0})
            try:
                while True:
                    # Queue every URL that has not been crawled yet (value 0)
                    # and mark it as queued/crawled (value 1).
                    for known_url in self.result_list.keys():
                        if self.result_list[known_url]==0:
                            self.q_list.put(known_url)
                            self.result_list[known_url]=1

                    # If the task queue is empty, the crawl is finished;
                    # otherwise process the queued URLs.
                    if self.q_list.empty():
                        print "[*] Spider task finished."
                        break
                    else:
                        for tmp in range(self.q_list.qsize()):
                            spider_url=self.q_list.get()
                            obj=Http_Class()
                            try:
                                html=obj._do("get",spider_url)
                            except:
                                self.w3ascan.log_create("url: %s Failed!" % spider_url,"Spider_module")
                                print "url: %s Failed!" % spider_url
                                continue
                            soup=BeautifulSoup(html)
                            links=soup.findAll('a')
                            for link in links:
                                _url=link.get('href')
                                # Skip anchors without href, pseudo links and static resources.
                                if _url is None:
                                    continue
                                _url=_url.encode('utf-8')
                                if re.match('^(javascript|:;|#|mailto)',_url) or re.search('\.(jpg|png|bmp|mp3|wma|wmv|gz|zip|rar|iso|pdf|txt|db)$',_url):
                                    continue
                                if re.match('^(http|https)',_url):
                                    # Absolute link: keep it only if it stays on the target site.
                                    if not re.match('^'+url,_url):
                                        continue
                                    elif self.result_list.has_key(_url):
                                        continue
                                    else:
                                        rst=_url
                                        print "[*][!] Found new link: "+rst
                                        self.result_list.update({rst:0})
                                else:
                                    # Relative link: prepend the base URL.
                                    if self.result_list.has_key(url+_url):
                                        continue
                                    else:
                                        rst=url+_url
                                        print "[*][!] Found new link: "+rst
                                        self.result_list.update({rst:0})

            except Exception,error:
                print "[*] Exception caught, writing it to the log."
                self.w3ascan.log_create("Url: %s get Url Error! Source: %s" % (url,error),"Spider_module")

        def save(self):
            print "[*] Saving spider results"

    def getPluginClass():
        return Spider_module

    if __name__=="__main__":
        t=Spider_module()
        t.setW3AScan(None)  # no scanner host in standalone runs; this just initializes the queues and result dict
        t.start("aaa")

  • Original article: https://www.cnblogs.com/L-H-R-X-hehe/p/3813120.html