zoukankan      html  css  js  c++  java
  • 乌云漏洞爬虫的数据库版本(mysql)

    特别鸣谢阮思绮同学!虽然感觉这个冷冷的博客也没人看23333

    import mysql.connector
    import sys, os
    import urllib.request
    import re
    import itertools
    # --- MySQL connection settings -------------------------------------------
    user = 'root'
    pwd = ''
    host = '127.0.0.1'
    db = 'test'

    # File name constant; nothing in the visible script reads or writes it.
    data_file = 'wooyun.dat'

    # --- SQL statements for the table that holds the scraped reports ---------
    create_table_sql = "CREATE TABLE IF NOT EXISTS mytable (id int(10) AUTO_INCREMENT PRIMARY KEY, type varchar(300) , info varchar(1000) , detail varchar(5000) , repair varchar(1000) )CHARACTER SET utf8"
    insert_sql = "INSERT INTO mytable (type, info, detail, repair) VALUES ( %s, %s, %s, %s)"
    select_sql = "SELECT id, type, info, detail, repair FROM mytable"

    # One connection and one cursor are opened as soon as the module runs;
    # the *_sql_api helpers use this module-level cursor.
    _connection_settings = dict(user=user, password=pwd, host=host, database=db)
    cnx = mysql.connector.connect(**_connection_settings)
    cursor = cnx.cursor()
    
    def create_table_sql_api(a):
        """Execute the CREATE TABLE statement *a* on the module-level cursor.

        Args:
            a: SQL text of the statement to execute.

        Raises:
            SystemExit: with status 1 when MySQL reports an error; the error
                message is printed first.
        """
        try:
            cursor.execute(a)
        except mysql.connector.Error as err:
            print("create table 'mytable' failed.")
            print("Error: {}".format(err.msg))
            # A bare sys.exit() would end the process with status 0, which
            # tells the calling shell that the failed run succeeded.
            sys.exit(1)
    
    def insert_sql_api(a,b):
        """Execute the parameterized INSERT statement *a* with the values *b*.

        Args:
            a: SQL text containing ``%s`` placeholders.
            b: sequence of values bound to the placeholders by the driver.

        Raises:
            SystemExit: with status 1 when MySQL reports an error; the error
                message is printed first.
        """
        try:
            cursor.execute(a, b)
        except mysql.connector.Error as err:
            print("insert table 'mytable' failed.")
            print("Error: {}".format(err.msg))
            # A bare sys.exit() would end the process with status 0, which
            # tells the calling shell that the failed run succeeded.
            sys.exit(1)
    
    def select_sql_api(a):
        """Execute the SELECT statement *a* and print every returned row.

        Each row is unpacked as five columns in the order
        (id, type, info, detail, repair), so *a* must select exactly those.

        Args:
            a: SQL text of the query to execute.

        Raises:
            SystemExit: with status 1 when MySQL reports an error; the error
                message is printed first.
        """
        try:
            cursor.execute(a)
            # Local names avoid shadowing the builtins ``id`` and ``type``.
            for (row_id, bug_type, info, detail, repair) in cursor:
                # The previous format string had four placeholders for five
                # values: ``detail`` was printed under the "repair" label and
                # the real ``repair`` column was silently dropped.
                print("ID:{}  type:{}  info:{}  detail:{}  repair:{}".format(
                    row_id, bug_type, info, detail, repair))
        except mysql.connector.Error as err:
            print("query table 'mytable' failed.")
            print("Error: {}".format(err.msg))
            # Exit non-zero so the failure is visible to the calling shell.
            sys.exit(1)
    
    def get_html_response(url):
        """Fetch *url* and return its body decoded as UTF-8 text.

        Args:
            url: URL string (or ``urllib.request.Request``) to open.

        Returns:
            The full response body as ``str``.

        Raises:
            urllib.error.URLError: if the URL cannot be opened.
            UnicodeDecodeError: if the body is not valid UTF-8.
        """
        # The context manager closes the response even if read() or decode()
        # raises; previously the response object was never closed.
        with urllib.request.urlopen(url) as response:
            return response.read().decode('utf-8')
    
    def geturl(starturl):
        """Return the bug-report paths found in the page at *starturl*.

        Args:
            starturl: URL of a page to download and scan.

        Returns:
            A list with every substring of the page that matches
            ``/bugs/wooyun-<word chars>-<word chars>``, in page order and
            possibly containing duplicates.
        """
        page = get_html_response(starturl)
        # The pattern had lost its backslashes ("w*" instead of "\w*"), so it
        # could only match runs of the literal letter "w".
        return re.findall(r'/bugs/wooyun-\w*-\w*', page)
    
    def get_nextpage(starturl):
        """Return the search-result page links found in the page at *starturl*.

        Args:
            starturl: URL of a search-result page to download and scan.

        Returns:
            A list with every substring of the page of the form
            ``searchbug.php?q=6YeR6J6N&pNO=<page>``, in page order and possibly
            containing duplicates.
        """
        page = get_html_response(starturl)
        # "." and "?" must be escaped to match literally: unescaped, "php?q"
        # means "ph", an optional "p", then "q", so the real link text never
        # matched. "\w+" (instead of a single character) keeps multi-digit
        # page numbers intact.
        # NOTE(review): assumes the page writes the separator as a plain "&",
        # not "&amp;" - confirm against the live HTML.
        return re.findall(r'searchbug\.php\?q=6YeR6J6N&pNO=\w+', page)
    
    # ---------------------------------------------------------------------------
    # Main script: collect bug-report links from the search result pages, scrape
    # each report and store one row per report in `mytable`.
    # ---------------------------------------------------------------------------
    starturl = "http://www.wooyun.org/searchbug.php?q=6YeR6J6N"
    _BASE_URL = 'http://wooyun.org/'

    # Ordered (pattern, replacement) clean-up rules for each scraped field.
    # NOTE(review): the published listing had lost its regex backslashes
    # ("\s" -> "s", "\t" -> a literal tab); they are restored below. The ASCII
    # colons are kept exactly as found - confirm the live pages do not use the
    # full-width colon instead.
    _TYPE_SUBS = (
        (r':\s', ':'),
        (r'\t', ''),
        (r'</h3>', ''),
    )
    _INFO_SUBS = (
        (r':\s', ':'),
        (r'<h3>', ''),
        (r'</h3>', ''),
        (r'<a\shref=".*.">', ''),
        (r'</a>', ''),
        # NOTE(review): kept as found; "<img\sheight=" may have been intended.
        (r'<imgheight=".*./>', ''),
    )
    _DETAIL_SUBS = (
        (r':\s', ':'),
        (r'<p\sclass="detail">', ''),
        (r'</p>', ''),
        (r'"\starget="_blank"><img\ssrc="/upload/.*.width="600"/></a>', ','),
        (r'<a href="', ' http://www.wooyun.org'),
        (r'对本漏洞信息进行评价,.*.备学习价值', ''),
    )
    _REPAIR_SUBS = (
        (r'</br>', ','),
        (r'</p>', ','),
        (r'</h3>', ','),
        (r'<p\sclass="detail">', ''),
        # NOTE(review): the listing showed an EMPTY pattern here, which would
        # insert ":" between every character. It is reconstructed as a
        # carriage return on the assumption that the escape was stripped like
        # the others - confirm against the author's original script.
        (r'\r', ':'),
    )


    def _apply_subs(text, substitutions):
        """Apply each (pattern, replacement) regex substitution to *text* in order."""
        for pattern, replacement in substitutions:
            text = re.sub(pattern, replacement, text)
        return text


    def _drop_whitespace(text):
        """Return *text* with every run of whitespace removed."""
        return "".join(text.split())


    def _extract_record(page):
        """Return the (type, info, detail, repair) strings scraped from *page*.

        Every match of a field's pattern is cleaned with that field's
        substitution table and the cleaned matches are concatenated. A field
        with no match yields an empty string. Whitespace is removed from the
        info and repair fields only.
        """
        type_str = "".join(
            _apply_subs(match, _TYPE_SUBS)
            for match in re.findall(r'漏洞类型:.*.</h3>', page)
        )
        # Matches every <h3> heading of the form "<h3>label:text</h3>".
        info_str = "".join(
            _drop_whitespace(_apply_subs(match, _INFO_SUBS))
            for match in re.findall(r'<h3>\w*:.*.</h3>', page)
        )
        detail_str = "".join(
            _apply_subs(match, _DETAIL_SUBS)
            for match in re.findall(r'<p class="detail">.*.</p>', page)
        )
        repair_str = "".join(
            _drop_whitespace(_apply_subs(match, _REPAIR_SUBS))
            for match in re.findall(
                r'修复方案:</h3>\s*<p class="detail">.*.\s*</p>', page)
        )
        return type_str, info_str, detail_str, repair_str


    try:
        create_table_sql_api(create_table_sql)

        # Collect the bug-report paths from every result page; a set removes
        # reports that are linked more than once.
        result = set()
        for page_link in get_nextpage(starturl):
            result.update(
                geturl(_BASE_URL + page_link.replace('金融', '6YeR6J6N')))

        for bug_path in result:
            # Matched paths start with "/", which would otherwise produce
            # "http://wooyun.org//bugs/...".
            bug_url = _BASE_URL + bug_path.replace(
                '金融', '%E9%87%91%E8%9E%8D').lstrip('/')
            insert_sql_api(insert_sql, _extract_record(get_html_response(bug_url)))
            # NOTE(review): this prints the whole table after every insert, so
            # output grows quadratically; kept to preserve the existing output.
            select_sql_api(select_sql)

        cnx.commit()
    finally:
        # Release the cursor and connection even when a download fails or one
        # of the helpers exits the process.
        cursor.close()
        cnx.close()
    因为弱小,所以要变强,因为不想灭亡,所以选择战斗
  • 相关阅读:
    冲刺博客 五
    冲刺博客 四
    冲刺第一天
    软件工程概论第十周学习进度
    软件工程概论第九周学习进度
    找水王
    软件工程概论第八周学习进度
    软件工程概论第七周学习进度
    四则运算最终版
    二维数组最大值
  • 原文地址:https://www.cnblogs.com/cmjason/p/3918978.html
Copyright © 2011-2022 走看看