zoukankan      html  css  js  c++  java
  • 乌云爬虫分项、参数化、优化

    import mysql.connector
    import sys, os
    import urllib.request
    import re
    import itertools
    import base64
    
    # Search keyword -- this is the only value that has to be edited per run.
    search_item = '金融'
    # The keyword is UTF-8 encoded to bytes first ...
    bytesString = search_item.encode("utf-8")
    # ... and then base64 encoded (the result is still a bytes object).
    encodestr = base64.b64encode(bytesString)
    
    # MySQL connection settings.
    user = 'root'
    pwd = ''
    host = '127.0.0.1'
    db = 'test'
    data_file = 'wooyun.dat'

    # Table definition: an auto-increment id plus one text column per scraped
    # field.  The statement is assembled from adjacent string literals inside
    # parentheses; a plain double-quoted literal cannot span several lines.
    create_table_sql = (
        "CREATE TABLE IF NOT EXISTS mytable ("
        "id int(10) AUTO_INCREMENT PRIMARY KEY, "
        "serial_number_sql varchar(100), title_sql varchar(100), "
        "loophole_type_sql varchar(100), industry_sql varchar(100), "
        "author_sql varchar(100), yield_time_sql varchar(100), "
        "loophole_mood_sql varchar(100), hazard_rating_sql varchar(100), "
        "reveal_mood_sql varchar(200), "
        "detail_sql varchar(5000), repair_sql varchar(2000), "
        "path_sql varchar(50)) "
        "CHARACTER SET utf8"
    )

    # Parameterised insert: twelve %s placeholders, one per non-id column, in
    # the same order as the column list.
    insert_sql = (
        "INSERT INTO mytable (serial_number_sql, title_sql, loophole_type_sql, "
        "industry_sql, author_sql, yield_time_sql, loophole_mood_sql, "
        "hazard_rating_sql, reveal_mood_sql, detail_sql, repair_sql, path_sql) "
        "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
    )
    
    # Open the database connection with the settings above and create the
    # cursor that the SQL helper functions use as a module-level global.
    cnx = mysql.connector.connect(user=user, password=pwd, host=host, database=db)
    cursor = cnx.cursor()
    
    def create_table_sql_api(a):
        """Execute the table-creation statement ``a`` on the module-level cursor.

        Args:
            a: SQL text of the statement to execute.

        Raises:
            SystemExit: with exit status 1 if MySQL rejects the statement; the
                error message is printed first.
        """
        try:
            cursor.execute(a)
        except mysql.connector.Error as err:
            print("create table 'mytable' failed.")
            print("Error: {}".format(err.msg))
            # A bare sys.exit() would report success (status 0) to the shell.
            sys.exit(1)
    
    def insert_sql_api(a,b):
        """Execute the parameterised statement ``a`` with values ``b``.

        Args:
            a: SQL text containing ``%s`` placeholders.
            b: Sequence of values bound to the placeholders.

        Raises:
            SystemExit: with exit status 1 if MySQL rejects the statement; the
                error message is printed first.
        """
        try:
            cursor.execute(a, b)
        except mysql.connector.Error as err:
            print("insert table 'mytable' failed.")
            print("Error: {}".format(err.msg))
            # A bare sys.exit() would report success (status 0) to the shell.
            sys.exit(1)
    
    # Make sure the destination table exists before any scraping starts.
    create_table_sql_api(create_table_sql)

    # Page-navigation state: the first search-result URL (the keyword is passed
    # base64 encoded in the "q" parameter) and the accumulator lists.
    starturl = "http://www.wooyun.org/searchbug.php?q=" + encodestr.decode()
    loophole, nextpage, result = [], [], []
    
    def get_html_response(url):
        """Fetch ``url`` and return its body decoded as UTF-8 text.

        Args:
            url: Address to download.

        Returns:
            The response body as ``str``.
        """
        # The context manager closes the response even if read/decode fails.
        with urllib.request.urlopen(url) as response:
            return response.read().decode('utf-8')
    
    def geturl(starturl):
        """Return the bug-report paths linked from the page at ``starturl``.

        Args:
            starturl: URL of a search-result page.

        Returns:
            A list with every substring of the page that looks like
            ``/bugs/wooyun-<word>-<word>`` (duplicates included).
        """
        page = get_html_response(starturl)
        # \w needs its backslash: a bare "w*" only matches literal 'w' characters.
        return re.findall(r'/bugs/wooyun-\w*-\w*', page)
    
    def get_nextpage(starturl):
        """Append the relative URLs of the result pages to ``nextpage``.

        The page count is read from the last "<digits><whitespace>页" marker
        found on the page at ``starturl``.

        Args:
            starturl: URL of the first search-result page.

        Returns:
            The module-level ``nextpage`` list, extended in place.
        """
        page = get_html_response(starturl)
        # \d+ rather than \d*: a marker without digits would otherwise reach
        # int('') and raise ValueError.
        counts = re.findall(r'(\d+)\s页', page)
        num_p = int(counts[-1]) if counts else 0
        # NOTE(review): range(1, num_p) stops at num_p - 1, exactly as the
        # original code did -- confirm whether the last page should be included.
        for page_no in range(1, num_p):
            nextpage.append('searchbug.php?q=' + encodestr.decode() + '&pNO=' + str(page_no))
        return nextpage
    
    def download_img(url):
        """Download the image at ``url`` into the ``wooyun`` folder on drive D:.

        The local file name is ``url`` with the
        ``http://wooyun.org/upload/<digits>/`` prefix removed.

        Args:
            url: Absolute URL of the image.
        """
        img_name = re.sub(r'http://wooyun.org/upload/\d*/', '', url)
        # Backslashes are escaped so the target is a real path under D:\wooyun.
        urllib.request.urlretrieve(url, 'D:\\wooyun\\%s' % img_name)
    
    def download_html(i,title):
        """Save the page source ``i`` as ``<title>.html`` in ``wooyun_html`` on drive D:.

        Args:
            i: HTML text to write (stored as UTF-8).
            title: File name without extension.

        Returns:
            The path of the saved file relative to the drive root, which is
            the value stored in the database.
        """
        file_name = title + '.html'
        # Backslashes must be escaped; an unescaped backslash directly before
        # the closing quote would escape the quote itself.
        html_path = 'D:\\wooyun_html\\' + file_name
        with open(html_path, 'w+', encoding='utf-8') as out:
            out.write(i)
        return 'wooyun_html\\' + file_name
    
    # Visit every result page and gather the report URLs it links to.
    for page_query in get_nextpage(starturl):
        result.extend(geturl('http://wooyun.org/' + page_query))
    # Converting to a set drops URLs that were collected more than once.
    result = set(result)
    
    # Scratch variables for the fields of one report.
    # Single-valued fields start out as empty strings.
    serial_number_p = title_p = refered_industry_p = author_p = ''
    yield_time_p = loophole_type_p = loophole_mood_p = hazard_rating_p = ''
    repair_p = ''
    # Multi-valued fields and the row buffer each get their own list object.
    reveal_mood_p = []
    detail_p = []
    final = []
    
    def _last_cleaned(matches, *patterns):
        """Return the last item of ``matches`` with every regex in ``patterns`` removed.

        Returns '' when ``matches`` is empty, so a field that is missing from
        one page never inherits the value found on the previous page.
        """
        if not matches:
            return ''
        text = matches[-1]
        for pattern in patterns:
            text = re.sub(pattern, '', text)
        return text

    # Scrape every collected report: download the page, extract the fields,
    # save the HTML to disk and insert one row into the database.
    #
    # NOTE(review): the field labels in the patterns below use an ASCII ':';
    # confirm this matches the colon character used in the page markup.
    for i in result:
        # Download the page into k.  The keyword is swapped for its base64
        # form as text; passing the bytes object as the replacement would
        # raise TypeError as soon as the keyword actually occurred in the URL.
        k = get_html_response('http://wooyun.org/' + i.replace(search_item, encodestr.decode()))

        # --- basic fields (the last match on the page wins) ---
        serial_number_p = _last_cleaned(
            re.findall(r'">WooYun-\w{4}-\w*', k), r'">')
        title_p = _last_cleaned(
            re.findall(r'漏洞标题:.*.</h3>', k), r'漏洞标题:\t\t', r'\s</h3>')
        # "." never matches a newline, so the text after the closing tag is at
        # most trailing whitespace; \s* strips it whether or not it is there.
        refered_industry_p = _last_cleaned(
            re.findall(r'相关厂商:.*.', k),
            r'相关厂商:\t\t<a href="http://www.wooyun.org/corps/', r'">\s*')
        author_p = _last_cleaned(
            re.findall(r'<a href="http://www.wooyun.org/whitehats/\S*">', k),
            r'<a href="http://www.wooyun.org/whitehats/', r'">')
        yield_time_p = _last_cleaned(
            re.findall(r'提交时间:.*.', k), r'提交时间:\t\t', r'</h3>\s*')
        loophole_type_p = _last_cleaned(
            re.findall(r'漏洞类型:.*.', k), r'漏洞类型:\t\t', r'</h3>\s*')
        hazard_rating_p = _last_cleaned(
            re.findall(r'危害等级:.*.</h3>', k), r'危害等级:\t\t', r'</h3>')
        loophole_mood_p = _last_cleaned(
            re.findall(r'漏洞状态:\s*\S*\s*</h3>', k), r'漏洞状态:\s*', r'\s*</h3>')

        # --- detailed fields (every match is kept) ---
        reveal_mood_p = [
            re.sub('<br/>', '', j)
            for j in re.findall(r'\d*-\d*-\d*:\s*\S*<br/>', k)
        ]

        detail_p = []
        for j in re.findall(r'<p class="detail">.*.</p>', k):
            j = re.sub(r':\s', ':', j)
            j = re.sub(r'<p class="detail">', '', j)
            j = re.sub(r'</p>', '', j)
            # Collapse the tail of an image link into a comma ...
            j = re.sub(r'"\starget="_blank"><img\ssrc="/upload/.*.width="600"/></a>', ',', j)
            # ... and turn the start of the link into an absolute URL.
            j = re.sub(r'<a href="', ' http://wooyun.org', j)
            j = re.sub(r'对本漏洞信息进行评价,.*.备学习价值', '', j)
            detail_p.append(j)

        # Repair advice: strip the markup from the last match.
        # NOTE(review): the original also ran re.sub(r'', ':', ...) here.  An
        # empty pattern matches at every position and put ':' between all
        # characters, so that step is dropped; restore the intended pattern
        # if it is known.
        repair_p = _last_cleaned(
            re.findall(r'修复方案:</h3>\s*<p class="detail">.*.\s*</p>', k),
            r'</br>', r'</p>', r'修复方案:</h3>', r'<p\sclass="detail">')

        reveal_mood_str = "".join(reveal_mood_p)
        detail_str = "".join(detail_p)
        # split() followed by join removes all whitespace from the advice text.
        repair_str = "".join(repair_p.split())

        # Image download is currently disabled:
        # for j in re.findall(r'http://wooyun.org/upload/\d*/\w*.\w{3}', detail_str):
        #     download_img(j)
        path = download_html(k, serial_number_p)

        # Values in the same order as the column list of insert_sql.
        row = (
            serial_number_p,
            title_p,
            loophole_type_p,
            refered_industry_p,
            author_p,
            yield_time_p,
            loophole_mood_p,
            hazard_rating_p,
            reveal_mood_str,
            detail_str,
            repair_str,
            path,
        )
        insert_sql_api(insert_sql, row)
        
    
    # Commit the rows inserted above, then release the cursor and the connection.
    cnx.commit()
    cursor.close()
    cnx.close()
    因为弱小,所以要变强,因为不想灭亡,所以选择战斗
  • 相关阅读:
    安装node.js webkit环境[一]
    wpf 窗口最小化后,触发某事件弹出最小化窗口并置顶
    c# 旋转图片 无GDI+一般性错误
    类库里面添加日志记录 log4net
    string转xml
    DES c#加密后java解密
    使用排序字典排序
    怎么让一段xml被识别为字符串
    新装iis 页面503错误 DefaultAppPool停止解决方案
    hession
  • 原文地址:https://www.cnblogs.com/cmjason/p/3929487.html
Copyright © 2011-2022 走看看