zoukankan      html  css  js  c++  java
  • python3-爬取cnnvd漏洞信息

      因为工作需要CNNVD漏洞信息,以前一直使用集客搜、八爪鱼之类的采集工具,但对其效果和速度都不满意。最近开始学习爬虫,作为初学者,代码还需要慢慢完善,这里先记录下第一个爬虫。后续打算在多进程和IP代理方面继续学习和改进。

      以下是运行情况:速度仍然难以忍受。数据获取阶段改用多进程应该能快很多;使用IP代理则可以规避短时间内多次访问被限制的问题,从而进一步提高速度。

    输出 excel 如图:

    以下是整个代码:

    #!/usr/bin/env python3
    # -*- coding: utf-8 -*-
    # by 默不知然
    
    import urllib.request
    from urllib import parse
    from bs4 import BeautifulSoup
    import http.cookiejar
    import xlwt
    import zlib
    import re
    import time
    import xlsxwriter
    import sys
    import datetime
    import pymysql
    
    '''
    运行方法:
    python vulnerabilities_crawler 2017-10-01 2017-10-31 178
    第一个为开始时间,第二个为结束时间,第三个为总页数。
    
    '''
    
    
    #获得漏洞详情链接列表
    def vulnerabilities_url_list(url,start_time,end_time):
        """Fetch one page of the CNNVD query results and collect its detail links.

        POSTs the date range to ``url``, extracts every vulnerability detail
        link from the returned HTML and appends the absolute URLs to the
        module-level list ``vulnerabilities_lists``, which must already exist
        (``main`` creates it before calling this function).

        Args:
            url: Address of the paginated query page to request.
            start_time: Value sent as the ``qstartdate`` form field.
            end_time: Value sent as the ``qenddate`` form field.

        Raises:
            urllib.error.URLError: If the request fails.
            zlib.error: If a compressed body cannot be decompressed.
        """
        header = {
            'User-Agent': 'Mozilla/5.0 (Linux; Android 4.1.2; Nexus 7 Build/JZ054K) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19',
            'Accept-Encoding': 'gzip, deflate',
            'Referer': 'http://cnnvd.org.cn/web/vulnerability/queryLds.tag'
        }
        form = parse.urlencode({
            'qstartdate': start_time,
            'qenddate': end_time,
        }).encode('utf-8')

        request = urllib.request.Request(url, headers=header, data=form)
        cookie_jar = http.cookiejar.CookieJar()
        opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie_jar))
        # Close the HTTP response deterministically instead of leaking it.
        with opener.open(request) as response:
            body = response.read()
            content_encoding = (response.headers.get('Content-Encoding') or '').lower()

        # We advertise "gzip, deflate", so the server may answer with either
        # encoding or with plain bytes; only decompress what is compressed.
        if body[:2] == b'\x1f\x8b':  # gzip magic number
            body = zlib.decompress(body, 16 + zlib.MAX_WBITS)
        elif content_encoding == 'deflate':
            try:
                body = zlib.decompress(body)
            except zlib.error:
                # Some servers send a raw deflate stream without the zlib header.
                body = zlib.decompress(body, -zlib.MAX_WBITS)
        page_html = body.decode()

        # Detail links are the anchors carrying the "a_title2" class.
        link_pattern = r'href="(.+?)" target="_blank" class="a_title2"'
        relative_links = re.findall(link_pattern, page_html)

        # The page uses site-relative links; prefix them with the host.
        for count, link in enumerate(relative_links, start=1):
            vulnerabilities_lists.append('http://cnnvd.org.cn' + link)
            print("已完成爬行第%d个漏洞链接"%count)
            # Kept from the original: this pause only paces the overall crawl,
            # no request is issued inside this loop.
            time.sleep(0.2)
    
    #漏洞信息爬取函数
    def _download_html(url, headers):
        """Download ``url`` and return the response body decoded as UTF-8 text."""
        request = urllib.request.Request(url, headers=headers)
        cookie_jar = http.cookiejar.CookieJar()
        opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie_jar))
        with opener.open(request) as response:
            body = response.read()
            content_encoding = (response.headers.get('Content-Encoding') or '').lower()

        # The request advertises several encodings, so only decompress what
        # actually is compressed instead of assuming gzip.
        if body[:2] == b'\x1f\x8b':  # gzip magic number
            body = zlib.decompress(body, 16 + zlib.MAX_WBITS)
        elif content_encoding == 'deflate':
            try:
                body = zlib.decompress(body)
            except zlib.error:
                # Raw deflate stream without the zlib header.
                body = zlib.decompress(body, -zlib.MAX_WBITS)
        return body.decode()


    def _strip_chars(text, chars):
        """Return ``text`` with every character listed in ``chars`` deleted.

        Raises AttributeError when ``text`` is None, which is what
        BeautifulSoup's ``.string`` yields for a tag without a single text
        child; callers treat that as "field not available".
        """
        return text.translate(str.maketrans('', '', chars))


    def _anchor_text(items, index, chars):
        """Return the cleaned text of the first <a> inside ``items[index]``, or ''."""
        try:
            return _strip_chars(items[index].a.string, chars)
        except (IndexError, AttributeError, TypeError):
            return ''


    def _paragraph_text(block, indexes, chars, default=''):
        """Join the text of the <p> tags at ``indexes`` in ``block`` and clean it.

        Returns ``default`` when the block or one of the paragraphs is missing
        or has no plain text.
        """
        try:
            paragraphs = block.find_all(name='p')
            joined = ''.join(paragraphs[i].string for i in indexes)
            return _strip_chars(joined, chars)
        except (IndexError, AttributeError, TypeError):
            return default


    def vulnerabilities_data(url):
        """Download one CNNVD detail page and extract its fields.

        The 15 extracted values are stored, in order, as: name, CNNVD number,
        severity, CVE number, type, release time, attack type, update time,
        vendor, source, description, notice, reference, affected entity, patch.
        This is the column order consumed by ``vulnerabilities_excel`` and
        ``vulnerabilities_mysql``. A field that cannot be located is stored as
        an empty string.

        The list is published through the module-level name
        ``vulnerabilities_result_list`` (which existing callers read) and is
        also returned.

        Args:
            url: Absolute URL of the vulnerability detail page.

        Returns:
            The list of 15 strings described above.

        Raises:
            urllib.error.URLError: If the page cannot be downloaded.
            ValueError: If the page lacks the main detail block, e.g. because
                an error page was served; callers may retry.
        """
        global vulnerabilities_result_list

        header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0',
            'Accept-Encoding': 'gzip, deflate, sdch',
        }
        page_soup = BeautifulSoup(_download_html(url, header), 'html.parser')

        # ---- detail block: name, identifiers and the <li> attribute list ----
        detail = page_soup.find('div', attrs={'class': 'detail_xq w770'})
        if detail is None:
            raise ValueError('vulnerability detail block not found: %s' % url)
        items = detail.find_all('li')

        try:
            name = detail.h2.string or ''
        except AttributeError:
            name = ''

        try:
            # Keep everything after the first colon of the "label:value" text.
            # NOTE(review): the published source lost its escape sequences and
            # may have had its punctuation normalised, so both the ASCII and
            # the full-width colon are accepted here.
            cnnvd_num = re.findall(r"[:\uff1a]([\s\S]*)", detail.span.string)[0]
        except (AttributeError, TypeError, IndexError):
            cnnvd_num = ''

        try:
            # Severity is the first run of CJK characters in the first <a>.
            rank = re.search(r'[\u4e00-\u9fa5]+', detail.a.decode()).group(0)
        except AttributeError:
            rank = ''

        line_breaks = '\r\n\t'
        all_blanks = '\r\n\t '

        record = [name, cnnvd_num, rank]
        record.append(_anchor_text(items, 2, all_blanks))       # CVE number
        record.append(_anchor_text(items, 3, all_blanks))       # type
        # release time, attack type, update time, vendor, source
        for index in (4, 5, 6, 7, 8):
            record.append(_anchor_text(items, index, line_breaks))

        # ---- free-text sections below the detail block ----
        # 'd_ldjj' is matched as a single CSS class, so this returns the first
        # such div; the original code relies on that being the description.
        description_block = page_soup.find('div', attrs={'class': 'd_ldjj'})
        sections = page_soup.find_all('div', attrs={'class': 'd_ldjj m_t_20'})

        def section(position):
            """Return the section at ``position`` or None when it is absent."""
            return sections[position] if position < len(sections) else None

        # description: first two paragraphs joined
        record.append(_paragraph_text(description_block, (0, 1), all_blanks))

        # notice: first two paragraphs joined (inner spaces are kept)
        record.append(_paragraph_text(section(0), (0, 1), line_breaks))

        # reference: second paragraph, without its "link" label
        reference = _paragraph_text(section(1), (1,), line_breaks)
        reference = reference.replace('链接:', '').replace('链接\uff1a', '')
        record.append(reference)

        # affected entity: first paragraph, falling back to the first link
        effect_block = section(2)
        effect = _paragraph_text(effect_block, (0,), all_blanks, default=None)
        if effect is None:
            try:
                effect = _strip_chars(effect_block.find_all(name='a')[0].string, all_blanks)
            except (IndexError, AttributeError, TypeError):
                effect = ''
        record.append(effect)

        # patch: first paragraph
        record.append(_paragraph_text(section(3), (0,), all_blanks))

        vulnerabilities_result_list = record
        return record
    
    #漏洞信息写入excel
    def vulnerabilities_excel(excel):
        """Write the crawled records to ``vulnerabilities_data.xlsx``.

        Row 0 holds the column titles; every following row holds the first
        15 values of one record, in the order they appear in the record.

        Args:
            excel: Sequence of records, each indexable from 0 to 14.
        """
        titles = (
            '漏洞名称',
            'CNNVD编号',
            '危害等级',
            'CVE编号',
            '漏洞类型',
            '发布时间',
            '攻击途径',
            '更新时间',
            '厂商',
            '漏洞来源',
            '漏洞描述',
            '解决方案',
            '参考链接',
            '受影响实体',
            '补丁',
        )

        book = xlsxwriter.Workbook('vulnerabilities_data.xlsx')
        sheet = book.add_worksheet()

        # Header row.
        for column, title in enumerate(titles):
            sheet.write(0, column, title)

        # One spreadsheet row per record, starting right below the header.
        for line_no, record in enumerate(excel, start=1):
            for column in range(len(titles)):
                sheet.write(line_no, column, record[column])

        book.close()
    
    #漏洞信息写入数据库
    def vulnerabilities_mysql(excel):
        """Insert the crawled records into the ``cnnvd`` table of the ``spider`` database.

        Each record supplies the 15 column values in order. A record that
        cannot be inserted is reported and skipped; the remaining inserts are
        committed together at the end.

        Args:
            excel: Sequence of records, each holding at least 15 values.
        """
        sql="INSERT INTO cnnvd(vulnerabilities_name,cnnvd_num,vulnerabilities_rank,cve_num,vulnerabilities_type,release_time,attack_path,update_time,company,vulnerabilities_source,vulnerabilities_data,solution,reference,object,path) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s);"

        # Keyword arguments: PyMySQL 1.0+ is believed to reject positional
        # connection parameters, and older releases accept these keywords too.
        db = pymysql.connect(
            host='127.0.0.1',
            user='root',
            password='xxxx',
            database='spider',
            charset='utf8',
        )
        try:
            cursor = db.cursor()
            for record in excel:
                try:
                    # Values are passed separately so the driver escapes them.
                    cursor.execute(sql, tuple(record[:15]))
                except Exception as err:
                    # Best effort: report the failing row and carry on.
                    print('写入数据库失败', err)
            db.commit()
            print('写入数据库完毕!!!')
        finally:
            # Release the connection even when commit or a cursor call fails.
            db.close()
    
    #爬取代理ip
    
    
    
    def main():
        """Crawl CNNVD for a date range and export the result to Excel.

        Command-line arguments: start date, end date and the number of result
        pages to walk. First every result page is fetched to collect the
        detail links, then every detail page is fetched and parsed, and
        finally the records are written with ``vulnerabilities_excel``.

        A failing request is retried after 5 seconds. After ``max_retries``
        consecutive failures the page or link is skipped, so a permanently
        broken page cannot stall the whole run.

        Raises:
            SystemExit: If fewer than three arguments are given.
            ValueError: If the page count is not an integer.
        """
        global vulnerabilities_lists

        if len(sys.argv) < 4:
            sys.exit('用法: python vulnerabilities_crawler <开始日期> <结束日期> <总页数>')

        max_retries = 5
        begin = datetime.datetime.now()
        vulnerabilities_lists = []
        start_time = sys.argv[1]
        end_time = sys.argv[2]
        page_count = int(sys.argv[3])

        # Phase 1: walk the result pages and collect the detail links.
        page = 1
        failures = 0
        while page <= page_count:
            page_url = 'http://cnnvd.org.cn/web/vulnerability/queryLds.tag?pageno=%d&repairLd='%page
            try:
                vulnerabilities_url_list(page_url, start_time, end_time)
            except Exception:
                # "Exception" rather than a bare except, so Ctrl-C still works.
                failures += 1
                if failures >= max_retries:
                    print('第%d页连续失败%d次,跳过。'%(page, failures))
                    page += 1
                    failures = 0
                    continue
                print('爬取失败,等待5秒后重新爬取。')
                time.sleep(5)
                continue
            print("已完成爬行第%d页"%page)
            print('\n')
            time.sleep(2)
            page += 1
            failures = 0

        # Phase 2: fetch and parse every detail page.
        vulnerabilities_result_lists = []
        index = 0
        failures = 0
        while index < len(vulnerabilities_lists):
            try:
                vulnerabilities_data(vulnerabilities_lists[index])
            except Exception:
                failures += 1
                if failures >= max_retries:
                    print('第%d个漏洞连续失败%d次,跳过。'%(index + 1, failures))
                    index += 1
                    failures = 0
                    continue
                print('爬取失败,等待5秒后重新爬取。')
                time.sleep(5)
                continue
            # vulnerabilities_data publishes its result through this global.
            vulnerabilities_result_lists.append(vulnerabilities_result_list)
            index += 1
            failures = 0
            print("完成爬行第%d个漏洞信息"%index)
            time.sleep(1)

        # Export to Excel.
        vulnerabilities_excel(vulnerabilities_result_lists)

        # Optional: also store the records in MySQL.
        #vulnerabilities_mysql(vulnerabilities_result_lists)

        # Summary.
        end = datetime.datetime.now()
        total_time = end - begin
        print ('漏洞信息爬取结束')
        print ('应该爬行漏洞数量: ',len(vulnerabilities_lists))
        print ('爬行时间: ',total_time)
    
    # Run the crawler only when this file is executed as a script.
    if __name__ == '__main__':
        main() 
  • 相关阅读:
    fastadmin中curd生成的表单将数字展示为文字
    fastadmin弹窗效果表单
    fastadmin 框架中图片点击放大
    linux vi 编辑文件常用快捷键
    mysql 数据库执行创建索引语句异常 Specified key was too long; max key length is 767 bytes
    eclipse 版本库信息存储错误,导致每次更新提交信息总弹出要输入账号密码问题 解决
    外国域名无法访问 metricbeat.docker.yml 无法下载问题解决
    ELK elasticsearch docker 多台服务器集群
    zipkin 服务跟踪
    多线程 采用spring线程池ThreadPoolTaskExecutor提高程序处理能力 笔记
  • 原文地址:https://www.cnblogs.com/kaiho/p/7804542.html
Copyright © 2011-2022 走看看