  • Scraping nvd.nist.gov CVE data with Excel file I/O

    #!/usr/bin/env python
    # encoding: utf-8
    #@author: jack
    import random
    from time import sleep
    import pandas as pd
    from openpyxl import load_workbook
    from urllib import request
    from lxml import etree
    
    wb = load_workbook('cve.xlsx')  # workbook to read; openpyxl is used because xlrd reportedly handles newer Excel formats poorly
    sheet = wb['Sheet1']  # sheet name as shown on the Excel tab
    cve_list = []
    for cell in sheet["D"][1:25]:  # column D, skipping the header row, through row 25
        cve_list.append(cell.value)  # collect each CVE id

    start_url = 'https://nvd.nist.gov/vuln/detail/'  # NVD detail pages are static HTML, so scraping them is painless
    score_li = []
    vector3_li = []
    vector2_li = []
    for url_code in cve_list:  # request each CVE detail page once
        url = '{}{}'.format(start_url, url_code)  # build the per-CVE URL
        response = request.urlopen(url)
        result = response.read().decode()
        html = etree.HTML(result)
        # locating the fields with etree XPath is straightforward
        v3BaseScore = html.xpath('//span[@data-testid="vuln-cvssv3-base-score"]/text()')
        Vector3 = html.xpath('//span[@data-testid="vuln-cvssv3-vector"]/text()')
        Vector2 = html.xpath('//span[@data-testid="vuln-cvssv2-vector"]/text()')
        score_li.append(' '.join(v3BaseScore))  # flatten each XPath result into a string
        vector3_li.append(' '.join(Vector3))
        vector2_li.append(' '.join(Vector2))
        sleep(random.uniform(0.5, 1.5))  # pause between requests so the site is not hammered

    # build one DataFrame per output column
    df1 = pd.DataFrame({'v3BaseScore': score_li})
    df2 = pd.DataFrame({'Vector3': vector3_li})
    df3 = pd.DataFrame({'Vector2': vector2_li})

    # write the results to a new workbook, one column each;
    # the with-block saves and closes the file on exit
    with pd.ExcelWriter('test1.xlsx') as writer:
        df1.to_excel(writer, sheet_name='Sheet1', startcol=1, index=False)
        df2.to_excel(writer, sheet_name='Sheet1', startcol=2, index=False)
        df3.to_excel(writer, sheet_name='Sheet1', startcol=3, index=False)

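    A version note on the writer: ExcelWriter.save() was deprecated and then removed in pandas 2.0, which is why the with-block pattern above is the safer choice. If the goal were instead to append the scraped columns into the existing cve.xlsx rather than creating a new file, pandas can drive openpyxl in append mode (if_sheet_exists='overlay' needs pandas 1.4+); a minimal sketch of that variant, where the startcol value is purely illustrative:

    import pandas as pd

    df = pd.DataFrame({'v3BaseScore': ['9.8', '7.5']})  # stand-in for the scraped column

    # mode='a' opens the existing workbook through the openpyxl engine;
    # if_sheet_exists='overlay' writes into the existing sheet instead of raising
    with pd.ExcelWriter('cve.xlsx', engine='openpyxl', mode='a',
                        if_sheet_exists='overlay') as writer:
        df.to_excel(writer, sheet_name='Sheet1', startcol=4, index=False)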
    Picked up pandas and openpyxl on the fly for this business requirement. The work breaks down into:

    1. Page analysis

    2. Element-locating analysis (the data-testid attributes make the XPath trivial)

    3. Data read/write analysis

    4. The site is easy to knock over, so keep concurrency low and rotate User-Agent headers (and proxies); see the throttling sketch after this list.
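
    As referenced in point 4, here is a minimal throttle-and-retry wrapper around urllib; the delay bounds and retry count are arbitrary choices for illustration, not values from the original:

    import random
    import time
    from urllib import request
    from urllib.error import HTTPError, URLError

    def polite_get(url, headers, retries=3):
        """Fetch a URL with a random pause and a simple retry on failure."""
        for attempt in range(retries):
            time.sleep(random.uniform(0.5, 2.0))  # spread requests out
            try:
                req = request.Request(url, headers=headers)
                return request.urlopen(req).read().decode()
            except (HTTPError, URLError):
                if attempt == retries - 1:
                    raise
                time.sleep(2 ** attempt)  # back off before the next try

    The optimized version below builds on the same ideas: it rotates the User-Agent, also extracts the CVSS vector links, and includes a Scrapy-style proxy snippet.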

    import base64
    import random
    from multiprocessing.pool import ThreadPool
    import time
    import pandas as pd
    from openpyxl import load_workbook
    from urllib import request
    from lxml import etree
    
    def task1():
        user_agent_list = [
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
            "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
        ]
    
    
        # rotate the User-Agent on every run and pin the Host header
        header = {
            'User-Agent': random.choice(user_agent_list),
            'Host': 'nvd.nist.gov',
        }
        # read the CVE ids from the source workbook
        wb = load_workbook('cve.xlsx')
        sheet = wb['Sheet1']
        cve_list = []
        for cell in sheet["D"][1:]:  # column D, skipping the header row
            cve_list.append(cell.value)

        start_url = 'https://nvd.nist.gov/vuln/detail/'
        score_li = []
        vector3_li = []
        vector3_href_li = []
        vector2_li = []
        vector2_href_li = []

        # request each CVE detail page exactly once
        for url_code in cve_list:
            url = '{}{}'.format(start_url, url_code)
            req = request.Request(url, headers=header)
            response = request.urlopen(req)
            result = response.read().decode()
            time.sleep(0.3)  # throttle between requests
    
            # data cleaning: pull the score, vectors, and vector links out of the page
            html = etree.HTML(result)
            score_list = html.xpath('//span[@data-testid="vuln-cvssv3-base-score"]/text()')
            vector3_list = html.xpath('//span[@data-testid="vuln-cvssv3-vector"]/text()')
            vector2_list = html.xpath('//span[@data-testid="vuln-cvssv2-vector"]/text()')
            vector3_href_list = html.xpath('//span[@data-testid="vuln-cvssv3-vector"]//a/@href')
            vector2_href_list = html.xpath('//span[@data-testid="vuln-cvssv2-vector"]//a/@href')
            score_li.append(' '.join(score_list))
            vector3_li.append(' '.join(vector3_list))
            vector3_href_li.append(' '.join(vector3_href_list))
            vector2_li.append(' '.join(vector2_list))
            vector2_href_li.append(' '.join(vector2_href_list))
    
        # create one pandas DataFrame per output column
        df1 = pd.DataFrame({'CVSS v3.0 Base Score': score_li})
        df2 = pd.DataFrame({'CVSS v3.0 Vector': vector3_li})
        df3 = pd.DataFrame({'CVSS v3.0 Vector link': vector3_href_li})
        df4 = pd.DataFrame({'CVSS v2.0 Vector': vector2_li})
        df5 = pd.DataFrame({'CVSS v2.0 Vector link': vector2_href_li})

        # write everything once, after the crawl, via a pandas ExcelWriter
        with pd.ExcelWriter('basescore.xlsx') as writer:
            df1.to_excel(writer, sheet_name='Sheet2', startcol=2, index=False)
            df2.to_excel(writer, sheet_name='Sheet2', startcol=3, index=False)
            df3.to_excel(writer, sheet_name='Sheet2', startcol=4, index=False)
            df4.to_excel(writer, sheet_name='Sheet2', startcol=5, index=False)
            df5.to_excel(writer, sheet_name='Sheet2', startcol=6, index=False)
    
    def process_request(self, request, spider):
        # Scrapy downloader-middleware hook: attach a random proxy to each request.
        # This follows Scrapy's middleware signature and is not run by the thread pool below.
        PROXIES = [
            {'ip_port': '61.160.233.8', 'user_pass': ''},
            {'ip_port': '125.93.149.186', 'user_pass': ''},
            {'ip_port': '58.38.86.181', 'user_pass': ''},
            {'ip_port': '119.142.86.110', 'user_pass': ''},
            {'ip_port': '124.161.16.89', 'user_pass': ''},
            {'ip_port': '61.160.233.8', 'user_pass': ''},
            {'ip_port': '101.94.131.237', 'user_pass': ''},
            {'ip_port': '219.157.162.97', 'user_pass': ''},
            {'ip_port': '61.152.89.18', 'user_pass': ''},
            {'ip_port': '139.224.132.192', 'user_pass': ''}
        ]
        proxy = random.choice(PROXIES)
        request.meta['proxy'] = "http://%s" % proxy['ip_port']
        if proxy['user_pass']:  # an empty string means no authentication is needed
            # b64encode returns bytes, not str, so decode before building the header
            token = base64.b64encode(proxy['user_pass'].encode('utf-8')).decode('utf-8')
            request.headers['Proxy-Authorization'] = 'Basic ' + token
    
    if __name__ == '__main__':

        pool = ThreadPool()
        pool.apply_async(task1)  # process_request is a Scrapy hook, so only the crawl runs here
        pool.close()
        pool.join()
    Optimized version.
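    Since the process_request hook above follows Scrapy's downloader-middleware signature, it cannot run standalone. To apply the same rotating-proxy idea directly with urllib, a ProxyHandler-based opener is one option; a hedged sketch, noting that the PROXIES entries above are address placeholders and would need real host:port values:

    import base64
    import random
    from urllib import request

    def open_with_proxy(url, proxies):
        """Open a URL through a randomly chosen proxy, adding basic auth when given."""
        proxy = random.choice(proxies)
        addr = 'http://%s' % proxy['ip_port']
        opener = request.build_opener(request.ProxyHandler({'http': addr, 'https': addr}))
        req = request.Request(url)
        if proxy['user_pass']:
            token = base64.b64encode(proxy['user_pass'].encode('utf-8')).decode('utf-8')
            req.add_header('Proxy-Authorization', 'Basic ' + token)
        return opener.open(req)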
  • Original post: https://www.cnblogs.com/jackzz/p/11160978.html