zoukankan      html  css  js  c++  java
  • python3 fofa爬取类

    代码实现:

    # coding=utf-8
    
    import base64
    import configparser
    import os
    from urllib.parse import urlencode

    import requests
    from lxml import etree
    
    class FofaSpider:
        """Scrape FOFA web search result pages and save URL/title pairs.

        The spider requests ``https://fofa.so/result`` once per page using the
        session cookie read from ``fofa.config``, extracts the result links and
        their titles with XPath, and appends them to ``fofa_res.txt``. Both
        files live in the directory that contains this script.
        """

        # Seconds to wait for FOFA before giving up on a single request;
        # without a timeout a stalled connection would block forever.
        REQUEST_TIMEOUT = 30

        def __init__(self, search_keyword, page=5):
            """Store the query and load credentials from the config file.

            Args:
                search_keyword: FOFA query, either ``str`` or UTF-8 ``bytes``.
                page: Number of result pages to fetch, starting from page 1.

            Raises:
                configparser.Error: If ``fofa.config`` is missing or lacks the
                    ``[config]`` section or one of its expected options.
            """
            self.search_keyword = search_keyword
            self.page = page
            self.getConfig()

        @staticmethod
        def _base_dir():
            """Return the absolute directory that contains this script."""
            return os.path.dirname(os.path.abspath(__file__))

        @staticmethod
        def _clean(text):
            """Strip surrounding whitespace and drop embedded CR/LF characters."""
            return text.strip().replace('\n', '').replace('\r', '')

        def _build_url(self, page_no):
            """Return the result-page URL for ``page_no`` with encoded parameters."""
            raw = self.search_keyword
            if isinstance(raw, str):
                raw = raw.encode('utf-8')
            # Base64 output may contain '+', '/' and '=', and the query itself
            # may contain '&', '"' or spaces, so every value is percent-encoded.
            query = urlencode([
                ('q', raw.decode('utf-8')),
                ('qbase64', base64.b64encode(raw).decode('ascii')),
                ('page', page_no),
            ])
            return "https://fofa.so/result?" + query

        def goSpider(self):
            """Fetch every requested page and append its results to the output file.

            Each result is written as ``<url>    <title>`` on its own line and
            echoed to stdout.

            Raises:
                SystemExit: If FOFA answers a page request with HTTP 304.
                requests.RequestException: On network failure or timeout.
            """
            headers = {
                "Connection": "keep-alive",
                "Cookie": "_fofapro_ars_session=" + self.cookie,
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                              'Chrome/65.0.3325.181 Safari/537.36 '
            }
            out_path = os.path.join(self._base_dir(), 'fofa_res.txt')

            for page_no in range(1, self.page + 1):
                resp = requests.get(
                    url=self._build_url(page_no),
                    headers=headers,
                    timeout=self.REQUEST_TIMEOUT,
                )

                if resp.status_code == 304:
                    print("程序停止, 可能的情况是你当前会员与要爬去的页数不相匹配!")
                    # Same effect as the site-provided exit(), but does not
                    # depend on the optional ``site`` module being loaded.
                    raise SystemExit

                print("开始解析...")
                lxml_tree = etree.HTML(resp.content.decode('utf-8'))
                # Results rendered as regular links.
                url_list = lxml_tree.xpath('//div[@class="list_mod_t"]//a[@target="_blank"]/@href')
                # Results without an <a> tag are rendered in an "ip-no-url" div.
                url_list2 = lxml_tree.xpath('//div[@class="list_mod_t"]//div[@class="ip-no-url"]//text()')
                url_list.extend(url_list2)
                url_title = lxml_tree.xpath('//ul[@class="list_sx1"]//li[1]//text()')

                # NOTE(review): the "ip-no-url" entries are appended after all
                # linked entries, so this positional pairing may not match each
                # URL with its own title when a page mixes both kinds - confirm
                # against the live page markup.
                with open(out_path, 'a', encoding='utf-8') as f:
                    for url_res, title_res in zip(url_list, url_title):
                        url_text = self._clean(url_res)
                        title_text = self._clean(title_res)
                        f.write(url_text + "    " + title_text + '\n')
                        print(url_text + "   " + title_text)
                print("解析结束...")

        def getConfig(self):
            """Load ``fofa_key``, ``fofa_email`` and ``cookie`` from ``fofa.config``.

            The values are read from the ``[config]`` section and stored on the
            instance. Only ``cookie`` is used by the code in this class.

            Raises:
                configparser.Error: If the section or an option is missing
                    (including when the file itself does not exist).
            """
            conf = configparser.ConfigParser()
            conf.read(os.path.join(self._base_dir(), 'fofa.config'))
            self.fofa_key = conf.get('config', 'fofa_key')
            self.fofa_email = conf.get('config', 'fofa_email')
            self.cookie = conf.get('config', 'cookie')
    
    
    if __name__ == '__main__':
        # Both answers are handed over as UTF-8 bytes, which is the form the
        # spider's URL builder expects for the keyword.
        keyword_bytes = input("输入你要的Fofa搜索语法: ").encode("utf-8")
        raw_pages = input("输入你要爬取的页数(默认为5): ").encode("utf-8")

        # An empty answer keeps the spider's own default page count.
        spider_args = (keyword_bytes, int(raw_pages)) if raw_pages else (keyword_bytes,)
        fofa = FofaSpider(*spider_args)
        fofa.goSpider()
    

    效果:

  • 相关阅读:
    NOIP1996 第三题
    vijos P1071
    USACO 2.3
    NOIP2006 第二题(change)
    NOIP2006 第二题
    NOIP2005 第三题
    Building Block 动态规划
    砝码问题 Weight
    装箱问题(Packing DP)
    算法第二章上机实践报告
  • 原文地址:https://www.cnblogs.com/zpchcbd/p/12606657.html
Copyright © 2011-2022 走看看