zoukankan      html  css  js  c++  java
  • hmm CDN检测

    # -*- coding:utf-8 -*-
     
    import sys
    import re
    from hmmlearn import hmm
    import numpy as np
    from sklearn.externals import joblib
    import matplotlib.pyplot as plt
    import tldextract
    import os
    
    
    def iterbrowse(path):
        """Lazily yield the path of every file found beneath *path*.

        The tree is traversed with ``os.walk``; each yielded value is the
        directory being visited joined with one of its file names.
        """
        for current_dir, _subdirs, file_names in os.walk(path):
            for file_name in file_names:
                full_path = os.path.join(current_dir, file_name)
                yield full_path
    
    
    # Suffixes recognised without consulting tldextract. Built once at import
    # time rather than on every call (extract_domain runs once per log line).
    _KNOWN_SUFFIXES = frozenset({
        '.com', '.la', '.io', '.co', '.cn', '.info', '.net', '.org', '.me',
        '.mobi', '.us', '.biz', '.xxx', '.ca', '.co.jp', '.com.cn', '.net.cn',
        '.org.cn', '.mx', '.tv', '.ws', '.ag', '.com.ag', '.net.ag', '.org.ag',
        '.am', '.asia', '.at', '.be', '.com.br', '.net.br', '.name', '.live',
        '.news', '.bz', '.tech', '.pub', '.wang', '.space', '.top', '.xin',
        '.social', '.date', '.site', '.red', '.studio', '.link', '.online',
        '.help', '.kr', '.club', '.com.bz', '.net.bz', '.cc', '.band', '.market',
        '.com.co', '.net.co', '.nom.co', '.lawyer', '.de', '.es', '.com.es',
        '.nom.es', '.org.es', '.eu', '.wiki', '.design', '.software', '.fm',
        '.fr', '.gs', '.in', '.co.in', '.firm.in', '.gen.in', '.ind.in',
        '.net.in', '.org.in', '.it', '.jobs', '.jp', '.ms', '.com.mx', '.nl',
        '.nu', '.co.nz', '.net.nz', '.org.nz', '.se', '.tc', '.tk', '.tw',
        '.com.tw', '.idv.tw', '.org.tw', '.hk', '.co.uk', '.me.uk', '.org.uk',
        '.vg',
    })


    def extract_domain(domain):
        """Split a DNS name into ``(main_domain, subdomain)``.

        The name is lower-cased first. When it has at least three labels and
        ends in one of the known suffixes, the main domain is the suffix plus
        the label in front of it and the subdomain is everything before that
        (possibly the empty string). Every other name is handed to
        ``tldextract``.

        :param domain: dot-separated DNS name.
        :return: tuple ``(main_domain, subdomain)`` of strings.
        """
        domain = domain.lower()
        names = domain.split(".")
        if len(names) >= 3:
            # Try the two-label suffix (".co.uk") before the one-label one
            # (".uk"), otherwise "co.uk" would be taken for the main domain.
            if "." + ".".join(names[-2:]) in _KNOWN_SUFFIXES:
                return ".".join(names[-3:]), ".".join(names[:-3])
            if "." + names[-1] in _KNOWN_SUFFIXES:
                return ".".join(names[-2:]), ".".join(names[:-2])
        # NOTE(review): this message is also emitted for names with fewer than
        # three labels, as in the published listing - confirm that is intended.
        print("New domain suffix found. Use tld extract domain...")

        slash = domain.rfind("/")
        if slash >= 0:
            # The subdomain part may contain "/" (seen with DNS tunnel tools):
            # only the text after the last "/" is parsed, and the prefix is
            # kept as part of the subdomain.
            ext = tldextract.extract(domain[slash + 1:])
            subdomain = domain[:slash + 1] + ext.subdomain
        else:
            ext = tldextract.extract(domain)
            subdomain = ext.subdomain

        if ext.suffix:
            mdomain = ext.domain + "." + ext.suffix
        else:
            mdomain = ext.domain
        return mdomain, subdomain
     
    
    def parse(log):
        """Extract ``(main_domain, subdomain)`` from one '^'-separated log line.

        A line is accepted only when its protocol field is ``'17'`` and its
        destination-port field is ``'53'`` (presumably UDP DNS queries - the
        rejection message calls everything else "not a DNS"). The queried name
        is read from the 55th field and split by ``extract_domain``.

        Lines that are too short or are not DNS are reported on stdout and
        yield ``("", "")`` instead of raising.

        :param log: one raw log line.
        :return: tuple ``(main_domain, subdomain)``; both empty on rejection.
        """
        # Field positions are 1-based in the log format, hence the "- 1".
        DST_PORT_IDX = 6 - 1
        PROTOCOL_IDX = 7 - 1
        DNS_QUERY_NAME_IDX = 55 - 1  # domain

        data = log.split('^')
        # Guard before indexing: blank or truncated lines used to raise
        # IndexError on the protocol/port lookup.
        if len(data) <= PROTOCOL_IDX:
            print("error line:")
            print(log)
            return ("", "")

        if data[PROTOCOL_IDX] != '17' or data[DST_PORT_IDX] != '53':
            print("error line not a DNS:")
            print(log)
            return ("", "")

        if len(data) <= DNS_QUERY_NAME_IDX:
            print("error line:")
            print(log)
            return ("", "")

        mdomain, subdomain = extract_domain(data[DNS_QUERY_NAME_IDX])
        return (mdomain, subdomain)
    
    
     
    # Minimum length a subdomain must have to be used as a sample.
    MIN_LEN = 3

    # Number of HMM states (passed as n_components to GaussianHMM).
    N = 5
    # Likelihood threshold; not referenced by the code shown in this script.
    T = -50

    # File name the trained model is saved to.
    FILE_MODEL = "hmm-cdn.m"
     
    
    def get_cdn_domains(dir_path, max_count=2000):
        """Collect subdomain samples from every log file under *dir_path*.

        Despite the name, the function is not CDN specific: the script also
        calls it on the "black" and "white-like" directories.

        Each line of each file is run through ``parse``; the subdomain part is
        kept when it is at least ``MIN_LEN`` characters long. Reading stops as
        soon as *max_count* samples have been gathered.

        :param dir_path: directory that is walked recursively for log files.
        :param max_count: upper bound on the number of samples returned.
        :return: list of subdomain strings, in the order they were read.
        """
        domain_list = []
        for path in iterbrowse(dir_path):
            with open(path) as f:
                for line in f:
                    _mdomain, sub_domain = parse(line)
                    if len(sub_domain) < MIN_LEN:
                        continue
                    domain_list.append(sub_domain)
                    if len(domain_list) >= max_count:
                        return domain_list
        return domain_list
        
    
    def domain2ver(domain):
        """Encode *domain* as a list of one-element lists of character codes.

        Every character becomes ``[ord(char)]``, so the result has one row per
        character and a single column.
        """
        return [[ord(char)] for char in domain]
     
    
    def train_hmm(domain_list):
        """Fit a Gaussian HMM on the given domains and save it to FILE_MODEL.

        Each domain is encoded with ``domain2ver``; all encodings are stacked
        into one observation array and the per-domain lengths are passed
        alongside it so the model sees them as separate sequences.

        :param domain_list: iterable of domain strings to train on.
        :return: the fitted ``hmm.GaussianHMM`` instance.
        :raises ValueError: if there is no non-empty domain to train on.
        """
        observations = []
        lengths = []
        for domain in domain_list:
            ver = domain2ver(domain)
            if not ver:
                # An empty string encodes to a zero-length sequence, which
                # carries no information for training.
                continue
            observations.extend(ver)
            lengths.append(len(ver))

        if not observations:
            raise ValueError("train_hmm needs at least one non-empty domain")

        # Build the array once: the earlier np.concatenate-per-domain loop was
        # quadratic and had to be seeded with a dummy [[0]] sample that ended
        # up in the training data.
        X = np.array(observations)

        remodel = hmm.GaussianHMM(n_components=N, covariance_type="full", n_iter=100)
        remodel.fit(X, lengths)
        joblib.dump(remodel, FILE_MODEL)

        return remodel
     
    
    def test(remodel, domain_list):
        """Score every domain with *remodel* and print one line per domain.

        :param remodel: object exposing ``score(array)``; here the trained HMM.
        :param domain_list: iterable of domain strings.
        :return: tuple ``(x, y)`` of equal-length lists - the length of each
            domain and the score the model gave it.
        """
        x = []
        y = []
        for domain in domain_list:
            np_ver = np.array(domain2ver(domain))
            pro = remodel.score(np_ver)
            # %d shows only the integer part; the full value is kept in y.
            # Call form of print works on both Python 2 and Python 3.
            print("SCORE:(%d) DOMAIN:(%s) " % (pro, domain))
            x.append(len(domain))
            y.append(pro)
        return x, y
     
     
    if __name__ == '__main__':
        # Train on the CDN-labelled samples; train_hmm also writes FILE_MODEL.
        cdn_domains = get_cdn_domains("/home/bonelee/latest_metadata_sample/labeled_cdn")
        remodel = train_hmm(cdn_domains)
        # Reload from disk so the scores below come from the saved model.
        remodel = joblib.load(FILE_MODEL)

        # Score the three labelled sets. Single-argument print(...) behaves the
        # same on Python 2 and Python 3.
        x_1, y_1 = test(remodel, cdn_domains)
        print(x_1)
        print(y_1)

        black_domains = get_cdn_domains("/home/bonelee/latest_metadata_sample/labeled_black")
        x_2, y_2 = test(remodel, black_domains)
        print(x_2)
        print(y_2)

        white_domains = get_cdn_domains("/home/bonelee/latest_metadata_sample/labeled_white_like")
        x_3, y_3 = test(remodel, white_domains)
        print(x_3)
        print(y_3)

        # Scatter plot of domain length against model score, one colour per set.
        fig, ax = plt.subplots()
        ax.set_xlabel('Domain Length')
        ax.set_ylabel('HMM Score')
        ax.scatter(x_3, y_3, color='b', label="WHITE")
        ax.scatter(x_2, y_2, color='g', label="BLACK")
        ax.scatter(x_1, y_1, color='r', label="CDN")
        ax.legend(loc='right')
        plt.show()

    使用pickle保存和加载模型:

    # -*- coding:utf-8 -*-
     
    import sys
    import re
    from hmmlearn import hmm
    import numpy as np
    #from sklearn.externals import joblib
    import matplotlib.pyplot as plt
    import tldextract
    import os
    import pickle
    
    def iterbrowse(path):
        """Lazily yield the path of every file found beneath *path*.

        The tree is traversed with ``os.walk``; each yielded value is the
        directory being visited joined with one of its file names.
        """
        for current_dir, _subdirs, file_names in os.walk(path):
            for file_name in file_names:
                full_path = os.path.join(current_dir, file_name)
                yield full_path
    
    
    # Suffixes recognised without consulting tldextract. Built once at import
    # time rather than on every call (extract_domain runs once per log line).
    _KNOWN_SUFFIXES = frozenset({
        '.com', '.la', '.io', '.co', '.cn', '.info', '.net', '.org', '.me',
        '.mobi', '.us', '.biz', '.xxx', '.ca', '.co.jp', '.com.cn', '.net.cn',
        '.org.cn', '.mx', '.tv', '.ws', '.ag', '.com.ag', '.net.ag', '.org.ag',
        '.am', '.asia', '.at', '.be', '.com.br', '.net.br', '.name', '.live',
        '.news', '.bz', '.tech', '.pub', '.wang', '.space', '.top', '.xin',
        '.social', '.date', '.site', '.red', '.studio', '.link', '.online',
        '.help', '.kr', '.club', '.com.bz', '.net.bz', '.cc', '.band', '.market',
        '.com.co', '.net.co', '.nom.co', '.lawyer', '.de', '.es', '.com.es',
        '.nom.es', '.org.es', '.eu', '.wiki', '.design', '.software', '.fm',
        '.fr', '.gs', '.in', '.co.in', '.firm.in', '.gen.in', '.ind.in',
        '.net.in', '.org.in', '.it', '.jobs', '.jp', '.ms', '.com.mx', '.nl',
        '.nu', '.co.nz', '.net.nz', '.org.nz', '.se', '.tc', '.tk', '.tw',
        '.com.tw', '.idv.tw', '.org.tw', '.hk', '.co.uk', '.me.uk', '.org.uk',
        '.vg',
    })


    def extract_domain(domain):
        """Split a DNS name into ``(main_domain, subdomain)``.

        The name is lower-cased first. When it has at least three labels and
        ends in one of the known suffixes, the main domain is the suffix plus
        the label in front of it and the subdomain is everything before that
        (possibly the empty string). Every other name is handed to
        ``tldextract``.

        :param domain: dot-separated DNS name.
        :return: tuple ``(main_domain, subdomain)`` of strings.
        """
        domain = domain.lower()
        names = domain.split(".")
        if len(names) >= 3:
            # Try the two-label suffix (".co.uk") before the one-label one
            # (".uk"), otherwise "co.uk" would be taken for the main domain.
            if "." + ".".join(names[-2:]) in _KNOWN_SUFFIXES:
                return ".".join(names[-3:]), ".".join(names[:-3])
            if "." + names[-1] in _KNOWN_SUFFIXES:
                return ".".join(names[-2:]), ".".join(names[:-2])
        # NOTE(review): this message is also emitted for names with fewer than
        # three labels, as in the published listing - confirm that is intended.
        print("New domain suffix found. Use tld extract domain...")

        slash = domain.rfind("/")
        if slash >= 0:
            # The subdomain part may contain "/" (seen with DNS tunnel tools):
            # only the text after the last "/" is parsed, and the prefix is
            # kept as part of the subdomain.
            ext = tldextract.extract(domain[slash + 1:])
            subdomain = domain[:slash + 1] + ext.subdomain
        else:
            ext = tldextract.extract(domain)
            subdomain = ext.subdomain

        if ext.suffix:
            mdomain = ext.domain + "." + ext.suffix
        else:
            mdomain = ext.domain
        return mdomain, subdomain
     
    
    def parse(log):
        """Extract ``(main_domain, subdomain)`` from one '^'-separated log line.

        A line is accepted only when its protocol field is ``'17'`` and its
        destination-port field is ``'53'`` (presumably UDP DNS queries - the
        rejection message calls everything else "not a DNS"). The queried name
        is read from the 55th field and split by ``extract_domain``.

        Lines that are too short or are not DNS are reported on stdout and
        yield ``("", "")`` instead of raising.

        :param log: one raw log line.
        :return: tuple ``(main_domain, subdomain)``; both empty on rejection.
        """
        # Field positions are 1-based in the log format, hence the "- 1".
        DST_PORT_IDX = 6 - 1
        PROTOCOL_IDX = 7 - 1
        DNS_QUERY_NAME_IDX = 55 - 1  # domain

        data = log.split('^')
        # Guard before indexing: blank or truncated lines used to raise
        # IndexError on the protocol/port lookup.
        if len(data) <= PROTOCOL_IDX:
            print("error line:")
            print(log)
            return ("", "")

        if data[PROTOCOL_IDX] != '17' or data[DST_PORT_IDX] != '53':
            print("error line not a DNS:")
            print(log)
            return ("", "")

        if len(data) <= DNS_QUERY_NAME_IDX:
            print("error line:")
            print(log)
            return ("", "")

        mdomain, subdomain = extract_domain(data[DNS_QUERY_NAME_IDX])
        return (mdomain, subdomain)
    
    
     
    # Minimum length a subdomain must have to be used as a sample.
    MIN_LEN = 1

    # Number of HMM states (passed as n_components to GaussianHMM).
    N = 8
    # Likelihood threshold; not referenced by the code shown in this script.
    T = -50

    # Model file names: FILE_MODEL is the joblib-era name, FILE_MODEL2 is the
    # pickle file that train_hmm reads and writes.
    FILE_MODEL = "hmm-cdn.m"
    FILE_MODEL2 = "hmm-cdn-white.pkl"
     
    
    def get_cdn_domains(dir_path, max_count=3000):
        """Collect subdomain samples from every log file under *dir_path*.

        Despite the name, the function is not CDN specific: the script also
        calls it on the "black" and "white-like" directories.

        Each line of each file is run through ``parse``; the subdomain part is
        kept when it is at least ``MIN_LEN`` characters long. Reading stops as
        soon as *max_count* samples have been gathered.

        :param dir_path: directory that is walked recursively for log files.
        :param max_count: upper bound on the number of samples returned.
        :return: list of subdomain strings, in the order they were read.
        """
        domain_list = []
        for path in iterbrowse(dir_path):
            with open(path) as f:
                for line in f:
                    _mdomain, sub_domain = parse(line)
                    if len(sub_domain) < MIN_LEN:
                        continue
                    domain_list.append(sub_domain)
                    if len(domain_list) >= max_count:
                        return domain_list
        return domain_list
        
    
    def domain2ver(domain):
        """Encode *domain* as a list of one-element lists of character codes.

        Every character becomes ``[ord(char)]``, so the result has one row per
        character and a single column.
        """
        return [[ord(char)] for char in domain]
     
    
    def train_hmm(domain_list):
        """Return a Gaussian HMM, loading it from FILE_MODEL2 when that exists.

        If the pickle file is present it is loaded and returned as is and
        *domain_list* is ignored. Otherwise a new model is fitted on the
        encoded domains and pickled to FILE_MODEL2 for the next run.

        :param domain_list: iterable of domain strings to train on.
        :return: the loaded or freshly fitted model.
        :raises ValueError: if training is needed but there is no non-empty
            domain to train on.
        """
        if os.path.exists(FILE_MODEL2):
            print("found model file, use it...")
            # SECURITY: unpickling runs code embedded in the file; only load
            # model files that this script wrote itself.
            with open(FILE_MODEL2, 'rb') as file_model:
                return pickle.load(file_model)

        observations = []
        lengths = []
        for domain in domain_list:
            ver = domain2ver(domain)
            if not ver:
                # An empty string encodes to a zero-length sequence, which
                # carries no information for training.
                continue
            observations.extend(ver)
            lengths.append(len(ver))

        if not observations:
            raise ValueError("train_hmm needs at least one non-empty domain")

        # Build the array once: the earlier np.concatenate-per-domain loop was
        # quadratic and had to be seeded with a dummy [[0]] sample that ended
        # up in the training data.
        X = np.array(observations)

        remodel = hmm.GaussianHMM(n_components=N, covariance_type="full", n_iter=500)
        remodel.fit(X, lengths)

        # "with" closes the file even if pickling fails part-way.
        with open(FILE_MODEL2, 'wb') as file_model:
            pickle.dump(remodel, file_model)

        return remodel
     
    
    def test(remodel, domain_list):
        """Score every domain with *remodel* and print one line per domain.

        :param remodel: object exposing ``score(array)``; here the trained HMM.
        :param domain_list: iterable of domain strings.
        :return: tuple ``(x, y)`` of equal-length lists - the length of each
            domain and the score the model gave it.
        """
        x = []
        y = []
        for domain in domain_list:
            np_ver = np.array(domain2ver(domain))
            pro = remodel.score(np_ver)
            # %d shows only the integer part; the full value is kept in y.
            # Call form of print works on both Python 2 and Python 3.
            print("SCORE:(%d) DOMAIN:(%s) " % (pro, domain))
            x.append(len(domain))
            y.append(pro)
        return x, y
     
     
    if __name__ == '__main__':
        # Train on CDN plus white-like samples (or load the cached model if
        # train_hmm finds one on disk).
        cdn_domains = get_cdn_domains("/home/bonelee/latest_metadata_sample/labeled_cdn")
        white_domains = get_cdn_domains("/home/bonelee/latest_metadata_sample/labeled_white_like")
        remodel = train_hmm(cdn_domains + white_domains)

        # Single-argument print(...) behaves the same on Python 2 and 3.
        x_1, y_1 = test(remodel, cdn_domains)
        print(x_1)
        print(y_1)

        black_domains = get_cdn_domains("/home/bonelee/latest_metadata_sample/labeled_black")
        x_2, y_2 = test(remodel, black_domains)
        print(x_2)
        print(y_2)

        # Reuse the white-like samples loaded above instead of walking and
        # parsing the same directory a second time.
        x_3, y_3 = test(remodel, white_domains)
        print(x_3)
        print(y_3)

        # Scatter plot of domain length against model score; the white-like
        # set is scored and printed above but left out of the plot.
        fig, ax = plt.subplots()
        ax.set_xlabel('Domain Length')
        ax.set_ylabel('HMM Score')
        ax.scatter(x_2, y_2, color='g', label="DNS tunnel")
        ax.scatter(x_1, y_1, color='r', label="CDN")
        ax.legend(loc='right')
        plt.show()

     其中:X = [[0]]、X_lens = [1] 的初始化也可以改写成下面的方式(X = []、X_lens = []),从而去掉冗余的占位样本。

    def train_hmm(domain_list):
        """Return a Gaussian HMM, loading it from FILE_MODEL2 when that exists.

        If the pickle file is present it is loaded and returned as is and
        *domain_list* is ignored. Otherwise a new model is fitted on the
        encoded domains and pickled to FILE_MODEL2 for the next run.

        :param domain_list: iterable of domain strings to train on.
        :return: the loaded or freshly fitted model.
        :raises ValueError: if training is needed but there is no non-empty
            domain to train on.
        """
        if os.path.exists(FILE_MODEL2):
            print("found model file, use it...")
            # SECURITY: unpickling runs code embedded in the file; only load
            # model files that this script wrote itself.
            with open(FILE_MODEL2, 'rb') as file_model:
                return pickle.load(file_model)

        X = []
        X_lens = []
        for domain in domain_list:
            ver = domain2ver(domain)
            if not ver:
                # An empty string encodes to a zero-length sequence, which
                # carries no information for training.
                continue
            # extend() appends in place; "X = X + ver" copied the whole list
            # on every iteration. List concatenation cannot raise ValueError,
            # so the old try/except around it was dead code.
            X.extend(ver)
            X_lens.append(len(ver))

        if not X:
            raise ValueError("train_hmm needs at least one non-empty domain")

        remodel = hmm.GaussianHMM(n_components=N, covariance_type="full", n_iter=500)
        remodel.fit(X, X_lens)

        # "with" closes the file even if pickling fails part-way.
        with open(FILE_MODEL2, 'wb') as file_model:
            pickle.dump(remodel, file_model)

        return remodel
  • 相关阅读:
    使用 requests 维持会话
    使用 requests 发送 POST 请求
    使用 requests 发送 GET 请求
    requests 安装
    使用 urllib 分析 Robots 协议
    使用 urllib 解析 URL 链接
    使用 urllib 处理 HTTP 异常
    使用 urllib 处理 Cookies 信息
    使用 urllib 设置代理服务
    按单生产程序发布
  • 原文地址:https://www.cnblogs.com/bonelee/p/7986678.html
Copyright © 2011-2022 走看看