zoukankan      html  css  js  c++  java
  • hmm CDN检测

    # -*- coding:utf-8 -*-
     
    import sys
    import re
    from hmmlearn import hmm
    import numpy as np
    from sklearn.externals import joblib
    import matplotlib.pyplot as plt
    import tldextract
    import os
    
    
    def iterbrowse(path):
        """Lazily yield the full path of every file found beneath ``path``.

        The tree is traversed with ``os.walk``; only files are yielded, never
        the directories themselves.
        """
        for directory, _subdirs, file_names in os.walk(path):
            full_paths = (os.path.join(directory, name) for name in file_names)
            for full_path in full_paths:
                yield full_path
    
    
    # Public suffixes recognised without consulting tldextract.  Built once at
    # import time instead of on every extract_domain() call.
    _KNOWN_SUFFIXES = frozenset([
        '.com', '.la', '.io', '.co', '.cn', '.info', '.net', '.org', '.me',
        '.mobi', '.us', '.biz', '.xxx', '.ca', '.co.jp', '.com.cn', '.net.cn',
        '.org.cn', '.mx', '.tv', '.ws', '.ag', '.com.ag', '.net.ag', '.org.ag',
        '.am', '.asia', '.at', '.be', '.com.br', '.net.br', '.name', '.live',
        '.news', '.bz', '.tech', '.pub', '.wang', '.space', '.top', '.xin',
        '.social', '.date', '.site', '.red', '.studio', '.link', '.online',
        '.help', '.kr', '.club', '.com.bz', '.net.bz', '.cc', '.band',
        '.market', '.com.co', '.net.co', '.nom.co', '.lawyer', '.de', '.es',
        '.com.es', '.nom.es', '.org.es', '.eu', '.wiki', '.design',
        '.software', '.fm', '.fr', '.gs', '.in', '.co.in', '.firm.in',
        '.gen.in', '.ind.in', '.net.in', '.org.in', '.it', '.jobs', '.jp',
        '.ms', '.com.mx', '.nl', '.nu', '.co.nz', '.net.nz', '.org.nz', '.se',
        '.tc', '.tk', '.tw', '.com.tw', '.idv.tw', '.org.tw', '.hk', '.co.uk',
        '.me.uk', '.org.uk', '.vg',
    ])


    def extract_domain(domain):
        """Split a DNS name into its registered domain and its sub-domain part.

        The built-in suffix table is consulted first; only names with at least
        three labels can match it.  Every other name falls back to
        ``tldextract``.

        Args:
            domain: DNS name to split; it is lower-cased before processing.

        Returns:
            A ``(main_domain, subdomain)`` tuple of strings, for example
            ``("example.com.cn", "a.b")`` for ``"a.b.example.com.cn"``.
            ``subdomain`` is ``""`` when there are no sub-domain labels.
        """
        domain = domain.lower()
        names = domain.split(".")
        if len(names) >= 3:
            # Try the two-label suffix first so ".com.cn" wins over ".cn".
            if ("." + ".".join(names[-2:])) in _KNOWN_SUFFIXES:
                return ".".join(names[-3:]), ".".join(names[:-3])
            elif ("." + names[-1]) in _KNOWN_SUFFIXES:
                return ".".join(names[-2:]), ".".join(names[:-2])
        print("New domain suffix found. Use tld extract domain...")

        pos = domain.rfind("/")
        if pos >= 0:
            # The sub-domain may itself contain "/" (seen with DNS tunnel
            # tools): parse only what follows the last "/" and keep the
            # prefix as part of the sub-domain.
            ext = tldextract.extract(domain[pos + 1:])
            subdomain = domain[:pos + 1] + ext.subdomain
        else:
            ext = tldextract.extract(domain)
            subdomain = ext.subdomain
        if ext.suffix:
            mdomain = ext.domain + "." + ext.suffix
        else:
            mdomain = ext.domain
        return mdomain, subdomain
     
    
    def parse(log):
        """Extract the queried DNS name from one '^'-separated log record.

        A record is treated as a DNS query when its protocol field (7th) is
        ``'17'`` and its destination-port field (6th) is ``'53'``; the queried
        name is then read from the 55th field.

        Args:
            log: one raw log line.

        Returns:
            The ``(main_domain, subdomain)`` tuple from ``extract_domain``, or
            ``("", "")`` (after printing the offending line) when the record
            is too short or is not a DNS query.
        """
        DST_PORT_IDX = 6 - 1
        PROTOCOL_IDX = 7 - 1
        DNS_QUERY_NAME_IDX = 55 - 1  # domain

        data = log.split('^')
        if len(data) <= PROTOCOL_IDX:
            # Blank or truncated line: report it instead of letting the field
            # lookups below raise IndexError.
            print("error line:")
            print(log)
            return ("", "")

        protol = data[PROTOCOL_IDX]
        dstport = data[DST_PORT_IDX]
        if not ('17' == protol and '53' == dstport):
            print("error line not a DNS:")
            print(log)
            return ("", "")

        if len(data) <= DNS_QUERY_NAME_IDX:
            print("error line:")
            print(log)
            return ("", "")

        mdomain, subdomain = extract_domain(data[DNS_QUERY_NAME_IDX])
        return (mdomain, subdomain)
    
    
     
    # Minimum length a domain string must have to be processed.
    MIN_LEN = 3

    # Number of HMM states.
    N = 5
    # Maximum-likelihood probability threshold.
    T = -50

    # Name of the file the model is stored in.
    FILE_MODEL = "hmm-cdn.m"
     
    
    def get_cdn_domains(dir_path, max_domains=2000):
        """Collect sub-domain strings from every log file under ``dir_path``.

        Each line of each file is run through ``parse``; the sub-domain part
        is kept when it is at least ``MIN_LEN`` characters long.

        Args:
            dir_path: directory that is walked recursively for log files.
            max_domains: stop as soon as this many sub-domains have been
                collected; ``None`` disables the cap.

        Returns:
            A list of sub-domain strings, in the order they were read.
        """
        domain_list = []
        for path in iterbrowse(dir_path):
            with open(path) as f:
                for line in f:
                    _mdomain, sub_domain = parse(line)
                    if len(sub_domain) < MIN_LEN:
                        continue
                    domain_list.append(sub_domain)
                    if max_domains is not None and len(domain_list) >= max_domains:
                        return domain_list
        return domain_list
        
    
    def domain2ver(domain):
        """Encode a string as a column of character code points.

        Args:
            domain: text to encode.

        Returns:
            A list holding one single-element list ``[ord(ch)]`` per character
            of ``domain`` (``[]`` for an empty string), so that
            ``np.array(result)`` has one row per character and one column.
        """
        return [[ord(ch)] for ch in domain]
     
    
    def train_hmm(domain_list):
        """Fit a Gaussian HMM on the given domain strings and save it to disk.

        Every domain becomes one observation sequence of character codes (see
        ``domain2ver``).  The fitted model has ``N`` states and is dumped to
        ``FILE_MODEL`` with joblib.

        Args:
            domain_list: iterable of domain strings to train on.

        Returns:
            The fitted ``hmm.GaussianHMM`` instance.

        Raises:
            ValueError: if ``domain_list`` contains no non-empty string.
        """
        # Empty strings carry no observations and would not have the
        # (n, 1) shape required for concatenation, so they are dropped.
        sequences = [np.array(ver) for ver in map(domain2ver, domain_list) if ver]
        if not sequences:
            raise ValueError("no non-empty domain to train the HMM on")

        # Concatenate once (growing the array inside the loop is quadratic)
        # and do not seed the data with an artificial [[0]] sequence.
        X = np.concatenate(sequences)
        X_lens = [len(seq) for seq in sequences]

        remodel = hmm.GaussianHMM(n_components=N, covariance_type="full", n_iter=100)
        remodel.fit(X, X_lens)
        joblib.dump(remodel, FILE_MODEL)

        return remodel
     
    
    def test(remodel, domain_list):
        """Score every domain with ``remodel`` and print one line per domain.

        Args:
            remodel: model object exposing ``score(observations)``.
            domain_list: iterable of domain strings.

        Returns:
            ``(lengths, scores)``: two parallel lists holding, per domain, its
            length and the value returned by ``remodel.score``.
        """
        lengths = []
        scores = []
        for name in domain_list:
            observations = np.array(domain2ver(name))
            score = remodel.score(observations)
            print("SCORE:(%d) DOMAIN:(%s) " % (score, name))
            lengths.append(len(name))
            scores.append(score)
        return lengths, scores
     
     
    if __name__ == '__main__':
        # Train on the labelled CDN sample; the model is then read back from
        # FILE_MODEL, so all scores below come from the persisted copy.
        cdn_domains = get_cdn_domains("/home/bonelee/latest_metadata_sample/labeled_cdn")
        train_hmm(cdn_domains)
        remodel = joblib.load(FILE_MODEL)

        x_1, y_1 = test(remodel, cdn_domains)
        print(x_1)
        print(y_1)

        black_domains = get_cdn_domains("/home/bonelee/latest_metadata_sample/labeled_black")
        x_2, y_2 = test(remodel, black_domains)
        print(x_2)
        print(y_2)

        white_domains = get_cdn_domains("/home/bonelee/latest_metadata_sample/labeled_white_like")
        x_3, y_3 = test(remodel, white_domains)
        print(x_3)
        print(y_3)

        # Scatter plot of score against domain length, one colour per sample.
        fig, ax = plt.subplots()
        ax.set_xlabel('Domain Length')
        ax.set_ylabel('HMM Score')
        series = (
            (x_3, y_3, 'b', "WHITE"),
            (x_2, y_2, 'g', "BLACK"),
            (x_1, y_1, 'r', "CDN"),
        )
        for xs, ys, colour, label in series:
            ax.scatter(xs, ys, color=colour, label=label)
        ax.legend(loc='right')
        plt.show()

    使用pickle保存和加载模型:

    # -*- coding:utf-8 -*-
     
    import sys
    import re
    from hmmlearn import hmm
    import numpy as np
    #from sklearn.externals import joblib
    import matplotlib.pyplot as plt
    import tldextract
    import os
    import pickle
    
    def iterbrowse(path):
        """Generate the full path of each file located anywhere under ``path``.

        Walks the directory tree with ``os.walk`` and joins every file name
        with the directory that contains it; directories are not yielded.
        """
        for entry in os.walk(path):
            parent, file_names = entry[0], entry[2]
            for file_name in file_names:
                yield os.path.join(parent, file_name)
    
    
    # Public suffixes recognised without consulting tldextract.  Built once at
    # import time instead of on every extract_domain() call.
    _KNOWN_SUFFIXES = frozenset([
        '.com', '.la', '.io', '.co', '.cn', '.info', '.net', '.org', '.me',
        '.mobi', '.us', '.biz', '.xxx', '.ca', '.co.jp', '.com.cn', '.net.cn',
        '.org.cn', '.mx', '.tv', '.ws', '.ag', '.com.ag', '.net.ag', '.org.ag',
        '.am', '.asia', '.at', '.be', '.com.br', '.net.br', '.name', '.live',
        '.news', '.bz', '.tech', '.pub', '.wang', '.space', '.top', '.xin',
        '.social', '.date', '.site', '.red', '.studio', '.link', '.online',
        '.help', '.kr', '.club', '.com.bz', '.net.bz', '.cc', '.band',
        '.market', '.com.co', '.net.co', '.nom.co', '.lawyer', '.de', '.es',
        '.com.es', '.nom.es', '.org.es', '.eu', '.wiki', '.design',
        '.software', '.fm', '.fr', '.gs', '.in', '.co.in', '.firm.in',
        '.gen.in', '.ind.in', '.net.in', '.org.in', '.it', '.jobs', '.jp',
        '.ms', '.com.mx', '.nl', '.nu', '.co.nz', '.net.nz', '.org.nz', '.se',
        '.tc', '.tk', '.tw', '.com.tw', '.idv.tw', '.org.tw', '.hk', '.co.uk',
        '.me.uk', '.org.uk', '.vg',
    ])


    def extract_domain(domain):
        """Split a DNS name into its registered domain and its sub-domain part.

        The built-in suffix table is consulted first; only names with at least
        three labels can match it.  Every other name falls back to
        ``tldextract``.

        Args:
            domain: DNS name to split; it is lower-cased before processing.

        Returns:
            A ``(main_domain, subdomain)`` tuple of strings, for example
            ``("example.com.cn", "a.b")`` for ``"a.b.example.com.cn"``.
            ``subdomain`` is ``""`` when there are no sub-domain labels.
        """
        domain = domain.lower()
        names = domain.split(".")
        if len(names) >= 3:
            # Try the two-label suffix first so ".com.cn" wins over ".cn".
            if ("." + ".".join(names[-2:])) in _KNOWN_SUFFIXES:
                return ".".join(names[-3:]), ".".join(names[:-3])
            elif ("." + names[-1]) in _KNOWN_SUFFIXES:
                return ".".join(names[-2:]), ".".join(names[:-2])
        print("New domain suffix found. Use tld extract domain...")

        pos = domain.rfind("/")
        if pos >= 0:
            # The sub-domain may itself contain "/" (seen with DNS tunnel
            # tools): parse only what follows the last "/" and keep the
            # prefix as part of the sub-domain.
            ext = tldextract.extract(domain[pos + 1:])
            subdomain = domain[:pos + 1] + ext.subdomain
        else:
            ext = tldextract.extract(domain)
            subdomain = ext.subdomain
        if ext.suffix:
            mdomain = ext.domain + "." + ext.suffix
        else:
            mdomain = ext.domain
        return mdomain, subdomain
     
    
    def parse(log):
        """Extract the queried DNS name from one '^'-separated log record.

        A record is treated as a DNS query when its protocol field (7th) is
        ``'17'`` and its destination-port field (6th) is ``'53'``; the queried
        name is then read from the 55th field.

        Args:
            log: one raw log line.

        Returns:
            The ``(main_domain, subdomain)`` tuple from ``extract_domain``, or
            ``("", "")`` (after printing the offending line) when the record
            is too short or is not a DNS query.
        """
        DST_PORT_IDX = 6 - 1
        PROTOCOL_IDX = 7 - 1
        DNS_QUERY_NAME_IDX = 55 - 1  # domain

        data = log.split('^')
        if len(data) <= PROTOCOL_IDX:
            # Blank or truncated line: report it instead of letting the field
            # lookups below raise IndexError.
            print("error line:")
            print(log)
            return ("", "")

        protol = data[PROTOCOL_IDX]
        dstport = data[DST_PORT_IDX]
        if not ('17' == protol and '53' == dstport):
            print("error line not a DNS:")
            print(log)
            return ("", "")

        if len(data) <= DNS_QUERY_NAME_IDX:
            print("error line:")
            print(log)
            return ("", "")

        mdomain, subdomain = extract_domain(data[DNS_QUERY_NAME_IDX])
        return (mdomain, subdomain)
    
    
     
    # Minimum length a domain string must have to be processed.
    MIN_LEN = 1

    # Number of HMM states.
    N = 8
    # Maximum-likelihood probability threshold.
    T = -50

    # Names of the files the models are stored in.
    FILE_MODEL = "hmm-cdn.m"
    FILE_MODEL2 = "hmm-cdn-white.pkl"
     
    
    def get_cdn_domains(dir_path, max_domains=3000):
        """Collect sub-domain strings from every log file under ``dir_path``.

        Each line of each file is run through ``parse``; the sub-domain part
        is kept when it is at least ``MIN_LEN`` characters long.

        Args:
            dir_path: directory that is walked recursively for log files.
            max_domains: stop as soon as this many sub-domains have been
                collected; ``None`` disables the cap.

        Returns:
            A list of sub-domain strings, in the order they were read.
        """
        domain_list = []
        for path in iterbrowse(dir_path):
            with open(path) as f:
                for line in f:
                    _mdomain, sub_domain = parse(line)
                    if len(sub_domain) < MIN_LEN:
                        continue
                    domain_list.append(sub_domain)
                    if max_domains is not None and len(domain_list) >= max_domains:
                        return domain_list
        return domain_list
        
    
    def domain2ver(domain):
        """Encode a string as a column of character code points.

        Args:
            domain: text to encode.

        Returns:
            A list holding one single-element list ``[ord(ch)]`` per character
            of ``domain`` (``[]`` for an empty string), so that
            ``np.array(result)`` has one row per character and one column.
        """
        return [[ord(ch)] for ch in domain]
     
    
    def train_hmm(domain_list):
        """Return the cached model if one exists, otherwise train and cache it.

        When ``FILE_MODEL2`` exists it is unpickled and returned, and
        ``domain_list`` is ignored.  Otherwise a Gaussian HMM with ``N``
        states is fitted on the character-code sequences of the domains (see
        ``domain2ver``) and pickled to ``FILE_MODEL2``.

        Args:
            domain_list: iterable of domain strings to train on.

        Returns:
            The loaded or freshly fitted model.

        Raises:
            ValueError: if training is needed and ``domain_list`` contains no
                non-empty string.
        """
        if os.path.exists(FILE_MODEL2):
            print("found model file, use it...")
            # NOTE: unpickling can execute arbitrary code; only load model
            # files that this script produced itself.
            with open(FILE_MODEL2, 'rb') as file_model:
                return pickle.load(file_model)

        # Empty strings carry no observations and would not have the
        # (n, 1) shape required for concatenation, so they are dropped.
        sequences = [np.array(ver) for ver in map(domain2ver, domain_list) if ver]
        if not sequences:
            raise ValueError("no non-empty domain to train the HMM on")

        # Concatenate once (growing the array inside the loop is quadratic)
        # and do not seed the data with an artificial [[0]] sequence.
        X = np.concatenate(sequences)
        X_lens = [len(seq) for seq in sequences]

        # Other covariance_type values: "spherical", "diag", "tied".
        remodel = hmm.GaussianHMM(n_components=N, covariance_type="full", n_iter=500)
        remodel.fit(X, X_lens)

        with open(FILE_MODEL2, 'wb') as file_model:
            pickle.dump(remodel, file_model)

        return remodel
     
    
    def test(remodel, domain_list):
        """Print and collect the model score of each domain.

        Args:
            remodel: model object exposing ``score(observations)``.
            domain_list: iterable of domain strings.

        Returns:
            A pair of parallel lists: the length of every domain, and the
            value ``remodel.score`` returned for it.
        """
        sizes, results = [], []
        for entry in domain_list:
            encoded = np.array(domain2ver(entry))
            value = remodel.score(encoded)
            print("SCORE:(%d) DOMAIN:(%s) " % (value, entry))
            sizes.append(len(entry))
            results.append(value)
        return sizes, results
     
     
    if __name__ == '__main__':
        # The model is trained on the CDN sample plus the white-like sample
        # (or loaded from the cached pickle by train_hmm).
        cdn_domains = get_cdn_domains("/home/bonelee/latest_metadata_sample/labeled_cdn")
        white_training = get_cdn_domains("/home/bonelee/latest_metadata_sample/labeled_white_like")
        remodel = train_hmm(cdn_domains + white_training)

        x_1, y_1 = test(remodel, cdn_domains)
        print(x_1)
        print(y_1)

        black_domains = get_cdn_domains("/home/bonelee/latest_metadata_sample/labeled_black")
        x_2, y_2 = test(remodel, black_domains)
        print(x_2)
        print(y_2)

        white_domains = get_cdn_domains("/home/bonelee/latest_metadata_sample/labeled_white_like")
        x_3, y_3 = test(remodel, white_domains)
        print(x_3)
        print(y_3)

        # Scatter plot of score against domain length; the white-like sample
        # is scored and printed above but not plotted.
        fig, ax = plt.subplots()
        ax.set_xlabel('Domain Length')
        ax.set_ylabel('HMM Score')
        series = (
            (x_2, y_2, 'g', "DNS tunnel"),
            (x_1, y_1, 'r', "CDN"),
        )
        for xs, ys, colour, label in series:
            ax.scatter(xs, ys, color=colour, label=label)
        ax.legend(loc='right')
        plt.show()

     其中:X = [[0]]、X_lens = [1] 的初始化也可以改成下面的写法(X = []、X_lens = []),从而去掉冗余的初始样本。

    def train_hmm(domain_list):
        """Return the cached model if one exists, otherwise train and cache it.

        When ``FILE_MODEL2`` exists it is unpickled and returned, and
        ``domain_list`` is ignored.  Otherwise a Gaussian HMM with ``N``
        states is fitted on the character-code sequences of the domains (see
        ``domain2ver``) and pickled to ``FILE_MODEL2``.

        Args:
            domain_list: iterable of domain strings to train on.

        Returns:
            The loaded or freshly fitted model.

        Raises:
            ValueError: if training is needed and ``domain_list`` contains no
                non-empty string.
        """
        if os.path.exists(FILE_MODEL2):
            print("found model file, use it...")
            # NOTE: unpickling can execute arbitrary code; only load model
            # files that this script produced itself.
            with open(FILE_MODEL2, 'rb') as file_model:
                return pickle.load(file_model)

        X = []
        X_lens = []
        for domain in domain_list:
            ver = domain2ver(domain)
            if not ver:
                # An empty string would add a zero-length sequence.
                continue
            # extend() appends in place; "X = X + ver" copies the whole list
            # on every iteration.
            X.extend(ver)
            X_lens.append(len(ver))
        if not X:
            raise ValueError("no non-empty domain to train the HMM on")

        # Other covariance_type values: "spherical", "diag", "tied".
        remodel = hmm.GaussianHMM(n_components=N, covariance_type="full", n_iter=500)
        remodel.fit(X, X_lens)

        with open(FILE_MODEL2, 'wb') as file_model:
            pickle.dump(remodel, file_model)

        return remodel
  • 相关阅读:
    Redis 客户端连接
    Redis 性能测试
    Redis 安全
    Redis 数据备份与恢复
    Redis 数据类型
    Redis 配置
    Redis 安装
    Redis 简介
    Redis教程
    如何修改Oracle Enterprise Linux时区?
  • 原文地址:https://www.cnblogs.com/bonelee/p/7986678.html
Copyright © 2011-2022 走看看