zoukankan      html  css  js  c++  java
  • DGA ngram kmeans+TSNE用于绘图

    # -*- coding:utf-8 -*-
    
    import sys
    import re
    import numpy as np
    from sklearn.externals import joblib
    import csv
    import matplotlib.pyplot as plt
    import os
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn import cross_validation
    import os
    from sklearn.naive_bayes import GaussianNB
    from sklearn.cluster import KMeans
    from sklearn.manifold import TSNE
    
    
    # Minimum length a domain must have to be processed
    MIN_LEN=10
    
    # Degree of randomness: seed passed as random_state to KMeans below
    random_state = 170
    
    
    def load_alexa(filename, min_len=None):
        """Load domains from the second column of a CSV file.

        Args:
            filename: Path to the CSV file. The domain is read from the
                second column of each row (presumably ``rank,domain`` as in
                the Alexa top-sites export -- confirm against the data file).
            min_len: Minimum number of characters a domain must have to be
                kept. Defaults to the module-level ``MIN_LEN``.

        Returns:
            A list of the domain strings whose length is at least ``min_len``,
            in file order.
        """
        if min_len is None:
            min_len = MIN_LEN

        domain_list = []
        # ``with`` guarantees the file handle is closed even if parsing fails.
        with open(filename) as csv_file:
            for row in csv.reader(csv_file):
                # Blank or malformed rows have no second column; skip them
                # instead of raising IndexError.
                if len(row) < 2:
                    continue
                domain = row[1]
                # Compare the *length* of the domain: comparing the string
                # itself with an int is always true on Python 2 and a
                # TypeError on Python 3.
                if len(domain) >= min_len:
                    domain_list.append(domain)
        return domain_list
    
    
    def load_dga(filename, min_len=None):
        """Load domains from a comma-separated DGA feed file.

        Each line is expected to look like::

            xsxqeadsbgvpdke.co.uk,Domain used by Cryptolocker - Flashback DGA for 13 Apr 2017,2017-04-13,

        (format of http://osint.bambenekconsulting.com/manual/cl.txt); only
        the first comma-separated field, the domain, is used.

        Args:
            filename: Path to the feed file.
            min_len: Minimum number of characters a domain must have to be
                kept. Defaults to the module-level ``MIN_LEN``.

        Returns:
            A list of the domain strings whose length is at least ``min_len``,
            in file order.
        """
        if min_len is None:
            min_len = MIN_LEN

        domain_list = []
        with open(filename) as f:
            for line in f:
                # Strip so a line without a comma does not keep its newline.
                domain = line.split(",")[0].strip()
                # Skip blank lines and '#' header/comment lines; a domain
                # name can never start with '#'.
                if not domain or domain.startswith("#"):
                    continue
                # Compare the *length* of the domain: comparing the string
                # itself with an int is always true on Python 2 and a
                # TypeError on Python 3.
                if len(domain) >= min_len:
                    domain_list.append(domain)
        return domain_list
    
    
    def nb_dga():
        """Cross-validate a Gaussian naive Bayes classifier on domain bigrams.

        Loads one list of domains via ``load_alexa`` (label 0) and two lists
        via ``load_dga`` (labels 1 and 2), turns every domain into counts of
        character bigrams, and prints the 3-fold cross-validation scores of a
        ``GaussianNB`` classifier.
        """
        x1_domain_list = load_alexa("../data/top-1000.csv")
        x2_domain_list = load_dga("../data/dga-cryptolocke-1000.txt")
        x3_domain_list = load_dga("../data/dga-post-tovar-goz-1000.txt")

        x_domain_list = np.concatenate(
            (x1_domain_list, x2_domain_list, x3_domain_list))

        # One class label per source file, in the same order as the samples.
        y = np.concatenate((
            [0] * len(x1_domain_list),
            [1] * len(x2_domain_list),
            [2] * len(x3_domain_list),
        ))

        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(x_domain_list)

        # token_pattern must be r"\w" so that every word character is its own
        # token and ngram_range=(2, 2) yields character bigrams. A bare "w"
        # would only ever match the literal letter 'w'.
        cv = CountVectorizer(ngram_range=(2, 2), decode_error="ignore",
                             token_pattern=r"\w", min_df=1)
        x = cv.fit_transform(x_domain_list).toarray()

        clf = GaussianNB()
        print(cross_validation.cross_val_score(clf, x, y, n_jobs=-1, cv=3))
    
    def kmeans_dga():
        """Cluster domains with KMeans and plot them in 2-D using t-SNE.

        Loads one list of domains via ``load_alexa`` and two lists via
        ``load_dga``, turns every domain into counts of character bigrams,
        assigns each domain to one of two KMeans clusters, projects the
        bigram vectors to two dimensions with t-SNE, and shows a scatter plot
        where cluster 1 is drawn with 'o' markers and the other cluster with
        'x' markers.
        """
        x1_domain_list = load_alexa("../data/dga/top-100.csv")
        x2_domain_list = load_dga("../data/dga/dga-cryptolocke-50.txt")
        x3_domain_list = load_dga("../data/dga/dga-post-tovar-goz-50.txt")

        x_domain_list = np.concatenate(
            (x1_domain_list, x2_domain_list, x3_domain_list))

        # token_pattern must be r"\w" so that every word character is its own
        # token and ngram_range=(2, 2) yields character bigrams. A bare "w"
        # would only ever match the literal letter 'w'.
        cv = CountVectorizer(ngram_range=(2, 2), decode_error="ignore",
                             token_pattern=r"\w", min_df=1)
        x = cv.fit_transform(x_domain_list).toarray()

        # Clustering is unsupervised, so no ground-truth labels are needed.
        model = KMeans(n_clusters=2, random_state=random_state)
        y_pred = model.fit_predict(x)

        # Reduce the bigram vectors to two dimensions for plotting.
        tsne = TSNE(learning_rate=100)
        x = tsne.fit_transform(x)
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(x)
        print(x_domain_list)

        # Draw each cluster with one scatter call: faster than one call per
        # point, and every cluster gets a single consistent colour.
        in_cluster_one = (y_pred == 1)
        plt.scatter(x[in_cluster_one, 0], x[in_cluster_one, 1], marker='o')
        plt.scatter(x[~in_cluster_one, 0], x[~in_cluster_one, 1], marker='x')

        plt.show()
    
    if __name__ == "__main__":
        # Script entry point: only the KMeans + t-SNE plot is run.
        # The naive Bayes experiment is left disabled: nb_dga()
        kmeans_dga()
  • 相关阅读:
    设计模式(九)外观模式Facade(结构型)
    设计模式(八)装饰器模式Decorator(结构型)
    Linux新手生存笔记[1]——Linux目录结构及说明
    设计模式(三)建造者模式Builder(创建型)
    设计模式(七)组合模式Composite(结构型)
    Linux新手生存笔记[0]——写在前面
    给出两个数m和n,求它们的最大公因子,即能够同时整出m和n的最大正整数
    Linux新手生存笔记[2]——vim训练稿
    Linux新手生存笔记[10]——shell脚本基础3函数及常用命令
    设计模式 ( 十二 ) 职责链模式(Chain of Responsibility)(对象行为
  • 原文地址:https://www.cnblogs.com/bonelee/p/7850022.html
Copyright © 2011-2022 走看看