  • Detecting anomalous user command sequences with an LSTM classifier [a Naive Bayes approach, treating each command sequence the way a spam filter treats an e-mail, would also work; a minimal sketch of that alternative appears right after the dataset sample below]

    By collecting the bash operation logs of Linux servers, a model is trained to learn a specific user's command habits and then to flag operations that deviate from them.

    The SEA dataset covers behavior logs from more than 70 UNIX users, recorded by the UNIX acct accounting mechanism as sequences of commands. 15,000 commands were collected per user. Fifty users were randomly chosen as normal users; blocks of commands from the remaining users were randomly inserted as simulated insider (masquerader) attack data. Each user's 15,000 commands are grouped into blocks of 100, giving 150 blocks; the training set contains 80 blocks and the test set 70.


    Dataset sample:

    cpp
    sh
    xrdb
    cpp
    sh
    xrdb
    mkpts
    test
    stty
    hostname
    date
    echo
    [
    find
    chmod
    tty
    echo
    env
    echo
    sh
    userenv
    wait4wm
    xhost
    xsetroot
    reaper
    xmodmap
    sh
    [
    cat
    stty
    hostname
    date
    echo
    [
    find
    chmod
    tty
    echo
    sh
    more
    sh
    more
    sh
    more
    sh
    more
    sh
    more
    sh
    more
    sh
    more
    sh
    more
    sh
    more
    sh
    more
    sh
    more
    sh
    launchef
    launchef
    sh
    9term
    sh
    launchef
    sh
    launchef
    hostname
    [
    cat
    stty
    hostname
    date
    echo
    [
    find
    chmod
    tty
    echo
    sh
    more
    sh
    more
    sh
    ex
    sendmail
    sendmail
    sh
    MediaMai
    sendmail
    sh
    rm
    MediaMai
    sh
    rm
    MediaMai
    launchef
    launchef
    sh
    sh
    more
    sh
    sh
    rm
    MediaMai
    netstat
    netscape
    netscape
    netscape
    netscape
    netscape
    netscape
    netscape
    netscape
    netscape
    netscape
    netscape
    netscape
    netscape
    netscape
    netscape
    netscape
    netscape
    netscape
    netscape
    sh
    netscape
    more
    sh
    rm
    sh
    MediaMai
    =
    telnet
    tput
    netscape
    netscape
    netscape
    netscape
    netscape
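
    As a point of comparison, here is a minimal sketch of the Naive Bayes alternative mentioned in the title: each 100-command block is flattened into a "document" and classified with a bag-of-commands model, exactly the way spam filters classify e-mail text. The file paths, the label column (index 6 for User7) and the 80/70 block split mirror the LSTM script that follows; treat it as an illustrative baseline, not a tuned implementation.

    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.naive_bayes import MultinomialNB

    def load_blocks(filename, block_size=100):
        # Read the raw command stream and join every 100 commands into one "document"
        with open(filename) as f:
            cmds = [line.strip() for line in f]
        return [" ".join(cmds[i:i + block_size]) for i in range(0, len(cmds), block_size)]

    blocks = load_blocks("../data/MasqueradeDat/User7")

    # The first 50 blocks are known clean; the remaining 100 get labels from label.txt (column index 6 = User7)
    with open("../data/MasqueradeDat/label.txt") as f:
        labels = [int(line.split()[6]) for line in f]
    y = [0] * 50 + labels

    # Bag-of-commands: keep every whitespace-separated token, including short ones like "[" or "sh"
    vectorizer = CountVectorizer(token_pattern=r"\S+")
    X = vectorizer.fit_transform(blocks)

    clf = MultinomialNB()
    clf.fit(X[:80], y[:80])                  # same 80-block training split as the LSTM script
    print clf.score(X[80:150], y[80:150])    # accuracy on the 70 held-out blocks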
    
    # -*- coding:utf-8 -*-
    
    import sys
    
    import re
    import numpy as np
    
    
    import nltk
    import csv
    import matplotlib.pyplot as plt
    from nltk.probability import FreqDist
    from sklearn.feature_extraction.text import CountVectorizer
    
    from sklearn import cross_validation
    from sklearn.neighbors import KNeighborsClassifier
    from tflearn.data_utils import to_categorical, pad_sequences
    from tflearn.datasets import imdb
    import tflearn
    
    # number of command blocks used for training (the remaining 70 of the 150 blocks are the test set)
    N=80
    
    def load_user_cmd_new(filename):
        cmd_list=[]
        dist=[]
        with open(filename) as f:
            i=0
            x=[]
            for line in f:
                line=line.strip('\n')
                x.append(line)
                dist.append(line)
                i+=1
                if i == 100:
                    cmd_list.append(x)
                    x=[]
                    i=0
    
        # vocabulary of distinct commands (NLTK 2.x FreqDist.keys() is ordered by descending frequency)
        fdist = FreqDist(dist).keys()
        return cmd_list,fdist
    
    def load_user_cmd(filename):
        cmd_list=[]
        dist_max=[]
        dist_min=[]
        dist=[]
        with open(filename) as f:
            i=0
            x=[]
            for line in f:
                line=line.strip('\n')
                x.append(line)
                dist.append(line)
                i+=1
                if i == 100:
                    cmd_list.append(x)
                    x=[]
                    i=0
    
        # NLTK 2.x FreqDist.keys() is frequency-ordered: take the user's 50 most and 50 least frequent commands
        fdist = FreqDist(dist).keys()
        dist_max=set(fdist[0:50])
        dist_min=set(fdist[-50:])
        return cmd_list,dist_max,dist_min
    
    def get_user_cmd_feature(user_cmd_list,dist_max,dist_min):
        # Hand-crafted per-block features for the KNN baseline:
        # f1 = number of distinct commands in the block,
        # f2 = how many of the block's 10 most frequent commands fall in the user's overall top 50,
        # f3 = how many of its 10 least frequent commands fall in the overall bottom 50
        user_cmd_feature=[]
        for cmd_block in user_cmd_list:
            f1=len(set(cmd_block))
            fdist = FreqDist(cmd_block).keys()
            f2=len(set(fdist[0:10]) & set(dist_max))
            f3=len(set(fdist[-10:]) & set(dist_min))
            x=[f1,f2,f3]
            user_cmd_feature.append(x)
        return user_cmd_feature
    
    def get_user_cmd_feature_new(user_cmd_list,dist):
        # One-hot encode every command in every block against the vocabulary dist,
        # so each block becomes a 100 x len(dist) sequence for the LSTM
        user_cmd_feature=[]
        for cmd_list in user_cmd_list:
            x=[]
            for cmd in  cmd_list:
                v = [0] * len(dist)
                for i in range(0, len(dist)):
                    if cmd == dist[i]:
                        v[i] = 1
                x.append(v)
            user_cmd_feature.append(x)
        return user_cmd_feature
    
    def get_label(filename,index=0):
        x=[]
        with open(filename) as f:
            for line in f:
                line=line.strip('\n')
                x.append( int(line.split()[index]))
        return x
    
    
    def do_knn(x_train,y_train,x_test,y_test):
        # 3-nearest-neighbour baseline on the [f1,f2,f3] features (not called from __main__)
        neigh = KNeighborsClassifier(n_neighbors=3)
        neigh.fit(x_train, y_train)
        y_predict=neigh.predict(x_test)
        score = np.mean(y_test == y_predict) * 100

        print score
    
    
    def do_rnn(x_train,x_test,y_train,y_test):
        global n_words
        # Data preprocessing
        # Sequence padding
        print "GET n_words embedding %d" % n_words
    
    
        #x_train = pad_sequences(x_train, maxlen=100, value=0.)
        #x_test = pad_sequences(x_test, maxlen=100, value=0.)
        # Converting labels to binary vectors
        y_train = to_categorical(y_train, nb_classes=2)
        y_test = to_categorical(y_test, nb_classes=2)
    
        # Network building: 100 time steps of n_words-dimensional one-hot vectors,
        # two stacked 10-unit LSTM layers, softmax over {normal, masquerade}
        net = tflearn.input_data(shape=[None, 100, n_words])
        net = tflearn.lstm(net, 10, return_seq=True)
        net = tflearn.lstm(net, 10)
        net = tflearn.fully_connected(net, 2, activation='softmax')
        net = tflearn.regression(net, optimizer='adam', learning_rate=0.1, name="output",
                                 loss='categorical_crossentropy')
    
        # Training
    
        model = tflearn.DNN(net, tensorboard_verbose=3)
        model.fit(x_train, y_train, validation_set=(x_test, y_test), show_metric=True,
                 batch_size=32,run_id="maidou")
    
    
    if __name__ == '__main__':
        user_cmd_list,dist=load_user_cmd_new("../data/MasqueradeDat/User7")
        #print  "Dist:(%s)" % dist
        n_words=len(dist)
        user_cmd_feature=get_user_cmd_feature_new(user_cmd_list,dist)
    
        # label.txt has one row per labelled block; column index 6 is User7's 0/1 masquerade label
        labels=get_label("../data/MasqueradeDat/label.txt",6)
        # the first 50 blocks (5,000 commands) are known clean; the remaining 100 carry the labels above
        y=[0]*50+labels
    
        x_train=user_cmd_feature[0:N]
        y_train=y[0:N]
    
        x_test=user_cmd_feature[N:150]
        y_test=y[N:150]
    
        #print x_train
    
        do_rnn(x_train,x_test,y_train,y_test)
    

    Results:

    Training Step: 30  | total loss: 0.10088 | time: 1.185s
    | Adam | epoch: 010 | loss: 0.10088 - acc: 0.9591 | val_loss: 0.18730 - val_acc: 0.9571 -- iter: 80/80
    --
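
    To actually apply the trained network, do_rnn would have to return its model object (the original script only trains it inside the function). A minimal usage sketch, assuming do_rnn is modified to end with "return model" and that x_train/x_test/y_train/y_test are built as in __main__ above; tflearn's DNN.predict() returns per-class probabilities:

    # Hedged usage sketch: replace the bare do_rnn(...) call in __main__ with this,
    # after adding "return model" at the end of do_rnn.
    model = do_rnn(x_train, x_test, y_train, y_test)

    probs = model.predict(x_test)            # one [p(normal), p(masquerade)] pair per test block
    y_pred = np.argmax(probs, axis=1)

    print "test accuracy: %.4f" % np.mean(y_pred == np.array(y_test))
    # indices (within the test split) of blocks flagged as masquerader activity
    print [i for i, p in enumerate(y_pred) if p == 1]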

  • Original article: https://www.cnblogs.com/bonelee/p/10000250.html