zoukankan      html  css  js  c++  java
  • Keras 构建DNN 对用户名检测判断是否为非法用户名(从数据预处理到模型在线预测)

    一.  数据集的准备与预处理

    1 . 收集dataset

    (大量用户名--包含正常用户名与非法用户名)

    包含两个 txt 文件:legal_name.txt 和 ilegal_name.txt,如下图所示。


    2. 对文件进行预处理

    # Data sets
    import os
    import pandas as pd
    
    # Directory that holds both the raw username lists and the generated CSVs.
    DATAPATH = "../dataset"
    
    # Legal (positive) usernames: raw input file and the processed CSV output.
    POS = os.path.join(DATAPATH, "legal_name.txt")
    POS_OUTPUT = os.path.join(DATAPATH, "legal_name.csv")
    
    # Illegal (negative) usernames: raw input file and the processed CSV output.
    NEG = os.path.join(DATAPATH, "ilegal_name.txt")
    NEG_OUTPUT = os.path.join(DATAPATH, "ilegal_name.csv")
    
    
    def process_org_data(input_data, output_data, lable, chunksize=10000):
        """Turn a raw username file into a feature CSV, one chunk at a time.

        The input is read with ``pandas.read_csv`` and must have a ``username``
        column. For every chunk the usernames are stripped of surrounding
        whitespace, feature columns are derived, a constant ``label`` column is
        added, and the rows are appended to ``output_data`` without a header.

        Args:
            input_data: Path of the raw file to read.
            output_data: Path of the CSV to append to. The file is opened in
                append mode, so running this twice duplicates the rows.
            lable: Label value written for every row of this file.
            chunksize: Number of rows processed per chunk.
        """
        # Read usernames strictly as text: with default type inference a name
        # such as "007" becomes 7 and names such as "null" or "NA" become NaN.
        reader = pd.read_csv(input_data, chunksize=chunksize,
                             dtype={'username': str}, keep_default_na=False)
        for train in reader:
            train['username'] = train['username'].astype(str).str.strip()
            train['length'] = train['username'].str.len()

            # ... (further feature columns are elided in the original article) ...

            train['label'] = lable
            train.to_csv(output_data, encoding='utf-8', mode='a', index=False, header=False)

        print("Iteration is stopped.")
    
    
    if __name__ == '__main__':
        # Legal names are labelled 1, illegal names 0.
        for src, dst, label in ((POS, POS_OUTPUT, 1), (NEG, NEG_OUTPUT, 0)):
            process_org_data(src, dst, label)
    根据需求提取相应的特征, 输出成 csv 格式,包含特征列与label列

    把合法用户dataset与非法用户dataset,合并打乱,切割成 train.csv 和 test.csv


    # Number of shuffled rows that go into the training split; the rest is test.
    TRAIN_SIZE = 200000

    # NOTE(review): read_dataset is not shown here, and POS/NEG point at the raw
    # .txt files rather than the generated CSVs (POS_OUTPUT/NEG_OUTPUT) -- confirm
    # which files this step is meant to read.
    pos_dataset = read_dataset(POS)
    neg_dataset = read_dataset(NEG)
    dataset = pd.concat([pos_dataset, neg_dataset])
    # Shuffle all rows and renumber the index 0..n-1.
    dataset = dataset.sample(frac=1).reset_index(drop=True)

    # Slice by position: .loc slices are label-based and include BOTH endpoints,
    # so .loc[:200000] / .loc[200000:] would put row 200000 into both splits.
    train_data = dataset.iloc[:TRAIN_SIZE]
    test_data = dataset.iloc[TRAIN_SIZE:]


    # DATAPATH is the directory constant defined alongside POS and NEG.
    train_data.to_csv(os.path.join(DATAPATH, "train.csv"), index=False)
    test_data.to_csv(os.path.join(DATAPATH, "test.csv"), index=False)
    


    二.  Keras 构建DNN模型进行训练与模型保存


    import pandas as pd
    import os
    import tensorflow as tf
    import matplotlib.pyplot as plt
    from sklearn.metrics import confusion_matrix
    from sklearn.metrics import classification_report
    import numpy as np
    
    # Emit INFO-level TensorFlow log messages.
    tf.logging.set_verbosity(tf.logging.INFO)

    DataPath = "../dataset"
    TRAIN, TEST = (os.path.join(DataPath, fname) for fname in ("train.csv", "test.csv"))

    COLUMNS = ["username", ... , "label"]

    # skiprows=1 drops the first line of each file; column names come from COLUMNS.
    train_dataset = pd.read_csv(TRAIN, names=COLUMNS, skiprows=1, skipinitialspace=True)
    test_dataset = pd.read_csv(TEST, names=COLUMNS, skiprows=1, skipinitialspace=True)

    # Every column after the first is forced to numeric; unparseable cells become NaN.
    for frame in (train_dataset, test_dataset):
        for col in frame.columns[1:]:
            frame[col] = pd.to_numeric(frame[col], errors='coerce')

    # Columns 1..18 are the model inputs, column 19 is the target.
    X_train = train_dataset.iloc[:, 1:19].values
    y_train = train_dataset.iloc[:, 19].values

    X_test = test_dataset.iloc[:, 1:19].values
    y_test = test_dataset.iloc[:, 19].values
    
    
    def build_model(input_dim=18):
        """Build the uncompiled fully connected binary classifier.

        The network is Dense(64) -> ReLU -> Dense(32) -> ReLU -> Dense(16) ->
        ReLU -> Dense(1, sigmoid).

        Args:
            input_dim: Number of input features per sample.

        Returns:
            The ``tf.keras`` Sequential model. The caller is responsible for
            calling ``compile`` and ``fit``.
        """
        model = tf.keras.models.Sequential()
        model.add(tf.keras.layers.Dense(64, input_dim=input_dim))
        # model.add(tf.keras.layers.BatchNormalization())
        model.add(tf.keras.layers.Activation('relu'))

        model.add(tf.keras.layers.Dense(32))
        # model.add(tf.keras.layers.BatchNormalization())
        model.add(tf.keras.layers.Activation('relu'))

        model.add(tf.keras.layers.Dense(16))
        # model.add(tf.keras.layers.BatchNormalization())
        model.add(tf.keras.layers.Activation('relu'))

        model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
        # Return the model itself: a bare `return` hands None to the caller,
        # which then fails on model.compile().
        return model
    
    
    
    if __name__ == '__main__':
        model_file = './my_model.h5'
        if os.path.isfile(model_file):
            # Reuse a previously trained model instead of training again.
            print('model file detected. Loading.')
            model = tf.keras.models.load_model(model_file)
        else:
            print('No model file detected.  Starting from scratch.')
            model = build_model()
            model.compile(loss='binary_crossentropy', optimizer="adam", metrics=['accuracy'])
            model.fit(X_train, y_train, batch_size=100, epochs=1, validation_data=(X_test, y_test))
            # Persist the trained model; without this the file checked above is
            # never created and every run retrains from scratch.
            model.save(model_file)
    
    
    模型输出为 my_model.h5,准确率为 90%


    三. 导出tensorflow-serving 模型, 运行在线预测服务

    def save_model_for_production(model, version, path='prod_models'):
        """Export a Keras model as a SavedModel for TensorFlow Serving.

        The model is written to ``<path>/<version>`` with one signature named
        ``'predict'`` whose input key is ``'inputs'`` and whose output key is
        ``'output'``.

        Args:
            model: Keras model; its ``input`` and ``output`` tensors define the
                signature.
            version: Name of the version sub-directory (e.g. ``"7"``). Integers
                are accepted and converted with ``str``.
            path: Base export directory; created if it does not exist.
        """
        # Export in inference mode (0). Phase 1 selects training behaviour for
        # layers such as Dropout and BatchNormalization.
        tf.keras.backend.set_learning_phase(0)
        if not os.path.exists(path):
            # makedirs also handles nested base paths such as "a/b/models".
            os.makedirs(path)
        export_path = os.path.join(
            tf.compat.as_bytes(path),
            tf.compat.as_bytes(str(version)))
        builder = tf.saved_model.builder.SavedModelBuilder(export_path)

        model_input = tf.saved_model.utils.build_tensor_info(model.input)
        model_output = tf.saved_model.utils.build_tensor_info(model.output)

        prediction_signature = (
            tf.saved_model.signature_def_utils.build_signature_def(
                inputs={'inputs': model_input},
                outputs={'output': model_output},
                method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME))

        # Borrow the Keras session without a `with` block: using the session as
        # a context manager closes it on exit, which would leave the model
        # unusable in this process after the export.
        sess = tf.keras.backend.get_session()
        builder.add_meta_graph_and_variables(
            sess=sess, tags=[tf.saved_model.tag_constants.SERVING],
            signature_def_map={
                'predict':
                    prediction_signature,
            })

        builder.save()

    导出为 tensorflow serving 模型

    # Export the trained model as version "7" under the relative directory "tf-model".
    # NOTE(review): the serving command in this article points at
    # /data/model/tf-model -- presumably the export is copied there; confirm.
    export_path = "tf-model"
    save_model_for_production(model, "7", export_path)

    运行在线预测服务(tensorflow 官方方法)

    /serving/bazel-bin/tensorflow_serving/model_servers/tensorflow_model_server --port=9000 --model_name=username --model_base_path=/data/model/tf-model
    
    

    四. client通过grpc 调用预测服务

    #!/usr/bin/env python  
    # encoding: utf-8  
    
    """ 
    @version: v1.0 
    @author: zwqjoy 
    @contact: zwqjoy@163.com 
    @site: https://blog.csdn.net/zwqjoy 
    @file: client
    @time: 2018/6/29 15:02 
    """
    
    from grpc.beta import implementations
    import tensorflow as tf
    
    from tensorflow_serving.apis import predict_pb2
    from tensorflow_serving.apis import prediction_service_pb2
    import numpy as np
    
    # Command-line flag: address of the TensorFlow Serving instance (host:port).
    tf.app.flags.DEFINE_string('server', '172.xxx.xxx.xxx:9000',
                               'PredictionService host:port')
    # Command-line flag: the username to classify (help text: "pass in a username").
    tf.app.flags.DEFINE_string('username', 'demo_user',
                               '传入一个username')
    FLAGS = tf.app.flags.FLAGS
    
    
    def nametovec(username):
        """Convert one username into the feature vector sent to the model.

        Args:
            username: The username to classify. It is converted with ``str`` and
                stripped of surrounding whitespace, the same normalisation the
                training-data preprocessing applies.

        Returns:
            A numpy array whose first element is the username length, followed by
            the remaining features (elided in the original article).
        """
        # A plain Python string has no .astype(); that is a pandas/numpy method.
        username = str(username).strip()

        length = len(username)

        # ... (further features are elided in the original article) ...

        return np.array([length, ...])
    
    
    def main(_):
        """Send FLAGS.username to the prediction service and print the reply."""
        host, port = FLAGS.server.split(':')
        stub = prediction_service_pb2.beta_create_PredictionService_stub(
            implementations.insecure_channel(host, int(port)))

        # Feature vector for the requested username, as float32.
        # See prediction_service.proto for gRPC request/response details.
        features = nametovec(FLAGS.username).astype(np.float32)

        request = predict_pb2.PredictRequest()
        # Must match the --model_name="username" given to tensorflow_model_server.
        request.model_spec.name = 'username'
        # Must match the key used in signature_def_map when the model was exported.
        request.model_spec.signature_name = 'predict'
        tensor_proto = tf.contrib.util.make_tensor_proto(features, shape=(1, 18))
        request.inputs['inputs'].CopyFrom(tensor_proto)

        timeout_secs = 10.0
        print(stub.Predict(request, timeout_secs))
    
    
    if __name__ == '__main__':
        # tf.app.run() parses the command-line flags and then calls main().
        tf.app.run()
    


  • 相关阅读:
    Fizz Buzz 问题
    旋转字符串
    合并排序数组
    尾部的零
    A + B 问题
    CentOS6.x安装RabbitMQ
    MySql游标
    MySql存储过程
    找出n个自然数(1,2,3,……,n)中取r个数的组合
    正则表达式
  • 原文地址:https://www.cnblogs.com/WayneZeng/p/9290689.html
Copyright © 2011-2022 走看看