  • Multi-label text prediction with a Transformer

    1. A deep model enlarges the parameter space and improves fitting capacity;

    2. The attention mechanism captures correlations among the base features, making feature combinations much more expressive;

    3. Multi-label text prediction is relatively difficult; labels are represented as 0/1 (multi-hot) vectors, as sketched below.
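
    As a rough illustration, here is a minimal sketch of turning a list of label ids
    into such a 0/1 (multi-hot) vector. The helper transform_multilabel_as_multihot
    used in the training script below is not shown, so this is only an assumed
    illustration, not the original implementation:

    import numpy as np

    def labels_to_multihot(label_ids, num_classes):
        # label_ids: list of integer class ids for one sample
        vec = np.zeros(num_classes, dtype=np.float32)
        vec[label_ids] = 1.0
        return vec

    # e.g. labels_to_multihot([0, 3], 5) -> array([1., 0., 0., 1., 0.], dtype=float32)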

    The model class below is given for reference; its classification performance is quite good:

    import math
    import os
    import time

    import numpy as np
    import tensorflow as tf  # requires TensorFlow 1.x (tf.contrib is used below)


    class BaseClassier(object):
        def __init__(self, config, sess):
            # configuration
            self.max_len = config["max_len"]
            self.position_len = config["position_len"]
            self.sess = sess
            self.num_classes = config["n_class"]
            self.lstm_layers = config["lstm_layers"]
            self.vocab_size = config["vocab_size"]
            self.embedding_size = config["embedding_size"]
            self.hidden_size = config["hidden_size"]
            self.l2_reg_lambda = config["l2_reg_lambda"]
            self.learning_rate = config["learning_rate"]
            self.filter_heights = config["filter_heights"]
            self.filter_num_per_height = config["filter_num_per_height"]
            self.numBlocks = config['numBlocks']
            self.filters = config['filters']
            self.numHeads = config['numHeads']
            self.keepProp = config['keepProp']  # dropout keep probability inside multi-head attention
            self.norm_epsilon = config['norm_epsilon']
    
            # placeholder
            self.x = tf.compat.v1.placeholder(tf.float32, [None, self.max_len], name="input_x")
            self.label = tf.compat.v1.placeholder(tf.float32, [None, self.num_classes], name="input_y")
            self.trans_keep_prob = tf.compat.v1.placeholder(tf.float32, name="trans_keep_prob")
            self.multi_keep_prob = tf.compat.v1.placeholder(tf.float32, name="multi_keep_prob")
            self.embeddedPosition = tf.compat.v1.placeholder(tf.float32, [None, self.position_len, self.position_len],
                                                             name="embed_position")
    
        def transformer_layer(self):
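            # Graph flow: fused feature embeddings are concatenated with the position
            # features fed through the embeddedPosition placeholder, passed through
            # numBlocks transformer blocks (multi-head self-attention + point-wise
            # feed-forward), flattened, and projected to one sigmoid output per label.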
            l2Loss = tf.constant(0.0)
            with tf.name_scope("embedding"):
                self.embed_fusion = self.embedding_layer_fusion_v1(self.x)
                self.embeddedWords = tf.concat([self.embed_fusion, self.embeddedPosition], -1)
    
            with tf.name_scope("transformer"):
                for i in range(self.numBlocks):
                    with tf.name_scope("transformer-{}".format(i + 1)):
                        multiHeadAtt = self._multiheadAttention(rawKeys=self.original_feature, queries=self.embeddedWords,
                                                                keys=self.embeddedWords)
                        self.embeddedWords = self._feedForward(multiHeadAtt,
                                                               [self.filters,
                                                                self.embedding_size + self.position_len])
    
                outputs = tf.reshape(self.embeddedWords,
                                     [-1, self.position_len * (self.embedding_size + self.position_len)])
            outputSize = outputs.get_shape()[-1].value
            with tf.name_scope("dropout"):
                outputs = tf.nn.dropout(outputs, keep_prob=self.trans_keep_prob)
            with tf.name_scope("output"):
                outputW = tf.compat.v1.get_variable(
                    "outputW",
                    shape=[outputSize, self.num_classes],
                    initializer=tf.contrib.layers.xavier_initializer())
    
                outputB = tf.Variable(tf.constant(0.1, shape=[self.num_classes]), name="outputB")
                l2Loss += tf.nn.l2_loss(outputW)
                l2Loss += tf.nn.l2_loss(outputB)
                self.logits = tf.compat.v1.nn.xw_plus_b(outputs, outputW, outputB, name="logits")
                self.possibility = tf.nn.sigmoid(self.logits, name="possibility")
                self.prediction = tf.round(self.possibility, name="prediction")
            with tf.name_scope("loss"):
                if self.num_classes == 1:
                    pass
                elif self.num_classes > 1:
                    losses = tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.label)
                sum_losses = tf.reduce_sum(losses, axis=1)
                self.loss = tf.reduce_mean(sum_losses) + self.l2_reg_lambda * l2Loss
    
        def embedding_layer_fusion_v1(self, input_x, name=None):
            """ 
            :param input_x:
            :param name:
            :return:
            """
            with tf.name_scope('word_embedding' if not name else name), tf.device('/cpu:0'):
                embeddings = tf.Variable(tf.random_uniform([self.vocab_size, self.embedding_size], -1.0, 1.0), name='embeddings')
                geek_hist = input_x[:, :112]
                geek_profile = input_x[:, 112:115]
                geek_query = input_x[:, 115:]
                pos_list = []
                score_list = []
                for i in range(112):
                    if i % 2 == 0:
                        pos_list.append(tf.expand_dims(geek_hist[:, i], -1))
                    else:
                        score_list.append(tf.expand_dims(geek_hist[:, i], -1))
                geek_position = tf.cast(tf.concat(pos_list, axis=1), tf.int32)
                geek_profile = tf.cast(geek_profile, tf.int32)
                geek_query = tf.cast(geek_query, tf.int32)
                geek_score = tf.concat(score_list, axis=1)
                expand_score = tf.expand_dims(geek_score, -1)
                geek_score = tf.tile(expand_score, [1, 1, self.embedding_size])
                position_embed = tf.nn.embedding_lookup(embeddings, geek_position, name='position_embed')
                position_embed_with_score = tf.multiply(position_embed, geek_score)
                profile_embed = tf.nn.embedding_lookup(embeddings, geek_profile, name='profile_embed')
                query_embed = tf.nn.embedding_lookup(embeddings, geek_query, name='query_embed')
                self.original_feature = tf.concat([geek_position, geek_profile, geek_query], axis=1)
                embed_fusion = tf.concat([position_embed_with_score, profile_embed, query_embed], axis=1)
                return embed_fusion
    
        def _layerNormalization(self, inputs, scope="layerNorm"):
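            # Layer normalization over the last (feature) dimension, with learned
            # scale (gamma) and shift (beta) parameters.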
            epsilon = self.norm_epsilon
            inputsShape = inputs.get_shape()  # [batch_size, sequence_length, embedding_size]
            paramsShape = inputsShape[-1:]
            mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)
            beta = tf.Variable(tf.zeros(paramsShape))
            gamma = tf.Variable(tf.ones(paramsShape))
            normalized = (inputs - mean) / ((variance + epsilon) ** .5)
            outputs = gamma * normalized + beta
            return outputs
    
        def _multiheadAttention(self, rawKeys, queries, keys, numUnits=None, causality=False, scope="multiheadAttention"):
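            # Scaled dot-product self-attention: project queries/keys/values, split
            # them into numHeads heads along the feature axis, compute attention
            # weights, mask padded key positions (rawKeys == 0), then re-merge the
            # heads, apply dropout, add the residual connection and layer-normalize.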
            numHeads = self.numHeads
            keepProp = self.keepProp
    
            if numUnits is None:
                numUnits = queries.get_shape().as_list()[-1]
            Q = tf.layers.dense(queries, numUnits, activation=tf.nn.relu)
            K = tf.layers.dense(keys, numUnits, activation=tf.nn.relu)
            V = tf.layers.dense(keys, numUnits, activation=tf.nn.relu)
            Q_ = tf.concat(tf.split(Q, numHeads, axis=-1), axis=0)
            K_ = tf.concat(tf.split(K, numHeads, axis=-1), axis=0)
            V_ = tf.concat(tf.split(V, numHeads, axis=-1), axis=0)
            similary = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1]))
            scaledSimilary = similary / (K_.get_shape().as_list()[-1] ** 0.5)
            keyMasks = tf.tile(rawKeys, [numHeads, 1])
            keyMasks = tf.tile(tf.expand_dims(keyMasks, 1), [1, tf.shape(queries)[1], 1])
            paddings = tf.ones_like(scaledSimilary) * (-2 ** (32 + 1))
            maskedSimilary = tf.where(tf.equal(keyMasks, 0), paddings, scaledSimilary)
            if causality:
                diagVals = tf.ones_like(maskedSimilary[0, :, :])  # [queries_len, keys_len]
                tril = tf.contrib.linalg.LinearOperatorTriL(diagVals).to_dense()  # [queries_len, keys_len]
                masks = tf.tile(tf.expand_dims(tril, 0),
                                [tf.shape(maskedSimilary)[0], 1, 1])  # [batch_size * numHeads, queries_len, keys_len]
    
                paddings = tf.ones_like(masks) * (-2 ** (32 + 1))
                maskedSimilary = tf.where(tf.equal(masks, 0), paddings,
                                          maskedSimilary)  # [batch_size * numHeads, queries_len, keys_len]
            weights = tf.nn.softmax(maskedSimilary)
            outputs = tf.matmul(weights, V_)
            outputs = tf.concat(tf.split(outputs, numHeads, axis=0), axis=2)
            outputs = tf.nn.dropout(outputs, keep_prob=self.multi_keep_prob)
            outputs += queries
            outputs = self._layerNormalization(outputs)
            return outputs
    
        def _feedForward(self, inputs, filters, scope="feedForward"):
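            # Position-wise feed-forward network implemented as two 1x1 convolutions
            # (ReLU then linear), followed by a residual connection and layer norm.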
            params = {"inputs": inputs, "filters": filters[0], "kernel_size": 1,
                      "activation": tf.nn.relu, "use_bias": True}
            outputs = tf.layers.conv1d(**params)
            params = {"inputs": outputs, "filters": filters[1], "kernel_size": 1,
                      "activation": None, "use_bias": True}
            outputs = tf.layers.conv1d(**params)
            outputs += inputs
            outputs = self._layerNormalization(outputs)
            return outputs
    
        def _positionEmbedding(self, scope="positionEmbedding"):
            # Sinusoidal positional encoding (an alternative to the one-hot position
            # features fed through the embeddedPosition placeholder); not called by
            # transformer_layer above.
            batchSize = tf.shape(self.x)[0]
            sequenceLen = self.position_len
            embeddingSize = self.embedding_size
            positionIndex = tf.tile(tf.expand_dims(tf.range(sequenceLen), 0), [batchSize, 1])
            positionEmbedding = np.array([[pos / np.power(10000, (i - i % 2) / embeddingSize) for i in range(embeddingSize)]
                                          for pos in range(sequenceLen)])
            positionEmbedding[:, 0::2] = np.sin(positionEmbedding[:, 0::2])
            positionEmbedding[:, 1::2] = np.cos(positionEmbedding[:, 1::2])
            positionEmbedding_ = tf.cast(positionEmbedding, dtype=tf.float32)
            positionEmbedded = tf.nn.embedding_lookup(positionEmbedding_, positionIndex)
            return positionEmbedded
    
        def build_graph(self):
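            # Assemble the model, set up the Adam optimizer, gradient and loss
            # summaries, TensorBoard writers, and a checkpoint Saver.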
            print("building graph...")
            with tf.compat.v1.variable_scope("discriminator"):
                self.transformer_layer()
                self.global_step = tf.Variable(0, name="globalStep", trainable=False)
                optimizer = tf.compat.v1.train.AdamOptimizer(self.learning_rate)
                gradsAndVars = optimizer.compute_gradients(self.loss)
                self.train_op = optimizer.apply_gradients(gradsAndVars, global_step=self.global_step)
                gradSummaries = []
                for g, v in gradsAndVars:
                    if g is not None:
                        tf.compat.v1.summary.histogram("{}/grad/hist".format(v.name), g)
                        tf.compat.v1.summary.scalar("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
                outDir = r"./summarys"
                print("Writing to {}
    ".format(outDir))
                lossSummary = tf.compat.v1.summary.scalar("loss", self.loss)
                summaryOp = tf.compat.v1.summary.merge_all()
                trainSummaryDir = os.path.join(outDir, "train")
                trainSummaryWriter = tf.compat.v1.summary.FileWriter(trainSummaryDir, self.sess.graph)
                evalSummaryDir = os.path.join(outDir, "eval")
                evalSummaryWriter = tf.compat.v1.summary.FileWriter(evalSummaryDir, self.sess.graph)
                self.saver = tf.compat.v1.train.Saver(tf.compat.v1.global_variables(), max_to_keep=3)
            print("graph built successfully!")
    
    
    if __name__ == '__main__':
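        # NOTE: transform_multilabel_as_multihot, data_processing,
        # fixedPositionEmbedding, fill_feed_dict, run_train_step and run_eval_step
        # are project-specific helpers that are not shown in this listing.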
        y_train = transform_multilabel_as_multihot(y_train, position2id, nums_class)
        y_test = transform_multilabel_as_multihot(y_test, position2id, nums_class)
    
        x_train, x_test, vocab_size = data_processing(features, x_train, x_test, max_len=135)
        print("train size: ", len(x_train))
        print("test size", len(x_test))
        print("vocab size: ", vocab_size)
    
        config = {
            "position_len": 79,
            "max_len": 135,
            "vocab_size": vocab_size,
            "embedding_size": 161,
            "learning_rate": 1e-3,
            "l2_reg_lambda": 1e-3,
            "batch_size": 32,
            "n_class": nums_class,
            "hidden_size": 256,
            "lstm_layers": 2,
            "filter_heights": [2, 3, 4, 5],
            "filter_num_per_height": [100, 100, 300, 300],
            "numBlocks": 1,
            "filters": 128,
            "numHeads": 8,
            "keepProp": 0.9,  # multi head attention 中的dropout
            "norm_epsilon": 1e-8,
            "train_epoch": 20,
            "savedModelPath": r'./PBModel',
    
        }
        embeddedPosition = fixedPositionEmbedding(config["batch_size"], config["position_len"]) 
    
        # auto GPU growth, avoid occupy all GPU memory
        tf_config = tf.compat.v1.ConfigProto()
        tf_config.gpu_options.allow_growth = True
        sess = tf.compat.v1.Session(config=tf_config)
    
        classifier = BaseClassier(config, sess)
        classifier.build_graph()
    
        sess.run(tf.compat.v1.global_variables_initializer())
        dev_batch = (x_test, y_test)
        start = time.time()
        best_auc = .0
    
        for e in range(config["train_epoch"]):
            t0 = time.time()
            print("
    Epoch {} start !".format(e + 1))
            trained_samples = 0
            for batch_idx, (x_batch, y_batch) in enumerate(fill_feed_dict(x_train, y_train, config["batch_size"], is_shuffle=False)):
                return_dict = run_train_step(classifier, sess, (x_batch, y_batch))
                trained_samples += len(x_batch)
                progress = math.ceil(batch_idx / (x_train.shape[0] // config["batch_size"]) * 50)
                print('\rTrain epoch: {} {}/{} [{}]{}% '.format(e + 1, trained_samples, len(x_train),
                                                                '-' * progress + '>', progress * 2), end='')
            t1 = time.time()
            print("Train Epoch time:  {:.4f} s".format(t1 - t0))
            auc, _, _, _ = run_eval_step(classifier, sess, dev_batch)
            print("validation loss:{:.4f}	auc:{:.4f}".format(
                return_dict["loss"], auc))
    
            if auc > best_auc:
                best_auc = auc
                saver = tf.compat.v1.train.Saver()
                saver.save(sess, "Model/model.ckpt")
    
                output_graph_def = tf.compat.v1.graph_util.convert_variables_to_constants(
                    sess=sess, input_graph_def=sess.graph_def, output_node_names=['discriminator/output/prediction'])
                # Freeze the best graph to a .pb file (the file name here is an example).
                os.makedirs(config["savedModelPath"], exist_ok=True)
                output_graph = os.path.join(config["savedModelPath"], "model.pb")
                with tf.io.gfile.GFile(output_graph, 'wb') as fw:
                    fw.write(output_graph_def.SerializeToString())
    
                print('best model saved!')
        print("Training finished, time consumed : {:.2f} s
     Training over!".format(time.time() - start))
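
    The embeddedPosition placeholder above has shape [batch_size, position_len,
    position_len], so fixedPositionEmbedding presumably builds one-hot position
    features. A minimal sketch under that assumption (not the original
    implementation):

    import numpy as np

    def fixedPositionEmbedding(batchSize, sequenceLen):
        # One one-hot row per position, repeated for every example in the batch;
        # the result has shape [batchSize, sequenceLen, sequenceLen].
        eye = np.eye(sequenceLen, dtype=np.float32)
        return np.tile(eye[np.newaxis, :, :], (batchSize, 1, 1))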
  • Original article: https://www.cnblogs.com/demo-deng/p/14593285.html