zoukankan      html  css  js  c++  java
  • 【文本摘要项目】7性能提升之基于Transformer的PGN模型








    class PGN_TRANSFORMER(tf.keras.Model):
        """Pointer-generator network built on a Transformer encoder/decoder.

        The decoder's vocabulary distribution is mixed with the last decoder
        layer's encoder-decoder attention (the "copy" distribution) through
        ``calc_final_dist``, so that in-article OOV words can be produced.

        Args:
            params: dict of hyper-parameters. Keys read here: ``num_blocks``,
                ``batch_size``, ``vocab_size``, ``num_heads``, ``d_model``,
                ``dff`` and ``dropout_rate`` (see the review note in
                ``__init__`` about the last two).
        """

        def __init__(self, params):
            super(PGN_TRANSFORMER, self).__init__()
            self.num_blocks = params["num_blocks"]
            self.batch_size = params["batch_size"]
            self.vocab_size = params["vocab_size"]
            self.num_heads = params["num_heads"]

            # Encoder and decoder inputs share one embedding table.
            self.embedding = Embedding(params["vocab_size"], params["d_model"])

            # NOTE(review): the constructor arguments after params["num_blocks"]
            # were truncated in the original listing. They are reconstructed to
            # mirror Decoder.__init__(num_layers, d_model, num_heads, dff,
            # target_vocab_size, rate); the "dff" / "dropout_rate" key names and
            # the Encoder signature must be confirmed against the project.
            self.encoder = Encoder(params["num_blocks"],
                                   params["d_model"],
                                   params["num_heads"],
                                   params["dff"],
                                   params["vocab_size"],
                                   params["dropout_rate"])
            self.decoder = Decoder(params["num_blocks"],
                                   params["d_model"],
                                   params["num_heads"],
                                   params["dff"],
                                   params["vocab_size"],
                                   params["dropout_rate"])
            self.final_layer = tf.keras.layers.Dense(params["vocab_size"])

        def call(self, inp, extended_inp, max_oov_len, tar, training, enc_padding_mask, look_ahead_mask, dec_padding_mask):
            """Run one forward pass and return the final word distributions.

            Args:
                inp: encoder token ids (in-vocabulary ids).
                extended_inp: encoder token ids in the extended vocabulary
                    (in-article OOV words get ids >= vocab_size).
                max_oov_len: number of extra OOV slots for this batch.
                tar: decoder input token ids.
                training: forwarded to the encoder and decoder.
                enc_padding_mask: padding mask for the encoder.
                look_ahead_mask: causal mask for decoder self-attention.
                dec_padding_mask: padding mask for encoder-decoder attention.

            Returns:
                dict with
                  ``logits``: (batch_size, tar_seq_len, extended_vsize). Despite
                      the key name these are probabilities, since softmax is
                      applied before mixing with the copy distribution.
                  ``attentions``: (batch_size, tar_seq_len, inp_seq_len),
                      head-averaged attention of the last decoder layer.
            """
            embed_x = self.embedding(inp)
            embed_dec = self.embedding(tar)

            # (batch_size, inp_seq_len, d_model)
            enc_output = self.encoder(embed_x, training, enc_padding_mask)

            # dec_output.shape == (batch_size, tar_seq_len, d_model)
            dec_output, attention_weights, p_gens = self.decoder(embed_dec,
                                                                 enc_output,
                                                                 training,
                                                                 look_ahead_mask,
                                                                 dec_padding_mask)

            # (batch_size, tar_seq_len, target_vocab_size)
            final_output = self.final_layer(dec_output)
            final_output = tf.nn.softmax(final_output)

            # Encoder-decoder attention of the last decoder layer:
            # (batch_size, num_heads, targ_seq_len, inp_seq_len)
            attn_dists = attention_weights['decoder_layer{}_block2'.format(self.num_blocks)]
            # Average over heads -> (batch_size, targ_seq_len, inp_seq_len)
            attn_dists = tf.reduce_sum(attn_dists, axis=1) / self.num_heads

            # calc_final_dist works on per-timestep lists, hence the unstack
            # along the target-sequence axis.
            final_dists = calc_final_dist(extended_inp,
                                          tf.unstack(final_output, axis=1),
                                          tf.unstack(attn_dists, axis=1),
                                          tf.unstack(p_gens, axis=1),
                                          max_oov_len,
                                          self.vocab_size,
                                          self.batch_size)

            outputs = dict(logits=tf.stack(final_dists, 1), attentions=attn_dists)
            return outputs


    def calc_final_dist(_enc_batch_extend_vocab, vocab_dists, attn_dists, p_gens, batch_oov_len, vocab_size, batch_size):
        """Calculate the final distribution for the pointer-generator model.

        Args:
            _enc_batch_extend_vocab: encoder token ids in the extended
                vocabulary, shape (batch_size, attn_len).
            vocab_dists: The vocabulary distributions. List length max_dec_steps
                of (batch_size, vsize) arrays. The words are in the order they
                appear in the vocabulary file.
            attn_dists: The attention distributions. List length max_dec_steps
                of (batch_size, attn_len) arrays.
            p_gens: generation probabilities, one entry per decoder step; each
                is multiplied against the matching vocab/attention dist.
            batch_oov_len: number of extra in-article OOV slots for the batch.
            vocab_size: size of the fixed vocabulary.
            batch_size: batch size used to build the scatter indices.
                NOTE(review): this is a fixed value, so a final batch with
                fewer examples would not match - confirm batches are
                dropped or padded upstream.

        Returns:
            final_dists: The final distributions. List length max_dec_steps of
            (batch_size, extended_vsize) arrays.
        """
        # Multiply vocab dists by p_gen and attention dists by (1-p_gen)
        vocab_dists = [p_gen * dist for (p_gen, dist) in zip(p_gens, vocab_dists)]
        attn_dists = [(1 - p_gen) * dist for (p_gen, dist) in zip(p_gens, attn_dists)]

        # Concatenate some zeros to each vocabulary dist, to hold the
        # probabilities for in-article OOV words.
        # extended_size is the maximum (over the batch) size of the extended vocabulary.
        extended_size = vocab_size + batch_oov_len
        extra_zeros = tf.zeros((batch_size, batch_oov_len))
        # list length max_dec_steps of shape (batch_size, extended_size)
        vocab_dists_extended = [tf.concat(axis=1, values=[dist, extra_zeros]) for dist in vocab_dists]

        # Project the values in the attention distributions onto the appropriate
        # entries in the final distributions.
        # This means that if a_i = 0.1 and the ith encoder word is w, and w has
        # index 500 in the vocabulary, then we add 0.1 onto the 500th entry of
        # the final distribution. This is done for each decoder timestep.
        # This is fiddly; we use tf.scatter_nd to do the projection.
        batch_nums = tf.range(0, limit=batch_size)  # shape (batch_size)
        batch_nums = tf.expand_dims(batch_nums, 1)  # shape (batch_size, 1)
        attn_len = tf.shape(_enc_batch_extend_vocab)[1]  # number of states we attend over
        batch_nums = tf.tile(batch_nums, [1, attn_len])  # shape (batch_size, attn_len)
        indices = tf.stack((batch_nums, _enc_batch_extend_vocab), axis=2)  # shape (batch_size, enc_t, 2)
        shape = [batch_size, extended_size]
        # list length max_dec_steps (batch_size, extended_size)
        attn_dists_projected = [tf.scatter_nd(indices, copy_dist, shape) for copy_dist in attn_dists]

        # Add the vocab distributions and the copy distributions together to get
        # the final distributions. final_dists is a list length max_dec_steps;
        # each entry is a tensor shape (batch_size, extended_size) giving the
        # final distribution for that decoder timestep.
        # Note that for decoder timesteps and examples corresponding to a [PAD]
        # token, this is junk - ignore.
        final_dists = [vocab_dist + copy_dist for (vocab_dist, copy_dist) in zip(vocab_dists_extended, attn_dists_projected)]
        return final_dists


        这里的 Decoder 与标准 Transformer 模型的 Decoder 的区别在于:它额外计算了 context vector 以及 p_gen(生成概率)。

    class Decoder(tf.keras.layers.Layer):
        """Transformer decoder stack that also produces pointer-generator p_gen.

        On top of the stacked ``DecoderLayer`` blocks, this layer builds a
        context vector from the last layer's encoder-decoder attention and
        combines it with the decoder input and output to compute the
        generation probability ``p_gens``.

        Args:
            num_layers: number of ``DecoderLayer`` blocks; must be >= 1 because
                ``call`` uses the attention returned by the last block.
            d_model: model width; split into ``num_heads`` chunks of size
                ``d_model // num_heads`` when building the context vector.
            num_heads: number of attention heads.
            dff: forwarded to each ``DecoderLayer``.
            target_vocab_size: accepted for interface compatibility; not used
                inside this layer.
            rate: dropout rate for the input dropout and each ``DecoderLayer``.
        """

        def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size, rate=0.1):
            super(Decoder, self).__init__()
            self.d_model = d_model
            self.num_layers = num_layers
            self.num_heads = num_heads
            self.depth = self.d_model // self.num_heads
            self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate) for _ in range(num_layers)]
            self.dropout = tf.keras.layers.Dropout(rate)
            # Scalar projections for the p_gen computation.
            self.Wh = tf.keras.layers.Dense(1)  # applied to the context vector
            self.Ws = tf.keras.layers.Dense(1)  # applied to the decoder output
            self.Wx = tf.keras.layers.Dense(1)  # applied to the decoder input
            self.V = tf.keras.layers.Dense(1)   # applied to the summed projections

        def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
            """Decode ``x`` against ``enc_output``.

            Args:
                x: embedded decoder input, (batch_size, target_seq_len, d_model).
                enc_output: encoder output, (batch_size, input_seq_len, d_model).
                training: forwarded to dropout and to each decoder layer.
                look_ahead_mask: forwarded to each decoder layer.
                padding_mask: forwarded to each decoder layer.

            Returns:
                Tuple ``(out, attention_weights, p_gens)``:
                  out: (batch_size, target_seq_len, d_model).
                  attention_weights: dict with keys
                      ``decoder_layer{i}_block1`` / ``decoder_layer{i}_block2``
                      for i in 1..num_layers.
                  p_gens: (batch_size, target_seq_len, 1), sigmoid output.
            """
            attention_weights = {}
            out = self.dropout(x, training=training)

            for i in range(self.num_layers):
                out, block1, block2 = self.dec_layers[i](out, enc_output, training, look_ahead_mask, padding_mask)
                attention_weights['decoder_layer{}_block1'.format(i + 1)] = block1
                attention_weights['decoder_layer{}_block2'.format(i + 1)] = block2

            # Context vectors: attention-weighted sum of the encoder states,
            # computed per head using the last layer's encoder-decoder
            # attention (block2).
            enc_out_shape = tf.shape(enc_output)
            # (batch_size, input_seq_len, num_heads, depth)
            enc_heads = tf.reshape(enc_output, (enc_out_shape[0], enc_out_shape[1], self.num_heads, self.depth))
            # (batch_size, num_heads, input_seq_len, depth)
            enc_heads = tf.transpose(enc_heads, [0, 2, 1, 3])
            # block2 is (batch_size, num_heads, target_seq_len, input_seq_len).
            # A batched matmul yields the same weighted sum as broadcasting to
            # (batch, heads, tgt_len, inp_len, depth) and reducing over inp_len,
            # without materializing that 5-D intermediate tensor.
            context = tf.matmul(block2, enc_heads)  # (batch_size, num_heads, target_seq_len, depth)
            context = tf.transpose(context, [0, 2, 1, 3])  # (batch_size, target_seq_len, num_heads, depth)
            ctx_shape = tf.shape(context)
            context = tf.reshape(context, (ctx_shape[0], ctx_shape[1], self.d_model))  # (batch_size, target_seq_len, d_model)

            # p_gen = sigmoid(V(Wx * x + Ws * out + Wh * context))
            a = self.Wx(x)
            b = self.Ws(out)
            c = self.Wh(context)
            p_gens = tf.sigmoid(self.V(a + b + c))

            return out, attention_weights, p_gens



  • 相关阅读:
    第三届 山东省ACM省赛
    最短路模板(Dijkstra & Dijkstra算法+堆优化 & bellman_ford & 单源最短路SPFA)
    最短路模板(Dijkstra & Dijkstra算法+堆优化 & bellman_ford & 单源最短路SPFA)
    最短路模板(Dijkstra & Dijkstra算法+堆优化 & bellman_ford & 单源最短路SPFA)
    hdoj 4883 TIANKENG’s restaurant【贪心区间覆盖】
    hdoj 1072 Nightmare
    hdoj 2141 Can you find it?【二分查找+暴力】
    poj 1064 Cable master【浮点型二分查找】
  • 原文地址:https://www.cnblogs.com/miners/p/15195374.html
Copyright © 2011-2022 走看看