zoukankan      html  css  js  c++  java
  • QAnet Encoder

    #!/usr/bin/python3
    # -*- coding: utf-8 -*-
    '''
    date: 2019/8/19
    mail: cally.maxiong@gmail.com
    blog: http://www.cnblogs.com/callyblog/
    '''
    import math
    import tensorflow as tf

    __all__ = ['encoder']

    def initializer_relu():
        """Build a fresh variance-scaling initializer for ReLU-activated kernels.

        Uses factor 2.0 with FAN_IN mode and a non-uniform (normal) distribution,
        producing float32 values. A new initializer object is returned per call.
        """
        return tf.contrib.layers.variance_scaling_initializer(
            factor=2.0,
            mode='FAN_IN',
            uniform=False,
            dtype=tf.float32)


    # L2 weight penalty (scale 3e-7) attached to the separable-conv variables below.
    regularizer = tf.contrib.layers.l2_regularizer(scale=3e-7)

    def encoder(inputs, num_blocks, num_conv_layers, kernel_size, inputs_mask, num_filters=128, input_projection=False,
                num_heads=8, is_training=False, reuse=None, dropout=0.0, scope="res_block"):
        """
        Stack of QANet encoder blocks.

        Every block adds a sinusoidal position signal, runs a residual block of
        depthwise separable convolutions, and finishes with multi-head self attention.

        :param inputs: input tensor; the helpers it is passed to document it as
            [batch, length, channels]
        :param num_blocks: how many (position signal -> conv block -> self attention) blocks to stack
        :param num_conv_layers: number of separable conv layers inside each conv block
        :param kernel_size: convolution kernel size along the sequence axis
        :param inputs_mask: mask forwarded to the self attention layers
        :param num_filters: number of conv filters
        :param input_projection: if True, apply a bias-free 1x1 conv1d to ``inputs`` first
        :param num_heads: number of self attention heads
        :param is_training: whether the graph is built for training (controls dropout)
        :param reuse: variable reuse flag passed to every variable scope
        :param dropout: dropout rate
        :param scope: name of the enclosing variable scope
        :return: the output of the last block, or the (optionally projected) inputs
            when ``num_blocks`` is 0
        """
        with tf.variable_scope(scope, reuse=reuse):
            hidden = inputs
            if input_projection:
                hidden = tf.layers.conv1d(inputs, filters=num_filters, kernel_size=1, use_bias=False,
                                          reuse=reuse, name='input_projection')

            for block_idx in range(num_blocks):
                # Position information is re-injected at the start of every block.
                hidden = _add_timing_signal_1d(hidden)

                hidden = _conv_block(hidden, num_conv_layers, kernel_size, num_filters,
                                     reuse=reuse, is_training=is_training, dropout=dropout,
                                     scope="conv_block%d" % block_idx)

                hidden = _multihead_attention(hidden, inputs_mask, dropout_rate=dropout,
                                              num_heads=num_heads, training=is_training, reuse=reuse,
                                              scope="self_attention_layers%d" % block_idx)

            return hidden

    def _add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
        """Add sinusoidal position signals of different frequencies to a Tensor.

        Each channel of ``x`` is incremented by a sinusoid with its own frequency
        and phase, which lets attention make use of absolute and relative
        positions. Relative positions are usable because sin(x+y) and cos(x+y)
        can be expressed in terms of y, sin(x) and cos(x).

        The timescales form a geometric sequence from ``min_timescale`` to
        ``max_timescale``; there are channels / 2 of them. For each timescale the
        signals sin(timestep/timescale) and cos(timestep/timescale) are generated,
        and all of them are concatenated along the channels dimension.

        Args:
            x: a Tensor with shape [batch, length, channels]
            min_timescale: a float
            max_timescale: a float
        Returns:
            a Tensor the same shape as x.
        """
        # Length and channel count are read from the runtime shape of x.
        dynamic_shape = tf.shape(x)
        timing = _get_timing_signal_1d(dynamic_shape[1], dynamic_shape[2], min_timescale, max_timescale)
        # The signal is [1, length, channels] and broadcasts over the batch axis.
        return x + timing

    def _get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4):
        """Build a bunch of sinusoids of different frequencies.

        The timescales form a geometric sequence starting at ``min_timescale`` and
        ending at ``max_timescale``; there are channels // 2 of them. For each
        timescale the two signals sin(timestep/timescale) and
        cos(timestep/timescale) are generated. All sines come first, followed by
        all cosines, along the channels dimension; an odd ``channels`` gets one
        trailing zero channel as padding.

        Relative positions can be recovered by attention because sin(x+y) and
        cos(x+y) can be expressed in terms of y, sin(x) and cos(x).

        Args:
            length: scalar, length of timing signal sequence.
            channels: scalar, size of timing embeddings to create. The number of
                different timescales is equal to channels // 2.
            min_timescale: a float
            max_timescale: a float
        Returns:
            a Tensor of timing signals [1, length, channels]
        """
        position = tf.to_float(tf.range(length))
        num_timescales = channels // 2
        # With a single timescale (channels of 2 or 3) the plain denominator
        # `num_timescales - 1` is zero, which makes the increment infinite and the
        # signal inf/NaN. Clamping to 1 leaves every larger configuration unchanged.
        log_timescale_increment = (
            math.log(float(max_timescale) / float(min_timescale)) /
            tf.maximum(tf.to_float(num_timescales) - 1, 1.0))
        inv_timescales = min_timescale * tf.exp(
            tf.to_float(tf.range(num_timescales)) * -log_timescale_increment)
        # Outer product: [length, 1] * [1, num_timescales] -> [length, num_timescales]
        scaled_time = tf.expand_dims(position, 1) * tf.expand_dims(inv_timescales, 0)
        signal = tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1)
        # Pad one zero channel when `channels` is odd so the reshape below is valid.
        signal = tf.pad(signal, [[0, 0], [0, tf.mod(channels, 2)]])
        signal = tf.reshape(signal, [1, length, channels])
        return signal

    def _conv_block(inputs, num_conv_layers, kernel_size, num_filters, scope="conv_block", is_training=False, reuse=None,
                    dropout=0.0):
        """
        Residual stack of layer-normalised depthwise separable convolutions.

        :param inputs: input tensor; a width axis is inserted at position 2 for the
            2-D separable convolution and removed again before returning
        :param num_conv_layers: number of conv layers
        :param kernel_size: conv kernel size along the sequence axis
        :param num_filters: number of conv filters
        :param scope: scope name
        :param is_training: whether training (controls dropout)
        :param reuse: whether reuse variable
        :param dropout: dropout rate
        :return: tensor with the inserted axis squeezed out again
        """
        with tf.variable_scope(scope, reuse=reuse):
            net = tf.expand_dims(inputs, 2)

            for layer_idx in range(num_conv_layers):
                shortcut = net
                net = _ln(net, scope="layer_norm_%d" % layer_idx, reuse=reuse)

                # Extra dropout ahead of every even-indexed layer, only while training.
                if layer_idx % 2 == 0:
                    if is_training:
                        net = tf.layers.dropout(net, dropout, training=is_training)

                net = _depthwise_separable_convolution(net,
                                                       kernel_size=(kernel_size, 1),
                                                       num_filters=num_filters,
                                                       scope="depthwise_conv_layers_%d" % layer_idx,
                                                       reuse=reuse)

                net = tf.layers.dropout(net, dropout, training=is_training)
                # Residual connection around norm -> conv -> dropout.
                net = net + shortcut

            return tf.squeeze(net, 2)

    def _depthwise_separable_convolution(inputs, kernel_size, num_filters, bias=True, reuse=None,
                                         scope="depthwise_separable_convolution"):
        """
        Depthwise separable 2-D convolution followed by ReLU.

        :param inputs: 4-D input whose last axis is the channel axis
        :param kernel_size: (height, width) of the depthwise kernel
        :param num_filters: number of output channels of the pointwise convolution
        :param bias: whether to add a learned bias before the ReLU
        :param reuse: whether reuse variable
        :param scope: scope name
        :return: ReLU-activated convolution output
        """
        with tf.variable_scope(scope, reuse=reuse):
            in_channels = inputs.shape.as_list()[-1]
            kernel_height, kernel_width = kernel_size[0], kernel_size[1]

            # One spatial filter per input channel (channel multiplier 1).
            depthwise_kernel = tf.get_variable("depthwise_filter",
                                               (kernel_height, kernel_width, in_channels, 1),
                                               dtype=tf.float32,
                                               regularizer=regularizer,
                                               initializer=initializer_relu())
            # 1x1 convolution that mixes channels into `num_filters` outputs.
            pointwise_kernel = tf.get_variable("pointwise_filter",
                                               (1, 1, in_channels, num_filters),
                                               dtype=tf.float32,
                                               regularizer=regularizer,
                                               initializer=initializer_relu())

            conv = tf.nn.separable_conv2d(inputs,
                                          depthwise_kernel,
                                          pointwise_kernel,
                                          strides=(1, 1, 1, 1),
                                          padding="SAME")

            if bias:
                offset = tf.get_variable("bias",
                                         conv.shape[-1],
                                         regularizer=regularizer,
                                         initializer=tf.zeros_initializer())
                conv = conv + offset

            return tf.nn.relu(conv)

    def _multihead_attention(inputs,
                             input_mask,
                             num_heads=8,
                             dropout_rate=0.0,
                             training=False,
                             reuse=None,
                             scope="multihead_attention"):
        '''Applies multihead self attention. See 3.2.2
        inputs: A 3d tensor with shape of [N, T, d_model].
        input_mask: A 2d tensor with shape of [N, T]; it is cast to float and
          multiplied into `inputs`, so padded positions are expected to be 0.
        num_heads: An int. Number of heads. d_model must be divisible by it,
          because the projections are split evenly along the last axis.
        dropout_rate: A floating point number.
        training: Boolean. Controller of mechanism for dropout.
        reuse: Whether to reuse the variables of this scope.
        scope: Optional scope for `variable_scope`.

        Returns
          A 3d tensor with shape of (N, T, d_model)
        '''

        with tf.variable_scope(scope, reuse=reuse):
            mask = tf.cast(tf.expand_dims(input_mask, axis=-1), dtype=tf.float32)  # (N, T, 1)

            inputs = inputs * mask
            inputs = _ln(inputs, reuse=reuse, scope=scope+'_layer_normal')
            # Layer norm adds a learned `beta`, so padded rows stop being all-zero
            # once beta moves away from its zero initialisation. `_mask` detects
            # padding by looking for all-zero rows, so zero them again here.
            inputs = inputs * mask

            queries = inputs
            keys = inputs
            values = inputs

            d_model = queries.get_shape().as_list()[-1]
            # Linear projections (bias-free, so zero rows stay zero)
            Q = tf.layers.dense(queries, d_model, use_bias=False, reuse=reuse)  # (N, T_q, d_model)
            K = tf.layers.dense(keys, d_model, use_bias=False, reuse=reuse)  # (N, T_k, d_model)
            V = tf.layers.dense(values, d_model, use_bias=False, reuse=reuse)  # (N, T_k, d_model)

            # Split and concat
            Q_ = tf.concat(tf.split(Q, num_heads, axis=2), axis=0)  # (h*N, T_q, d_model/h)
            K_ = tf.concat(tf.split(K, num_heads, axis=2), axis=0)  # (h*N, T_k, d_model/h)
            V_ = tf.concat(tf.split(V, num_heads, axis=2), axis=0)  # (h*N, T_k, d_model/h)

            # Attention
            outputs = _scaled_dot_product_attention(Q_, K_, V_, dropout_rate, training, reuse=reuse)

            # Restore shape
            outputs = tf.concat(tf.split(outputs, num_heads, axis=0), axis=2)  # (N, T_q, d_model)

            # feed forward
            # `trainable` is deliberately left at its default: tying it to the
            # dropout flag would freeze this layer whenever training=False.
            outputs = tf.layers.conv1d(outputs, filters=d_model, kernel_size=1, reuse=reuse)
            outputs = tf.layers.dropout(outputs, dropout_rate, training=training)

            # Residual connection
            outputs = queries + outputs

            # Normalize
            outputs = _ln(outputs, reuse=reuse, scope='feed_forword_layer_normal')

        return outputs

    def _scaled_dot_product_attention(Q, K, V,
                                      dropout_rate=0.,
                                      training=False,
                                      reuse=None,
                                      scope="scaled_dot_product_attention"):
        '''Scaled dot-product attention. See 3.2.1.
        Q: Packed queries. 3d tensor. [N, T_q, d_k].
        K: Packed keys. 3d tensor. [N, T_k, d_k].
        V: Packed values. 3d tensor. [N, T_k, d_v].
        dropout_rate: A floating point number of [0, 1].
        training: boolean for controlling dropout
        reuse: Whether to reuse the variable scope.
        scope: Optional scope for `variable_scope`.

        Returns the context vectors, a 3d tensor [N, T_q, d_v]. As a side effect
        an image summary named "attention" is registered for the first batch entry.
        '''
        with tf.variable_scope(scope, reuse=reuse):
            depth = Q.get_shape().as_list()[-1]

            # Raw similarity between every query and every key, scaled by sqrt(d_k).
            keys_transposed = tf.transpose(K, [0, 2, 1])
            scores = tf.matmul(Q, keys_transposed)  # (N, T_q, T_k)
            scores = scores / depth ** 0.5

            # Padded keys receive a large negative score so softmax ignores them.
            scores = _mask(scores, Q, K, type="key")

            weights = tf.nn.softmax(scores)

            # Summary of the attention map of the first batch entry.
            attention_map = tf.transpose(weights, [0, 2, 1])
            tf.summary.image("attention", tf.expand_dims(attention_map[:1], -1))

            # Rows that belong to padded queries are zeroed out.
            weights = _mask(weights, Q, K, type="query")

            weights = tf.layers.dropout(weights, rate=dropout_rate, training=training)

            # Weighted sum of the values (context vectors).
            context = tf.matmul(weights, V)  # (N, T_q, d_v)

        return context

    def _mask(inputs, queries=None, keys=None, type=None):
    """Masks paddings on keys or queries to inputs
    inputs: 3d tensor. (N, T_q, T_k)
    queries: 3d tensor. (N, T_q, d)
    keys: 3d tensor. (N, T_k, d)

    e.g.,
    >> queries = tf.constant([[[1.],
    [2.],
    [0.]]], tf.float32) # (1, 3, 1)
    >> keys = tf.constant([[[4.],
    [0.]]], tf.float32) # (1, 2, 1)
    >> inputs = tf.constant([[[4., 0.],
    [8., 0.],
    [0., 0.]]], tf.float32)
    >> mask(inputs, queries, keys, "key")
    array([[[ 4.0000000e+00, -4.2949673e+09],
    [ 8.0000000e+00, -4.2949673e+09],
    [ 0.0000000e+00, -4.2949673e+09]]], dtype=float32)
    >> inputs = tf.constant([[[1., 0.],
    [1., 0.],
    [1., 0.]]], tf.float32)
    >> mask(inputs, queries, keys, "query")
    array([[[1., 0.],
    [1., 0.],
    [0., 0.]]], dtype=float32)
    """
    outputs = None
    padding_num = -2 ** 32 + 1
    if type in ("k", "key", "keys"):
    # Generate masks
    masks = tf.sign(tf.reduce_sum(tf.abs(keys), axis=-1)) # (N, T_k)
    masks = tf.expand_dims(masks, 1) # (N, 1, T_k)
    masks = tf.tile(masks, [1, tf.shape(queries)[1], 1]) # (N, T_q, T_k)

    # Apply masks to inputs
    paddings = tf.ones_like(inputs) * padding_num

    outputs = tf.where(tf.equal(masks, 0), paddings, inputs) # (N, T_q, T_k)
    elif type in ("q", "query", "queries"):
    # Generate masks
    masks = tf.sign(tf.reduce_sum(tf.abs(queries), axis=-1)) # (N, T_q)
    masks = tf.expand_dims(masks, -1) # (N, T_q, 1)
    masks = tf.tile(masks, [1, 1, tf.shape(keys)[1]]) # (N, T_q, T_k)

    # Apply masks to inputs
    outputs = inputs*masks
    else:
    print("Check if you entered type correctly!")

    return outputs


    def _ln(inputs, epsilon=1e-6, reuse=None, scope="ln"):
        '''Layer normalization over the last axis. See https://arxiv.org/abs/1607.06450.
        inputs: A tensor with 2 or more dimensions, where the first dimension has `batch_size`.
        epsilon: A small float added to the variance to avoid division by zero.
        reuse: Whether to reuse the `beta` / `gamma` variables of this scope.
        scope: Optional scope for `variable_scope`.

        Returns:
          A tensor with the same shape and data dtype as `inputs`.
        '''
        with tf.variable_scope(scope, reuse=reuse):
            # One scale and one shift parameter per feature of the last axis.
            feature_shape = inputs.get_shape()[-1:]

            mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)
            beta = tf.get_variable("beta", feature_shape, initializer=tf.zeros_initializer())
            gamma = tf.get_variable("gamma", feature_shape, initializer=tf.ones_initializer())

            centered = inputs - mean
            std = (variance + epsilon) ** (.5)
            return gamma * (centered / std) + beta

    在QAnet最后的三个model encoder中，各项参数如下，其中hidden_size为context_query输出的hidden size：

    # Example call for the model encoders: 7 blocks of 2 conv layers each (kernel
    # size 5) with 8 attention heads. All calls share the 'Model_Encoder' scope;
    # variables are created on the first call (i == 0) and reused afterwards.
    encoder(enc[i], num_blocks=7, num_conv_layers=2, kernel_size=5, inputs_mask=input_mask, num_filters=hidden_size, num_heads=8,
                                 scope='Model_Encoder', reuse=True if i > 0 else None, is_training=False, dropout=0.1)
  • 相关阅读:
    hadoop:WordCount问题总结
    .mata. _root_ (转)
    Hbase笔记:批量导入
    Hbase笔记4 java操作Hbase
    wget
    中国大陆开源镜像站汇总
    全键盘操作Windows
    linux下实用命令
    /dev/null和/dev/zero的区别
    Windows xp下安装sql server2005所碰到的一些问题及解决方法
  • 原文地址:https://www.cnblogs.com/callyblog/p/11463437.html
Copyright © 2011-2022 走看看