zoukankan      html  css  js  c++  java
  • Bert源码解读(一)之主框架

    一、BertModel主入口

    class BertModel(object):
      """BERT model ("Bidirectional Encoder Representations from Transformers").
    
      Example usage:
    
      ```python
      # Already been converted into WordPiece token ids
      input_ids = tf.constant([[31, 51, 99], [15, 5, 0]])
      input_mask = tf.constant([[1, 1, 1], [1, 1, 0]])
      token_type_ids = tf.constant([[0, 0, 1], [0, 2, 0]])
    
      config = modeling.BertConfig(vocab_size=32000, hidden_size=512,
        num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024)
    
      model = modeling.BertModel(config=config, is_training=True,
        input_ids=input_ids, input_mask=input_mask, token_type_ids=token_type_ids)
    
      label_embeddings = tf.get_variable(...)
      pooled_output = model.get_pooled_output()
      logits = tf.matmul(pooled_output, label_embeddings)
      ...
      ```
      """
    
      def __init__(self,
                   config,                            # BertConfig instance
                   is_training,
                   input_ids,                        # [batch_size, seq_length]
                   input_mask=None,                    # [batch_size, seq_length]
                   token_type_ids=None,                # [batch_size, seq_length]
                   use_one_hot_embeddings=False,    # one-hot lookup if True, else tf.gather()
                   scope=None):
        """Builds the BERT graph: embeddings -> Transformer encoder -> pooler.

        Args:
          config: `BertConfig` instance (deep-copied, never mutated).
          is_training: bool; when False, all dropout is disabled below.
          input_ids: int32 Tensor of shape [batch_size, seq_length].
          input_mask: optional int32 Tensor [batch_size, seq_length]; 1 marks
            real tokens, 0 marks padding. Defaults to all ones.
          token_type_ids: optional int32 Tensor [batch_size, seq_length] of
            segment ids. Defaults to all zeros (single segment).
          use_one_hot_embeddings: whether the word-embedding lookup uses a
            one-hot matmul instead of tf.gather().
          scope: optional variable scope name; defaults to "bert".
        """
        # Copy so disabling dropout below does not mutate the caller's config.
        config = copy.deepcopy(config)
        if not is_training:
          config.hidden_dropout_prob = 0.0
          config.attention_probs_dropout_prob = 0.0
    
        input_shape = get_shape_list(input_ids, expected_rank=2)
        batch_size = input_shape[0]
        seq_length = input_shape[1]
        # No mask given: treat every position as a real token (all ones).
        if input_mask is None:
          input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32)
    
        # No segment ids given: assume a single segment (all zeros).
        if token_type_ids is None:
          token_type_ids = tf.zeros(shape=[batch_size, seq_length], dtype=tf.int32)
    
        with tf.variable_scope(scope, default_name="bert"):
          with tf.variable_scope("embeddings"):
            # Word embeddings: a randomly initialised lookup table trained
            # end-to-end; the encoder's final output is then a contextual
            # vector for every token.
            (self.embedding_output, self.embedding_table) = embedding_lookup(
                input_ids=input_ids,
                vocab_size=config.vocab_size,
                embedding_size=config.hidden_size, # embedding dim equals hidden_size (H in the paper)
                initializer_range=config.initializer_range,
                word_embedding_name="word_embeddings",
                use_one_hot_embeddings=use_one_hot_embeddings)
    
            # Add position and segment (token-type) embeddings, then
            # layer norm + dropout.
            self.embedding_output = embedding_postprocessor(
                input_tensor=self.embedding_output,
                use_token_type=True,
                token_type_ids=token_type_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob)
    
          with tf.variable_scope("encoder"):
    
            # input_ids are padded word ids (e.g. [25, 120, 34, 0, 0]) and
            # input_mask flags the valid tokens ([1, 1, 1, 0, 0]); build the
            # 3D attention_mask matrix from them.
            attention_mask = create_attention_mask_from_input_mask(
                input_ids, input_mask)
    
            # Stacked Transformer blocks; input and output are both
            # [batch_size, seq_length, hidden_size].
            self.all_encoder_layers = transformer_model(
                input_tensor=self.embedding_output,
                attention_mask=attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)
    
          # `self.sequence_output` is the last encoder layer, shape
          # [batch_size, seq_length, hidden_size].
          self.sequence_output = self.all_encoder_layers[-1]
    
          # The "pooler" converts the encoder output
          # [batch_size, seq_length, hidden_size] into [batch_size, hidden_size].
          # It takes the vector at the first position ([CLS]) of each sequence
          # as a fixed-size sentence representation; a dense layer (+ softmax
          # downstream) on top of it handles sentence classification.
          # sequence_output gives per-token vectors (used by the masked-LM
          # head), while pooled_output feeds the next-sentence-prediction /
          # classification head.
          with tf.variable_scope("pooler"):
            # Slice out the [CLS] vector: sequence_output[:, 0:1, :] is
            # [batch_size, 1, hidden_size]; squeeze drops the length-1 axis.
            first_token_tensor = tf.squeeze(self.sequence_output[:, 0:1, :], axis=1)
            # Dense layer with tanh; output stays [batch_size, hidden_size].
            self.pooled_output = tf.layers.dense(
                first_token_tensor,
                config.hidden_size,
                activation=tf.tanh,
                kernel_initializer=create_initializer(config.initializer_range))

    总结:Bert的输出最终有两个结果可用

    sequence_output:维度【batch_size, seq_length, hidden_size】,这是训练后每个token的词向量。

    pooled_output:维度是【batch_size, hidden_size】,每个sequence第一个位置CLS的向量输出,用于分类任务。

    class BertConfig(object):
      """Configuration for `BertModel`."""
    
      def __init__(self,
                   vocab_size,
                   hidden_size=768,
                   num_hidden_layers=12,
                   num_attention_heads=12,
                   intermediate_size=3072,
                   hidden_act="gelu",
                   hidden_dropout_prob=0.1,
                   attention_probs_dropout_prob=0.1,
                   max_position_embeddings=512,
                   type_vocab_size=16,
                   initializer_range=0.02):
        """Constructs BertConfig.
    
        Args:
          vocab_size: Vocabulary size of `inputs_ids` in `BertModel`.
          hidden_size: Size of the encoder layers and the pooler layer.
          num_hidden_layers: Number of hidden layers in the Transformer encoder.
          num_attention_heads: Number of attention heads for each attention layer in
            the Transformer encoder.
          intermediate_size: The size of the "intermediate" (i.e., feed-forward)
            layer in the Transformer encoder.
          hidden_act: The non-linear activation function (function or string) in the
            encoder and pooler.
          hidden_dropout_prob: The dropout probability for all fully connected
            layers in the embeddings, encoder, and pooler.
          attention_probs_dropout_prob: The dropout ratio for the attention
            probabilities.
          max_position_embeddings: The maximum sequence length that this model might
            ever be used with. Typically set this to something large just in case
            (e.g., 512 or 1024 or 2048).
          type_vocab_size: The vocabulary size of the `token_type_ids` passed into
            `BertModel`.
          initializer_range: The stdev of the truncated_normal_initializer for
            initializing all weight matrices.
        """
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.initializer_range = initializer_range
    
      @classmethod
      def from_dict(cls, json_object):
        """Constructs a `BertConfig` from a Python dictionary of parameters."""
        # `vocab_size=None` is only a placeholder: every attribute present in
        # the dict (including vocab_size) is written over it below.
        config = cls(vocab_size=None)
        for (key, value) in json_object.items():
          config.__dict__[key] = value
        return config
    
      @classmethod
      def from_json_file(cls, json_file):
        """Constructs a `BertConfig` from a json file of parameters."""
        with tf.gfile.GFile(json_file, "r") as reader:
          text = reader.read()
        return cls.from_dict(json.loads(text))
    
      def to_dict(self):
        """Serializes this instance to a Python dictionary."""
        # Deep copy so callers may mutate the returned dict freely.
        output = copy.deepcopy(self.__dict__)
        return output
    
      def to_json_string(self):
        """Serializes this instance to a JSON string (with trailing newline)."""
        # Fixed: the original text had the "\n" literal broken across two
        # physical lines, which is a syntax error.
        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
    • vocab_size:词表大小
    • hidden_size:隐藏层神经元数,可以理解为dmodel,即单个Transformer block第一层(输入层后面链接的层)和最后一层(输出层)的节点数,对应于论文中的H
    • num_hidden_layers:Transformer 的层数,对应于论文中的L
    • num_attention_heads:multi-head attention 的 head 数,对应于论文中的A
    • intermediate_size:encoder 的“中间”隐层神经元数(例如 feed-forward layer),对应于论文中的4H
    • hidden_act:隐藏层激活函数
    • hidden_dropout_prob:隐层 dropout 率
    • attention_probs_dropout_prob:注意力部分的 dropout
    • max_position_embeddings:最大位置编码
    • type_vocab_size:token_type_ids 的词典大小
    • initializer_range:truncated_normal_initializer 初始化方法的 stdev
    • 这里要注意一点,可能刚看的时候对type_vocab_size这个参数会有点不理解,其实就是在next sentence prediction任务里的Segment A和 Segment B。在下载的bert_config.json文件里也有说明,默认值应该为 2。

    二、获取词向量(Embedding_lookup)

    对于输入 word_ids,返回 embedding table。可以选用 one-hot 或者 tf.gather() 

    def embedding_lookup(input_ids,                        # word ids: [batch_size, seq_length]
                         vocab_size,
                         embedding_size=128,
                         initializer_range=0.02,
                         word_embedding_name="word_embeddings",
                         use_one_hot_embeddings=False):
      """Looks up word embeddings for `input_ids`.

      Returns a tuple `(output, table)` where `output` has shape
      [batch_size, seq_length, num_inputs * embedding_size] and `table` is the
      [vocab_size, embedding_size] embedding variable.
      """
      # Canonicalise to rank 3: a 2D [batch_size, seq_length] input becomes
      # [batch_size, seq_length, 1].
      if input_ids.shape.ndims == 2:
        input_ids = tf.expand_dims(input_ids, axis=[-1])
    
      table = tf.get_variable(
          name=word_embedding_name,
          shape=[vocab_size, embedding_size],
          initializer=create_initializer(initializer_range))
    
      # Flatten to [batch_size * seq_length * num_inputs] for the lookup.
      ids_flat = tf.reshape(input_ids, [-1])
      if use_one_hot_embeddings:
        # One-hot matmul path (useful on TPUs).
        embedded = tf.matmul(tf.one_hot(ids_flat, depth=vocab_size), table)
      else:
        # Plain indexed lookup.
        embedded = tf.gather(table, ids_flat)
    
      # Restore the batch/sequence structure:
      # [batch_size, seq_length, num_inputs] ids map to
      # [batch_size, seq_length, num_inputs * embedding_size] vectors.
      dims = get_shape_list(input_ids)
      embedded = tf.reshape(embedded, dims[0:-1] + [dims[-1] * embedding_size])
      return (embedded, table)

     这里是首先随机初始化embedding_table,shape为[vocab_size, embedding_size],词向量的维度是128维,也就是bert输入层是128维,通过bert的训练,形成最终的词向量。所以,这里bert预训练的过程就是词向量形成的过程,load bert的参数就可以直接生成词向量。

     

    • Return:【batch_size, seq_length, embedding_size】

     

    1) tf.gather 用法

    import tensorflow as tf
    # 2D variable: tf.gather along axis 0 returns whole rows.
    a = tf.Variable([[1,2,3,4,5], [6,7,8,9,10], [11,12,13,14,15]])
    index_a = tf.Variable([0,2])
     
    # 1D variable: tf.gather returns individual elements.
    b = tf.Variable([1,2,3,4,5,6,7,8,9,10])
    index_b = tf.Variable([2,4,6,8])
     
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # Gather rows 0 and 2 of `a`; each element of `a` is a list, so two
        # whole lists come back.
        print(sess.run(tf.gather(a, index_a))) 
        # Gather the elements of `b` at indices 2, 4, 6, 8.
        print(sess.run(tf.gather(b, index_b)))
        
    out:
    #  [[ 1  2  3  4  5]
    #   [11 12 13 14 15]]
     
    #  [3 5 7 9]

     2)get_shape_list(tensor, expected_rank=None, name=None)  list形式返回tensor的shap,并做维度校验

    参数:
    tensor:一个需要返回shape的tf.Tensor
    expected_rank:int或者是一个int的list。输入tensor期望的rank(也就是维度),如果输入tensor的维度不等于这个数或者不在这个list中,就会抛出异常。
    tensor.shape.ndims可以返回该tensor数据的维度
    Return:tensor的shape

    one = tf.constant([[0.0, 0.1, 0.2],[0.0, 0.1, 0.2]])# a matrix is 2-D data; a vector such as [1,2,3] is 1-D
    one_shape = get_shape_list(one, expected_rank=[2,3,4])# assert that `one`'s rank is one of 2, 3 or 4
    print('one_shape:',one_shape)
    print(one.shape.ndims)
    
    out:
    one_shape: [2, 3]
    2

    三、词向量的后续处理(embedding_postprocessor)

     我们知道 BERT 模型的输入有三部分:token embedding ,segment embedding以及position embedding。上一节中我们只获得了 token embedding,这部分代码对其完善信息,正则化,dropout 之后输出最终 embedding。注意,在 Transformer 论文中的position embedding是由 sin/cos 函数生成的固定的值,而在这里代码实现中是跟普通 word embedding 一样随机生成的,可以训练的。作者这里这样选择的原因可能是 BERT 训练的数据比 Transformer 那篇大很多,完全可以让模型自己去学习。

    def embedding_postprocessor(input_tensor,                # [batch_size, seq_length, embedding_size]
                                use_token_type=False,
                                token_type_ids=None,
                                token_type_vocab_size=16,        # usually 2 (segments A and B)
                                token_type_embedding_name="token_type_embeddings",
                                use_position_embeddings=True,
                                position_embedding_name="position_embeddings",
                                initializer_range=0.02,
                                max_position_embeddings=512,    # max position encoding; must be >= max_seq_len
                                dropout_prob=0.1):
      """Adds segment (token-type) and position embeddings to `input_tensor`,
      then applies layer normalization and dropout.

      Returns a Tensor with the same shape as `input_tensor`:
      [batch_size, seq_length, embedding_size].
      """
      input_shape = get_shape_list(input_tensor, expected_rank=3)   # [batch_size, seq_length, embedding_size]
      batch_size = input_shape[0]
      seq_length = input_shape[1]
      width = input_shape[2]
    
      output = input_tensor
    
      # Segment (token-type) embedding.
      if use_token_type:
        if token_type_ids is None:
          raise ValueError("`token_type_ids` must be specified if"
                           "`use_token_type` is True.")
        token_type_table = tf.get_variable(
            name=token_type_embedding_name,
            shape=[token_type_vocab_size, width],
            initializer=create_initializer(initializer_range))
        # The token-type vocabulary is small, so a one-hot matmul is faster
        # here than a gather.
        flat_token_type_ids = tf.reshape(token_type_ids, [-1])
        one_hot_ids = tf.one_hot(flat_token_type_ids, depth=token_type_vocab_size)
        token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)
        token_type_embeddings = tf.reshape(token_type_embeddings,
                                           [batch_size, seq_length, width])
        output += token_type_embeddings
    
      # Position embedding.
      if use_position_embeddings:
        # Ensure seq_length <= max_position_embeddings.
        assert_op = tf.assert_less_equal(seq_length, max_position_embeddings)
        with tf.control_dependencies([assert_op]):
          full_position_embeddings = tf.get_variable(
              name=position_embedding_name,
              shape=[max_position_embeddings, width],
              initializer=create_initializer(initializer_range))
    
          # The position embedding is a learned parameter of shape
          # [max_position_embeddings, width], but the actual input is usually
          # shorter than max_position_embeddings, so for speed tf.slice takes
          # just the first seq_length rows.
          position_embeddings = tf.slice(full_position_embeddings, [0, 0],
                                         [seq_length, -1])
          num_dims = len(output.shape.as_list())
    
          # The word-embedding tensor is [batch_size, seq_length, width],
          # whereas the position encoding is input-independent with shape
          # [seq_length, width], so it cannot be added directly; reshape it to
          # [1, seq_length, width] and let broadcasting do the addition.
          position_broadcast_shape = []
          for _ in range(num_dims - 2):
            position_broadcast_shape.append(1)
          position_broadcast_shape.extend([seq_length, width])
          position_embeddings = tf.reshape(position_embeddings,
                                           position_broadcast_shape)
          output += position_embeddings
    
      output = layer_norm_and_dropout(output, dropout_prob)
      return output

    四、构造 attention_mask

    因为每个样本都经过padding了,所以一个sequence中每个词对于占位符的位置要mask(因为pad的占位符原本是不存在的,所以置为0,表示看不到;其它位置为1),这里就是构造每个词的可视域矩阵attention_mask,看得到的词就置为1,看不到的就置为0,进而带入transformer模型中备用。

    Return:将shape为[batch_size, to_seq_length]的2D mask转换为一个shape 为[batch_size, from_seq_length, to_seq_length] 的3D mask用于attention当中。

    def create_attention_mask_from_input_mask(from_tensor, to_mask):
      # `to_mask` is the input_mask and `from_tensor` the input_ids; both have
      # sequence length max_seq_length.
      # Fixed: the original text had the whole body (docstring + code) fused
      # onto one physical line, which is a syntax error; reconstructed here.
      """Create 3D attention mask from a 2D tensor mask.

      Args:
        from_tensor: 2D or 3D Tensor of shape [batch_size, from_seq_length, ...].
        to_mask: int32 Tensor of shape [batch_size, to_seq_length].

      Returns:
        float Tensor of shape [batch_size, from_seq_length, to_seq_length].
      """
      from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])
      batch_size = from_shape[0]
      from_seq_length = from_shape[1]

      to_shape = get_shape_list(to_mask, expected_rank=2)
      to_seq_length = to_shape[1]

      # [batch_size, 1, to_seq_length] so it broadcasts over from_seq_length.
      to_mask = tf.cast(
          tf.reshape(to_mask, [batch_size, 1, to_seq_length]), tf.float32)

      # We don't assume that `from_tensor` is a mask (although it could be). We
      # don't actually care if we attend *from* padding tokens (only *to* padding)
      # tokens so we create a tensor of all ones.
      #
      # `broadcast_ones` = [batch_size, from_seq_length, 1]
      broadcast_ones = tf.ones(
          shape=[batch_size, from_seq_length, 1], dtype=tf.float32)

      # Here we broadcast along two dimensions to create the mask.
      mask = broadcast_ones * to_mask
      return mask

     举例:

    import tensorflow as tf
    import six
    # Toy reproduction of create_attention_mask_from_input_mask with
    # batch_size=2 and sequence length 3.
    batch_size=2
    to_seq_length=3
    from_seq_length=3
    to_mask=[[1,0,0],[1,1,0]]
    # Reshape the 2D mask to [batch_size, 1, to_seq_length] so it can
    # broadcast over the "from" positions.
    to_mask = tf.cast(
          tf.reshape(to_mask, [batch_size, 1, to_seq_length]), tf.float32)
    print(to_mask)
    # All-ones column of shape [batch_size, from_seq_length, 1]; multiplying
    # by to_mask broadcasts the row mask to every "from" position.
    broadcast_ones = tf.ones(
          shape=[batch_size, from_seq_length, 1], dtype=tf.float32)
    print(broadcast_ones)
    mask = broadcast_ones * to_mask
    mask
    
    输出:
    
    tf.Tensor(
    [[[1. 0. 0.]]
    
     [[1. 1. 0.]]], shape=(2, 1, 3), dtype=float32)
    tf.Tensor(
    [[[1.]
      [1.]
      [1.]]
    
     [[1.]
      [1.]
      [1.]]], shape=(2, 3, 1), dtype=float32)
    
    
    
    <tf.Tensor: id=63, shape=(2, 3, 3), dtype=float32, numpy=
    array([[[1., 0., 0.],
            [1., 0., 0.],
            [1., 0., 0.]],
    
           [[1., 1., 0.],
            [1., 1., 0.],
            [1., 1., 0.]]], dtype=float32)>
  • 相关阅读:
    程序员常去的14个顶级开发社区
    为何技术领域中女程序员较少?
    为何技术领域中女程序员较少?
    为何技术领域中女程序员较少?
    关于HTTP和HTTPS的区别
    关于HTTP和HTTPS的区别
    关于HTTP和HTTPS的区别
    Coupled model
    java和javascript日期详解
    Java 线程总结(十四)
  • 原文地址:https://www.cnblogs.com/gczr/p/12382240.html
Copyright © 2011-2022 走看看