  • [Keras Tutorial] Text classification with the Transformer model (one of the strongest models for NLP classification)

    Transformer: theory
    In the paper "Attention Is All You Need", Google Brain proposed Transformer, an encoder-decoder model built entirely on the attention mechanism. It completely drops the recurrent and convolutional structures that earlier models kept even after introducing attention, and it brings large gains in task performance, parallelism, and ease of training. Transformer has since become an important baseline model for machine translation and many other text-understanding tasks.
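    To make the mechanism concrete, here is a minimal NumPy sketch of scaled dot-product attention, the core operation the paper builds on (illustrative only; the function name and toy shapes are mine, not taken from the code below):

    import numpy as np

    def scaled_dot_product_attention(Q, K_mat, V):
        # softmax(Q K^T / sqrt(d_k)) V for a single (unbatched) sequence
        d_k = Q.shape[-1]
        scores = np.dot(Q, K_mat.T) / np.sqrt(d_k)       # (len_q, len_k) similarity matrix
        weights = np.exp(scores - scores.max(-1, keepdims=True))
        weights /= weights.sum(-1, keepdims=True)        # row-wise softmax
        return np.dot(weights, V)                        # weighted sum of the values

    # Toy usage: 4 query positions attend over 6 key/value positions of dimension 8
    Q, K_mat, V = np.random.rand(4, 8), np.random.rand(6, 8), np.random.rand(6, 8)
    print(scaled_dot_product_attention(Q, K_mat, V).shape)   # (4, 8)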

    Detailed introduction to the model

    Analysis of the model paper

    GitHub:https://github.com/xiaosongshine/transfromer_keras

    Transformer model implementation (in Keras)
    Position_Embedding
    #! -*- coding: utf-8 -*-
    #%%
    from __future__ import print_function
    from keras import backend as K
    from keras.engine.topology import Layer


    class Position_Embedding(Layer):

        def __init__(self, size=None, mode='sum', **kwargs):
            self.size = size  # must be an even number
            self.mode = mode
            super(Position_Embedding, self).__init__(**kwargs)

        def call(self, x):
            if (self.size is None) or (self.mode == 'sum'):
                self.size = int(x.shape[-1])
            batch_size, seq_len = K.shape(x)[0], K.shape(x)[1]
            position_j = 1. / K.pow(10000., 2 * K.arange(self.size / 2, dtype='float32') / self.size)
            position_j = K.expand_dims(position_j, 0)
            # K.arange does not support variable length, so generate positions with cumsum instead
            position_i = K.cumsum(K.ones_like(x[:, :, 0]), 1) - 1
            position_i = K.expand_dims(position_i, 2)
            position_ij = K.dot(position_i, position_j)
            position_ij = K.concatenate([K.cos(position_ij), K.sin(position_ij)], 2)
            if self.mode == 'sum':
                return position_ij + x
            elif self.mode == 'concat':
                return K.concatenate([position_ij, x], 2)

        def compute_output_shape(self, input_shape):
            if self.mode == 'sum':
                return input_shape
            elif self.mode == 'concat':
                return (input_shape[0], input_shape[1], input_shape[2] + self.size)
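    A note on the encoding above: it follows the paper's sinusoidal scheme, PE(pos, 2i) = sin(pos / 10000^(2i/d)) and PE(pos, 2i+1) = cos(pos / 10000^(2i/d)), except that the implementation places all cosine components in the first half of the vector and all sine components in the second half instead of interleaving them; the two layouts differ only by a fixed permutation of the embedding dimensions.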
    Attention

    class Attention(Layer):

        def __init__(self, nb_head, size_per_head, **kwargs):
            self.nb_head = nb_head
            self.size_per_head = size_per_head
            self.output_dim = nb_head * size_per_head
            super(Attention, self).__init__(**kwargs)

        def build(self, input_shape):
            self.WQ = self.add_weight(name='WQ',
                                      shape=(input_shape[0][-1], self.output_dim),
                                      initializer='glorot_uniform',
                                      trainable=True)
            self.WK = self.add_weight(name='WK',
                                      shape=(input_shape[1][-1], self.output_dim),
                                      initializer='glorot_uniform',
                                      trainable=True)
            self.WV = self.add_weight(name='WV',
                                      shape=(input_shape[2][-1], self.output_dim),
                                      initializer='glorot_uniform',
                                      trainable=True)
            super(Attention, self).build(input_shape)

        def Mask(self, inputs, seq_len, mode='mul'):
            if seq_len is None:
                return inputs
            else:
                mask = K.one_hot(seq_len[:, 0], K.shape(inputs)[1])
                mask = 1 - K.cumsum(mask, 1)
                for _ in range(len(inputs.shape) - 2):
                    mask = K.expand_dims(mask, 2)
                if mode == 'mul':
                    return inputs * mask
                if mode == 'add':
                    return inputs - (1 - mask) * 1e12

        def call(self, x):
            # If only Q_seq, K_seq, V_seq are passed in, no mask is applied.
            # If Q_seq, K_seq, V_seq, Q_len, V_len are all passed in, the padded positions are masked.
            if len(x) == 3:
                Q_seq, K_seq, V_seq = x
                Q_len, V_len = None, None
            elif len(x) == 5:
                Q_seq, K_seq, V_seq, Q_len, V_len = x
            # Linear projections of Q, K, V, split into nb_head heads
            Q_seq = K.dot(Q_seq, self.WQ)
            Q_seq = K.reshape(Q_seq, (-1, K.shape(Q_seq)[1], self.nb_head, self.size_per_head))
            Q_seq = K.permute_dimensions(Q_seq, (0, 2, 1, 3))
            K_seq = K.dot(K_seq, self.WK)
            K_seq = K.reshape(K_seq, (-1, K.shape(K_seq)[1], self.nb_head, self.size_per_head))
            K_seq = K.permute_dimensions(K_seq, (0, 2, 1, 3))
            V_seq = K.dot(V_seq, self.WV)
            V_seq = K.reshape(V_seq, (-1, K.shape(V_seq)[1], self.nb_head, self.size_per_head))
            V_seq = K.permute_dimensions(V_seq, (0, 2, 1, 3))
            # Scaled dot product, then mask, then softmax
            A = K.batch_dot(Q_seq, K_seq, axes=[3, 3]) / self.size_per_head ** 0.5
            A = K.permute_dimensions(A, (0, 3, 2, 1))
            A = self.Mask(A, V_len, 'add')
            A = K.permute_dimensions(A, (0, 3, 2, 1))
            A = K.softmax(A)
            # Weighted sum of the values, then mask the output
            O_seq = K.batch_dot(A, V_seq, axes=[3, 2])
            O_seq = K.permute_dimensions(O_seq, (0, 2, 1, 3))
            O_seq = K.reshape(O_seq, (-1, K.shape(O_seq)[1], self.output_dim))
            O_seq = self.Mask(O_seq, Q_len, 'mul')
            return O_seq

        def compute_output_shape(self, input_shape):
            return (input_shape[0][0], input_shape[0][1], self.output_dim)
    Save the two code blocks above as Attention_keras.py
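    Optionally, a quick shape check of the two layers (a sketch that assumes Attention_keras.py is importable from the working directory and that the same old Keras 2.x API used above is installed):

    import numpy as np
    from keras.layers import Input
    from keras.models import Model
    from Attention_keras import Position_Embedding, Attention

    x_in = Input(shape=(None, 128))          # variable-length sequences of 128-d vectors
    x = Position_Embedding()(x_in)           # adds sinusoidal position information (same shape)
    x = Attention(8, 16)([x, x, x])          # self-attention: 8 heads x 16 dims = 128-d output
    check = Model(x_in, x)
    print(check.predict(np.random.rand(2, 10, 128)).shape)   # expected: (2, 10, 128)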

    Training the model
    Import packages and load the text data
    #%%
    from keras.preprocessing import sequence
    from keras.datasets import imdb
    from matplotlib import pyplot as plt
    import pandas as pd

    max_features = 20000

    print('Loading data...')

    (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)

    # Convert the labels to one-hot encoding
    y_train, y_test = pd.get_dummies(y_train),pd.get_dummies(y_test)

    print(len(x_train), 'train sequences')
    print(len(x_test), 'test sequences')
    Output (the dataset is downloaded on the first run; I have already downloaded it, so it loads directly):

    Using TensorFlow backend.
    Loading data...
    25000 train sequences
    25000 test sequences
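    If you want to see what these integer sequences look like as text, Keras also ships the IMDB word index; a small optional sketch (the offsets below follow imdb.load_data's defaults: 1 = start marker, 2 = out-of-vocabulary, real words shifted by 3):

    word_index = imdb.get_word_index()
    index_word = {i + 3: w for w, i in word_index.items()}
    index_word.update({1: '<START>', 2: '<UNK>'})
    print(' '.join(index_word.get(i, '?') for i in x_train[0][:30]))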
    Pad the sequences to a fixed length
    #%% Pad the sequences to a fixed length

    maxlen = 64


    print('Pad sequences (samples x time)')

    x_train = sequence.pad_sequences(x_train, maxlen=maxlen)

    x_test = sequence.pad_sequences(x_test, maxlen=maxlen)

    print('x_train shape:', x_train.shape)

    print('x_test shape:', x_test.shape)
    Output (pad_sequences truncates sequences longer than maxlen and pads shorter ones up to maxlen):

    Pad sequences (samples x time)
    x_train shape: (25000, 64)
    x_test shape: (25000, 64)
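    A tiny illustration of that behaviour (by default pad_sequences pads and truncates at the front of each sequence):

    from keras.preprocessing.sequence import pad_sequences
    print(pad_sequences([[1, 2, 3]], maxlen=5))           # [[0 0 1 2 3]]  zero-padded in front
    print(pad_sequences([[1, 2, 3, 4, 5, 6]], maxlen=5))  # [[2 3 4 5 6]]  truncated from the front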
    Define the network model

    batch_size = 5
    from keras.models import Model
    from keras.optimizers import SGD,Adam
    from keras.layers import *


    S_inputs = Input(shape=(None,), dtype='int32')

    embeddings = Embedding(max_features, 128)(S_inputs)
    embeddings = Position_Embedding()(embeddings) # Adding Position_Embedding gives a small accuracy boost

    O_seq = Attention(8,16)([embeddings,embeddings,embeddings])

    O_seq = GlobalAveragePooling1D()(O_seq)

    O_seq = Dropout(0.5)(O_seq)

    outputs = Dense(2, activation='softmax')(O_seq)


    model = Model(inputs=S_inputs, outputs=outputs)
    # try using different optimizers and different optimizer configs
    opt = Adam(lr=0.0005)
    loss = 'categorical_crossentropy'
    model.compile(loss=loss,
                  optimizer=opt,
                  metrics=['accuracy'])

    model.summary()
    Model summary output (the model is simple and has relatively few parameters):

    Layer (type)                    Output Shape         Param #     Connected to
    ==================================================================================================
    input_1 (InputLayer)            (None, None)         0
    __________________________________________________________________________________________________
    embedding_1 (Embedding)         (None, None, 128)    2560000     input_1[0][0]
    __________________________________________________________________________________________________
    position__embedding_1 (Position (None, None, 128)    0           embedding_1[0][0]
    __________________________________________________________________________________________________
    attention_1 (Attention)         (None, None, 128)    49152       position__embedding_1[0][0]
                                                                     position__embedding_1[0][0]
                                                                     position__embedding_1[0][0]
    __________________________________________________________________________________________________
    global_average_pooling1d_1 (Glo (None, 128)          0           attention_1[0][0]
    __________________________________________________________________________________________________
    dropout_1 (Dropout)             (None, 128)          0           global_average_pooling1d_1[0][0]
    __________________________________________________________________________________________________
    dense_1 (Dense)                 (None, 2)            258         dropout_1[0][0]
    ==================================================================================================
    Total params: 2,609,410
    Trainable params: 2,609,410
    Non-trainable params: 0
    __________________________________________________________________________________________________
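    The parameter count can be verified by hand: the Embedding layer holds 20000 × 128 = 2,560,000 weights, the Attention layer holds the three 128 × 128 projection matrices WQ, WK and WV (3 × 16,384 = 49,152), and the final Dense layer has 128 × 2 + 2 = 258, for a total of 2,609,410; Position_Embedding adds no trainable weights. Note also that the network uses only encoder-style self-attention followed by GlobalAveragePooling1D, which collapses the variable-length sequence into a single 128-d vector before the 2-way softmax.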
    Train and save the model

    #%%
    print('Train...')

    model.fit(x_train, y_train,
              batch_size=batch_size,
              epochs=2,
              validation_data=(x_test, y_test))

    model.save("imdb.h5")
    Output (after just 2 epochs the model already reaches over 80% validation accuracy):

    Train...
    Train on 25000 samples, validate on 25000 samples
    Epoch 1/2
    25000/25000 [==============================] - 95s 4ms/step - loss: 0.4826 - acc: 0.7499 - val_loss: 0.3663 - val_acc: 0.8353
    Epoch 2/2
    25000/25000 [==============================] - 93s 4ms/step - loss: 0.3084 - acc: 0.8680 - val_loss: 0.3983 - val_acc: 0.8163
    Save the code above as train.py
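    As a quick follow-up, here is a sketch that reuses the trained model and x_test from above to look at a few predictions (class 0 is negative and class 1 is positive, matching the column order produced by pd.get_dummies):

    import numpy as np

    probs = model.predict(x_test[:3])     # per-class probabilities from the 2-way softmax
    print(probs)
    print(np.argmax(probs, axis=1))       # predicted class per review: 0 = negative, 1 = positive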


  • Original article: https://www.cnblogs.com/hyhy904/p/11097323.html