  • Sentiment Analysis with BERT in Keras

    A previous post, "BERT in Practice — with Keras", introduced two libraries: keras_bert and bert4keras.

    However, because bert4keras is still under active development, some function names and module locations have changed since then, and that post only used bert4keras for sentiment analysis.

    So this new post walks through both libraries, using the latest version of bert4keras.

    bert4keras version used in this post: as of 2019-11-09.

    Since bert4keras may keep changing, stick with keras_bert for now if you need stability.

    Dataset:

    Original GitHub link: https://github.com/bojone/bert4keras/tree/master/examples/datasets

    Personal cloud drive: https://pan.baidu.com/s/1OAhNbRYpU1HW25_vChdRng (extraction code: uxax)

    Using keras_bert

    Configure the hyperparameters, import the required packages, and set the file paths.

    import json
    import numpy as np
    import pandas as pd
    from keras_bert import load_trained_model_from_checkpoint, load_vocabulary, Tokenizer
    
    # Hyperparameters
    maxlen = 100
    batch_size = 16
    dropout_rate = 0.5
    learning_rate = 1e-5
    epochs = 15
    path_prefix = "./test"
    
    # Pretrained model directory
    config_path = path_prefix + "/chinese_L-12_H-768_A-12/bert_config.json"
    checkpoint_path = path_prefix + "/chinese_L-12_H-768_A-12/bert_model.ckpt"
    dict_path = path_prefix + "/chinese_L-12_H-768_A-12/vocab.txt"

    Read the data and build the training samples.

    # Read the data
    neg = pd.read_excel(path_prefix + "/data/neg.xls", header=None)
    pos = pd.read_excel(path_prefix + "/data/pos.xls", header=None)
    
    # Build the training data: (text, label) pairs, 0 = negative, 1 = positive
    data = []
    
    for d in neg[0]:
        data.append((d, 0))
    
    for d in pos[0]:
        data.append((d, 1))

    Load the vocabulary and build the tokenizer.

    # Load the vocabulary
    token_dict = load_vocabulary(dict_path)
    # Build the tokenizer
    tokenizer = Tokenizer(token_dict)
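
    As a quick sanity check (an illustrative addition, not from the original post; the exact ids depend on vocab.txt), the tokenizer wraps the text in [CLS] and [SEP] and returns token ids plus segment ids:

    # Illustrative only: actual ids depend on vocab.txt
    indices, segments = tokenizer.encode(first='这部电影很好看')
    print(len(indices))  # number of characters + 2, for the added [CLS] and [SEP]
    print(segments)      # all zeros, since there is only one input sentence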

    Split into training and validation sets.

    # Split into training and validation sets at a 9:1 ratio
    random_order = list(range(len(data)))
    np.random.shuffle(random_order)
    train_data = [data[j] for i, j in enumerate(random_order) if i % 10 != 0]
    valid_data = [data[j] for i, j in enumerate(random_order) if i % 10 == 0]
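
    Note that the split above changes on every run because the permutation is not cached. An optional tweak (not in the original code) is to seed NumPy before the shuffle; the bert4keras section below solves the same problem by caching the permutation to random_order.json.

    # Optional: seed the RNG before np.random.shuffle for a reproducible split
    np.random.seed(2019)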

    Sequence padding and the training data generator.

    def seq_padding(X, padding=0):
        # Pad every sequence in the batch to the length of the longest one
        L = [len(x) for x in X]
        ML = max(L)
        return np.array([
            np.concatenate([x, [padding] * (ML - len(x))]) if len(x) < ML else x for x in X
        ])
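    
    # For example, seq_padding([[1, 2, 3], [4, 5]]) returns
    # array([[1, 2, 3],
    #        [4, 5, 0]])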
    
    
    class data_generator:
        def __init__(self, data, batch_size=batch_size):
            self.data = data
            self.batch_size = batch_size
            self.steps = len(self.data) // self.batch_size
            if len(self.data) % self.batch_size != 0:
                self.steps += 1
        def __len__(self):
            return self.steps
        def __iter__(self):
            while True:
                # Reshuffle the sample order at the start of each pass
                idxs = list(range(len(self.data)))
                np.random.shuffle(idxs)
                X1, X2, Y = [], [], []
                for i in idxs:
                    d = self.data[i]
                    text = d[0][:maxlen]
                    # token ids and segment ids for a single sentence
                    x1, x2 = tokenizer.encode(first=text)
                    y = d[1]
                    X1.append(x1)
                    X2.append(x2)
                    Y.append([y])
                    # Emit a padded batch when full, or at the end of the pass
                    if len(X1) == self.batch_size or i == idxs[-1]:
                        X1 = seq_padding(X1)
                        X2 = seq_padding(X2)
                        Y = seq_padding(Y)
                        yield [X1, X2], Y
                        X1, X2, Y = [], [], []

    Load the BERT model and add a dense layer for prediction.

    from keras.layers import *
    from keras.models import Model
    import keras.backend as K
    from keras.optimizers import Adam
    
    # trainable=True fine-tunes the BERT weights;
    # by default the BERT weights are left frozen
    bert_model = load_trained_model_from_checkpoint(config_path, checkpoint_path, trainable=True)
    
    x1_in = Input(shape=(None,))  # token ids
    x2_in = Input(shape=(None,))  # segment ids
    
    x = bert_model([x1_in, x2_in])
    # Take the output at the [CLS] position as the sentence representation
    x = Lambda(lambda x: x[:, 0])(x)
    x = Dropout(dropout_rate)(x)
    p = Dense(1, activation='sigmoid')(x)
    
    model = Model([x1_in, x2_in], p)
    model.compile(
        loss='binary_crossentropy',
        optimizer=Adam(learning_rate),
        metrics=['accuracy']
    )
    model.summary()

    Start training.

    train_D = data_generator(train_data)
    valid_D = data_generator(valid_data)
    
    model.fit_generator(
        train_D.__iter__(),
        steps_per_epoch=len(train_D),
        epochs=epochs,
        validation_data=valid_D.__iter__(),
        validation_steps=len(valid_D)
    )
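
    The original post stops after training, but scoring a new sentence only requires repeating the generator's preprocessing. The helper below is a hypothetical sketch (predict_sentiment is not part of the original code):

    # Hypothetical inference helper, not in the original post
    def predict_sentiment(text):
        x1, x2 = tokenizer.encode(first=text[:maxlen])
        prob = model.predict([np.array([x1]), np.array([x2])])[0][0]
        return prob  # close to 1 => positive, close to 0 => negative
    
    print(predict_sentiment('质量很好,非常满意'))  # expect a value near 1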

    Using bert4keras

    To guard against further bert4keras changes making this code incompatible with the latest version, the update time is recorded here.

    Last updated: 2019-11-09

    Configure the hyperparameters, import the required packages, and set the pretrained model paths.

    import json
    import numpy as np
    import pandas as pd
    import os
    from bert4keras.bert import build_bert_model
    from bert4keras.backend import set_gelu
    from bert4keras.utils import Tokenizer, load_vocab
    set_gelu('tanh')  # switch the gelu implementation
    
    # Hyperparameters
    maxlen = 100
    batch_size = 16
    dropout_rate = 0.5
    learning_rate = 1e-5
    epochs = 15
    path_prefix = "./test"
    # Pretrained model paths
    config_path = path_prefix + "/chinese_L-12_H-768_A-12/bert_config.json"
    checkpoint_path = path_prefix + "/chinese_L-12_H-768_A-12/bert_model.ckpt"
    dict_path = path_prefix + "/chinese_L-12_H-768_A-12/vocab.txt"

    Read the data and build the training samples.

    # Read the data
    neg = pd.read_excel(path_prefix + "/data/neg.xls", header=None)
    pos = pd.read_excel(path_prefix + "/data/pos.xls", header=None)
    
    data, tokens = [], {}
    # Load the full vocabulary
    _token_dict = load_vocab(dict_path)
    # Build a temporary tokenizer
    _tokenizer = Tokenizer(_token_dict)
    
    # Build (text, label) samples and count token frequencies for the trimming step below
    for d in neg[0]:
        data.append((d, 0))
        for t in _tokenizer.tokenize(d):
            tokens[t] = tokens.get(t, 0) + 1
    
    for d in pos[0]:
        data.append((d, 1))
        for t in _tokenizer.tokenize(d):
            tokens[t] = tokens.get(t, 0) + 1

    Trim the vocabulary, keeping only the tokens this task actually uses.

    # Keep only tokens that occur at least 4 times
    tokens = {i: j for i, j in tokens.items() if j >= 4}
    # token_dict is the vocabulary this task actually needs
    # keep_words holds the indices of the retained tokens in BERT's original vocabulary
    token_dict, keep_words = {}, []
    
    for t in ['[PAD]', '[UNK]', '[CLS]', '[SEP]']:
        token_dict[t] = len(token_dict)
        keep_words.append(_token_dict[t])
    
    for t in tokens:
        if t in _token_dict and t not in token_dict:
            token_dict[t] = len(token_dict)
            keep_words.append(_token_dict[t])
    
    # Build the tokenizer
    tokenizer = Tokenizer(token_dict)
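
    A quick sanity check (an optional addition, not in the original post): the trimmed vocabulary should be far smaller than the full ~21128-entry BERT vocabulary, which is exactly what shrinks the embedding table:

    # Optional check; the exact numbers depend on the data
    print(len(_token_dict), '->', len(token_dict))
    assert len(keep_words) == len(token_dict)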

    Split into training and validation sets.

    # Cache the shuffle order so the split is reproducible across runs
    if not os.path.exists('./random_order.json'):
        random_order = list(range(len(data)))
        np.random.shuffle(random_order)
        json.dump(
            random_order,
            open('./random_order.json', 'w'),
            indent=4
        )
    else:
        random_order = json.load(open('./random_order.json'))
    
    # Split into training and validation sets at a 9:1 ratio
    train_data = [data[j] for i, j in enumerate(random_order) if i % 10 != 0]
    valid_data = [data[j] for i, j in enumerate(random_order) if i % 10 == 0]

    Padding and the generator.

    def seq_padding(X, padding=0):
        # Pad every sequence in the batch to the length of the longest one
        L = [len(x) for x in X]
        ML = max(L)
        return np.array([
            np.concatenate([x, [padding] * (ML - len(x))]) if len(x) < ML else x for x in X
        ])
    
    
    class data_generator:
        def __init__(self, data, batch_size=batch_size):
            self.data = data
            self.batch_size = batch_size
            self.steps = len(self.data) // self.batch_size
            if len(self.data) % self.batch_size != 0:
                self.steps += 1
        def __len__(self):
            return self.steps
        def __iter__(self):
            while True:
                idxs = list(range(len(self.data)))
                np.random.shuffle(idxs)
                X1, X2, Y = [], [], []
                for i in idxs:
                    d = self.data[i]
                    text = d[0][:maxlen]
                    # unlike keras_bert, bert4keras's Tokenizer.encode takes the text positionally
                    x1, x2 = tokenizer.encode(text)
                    y = d[1]
                    X1.append(x1)
                    X2.append(x2)
                    Y.append([y])
                    if len(X1) == self.batch_size or i == idxs[-1]:
                        X1 = seq_padding(X1)
                        X2 = seq_padding(X2)
                        Y = seq_padding(Y)
                        yield [X1, X2], Y
                        X1, X2, Y = [], [], []

    Load the BERT model and add a dense layer for prediction.

    from keras.layers import *
    from keras.models import Model
    import keras.backend as K
    from keras.optimizers import Adam
    
    model = build_bert_model(
        config_path,
        checkpoint_path,
        # Keep only the tokens in keep_words, trimming the original vocabulary
        keep_words=keep_words,
    )
    
    # Take the output at the [CLS] position as the sentence representation
    output = Lambda(lambda x: x[:, 0])(model.output)
    output = Dropout(dropout_rate)(output)
    output = Dense(1, activation='sigmoid')(output)
    model = Model(model.input, output)
    
    model.compile(
        loss='binary_crossentropy',
        optimizer=Adam(learning_rate),
        metrics=['accuracy']
    )
    model.summary()

    Start training.

    train_D = data_generator(train_data)
    valid_D = data_generator(valid_data)
    
    model.fit_generator(
        train_D.__iter__(),
        steps_per_epoch=len(train_D),
        epochs=epochs,
        validation_data=valid_D.__iter__(),
        validation_steps=len(valid_D)
    )
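
    After training you will typically want to persist the fine-tuned weights and score new text. Both snippets below are hypothetical additions built on the standard Keras API, not part of the original post; remember that the tokenizer here was built from the trimmed token_dict, so the same trimmed vocabulary must be reconstructed (or saved) before encoding at inference time.

    # Hypothetical additions, not in the original post
    model.save_weights(path_prefix + '/sentiment_model.h5')  # standard Keras API
    
    def predict_sentiment(text):
        x1, x2 = tokenizer.encode(text[:maxlen])
        return model.predict([np.array([x1]), np.array([x2])])[0][0]
    
    print(predict_sentiment('质量很好,非常满意'))  # a value near 1 means positive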
  • Original article: https://www.cnblogs.com/dogecheng/p/11824494.html