  • Chinese Text Classification with a CNN (Fudan Chinese corpus)

    Earlier posts in this series:

    Chinese Text Classification with TfidfVectorizer (Fudan Chinese corpus)

    Chinese Text Classification with an RNN (Fudan Chinese corpus)

    In the previous post we classified Chinese text with an RNN (GRU); in this post we continue with a CNN.

    The data processing is unchanged; we have only swapped in a new model. The code is as follows:

    # coding: utf-8
    
    from __future__ import print_function
    
    import os
    import sys
    import time
    from datetime import timedelta
    import keras
    
    import numpy as np
    import tensorflow as tf
    from sklearn import metrics
    # Map each word in the vocabulary to its id
    def word2id():
      vocabulary_path = '/content/drive/My Drive/NLP/dataset/Fudan/vocabulary.txt'
      fp1 = open(vocabulary_path,'r',encoding='utf-8')
      word2id_dict = {}
      for i,line in enumerate(fp1.readlines()):
        word2id_dict[line.strip()] = i
      print(len(word2id_dict))
      fp1.close()
      return word2id_dict
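
    # vocabulary.txt is assumed to hold one word per line; the (0-based) line
    # number becomes the word's id, so the file order must match the order used
    # when the pre-trained embedding matrix was built.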
    
    # Read the texts and their corresponding labels
    def get_content_label(path):
      #data = '/content/drive/My Drive/NLP/dataset/Fudan/data/train_clean_jieba.txt'
      fp = open(path,'r',encoding='utf-8')
      content_list = []
      label_list = []
      for line in fp.readlines():
        line = line.strip().split('\t')
        if len(line) == 2:
          content_list.append(line[0])
          label_list.append(line[1])
      print(content_list[:5])
      print(label_list[:5])
      fp.close()
      return content_list,label_list
    # Map each label name to its id
    def get_label_id():
      label = '/content/drive/My Drive/NLP/dataset/Fudan/label.txt'
      label2id_dict = {}
      fp = open(label,'r',encoding='utf-8')
      for line in fp.readlines():
        line = line.strip().split('\t')
        label2id_dict[line[0]] = line[1]
      #print(label2id_dict)
      return label2id_dict
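
    # label.txt is assumed to hold one "label\tid" pair per line, e.g. a
    # hypothetical line "C3-Art\t0"; note the ids are strings, not ints.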
    # Replace each word in a text with its id, capping texts at a maximum length
    # One-hot encode the labels
    def process(path,max_length):
      contents,labels = get_content_label(path)
      word_to_id = word2id()
      cat_to_id = get_label_id()
      data_id = []
      label_id = []
      for i in range(len(contents)):
        data_id.append([word_to_id[x] for x in contents[i] if x in word_to_id])
        label_id.append(cat_to_id[labels[i]])
    
      # Use keras' pad_sequences to pad every text to a fixed length
      x_pad = keras.preprocessing.sequence.pad_sequences(data_id, max_length)
      y_pad = keras.utils.to_categorical(label_id, num_classes=len(cat_to_id))  # one-hot encode the labels
      return x_pad,y_pad
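
    # A toy illustration of what process() produces:
    #   pad_sequences pre-pads/truncates each id sequence to max_length, and
    #   to_categorical one-hot encodes the label ids, e.g.
    #     pad_sequences([[3, 7]], 4)         -> [[0, 0, 3, 7]]
    #     to_categorical([1], num_classes=3) -> [[0., 1., 0.]]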
    
    def batch_iter(x, y, batch_size=64):
        """生成批次数据"""
        data_len = len(x)
        num_batch = int((data_len - 1) / batch_size) + 1
    
        indices = np.random.permutation(np.arange(data_len))
        x_shuffle = x[indices]
        y_shuffle = y[indices]
    
        for i in range(num_batch):
            start_id = i * batch_size
            end_id = min((i + 1) * batch_size, data_len)
            yield x_shuffle[start_id:end_id], y_shuffle[start_id:end_id]
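
    # Note: the last batch of a pass may be smaller than batch_size; evaluate()
    # below therefore weights each batch's loss and accuracy by its true length.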
    
    def evaluate(sess, x_, y_):
        """评估在某一数据上的准确率和损失"""
        data_len = len(x_)
        batch_eval = batch_iter(x_, y_, 128)
        total_loss = 0.0
        total_acc = 0.0
        for x_batch, y_batch in batch_eval:
            batch_len = len(x_batch)
            feed_dict = feed_data(x_batch, y_batch, 1.0)
            loss, acc = sess.run([model.loss, model.acc], feed_dict=feed_dict)
            total_loss += loss * batch_len
            total_acc += acc * batch_len
    
        return total_loss / data_len, total_acc / data_len
    
    def get_time_dif(start_time):
        """获取已使用时间"""
        end_time = time.time()
        time_dif = end_time - start_time
        return timedelta(seconds=int(round(time_dif)))
    
    
    def feed_data(x_batch, y_batch, keep_prob):
        feed_dict = {
            model.input_x: x_batch,
            model.input_y: y_batch,
            model.keep_prob: keep_prob
        }
        return feed_dict
    
    
    def get_training_word2vec_vectors(filename):
      with np.load(filename) as data:
        return data["embeddings"]
    
    class TCNNConfig(object):
        """CNN hyperparameters."""

        embedding_dim = 100  # word-vector dimension
        seq_length = 600  # sequence length
        num_classes = 20  # number of classes
        num_filters = 256  # number of convolution filters
        kernel_size = 5  # convolution kernel size
        vocab_size = 183664  # vocabulary size

        hidden_dim = 128  # units in the fully connected layer

        dropout_keep_prob = 0.5  # dropout keep probability
        learning_rate = 1e-3  # learning rate

        batch_size = 64  # training batch size
        num_epochs = 10  # total number of epochs

        print_per_batch = 20  # report metrics every this many batches
        save_per_batch = 10  # write to tensorboard every this many batches
        pre_training = None  # pre-trained embedding matrix, assigned in __main__
        vector_word_npz = '/content/drive/My Drive/NLP/dataset/Fudan/vector_word.npz'
    
    
    class TextCNN(object):
        """文本分类,CNN模型"""
    
        def __init__(self, config):
            self.config = config
    
            # Three input placeholders
            self.input_x = tf.placeholder(tf.int32, [None, self.config.seq_length], name='input_x')
            self.input_y = tf.placeholder(tf.float32, [None, self.config.num_classes], name='input_y')
            self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    
            self.cnn()
    
        def cnn(self):
            """Build the CNN graph."""
            # Map token ids to their pre-trained word vectors
            with tf.device('/cpu:0'):
                #embedding = tf.get_variable('embedding', [self.config.vocab_size, self.config.embedding_dim])
                embedding = tf.get_variable("embeddings", shape=[self.config.vocab_size, self.config.embedding_dim],
                                                 initializer=tf.constant_initializer(self.config.pre_training))
                embedding_inputs = tf.nn.embedding_lookup(embedding, self.input_x)
                
    
            with tf.name_scope("cnn"):
                # CNN layer
                conv = tf.layers.conv1d(embedding_inputs, self.config.num_filters, self.config.kernel_size, name='conv')
                # global max pooling layer
                gmp = tf.reduce_max(conv, axis=1, name='gmp')
    
            with tf.name_scope("score"):
                # Fully connected layer, followed by dropout and ReLU activation
                fc = tf.layers.dense(gmp, self.config.hidden_dim, name='fc1')
                fc = tf.contrib.layers.dropout(fc, self.keep_prob)
                fc = tf.nn.relu(fc)
    
                # Classifier
                self.logits = tf.layers.dense(fc, self.config.num_classes, name='fc2')
                self.y_pred_cls = tf.argmax(tf.nn.softmax(self.logits), 1)  # predicted class
    
            with tf.name_scope("optimize"):
                # Cross-entropy loss
                cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.input_y)
                self.loss = tf.reduce_mean(cross_entropy)
                # Optimizer
                self.optim = tf.train.AdamOptimizer(learning_rate=self.config.learning_rate).minimize(self.loss)
    
            with tf.name_scope("accuracy"):
                # Accuracy
                correct_pred = tf.equal(tf.argmax(self.input_y, 1), self.y_pred_cls)
                self.acc = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
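
        # Shape walk-through under the default config (batch size B):
        #   input_x          (B, 600)       token ids
        #   embedding_inputs (B, 600, 100)  after the embedding lookup
        #   conv             (B, 596, 256)  conv1d, kernel_size 5, VALID padding
        #   gmp              (B, 256)       global max pool over the time axis
        #   fc               (B, 128)       dense + dropout + ReLU
        #   logits           (B, 20)        class scores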
    
    def train():
        print("Configuring TensorBoard and Saver...")
        # Configure TensorBoard; delete the tensorboard folder before retraining, otherwise the new graphs overlay the old ones
        tensorboard_dir = 'tensorboard/textcnn'
        if not os.path.exists(tensorboard_dir):
            os.makedirs(tensorboard_dir)
    
        tf.summary.scalar("loss", model.loss)
        tf.summary.scalar("accuracy", model.acc)
        merged_summary = tf.summary.merge_all()
        writer = tf.summary.FileWriter(tensorboard_dir)
        save_dir = 'checkpoint/textcnn/'
        save_path = os.path.join(save_dir, 'best_validation')  # path of the best validation checkpoint
        # Configure the Saver
        saver = tf.train.Saver()
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
    
        print("Loading training and validation data...")
        # Load the training and validation sets
        start_time = time.time()
        train_dir = '/content/drive/My Drive/NLP/dataset/Fudan/data/train_clean_jieba.txt'
        val_dir = '/content/drive/My Drive/NLP/dataset/Fudan/data/test_clean_jieba.txt'
        x_train, y_train = process(train_dir, config.seq_length)
        x_val, y_val = process(val_dir, config.seq_length)
        time_dif = get_time_dif(start_time)
        print("Time usage:", time_dif)
    
        # Create the session
        session = tf.Session()
        session.run(tf.global_variables_initializer())
        writer.add_graph(session.graph)
    
        print('Training and evaluating...')
        start_time = time.time()
        total_batch = 0  # total number of batches processed
        best_acc_val = 0.0  # best validation accuracy so far
        last_improved = 0  # batch index of the last improvement
        require_improvement = 1000  # stop early after 1000 batches without improvement
    
        flag = False
        for epoch in range(config.num_epochs):
            print('Epoch:', epoch + 1)
            batch_train = batch_iter(x_train, y_train, config.batch_size)
            for x_batch, y_batch in batch_train:
                feed_dict = feed_data(x_batch, y_batch, config.dropout_keep_prob)
    
                if total_batch % config.save_per_batch == 0:
                    # periodically write training summaries to the tensorboard scalars
                    s = session.run(merged_summary, feed_dict=feed_dict)
                    writer.add_summary(s, total_batch)
    
                if total_batch % config.print_per_batch == 0:
                    # periodically report performance on the training and validation sets
                    feed_dict[model.keep_prob] = 1.0
                    loss_train, acc_train = session.run([model.loss, model.acc], feed_dict=feed_dict)
                    loss_val, acc_val = evaluate(session, x_val, y_val)  # todo
    
                    if acc_val > best_acc_val:
                        # save the best result
                        best_acc_val = acc_val
                        last_improved = total_batch
                        saver.save(sess=session, save_path=save_path)
                        improved_str = '*'
                    else:
                        improved_str = ''
    
                    time_dif = get_time_dif(start_time)
                    msg = ('Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%},'
                           ' Val Loss: {3:>6.2}, Val Acc: {4:>7.2%}, Time: {5} {6}')
                    print(msg.format(total_batch, loss_train, acc_train, loss_val, acc_val, time_dif, improved_str))
    
                feed_dict[model.keep_prob] = config.dropout_keep_prob
                session.run(model.optim, feed_dict=feed_dict)  # run one optimization step
                total_batch += 1
    
                if total_batch - last_improved > require_improvement:
                    # Validation accuracy has stopped improving: end training early
                    print("No optimization for a long time, auto-stopping...")
                    flag = True
                    break  # break out of the batch loop
            if flag:  # break out of the epoch loop as well
                break
    
    
    def test():
        print("Loading test data...")
        start_time = time.time()
        test_dir = '/content/drive/My Drive/NLP/dataset/Fudan/data/test_clean_jieba.txt'
        x_test, y_test = process(test_dir, config.seq_length)
        save_path = 'checkpoint/textcnn/best_validation'
    
        session = tf.Session()
        session.run(tf.global_variables_initializer())
    
        saver = tf.train.Saver()
        saver.restore(sess=session, save_path=save_path)  # restore the saved model
    
        print('Testing...')
        loss_test, acc_test = evaluate(session, x_test, y_test)
        msg = 'Test Loss: {0:>6.2}, Test Acc: {1:>7.2%}'
        print(msg.format(loss_test, acc_test))
    
        batch_size = 128
        data_len = len(x_test)
        num_batch = int((data_len - 1) / batch_size) + 1
    
        y_test_cls = np.argmax(y_test, 1)
        y_pred_cls = np.zeros(shape=len(x_test), dtype=np.int32)  # holds the predictions
        for i in range(num_batch):  # process batch by batch
            start_id = i * batch_size
            end_id = min((i + 1) * batch_size, data_len)
            feed_dict = {
                model.input_x: x_test[start_id:end_id],
                model.keep_prob: 1.0
            }
            y_pred_cls[start_id:end_id] = session.run(model.y_pred_cls, feed_dict=feed_dict)
        categories = list(get_label_id().values())  # the id strings double as display names here
        # Evaluation
        print("Precision, Recall and F1-Score...")
        print(metrics.classification_report(y_test_cls, y_pred_cls, target_names=categories))
    
        # Confusion matrix
        print("Confusion Matrix...")
        cm = metrics.confusion_matrix(y_test_cls, y_pred_cls)
        print(cm)
    
        time_dif = get_time_dif(start_time)
        print("Time usage:", time_dif)
    
    
    if __name__ == '__main__':
      print('Configuring CNN model...')
      config = TCNNConfig()
      config.pre_training = get_training_word2vec_vectors(config.vector_word_npz)
      model = TextCNN(config)
      train()  # train first so that test() has a checkpoint to restore
      test()
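
    As an aside, here is a minimal inference sketch (not part of the original post) showing how the saved checkpoint could be reused for prediction. predict_file() is a hypothetical helper; it reuses process(), so the input file must be tab-separated and labeled:

    def predict_file(path):
        """Sketch: classify a labeled, tab-separated file with the best checkpoint."""
        x, _ = process(path, config.seq_length)
        session = tf.Session()
        saver = tf.train.Saver()
        saver.restore(sess=session, save_path='checkpoint/textcnn/best_validation')
        # keep_prob=1.0 disables dropout at inference time
        return session.run(model.y_pred_cls,
                           feed_dict={model.input_x: x, model.keep_prob: 1.0})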

    The training log looks like this:

    Epoch: 8
    Iter:   1080, Train Loss:   0.13, Train Acc:  95.31%, Val Loss:   0.44, Val Acc:  87.19%, Time: 0:04:33 
    Iter:   1100, Train Loss:   0.24, Train Acc:  95.31%, Val Loss:   0.44, Val Acc:  87.03%, Time: 0:04:38 
    Iter:   1120, Train Loss:   0.19, Train Acc:  93.75%, Val Loss:   0.43, Val Acc:  87.38%, Time: 0:04:42 
    Iter:   1140, Train Loss:   0.17, Train Acc:  92.19%, Val Loss:   0.42, Val Acc:  87.80%, Time: 0:04:47 *
    Iter:   1160, Train Loss:   0.21, Train Acc:  90.62%, Val Loss:   0.41, Val Acc:  87.89%, Time: 0:04:53 *
    Iter:   1180, Train Loss:   0.34, Train Acc:  89.06%, Val Loss:   0.43, Val Acc:  87.57%, Time: 0:04:57 
    Iter:   1200, Train Loss:   0.22, Train Acc:  92.19%, Val Loss:   0.41, Val Acc:  87.62%, Time: 0:05:01 
    Iter:   1220, Train Loss:   0.24, Train Acc:  90.62%, Val Loss:   0.41, Val Acc:  87.87%, Time: 0:05:06 
    Epoch: 9
    Iter:   1240, Train Loss:  0.096, Train Acc:  95.31%, Val Loss:    0.4, Val Acc:  88.34%, Time: 0:05:11 *
    Iter:   1260, Train Loss:   0.21, Train Acc:  92.19%, Val Loss:   0.41, Val Acc:  87.98%, Time: 0:05:16 
    Iter:   1280, Train Loss:   0.13, Train Acc:  95.31%, Val Loss:   0.42, Val Acc:  88.14%, Time: 0:05:20 
    Iter:   1300, Train Loss:    0.1, Train Acc:  98.44%, Val Loss:   0.43, Val Acc:  87.76%, Time: 0:05:25 
    Iter:   1320, Train Loss:   0.27, Train Acc:  92.19%, Val Loss:   0.39, Val Acc:  87.93%, Time: 0:05:29 
    Iter:   1340, Train Loss:   0.19, Train Acc:  92.19%, Val Loss:   0.45, Val Acc:  87.67%, Time: 0:05:33 
    Iter:   1360, Train Loss:   0.27, Train Acc:  92.19%, Val Loss:   0.42, Val Acc:  87.57%, Time: 0:05:38 
    Iter:   1380, Train Loss:   0.17, Train Acc:  92.19%, Val Loss:   0.41, Val Acc:  88.07%, Time: 0:05:42 
    Epoch: 10
    Iter:   1400, Train Loss:    0.1, Train Acc:  98.44%, Val Loss:   0.39, Val Acc:  88.64%, Time: 0:05:47 *
    Iter:   1420, Train Loss:  0.069, Train Acc:  96.88%, Val Loss:    0.4, Val Acc:  88.46%, Time: 0:05:51 
    Iter:   1440, Train Loss:   0.15, Train Acc:  98.44%, Val Loss:   0.41, Val Acc:  88.16%, Time: 0:05:56 
    Iter:   1460, Train Loss:  0.073, Train Acc:  98.44%, Val Loss:    0.4, Val Acc:  88.38%, Time: 0:06:00 
    Iter:   1480, Train Loss:   0.16, Train Acc:  95.31%, Val Loss:   0.42, Val Acc:  88.12%, Time: 0:06:05 
    Iter:   1500, Train Loss:   0.21, Train Acc:  92.19%, Val Loss:   0.41, Val Acc:  87.79%, Time: 0:06:09 
    Iter:   1520, Train Loss:   0.16, Train Acc:  93.75%, Val Loss:   0.41, Val Acc:  88.03%, Time: 0:06:13 

    Then we run the test; the results are as follows:

    Testing...
    2020-10-19 12:51:46.979827: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10
    2020-10-19 12:51:47.221023: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudnn.so.7
    Test Loss:   0.39, Test Acc:  88.64%
    Precision, Recall and F1-Score...
    /usr/local/lib/python3.6/dist-packages/sklearn/metrics/_classification.py:1272: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
      _warn_prf(average, modifier, msg_start, len(result))
                  precision    recall  f1-score   support
    
               0       0.33      0.05      0.09        61
               1       0.89      0.96      0.93      1022
               2       0.39      0.15      0.22        59
               3       0.89      0.95      0.92      1254
               4       0.33      0.08      0.12        52
               5       0.83      0.90      0.86      1026
               6       0.95      0.98      0.96      1358
               7       0.67      0.04      0.08        45
               8       0.39      0.28      0.32        76
               9       0.85      0.94      0.89       742
              10       0.00      0.00      0.00        34
              11       0.00      0.00      0.00        28
              12       0.96      0.96      0.96      1218
              13       0.87      0.92      0.89       642
              14       0.50      0.15      0.23        33
              15       0.67      0.07      0.13        27
              16       0.91      0.91      0.91      1601
              17       0.86      0.11      0.20        53
              18       0.00      0.00      0.00        34
              19       0.74      0.69      0.72       468
    
        accuracy                           0.89      9833
       macro avg       0.60      0.46      0.47      9833
    weighted avg       0.87      0.89      0.87      9833
    
    Confusion Matrix...
    [[   3    1    0   42    0    5    0    0    4    3    0    0    0    2
         0    0    1    0    0    0]
     [   0  983    0    5    0    1    0    0    0    0    0    0    8    3
         0    0   14    1    0    7]
     [   1    2    9    3    0    4    2    0    3    1    0    0    2   15
         3    0   13    0    0    1]
     [   0    3    0 1195    0   12    2    0    0   16    0    0    3    2
         0    0    8    0    0   13]
     [   0    6    1    1    4   14    5    0    5    0    0    0    1    1
         0    0   14    0    0    0]
     [   0    7    0   16    0  924    1    0    3    5    0    0    1    0
         0    0   39    0    0   30]
     [   0    1    0    3    0    0 1328    1    1    0    0    0    1   17
         0    0    5    0    0    1]
     [   0    0    0   13    0   12    0    2    0    8    0    0    1    2
         0    0    0    0    0    7]
     [   2    1    1    7    0   39    0    0   21    0    0    0    0    4
         0    0    0    0    0    1]
     [   0    1    0   10    0   10    1    0    1  696    0    0    0    0
         0    0    3    0    0   20]
     [   0    0    0    4    0    0    0    0    0   15    0    0    0    1
         0    0    1    0    0   13]
     [   0    0    0    2    1    0    5    0    2    0    0    0    0   10
         1    0    7    0    0    0]
     [   0   11    0    1    1    1    8    0    3    0    0    0 1175    6
         0    0    7    0    0    5]
     [   0    0    0    6    0    0   31    0    0    1    0    0   12  589
         0    0    3    0    0    0]
     [   0    2    4    1    1    1    0    0    1    0    0    0    4    6
         5    1    7    0    0    0]
     [   0    0    2    1    0    1    6    0    0    0    0    0    0   11
         0    2    4    0    0    0]
     [   0   70    2   10    2   39    5    0    2    2    0    0    7    0
         0    0 1451    0    0   11]
     [   3    4    0   10    3   12    0    0    6    3    0    0    0    0
         0    0    5    6    0    1]
     [   0    7    4    0    0    1    0    0    1    1    0    0    6    5
         1    0    7    0    0    1]
     [   0    4    0    7    0   43    5    0    1   72    0    0    1    1
         0    0   11    0    0  323]]
    Time usage: 0:00:13

    With that, the full pipeline, from data processing to model training and testing, is complete for classical TF-IDF + naive Bayes, RNNs (LSTM, GRU), and a CNN. Next I plan to work on Transformer and BERT, so stay tuned.

    References:

    https://github.com/gaussic/text-classification-cnn-rnn

  • Original post: https://www.cnblogs.com/xiximayou/p/13842312.html