  PyTorch multi-class text classification

    Based on https://github.com/jiangqy/LSTM-Classification-pytorch, with modifications.

    1. Requirement: SMS text classification

    1.1 Raw data

    The texts are mostly English, manually labeled into four classes: 0, 1, 2, 3.
    Text length: at most 300 words.
    The data is already preprocessed: all non-letter characters were removed, leaving only letters separated by spaces.
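
    The cleaning step itself is not shown. A minimal sketch of such a preprocessing function, assuming the raw text sits in a hypothetical review column, could look like this:

    import re

    def clean_text(text):
        """Keep letters only; collapse every other character run into a single space."""
        return re.sub(r"[^A-Za-z]+", " ", str(text)).strip()

    # hypothetical usage: df['clean_review'] = df['review'].apply(clean_text)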

    df = pd.read_csv('./data/labeled.csv')
    df = df[['clean_review', 'cat_id']]
    df.sample(10)
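
    Before building samples it is worth a quick look at the class balance and length statistics (column names as above):

    print(df['cat_id'].value_counts())                          # samples per class
    print(df['clean_review'].str.split().str.len().describe())  # words per text; max should be <= 300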
    

    2. Constructing the training samples

    2.1 Features and labels

    import torch
    from torch.utils.data.dataset import Dataset
    import numpy as np
    import pandas as pd
    
    class Dictionary(object):
        def __init__(self):
            self.word2idx = {}
            self.idx2word = []
    
        def add_word(self, word):
            if word not in self.word2idx:
                self.idx2word.append(word)
                self.word2idx[word] = len(self.idx2word) - 1
            return self.word2idx[word]
    
        def __len__(self):
            return len(self.idx2word)
    
    class Corpus(object):
    
        def __init__(self, data_path, sen_len):
            self.dictionary = Dictionary()
            self.dictionary.add_word("UNK")  # samples are padded to a fixed length with 0, and index 0 maps to "UNK"
            self.texts, self.labels = self.tokenize(data_path, sen_len)
    
        def tokenize(self, data_path, sen_len):
            """
            Build the dictionary and convert each text from words to index tensors.
            :param data_path: path to the labeled CSV file
            :param sen_len: fixed sample length in words
            :return:
            """
            df = pd.read_csv(data_path)
            token_text = []
            tokens = 0
            labels = []
            for item in df.iterrows():
                line = item[1]["clean_review"]  # value of the clean_review field in this row
                labels.append(int(item[1]["cat_id"]))  # value of the cat_id field, i.e. this sample's label
                words = line.split(" ") 
    
                tokens += len(words)
                for word in words:
                    word = word.strip()
                    if word:
                        self.dictionary.add_word(word)
    
                txt = torch.LongTensor(np.zeros(sen_len, dtype=np.int64))  # zero tensor of length sen_len; zeros act as padding
                for index,word in enumerate(words[:sen_len]):
                    word = word.strip()
                    if word:
                        txt[index] = self.dictionary.word2idx[word]
                token_text.append(txt)
    
            return token_text,labels
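
    Dictionary.add_word assigns each new word the next free index and returns the existing index for repeated words; texts longer than sen_len are truncated and shorter ones stay padded with 0 ("UNK"). A short usage sketch:

    corpus = Corpus("./data/clean_review.csv", 300)
    print(len(corpus.dictionary))  # vocabulary size, "UNK" included at index 0
    print(corpus.texts[0].shape)   # torch.Size([300]), one padded sample
    print(corpus.labels[:5])       # the first five labels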
    

    2.2 Custom Dataset

    class LSTMDataset(Dataset):
    
        def __init__(self, sen_len, corpus):
            self.token_text = corpus.texts
            self.labels = corpus.labels
            self.sen_len = sen_len
    
        def __getitem__(self, index):
            """
            Return the feature tensor and label for the given index.
            :param index:
            :return:
            """
            text = self.token_text[index]
            label = torch.LongTensor([self.labels[index]])
            return text, label
    
        def __len__(self):
            return len(self.labels)
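
    Indexing the dataset returns a (text, label) pair, which DataLoader later stacks into batches. Note that the label tensor has shape [1], which is why the training loop below squeezes it:

    dataset = LSTMDataset(300, corpus)
    text, label = dataset[0]
    print(text.shape, label.shape)  # torch.Size([300]) torch.Size([1])
    print(len(dataset))             # number of samples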
    
    

    3. The LSTM model

    import torch
    import torch.nn as nn
    
    
    class LSTMClassifier(nn.Module):
        """
        LSTM text classifier.
        """
        def __init__(self, embedding_dim, hidden_dim, vocab_size, label_size, batch_size, use_gpu):
            super(LSTMClassifier, self).__init__()
            self.hidden_dim = hidden_dim
            self.batch_size = batch_size
            self.use_gpu = use_gpu
    
            self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)  # the embedding matrix
            self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=1)
            self.hidden2label = nn.Linear(hidden_dim, label_size)  # four-way classification: label_size=4
            self.hidden = self.init_hidden()
    
        def init_hidden(self):
            # plain tensors; the deprecated Variable wrapper is no longer needed
            if self.use_gpu:
                h0 = torch.zeros(1, self.batch_size, self.hidden_dim).cuda()
                c0 = torch.zeros(1, self.batch_size, self.hidden_dim).cuda()
            else:
                h0 = torch.zeros(1, self.batch_size, self.hidden_dim)
                c0 = torch.zeros(1, self.batch_size, self.hidden_dim)
            return (h0, c0)
    
        def forward(self, sentence):
            embeds = self.word_embeddings(sentence)  # sentence: (seq_len, batch) of word indices
            x = embeds.view(len(sentence), self.batch_size, -1)
            lstm_out, self.hidden = self.lstm(x, self.hidden)
            y = self.hidden2label(lstm_out[-1])  # raw logits; CrossEntropyLoss applies log-softmax itself
            return y
    
    from torch.utils.data import DataLoader

    # Build the final train_loader; training simply iterates over it
    corpus = Corpus(DATA_DIR, sentence_len)  # DATA_DIR: path to the labeled CSV file
    train_set = LSTMDataset(sentence_len, corpus)
    train_loader = DataLoader(train_set,
                              batch_size=batch_size,
                              shuffle=True,
                              num_workers=4
                             )
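
    The training loop below also iterates over a test_loader, which this excerpt never constructs. One plausible way to obtain it (an assumption; the original split is not shown) is torch.utils.data.random_split over the same dataset:

    from torch.utils.data import random_split

    full_set = LSTMDataset(sentence_len, corpus)
    test_size = int(0.2 * len(full_set))  # hypothetical 80/20 split
    train_set, test_set = random_split(full_set, [len(full_set) - test_size, test_size])

    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, num_workers=4)
    test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False, num_workers=4)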
    

    4. Loss function and optimizer

    import torch.optim as optim

    optimizer = optim.SGD(model.parameters(), lr=learning_rate)
    loss_function = nn.CrossEntropyLoss()
    

    Note that nn.CrossEntropyLoss already combines log-softmax and NLLLoss, which is why forward() returns raw logits: applying an explicit softmax before this loss would shrink the gradients and slow training.
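
    A small self-contained check of that equivalence:

    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    logits = torch.randn(8, 4)                 # a batch of 8 samples, four classes
    targets = torch.randint(0, 4, (8,))
    ce = nn.CrossEntropyLoss()(logits, targets)
    manual = F.nll_loss(F.log_softmax(logits, dim=1), targets)
    print(torch.allclose(ce, manual))          # True: CrossEntropyLoss = log-softmax + NLL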

    5. Training
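
    The loop below uses several names this excerpt does not define: the model instance, the four history lists, and adjust_learning_rate (a helper from the referenced repo). A plausible setup, with illustrative dimensions and an assumed step-decay schedule:

    model = LSTMClassifier(embedding_dim=100, hidden_dim=50,
                           vocab_size=len(corpus.dictionary), label_size=4,
                           batch_size=batch_size, use_gpu=use_gpu)  # dims are illustrative
    if use_gpu:
        model = model.cuda()

    train_loss_, train_acc_, test_loss_, test_acc_ = [], [], [], []

    def adjust_learning_rate(optimizer, epoch):
        """Step decay: divide the learning rate by 10 every 10 epochs (assumed schedule)."""
        lr = learning_rate * (0.1 ** (epoch // 10))
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr
        return optimizer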

    for epoch in range(epochs):
        optimizer = adjust_learning_rate(optimizer, epoch)
    
        ## training epoch
        model.train()
        total_acc = 0.0
        total_loss = 0.0
        total = 0.0
        for iter, traindata in enumerate(train_loader):
            train_inputs, train_labels = traindata
            train_labels = torch.squeeze(train_labels)
    
            if use_gpu:
                train_inputs, train_labels = train_inputs.cuda(), train_labels.cuda()
    
            model.zero_grad()
            model.batch_size = len(train_labels)
            model.hidden = model.init_hidden()
            output = model(train_inputs.t())
    
            loss = loss_function(output, train_labels)
            loss.backward()
            optimizer.step()
    
            # calc training acc
            _, predicted = torch.max(output.data, 1)
            total_acc += (predicted == train_labels).sum().item()
            total += len(train_labels)
            total_loss += loss.item()
    
        train_loss_.append(total_loss / total)
        train_acc_.append(total_acc / total)
        ## testing epoch
        model.eval()  # evaluation mode
        total_acc = 0.0
        total_loss = 0.0
        total = 0.0
        with torch.no_grad():  # no gradients needed during evaluation
            for iter, testdata in enumerate(test_loader):
                test_inputs, test_labels = testdata
                test_labels = torch.squeeze(test_labels)

                if use_gpu:
                    test_inputs, test_labels = test_inputs.cuda(), test_labels.cuda()

                model.batch_size = len(test_labels)
                model.hidden = model.init_hidden()
                output = model(test_inputs.t())

                loss = loss_function(output, test_labels)

                # calc testing acc
                _, predicted = torch.max(output.data, 1)
                total_acc += (predicted == test_labels).sum().item()
                total += len(test_labels)
                total_loss += loss.item()
        test_loss_.append(total_loss / total)
        test_acc_.append(total_acc / total)
    
        print('[Epoch: %3d/%3d] Training Loss: %.3f, Testing Loss: %.3f, Training Acc: %.3f, Testing Acc: %.3f'
              % (epoch, epochs, train_loss_[epoch], test_loss_[epoch], train_acc_[epoch], test_acc_[epoch]))
    

    6. Saving the model

    scripted_model = torch.jit.script(model)
    scripted_model.save("lstm.pt")
    

    If the model is to be consumed from Java rather than directly from Python, export it with PyTorch's torch.jit.script as above.
    A tutorial (in Chinese): https://liuzhian.github.io/2021/04/08/初识TorchScript/
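
    Before handing lstm.pt to the Java side, the export can be sanity-checked from Python with torch.jit.load. A sketch; it assumes batch_size was 1 and the hidden state freshly initialized when the model was scripted, otherwise the internal shapes will not line up:

    loaded = torch.jit.load("lstm.pt")
    loaded.eval()
    dummy = torch.zeros(sentence_len, 1, dtype=torch.long)  # (seq_len, batch) of word indices
    with torch.no_grad():
        logits = loaded(dummy)
    print(logits.shape)  # expected: torch.Size([1, 4])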
