  • NLP (31): Text classification with BertForSequenceClassification from the transformers library

    1. Class labels must be encoded starting from 0
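
    BertForSequenceClassification computes its training loss with a standard cross-entropy loss, which expects every label to be an integer in the range 0 .. num_labels - 1. If the raw class codes are arbitrary strings or non-contiguous numbers, map them to contiguous 0-based ids before building the dataset. A minimal sketch of such a mapping (the raw_codes list below is made up for illustration; in this project the mapping lives in code_to_label.json):

    # Minimal sketch: map arbitrary class codes to contiguous 0-based ids.
    # raw_codes is a hypothetical stand-in for the real label column.
    raw_codes = ["A01", "B07", "A01", "C13", "B07"]

    # Build a stable code -> id mapping (sorted so the assignment is reproducible)
    code_to_id = {code: idx for idx, code in enumerate(sorted(set(raw_codes)))}
    labels = [code_to_id[code] for code in raw_codes]

    print(code_to_id)  # {'A01': 0, 'B07': 1, 'C13': 2}
    print(labels)      # [0, 1, 0, 2, 1]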

    import argparse
    import torch
    import tqdm
    from root_path import root
    import os
    import pandas as pd
    import json
    from sklearn.model_selection import train_test_split
    from transformers import BertTokenizer
    from torch.utils.data import Dataset, DataLoader, TensorDataset
    import numpy as np
    import random
    import re
    from transformers import BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
    
    # Wrap the tokenized encodings and labels as a PyTorch Dataset
    class NewsDataset(Dataset):
        def __init__(self, encodings, labels):
            self.encodings = encodings
            self.labels = labels
    
        # Return a single sample as a dict of tensors, including its label
        def __getitem__(self, idx):
            item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
            item['labels'] = torch.tensor(int(self.labels[idx]))
            return item
    
        def __len__(self):
            return len(self.labels)
    
    data_path = os.path.join(root, "data", "raw_data")
    code_to_label_file = os.path.join(data_path, "code_to_label.json")
    
    def get_dataset():
        train_path = os.path.join(data_path, "all_0727.xlsx")
        test_path = os.path.join(data_path, "更正的测试集.xlsx")
        train_table = pd.read_excel(train_path, sheet_name="data")
        train_sentence_list = train_table["句子"].tolist()
        train_code_list = train_table["语义编号"]
        with open(code_to_label_file, "r", encoding="utf8") as f:
            code_label = json.load(f)
        # Index 2 of each code_to_label.json entry holds the 0-based integer class id
        train_num_list = [code_label[train_code][2] for train_code in train_code_list]
        return train_sentence_list,train_num_list, len(code_label)
    
    def flat_accuracy(logits, label_ids):
        # Fraction of correct predictions in one batch
        pred = np.argmax(logits, axis=1)
        return np.sum(pred == label_ids) / len(label_ids)
    
    # Training loop for one epoch
    def train(model, train_loader, optim, device, scheduler, epoch, test_dataloader):
        model.train()
        total_train_loss = 0
        iter_num = 0
        total_iter = len(train_loader)
        for batch in train_loader:
            # Forward pass
            optim.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            label = batch['labels'].to(device)
    
            outputs = model(input_ids, attention_mask=attention_mask, labels=label)
            loss = outputs[0]
            total_train_loss += loss.item()
    
            # Backward pass and gradient clipping
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    
            # Update the parameters and the learning-rate schedule
            optim.step()
            scheduler.step()
    
            iter_num += 1
            if (iter_num % 100 == 0):
                print("epoth: %d, iter_num: %d, loss: %.4f, %.2f%%" % (
                epoch, iter_num, loss.item(), iter_num / total_iter * 100))
    
    
        print("Epoch: %d, Average training loss: %.4f" % (epoch, total_train_loss / len(train_loader)))
    
    
    def validation(model, test_dataloader, device):
        model.eval()
        total_eval_accuracy = 0
        total_eval_loss = 0
        for batch in test_dataloader:
            with torch.no_grad():
                # Forward pass (no gradient tracking during evaluation)
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)
                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
    
            loss = outputs[0]
            logits = outputs[1]
    
            total_eval_loss += loss.item()
            logits = logits.detach().cpu().numpy()
            label_ids = labels.to('cpu').numpy()
    
            total_eval_accuracy += flat_accuracy(logits, label_ids)
    
        avg_val_accuracy = total_eval_accuracy / len(test_dataloader)
        print("Accuracy: %.4f" % (avg_val_accuracy))
        print("Average testing loss: %.4f" % (total_eval_loss / len(test_dataloader)))
        print("-------------------------------")
    
    def main(model_name,
             epoch,
             learning_rate,
             batch_size,
             device,
             save_dir):
        device = torch.device(device if torch.cuda.is_available() else "cpu")
        # Load the training data
        sentence, label, num_cls = get_dataset()
        # Split into training and validation sets. stratify samples by label so both
        # splits share the same label distribution; random_state fixes the random seed
        # so the split is reproducible across runs.
        x_train, x_test, train_label, test_label = \
            train_test_split(sentence, label, test_size=0.5, stratify=label, random_state=5)
        tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
        train_encoding = tokenizer(x_train, truncation=True, padding=True, max_length=64)
        test_encoding = tokenizer(x_test, truncation=True, padding=True, max_length=64)
        train_dataset = NewsDataset(train_encoding, train_label)
        test_dataset = NewsDataset(test_encoding, test_label)
        # num_labels must equal the number of classes returned by get_dataset();
        # the label ids themselves must run from 0 to num_cls - 1
        model = BertForSequenceClassification.from_pretrained('bert-base-chinese', num_labels=num_cls)
        model.to(device)
    
        # Wrap the datasets in DataLoaders for batched reading
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)
    
        # Optimizer and linear learning-rate schedule with warmup
        optim = AdamW(model.parameters(), lr=learning_rate)
        total_steps = len(train_loader) * epoch
        scheduler = get_linear_schedule_with_warmup(optim,
                                                    num_warmup_steps=0,  # default value in run_glue.py
                                                    num_training_steps=total_steps)
        for epoch_i in range(epoch):
            print("------------Epoch: %d ----------------" % epoch_i)
            train(model, train_loader, optim, device, scheduler, epoch_i, test_dataloader)
            validation(model, test_dataloader, device)

        # Save the fine-tuned model and tokenizer so they can be reloaded later
        model.save_pretrained(save_dir)
        tokenizer.save_pretrained(save_dir)
    
    
    if __name__ == '__main__':
        parser = argparse.ArgumentParser()
        parser.add_argument('--model_name', default='afi')
        parser.add_argument('--epoch', type=int, default=4)
        parser.add_argument('--learning_rate', type=float, default=2e-5)
        parser.add_argument('--batch_size', type=int, default=1)
        parser.add_argument('--device', default='cuda:0')
        parser.add_argument('--save_dir', default='chkpt')
        args = parser.parse_args()
        main(args.model_name,
             args.epoch,
             args.learning_rate,
             args.batch_size,
             args.device,
             args.save_dir)
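
    After training, the weights written to save_dir can be reloaded for prediction. A minimal inference sketch, assuming the model and tokenizer were saved with save_pretrained to the default chkpt directory as in main above, and that the example sentence is made up; the predicted id can be mapped back to the original 语义编号 by inverting the code_to_label.json mapping:

    import torch
    from transformers import BertTokenizer, BertForSequenceClassification

    ckpt_dir = "chkpt"                # directory written by save_pretrained above
    sentence = "这是一条待分类的句子"  # hypothetical example sentence

    tokenizer = BertTokenizer.from_pretrained(ckpt_dir)
    model = BertForSequenceClassification.from_pretrained(ckpt_dir)
    model.eval()

    # Tokenize one sentence and run a forward pass without gradients
    inputs = tokenizer(sentence, truncation=True, padding=True,
                       max_length=64, return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs)[0]

    pred_id = logits.argmax(dim=1).item()
    print("predicted class id:", pred_id)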
  • Original post: https://www.cnblogs.com/zhangxianrong/p/15070092.html