  • Fine-tuning BERT for text classification with Huggingface

    After BERT took off, many variants of it followed. This post uses the Huggingface library to build a simple text classifier, and in the process takes a closer look at how BERT is put to work in engineering practice.
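
    All of the snippets below assume this shared set of imports (a minimal sketch; the packages are transformers, torch, pandas, numpy and scikit-learn):

    import numpy as np
    import pandas as pd
    import torch
    import torch.nn as nn
    import torch.optim as optim
    from transformers import BertTokenizer, BertModel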

    1、load data

    train_df = pd.read_csv('../data/train.tsv', delimiter='\t', names=['text','label'])
    print(train_df.shape)
    train_df.head()

    sentences = list(train_df['text'])
    targets =train_df['label'].values
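
    As a quick optional check (not part of the original pipeline), the label distribution should show two roughly balanced classes, matching the 2-way output head built in step 4:

    # Optional: inspect class balance; labels are assumed to be 0/1
    print(train_df['label'].value_counts())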

    2、token encoding

    # If the encodings will be wrapped in a custom Dataset/model class, max_length must be set explicitly
    tokenizer=BertTokenizer.from_pretrained('bert-base-uncased')
    max_length=32
    sentences_tokened=tokenizer(sentences,padding=True,truncation=True,max_length=max_length,return_tensors='pt')
    targets=torch.tensor(targets)
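
    To verify the encoding (an optional check), decode the first sample back; it should be the original text wrapped in [CLS]/[SEP] and padded with [PAD]:

    # Optional: inspect one encoded sample
    print(sentences_tokened['input_ids'].shape)  # (num_samples, seq_len <= max_length)
    print(tokenizer.decode(sentences_tokened['input_ids'][0]))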

    3、build dataset and dataloader

    from torch.utils.data import Dataset, DataLoader, random_split
    
    class DataToDataset(Dataset):
        """Wraps the tokenizer output and the label tensor as a torch Dataset."""
        def __init__(self, encoding, labels):
            self.encoding = encoding
            self.labels = labels

        def __len__(self):
            return len(self.labels)

        def __getitem__(self, index):
            # One sample: (input_ids, attention_mask, label)
            return self.encoding['input_ids'][index], self.encoding['attention_mask'][index], self.labels[index]
    
    # Wrap the encodings and labels
    datasets=DataToDataset(sentences_tokened,targets)
    train_size=int(len(datasets)*0.8)
    test_size=len(datasets)-train_size
    print([train_size,test_size])
    train_dataset,val_dataset=random_split(dataset=datasets,lengths=[train_size,test_size])
    
    BATCH_SIZE=64
    # num_workers > 0 enables multi-process data loading
    train_loader=DataLoader(dataset=train_dataset,batch_size=BATCH_SIZE,shuffle=True,num_workers=5)
    
    val_loader=DataLoader(dataset=val_dataset,batch_size=BATCH_SIZE,shuffle=False,num_workers=5)
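
    Pulling a single batch (optional) confirms the Dataset yields the three tensors in the order the training loop below expects:

    # Optional: verify batch shapes before training
    ids, mask, lbls = next(iter(train_loader))
    print(ids.shape, mask.shape, lbls.shape)  # (BATCH_SIZE, seq_len) x 2, (BATCH_SIZE,)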

    4、create model

    class BertTextClassificationModel(nn.Module):
        def __init__(self):
            super(BertTextClassificationModel,self).__init__()
            self.bert=BertModel.from_pretrained('bert-base-uncased')
            self.dense=nn.Linear(768,2)  # 768 = bert-base hidden size, 2 output classes

        def forward(self,ids,mask):
            # transformers >= 4 returns a ModelOutput, not a tuple;
            # take the [CLS] vector (position 0) from last_hidden_state
            out=self.bert(input_ids=ids,attention_mask=mask)
            return self.dense(out.last_hidden_state[:,0,:])
    
    
    mymodel=BertTextClassificationModel()
    
    
    # Pick the GPU if available, otherwise fall back to CPU
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("device=",device)
    if torch.cuda.device_count()>1:
        print("Let's use ",torch.cuda.device_count(),"GPUs!")
        mymodel=nn.DataParallel(mymodel)
    mymodel.to(device)
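
    A single no-gradient forward pass (an optional smoke test) verifies the wiring before training; the logits should come out with shape (batch_size, 2):

    # Optional smoke test: run one batch through the model
    ids, mask, _ = next(iter(train_loader))
    with torch.no_grad():
        logits = mymodel(ids.to(device), mask.to(device))
    print(logits.shape)  # e.g. torch.Size([64, 2])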

    5、train model

    loss_func=nn.CrossEntropyLoss()
    optimizer=optim.Adam(mymodel.parameters(),lr=0.0001)
    
    from sklearn.metrics import accuracy_score
    def flat_accuracy(preds,labels):
        pred_flat=np.argmax(preds,axis=1).flatten()
        labels_flat=labels.flatten()
        return accuracy_score(labels_flat,pred_flat)
    
    epochs=3
    for epoch in range(epochs):
        train_loss = 0.0
        train_acc=0.0
        for i,data in enumerate(train_loader):
            input_ids,attention_mask,labels=[elem.to(device) for elem in data]
            # zero the gradients
            optimizer.zero_grad()
            # forward pass
            out=mymodel(input_ids,attention_mask)
            # compute the loss
            loss=loss_func(out,labels)
            train_loss += loss.item()
            # backpropagate
            loss.backward()
            # update the parameters
            optimizer.step()
            # compute accuracy (move tensors to CPU before converting to numpy)
            out=out.detach().cpu().numpy()
            labels=labels.detach().cpu().numpy()
            train_acc+=flat_accuracy(out,labels)
    
        print("train %d/%d epochs Loss:%f, Acc:%f" %(epoch,epochs,train_loss/(i+1),train_acc/(i+1)))

    6、evaluate

    print("evaluate...")
    val_loss=0
    val_acc=0
    mymodel.eval()
    for j,batch in enumerate(val_loader):
        val_input_ids,val_attention_mask,val_labels=[elem.to(device) for elem in batch]
        with torch.no_grad():
            pred=mymodel(val_input_ids,val_attention_mask)
            val_loss+=loss_func(pred,val_labels).item()  # .item() accumulates a float, not a tensor
            pred=pred.detach().cpu().numpy()
            val_labels=val_labels.detach().cpu().numpy()
            val_acc+=flat_accuracy(pred,val_labels)
    print("evaluate loss:%d, Acc:%d" %(val_loss/len(val_loader),val_acc/len(val_loader)))
        
  • Original post: https://www.cnblogs.com/ljy2013/p/13726148.html