4. word2vec + siameseLSTM Improvements (1)

    I. Checking the GPU

    >>> import torch
    >>> torch.cuda.device_count()
    1
    >>> torch.cuda.get_device_name(0)
    'GeForce GTX 1060'
    >>> torch.cuda.is_available()
    True
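
    The code in this post assumes CUDA is available throughout. If a GPU may be absent, a common fallback (not in the original) is:

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(device)  # 'cuda' on the box above, 'cpu' otherwise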

    II. Steps for Moving from CPU to GPU

    """
    注意,转移到GPU步骤:
    (1)设置种子:torch.cuda.manual_seed(123)
    (2)device = torch.device('cuda')定义设备
    (3)损失函数转移:criterion = nn.BCEWithLogitsLoss().to(self.device)
    (4)网络模型转移:myLSTM = SiameseLSTM(128).to(self.device)
    (5)数据转移:a, b, label= a.cuda(), b.cuda(), label.cuda()
    """

    III. Project Directory

    (The project layout is shown as a screenshot in the original post.)

    IV. Code Walkthrough

    1. Data initialization

    import pandas as pd

    class DataInitial(object):
        def all_data(self):
            # Merge the negative and positive pair files into one CSV.
            df1 = pd.read_csv("data/POI/negtive.csv")
            df2 = pd.read_csv("data/POI/positive.csv")
            df = pd.concat([df1, df2], ignore_index=True)
            df.to_csv("data/POI/all.csv", index=False, sep=',')

        def split(self):
            # Shuffle, then hold out 20% of the rows as the test set.
            df = pd.read_csv('data/POI/all.csv')
            df = df.sample(frac=1.0)
            cut_idx = int(round(0.2 * df.shape[0]))
            df_test, df_train = df.iloc[:cut_idx], df.iloc[cut_idx:]
            df_test.to_csv("data/POI/test.csv", index=False, sep=',')
            df_train.to_csv("data/POI/train.csv", index=False, sep=',')

        def train_data(self):
            # Read the file once instead of once per column.
            df = pd.read_csv("data/POI/train.csv")
            return df["address_1"], df["address_2"], df["tag"]

        def test_data(self):
            df = pd.read_csv("data/POI/test.csv")
            return df["address_1"], df["address_2"], df["tag"]
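
    Run once to build the merged file and the train/test split (a usage sketch; paths as in the class above):

    data = DataInitial()
    data.all_data()   # writes data/POI/all.csv
    data.split()      # writes data/POI/train.csv and data/POI/test.csv
    train_texta, train_textb, train_label = data.train_data()
    print(len(train_texta), "training pairs")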

    2. Dataset

    import torch.utils.data as data

    class DatasetIterater(data.Dataset):
        """Wraps two text columns and a label column as a map-style dataset."""
        def __init__(self, texta, textb, label):
            self.texta = texta
            self.textb = textb
            self.label = label

        def __getitem__(self, item):
            # Return the raw strings; embedding happens later, per batch.
            return self.texta[item], self.textb[item], self.label[item]

        def __len__(self):
            return len(self.texta)
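
    A quick look at what the DataLoader yields: raw strings come out batched as a list/tuple, which is why sentenceTupleToEmbedding below takes tuples of sentences (a sketch, reusing the columns loaded in step 1):

    from torch.utils.data import DataLoader

    loader = DataLoader(DatasetIterater(train_texta, train_textb, train_label),
                        batch_size=4, shuffle=True)
    data1, data2, label = next(iter(loader))
    print(len(data1), label)  # 4 raw address strings and a tensor of 4 tags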

    3. Word embedding

    import jieba
    import gensim
    import torch
    import numpy as np

    # Pre-trained 128-dimensional word2vec vectors.
    model = gensim.models.KeyedVectors.load_word2vec_format('model/word2vec.bin', binary=True)

    class WordEmbedding(object):
        def sentenceTupleToEmbedding(self, data1, data2):
            # Pad both sides of the batch to the longest tokenized sentence.
            aCutListMaxLen = max([len(list(jieba.cut(sentence_a))) for sentence_a in data1])
            bCutListMaxLen = max([len(list(jieba.cut(sentence_b))) for sentence_b in data2])
            seq_len = max(aCutListMaxLen, bCutListMaxLen)
            a = self.sequence_vec(data1, seq_len)  # (batch_size, seq_len, embedding)
            b = self.sequence_vec(data2, seq_len)
            return torch.FloatTensor(a), torch.FloatTensor(b)

        def sequence_vec(self, data, seq_len):
            data_a_vec = []
            for sequence_a in data:
                # Look up each in-vocabulary token; out-of-vocabulary tokens are skipped.
                sequence_vec = [model[word_a] for word_a in jieba.cut(sequence_a) if word_a in model]
                # reshape keeps the all-OOV case valid: an empty list becomes a (0, 128) array
                # instead of crashing the vstack below.
                sequence_vec = np.array(sequence_vec).reshape(-1, 128)
                # Zero-pad up to seq_len; skipping OOV tokens guarantees the sequence fits.
                add = np.zeros((seq_len - sequence_vec.shape[0], 128))
                data_a_vec.append(np.vstack((sequence_vec, add)))
            return np.array(data_a_vec)
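
    A shape check on a toy pair (a sketch; the address tokens are assumed to be in the word2vec vocabulary, and all-OOV sentences come back as zero rows):

    word = WordEmbedding()
    a, b = word.sentenceTupleToEmbedding(("杭州市西湖区",), ("浙江省杭州市西湖区",))
    print(a.shape, b.shape)  # both torch.Size([1, seq_len, 128]), padded to the longer side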

    4. Siamese LSTM

    import torch
    from torch import nn

    class SiameseLSTM(nn.Module):
        def __init__(self, input_size):
            super(SiameseLSTM, self).__init__()
            # Both branches share this one LSTM -- that is what makes the network "Siamese".
            self.lstm = nn.LSTM(input_size=input_size, hidden_size=10, num_layers=4, batch_first=True)
            self.fc = nn.Linear(10, 1)

        def forward(self, data1, data2):
            out1, (h1, c1) = self.lstm(data1)
            out2, (h2, c2) = self.lstm(data2)
            # Use the last time step of each sequence as its representation.
            pre1 = out1[:, -1, :]
            pre2 = out2[:, -1, :]
            # Element-wise |difference|, mapped to a single matching logit.
            dis = torch.abs(pre1 - pre2)
            out = self.fc(dis)
            return out
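
    A CPU sanity check with random tensors (batch of 8, sequence length 5, 128-dim embeddings):

    net = SiameseLSTM(128)
    x1 = torch.randn(8, 5, 128)
    x2 = torch.randn(8, 5, 128)
    logits = net(x1, x2)
    print(logits.shape)  # torch.Size([8, 1]) -- raw logits, as expected by BCEWithLogitsLoss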

    5. Main program

    import torch
    from torch import nn
    from torch.utils.data import DataLoader
    import numpy as np
    from datasetIterater import DatasetIterater
    from wordEmbedding import WordEmbedding
    from siameseLSTM import SiameseLSTM
    from dataInitial import DataInitial

    word = WordEmbedding()
    """
    注意,转移到GPU步骤:
    (1)设置种子:torch.cuda.manual_seed(123)
    (2)device = torch.device('cuda')定义设备
    (3)损失函数转移:criterion = nn.BCEWithLogitsLoss().to(self.device)
    (4)网络模型转移:myLSTM = SiameseLSTM(128).to(self.device)
    (5)数据转移:a, b, label= a.cuda(), b.cuda(), label.cuda()
    """
    class MainProcess(object):
        def __init__(self):
            self.learning_rate = 0.001
            torch.cuda.manual_seed(123)
            self.device = torch.device('cuda')
            self.train_texta, self.train_textb, self.train_label = DataInitial().train_data()
            self.test_texta, self.test_textb, self.test_label = DataInitial().test_data()
            self.train_data = DatasetIterater(self.train_texta,self.train_textb,self.train_label)
            self.test_data = DatasetIterater(self.test_texta, self.test_textb, self.test_label)
            self.train_iter = DataLoader(dataset=self.train_data, batch_size=32, shuffle=True)
            self.test_iter = DataLoader(dataset=self.test_data, batch_size=128, shuffle=True)
            self.myLSTM = SiameseLSTM(128).to(self.device)
            self.criterion = nn.BCEWithLogitsLoss().to(self.device)
            self.optimizer = torch.optim.Adam(self.myLSTM.parameters(), lr=self.learning_rate)
    
        def binary_acc(self, preds, y):
            # preds are raw logits: sigmoid + round gives hard 0/1 predictions.
            preds = torch.round(torch.sigmoid(preds))
            correct = torch.eq(preds, y).float()
            acc = correct.sum() / len(correct)
            return acc
    
        def train(self, mynet, train_iter, optimizer, criterion, epoch):
            avg_acc = []
            avg_loss = []
            mynet.train()
            for batch_id, (data1, data2, label) in enumerate(train_iter):
                try:
                    a, b = word.sentenceTupleToEmbedding(data1, data2)
                except Exception:
                    continue  # skip any batch whose embedding fails
                a, b, label = a.cuda(non_blocking=True), b.cuda(non_blocking=True), label.cuda(non_blocking=True)
                distance = mynet(a, b)
                loss = criterion(distance, label.float().unsqueeze(-1))
                acc = self.binary_acc(distance, label.float().unsqueeze(-1)).item()
                avg_acc.append(acc)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                if batch_id % 100 == 0:
                    print("epoch:", epoch, "batch:", batch_id, "train loss:", loss.item())
                avg_loss.append(loss.item())
            avg_acc = np.array(avg_acc).mean()
            avg_loss = np.array(avg_loss).mean()
            print('train acc:', avg_acc)
            print("train loss:", avg_loss)
    
        def eval(self, mynet, test_iter, criterion, epoch):
            mynet.eval()
            avg_acc = []
            avg_loss = []
            with torch.no_grad():
                for batch_id, (data1, data2, label) in enumerate(test_iter):
                    try:
                        a, b = word.sentenceTupleToEmbedding(data1, data2)
                    except Exception:
                        continue
                    a, b, label = a.cuda(non_blocking=True), b.cuda(non_blocking=True), label.cuda(non_blocking=True)
                    distance = mynet(a, b)
                    loss = criterion(distance, label.float().unsqueeze(-1))
                    acc = self.binary_acc(distance, label.float().unsqueeze(-1)).item()
                    avg_acc.append(acc)
                    avg_loss.append(loss.item())
                    if batch_id > 50:
                        break  # evaluate on at most ~51 batches to keep epochs fast
            avg_acc = np.array(avg_acc).mean()
            avg_loss = np.array(avg_loss).mean()
            print('>>test acc:', avg_acc)
            print(">>test loss:", avg_loss)
            return (avg_acc, avg_loss)
    
        def main(self):
            min_loss = float('inf')
            for epoch in range(50):
                self.train(self.myLSTM, self.train_iter, self.optimizer, self.criterion, epoch)
                eval_acc, eval_loss = self.eval(self.myLSTM, self.test_iter, self.criterion, epoch)
                # Checkpoint whenever the evaluation loss improves.
                if eval_loss < min_loss:
                    min_loss = eval_loss
                    print("save model")
                    torch.save(self.myLSTM.state_dict(), 'model.pth')
    
    
    
    if __name__ == '__main__':
        MainProcess().main()

    V. Experiment Results

    (Training and evaluation logs appear as a screenshot in the original post.)

    VI. Testing

        def test(self):
            self.myLSTM.load_state_dict(torch.load('model.pth'))
            # Single-element tuples mimic a DataLoader batch of size 1.
            data1 = ("浙江杭州富阳区银湖街黄先生的外卖",)
            data2 = ("富阳区浙江富阳区银湖街道新常村",)
            try:
                a, b = word.sentenceTupleToEmbedding(data1, data2)
                print(a.shape, b.shape)
            except Exception as e:
                print(e)
            a, b = a.cuda(non_blocking=True), b.cuda(non_blocking=True)
            distance = self.myLSTM(a, b)
            # Round the sigmoid of the logit: 1.0 means "same place", 0.0 means "different".
            preds = torch.round(torch.sigmoid(distance))
            print(preds.item())
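
    With test attached to MainProcess, a trained model can be exercised directly (a sketch; model.pth must exist from a prior training run):

    if __name__ == '__main__':
        mp = MainProcess()
        mp.test()  # prints the padded shapes and a 0.0/1.0 match prediction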
Original post: https://www.cnblogs.com/zhangxianrong/p/14175414.html