  • SRU source code: language model

    import sys
    import os
    import argparse
    import time
    import random
    import math
    
    import numpy as np
    import torch
    import torch.nn as nn
    import torch.nn.functional as F
    from torch.autograd import Variable
    
    import cuda_functional as MF
    
    
    def read_corpus(path, eos="</s>"):
        data = [ ]
        with open(path) as fin:
            for line in fin:
                data += line.split() + [ eos ]
        return data
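    # Example: a file with the two lines "the cat sat" / "on the mat" becomes
    # ["the","cat","sat","</s>","on","the","mat","</s>"]: one flat token stream
    # with the eos marker closing every line.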
    
    def create_batches(data_text, map_to_ids, batch_size, cuda=True):
        data_ids = map_to_ids(data_text)
        N = len(data_ids)
        L = ((N-1) // batch_size) * batch_size
        x = np.copy(data_ids[:L].reshape(batch_size,-1).T)
        y = np.copy(data_ids[1:L+1].reshape(batch_size,-1).T)  # y is x shifted forward by one token (next-word targets)
        x, y = torch.from_numpy(x), torch.from_numpy(y)
        x, y = x.contiguous(), y.contiguous()
        if cuda:
            x, y = x.cuda(), y.cuda()
        return x, y
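    # Worked example: with data_ids = [0,1,2,3,4,5,6,7,8] and batch_size = 2,
    # L = 8, so the columns of x are [0,1,2,3] and [4,5,6,7] while the columns
    # of y are [1,2,3,4] and [5,6,7,8]; row t holds time step t for every batch
    # element, and y is always x shifted forward by one token.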
    
    
    class EmbeddingLayer(nn.Module):  # maps every word in the corpus to a trainable embedding vector
        def __init__(self, n_d, words, fix_emb=False):
            super(EmbeddingLayer, self).__init__()
            word2id = {}
            for w in words:
                if w not in word2id:
                    word2id[w] = len(word2id)  # assign each new word the next integer id
    
            self.word2id = word2id
            self.n_V, self.n_d = len(word2id), n_d  # n_V is the vocabulary size; n_d the embedding (= hidden) size
            self.embedding = nn.Embedding(self.n_V, n_d)  # lookup table of one vector per word
    
        def forward(self, x):
            return self.embedding(x)
    
        def map_to_ids(self, text):  # map a token sequence to an int64 id array
            return np.asarray([self.word2id[x] for x in text],
                     dtype='int64'
            )
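    # Note: map_to_ids raises a KeyError for words missing from the training
    # vocabulary, so this code assumes (as with PTB) that every dev/test word
    # also occurs in the training corpus.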
    
    class Model(nn.Module):
        def __init__(self, words, args):
            super(Model, self).__init__()
            self.args = args
            self.n_d = args.d
            self.depth = args.depth
            self.drop = nn.Dropout(args.dropout)  # dropout on embeddings and RNN outputs to curb overfitting
            self.embedding_layer = EmbeddingLayer(self.n_d, words)
            self.n_V = self.embedding_layer.n_V
            if args.lstm:
                self.rnn = nn.LSTM(self.n_d, self.n_d,
                    self.depth,
                    dropout = args.rnn_dropout
                )
            else:
                self.rnn = MF.SRU(self.n_d, self.n_d, self.depth,
                    dropout = args.rnn_dropout,
                    rnn_dropout = args.rnn_dropout,
                    use_tanh = 0
                )
            self.output_layer = nn.Linear(self.n_d, self.n_V)
            # tie weights: share the embedding matrix with the output softmax layer
            self.output_layer.weight = self.embedding_layer.embedding.weight
            self.init_weights()
            if not args.lstm:
                self.rnn.set_bias(args.bias)
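            # Weight tying shares one n_V x n_d matrix between the embedding and
            # the softmax, which works here because the hidden size equals the
            # embedding size (both are n_d).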
    
        def init_weights(self):  # initialize weights: uniform for matrices, zero for biases
            val_range = (3.0/self.n_d)**0.5
            for p in self.parameters():
                if p.dim() > 1:  # matrix
                    p.data.uniform_(-val_range, val_range)
                else:
                    p.data.zero_()
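        # With a = (3/n_d)**0.5, uniform(-a, a) has variance a**2/3 = 1/n_d,
        # i.e. every weight matrix starts with per-entry variance 1/n_d.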
    
        def forward(self, x, hidden):
            emb = self.drop(self.embedding_layer(x))
            output, hidden = self.rnn(emb, hidden)  # the RNN takes the input sequence and the previous hidden state, and returns outputs plus the new state
            output = self.drop(output)
            output = output.view(-1, output.size(2))  # flatten (seq_len, batch, n_d) to (seq_len*batch, n_d); size(2) is the size of the third dimension
            output = self.output_layer(output)
            return output, hidden
    
        def init_hidden(self, batch_size):
            weight = next(self.parameters()).data
            zeros = Variable(weight.new(self.depth, batch_size, self.n_d).zero_())
            if self.args.lstm:
                return (zeros, zeros)
            else:
                return zeros
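        # nn.LSTM carries a (hidden, cell) pair of shape (depth, batch, n_d) each,
        # while SRU carries only a single cell-state tensor, hence the two cases.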
    
        def print_pnorm(self):  # print the L2 norm of every parameter; a quick way to monitor weight magnitudes
            norms = [ "{:.0f}".format(x.norm().data[0]) for x in self.parameters() ]
            sys.stdout.write("\tp_norm: {}\n".format(
                norms
            ))
    
    def train_model(epoch, model, train):
        model.train()
        args = model.args
    
        unroll_size = args.unroll_size
        batch_size = args.batch_size
        N = (len(train[0])-1)//unroll_size + 1
        lr = args.lr
    
        total_loss = 0.0
        criterion = nn.CrossEntropyLoss(size_average=False)  # sum, rather than average, the per-token losses
        hidden = model.init_hidden(batch_size)
        for i in range(N):
            x = train[0][i*unroll_size:(i+1)*unroll_size]
            y = train[1][i*unroll_size:(i+1)*unroll_size].view(-1)  # view(-1) flattens the targets to 1-D
            x, y =  Variable(x), Variable(y)
            # detach the hidden state from the previous chunk's graph (truncated BPTT)
            hidden = (Variable(hidden[0].data), Variable(hidden[1].data)) if args.lstm \
                else Variable(hidden.data)
    
            model.zero_grad()
            output, hidden = model(x, hidden)
            assert x.size(1) == batch_size
            loss = criterion(output, y) / x.size(1)  # average over the batch (x.size(1) is the batch size); targets are class indices, not one-hot
            loss.backward()
    
            torch.nn.utils.clip_grad_norm(model.parameters(), args.clip_grad)  # clip the global gradient norm; RNNs are prone to exploding gradients
            for p in model.parameters():  # manual SGD update with weight decay
                if p.requires_grad:
                    if args.weight_decay > 0:
                        p.data.mul_(1.0-args.weight_decay)
                    p.data.add_(-lr, p.grad.data)  # p <- p - lr * grad
            if math.isnan(loss.data[0]) or math.isinf(loss.data[0]):
                sys.exit(0)  # abort if training diverges: isnan catches NaN losses, isinf catches +/-infinity
    
            total_loss += loss.data[0] / x.size(0)  # also divide by the sequence length (x.size(0)) to get per-token loss
            if i%10 == 0:
                sys.stdout.write("
    {}".format(i))
                sys.stdout.flush()
    
        return np.exp(total_loss/N)
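    # The return value exp(total_loss/N) is the training perplexity: loss is
    # divided by the batch size above and by the sequence length when summed,
    # so total_loss/N is the average negative log-likelihood per token.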
    
    def eval_model(model, valid):
        model.eval()
        args = model.args
        total_loss = 0.0
        unroll_size = model.args.unroll_size
        criterion = nn.CrossEntropyLoss(size_average=False)
        hidden = model.init_hidden(1)
        N = (len(valid[0])-1)//unroll_size + 1
        for i in range(N):
            x = valid[0][i*unroll_size:(i+1)*unroll_size]
            y = valid[1][i*unroll_size:(i+1)*unroll_size].view(-1)
            x, y = Variable(x, volatile=True), Variable(y)  # volatile=True disables gradient tracking (pre-0.4 PyTorch inference mode)
            hidden = (Variable(hidden[0].data), Variable(hidden[1].data)) if args.lstm \
                else Variable(hidden.data)
            output, hidden = model(x, hidden)
            loss = criterion(output, y)
            total_loss += loss.data[0]
        avg_loss = total_loss / valid[1].numel()  # numel() gives the total number of target tokens
        ppl = np.exp(avg_loss)
        return ppl
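    # Evaluation runs with batch size 1, so the hidden state is carried across
    # the whole corpus; perplexity is exp(summed token losses / token count).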
    
    def main(args):
        train = read_corpus(args.train)
        dev = read_corpus(args.dev)
        test = read_corpus(args.test)
    
        model = Model(train, args)
        model.cuda()
        sys.stdout.write("vocab size: {}
    ".format(
            model.embedding_layer.n_V
        ))
        sys.stdout.write("num of parameters: {}
    ".format(
            sum(x.numel() for x in model.parameters() if x.requires_grad)
        ))
        model.print_pnorm()
        sys.stdout.write("
    ")
    
        map_to_ids = model.embedding_layer.map_to_ids
        train = create_batches(train, map_to_ids, args.batch_size)
        dev = create_batches(dev, map_to_ids, 1)
        test = create_batches(test, map_to_ids, 1)
    
        unchanged = 0
        best_dev = 1e+8
        for epoch in range(args.max_epoch):
            start_time = time.time()  # wall-clock start time of this epoch
            if args.lr_decay_epoch>0 and epoch>=args.lr_decay_epoch:
                args.lr *= args.lr_decay
            train_ppl = train_model(epoch, model, train)
            dev_ppl = eval_model(model, dev)
            sys.stdout.write("
    Epoch={}  lr={:.4f}  train_ppl={:.2f}  dev_ppl={:.2f}"
                    "	[{:.2f}m]
    ".format(
                epoch,
                args.lr,
                train_ppl,
                dev_ppl,
                (time.time()-start_time)/60.0
            ))
            model.print_pnorm()
            sys.stdout.flush()
    
            if dev_ppl < best_dev:
                unchanged = 0
                best_dev = dev_ppl
                start_time = time.time()
                test_ppl = eval_model(model, test)
                sys.stdout.write("	[eval]  test_ppl={:.2f}	[{:.2f}m]
    ".format(
                    test_ppl,
                    (time.time()-start_time)/60.0
                ))
                sys.stdout.flush()
            else:
                unchanged += 1
            if unchanged >= 30: break
            sys.stdout.write("
    ")
    
    if __name__ == "__main__":
        argparser = argparse.ArgumentParser(sys.argv[0], conflict_handler='resolve')
        argparser.add_argument("--lstm", action="store_true")
        argparser.add_argument("--train", type=str, required=True, help="training file")
        argparser.add_argument("--dev", type=str, required=True, help="dev file")
        argparser.add_argument("--test", type=str, required=True, help="test file")
        argparser.add_argument("--batch_size", "--batch", type=int, default=32)
        argparser.add_argument("--unroll_size", type=int, default=35)
        argparser.add_argument("    ", type=int, default=300)
        argparser.add_argument("--d", type=int, default=910)
        argparser.add_argument("--dropout", type=float, default=0.7,
            help="dropout of word embeddings and softmax output"
        )
        argparser.add_argument("--rnn_dropout", type=float, default=0.2,
            help="dropout of RNN layers"
        ) 
        argparser.add_argument("--bias", type=float, default=-3,
            help="intial bias of highway gates",
        )
        argparser.add_argument("--depth", type=int, default=6)
        argparser.add_argument("--lr", type=float, default=1.0)
        argparser.add_argument("--lr_decay", type=float, default=0.98)
        argparser.add_argument("--lr_decay_epoch", type=int, default=175)
        argparser.add_argument("--weight_decay", type=float, default=1e-5)
        argparser.add_argument("--clip_grad", type=float, default=5)
    
        args = argparser.parse_args()
        print(args)
        main(args)
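    # Usage sketch (the script and file names below are illustrative, not from
    # the original post; the flags are the ones defined above):
    #   python train_lm.py --train ptb.train.txt --dev ptb.valid.txt --test ptb.test.txt
    # Add --lstm to train the cuDNN LSTM baseline instead of the SRU model.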
    
  • Original post: https://www.cnblogs.com/lindaxin/p/8021518.html