zoukankan      html  css  js  c++  java
  • Transformer(self attention pytorch)代码

    实现细节;

    1.embedding 层

    2.positional encoding层:添加位置信息

    3,MultiHeadAttention层:encoder的self attention

    4,sublayerConnection层:add&norm,使用layerNorm,

    5,FeedForward层:两层全连接

    6,Masked MultiHeadAttention:decoder中的self attention层,添加mask,不考虑计算当前位置的后面信息

    7,MultiHeadAttention层:encoder的输出做key,value,decoder的self attention输出做query,类似于传统attention

    8,generator层:最后的linear和softmax层,转为概率输出

    9,预测时greedy_decode,第一个预测初始化为start字符

      1 #!/usr/bin/env python
      2 # coding: utf-8
      3 
      4 import numpy as np
      5 import torch
      6 import torch.nn as nn
      7 import torch.nn.functional as F
      8 import math
      9 import copy
     10 import time
     11 from torch.autograd import Variable
     12 import matplotlib.pyplot as plt
     13 import seaborn
     14 seaborn.set_context(context="talk")
     15 
     16 
     17 class EncoderDecoder(nn.Module):
     18     """
     19     A standard Encoder-Decoder architecture. Base for this and many
     20     other models.
     21     """
     22 
     23     def __init__(self, encoder, decoder, src_embed, tgt_embed, generator):
     24         super(EncoderDecoder, self).__init__()
     25         self.encoder = encoder
     26         self.decoder = decoder
     27         self.src_embed = src_embed
     28         self.tgt_embed = tgt_embed
     29         self.generator = generator
     30 
     31     def forward(self, src, tgt, src_mask, tgt_mask):
     32         "Take in and process masked src and target sequences."
     33         memory = self.encode(src, src_mask)
     34         ret = self.decode(memory, src_mask, tgt, tgt_mask)
     35         return ret
     36 
     37     def encode(self, src, src_mask):
     38         src_embedding = self.src_embed(src)
     39         ret = self.encoder(src_embedding, src_mask)
     40         return ret
     41 
     42     def decode(self, memory, src_mask, tgt, tgt_mask):
     43         ret = tgt_embdding = self.tgt_embed(tgt)
     44         self.decoder(tgt_embdding, memory, src_mask, tgt_mask)
     45         return ret
     46 
     47 
     48 class Generator(nn.Module):
     49     "Define standard linear + softmax generation step."
     50 
     51     def __init__(self, d_model, vocab):
     52         super(Generator, self).__init__()
     53         self.proj = nn.Linear(d_model, vocab)
     54 
     55     def forward(self, x):
     56         return F.log_softmax(self.proj(x), dim=-1)
     57 
     58 
     59 # The Transformer follows this overall architecture using stacked self-attention and point-wise, fully connected layers for both the encoder and decoder, shown in the left and right halves of Figure 1, respectively.
     60 
     61 # ## Encoder and Decoder Stacks
     62 # ### Encoder
     63 # The encoder is composed of a stack of $N=6$ identical layers.
     64 def clones(module, N):
     65     "Produce N identical layers."
     66     return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])
     67 
     68 
     69 class Encoder(nn.Module):
     70     "Core encoder is a stack of N layers"
     71 
     72     def __init__(self, layer, N):
     73         super(Encoder, self).__init__()
     74         self.layers = clones(layer, N)
     75         self.norm = LayerNorm(layer.size)
     76 
     77     def forward(self, x, mask):
     78         "Pass the input (and mask) through each layer in turn."
     79         for layer in self.layers:
     80             x = layer(x, mask)
     81         return self.norm(x)
     82 
     83 
     84 #layer normalization [(cite)](https://arxiv.org/abs/1607.06450). do on 
     85 class LayerNorm(nn.Module):
     86     "Construct a layernorm module (See citation for details)."
     87     def __init__(self, features, eps=1e-6):
     88         super(LayerNorm, self).__init__()
     89         self.a_2 = nn.Parameter(torch.ones(features))
     90         self.b_2 = nn.Parameter(torch.zeros(features))
     91         self.eps = eps
     92 
     93     def forward(self, x):
     94         mean = x.mean(-1, keepdim=True)
     95         std = x.std(-1, keepdim=True)
     96         return self.a_2 * (x - mean) / (std + self.eps) + self.b_2
     97 
     98 
     99 # That is, the output of each sub-layer is $mathrm{LayerNorm}(x + mathrm{Sublayer}(x))$, where $mathrm{Sublayer}(x)$ is the function implemented by the sub-layer itself.  We apply dropout [(cite)](http://jmlr.org/papers/v15/srivastava14a.html) to the output of each sub-layer, before it is added to the sub-layer input and normalized.
    100 # To facilitate these residual connections, all sub-layers in the model, as well as the embedding layers, produce outputs of dimension $d_{	ext{model}}=512$.
    101 class SublayerConnection(nn.Module):
    102     """
    103     A residual connection followed by a layer norm.
    104     Note for code simplicity the norm is first as opposed to last.
    105     """
    106 
    107     def __init__(self, size, dropout):
    108         super(SublayerConnection, self).__init__()
    109         self.norm = LayerNorm(size)
    110         self.dropout = nn.Dropout(dropout)
    111 
    112     def forward(self, x, sublayer):
    113         "Apply residual connection to any sublayer with the same size."
    114         ret = x + self.dropout(sublayer(self.norm(x)))
    115         return ret
    116 
    117 
    118 # Each layer has two sub-layers. The first is a multi-head self-attention mechanism, and the second is a simple, position-wise fully connected feed-forward network.
    119 class EncoderLayer(nn.Module):
    120     "Encoder is made up of self-attn and feed forward (defined below)"
    121 
    122     def __init__(self, size, self_attn, feed_forward, dropout):
    123         super(EncoderLayer, self).__init__()
    124         self.self_attn = self_attn
    125         self.feed_forward = feed_forward
    126         self.sublayer = clones(SublayerConnection(size, dropout), 2)
    127         self.size = size
    128 
    129     def forward(self, x, mask):
    130         "Follow Figure 1 (left) for connections."
    131         x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, mask))
    132         # torch.Size([30, 10, 512])
    133         ret = self.sublayer[1](x, self.feed_forward)
    134         return ret
    135 
    136 
    137 # ### Decoder
    138 # The decoder is also composed of a stack of $N=6$ identical layers.
    139 class Decoder(nn.Module):
    140     "Generic N layer decoder with masking."
    141 
    142     def __init__(self, layer, N):
    143         super(Decoder, self).__init__()
    144         self.layers = clones(layer, N)
    145         self.norm = LayerNorm(layer.size)
    146 
    147     def forward(self, x, memory, src_mask, tgt_mask):
    148         for layer in self.layers:
    149             x = layer(x, memory, src_mask, tgt_mask)
    150         return self.norm(x)
    151 
    152 
    153 # In addition to the two sub-layers in each encoder layer, the decoder inserts a third sub-layer, which performs multi-head attention over the output of the encoder stack.  Similar to the encoder, we employ residual connections around each of the sub-layers, followed by layer normalization.
    154 class DecoderLayer(nn.Module):
    155     "Decoder is made of self-attn, src-attn, and feed forward (defined below)"
    156 
    157     def __init__(self, size, self_attn, src_attn, feed_forward, dropout):
    158         super(DecoderLayer, self).__init__()
    159         self.size = size
    160         self.self_attn = self_attn
    161         self.src_attn = src_attn
    162         self.feed_forward = feed_forward
    163         self.sublayer = clones(SublayerConnection(size, dropout), 3)
    164 
    165     def forward(self, x, memory, src_mask, tgt_mask):
    166         "Follow Figure 1 (right) for connections."
    167         m = memory
    168         x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, tgt_mask))
    169         x = self.sublayer[1](x, lambda x: self.src_attn(x, m, m, src_mask))
    170         return self.sublayer[2](x, self.feed_forward)
    171 
    172 
    173 # ### Attention
    174 # An attention function can be described as mapping a query and a set of key-value pairs to an output, where the query, keys, values, and output are all vectors.  The output is computed as a weighted sum of the values, where the weight assigned to each value is computed by a compatibility function of the query with the corresponding key.
    175 # We call our particular attention "Scaled Dot-Product Attention".   The input consists of queries and keys of dimension $d_k$, and values of dimension $d_v$.  We compute the dot products of the query with all keys, divide each by $sqrt{d_k}$, and apply a softmax function to obtain the weights on the values.
    176 def attention(query, key, value, mask=None, dropout=None):
    177     "Compute 'Scaled Dot Product Attention'"
    178     # query,key,value:torch.Size([30, 8, 10, 64])
    179     # decoder mask:torch.Size([30, 1, 9, 9])
    180     d_k = query.size(-1)
    181     key_ = key.transpose(-2, -1)  # torch.Size([30, 8, 64, 10])
    182     # torch.Size([30, 8, 10, 10])
    183     scores = torch.matmul(query, key_) / math.sqrt(d_k)
    184     if mask is not None:
    185         # decoder scores:torch.Size([30, 8, 9, 9]),
    186         scores = scores.masked_fill(mask == 0, -1e9)
    187     p_attn = F.softmax(scores, dim=-1)
    188     if dropout is not None:
    189         p_attn = dropout(p_attn)
    190     return torch.matmul(p_attn, value), p_attn
    191 
    192 
    193 class MultiHeadedAttention(nn.Module):
    194     def __init__(self, h, d_model, dropout=0.1):
    195         "Take in model size and number of heads."
    196         super(MultiHeadedAttention, self).__init__()
    197         assert d_model % h == 0
    198         # We assume d_v always equals d_k
    199         self.d_k = d_model // h  # 64=512//8
    200         self.h = h
    201         self.linears = clones(nn.Linear(d_model, d_model), 4)
    202         self.attn = None
    203         self.dropout = nn.Dropout(p=dropout)
    204 
    205     def forward(self, query, key, value, mask=None):
    206         # query,key,value:torch.Size([30, 10, 512])
    207         if mask is not None:
    208             # Same mask applied to all h heads.
    209             mask = mask.unsqueeze(1)
    210         nbatches = query.size(0)
    211         # 1) Do all the linear projections in batch from d_model => h x d_k
    212         query, key, value = [l(x).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)
    213                 for l, x in zip(self.linears, (query, key, value))]  # query,key,value:torch.Size([30, 8, 10, 64])
    214         # 2) Apply attention on all the projected vectors in batch.
    215         x, self.attn = attention(query, key, value, mask=mask,
    216                                  dropout=self.dropout)
    217         # 3) "Concat" using a view and apply a final linear.
    218         x = x.transpose(1, 2).contiguous().view(
    219             nbatches, -1, self.h * self.d_k)
    220         ret = self.linears[-1](x)  # torch.Size([30, 10, 512])
    221         return ret
    222 
    223 
    224 # ### Applications of Attention in our Model
    225 # The Transformer uses multi-head attention in three different ways:
    226 # 1) In "encoder-decoder attention" layers, the queries come from the previous decoder layer, and the memory keys and values come from the output of the encoder.   This allows every position in the decoder to attend over all positions in the input sequence.  This mimics the typical encoder-decoder attention mechanisms in sequence-to-sequence models such as [(cite)](https://arxiv.org/abs/1609.08144).
    227 # 2) The encoder contains self-attention layers.  In a self-attention layer all of the keys, values and queries come from the same place, in this case, the output of the previous layer in the encoder.   Each position in the encoder can attend to all positions in the previous layer of the encoder.
    228 # 3) Similarly, self-attention layers in the decoder allow each position in the decoder to attend to all positions in the decoder up to and including that position.  We need to prevent leftward information flow in the decoder to preserve the auto-regressive property.  We implement this inside of scaled dot-product attention by masking out (setting to $-infty$) all values in the input of the softmax which correspond to illegal connections.
    229 # ## Position-wise Feed-Forward Networks
    230 class PositionwiseFeedForward(nn.Module):
    231     "Implements FFN equation."
    232 
    233     def __init__(self, d_model, d_ff, dropout=0.1):
    234         super(PositionwiseFeedForward, self).__init__()
    235         self.w_1 = nn.Linear(d_model, d_ff)
    236         self.w_2 = nn.Linear(d_ff, d_model)
    237         self.dropout = nn.Dropout(dropout)
    238 
    239     def forward(self, x):
    240         return self.w_2(self.dropout(F.relu(self.w_1(x))))
    241 
    242 
    243 # ## Embeddings and Softmax
    244 # Similarly to other sequence transduction models, we use learned embeddings to convert the input tokens and output tokens to vectors of dimension $d_{	ext{model}}$.  We also use the usual learned linear transformation and softmax function to convert the decoder output to predicted next-token probabilities.  In our model, we share the same weight matrix between the two embedding layers and the pre-softmax linear transformation, similar to [(cite)](https://arxiv.org/abs/1608.05859). In the embedding layers, we multiply those weights by $sqrt{d_{	ext{model}}}$.
    245 class Embeddings(nn.Module):
    246     def __init__(self, d_model, vocab):
    247         super(Embeddings, self).__init__()
    248         self.lut = nn.Embedding(vocab, d_model)  # Embedding(11, 512)
    249         self.d_model = d_model
    250 
    251     def forward(self, x):
    252         return self.lut(x) * math.sqrt(self.d_model)
    253 
    254 
    255 # ## Positional Encoding
    256 class PositionalEncoding(nn.Module):
    257     "Implement the PE function."
    258 
    259     def __init__(self, d_model, dropout, max_len=5000):
    260         super(PositionalEncoding, self).__init__()
    261         self.dropout = nn.Dropout(p=dropout)
    262 
    263         # Compute the positional encodings once in log space.
    264         pe = torch.zeros(max_len, d_model)
    265         position = torch.arange(0., max_len).unsqueeze(1)
    266         div_term = torch.exp(torch.arange(0., d_model, 2)
    267                              * -(math.log(10000.0) / d_model))
    268 
    269         pe[:, 0::2] = torch.sin(position * div_term)
    270         pe[:, 1::2] = torch.cos(position * div_term)
    271         pe = pe.unsqueeze(0)
    272         self.register_buffer('pe', pe)
    273 
    274     def forward(self, x):
    275         x = x + Variable(self.pe[:, :x.size(1)],
    276                          requires_grad=False)
    277         return self.dropout(x)
    278 
    279 
    280 # We also experimented with using learned positional embeddings [(cite)](https://arxiv.org/pdf/1705.03122.pdf) instead, and found that the two versions produced nearly identical results.  We chose the sinusoidal version because it may allow the model to extrapolate to sequence lengths longer than the ones encountered during training.
    281 # ## Full Model
    282 def make_model(src_vocab, tgt_vocab, N=6,
    283                d_model=512, d_ff=2048, h=8, dropout=0.1):
    284     "Helper: Construct a model from hyperparameters."
    285     c = copy.deepcopy
    286     attn = MultiHeadedAttention(h, d_model)
    287     ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    288     position = PositionalEncoding(d_model, dropout)
    289     model = EncoderDecoder(
    290         Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
    291         Decoder(DecoderLayer(d_model, c(attn), c(attn),
    292                              c(ff), dropout), N),
    293         nn.Sequential(Embeddings(d_model, src_vocab), c(position)),
    294         nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),
    295         Generator(d_model, tgt_vocab))
    296 
    297     # This was important from their code.
    298     # Initialize parameters with Glorot / fan_avg.
    299     for p in model.parameters():
    300         if p.dim() > 1:
    301             nn.init.xavier_uniform_(p)
    302     return model
    303 
    304 # We also modify the self-attention sub-layer in the decoder stack to prevent positions from attending to subsequent positions.  This masking, combined with fact that the output embeddings are offset by one position, ensures that the predictions for position $i$ can depend only on the known outputs at positions less than $i$.
    305 
    306 
    307 def subsequent_mask(size):
    308     "Mask out subsequent positions when decoding."
    309     attn_shape = (1, size, size)
    310     subsequent_mask = np.triu(np.ones(attn_shape), k=1).astype('uint8')
    311     return torch.from_numpy(subsequent_mask) == 0
    312 
    313 # # Training
    314 # This section describes the training regime for our models.
    315 # > We stop for a quick interlude to introduce some of the tools
    316 # needed to train a standard encoder decoder model. First we define a batch object that holds the src and target sentences for training, as well as constructing the masks.
    317 # ## Batches and Masking
    318 
    319 
    320 class Batch:
    321     "Object for holding a batch of data with mask during training."
    322 
    323     def __init__(self, src, trg=None, pad=0):
    324         self.src = src
    325         self.src_mask = (src != pad).unsqueeze(-2)
    326         if trg is not None:
    327             self.trg = trg[:, :-1]
    328             self.trg_y = trg[:, 1:]
    329             self.trg_mask = self.make_std_mask(self.trg, pad)
    330             self.ntokens = (self.trg_y != pad).data.sum()
    331 
    332     @staticmethod
    333     def make_std_mask(tgt, pad):
    334         "Create a mask to hide padding and future words."
    335         tgt_mask = (tgt != pad).unsqueeze(-2)
    336         tgt_mask = tgt_mask & Variable(
    337             subsequent_mask(tgt.size(-1)).type_as(tgt_mask.data))
    338         return tgt_mask
    339 
    340 # Next we create a generic training and scoring function to keep track of loss. We pass in a generic loss compute function that also handles parameter updates.
    341 def run_epoch(data_iter, model, loss_compute):
    342     "Standard Training and Logging Function"
    343     start = time.time()
    344     total_tokens = 0
    345     total_loss = 0
    346     tokens = 0
    347     for i, batch in enumerate(data_iter):
    348         out = model.forward(batch.src, batch.trg, 
    349                             batch.src_mask, batch.trg_mask)#torch.Size([30, 10]),torch.Size([30, 9]),torch.Size([30, 1, 10]),torch.Size([30, 9, 9])
    350         
    351         
    352         loss = loss_compute(out, batch.trg_y, batch.ntokens)
    353         total_loss += loss
    354         total_tokens += batch.ntokens
    355         tokens += batch.ntokens
    356         if i % 50 == 1:
    357             elapsed = time.time() - start
    358             print("Step: %d Loss: %f" %
    359                     (i, loss / batch.ntokens))
    360             start = time.time()
    361             tokens = 0
    362 
    363     return total_loss / total_tokens
    364 
    365 
    366 # ## Optimizer
    367 class NoamOpt:
    368     "Optim wrapper that implements rate."
    369     def __init__(self, model_size, factor, warmup, optimizer):
    370         self.optimizer = optimizer
    371         self._step = 0
    372         self.warmup = warmup
    373         self.factor = factor
    374         self.model_size = model_size
    375         self._rate = 0
    376         
    377     def step(self):
    378         "Update parameters and rate"
    379         self._step += 1
    380         rate = self.rate()
    381         for p in self.optimizer.param_groups:
    382             p['lr'] = rate
    383         self._rate = rate
    384         self.optimizer.step()
    385         
    386     def rate(self, step = None):
    387         "Implement `lrate` above"
    388         if step is None:
    389             step = self._step
    390         return self.factor *(self.model_size ** (-0.5) *min(step ** (-0.5), step * self.warmup ** (-1.5)))
    391         
    392 def get_std_opt(model):
    393     return NoamOpt(model.src_embed[0].d_model, 2, 4000,
    394             torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))
    395 # Three settings of the lrate hyperparameters.
    396 opts = [NoamOpt(512, 1, 4000, None), 
    397         NoamOpt(512, 1, 8000, None),
    398         NoamOpt(256, 1, 4000, None)]
    399 
    400 # ## Regularization                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        
    401 # ### Label Smoothing
    402 # During training, we employed label smoothing . This hurts perplexity, as the model learns to be more unsure, but improves accuracy and BLEU score.  
    403 class LabelSmoothing(nn.Module):
    404     "Implement label smoothing."
    405     def __init__(self, size, padding_idx, smoothing=0.0):
    406         super(LabelSmoothing, self).__init__()
    407         self.criterion = nn.KLDivLoss(size_average=False)
    408         self.padding_idx = padding_idx
    409         self.confidence = 1.0 - smoothing
    410         self.smoothing = smoothing
    411         self.size = size
    412         self.true_dist = None
    413         
    414     def forward(self, x, target):
    415         assert x.size(1) == self.size
    416         true_dist = x.data.clone()
    417         true_dist.fill_(self.smoothing / (self.size - 2))
    418         true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)
    419         true_dist[:, self.padding_idx] = 0
    420         mask = torch.nonzero(target.data == self.padding_idx)
    421         if mask.dim() > 0:
    422             true_dist.index_fill_(0, mask.squeeze(), 0.0)
    423         self.true_dist = true_dist
    424         return self.criterion(x, Variable(true_dist, requires_grad=False))
    425 
    426 
    427 # > Here we can see an example of how the mass is distributed to the words based on confidence. 
    428 # crit = LabelSmoothing(5, 0, 0.4)
    429 # predict = torch.FloatTensor([[0, 0.2, 0.7, 0.1, 0],
    430 #                              [0, 0.2, 0.7, 0.1, 0], 
    431 #                              [0, 0.2, 0.7, 0.1, 0]])
    432 # v = crit(Variable(predict.log()), 
    433 #          Variable(torch.LongTensor([2, 1, 0])))
    434 
    435 
    436 # crit = LabelSmoothing(5, 0, 0.1)
    437 # def loss(x):
    438 #     d = x + 3 * 1
    439 #     predict = torch.FloatTensor([[0, x / d, 1 / d, 1 / d, 1 / d],
    440 #                                  ])
    441 #     # print(predict)
    442 #     return crit(Variable(predict.log()),
    443 #                  Variable(torch.LongTensor([1]))).item()
    444 
    445 # # A First  Example
    446 # > We can begin by trying out a simple copy-task. Given a random set of input symbols from a small vocabulary, the goal is to generate back those same symbols. 
    447 # ## Synthetic Data
    448 def data_gen(V, batch, nbatches):
    449     "Generate random data for a src-tgt copy task."
    450     for i in range(nbatches):
    451         data = torch.from_numpy(np.random.randint(1, V, size=(batch, 10)))#torch.Size([30, 10])
    452         data[:, 0] = 1 #start
    453         src = Variable(data, requires_grad=False)
    454         tgt = Variable(data, requires_grad=False)
    455         yield Batch(src, tgt, 0)
    456 # data_gen(11,30,20)
    457 
    458 
    459 # ## Loss Computation
    460 class SimpleLossCompute:
    461     "A simple loss compute and train function."
    462     def __init__(self, generator, criterion, opt=None):
    463         self.generator = generator
    464         self.criterion = criterion
    465         self.opt = opt
    466         
    467     def __call__(self, x, y, norm):
    468         x = self.generator(x)
    469         loss = self.criterion(x.contiguous().view(-1, x.size(-1)), 
    470                               y.contiguous().view(-1)) / norm
    471         loss.backward()
    472         if self.opt is not None:
    473             self.opt.step()
    474             self.opt.optimizer.zero_grad()
    475         return loss.item() * norm
    476 
    477 
    478 # ## Greedy Decoding
    479 # Train the simple copy task.
    480 V = 11
    481 criterion = LabelSmoothing(size=V, padding_idx=0, smoothing=0.0)
    482 model = make_model(V, V, N=2)
    483 model_opt = NoamOpt(model.src_embed[0].d_model, 1, 400,
    484         torch.optim.Adam(model.parameters(), lr=0.01, betas=(0.9, 0.98), eps=1e-9))
    485 
    486 for epoch in range(5):
    487     model.train()
    488     run_epoch(data_gen(V, 30, 20), model, 
    489               SimpleLossCompute(model.generator, criterion, model_opt))
    490     model.eval()
    491     print(run_epoch(data_gen(V, 30, 5), model, 
    492                     SimpleLossCompute(model.generator, criterion, None)))
    493 
    494 
    495 #This code predicts a translation using greedy decoding for simplicity. 
    496 def greedy_decode(model, src, src_mask, max_len, start_symbol):
    497     memory = model.encode(src, src_mask)
    498     ys = torch.ones(1, 1).fill_(start_symbol).type_as(src.data)#fill start symbol
    499     for i in range(max_len-1):
    500         out = model.decode(memory, src_mask, 
    501                            Variable(ys), 
    502                            Variable(subsequent_mask(ys.size(1))
    503                                     .type_as(src.data)))
    504         prob = model.generator(out[:, -1])
    505         _, next_word = torch.max(prob, dim = 1)
    506         next_word = next_word.data[0]
    507         ys = torch.cat([ys, 
    508                         torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=1)
    509     return ys
    510 
    511 model.eval()
    512 src = Variable(torch.LongTensor([[1,2,3,4,5,6,7,8,9,10]]) )
    513 src_mask = Variable(torch.ones(1, 1, 10) )
    514 print(greedy_decode(model, src, src_mask, max_len=10, start_symbol=1))
    515 
    516 
    517 '''
    518 # # A Real World Example
    519 # 
    520 # > Now we consider a real-world example using the IWSLT German-English Translation task. This task is much smaller than the WMT task considered in the paper, but it illustrates the whole system. We also show how to use multi-gpu processing to make it really fast.
    521 
    522 #!pip install torchtext spacy
    523 #!python -m spacy download en
    524 #!python -m spacy download de
    525 
    526 
    527 # ## Training Data and Batching
    528 global max_src_in_batch, max_tgt_in_batch
    529 def batch_size_fn(new, count, sofar):
    530     "Keep augmenting batch and calculate total number of tokens + padding."
    531     global max_src_in_batch, max_tgt_in_batch
    532     if count == 1:
    533         max_src_in_batch = 0
    534         max_tgt_in_batch = 0
    535     max_src_in_batch = max(max_src_in_batch,  len(new.src))
    536     max_tgt_in_batch = max(max_tgt_in_batch,  len(new.trg) + 2)
    537     src_elements = count * max_src_in_batch
    538     tgt_elements = count * max_tgt_in_batch
    539 
    540     return max(src_elements, tgt_elements)
    541 
    542 # ## Data Loading
    543 # > We will load the dataset using torchtext and spacy for tokenization. 
    544 
    545 # For data loading.
    546 from torchtext import data, datasets
    547 
    548 if True:
    549     import spacy
    550     spacy_de = spacy.load('de')
    551     spacy_en = spacy.load('en')
    552 
    553     def tokenize_de(text):
    554         return [tok.text for tok in spacy_de.tokenizer(text)]
    555 
    556     def tokenize_en(text):
    557         return [tok.text for tok in spacy_en.tokenizer(text)]
    558 
    559     BOS_WORD = '<s>'
    560     EOS_WORD = '</s>'
    561     BLANK_WORD = "<blank>"
    562     SRC = data.Field(tokenize=tokenize_de, pad_token=BLANK_WORD)
    563     TGT = data.Field(tokenize=tokenize_en, init_token = BOS_WORD, 
    564                      eos_token = EOS_WORD, pad_token=BLANK_WORD)
    565 
    566     MAX_LEN = 100
    567     train, val, test = datasets.IWSLT.splits(
    568         exts=('.de', '.en'), fields=(SRC, TGT), 
    569         filter_pred=lambda x: len(vars(x)['src']) <= MAX_LEN and 
    570             len(vars(x)['trg']) <= MAX_LEN)
    571     MIN_FREQ = 2
    572     SRC.build_vocab(train.src, min_freq=MIN_FREQ)
    573     TGT.build_vocab(train.trg, min_freq=MIN_FREQ)
    574 
    575 
    576 # > Batching matters a ton for speed. We want to have very evenly divided batches, with absolutely minimal padding. To do this we have to hack a bit around the default torchtext batching. This code patches their default batching to make sure we search over enough sentences to find tight batches. 
    577 # ## Iterators
    578 
    579 class MyIterator(data.Iterator):
    580     def create_batches(self):
    581         if self.train:
    582             def pool(d, random_shuffler):
    583                 for p in data.batch(d, self.batch_size * 100):
    584                     p_batch = data.batch(
    585                         sorted(p, key=self.sort_key),
    586                         self.batch_size, self.batch_size_fn)
    587                     for b in random_shuffler(list(p_batch)):
    588                         yield b
    589             self.batches = pool(self.data(), self.random_shuffler)
    590             
    591         else:
    592             self.batches = []
    593             for b in data.batch(self.data(), self.batch_size,
    594                                           self.batch_size_fn):
    595                 self.batches.append(sorted(b, key=self.sort_key))
    596 
    597 def rebatch(pad_idx, batch):
    598     "Fix order in torchtext to match ours"
    599     src, trg = batch.src.transpose(0, 1), batch.trg.transpose(0, 1)
    600     return Batch(src, trg, pad_idx)
    601 
    602 
    603 # ## Multi-GPU Training
    604 # > Finally to really target fast training, we will use multi-gpu. This code implements multi-gpu word generation. It is not specific to transformer so I won't go into too much detail. The idea is to split up word generation at training time into chunks to be processed in parallel across many different gpus. We do this using pytorch parallel primitives:
    605 # 
    606 # * replicate - split modules onto different gpus.
    607 # * scatter - split batches onto different gpus
    608 # * parallel_apply - apply module to batches on different gpus
    609 # * gather - pull scattered data back onto one gpu. 
    610 # * nn.DataParallel - a special module wrapper that calls these all before evaluating. 
    611 # 
    612 
    613 # Skip if not interested in multigpu.
    614 class MultiGPULossCompute:
    615     "A multi-gpu loss compute and train function."
    616     def __init__(self, generator, criterion, devices, opt=None, chunk_size=5):
    617         # Send out to different gpus.
    618         self.generator = generator
    619         self.criterion = nn.parallel.replicate(criterion, 
    620                                                devices=devices)
    621         self.opt = opt
    622         self.devices = devices
    623         self.chunk_size = chunk_size
    624         
    625     def __call__(self, out, targets, normalize):
    626         total = 0.0
    627         generator = nn.parallel.replicate(self.generator, 
    628                                                 devices=self.devices)
    629         out_scatter = nn.parallel.scatter(out, 
    630                                           target_gpus=self.devices)
    631         out_grad = [[] for _ in out_scatter]
    632         targets = nn.parallel.scatter(targets, 
    633                                       target_gpus=self.devices)
    634 
    635         # Divide generating into chunks.
    636         chunk_size = self.chunk_size
    637         for i in range(0, out_scatter[0].size(1), chunk_size):
    638             # Predict distributions
    639             out_column = [[Variable(o[:, i:i+chunk_size].data, 
    640                                     requires_grad=self.opt is not None)] 
    641                            for o in out_scatter]
    642             gen = nn.parallel.parallel_apply(generator, out_column)
    643 
    644             # Compute loss. 
    645             y = [(g.contiguous().view(-1, g.size(-1)), 
    646                   t[:, i:i+chunk_size].contiguous().view(-1)) 
    647                  for g, t in zip(gen, targets)]
    648             loss = nn.parallel.parallel_apply(self.criterion, y)
    649 
    650             # Sum and normalize loss
    651             l = nn.parallel.gather(loss, 
    652                                    target_device=self.devices[0])
    653             l = l.sum()[0] / normalize
    654             total += l.data[0]
    655 
    656             # Backprop loss to output of transformer
    657             if self.opt is not None:
    658                 l.backward()
    659                 for j, l in enumerate(loss):
    660                     out_grad[j].append(out_column[j][0].grad.data.clone())
    661 
    662         # Backprop all loss through transformer.            
    663         if self.opt is not None:
    664             out_grad = [Variable(torch.cat(og, dim=1)) for og in out_grad]
    665             o1 = out
    666             o2 = nn.parallel.gather(out_grad, 
    667                                     target_device=self.devices[0])
    668             o1.backward(gradient=o2)
    669             self.opt.step()
    670             self.opt.optimizer.zero_grad()
    671         return total * normalize
    672 
    673 
    674 # > Now we create our model, criterion, optimizer, data iterators, and paralelization
    675 # GPUs to use
    676 devices = [0, 1, 2, 3]
    677 if True:
    678     pad_idx = TGT.vocab.stoi["<blank>"]
    679     model = make_model(len(SRC.vocab), len(TGT.vocab), N=6)
    680     model.cuda()
    681     criterion = LabelSmoothing(size=len(TGT.vocab), padding_idx=pad_idx, smoothing=0.1)
    682     criterion.cuda()
    683     BATCH_SIZE = 12000
    684     train_iter = MyIterator(train, batch_size=BATCH_SIZE, device=0,
    685                             repeat=False, sort_key=lambda x: (len(x.src), len(x.trg)),
    686                             batch_size_fn=batch_size_fn, train=True)
    687     valid_iter = MyIterator(val, batch_size=BATCH_SIZE, device=0,
    688                             repeat=False, sort_key=lambda x: (len(x.src), len(x.trg)),
    689                             batch_size_fn=batch_size_fn, train=False)
    690     model_par = nn.DataParallel(model, device_ids=devices)
    691 None
    692 
    693 
    694 # > Now we train the model. I will play with the warmup steps a bit, but everything else uses the default parameters.  On an AWS p3.8xlarge with 4 Tesla V100s, this runs at ~27,000 tokens per second with a batch size of 12,000 
    695 # ## Training the System
    696 #!wget https://s3.amazonaws.com/opennmt-models/iwslt.pt
    697 
    698 if False:
    699     model_opt = NoamOpt(model.src_embed[0].d_model, 1, 2000,
    700             torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))
    701     for epoch in range(10):
    702         model_par.train()
    703         run_epoch((rebatch(pad_idx, b) for b in train_iter), 
    704                   model_par, 
    705                   MultiGPULossCompute(model.generator, criterion, 
    706                                       devices=devices, opt=model_opt))
    707         model_par.eval()
    708         loss = run_epoch((rebatch(pad_idx, b) for b in valid_iter), 
    709                           model_par, 
    710                           MultiGPULossCompute(model.generator, criterion, 
    711                           devices=devices, opt=None))
    712         print(loss)
    713 else:
    714     model = torch.load("iwslt.pt")
    715 
    716 
    717 # > Once trained we can decode the model to produce a set of translations. Here we simply translate the first sentence in the validation set. This dataset is pretty small so the translations with greedy search are reasonably accurate. 
    718 
    719 for i, batch in enumerate(valid_iter):
    720     src = batch.src.transpose(0, 1)[:1]
    721     src_mask = (src != SRC.vocab.stoi["<blank>"]).unsqueeze(-2)
    722     out = greedy_decode(model, src, src_mask, 
    723                         max_len=60, start_symbol=TGT.vocab.stoi["<s>"])
    724     print("Translation:", end="	")
    725     for i in range(1, out.size(1)):
    726         sym = TGT.vocab.itos[out[0, i]]
    727         if sym == "</s>": break
    728         print(sym, end =" ")
    729     print()
    730     print("Target:", end="	")
    731     for i in range(1, batch.trg.size(0)):
    732         sym = TGT.vocab.itos[batch.trg.data[i, 0]]
    733         if sym == "</s>": break
    734         print(sym, end =" ")
    735     print()
    736     break
    737 
    738 
    739 # # Additional Components: BPE, Search, Averaging
    740 
    741 # > So this mostly covers the transformer model itself. There are four aspects that we didn't cover explicitly. We also have all these additional features implemented in [OpenNMT-py](https://github.com/opennmt/opennmt-py).
    742 # 
    743 # 
    744 
    745 # > 1) BPE/ Word-piece: We can use a library to first preprocess the data into subword units. See Rico Sennrich's [subword-nmt](https://github.com/rsennrich/subword-nmt) implementation. These models will transform the training data to look like this:
    746 # ▁Die ▁Protokoll datei ▁kann ▁ heimlich ▁per ▁E - Mail ▁oder ▁FTP ▁an ▁einen ▁bestimmte n ▁Empfänger ▁gesendet ▁werden .
    747 # > 2) Shared Embeddings: When using BPE with shared vocabulary we can share the same weight vectors between the source / target / generator. See the [(cite)](https://arxiv.org/abs/1608.05859) for details. To add this to the model simply do this:
    748 
    749 if False:
    750     model.src_embed[0].lut.weight = model.tgt_embeddings[0].lut.weight
    751     model.generator.lut.weight = model.tgt_embed[0].lut.weight
    752 
    753 
    754 # > 3) Beam Search: This is a bit too complicated to cover here. See the [OpenNMT-py](https://github.com/OpenNMT/OpenNMT-py/blob/master/onmt/translate/Beam.py) for a pytorch implementation.
    755 # > 4) Model Averaging: The paper averages the last k checkpoints to create an ensembling effect. We can do this after the fact if we have a bunch of models:
    756 
    757 def average(model, models):
    758     "Average models into model"
    759     for ps in zip(*[m.params() for m in [model] + models]):
    760         p[0].copy_(torch.sum(*ps[1:]) / len(ps[1:]))
    761 
    762 
    763 # # Results
    764 # 
    765 # On the WMT 2014 English-to-German translation task, the big transformer model (Transformer (big)
    766 # in Table 2) outperforms the best previously reported models (including ensembles) by more than 2.0
    767 # BLEU, establishing a new state-of-the-art BLEU score of 28.4. The configuration of this model is
    768 # listed in the bottom line of Table 3. Training took 3.5 days on 8 P100 GPUs. Even our base model
    769 # surpasses all previously published models and ensembles, at a fraction of the training cost of any of
    770 # the competitive models.
    771 # 
    772 # On the WMT 2014 English-to-French translation task, our big model achieves a BLEU score of 41.0,
    773 # outperforming all of the previously published single models, at less than 1/4 the training cost of the
    774 # previous state-of-the-art model. The Transformer (big) model trained for English-to-French used
    775 # dropout rate Pdrop = 0.1, instead of 0.3.
    776 # 
    777 # 
    778 
    779 
    780 # > The code we have written here is a version of the base model. There are fully trained version of this system available here  [(Example Models)](http://opennmt.net/Models-py/).
    781 # >
    782 # > With the addtional extensions in the last section, the OpenNMT-py replication gets to 26.9 on EN-DE WMT. Here I have loaded in those parameters to our reimplemenation. 
    783 
    784 get_ipython().system('wget https://s3.amazonaws.com/opennmt-models/en-de-model.pt')
    785 
    786 model, SRC, TGT = torch.load("en-de-model.pt")
    787 
    788 model.eval()
    789 sent = "▁The ▁log ▁file ▁can ▁be ▁sent ▁secret ly ▁with ▁email ▁or ▁FTP ▁to ▁a ▁specified ▁receiver".split()
    790 src = torch.LongTensor([[SRC.stoi[w] for w in sent]])
    791 src = Variable(src)
    792 src_mask = (src != SRC.stoi["<blank>"]).unsqueeze(-2)
    793 out = greedy_decode(model, src, src_mask, 
    794                     max_len=60, start_symbol=TGT.stoi["<s>"])
    795 print("Translation:", end="	")
    796 trans = "<s> "
    797 for i in range(1, out.size(1)):
    798     sym = TGT.itos[out[0, i]]
    799     if sym == "</s>": break
    800     trans += sym + " "
    801 print(trans)
    802 
    803 
    804 # ## Attention Visualization
    805 # 
    806 # > Even with a greedy decoder the translation looks pretty good. We can further visualize it to see what is happening at each layer of the attention 
    807 
    808 tgt_sent = trans.split()
    809 def draw(data, x, y, ax):
    810     seaborn.heatmap(data, 
    811                     xticklabels=x, square=True, yticklabels=y, vmin=0.0, vmax=1.0, 
    812                     cbar=False, ax=ax)
    813     
    814 for layer in range(1, 6, 2):
    815     fig, axs = plt.subplots(1,4, figsize=(20, 10))
    816     print("Encoder Layer", layer+1)
    817     for h in range(4):
    818         draw(model.encoder.layers[layer].self_attn.attn[0, h].data, 
    819             sent, sent if h ==0 else [], ax=axs[h])
    820     plt.show()
    821     
    822 for layer in range(1, 6, 2):
    823     fig, axs = plt.subplots(1,4, figsize=(20, 10))
    824     print("Decoder Self Layer", layer+1)
    825     for h in range(4):
    826         draw(model.decoder.layers[layer].self_attn.attn[0, h].data[:len(tgt_sent), :len(tgt_sent)], 
    827             tgt_sent, tgt_sent if h ==0 else [], ax=axs[h])
    828     plt.show()
    829     print("Decoder Src Layer", layer+1)
    830     fig, axs = plt.subplots(1,4, figsize=(20, 10))
    831     for h in range(4):
    832         draw(model.decoder.layers[layer].self_attn.attn[0, h].data[:len(tgt_sent), :len(sent)], 
    833             sent, tgt_sent if h ==0 else [], ax=axs[h])
    834     plt.show()
    835 
    836 '''
  • 相关阅读:
    CMS 阶段性了结
    Java 代码监控 JVM 运行状态 —— 记一次 JVM 调优的毛招
    ClassLoader PPT 总结
    记一次 CMS 回收异常问题 —— 跨代引用和循环依赖
    读懂 GC 日志
    ClassLoader 学习笔记
    java 可见性简单总结
    Kafka 基本原理整理
    秒杀系统个人总结
    centos 7 mongodb4.0 安装配置
  • 原文地址:https://www.cnblogs.com/buyizhiyou/p/11649817.html
Copyright © 2011-2022 走看看