直接看这个-->Github
导包:
import re
import math
import torch
import numpy as np
from random import *
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as Data
1. 数据预处理
1.1 构造单词表和映射
# Toy corpus: a short dialogue between Romeo (R) and Juliet (J), one
# utterance per line.
text = (
    'Hello, how are you? I am Romeo.\n'  # R
    'Hello, Romeo My name is Juliet. Nice to meet you.\n'  # J
    'Nice to meet you too. How are you today?\n'  # R
    'Great. My baseball team won the competition.\n'  # J
    'Oh Congratulations, Juliet\n'  # R
    'Thank you Romeo\n'  # J
    'Where are you going today?\n'  # R
    'I am going shopping. What about you?\n'  # J
    'I am going to visit my grandmother. she is not very well'  # R
)

# Lower-case the corpus, strip '.', ',', '!', '?' and '-', then split it into
# one cleaned sentence per line.
sentences = re.sub(r"[.,!?\-]", '', text.lower()).split('\n')

# Unique words of the whole corpus. Sorted so that the word -> id mapping is
# identical on every run (plain set iteration order changes between runs).
word_list = sorted(set(" ".join(sentences).split()))

# Ids 0-3 are reserved for the special tokens; real words start at id 4.
word2idx = {'[PAD]': 0, '[CLS]': 1, '[SEP]': 2, '[MASK]': 3}
word2idx.update((word, idx) for idx, word in enumerate(word_list, start=4))

# Inverse mapping, used to turn ids back into words.
idx2word = {idx: word for word, idx in word2idx.items()}
vocab_size = len(word2idx)  # 40 for this corpus (36 words + 4 special tokens)

# token_list[k] holds the vocabulary ids of the words in sentences[k].
token_list = [[word2idx[word] for word in sentence.split()]
              for sentence in sentences]
展示一下:
# Show the second cleaned sentence followed by its token ids, one per line.
# Expected first line: hello romeo my name is juliet nice to meet you
# The ids on the second line depend on how the vocabulary was ordered.
print(sentences[1], token_list[1], sep='\n')
1.2 设置超参数
# --- Batching -------------------------------------------------------------
maxlen = 30      # every sequence is padded to this length (seq_len below)
batch_size = 6

# --- BERT model parameters --------------------------------------------------
max_pred = 5     # maximum number of masked tokens predicted per sequence
n_layers = 6     # number of Transformer encoder layers
n_heads = 12     # number of attention heads
d_model = 768    # embedding dimension
d_ff = 4 * d_model  # feed-forward dimension (3072)
d_k = 64         # per-head dimension of K (= Q): 768 // 12 = 64
d_v = d_k        # per-head dimension of V, same as d_k
n_segments = 2   # number of sentence segments
2.实现Dataloader
2.1生成data
- 选中语料中所有词的15%进行随机mask
- 在确定要Mask掉的单词之后:
  - 选中的单词,在80%的概率下被用 [MASK] 来代替
  - 选中的单词,在10%的概率下不做mask,用任意非标记词代替
  - 选中的单词,在10%的概率下不做mask,仍然保留原来真实的词
# Sample IsNext and NotNext pairs in equal numbers so that even a small batch
# is balanced.
def make_data():
    """Build one balanced batch of BERT pre-training samples.

    Sentence pairs are drawn at random from the module-level ``token_list``
    until the batch holds ``batch_size // 2`` IsNext pairs (sentence B directly
    follows sentence A in the corpus) and ``batch_size - batch_size // 2``
    NotNext pairs. Each pair is turned into a ``[CLS] A [SEP] B [SEP]``
    sequence, about 15% of its ordinary tokens are selected for the masked-LM
    task, and everything is padded to a fixed size.

    Reads the module-level names ``sentences``, ``token_list``, ``word2idx``,
    ``vocab_size``, ``batch_size``, ``max_pred`` and ``maxlen``.

    Returns:
        list: ``batch_size`` samples, each of the form
        ``[input_ids, segment_ids, masked_tokens, masked_pos, is_next]``.
        ``input_ids`` and ``segment_ids`` have length ``maxlen``;
        ``masked_tokens`` and ``masked_pos`` have length ``max_pred``
        (zero-padded); ``is_next`` is a bool.

    Raises:
        ValueError: if IsNext pairs are requested but the corpus has fewer
            than two sentences, or if a sentence pair with its special tokens
            is longer than ``maxlen``.
    """
    cls_id = word2idx['[CLS]']
    sep_id = word2idx['[SEP]']
    mask_id = word2idx['[MASK]']
    pad_id = word2idx['[PAD]']

    # Integer quotas: comparing counters against ``batch_size / 2`` never
    # terminates when batch_size is odd.
    want_positive = batch_size // 2
    want_negative = batch_size - want_positive
    if want_positive and len(sentences) < 2:
        raise ValueError('need at least two sentences to sample IsNext pairs')

    batch = []
    positive = negative = 0
    while positive < want_positive or negative < want_negative:
        # ===================== BERT input representation =====================
        # Pick two random sentence indices.
        tokens_a_index = randrange(len(sentences))
        tokens_b_index = randrange(len(sentences))
        is_next = tokens_a_index + 1 == tokens_b_index
        # Skip early when the quota for this label is already full, instead of
        # building a sample that would be thrown away.
        if is_next and positive >= want_positive:
            continue
        if not is_next and negative >= want_negative:
            continue

        tokens_a, tokens_b = token_list[tokens_a_index], token_list[tokens_b_index]
        # Token ids (no word-piece splitting): [CLS] A [SEP] B [SEP]
        input_ids = [cls_id] + tokens_a + [sep_id] + tokens_b + [sep_id]
        # Segment ids: 0 for [CLS] A [SEP], 1 for B [SEP]
        segment_ids = [0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1)
        if len(input_ids) > maxlen:
            raise ValueError(
                'sentence pair needs %d tokens but maxlen is %d'
                % (len(input_ids), maxlen)
            )

        # ============================= Masked LM =============================
        # Mask 15% of the sequence: at least one token, at most max_pred.
        n_pred = min(max_pred, max(1, int(len(input_ids) * 0.15)))
        # Candidate positions: everything except [CLS] and [SEP].
        cand_maked_pos = [i for i, token in enumerate(input_ids)
                          if token != cls_id and token != sep_id]
        shuffle(cand_maked_pos)
        masked_tokens, masked_pos = [], []  # original ids / their positions
        for pos in cand_maked_pos[:n_pred]:
            masked_pos.append(pos)
            masked_tokens.append(input_ids[pos])
            # Draw once so the three outcomes really are 80% / 10% / 10%.
            roll = random()
            if roll < 0.8:
                # 80%: replace with [MASK]
                input_ids[pos] = mask_id
            elif roll < 0.9:
                # 10%: replace with a random ordinary word; ids below 4 are
                # [PAD], [CLS], [SEP], [MASK] and must not be chosen.
                input_ids[pos] = randint(4, vocab_size - 1)
            # remaining 10%: keep the original token unchanged

        # ============================== Padding ==============================
        # Pad token ids and segment ids to maxlen. extend() needs a list:
        # passing ``pad_id * n_pad`` (an int) raises TypeError.
        n_pad = maxlen - len(input_ids)
        input_ids.extend([pad_id] * n_pad)
        segment_ids.extend([0] * n_pad)

        # Zero-pad the masked-LM targets to max_pred, based on how many tokens
        # were actually masked.
        n_pad = max_pred - len(masked_tokens)
        masked_tokens.extend([0] * n_pad)
        masked_pos.extend([0] * n_pad)

        batch.append([input_ids, segment_ids, masked_tokens, masked_pos, is_next])
        if is_next:
            positive += 1
        else:
            negative += 1
    return batch
调用上面函数:(一个batch的数据)
# Build the data for one batch.
batch = make_data()
# Transpose the list of samples into one tuple per field.
input_ids, segment_ids, masked_tokens, masked_pos, isNext = zip(*batch)
# Convert every field to a LongTensor. The parentheses let the assignment
# span several lines without backslash continuations.
input_ids, segment_ids, masked_tokens, masked_pos, isNext = (
    torch.LongTensor(input_ids),
    torch.LongTensor(segment_ids),
    torch.LongTensor(masked_tokens),
    torch.LongTensor(masked_pos),
    torch.LongTensor(isNext),
)
2.2 实现DataLoader
- 为了使用dataloader,我们需要定义以下两个function:
  - `__len__`:需要返回整个数据集中有多少个item
  - `__getitem__`:根据给定的index返回一个item
- 有了dataloader之后,我们可以轻松随机打乱整个数据集,拿到一个batch的数据等等。
class MyDataSet(Data.Dataset):
    """Dataset wrapping the five parallel fields of the pre-training samples.

    Args:
        input_ids: per-sample token ids.
        segment_ids: per-sample segment ids.
        masked_tokens: per-sample original ids of the masked tokens.
        masked_pos: per-sample positions of the masked tokens.
        isNext: per-sample next-sentence label.

    Each argument may be a nested sequence or an existing tensor; all are
    stored as ``torch.long`` tensors.

    Raises:
        ValueError: if the five fields do not contain the same number of
            samples.
    """

    def __init__(self, input_ids, segment_ids, masked_tokens, masked_pos, isNext):
        # as_tensor accepts lists, tuples and tensors alike, and does not copy
        # data that is already a long tensor.
        self.input_ids = torch.as_tensor(input_ids, dtype=torch.long)
        self.segment_ids = torch.as_tensor(segment_ids, dtype=torch.long)
        self.masked_tokens = torch.as_tensor(masked_tokens, dtype=torch.long)
        self.masked_pos = torch.as_tensor(masked_pos, dtype=torch.long)
        self.isNext = torch.as_tensor(isNext, dtype=torch.long)

        # Fail at construction time rather than with an IndexError in the
        # middle of an epoch.
        n_samples = len(self.input_ids)
        for field in ('segment_ids', 'masked_tokens', 'masked_pos', 'isNext'):
            if len(getattr(self, field)) != n_samples:
                raise ValueError(
                    '%s has %d samples, expected %d'
                    % (field, len(getattr(self, field)), n_samples)
                )

    def __len__(self):
        """Return the number of samples in the dataset."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the five fields of sample ``idx`` as a tuple of tensors."""
        return (
            self.input_ids[idx],
            self.segment_ids[idx],
            self.masked_tokens[idx],
            self.masked_pos[idx],
            self.isNext[idx],
        )
# Wrap the five batch fields in a dataset ...
dataset = MyDataSet(
    input_ids,
    segment_ids,
    masked_tokens,
    masked_pos,
    isNext,
)
# ... and serve it in shuffled mini-batches of batch_size samples.
dataloader = Data.DataLoader(dataset=dataset, shuffle=True, batch_size=batch_size)
查看数据:
# Print the first mini-batch, then the number of mini-batches
# (there is only one batch here).
print(next(iter(dataloader)), len(dataloader), sep='\n')
输出:
[tensor([[ 1, 3, 13, 11, 2, 7, 34, 31, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[ 1, 4, 23, 3, 16, 17, 35, 30, 18, 27, 29, 36, 24, 2, 3, 13, 11, 2,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[ 1, 6, 13, 11, 2, 3, 34, 31, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[ 1, 3, 23, 37, 39, 26, 3, 34, 2, 4, 23, 37, 16, 3, 35, 30, 18, 27,
29, 36, 24, 2, 0, 0, 0, 0, 0, 0, 0, 0],
[ 1, 7, 34, 31, 2, 4, 23, 37, 39, 26, 21, 34, 2, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[ 1, 7, 34, 31, 2, 8, 16, 3, 34, 32, 19, 12, 34, 28, 2, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), tensor([[0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0]]), tensor([[ 6, 0, 0, 0, 0],
[ 6, 37, 0, 0, 0],
[ 7, 0, 0, 0, 0],
[17, 21, 4, 0, 0],
[ 7, 0, 0, 0, 0],
[ 5, 34, 0, 0, 0]]), tensor([[ 1, 0, 0, 0, 0],
[14, 3, 0, 0, 0],
[ 5, 0, 0, 0, 0],
[13, 6, 1, 0, 0],
[ 1, 0, 0, 0, 0],
[ 7, 2, 0, 0, 0]]), tensor([1, 0, 1, 1, 0, 0])]
1