首先使用 bert-as-service,通过 BERT 获取词向量
1.安装Bert-as-Service
pip install bert-serving-server # server
pip install bert-serving-client  # client, independent of `bert-serving-server`
2.下载Bert预训练模型
bert_uncased_L-12_H-768_A-12.zip
3.开启服务
- Anaconda 的cmd中启用
bert-serving-start -model_dir C:\Users\Administrator\Desktop\自然语言处理\uncased_L-12_H-768_A-12 -num_worker=1
- 其中,-model_dir 是预训练模型的路径,-num_worker 是工作进程(worker)数,表示同时可以处理多少个并发请求
4.加载句向量
- 转到pycharm,创建一个py文件然后输入如下代码,如果产生了向量矩阵则说明配置成功
# Smoke test for the bert-as-service setup: encode two sample strings through
# the running server and print the result. A printed vector matrix means the
# configuration works.
from bert_serving.client import BertClient

bc = BertClient()
vec = bc.encode(["yan", "low"])
print(vec)
应用
由于 BERT 预训练模型的输出维度固定为 768 维,不可改变,我们需要相应修改上述三个模型中 LSTM 的输入维度,例如:self.lstm = nn.LSTM(input_size=config.words_dim, ...)  # words_dim = 768
用 BERT(768 维)替换 GloVe(300 维)
使用GloVe部分
# Build the vocabulary embedding matrix from a cached GloVe file,
# e.g. vector_cache = "data/sq_glove300d.pt".
# NOTE(review): match_embedding is only incremented here; it is presumably
# initialised earlier in the script — confirm.
if os.path.isfile(args.vector_cache):
    # stoi maps each token to its row index,
    # e.g. {',': 0, '.': 1, 'the': 2, ..., 'sábato': 52282};
    # vectors has size torch.Size([52283, 300]) and dim = 300.
    stoi, vectors, dim = torch.load(args.vector_cache)
    # One row per vocabulary entry; every row is filled in by the loop below.
    TEXT.vocab.vectors = torch.Tensor(len(TEXT.vocab), dim)
    for i, token in enumerate(TEXT.vocab.itos):
        wv_index = stoi.get(token, None)
        if wv_index is not None:
            # The token exists in GloVe: copy its pre-trained vector.
            TEXT.vocab.vectors[i] = vectors[wv_index]
            match_embedding += 1
        else:
            # Unknown token: dim random floats drawn uniformly from
            # (-0.25, 0.25); the trailing underscore means in-place.
            TEXT.vocab.vectors[i] = torch.FloatTensor(dim).uniform_(-0.25, 0.25)
else:
    print("Error: Need word embedding pt file")
    exit(1)
替换后
# Fill the vocabulary embedding matrix with vectors served by bert-as-service
# instead of GloVe.
# NOTE(review): assumes TEXT.vocab.vectors is already allocated with one row
# per vocabulary entry, and that dim and match_embedding are defined earlier
# in the script — confirm against the surrounding code.
bc = BertClient()
if bc:
    for i, token in enumerate(TEXT.vocab.itos):
        # split() keeps the token as whole word(s); list(token) would break
        # it into single letters.
        words = token.split()
        # Encode once per token: each encode() call is a request to the BERT
        # server, so the result is reused for both the check and the store.
        # An empty or whitespace-only token gives an empty list, which is not
        # sent to the server and falls through to the random initialisation.
        encoded = bc.encode(words) if words else None
        if encoded is not None and encoded.any():
            TEXT.vocab.vectors[i] = torch.tensor(encoded)
            match_embedding += 1
        else:
            # dim random floats drawn uniformly from (-0.25, 0.25); the
            # trailing underscore means the tensor is modified in place.
            TEXT.vocab.vectors[i] = torch.FloatTensor(dim).uniform_(-0.25, 0.25)
else:
    print("Error: Need word embedding pt file")
    exit(1)
# Build the vocabulary embedding matrix from a cached GloVe file,
# e.g. vector_cache = "data/sq_glove300d.pt".
# NOTE(review): match_embedding is only incremented here; it is presumably
# initialised earlier in the script — confirm.
if os.path.isfile(args.vector_cache):
    # stoi maps each token to its row index,
    # e.g. {',': 0, '.': 1, 'the': 2, ..., 'sábato': 52282};
    # vectors has size torch.Size([52283, 300]) and dim = 300.
    stoi, vectors, dim = torch.load(args.vector_cache)
    # One row per vocabulary entry; every row is filled in by the loop below.
    TEXT.vocab.vectors = torch.Tensor(len(TEXT.vocab), dim)
    for i, token in enumerate(TEXT.vocab.itos):
        wv_index = stoi.get(token, None)
        # Is this vocabulary token present in the GloVe table?
        if wv_index is not None:
            # Yes: copy its pre-trained vector into the matrix.
            TEXT.vocab.vectors[i] = vectors[wv_index]
            match_embedding += 1
        else:
            # No: use dim random floats drawn uniformly from (-0.25, 0.25);
            # the trailing underscore means the tensor is modified in place.
            TEXT.vocab.vectors[i] = torch.FloatTensor(dim).uniform_(-0.25, 0.25)
else:
    print("Error: Need word embedding pt file")
    exit(1)