Installation
!pip install gensim
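The snippets below follow the gensim 3.x API (init_sims, vocab, size=...). To confirm which version is installed before running them, a quick check:
import gensim
print(gensim.__version__)  # the code in this article assumes the 3.x series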
Training
from gensim.models import word2vec
import logging
# Main program
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
sentences = word2vec.Text8Corpus('text8')  # load the corpus; 'text8' is a placeholder path, replace it with your own corpus file
model = word2vec.Word2Vec(sentences, size=200)  # window defaults to 5
# Compute the similarity/relatedness of two words
y1 = model.wv.similarity(u"不错", u"好")
print(u"Similarity between 不错 and 好:", y1)
print("--------")
Loading the word-vector file
from gensim.models import KeyedVectors
file = '/home/xuehp/data/Tencent_AILab_ChineseEmbedding.txt'
wv_from_text = KeyedVectors.load_word2vec_format(file, binary=False)
wv_from_text.init_sims(replace=True)  # L2-normalize the vectors in place; replace=True discards the originals to save memory
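The Tencent embedding file is several gigabytes of plain text, so loading it this way is slow. A common workaround, sketched here under the same gensim 3.x API (the limit value and the file name tencent.kv are only examples), is to load a subset and/or save the vectors once in gensim's native format so later loads are much faster:
from gensim.models import KeyedVectors

# Load only the first 500,000 vectors (keeps the most frequent words if the file is ordered by frequency)
wv_small = KeyedVectors.load_word2vec_format(file, binary=False, limit=500000)
# Save once in gensim's native format ...
wv_small.save('tencent.kv')
# ... and reload it quickly later, memory-mapping the vector array
wv_small = KeyedVectors.load('tencent.kv', mmap='r')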
Getting word vectors
import numpy as np

# Build a vector for an out-of-vocabulary word or phrase from its n-grams
def compute_ngrams(word, min_n, max_n):
    # BOW, EOW = ('<', '>')  # used by FastText as prefix/suffix attached to every word
    extended_word = word
    ngrams = []
    for ngram_length in range(min_n, min(len(extended_word), max_n) + 1):
        for i in range(0, len(extended_word) - ngram_length + 1):
            ngrams.append(extended_word[i:i + ngram_length])
    return list(set(ngrams))

def wordVec(word, wv_from_text, min_n=1, max_n=3):
    '''
    ngrams_single / ngrams_more: for an OOV word, prefer multi-character
    n-gram vectors and fall back to single-character vectors only when
    nothing longer matches.
    '''
    # Dimensionality of the word vectors (gensim 3.x KeyedVectors API)
    word_size = wv_from_text.vector_size
    # Compute the word's n-grams
    ngrams = compute_ngrams(word, min_n=min_n, max_n=max_n)
    # If the word is in the vocabulary, return its vector directly
    if word in wv_from_text.vocab:
        return wv_from_text[word]
    else:
        # Out-of-vocabulary case
        word_vec = np.zeros(word_size, dtype=np.float32)
        ngrams_found = 0
        ngrams_single = [ng for ng in ngrams if len(ng) == 1]
        ngrams_more = [ng for ng in ngrams if len(ng) > 1]
        # First accept only n-grams that are at least two characters long
        for ngram in ngrams_more:
            if ngram in wv_from_text.vocab:
                word_vec += wv_from_text[ngram]
                ngrams_found += 1
                # print(ngram)
        # If nothing matched, fall back to single-character vectors
        if ngrams_found == 0:
            for ngram in ngrams_single:
                if ngram in wv_from_text.vocab:  # skip characters missing from the vocabulary
                    word_vec += wv_from_text[ngram]
                    ngrams_found += 1
        if word_vec.any():
            return word_vec / max(1, ngrams_found)
        else:
            raise KeyError('all ngrams for word %s absent from model' % word)
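As a quick sanity check, compute_ngrams on a two-character word returns the single characters plus the word itself (the order varies because a set is used):
print(compute_ngrams(u'苹果', min_n=1, max_n=3))
# e.g. ['苹果', '苹', '果'] -- the three n-grams of lengths 1 and 2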
Example 1
vec = wordVec('苹果', wv_from_text, min_n = 1, max_n = 3)
wv_from_text.most_similar(positive=[vec], topn=20)
Output:
[('苹果', 1.0),
('苹果公司', 0.8514505624771118),
('以及苹果', 0.8457839488983154),
('比如苹果', 0.7890200018882751),
('苹果新', 0.7845828533172607),
('其他苹果', 0.7817449569702148),
('iphone', 0.7793817520141602),
('苹果iphone', 0.7790712714195251),
('苹果的iphone', 0.7720062136650085),
('apple', 0.7679361701011658),
('苹果产品', 0.7623019814491272),
('像苹果', 0.7533938884735107),
('小米', 0.7517136335372925),
('关于苹果', 0.7515844106674194),
('iphone产品', 0.7507627606391907),
('iphonex', 0.7488199472427368),
('新款iphone', 0.747662365436554),
('苹果10', 0.7474119067192078),
('iphone系列', 0.7470223307609558),
('新iphone', 0.7435163855552673)]
Example 2
vec = wordVec('iuap', wv_from_text, min_n = 1, max_n = 3)
wv_from_text.most_similar(positive=[vec], topn=20)
Output:
[('iuap', 1.0),
('用友云平台', 0.8234802484512329),
('paas平台', 0.8118030428886414),
('用友云', 0.7954781651496887),
('云操作系统', 0.7548810839653015),
('iaas平台', 0.7546966075897217),
('appcenter', 0.7538243532180786),
('u8cloud', 0.7484996914863586),
('paas', 0.7466067671775818),
('社会化商业', 0.7457333207130432),
('云erp', 0.7428735494613647),
('协同云', 0.7421062588691711),
('海云捷迅', 0.7403150200843811),
('采购云', 0.7385496497154236),
('paas+saas', 0.7368173599243164),
('云管理平台', 0.7367190718650818),
('escloud', 0.736686646938324),
('私有云平台', 0.7358618974685669),
('mopaas', 0.7325429916381836),
('云应用', 0.7322961688041687)]
Example 3
vec = wordVec('友云采', wv_from_text, min_n = 1, max_n = 3)
wv_from_text.most_similar(positive=[vec], topn=20)
Output:
[('友云采', 1.0000001192092896),
('供应商协同平台', 0.7404446601867676),
('伙伴门户', 0.7326363325119019),
('企业交易平台', 0.7278861999511719),
('供应商门户', 0.7263870239257812),
('移动云分销', 0.7180557250976562),
('电商管理系统', 0.7153645157814026),
('求购大厅', 0.7131102085113525),
('百卓优采', 0.7128005027770996),
('o2o方案', 0.7122943997383118),
('农鲜生', 0.7077293992042542),
('会员资料库', 0.7064912915229797),
('企业管理云平台', 0.7042117118835449),
('56linked', 0.7034884691238403),
('网上订单系统', 0.7033181190490723),
('协同门户', 0.7029898762702942),
('电商建站', 0.7025145292282104),
('管理商机', 0.7013753056526184),
('直销通', 0.7007359862327576),
('erpbuilder', 0.6993728876113892)]
Example 4
vec = wordVec('财务云', wv_from_text, min_n = 1, max_n = 3)
wv_from_text.most_similar(positive=[vec], topn=20)
Output:
[('财务云', 1.0),
('财务共享服务', 0.7762293815612793),
('金蝶云', 0.7745106220245361),
('浪潮云', 0.7651669383049011),
('财务共享中心', 0.7502492070198059),
('畅捷通', 0.7385521531105042),
('协同云', 0.7370111346244812),
('企业云服务', 0.7364829182624817),
('用友云', 0.7306167483329773),
('采购云', 0.729377031326294),
('云erp', 0.7251084446907043),
('共享服务中心', 0.7224213480949402),
('人力云', 0.721336305141449),
('金蝶', 0.7165836095809937),
('用友', 0.7122166752815247),
('企业云', 0.7093378305435181),
('erp云', 0.7075839638710022),
('致远协同', 0.706666886806488),
('企业金融', 0.7049797773361206),
('移动信息化', 0.7018118500709534)]
Example 5
vec = wordVec('友报账', wv_from_text, min_n = 1, max_n = 3)
wv_from_text.most_similar(positive=[vec], topn=20)
Output:
[('报账', 0.7958753705024719),
('友报', 0.7958752512931824),
('报帐', 0.7087380886077881),
('报销业务', 0.7015117406845093),
('财务报账', 0.6572694778442383),
('审核报销', 0.6517125964164734),
('报销单', 0.6511596441268921),
('费用报销', 0.6456758975982666),
('报销单据', 0.642286479473114),
('原始票据', 0.6387859582901001),
('报销审核', 0.6324885487556458),
('发票报销', 0.6296700835227966),
('做账', 0.6251322031021118),
('员工报销', 0.6216662526130676),
('财务报销', 0.6187087297439575),
('原始单据', 0.6172932386398315),
('对账', 0.6172742247581482),
('费用报销单', 0.6142060160636902),
('审批报销', 0.6136212348937988),
('核账', 0.6098783016204834)]
In this example the segmentation used when the embeddings were trained does not match the query word: 友报账 is not in the vocabulary, so wordVec falls back to averaging the n-gram vectors that are, chiefly 友报 and 报账, which is why those two dominate the neighbor list.
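This can be verified with a vocabulary membership check (gensim 3.x vocab attribute); given the output above it should print False for the full word and True for the two n-grams:
print(u'友报账' in wv_from_text.vocab)                                # False: the full word is OOV
print(u'友报' in wv_from_text.vocab, u'报账' in wv_from_text.vocab)    # True True: its n-grams are not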
This article is for learning purposes only.