"""
Utilities for turning words into word2vec embedding vectors.

1. NLP pipelines commonly start from pre-trained word2vec embeddings, for
   example Google's ``GoogleNews-vectors-negative300.bin``.
2. ``GoogleNews-vectors-negative300.bin`` holds 300-dimensional vectors
   trained on a news corpus.
3. The functions below map each vocabulary word to its pre-trained vector
   for later use.  Words that are missing from the word2vec file are built
   another way, here by random initialisation from a uniform distribution
   (a Gaussian would work as well).
"""
from collections.abc import Mapping

import numpy as np


def load_bin_vec(fname, vocab):
    """Load vectors for the words in *vocab* from a word2vec binary file.

    The file starts with a text header ``b"<vocab_size> <dim>\\n"``.  Each
    entry that follows is the word's bytes, a single space, and ``dim`` raw
    float32 values.  Entries are usually separated by a newline, which is
    skipped when present.

    Args:
        fname: Path of the word2vec ``.bin`` file.
        vocab: Container of the words (``str``) to keep.  A list, set or
            dict keyed by word all work; only membership is used.

    Returns:
        dict mapping each word found in both the file and *vocab* to a
        writable 1-D ``float32`` numpy array of length ``dim``.

    Raises:
        EOFError: If the file ends before ``vocab_size`` entries were read.
    """
    # One O(1) lookup per file entry instead of scanning a list every time.
    wanted = set(vocab)
    word_vecs = {}
    with open(fname, "rb") as f:
        header = f.readline()
        vocab_size, layer1_size = map(int, header.split())  # e.g. 3000000 300
        binary_len = np.dtype('float32').itemsize * layer1_size  # e.g. 1200
        for _ in range(vocab_size):
            chars = []
            while True:
                # The file is opened in binary mode, so read() yields bytes
                # and every comparison below must be against bytes too.
                ch = f.read(1)
                if not ch:
                    raise EOFError(
                        "unexpected end of file while reading a word from "
                        "%r" % (fname,)
                    )
                if ch == b' ':
                    break
                # Drop the newline that terminates the previous entry.
                if ch != b'\n':
                    chars.append(ch)
            # Undecodable bytes become U+FFFD instead of aborting the load.
            word = b''.join(chars).decode('utf-8', errors='replace')
            raw = f.read(binary_len)
            if len(raw) != binary_len:
                raise EOFError(
                    "unexpected end of file while reading the vector of "
                    "%r from %r" % (word, fname)
                )
            if word in wanted:
                # frombuffer returns a read-only view of `raw`; copy it so
                # callers receive an independent, writable array.
                word_vecs[word] = np.frombuffer(raw, dtype='float32').copy()
    return word_vecs


def add_unknown_words(word_vecs, vocab, min_df=1, k=300):
    """Add random vectors for vocabulary words that have no vector yet.

    If no pre-trained vectors were loaded (``word_vecs`` is empty), every
    eligible word in *vocab* is initialised randomly.

    Args:
        word_vecs: dict of word -> vector; updated in place.
        vocab: Either a mapping of word -> document frequency, or a plain
            iterable of words.  A plain iterable carries no frequency
            information, so the *min_df* filter is not applied to it.
        min_df: Minimum document frequency a word needs to get a vector
            (only used when *vocab* is a mapping).
        k: Dimensionality of the generated vectors.

    Returns:
        None.  *word_vecs* is modified in place.
    """
    has_counts = isinstance(vocab, Mapping)
    for word in vocab:
        if word in word_vecs:
            continue
        if has_counts and vocab[word] < min_df:
            continue
        # Uniform samples from [-0.25, 0.25), one value per dimension.
        word_vecs[word] = np.random.uniform(-0.25, 0.25, k)


vectors_file = './GoogleNews-vectors-negative300.bin'
vocab = ['I', 'can', 'do']

# Run the demo only when executed as a script, so that importing this module
# does not try to read the multi-gigabyte vector file.
if __name__ == "__main__":
    vectors = load_bin_vec(vectors_file, vocab)  # pre-trained vectors
    add_unknown_words(vectors, vocab)
    print(vectors['I'])
    print('*'*40)
    print(vectors['can'])
    print('*'*40)
    print(vectors['do'])