def word_vector_gener():
    """Generate word vectors with gensim Word2Vec.

    Reads raw text from ./word2vec_data.txt, tokenizes each line with
    jieba, drops tokens found in ./stop_words.txt, trains a Word2Vec
    model, saves it, and writes each unique word followed by its vector
    to ./word2vector.txt.

    :return: None
    """
    import jieba  # was referenced but never imported in the original file
    from gensim.models import Word2Vec  # (unused common_texts import dropped)

    DATA_PATH = './word2vec_data.txt'

    # Load the stop-word list ONCE up front — the original re-opened and
    # re-read this file for every single input line. A set gives O(1)
    # membership tests instead of a linear scan per token.
    with open('./stop_words.txt', 'r', encoding='utf8') as sw_file:
        stopwords = {w.strip() for w in sw_file}

    word_list = []  # flat list of kept tokens (duplicates allowed)
    sentences = []  # one token list per input line; training corpus

    with open(DATA_PATH, 'r', encoding='utf8') as file:
        for each_line in file:  # iterate lazily instead of readlines()
            tokens = []
            for tok in jieba.cut(each_line.strip()):
                # keep non-empty tokens that are not stop words
                if tok.strip() and tok not in stopwords:
                    tokens.append(tok)
                    word_list.append(tok)
            sentences.append(tokens)

    # gensim >= 4 renamed `size` to `vector_size`; the old keyword raises
    # TypeError on current releases.
    model = Word2Vec(sentences, vector_size=100, window=1,
                     min_count=1, workers=4)
    # NOTE(review): trailing '.' kept from the original path — confirm
    # whether './word2vec_model.model' was intended.
    model.save('./word2vec_model.')

    # Write "word vector" pairs; `with` guarantees the handle is closed
    # (the original opened the file and never closed it).
    with open('./word2vector.txt', 'w', encoding='utf8') as out:
        for word in set(word_list):
            # gensim >= 4 removed model[word]; vectors live on model.wv.
            content = str(word) + ' ' + str(model.wv[word])
            out.write(content + ' ')
            print(content)

    print('ok')


if __name__ == '__main__':
    word_vector_gener()