zoukankan      html  css  js  c++  java
  • 快速进行词向量训练和读取

    1.词向量训练demo

    from gensim.models import Word2Vec
    from gensim.test.utils import common_texts
    import jieba
    import tqdm
    
    word2vec_path = './resources/word2vec.model'
    
    def word_vector_gener():
        """
        Train a Word2Vec model on jieba-segmented queries and save its word vectors.

        Reads ``./data/seo_search_word_copy.txt``, takes the last tab-separated
        field of every line as the query text, segments it with ``jieba.lcut``
        and trains ``Word2Vec`` (``sg=1``, i.e. skip-gram) on the token lists.
        Only the keyed vectors (``model.wv``) are written to ``word2vec_path``;
        the vector of '老师' is then printed as a quick sanity check.

        :return: None
        """
        DATA_PATH = './data/seo_search_word_copy.txt'
        sentences = []
        # Segment every query with jieba; one token list per input line.
        with open(DATA_PATH, 'r', encoding='utf8') as file:
            # readlines() is kept so tqdm knows the total and can show real progress.
            for each_line in tqdm.tqdm(file.readlines()):
                query = each_line.strip().split('\t')[-1]
                sentences.append(jieba.lcut(query))
        # Train the model.
        # NOTE: `size` is the gensim 3.x keyword; gensim >= 4.0 renamed it to `vector_size`.
        model = Word2Vec(sentences, sg=1, size=10, window=2, min_count=1, negative=1,
                         sample=0.001, workers=4)
        # Persist only the word vectors, not the full trainable model.
        model.wv.save(word2vec_path)
        # Look vectors up through `model.wv`: indexing the model object itself is
        # deprecated in gensim 3.x and removed in gensim 4.
        print(model.wv['老师'])
    
    
    # Script entry point: train and save the vectors only when executed directly.
    if "__main__" == __name__:
        word_vector_gener()

    2.词向量加载demo(这种方式加载词向量的速度最快)

    # Demo: load the saved word vectors and query them.
    from gensim.models import KeyedVectors

    word2vec_path = './resources/word2vec.model'
    # mmap='r' memory-maps the saved arrays read-only instead of copying them into RAM.
    wv = KeyedVectors.load(word2vec_path, mmap='r')
    # Vector of a single word.
    vector = wv['主管']
    # The 30 words most similar to '主管', as (word, similarity) pairs.
    word = wv.most_similar(['主管'], topn=30)
    print(word)

    输出:

    [('组长', 0.8488447070121765),
     ('经理', 0.8272342085838318),
     ('总监', 0.816636323928833),
     ('副经理', 0.8071938753128052),
     ('部长', 0.8019827604293823),
     ('专员', 0.7792257070541382),
     ('高级专员', 0.7695066332817078),
     ('主任', 0.7676611542701721),
     ('负责人', 0.761403501033783),
     ('部副', 0.7570186853408813),
     ('及', 0.7355248928070068),
     ('业务主管', 0.732032299041748),
     ('岗', 0.7316986322402954),
     ('副总', 0.7278518676757812),
     ('科长', 0.72648024559021),
     ('兼', 0.7262977957725525),
     ('助理', 0.7255839705467224),
     ('资深', 0.7252861261367798),
     ('组', 0.7167786955833435),
     ('储干', 0.7150581479072571),
     ('班长', 0.7146369218826294),
     ('职员', 0.7104721665382385),
     ('实习生', 0.707991898059845),
     ('支持', 0.7070707082748413),
     ('高级', 0.7055947184562683),
     ('管理人员', 0.7054109573364258),
     ('初级', 0.7042156457901001),
     ('副理', 0.7038965821266174),
     ('小组长', 0.7035383582115173),
     ('技术主管', 0.7024495601654053)]
  • 相关阅读:
    高斯消元
    UVa12103
    UVa10294
    UVa11762
    牛客网算法工程师能力评估
    华为研发工程师编程题
    网易2017春招笔试真题编程题集合
    2017网易有道内推编程题
    2017网易雷火实习生招聘编程题
    数组---面试知识点整理
  • 原文地址:https://www.cnblogs.com/demo-deng/p/13857640.html
Copyright © 2011-2022 走看看