zoukankan      html  css  js  c++  java
  • CS224N 2019 自然语言处理(一)自然语言处理库gensim之Word2vec

    笔记摘抄

    1. WordNet显示同义词

    from nltk.corpus import wordnet as wn
    
    # Synonyms: print one line per synset of 'good', prefixed with its part of speech.
    poses = {'n': 'noun', 'v': 'verb', 's': 'adj(s)', 'a': 'adj', 'r': 'adv'}
    for synset in wn.synsets('good'):
        print('{}: {}'.format(
            poses[synset.pos()],
            ', '.join(lemma.name() for lemma in synset.lemmas()),
        ))
    
    输出
    noun: good
    noun: good, goodness
    noun: good, goodness
    noun: commodity, trade_good, good
    adj: good
    adj(s): full, good
    adj: good
    adj(s): estimable, good, honorable, respectable
    adj(s): beneficial, good
    adj(s): good
    adj(s): good, just, upright
    adj(s): adept, expert, good, practiced, proficient, skillful, skilful
    adj(s): good
    adj(s): dear, good, near
    adj(s): dependable, good, safe, secure
    adj(s): good, right, ripe
    adj(s): good, well
    adj(s): effective, good, in_effect, in_force
    adj(s): good
    adj(s): good, serious
    adj(s): good, sound
    adj(s): good, salutary
    adj(s): good, honest
    adj(s): good, undecomposed, unspoiled, unspoilt
    adj(s): good
    adv: well, good
    adv: thoroughly, soundly, good
    
    from nltk.corpus import wordnet as wn
    
    panda = wn.synset('panda.n.01')


    def hyper(s):
        """Return the direct hypernyms of synset ``s``."""
        return s.hypernyms()


    # Follow the hypernym relation transitively and collect every synset reached.
    list(panda.closure(hyper))
    
    [Synset('procyonid.n.01'),
     Synset('carnivore.n.01'),
     Synset('placental.n.01'),
     Synset('mammal.n.01'),
     Synset('vertebrate.n.01'),
     Synset('chordate.n.01'),
     Synset('animal.n.01'),
     Synset('organism.n.01'),
     Synset('living_thing.n.01'),
     Synset('whole.n.02'),
     Synset('object.n.01'),
     Synset('physical_entity.n.01'),
     Synset('entity.n.01')]
    

    2. 自然语言处理库gensim

    • glove 和 word2vec是目前最常用的两个训练词向量的模型

    • 两者训练出来的文件都以文本格式呈现

    • 区别:word2vec 格式的文件在首行多了一个头部,记录向量的数量及其维度;GloVe 格式的文件没有这一行

    2.1 显示词向量

    import numpy as np
    
    # Get the interactive Tools for Matplotlib
    %matplotlib notebook
    import matplotlib.pyplot as plt
    plt.style.use('ggplot')
    
    from sklearn.decomposition import PCA
    # 词相似性软件包
    # 加载Glove向量
    from gensim.test.utils import datapath, get_tmpfile
    from gensim.models import KeyedVectors
    # 加载word2vec向量
    from gensim.scripts.glove2word2vec import glove2word2vec
    
    
    # Convert the GloVe text file to word2vec text format, then load it.
    # NOTE(review): datapath()/get_tmpfile() are given absolute paths here;
    # confirm those helpers return such paths unchanged on the target platform.
    glove_file = datapath('F:/DeapLearning/cs224n_nlp/cs224_exercise/01_Intro_and_WordVectors/Gensim/GloVe/glove.6B.100d.txt')   # input file
    word2vec_glove_file = get_tmpfile("F:/DeapLearning/cs224n_nlp/cs224_exercise/01_Intro_and_WordVectors/Gensim/GloVe/glove.6B.100d.word2vec.txt")  # output file
    glove2word2vec(glove_file, word2vec_glove_file)  # convert; reported result: (400000, 100)
    
    model = KeyedVectors.load_word2vec_format(word2vec_glove_file)  # load the converted file
    

    测试相似性:

    # Words the model ranks as most similar to 'obama', as (word, score) pairs.
    model.most_similar('obama')
    

    [('barack', 0.937216579914093),
    ('bush', 0.927285373210907),
    ('clinton', 0.8960003852844238),
    ('mccain', 0.8875633478164673),
    ('gore', 0.8000321388244629),
    ('hillary', 0.7933663129806519),
    ('dole', 0.7851964235305786),
    ('rodham', 0.751889705657959),
    ('romney', 0.7488929629325867),
    ('kerry', 0.7472623586654663)]

    # Words the model ranks as most similar to 'banana', as (word, score) pairs.
    print(model.most_similar('banana'))
    

    [('coconut', 0.7097253799438477),
    ('mango', 0.7054824233055115),
    ('bananas', 0.6887733936309814),
    ('potato', 0.6629636287689209),
    ('pineapple', 0.6534532904624939),
    ('fruit', 0.6519855260848999),
    ('peanut', 0.6420576572418213),
    ('pecan', 0.6349173188209534),
    ('cashew', 0.6294420957565308),
    ('papaya', 0.6246591210365295)]

    # Words least similar to 'banana'. The word is wrapped in a list because
    # gensim 3.x iterates over `negative`, so a bare string would be treated
    # as its individual characters instead of as one word.
    model.most_similar(negative=['banana'])
    

    [('keyrates', 0.7173938751220703),
    ('sungrebe', 0.7119239568710327),
    ('þórður', 0.7067720890045166),
    ('zety', 0.7056615352630615),
    ('23aou94', 0.6959497928619385),
    ('___________________________________________________________',
    0.694915235042572),
    ('elymians', 0.6945434212684631),
    ('camarina', 0.6927202939987183),
    ('ryryryryryry', 0.6905653476715088),
    ('maurilio', 0.6865653395652771)]

    2.2 计算词语相似度

    # Word analogy: combine 'woman' and 'king' positively and 'man' negatively,
    # then print the best match together with its score.
    result = model.most_similar(negative=['man'], positive=['woman', 'king'])
    print("{}: {:.4f}".format(result[0][0], result[0][1]))
    

    queen: 0.7699

    def analogy(x1, x2, y1):
        """Answer "x1 is to x2 as y1 is to ?" using the global ``model``.

        Returns the word of the top-ranked ``most_similar`` result.
        """
        candidates = model.most_similar(negative=[x1], positive=[y1, x2])
        top_hit = candidates[0]
        return top_hit[0]
    
    
    # Analogy demos; the trailing comment on each line is the expected answer.
    print(analogy(x1='man', x2='king', y1='woman'))              # queen
    print(analogy(x1='japan', x2='japanese', y1='australia'))    # australian
    print(analogy(x1='tall', x2='tallest', y1='long'))           # longest
    print(analogy(x1='good', x2='fantastic', y1='bad'))          # terrible

    # Odd one out among the four words.
    odd_one_out_candidates = "breakfast cereal dinner lunch".split()
    print(model.doesnt_match(odd_one_out_candidates))            # cereal
    

    2.3 使用PCA可视化Gensim词向量

    def display_pca_scatterplot(model, words=None, sample=0):
        """Scatter-plot the first two PCA components of word vectors, with labels.

        Args:
            model: Word-vector lookup supporting ``model[word]``. When ``words``
                is not given, its vocabulary is read from ``key_to_index``
                (gensim >= 4) or, failing that, from ``vocab`` (gensim 3.x).
            words: Words to plot. When ``None``, words come from the model's
                vocabulary (see ``sample``).
            sample: If positive and ``words`` is ``None``, plot this many words
                drawn with ``np.random.choice`` from the vocabulary; otherwise
                plot the whole vocabulary.
        """
        # `is None` instead of `== None`: `==` on a NumPy array is element-wise,
        # so the old check raised when `words` was passed as an array.
        if words is None:
            # gensim 4 replaced `KeyedVectors.vocab` with `key_to_index`;
            # fall back to `vocab` so gensim 3.x models keep working.
            vocab = getattr(model, 'key_to_index', None)
            if vocab is None:
                vocab = model.vocab
            if sample > 0:
                words = np.random.choice(list(vocab), sample)
            else:
                words = list(vocab)

        # One embedding row per word, in the same order as `words`.
        word_vectors = np.array([model[w] for w in words])

        # Fit PCA on the vectors and keep only the first two components.
        twodim = PCA().fit_transform(word_vectors)[:, :2]

        plt.figure(figsize=(6, 6))
        plt.scatter(twodim[:, 0], twodim[:, 1], edgecolors='k', c='r')
        for word, (x, y) in zip(words, twodim):
            # Small offset so the label does not sit on top of its marker.
            plt.text(x + 0.05, y + 0.05, word)
    
    
    # Plot a hand-picked word list; the rows group related words.
    # NOTE(review): 'monkey' appears twice in the animal rows — confirm the duplicate is intended.
    display_pca_scatterplot(model,
                            # drinks
                            ['coffee', 'tea', 'beer', 'wine', 'brandy', 'rum', 'champagne', 'water',
                             # foods
                             'spaghetti', 'borscht', 'hamburger', 'pizza', 'falafel', 'sushi', 'meatballs',
                             # animals
                             'dog', 'horse', 'cat', 'monkey', 'parrot', 'koala', 'lizard',
                             'frog', 'toad', 'monkey', 'ape', 'kangaroo', 'wombat', 'wolf',
                             # countries
                             'france', 'germany', 'hungary', 'luxembourg', 'australia', 'fiji', 'china',
                             # school-related words
                             'homework', 'assignment', 'problem', 'exam', 'test', 'class',
                             'school', 'college', 'university', 'institute'])
    

    由图可知,相关性较大的词语会靠得近一些。

  • 相关阅读:
    C# Redis实战(五)
    C# Redis实战(四)
    C# Redis实战(三)
    C# Redis实战(二)
    C# Redis实战(一)
    memcached的基本命令(安装、卸载、启动、配置相关)
    git和tortoisegit安装教程
    编程规范是非常重要的,为什么说可读性比什么都重要?你有没有确定一个编程规范呢?
    关于VR游戏的前景
    在项目开发过程中如何处理人际关系
  • 原文地址:https://www.cnblogs.com/douzujun/p/13419925.html
Copyright © 2011-2022 走看看