  • 【502】Implementing Word2Vec with gensim

    Reference: Word Embedding Tutorial: word2vec using Gensim [EXAMPLE]

    Reference: NLP Primer (3): Lemmatization

    Reference: Text Classification in Practice (1): Pre-trained word2vec Word Vectors

    Reference: Implementing Word2Vec with Gensim Library in Python

      Text preprocessing

    • Tokenization
    • Convert words to lowercase
    • Remove punctuation from words
    • Remove digits from words
    • Remove empty strings
    • Remove stopwords
    • Remove empty lists
    • Lemmatization

      First, import the necessary libraries

    import gensim
    import nltk
    from gensim.models import Word2Vec
    
    # stopwords
    from nltk.corpus import stopwords
    stop = stopwords.words('english')
    
    # punctuation characters
    import string
    # string.punctuation
    
    # lemmatization
    from nltk.stem import WordNetLemmatizer
    wnl = WordNetLemmatizer()
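
      Note: the stopword list and the WordNet data are downloaded separately from NLTK itself. If they are missing, a one-time download is needed (a minimal sketch; newer NLTK versions may also want the omw-1.4 resource):

    import nltk
    # one-time setup: fetch the corpora used below
    nltk.download('stopwords')   # for nltk.corpus.stopwords
    nltk.download('wordnet')     # for WordNetLemmatizer
    nltk.download('omw-1.4')     # extra WordNet data needed on newer NLTK versions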
    

      Load the data and display it

    data = [
        {"tag": "welcome",
         "patterns": ["Hi", "How are you", "Is any one to talk?", "Hello", "hi are you available"],
         "responses": ["Hello, thanks for contacting us", "Good to see you here", " Hi there, how may I assist you?"]},
        {"tag": "goodbye",
         "patterns": ["Bye", "See you later", "Goodbye", "I will come back soon"],
         "responses": ["See you later, thanks for visiting", "have a great day ahead", "Wish you Come back again soon."]},
        {"tag": "thankful",
         "patterns": ["Thanks for helping me", "Thank your guidance", "That's helpful and kind from you"],
         "responses": ["Happy to help!", "Any time!", "My pleasure", "It is my duty to help you"]},
        {"tag": "hoursopening",
         "patterns": ["What hours are you open?", "Tell your opening time?", "When are you open?", "Just your timing please"],
         "responses": ["We're open every day 8am-7pm", "Our office hours are 8am-7pm every day", "We open office at 8 am and close at 7 pm"]},
        {"tag": "payments",
         "patterns": ["Can I pay using credit card?", " Can I pay using Mastercard?", " Can I pay using cash only?"],
         "responses": ["We accept VISA, Mastercard and credit card", "We accept credit card, debit cards and cash. Please don’t worry"]},
    ]
    
    # split each pattern on spaces to get a list of token lists
    bigger_list = []
    for i in range(len(data)):
        for s in data[i]['patterns']:
            li = s.split(" ")
            bigger_list.append(li)
    
    bigger_list
    

      The output is as follows:

    [['Hi'],
     ['How', 'are', 'you'],
     ['Is', 'any', 'one', 'to', 'talk?'],
     ['Hello'],
     ['hi', 'are', 'you', 'available'],
     ['Bye'],
     ['See', 'you', 'later'],
     ['Goodbye'],
     ['I', 'will', 'come', 'back', 'soon'],
     ['Thanks', 'for', 'helping', 'me'],
     ['Thank', 'your', 'guidance'],
     ["That's", 'helpful', 'and', 'kind', 'from', 'you'],
     ['What', 'hours', 'are', 'you', 'open?'],
     ['Tell', 'your', 'opening', 'time?'],
     ['When', 'are', 'you', 'open?'],
     ['Just', 'your', 'timing', 'please'],
     ['Can', 'I', 'pay', 'using', 'credit', 'card?'],
     ['', 'Can', 'I', 'pay', 'using', 'Mastercard?'],
     ['', 'Can', 'I', 'pay', 'using', 'cash', 'only?']]
    

      Convert all the words to lowercase:

    # convert words to lowercase
    bigger_list = [[w.lower() for w in s] for s in bigger_list]
    bigger_list
    

      The output is as follows:

    [['hi'],
     ['how', 'are', 'you'],
     ['is', 'any', 'one', 'to', 'talk?'],
     ['hello'],
     ['hi', 'are', 'you', 'available'],
     ['bye'],
     ['see', 'you', 'later'],
     ['goodbye'],
     ['i', 'will', 'come', 'back', 'soon'],
     ['thanks', 'for', 'helping', 'me'],
     ['thank', 'your', 'guidance'],
     ["that's", 'helpful', 'and', 'kind', 'from', 'you'],
     ['what', 'hours', 'are', 'you', 'open?'],
     ['tell', 'your', 'opening', 'time?'],
     ['when', 'are', 'you', 'open?'],
     ['just', 'your', 'timing', 'please'],
     ['can', 'i', 'pay', 'using', 'credit', 'card?'],
     ['', 'can', 'i', 'pay', 'using', 'mastercard?'],
     ['', 'can', 'i', 'pay', 'using', 'cash', 'only?']]
    

      Remove punctuation from words

    import string
    # string.punctuation holds all punctuation characters as one string
    # string.punctuation
    
    # stripping the punctuation from a single word:
    # ''.join([x for x in 'alex?' if x not in string.punctuation])
    # output: 'alex'
    
    # strip the punctuation from every word
    bigger_list = [[''.join([x for x in w if x not in string.punctuation]) for w in s] for s in bigger_list]
    bigger_list
    

      The output is as follows:

    [['hi'],
     ['how', 'are', 'you'],
     ['is', 'any', 'one', 'to', 'talk'],
     ['hello'],
     ['hi', 'are', 'you', 'available'],
     ['bye'],
     ['see', 'you', 'later'],
     ['goodbye'],
     ['i', 'will', 'come', 'back', 'soon'],
     ['thanks', 'for', 'helping', 'me'],
     ['thank', 'your', 'guidance'],
     ['thats', 'helpful', 'and', 'kind', 'from', 'you'],
     ['what', 'hours', 'are', 'you', 'open'],
     ['tell', 'your', 'opening', 'time'],
     ['when', 'are', 'you', 'open'],
     ['just', 'your', 'timing', 'please'],
     ['can', 'i', 'pay', 'using', 'credit', 'card'],
     ['', 'can', 'i', 'pay', 'using', 'mastercard'],
     ['', 'can', 'i', 'pay', 'using', 'cash', 'only']]
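
      The same cleanup is often written with str.translate, which deletes characters through a translation table (an equivalent sketch of the comprehension above):

    # map every punctuation character to None, i.e. delete it
    table = str.maketrans('', '', string.punctuation)
    # 'alex?'.translate(table) -> 'alex'
    cleaned = [[w.translate(table) for w in s] for s in bigger_list]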
    

      Remove empty strings

    # remove empty strings
    bigger_list = [[w for w in s if w != ''] for s in bigger_list]
    bigger_list
    

      The output is as follows:

    [['hi'],
     ['how', 'are', 'you'],
     ['is', 'any', 'one', 'to', 'talk'],
     ['hello'],
     ['hi', 'are', 'you', 'available'],
     ['bye'],
     ['see', 'you', 'later'],
     ['goodbye'],
     ['i', 'will', 'come', 'back', 'soon'],
     ['thanks', 'for', 'helping', 'me'],
     ['thank', 'your', 'guidance'],
     ['thats', 'helpful', 'and', 'kind', 'from', 'you'],
     ['what', 'hours', 'are', 'you', 'open'],
     ['tell', 'your', 'opening', 'time'],
     ['when', 'are', 'you', 'open'],
     ['just', 'your', 'timing', 'please'],
     ['can', 'i', 'pay', 'using', 'credit', 'card'],
     ['can', 'i', 'pay', 'using', 'mastercard'],
     ['can', 'i', 'pay', 'using', 'cash', 'only']]
    

      Remove stopwords

    from nltk.corpus import stopwords
    # the English stopword list
    stop = stopwords.words('english')
    
    # remove stopwords
    bigger_list = [[w for w in s if w not in stop] for s in bigger_list]
    bigger_list
    

      The output is as follows:

    [['hi'],
     [],
     ['one', 'talk'],
     ['hello'],
     ['hi', 'available'],
     ['bye'],
     ['see', 'later'],
     ['goodbye'],
     ['come', 'back', 'soon'],
     ['thanks', 'helping'],
     ['thank', 'guidance'],
     ['thats', 'helpful', 'kind'],
     ['hours', 'open'],
     ['tell', 'opening', 'time'],
     ['open'],
     ['timing', 'please'],
     ['pay', 'using', 'credit', 'card'],
     ['pay', 'using', 'mastercard'],
     ['pay', 'using', 'cash']]
    

      Remove empty lists

    # remove empty lists
    bigger_list = [s for s in bigger_list if len(s) > 0]
    bigger_list
    

      The output is as follows:

    [['hi'],
     ['one', 'talk'],
     ['hello'],
     ['hi', 'available'],
     ['bye'],
     ['see', 'later'],
     ['goodbye'],
     ['come', 'back', 'soon'],
     ['thanks', 'helping'],
     ['thank', 'guidance'],
     ['thats', 'helpful', 'kind'],
     ['hours', 'open'],
     ['tell', 'opening', 'time'],
     ['open'],
     ['timing', 'please'],
     ['pay', 'using', 'credit', 'card'],
     ['pay', 'using', 'mastercard'],
     ['pay', 'using', 'cash']]
    

      Lemmatization

    # lemmatization
    from nltk.stem import WordNetLemmatizer
    wnl = WordNetLemmatizer()
    bigger_list = [[wnl.lemmatize(w) for w in s] for s in bigger_list]
    bigger_list
    

      The output is as follows:

    [['hi'],
     ['one', 'talk'],
     ['hello'],
     ['hi', 'available'],
     ['bye'],
     ['see', 'later'],
     ['goodbye'],
     ['come', 'back', 'soon'],
     ['thanks', 'helping'],
     ['thank', 'guidance'],
     ['thats', 'helpful', 'kind'],
     ['hour', 'open'],
     ['tell', 'opening', 'time'],
     ['open'],
     ['timing', 'please'],
     ['pay', 'using', 'credit', 'card'],
     ['pay', 'using', 'mastercard'],
     ['pay', 'using', 'cash']]
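
      In this output, 'hours' became 'hour' while 'helping' is unchanged: WordNetLemmatizer treats every word as a noun unless told otherwise. Passing a part-of-speech tag changes the result (a small sketch):

    # the default POS is 'n' (noun); verbs need pos='v'
    wnl.lemmatize('hours')             # -> 'hour'
    wnl.lemmatize('helping')           # -> 'helping' (read as a noun)
    wnl.lemmatize('helping', pos='v')  # -> 'help'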
    

      Train the model, then save, load, and query it

    # train the model
    model = Word2Vec(bigger_list, min_count=1, size=300, workers=4)
    
    # save the model
    model.save("word2vec.model")
    model.save('word2vec.bin')
    
    # load the model
    model = Word2Vec.load('word2vec.bin')
    
    # the vocabulary
    list(model.wv.vocab)
    
    # the vector for 'thanks'
    model.wv.word_vec('thanks')
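
      Note: the calls above use the gensim 3.x API. In gensim 4.x several names changed; a sketch of the equivalents (assuming gensim >= 4.0):

    # gensim 4.x renames: size -> vector_size, iter -> epochs
    model = Word2Vec(bigger_list, min_count=1, vector_size=300, workers=4)
    
    # model.wv.vocab is gone; the vocabulary is now a dict keyed by word
    list(model.wv.key_to_index)
    
    # word_vec() was replaced by get_vector() / plain indexing
    model.wv.get_vector('thanks')    # or: model.wv['thanks']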
    

      

    The word2vec API explained

      In gensim, the word2vec-related APIs all live in the package gensim.models.word2vec, and the algorithm parameters sit on the class gensim.models.word2vec.Word2Vec. The parameters worth paying attention to are listed below (an example call follows the list):

    1. sentences: the corpus to analyze, either a list of token lists or an iterable read from a file (word2vec.LineSentence(filename)).
    2. size: the dimensionality of the word vectors, default 100. A sensible value depends on corpus size: for a modest corpus, say under 100 MB of text, the default is usually fine; for a very large corpus, a larger dimension is advisable.
    3. window: the maximum context distance, i.e. how far away a word can be and still count as context; default 5. In practice, tune it to your needs: a smaller value suits a small corpus, and for typical corpora values in [5, 10] are recommended.
    4. sg: selects between the two word2vec models: 0 for CBOW, 1 for Skip-Gram. The default is 0, i.e. CBOW.
    5. hs: selects between the two training strategies: if 0 and the negative-sample count negative is greater than 0, Negative Sampling is used; if 1, Hierarchical Softmax is used. The default is 0, i.e. Negative Sampling.
    6. negative: the number of negative samples drawn when Negative Sampling is used; default 5, with values in [3, 10] recommended. This is the quantity written as neg in the companion theory article.
    7. cbow_mean: only used when CBOW projects the context. If 0, the projection x_w is the sum of the context word vectors; if 1, their average. The theory article describes it in terms of the average, the default is also 1, and changing it is not recommended.
    8. min_count: the minimum frequency a word needs to get a vector; it filters out very rare words. The default is 5; lower it for a small corpus.
    9. iter: the maximum number of iterations of stochastic gradient descent, default 5. For a large corpus, this can be increased.
    10. alpha: the initial step size of stochastic gradient descent, written as η in the theory article; default 0.025.
    11. min_alpha: since the algorithm gradually shrinks the step size during training, min_alpha sets its floor. The step size of each round is derived from iter, alpha, and min_alpha together. As this is not core to the word2vec algorithm, the theory article does not cover it.
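
      To make the parameters concrete, an illustrative call training Skip-Gram with Negative Sampling (gensim 3.x names, values picked from the recommended ranges above; tune them for your own corpus):

    model = Word2Vec(
        bigger_list,
        size=300,          # dimensionality of the word vectors
        window=5,          # maximum context distance
        sg=1,              # 1 = Skip-Gram, 0 = CBOW
        hs=0,              # 0 with negative > 0 = Negative Sampling
        negative=5,        # number of negative samples
        min_count=1,       # keep even rare words in this tiny corpus
        iter=5,            # number of training epochs
        alpha=0.025,       # initial learning rate
        min_alpha=0.0001,  # learning-rate floor
        workers=4,
    )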

      Processing with json and pandas

    # libraries used by this code
    import string
    from gensim.models import Word2Vec
    import logging
    from nltk.corpus import stopwords
    from textblob import Word
    import json
    import pandas as pd
    
    # data in json format
    json_file = 'intents.json'
    with open(json_file, 'r') as f:
        data = json.load(f)
    
    # the list of stopwords
    stop = stopwords.words('english')
    
    # dataframe
    df = pd.DataFrame(data)
    
    df['patterns'] = df['patterns'].apply(', '.join)
    # print(df['patterns'])
    
    # cleaning the data using the NLP approach
    print(df)
    df['patterns'] = df['patterns'].apply(lambda x: ' '.join(x.lower() for x in x.split()))
    df['patterns'] = df['patterns'].apply(lambda x: ' '.join(x for x in x.split() if x not in string.punctuation))
    # strip any remaining punctuation (non-word, non-space characters)
    df['patterns'] = df['patterns'].str.replace(r'[^\w\s]', '', regex=True)
    df['patterns'] = df['patterns'].apply(lambda x: ' '.join(x for x in x.split() if not x.isdigit()))
    df['patterns'] = df['patterns'].apply(lambda x: ' '.join(x for x in x.split() if x not in stop))
    df['patterns'] = df['patterns'].apply(lambda x: ' '.join(Word(word).lemmatize() for word in x.split()))
    
    # taking the outer list
    bigger_list = []
    for i in df['patterns']:
        li = list(i.split(" "))
        bigger_list.append(li)
    
    # structure of the data to be fed to Word2Vec
    print("Data format for the overall list:", bigger_list)
    
    # the custom data is fed to the model for training
    model = Word2Vec(bigger_list, min_count=1, size=300, workers=4)
    # print(model)
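
      This variant assumes an intents.json file holding the same list of dicts as the data variable from the first section; under that assumption it could be produced with a couple of lines (a sketch):

    import json
    # write the in-memory `data` list (defined earlier) to intents.json
    with open('intents.json', 'w') as f:
        json.dump(data, f, indent=4)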
    

      
