zoukankan      html  css  js  c++  java
  • nltk 获取 gutenberg 语料,gensim 生成词库和 onehot 编码

    nltk 获取 gutenberg 语料
    gensim 生成词库和 onehot 编码

    正在尝试基于 TensorFlow LSTM 模型开发另外一个项目,需要自然语言处理的工具和语料。


    import nltk
    import numpy as np
    from nltk.corpus import gutenberg
    from gensim import corpora, models, similarities
    
    
    class Book2Array(object):
        """Turn tokenized sentences into fixed-size one-hot encoded arrays.

        A token -> integer id mapping is built from the sentences with
        ``gensim.corpora.Dictionary``.  Each sentence is then encoded as a
        ``(max_len, vocab_size)`` matrix with one one-hot row per token;
        sentences longer than ``max_len`` are truncated and shorter ones are
        padded with all-zero rows.

        Args:
            sentences: Iterable of sentences, each a sequence of string tokens.
                It is iterated more than once, so it must be re-iterable.
            vocab_size: Length of every one-hot vector (default 8192).  A token
                whose id is >= ``vocab_size`` cannot be encoded.
            max_len: Number of token rows per encoded sentence (default 20).
        """

        # Class-level defaults so the encoding sizes are defined even on an
        # instance whose __init__ did not set them.
        sentences = None
        token2id_dic = None
        vocab_size = 8192
        max_len = 20

        def __init__(self, sentences, vocab_size=8192, max_len=20):
            self.sentences = sentences
            self.vocab_size = vocab_size
            self.max_len = max_len
            self.token2id_dic = self.get_token2id_dic()

        def get_sentences(self):
            """Print the number of sentences and return them as a list."""
            sentences_list = list(self.sentences)
            # Report the size of this instance's own data rather than relying
            # on a module-level variable that only exists in script mode.
            print(len(sentences_list))
            return sentences_list

        def get_token2id_dic(self):
            """Build and return the token -> id mapping for all sentences.

            Also prints the vocabulary size.
            """
            # Collect every distinct token of the corpus.
            dictionary = corpora.Dictionary(self.sentences)
            # NOTE: no stop-word or rare-word filtering is applied here;
            # compactify() only closes gaps in the id sequence, if any.
            dictionary.compactify()
            print(len(dictionary))
            return dictionary.token2id

        def word2onehot(self, word):
            """Return the one-hot vector of length ``vocab_size`` for *word*.

            Raises:
                KeyError: If *word* is not in the token -> id mapping.
                IndexError: If the id of *word* is >= ``vocab_size``.
            """
            onehot = np.zeros(self.vocab_size)
            onehot[self.token2id_dic[word]] = 1
            return onehot

        def sent2vec(self, sentence):
            """Encode one sentence as a ``(max_len, vocab_size)`` array.

            Only the first ``max_len`` tokens are used; rows beyond the end of
            a shorter sentence stay all-zero (padding).
            """
            # Preallocate the padded matrix once instead of stacking a list of
            # per-word vectors afterwards.
            matrix = np.zeros((self.max_len, self.vocab_size))
            for row, word in enumerate(sentence[:self.max_len]):
                matrix[row] = self.word2onehot(word)
            return matrix

        def sentences2array(self):
            """Return a list with the encoded matrix of every sentence.

            NOTE: each matrix holds ``max_len * vocab_size`` floats, so the
            result grows quickly with the number of sentences.
            """
            return [self.sent2vec(sentence) for sentence in self.sentences]

        def gen_batch(self):
            """Placeholder for batch generation; not implemented yet."""
            pass
    
    if __name__ == '__main__':
        # Demo: encode the sentences of Macbeth from the NLTK Gutenberg corpus
        # and print the shape of the first encoded sentence.
        macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt')
        encoder = Book2Array(macbeth_sentences)
        encoder.get_sentences()
        encoded_sentences = encoder.sentences2array()
        first_encoded = np.array(encoded_sentences[0])
        print(first_encoded.shape)
    

    更多教程:http://www.tensorflownews.com/
  • 相关阅读:
    Linux运维常用的几个命令介绍【转】
    Linux 删除文件后空间不释放【原创】
    使用 Xtrabackup 在线对MySQL做主从复制【转】
    用Centos7搭建小微企业Samba文件共享服务器【转】
    工作流数据表设计
    mysql函数大全
    git 分支管理
    Bootstap datetimepicker报错TypeError: intermediate value
    分分钟搞定IOS远程消息推送
    Windows10下安装OpenSSL
  • 原文地址:https://www.cnblogs.com/panchuangai/p/12568324.html
Copyright © 2011-2022 走看看