zoukankan html css js c++ java

用 nltk 获取 gutenberg 语料，用 gensim 生成词典和 one-hot 编码

nltk 获取 gutenberg 语料
gensim 生成词典和 one-hot 编码

正在尝试基于 TensorFlow LSTM 模型开发另外一个项目，需要自然语言处理的工具和语料。下面的代码用 nltk 读取 gutenberg 语料，再用 gensim 构建词典（token 到 id 的映射），并把每个句子转换成固定长度的 one-hot 矩阵：

import nltk
import numpy as np
from nltk.corpus import gutenberg
from gensim import corpora, models, similarities


class Book2Array(object):
    """Turn tokenized sentences into fixed-size one-hot matrices.

    A token -> integer id mapping is built from the sentences with
    ``gensim.corpora.Dictionary``.  Each word is then encoded as a one-hot
    vector of width ``vocab_size`` and each sentence as a
    ``(max_len, vocab_size)`` matrix, truncated or zero-padded to ``max_len``
    rows.

    Args:
        sentences: Iterable of sentences, each a sequence of string tokens.
        vocab_size: Width of every one-hot vector.  Must be larger than the
            biggest token id, otherwise encoding that token raises
            ``IndexError``.
        max_len: Number of rows (words) kept per sentence.
    """

    # Class-level defaults so the encoding methods still work on an instance
    # whose attributes were assigned by hand instead of through __init__.
    sentences = None
    token2id_dic = None
    vocab_size = 8192
    max_len = 20

    def __init__(self, sentences, vocab_size=8192, max_len=20):
        self.sentences = sentences
        self.vocab_size = vocab_size
        self.max_len = max_len
        self.token2id_dic = self.get_token2id_dic()

    def get_sentences(self):
        """Return the sentences as a plain list and print how many there are."""
        sentences_list = list(self.sentences)
        # Report the size of *this* instance's data rather than relying on a
        # module-level variable that only exists when run as a script.
        print(len(sentences_list))
        return sentences_list

    def get_token2id_dic(self):
        """Build and return the token -> id dict for ``self.sentences``.

        Prints the number of distinct tokens as a side effect.
        """
        # Collect statistics about all tokens.
        dictionary = corpora.Dictionary(self.sentences)
        # NOTE: no stop-word or rare-word filtering is applied here.
        # compactify() reassigns ids to close gaps in the id sequence, which
        # only matters once tokens have been filtered out of the dictionary.
        dictionary.compactify()
        print(len(dictionary))
        return dictionary.token2id

    def word2onehot(self, word):
        """Return a 1-D float array of length ``vocab_size`` with a single 1.

        Raises:
            KeyError: if ``word`` is not in the token dictionary.
            IndexError: if the word's id is not smaller than ``vocab_size``.
        """
        onehot = np.zeros(self.vocab_size)
        onehot[self.token2id_dic[word]] = 1
        return onehot

    def sent2vec(self, sentence):
        """Encode one sentence as a ``(max_len, vocab_size)`` one-hot matrix.

        Words beyond ``max_len`` are dropped; shorter sentences leave the
        remaining rows all zero.
        """
        # Preallocating the zero matrix provides the padding rows for free.
        matrix = np.zeros((self.max_len, self.vocab_size))
        for row, word in enumerate(sentence[:self.max_len]):
            matrix[row] = self.word2onehot(word)
        return matrix

    def sentences2array(self):
        """Return a list with one one-hot matrix per sentence.

        Every matrix is dense, so the result grows by
        ``max_len * vocab_size`` floats per sentence.
        """
        return [self.sent2vec(sentence) for sentence in self.sentences]

    def gen_batch(self):
        """Placeholder for batch generation; not implemented yet."""
        pass

if __name__ == '__main__':
    # Demo: build the vocabulary from Macbeth and show the shape of one
    # encoded sentence.
    macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt')
    book_array = Book2Array(macbeth_sentences)
    book_array.get_sentences()
    # Encode only the first sentence. Each encoded sentence is a dense
    # 20 x 8192 float64 matrix (about 1.3 MB), so encoding the whole corpus
    # just to inspect one shape wastes a lot of memory and time.
    np_array = book_array.sent2vec(macbeth_sentences[0])
    print(np_array.shape)

更多教程：http://www.tensorflownews.com/

查看全文

相关阅读:
b_lc_长度为 3 的不同回文子序列（统计两个相同字符中间有多少个不同字符）
b_lc_最小未被占据椅子的编号（记录每个时间来的人 + pq）
b_lc_统计好数字的数量（排列数+组合数+快速幂）
TreeMap
LinkedHashMap
HashMap的总结
 HashMap
Collection
Map
LinkedList学习

原文地址：https://www.cnblogs.com/panchuangai/p/12568324.html