zoukankan      html  css  js  c++  java
  • 主题模型理解

    先上代码

     # Topic-model walkthrough: tokenize a corpus, clean it, build TF-IDF and
     # LDA representations with gensim, then rank documents by similarity to
     # the first document.
     #
     # Every line of corpus.txt is treated as one document.
     # NOTE: word_tokenize and stopwords.words rely on NLTK data packages
     # (tokenizer models and the stopwords corpus) being downloaded already.
     import logging

     import nltk
     from gensim import corpora, models, similarities
     from nltk.corpus import stopwords
     from nltk.stem.lancaster import LancasterStemmer
     from nltk.tokenize import word_tokenize

     # Logging: show timestamp, level and message for gensim's progress output.
     logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

     # Decode once at the I/O boundary and close the file deterministically.
     with open('corpus.txt', encoding='utf-8') as openfile:
         texts = openfile.readlines()

     # Lower-case every token of every document.
     texts_tokenized = [[word.lower() for word in word_tokenize(text)] for text in texts]

     print(texts_tokenized)

     # Drop English stopwords; a set gives O(1) membership tests.
     english_stopwords = stopwords.words('english')
     stopword_set = set(english_stopwords)
     texts_filterd_stopwords = [
         [word for word in text_tokenized if word not in stopword_set]
         for text_tokenized in texts_tokenized
     ]

     # Drop tokens that are a single punctuation mark.
     english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%']
     punctuation_set = set(english_punctuations)
     texts_filterd_punctuations = [
         [word for word in text_filterd_stopwords if word not in punctuation_set]
         for text_filterd_stopwords in texts_filterd_stopwords
     ]
     print(texts_filterd_punctuations)

     # Stemming.
     st = LancasterStemmer()
     texts_stemmed = [
         [st.stem(word) for word in text_filterd_punctuations]
         for text_filterd_punctuations in texts_filterd_punctuations
     ]

     # 1. Build the dictionary (token -> integer id) from the stemmed documents.
     dictionary = corpora.Dictionary(texts_stemmed)
     print(dictionary.token2id)

     # 2. Vectorize: convert each document to its bag-of-words representation.
     corpus = [dictionary.doc2bow(text_stemmed) for text_stemmed in texts_stemmed]
     print(corpus)

     # 3. Train the LDA model.
     # gensim models are applied with bracket syntax (model[corpus]), not called.
     tfidf = models.TfidfModel(corpus)
     corpus_tfidf = tfidf[corpus]
     for text in corpus_tfidf:
         print(text)

     # The LDA model is trained on the TF-IDF weighted corpus, as in the
     # original listing.
     lda = models.LdaModel(corpus_tfidf, id2word=dictionary, num_topics=2)
     corpus_lda = lda[corpus_tfidf]

     # Show the topic distribution of the first six documents.
     for text in corpus_lda[0:6]:
         print(text)

     # 4. Compute similarity of every document to document 0 in LDA topic
     # space, and list (document index, similarity) pairs, most similar first.
     lda_index = similarities.MatrixSimilarity(corpus_lda)
     sims = lda_index[corpus_lda[0]]
     sort_sims = sorted(enumerate(sims), key=lambda item: -item[1])
     print(sort_sims)

    后面再补充说明。代码来自吕云翔《机器学习基础》。

  • 相关阅读:
    Python列表去重
    hash表长度优化证明
    DDD初学指南
    继承和实现的明显区别
    微信商户支付
    centos7+mono4+jexus5.6.2安装过程中的遇到的问题
    SVN:重命名文件之后不允许提交
    SpringMVC 自定义全局日期转换器
    解决Cannot change version of project facet Dynamic web module to 2.5
    Maven项目热部署到Tomcat容器下
  • 原文地址:https://www.cnblogs.com/www-caiyin-com/p/11074956.html
Copyright © 2011-2022 走看看