  • Applying the LDA model to email classification

    Hands-on with the LDA model

    # -*- coding: utf-8 -*-
    """
    Created on Tue Dec  8 00:02:27 2020
    
    @author: Administrator
    """
    
    import numpy as np
    import pandas as pd
    import re
    
    
    # Load the data
    df = pd.read_csv("../input/HillaryEmails.csv")
    # The raw email data contains many NaN values; drop those rows outright.
    df = df[['Id','ExtractedBodyText']].dropna()
    
    
    # Text preprocessing
    def clean_email_text(text):
        text = text.replace('\n', " ")  # newlines are not needed
        text = re.sub(r"-", " ", text)  # split hyphenated words (e.g. july-edu ==> july edu)
        text = re.sub(r"\d+/\d+/\d+", "", text)  # dates carry no meaning for a topic model
        text = re.sub(r"[0-2]?[0-9]:[0-6][0-9]", "", text)  # times, meaningless here
        text = re.sub(r"[\w]+@[.\w]+", "", text)  # email addresses, meaningless here
        text = re.sub(r"[a-zA-Z]*://[A-Za-z0-9.\-_/%&=?]+", "", text)  # URLs, meaningless here
        pure_text = ''
        # In case other special characters (digits, etc.) remain, loop over the
        # text once more and filter them out
        for letter in text:
            # keep only letters and spaces
            if letter.isalpha() or letter == ' ':
                pure_text += letter
        # Finally, drop the single-letter fragments left over after stripping
        # special characters; only meaningful words remain.
        text = ' '.join(word for word in pure_text.split() if len(word) > 1)
        return text
    
    
    docs = df['ExtractedBodyText']
    docs = docs.apply(lambda s: clean_email_text(s))

    docs.head(1).values   # peek at the first cleaned email
    doclist = docs.values  # the full array of cleaned documents
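    # Not in the original post: a quick sanity check (on a made-up string)
    # that the cleaner strips dates, times, addresses and URLs as intended.
    sample = "Meet on 12/08/2020 at 10:30, mail h@state.gov or see https://example.com/x"
    print(clean_email_text(sample))  # -> "Meet on at mail or see"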
    
    
    # Building the LDA model
    from gensim import corpora, models, similarities
    import gensim
    
    '''
    To skip the hassle of explaining how to install NLTK and the like, I simply
    hand-write the stop-word list here. These words can refer to completely
    different things in different contexts, yet they appear with almost the
    same probability under every topic, so they must be removed; otherwise
    they hurt the model's accuracy.
    '''
    stoplist = ['very', 'ourselves', 'am', 'doesn', 'through', 'me', 'against', 'up', 'just', 'her', 'ours', 
                'couldn', 'because', 'is', 'isn', 'it', 'only', 'in', 'such', 'too', 'mustn', 'under', 'their', 
                'if', 'to', 'my', 'himself', 'after', 'why', 'while', 'can', 'each', 'itself', 'his', 'all', 'once', 
                'herself', 'more', 'our', 'they', 'hasn', 'on', 'ma', 'them', 'its', 'where', 'did', 'll', 'you', 
                'didn', 'nor', 'as', 'now', 'before', 'those', 'yours', 'from', 'who', 'was', 'm', 'been', 'will', 
                'into', 'same', 'how', 'some', 'of', 'out', 'with', 's', 'being', 't', 'mightn', 'she', 'again', 'be', 
                'by', 'shan', 'have', 'yourselves', 'needn', 'and', 'are', 'o', 'these', 'further', 'most', 'yourself', 
                'having', 'aren', 'here', 'he', 'were', 'but', 'this', 'myself', 'own', 'we', 'so', 'i', 'does', 'both', 
                'when', 'between', 'd', 'had', 'the', 'y', 'has', 'down', 'off', 'than', 'haven', 'whom', 'wouldn', 
                'should', 've', 'over', 'themselves', 'few', 'then', 'hadn', 'what', 'until', 'won', 'no', 'about', 
                'any', 'that', 'for', 'shouldn', 'don', 'do', 'there', 'doing', 'an', 'or', 'ain', 'hers', 'wasn', 
                'weren', 'above', 'a', 'at', 'your', 'theirs', 'below', 'other', 'not', 're', 'him', 'during', 'which']
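    # Not in the original post: if NLTK happens to be installed, its standard
    # English stop-word list can replace the hand-written one above (it needs
    # a one-time download of the 'stopwords' corpus):
    #   import nltk
    #   from nltk.corpus import stopwords
    #   nltk.download('stopwords', quiet=True)
    #   stoplist = set(stopwords.words('english'))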
    
    
    
    # Tokenization. (Chinese tokenization is a bit more involved; look up
    # CoreNLP, HanLP, jieba, etc.)
    texts = [[word for word in doc.lower().split() if word not in stoplist] for doc in doclist]
    texts[0]  # inspect the first tokenized document
    
    
    # Build the dictionary and the corpus
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]  # bag-of-words model
    corpus[13]  # a list of (word_id, count) pairs for document 13
    
    lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=20)
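    # Not in the original post: optionally persist the trained model so it
    # need not be retrained on every run (the file name is arbitrary).
    lda.save('hillary_lda.model')
    lda = gensim.models.ldamodel.LdaModel.load('hillary_lda.model')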
    
    # The 5 most probable words of topic 10
    lda.print_topic(10, topn=5)
    # Top-5 words of every topic
    lda.print_topics(num_topics=20, num_words=5)
    # Topic distribution of one document (any doc2bow vector works here)
    lda.get_document_topics(corpus[0])
    # Topics associated with a single word, looked up by its dictionary id
    word_id = dictionary.token2id.get('state', 0)  # 'state' is an arbitrary example token
    lda.get_term_topics(word_id)
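    # Not in the original post: classifying a new, unseen email with the
    # trained model. The sample text is made up; it must go through the same
    # cleaning and bag-of-words conversion as the training data.
    new_email = "The secretary will travel to Beijing to discuss trade next week"
    new_tokens = [w for w in clean_email_text(new_email).lower().split()
                  if w not in stoplist]
    new_bow = dictionary.doc2bow(new_tokens)
    print(lda.get_document_topics(new_bow))  # (topic_id, probability) pairs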
    

      
