zoukankan      html  css  js  c++  java
  • 基于LDA主题模型和SVM的文本分类

    用LDA模型抽取文本特征,再用线性SVM分类,发现效果很差,F1=0.654。

    Precision:0.680,Recall:0.649,F1:0.654

    RandomForestClassifier的表现也比较差:

    Precision:0.680,Recall:0.668,F1:0.670

    而随便用一个深度学习模型(textCNN,LSTM+Attention)都能达到0.95+的F1,而且还不用处理特征、不用分词。

    说下具体流程:提取LDA特征时,需要CountVectorizer来先对文本进行向量化,首先需要对文本进行分词,考虑到样本数量较多(搜狐新闻数据集,5个类别*3000条信息),使用了多进程(此处用了进程池ProcessPoolExecutor来实现)来进行jieba分词。

    import pandas as pd
    import jieba
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.decomposition import LatentDirichletAllocation
    import multiprocessing
    from concurrent.futures import ProcessPoolExecutor,as_completed
    from utils import log
    from tqdm import tqdm
    import time
    import pickle as pk
    import numpy as np
    from sklearn.model_selection import train_test_split
    from sklearn.svm import LinearSVC,SVC
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import precision_score,recall_score,f1_score
    
    def transform_text(text,stopwords):
        """Segment ``text`` with jieba and return the kept tokens joined by ','.

        A token is dropped when it is empty after ``strip()`` or when it is
        contained in ``stopwords``.
        """
        kept=[]
        for token in jieba.cut(text):
            if not token.strip():
                continue
            if token in stopwords:
                continue
            kept.append(token)
        return ','.join(kept)
    
    def cut_texts(lock,texts,stopwords,processName,doc_list=None):
        """Segment ``texts`` and append the results to ``doc_list`` under ``lock``.

        Intended as the worker for the "process + lock" variant of
        multi-process segmentation.

        Args:
            lock: context-manager lock guarding ``doc_list``.
            texts: iterable of raw strings to segment.
            stopwords: container of tokens to drop (see ``transform_text``).
            processName: label used only in the log message.
            doc_list: list that receives the segmented documents; presumably
                a shared (manager) list when run in several processes --
                TODO confirm against the caller. A fresh list is created
                when omitted.

        Returns:
            ``doc_list`` after the segmented documents were appended.
        """
        log('Process {} is cutting texts...'.format(processName))
        # A ``[]`` default would be one list object shared by every call.
        if doc_list is None:
            doc_list=[]
        docs=[transform_text(text,stopwords) for text in tqdm(texts)]
        # ``with`` guarantees the lock is released even if extend() raises.
        with lock:
            doc_list.extend(docs)
        return doc_list
    
    def cut_texts_pool(texts,stopwords,processName):
        """Segment every item of ``texts`` and return the results as a list.

        This is the task function submitted to the process pool; the
        segmented documents come back in the same order as ``texts``.
        ``processName`` is only used in the log messages.
        """
        log('Process {} is cutting texts...'.format(processName))
        segmented=[transform_text(item,stopwords) for item in tqdm(texts)]
        log('Process {} finished cutting.'.format(processName))
        return segmented
    
    def hard_work(processName):
        """Test helper: simulate a slow task by sleeping for two seconds.

        Logs a message before and after the sleep and returns
        ``processName`` unchanged.
        """
        start_msg='Process {} is running...'.format(processName)
        done_msg='Process {} finished.'.format(processName)
        log(start_msg)
        time.sleep(2)
        log(done_msg)
        return processName
    
    def mp_pool_test(texts=None,res=None):
        """Smoke-test the process pool by running ``hard_work`` once per CPU.

        Submits ``multiprocessing.cpu_count()`` tasks, collects their return
        values in completion order and logs the resulting list.

        ``texts`` and ``res`` are unused; they are kept so existing callers
        keep working.
        """
        n_process=multiprocessing.cpu_count()
        # The context manager shuts the pool down so no worker processes
        # are left behind once the test is over.
        with ProcessPoolExecutor() as pool:
            fs=[pool.submit(hard_work,i) for i in range(n_process)]
            names=[f.result() for f in as_completed(fs)]
        log(names)
    
    def partition(iterable_,n_parittion):
        """Split ``iterable_`` into ``n_parittion`` consecutive, roughly equal lists.

        Every part except the last holds ``len(items)//n_parittion`` items;
        the last part additionally takes the remainder.

        Args:
            iterable_: any iterable; it is materialised into a list first.
            n_parittion: number of parts, a positive int.

        Returns:
            A list of ``n_parittion`` lists that concatenate back to the input.

        Raises:
            AssertionError: if ``n_parittion`` is not a positive int or the
                input has fewer items than ``n_parittion``.
        """
        assert isinstance(n_parittion,int) and n_parittion>0,'Invalid value for "n_partition"'
        items=list(iterable_)
        total=len(items)
        # ``>=``: exactly one item per part is a valid split.
        assert total>=n_parittion,'Size of iterable is less than "n_partition"'

        partition_size=total//n_parittion
        res=[items[partition_size*k:partition_size*(k+1)] for k in range(n_parittion-1)]
        # Compute the tail offset from n_parittion rather than from a loop
        # variable, which is unbound when n_parittion == 1.
        res.append(items[partition_size*(n_parittion-1):])
        return res
    
    def mp_cut_pool(texts,stopwords=None):
        """Segment ``texts`` in parallel with a process pool.

        The input is split into one chunk per CPU, each chunk is segmented by
        ``cut_texts_pool`` in a worker process, and the results are joined in
        submission order, so ``result[i]`` is the segmentation of ``texts[i]``.

        Args:
            texts: iterable of raw strings.
            stopwords: optional iterable of tokens to drop; none by default.

        Returns:
            List of segmented documents, aligned with ``texts``.
        """
        texts=list(texts)
        stopwords=set() if stopwords is None else set(stopwords)
        #one worker per CPU
        n_process=multiprocessing.cpu_count()
        # Too few texts (or a single core) to be worth splitting: segment in
        # this process instead of failing inside partition().
        if n_process<2 or len(texts)<=n_process:
            return cut_texts_pool(texts,stopwords,0)

        chunks=partition(texts,n_process)
        # Size the pool to the number of chunks actually submitted, and let
        # the context manager shut the workers down afterwards.
        with ProcessPoolExecutor(max_workers=n_process) as pool:
            #submit() schedules the call and returns a Future
            fs=[pool.submit(cut_texts_pool,chunk,stopwords,i)
                for i,chunk in enumerate(chunks)]
            docs=[]
            # Read the futures in submission order, not completion order,
            # so the output stays aligned with the input texts.
            for f in fs:
                docs.extend(f.result())
        return docs
    
    class LDA_Transformer:
        """Bag-of-words + LDA feature extractor.

        ``fit`` learns a ``CountVectorizer`` vocabulary and an LDA model with
        ``n_features`` topics; ``transform`` maps texts to their topic
        distribution. Both methods must be given text in the same form (for
        this script: the comma-joined tokens produced by ``transform_text``).
        """

        def __init__(self,n_features):
            #number of LDA topics, i.e. the width of the feature vector
            self.n_features=n_features

        def fit(self,texts):
            """Fit the vectorizer and the LDA model on ``texts``.

            Args:
                texts: a list, or an object exposing ``shape``, of strings.

            Returns:
                ``self``, so calls can be chained.
            """
            log('Building CountVectorizer with texts...')
            # NOTE(review): the default CountVectorizer analyzer is used; per
            # the scikit-learn docs its token_pattern keeps only tokens of 2+
            # word characters, so single-character words are ignored.
            ct=CountVectorizer()
            self.count_vectorizer=ct
            log(type(texts))
            if isinstance(texts,list):
                log('Len of texts:{}'.format(len(texts)))
            else:
                log('Shape of texts:{}'.format(texts.shape))
            # Preview the first document; iterate instead of indexing with 0
            # so empty input or a label-indexed Series does not crash here.
            if len(texts)>0:
                print('texts[0]',next(iter(texts)))
            ctv=ct.fit_transform(texts)
            log('Building LDA model with CountVectorizer..')
            #n_components is the number of LDA topics
            lda=LatentDirichletAllocation(n_components=self.n_features)
            lda.fit(ctv)
            log('Done building LDA model.')
            self.lda_model=lda
            return self

        def transform(self,texts):
            """Return the LDA topic distribution of ``texts``.

            ``fit`` must have been called first; ``texts`` must be in the
            same form as the texts passed to ``fit``.
            """
            count_vec=self.count_vectorizer.transform(texts)
            return self.lda_model.transform(count_vec)
    
    def build_data():
        """Build and save the LDA features and labels for the news data set.

        Steps:
          1. read ``data/souhu_news_400_500.xlsx`` and shuffle its rows;
          2. segment the ``content`` column with jieba in a process pool;
          3. fit ``LDA_Transformer`` on the segmented docs and pickle it to
             ``output/lda_transformer.pkl``;
          4. map the ``topic`` column to integer ids -> ``data/y_lda.pkl``;
          5. transform the segmented docs -> ``data/X_lda.pkl``.
        """
        import os

        df=pd.read_excel('data/souhu_news_400_500.xlsx')
        log(df.columns)

        # Shuffle once, up front, so texts, docs and labels share one order.
        indices=np.random.permutation(df.shape[0])
        df=df.iloc[indices]
        texts=list(df['content'])#text column

        # Segment in parallel. Executor.map() yields results in input order,
        # which keeps docs[i] aligned with the label of row i.
        stopwords=[]
        chunksize=max(1,len(texts)//(multiprocessing.cpu_count()*4))
        with ProcessPoolExecutor() as pool:
            docs=list(pool.map(transform_text,texts,[stopwords]*len(texts),
                               chunksize=chunksize))

        lda_transformer=LDA_Transformer(64)
        lda_transformer.fit(docs)
        #save the fitted LDA transformer
        os.makedirs('output',exist_ok=True)
        with open('output/lda_transformer.pkl','wb') as f:
            pk.dump(lda_transformer,f)

        dic={topic:i for i,topic in enumerate(list(df['topic'].unique()))}
        y=[dic[topic] for topic in list(df['topic'])]
        with open('data/y_lda.pkl','wb') as f:
            pk.dump(y,f)

        # Transform the SEGMENTED docs, i.e. the same representation the
        # model was fitted on. Feeding the raw, unsegmented articles makes
        # the vectorizer find almost no known tokens, so the features carry
        # little information.
        X=lda_transformer.transform(docs)
        with open('data/X_lda.pkl','wb') as f:
            pk.dump(X,f)
        log('Training data is saved.')
    
    def load_train_data():
        """Load the pickled features/labels and split them 80/20.

        Reads ``data/X_lda.pkl`` and ``data/y_lda.pkl`` and returns
        ``(X_train, X_test, y_train, y_test)`` from ``train_test_split``
        with ``test_size=0.2``.
        """
        loaded=[]
        for path in ('data/X_lda.pkl','data/y_lda.pkl'):
            with open(path,'rb') as fh:
                loaded.append(pk.load(fh))
        features,labels=loaded
        return tuple(train_test_split(features,labels,test_size=0.2))
    
    def main():
        """Run the whole experiment: build features, train, evaluate.

        Logs the accuracy and the macro-averaged precision, recall and F1
        of the classifier on the held-out split.
        """
        log('Building training data...')
        build_data()
        log('Loading training data with LDA features...')
        X_train,X_test,y_train,y_test=load_train_data()
        # Swap in LinearSVC() here to evaluate the linear SVM instead.
        model=RandomForestClassifier()
        # Log the classifier actually used rather than a fixed name.
        log('Training {} model..'.format(type(model).__name__))
        model.fit(X_train,y_train)
        log('Evaluating model...')
        acc=model.score(X_test,y_test)
        log('Accuracy:{}'.format(acc))
        y_pred=model.predict(X_test)
        p=precision_score(y_test,y_pred,average='macro')
        r=recall_score(y_test,y_pred,average='macro')
        f1=f1_score(y_test,y_pred,average='macro')
        log('Precision:{:.3f},Recall:{:.3f},F1:{:.3f}'.format(p,r,f1))
    
    
    # Run the experiment only when executed as a script. Per the
    # concurrent.futures docs the __main__ module must be importable by the
    # pool's worker processes, so main() must not run at import time.
    if __name__=='__main__':
        main()
  • 相关阅读:
    Regexp:教程
    Regexp:目录
    笔记-C#:C# 方法、属性杂项-01
    Regexp:正则表达式应用——实例应用
    正则表达式:百科
    Regexp:template
    AngularJS:参考手册
    命令目录
    java实现连续数的公倍数
    java实现连续数的公倍数
  • 原文地址:https://www.cnblogs.com/aaronhoo/p/14087619.html
Copyright © 2011-2022 走看看