zoukankan      html  css  js  c++  java
  • 朴素贝叶斯分类器基本代码 && n折交叉优化 2

    这个代码基于上一个代码

    不同的是:读取了txt文件,改变了min_df与max_df的参数

    import re
    import pandas as pd
    import warnings
    import numpy as np
    from sklearn.metrics import roc_auc_score
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.naive_bayes import MultinomialNB as MNB #多项分布朴素贝叶斯公式
    from sklearn.naive_bayes import BernoulliNB as BNB
    from sklearn.model_selection  import cross_val_score
    warnings.filterwarnings("ignore")
    def proces(col2):
        """Split ``col2`` into lower-cased alphabetic tokens.

        Any character outside ``a-z``/``A-Z`` acts as a separator, so digits,
        punctuation and non-ASCII letters never appear in the result.

        Args:
            col2: Raw text of one document.

        Returns:
            List of lower-cased words in their original order; empty when the
            text contains no ASCII letters.
        """
        # Maximal runs of ASCII letters are exactly the tokens left after
        # blanking every other character and splitting on whitespace.
        return [token.lower() for token in re.findall("[a-zA-Z]+", col2)]
    # Load the labelled training set. Column 0 is used as the sentiment label
    # and column 1 as the raw text. The line terminator is given as the escape
    # sequence '\n'; a literal line break inside the quotes is a syntax error.
    train=pd.read_table('sentimentLabel.txt', lineterminator='\n', header=None, names=[0, 1])
    print(train.head(5))
    train_labers=train[0]
    train_texts=train[1]
    # Encode the string labels as integers; labels missing from this mapping
    # become NaN after Series.map.
    class_mapping={'Negative':0, 'Positive':1}
    train_labers=train_labers.map(class_mapping)

    # The test file is read and encoded exactly like the training file.
    test=pd.read_table('test.txt', lineterminator='\n', header=None, names=[0, 1])
    test_labers=test[0]
    test_texts=test[1]
    test_labers=test_labers.map(class_mapping)
    
    # Normalise every document with proces() and re-join its tokens with single
    # spaces, so the vectorizer receives one cleaned string per document.
    train_data=[' '.join(proces(text)) for text in train_texts]
    test_data=[' '.join(proces(text)) for text in test_texts]
    # Training documents come first; the split back into train/test later
    # relies on this order and on len(train_data).
    data_all = train_data+test_data
    # TF-IDF over word uni- and bi-grams with English stop words removed.
    # max_df is an integer here, i.e. an absolute document count: terms that
    # occur in more than 60 documents are dropped. The boolean switches are
    # passed as real booleans, since scikit-learn's parameter validation
    # deprecates and then rejects integers for them.
    count_vec = TfidfVectorizer(min_df=1,
                                max_df=60,
                                analyzer='word',
                                ngram_range=(1, 2),
                                use_idf=True,
                                smooth_idf=True,
                                sublinear_tf=True,
                                stop_words='english'
    )
    # Remember where the training rows end before data_all is replaced by the
    # sparse matrix.
    length=len(train_data)
    # NOTE(review): vocabulary and IDF weights are learned from train and test
    # documents together; confirm this is intended before reporting scores.
    data_all=count_vec.fit_transform(data_all)
    train_data=data_all[:length]
    test_data=data_all[length:]
    
    
    # Multinomial naive Bayes with default hyper-parameters; swap in BNB() to
    # try the Bernoulli variant instead.
    model=MNB()
    #model=BNB()
    model.fit(train_data,train_labers)
    # Hold-out evaluation on the test split (disabled):
    #pred=model.predict(test_data)
    #print("roc_auc",roc_auc_score(test_labers, pred))

    # Exploratory search over the number of CV folds, kept for reference
    # (disabled). The score is computed once per candidate fold count.
    #MX = 0.7996632996632996
    #MX_idx = 5
    #for i in range(400, 500):
    #    score = np.mean(cross_val_score(model, train_data, train_labers, cv=i, scoring='roc_auc'))
    #    if MX < score:
    #        MX = score
    #        MX_idx = i
    #print("roc_auc",MX, MX_idx)

    # Mean ROC AUC over a 297-fold cross-validation of the training data.
    print("roc_auc", np.mean(cross_val_score(model, train_data, train_labers, cv=297, scoring='roc_auc')))
    化繁为简 大巧不工
  • 相关阅读:
    linux c dlopen加载动态链接库
    c++锁 测试 (gcc test.cpp -o test -lpthread)
    shell 清理目录下 超过一段时间的数据。
    大话存储学习笔记
    python总结
    正则表达式使用
    #linux shell#模拟日志生成过程
    深入理解Java虚拟机
    Nginx修改access.log日志时间格式
    mfcs100d.lib(dllmodul.obj) : error LNK2005: _DllMain@12 already defined
  • 原文地址:https://www.cnblogs.com/mpeter/p/11172284.html
Copyright © 2011-2022 走看看