zoukankan      html  css  js  c++  java
  • 朴素贝叶斯分类器基本代码 && n折交叉优化 2

    这个代码基于上一个代码

    不同的是:读取了txt文件,改变了min_df与max_df的参数

    import re
    import pandas as pd
    import warnings
    import numpy as np
    from sklearn.metrics import roc_auc_score
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.naive_bayes import MultinomialNB as MNB #多项分布朴素贝叶斯公式
    from sklearn.naive_bayes import BernoulliNB as BNB
    from sklearn.model_selection  import cross_val_score
    warnings.filterwarnings("ignore")
    def proces(col2):
        """Split a raw text into lowercase alphabetic words.

        Every character outside ``a-z``/``A-Z`` acts as a separator, so digits,
        punctuation and non-ASCII characters are dropped.

        Args:
            col2: Raw text of one sample. Non-string values (for example the
                float ``NaN`` pandas produces for a missing field, or ``None``)
                are treated as empty text instead of raising ``TypeError``.

        Returns:
            list: The lowercase words in order of appearance; empty when the
            input contains no ASCII letters.
        """
        if not isinstance(col2, str):
            # A missing cell carries no words; str(NaN) would wrongly yield "nan".
            return []
        col2_text = re.sub(r"[^a-zA-Z]", " ", col2)
        return col2_text.lower().split()
    # Both files are read without a header row into two columns named 0 and 1
    # (read_table's default separator is a tab): column 0 is the label string,
    # column 1 the raw text.
    # The '\n' line-terminator literal had been split across two physical lines
    # in the published listing, which is a syntax error; it is restored here.
    train=pd.read_table('sentimentLabel.txt', lineterminator='\n', header=None, names=[0, 1])
    print(train.head(5))
    train_labers=train[0]
    train_texts=train[1]
    # Label strings that are not keys of this mapping become NaN after .map().
    class_mapping={'Negative':0, 'Positive':1}
    train_labers=train_labers.map(class_mapping)
    #print(labers)

    test=pd.read_table('test.txt', lineterminator='\n', header=None, names=[0, 1])
    test_labers=test[0]
    test_texts=test[1]
    test_labers=test_labers.map(class_mapping)
    
    # Turn every document into one space-joined string of cleaned words.
    train_data=[' '.join(proces(train_texts[idx])) for idx in range(len(train_texts))]
    test_data=[' '.join(proces(test_texts[idx])) for idx in range(len(test_texts))]
    # Combined corpus: all training documents first, then the test documents.
    data_all = train_data+test_data
    # TF-IDF over word unigrams and bigrams with English stop words removed.
    # min_df/max_df are given as ints, i.e. absolute document counts: a term is
    # kept when it occurs in at least 1 and at most 60 documents.
    count_vec = TfidfVectorizer(min_df=1,
                                max_df=60,
                                analyzer='word',
                                ngram_range=(1, 2),
                                # Real booleans: recent scikit-learn releases
                                # validate these parameters and reject the int 1.
                                use_idf=True,
                                smooth_idf=True,
                                sublinear_tf=True,
                                stop_words='english'
    )
    # Number of training rows, used to split the combined matrix back apart.
    length=len(train_data)
    # NOTE(review): vocabulary and IDF weights are learned from the train and
    # test documents together -- confirm this is intended, since test text then
    # influences the training features.
    data_all=count_vec.fit_transform(data_all)
    #print(data_all)
    train_data=data_all[:length]
    test_data=data_all[length:]
    
    
    # Multinomial naive Bayes with default settings; the Bernoulli variant is
    # left below as a commented-out alternative.
    model=MNB()
    #model=BNB()
    model.fit(train_data,train_labers)
    # The stray expression `MNB(alpha=1.0, class_prior=False, fit_prior=True)`
    # that followed the fit was removed: it built an estimator that was never
    # used, and False is not a valid class_prior value.
    #pred=model.predict(test_data)
    #print("roc_auc",roc_auc_score(test_labers, pred))

    # Disabled search over the number of folds (previously kept inside a bare
    # triple-quoted string). Each candidate is now scored once, not twice:
    # MX = 0.7996632996632996
    # MX_idx = 5
    # for i in range(400, 500):
    #     score = np.mean(cross_val_score(model, train_data, train_labers, cv=i, scoring='roc_auc'))
    #     if MX < score:
    #         MX = score
    #         MX_idx = i
    # print("roc_auc",MX, MX_idx)

    # Mean ROC-AUC across 297 cross-validation folds of the training set.
    print("roc_auc", np.mean(cross_val_score(model, train_data, train_labers, cv=297, scoring='roc_auc')))
    化繁为简 大巧不工
  • 相关阅读:
    分治法(求最大子序列和)
    分治法(二分查找)
    自定义选择动画提示
    将图片转为二进制字符串
    根据尺寸压缩图片
    在ios7系统下,scrollView下移20像素
    UIActionSheet警告,提示调用showFromTabBar方法
    使用Xcode5开发时的icon取消高光效果
    duplicate symbol _OBJC_METACLASS_$ 报错记录
    self.view添加UIView时添加动画
  • 原文地址:https://www.cnblogs.com/mpeter/p/11172284.html
Copyright © 2011-2022 走看看