zoukankan      html  css  js  c++  java
  • 朴素贝叶斯分类器基本代码 && n折交叉优化 2

    这个代码基于上一个代码

    不同的是:读取了txt文件,改变了min_df与max_df的参数

    import re
    import pandas as pd
    import warnings
    import numpy as np
    from sklearn.metrics import roc_auc_score
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.naive_bayes import MultinomialNB as MNB #多项分布朴素贝叶斯公式
    from sklearn.naive_bayes import BernoulliNB as BNB
    from sklearn.model_selection  import cross_val_score
    warnings.filterwarnings("ignore")
    def proces(col2):
        """Split a raw text into lower-case alphabetic tokens.

        Every character outside ``a-z`` / ``A-Z`` is treated as a separator,
        so digits, punctuation and non-ASCII characters never appear in the
        result.

        Args:
            col2: The raw text of one document.

        Returns:
            A list of lower-cased words, in their original order.
        """
        # Blank out the non-letters first so that a plain whitespace split
        # yields exactly the maximal runs of letters.
        return re.sub("[^a-zA-Z]", " ", col2).lower().split()
    # Load the labelled training set. The file has no header row, so the two
    # columns are named 0 (sentiment label) and 1 (raw text).
    # The line terminator is written as the escape '\n'; a literal line break
    # inside the quotes is an unterminated string and does not parse.
    train=pd.read_table('sentimentLabel.txt', lineterminator='\n', header=None, names=[0, 1])
    print(train.head(5))
    train_labers=train[0]
    train_texts=train[1]
    # Encode the string labels as integers; any label that is not a key of
    # this mapping becomes NaN after Series.map.
    class_mapping={'Negative':0, 'Positive':1}
    train_labers=train_labers.map(class_mapping)

    # The held-out set is read and encoded exactly like the training set.
    test=pd.read_table('test.txt', lineterminator='\n', header=None, names=[0, 1])
    test_labers=test[0]
    test_texts=test[1]
    test_labers=test_labers.map(class_mapping)
    
    # Clean every document with proces() and join the tokens back into one
    # string, because the vectorizer is fed one string per document.
    # Iterating the Series directly avoids depending on a 0..n-1 index.
    train_data=[' '.join(proces(text)) for text in train_texts]
    test_data=[' '.join(proces(text)) for text in test_texts]
    data_all = train_data+test_data
    # TF-IDF over word unigrams and bigrams. An integer max_df is an absolute
    # document count: terms found in more than 60 documents are dropped.
    # The three on/off switches are passed as real booleans; recent
    # scikit-learn releases validate parameter types and reject plain ints.
    count_vec = TfidfVectorizer(min_df=1,
                                max_df=60,
                                analyzer='word',
                                ngram_range=(1, 2),
                                use_idf=True,
                                smooth_idf=True,
                                sublinear_tf=True,
                                stop_words='english'
    )
    # Remember where the training rows end so the combined matrix can be
    # split back after the transform.
    length=len(train_data)
    # NOTE(review): the vocabulary and IDF weights are learned from train and
    # test texts together; confirm this is intended before reporting
    # held-out scores.
    count_vec.fit(data_all)
    data_all=count_vec.transform(data_all)
    train_data=data_all[:length]
    test_data=data_all[length:]
    
    
    # Multinomial naive Bayes on the TF-IDF features (BNB is the Bernoulli
    # alternative that can be swapped in here).
    model=MNB()
    #model=BNB()
    model.fit(train_data,train_labers)
    # Optional evaluation on the held-out set (disabled):
    #pred=model.predict(test_data)
    #print("roc_auc",roc_auc_score(test_labers, pred))

    # Disabled search for the fold count with the best mean ROC-AUC; the
    # score is computed once per candidate instead of twice:
    #   MX = 0.7996632996632996
    #   MX_idx = 5
    #   for i in range(400, 500):
    #       score = np.mean(cross_val_score(model, train_data, train_labers, cv=i, scoring='roc_auc'))
    #       if MX < score:
    #           MX = score
    #           MX_idx = i
    #   print("roc_auc",MX, MX_idx)

    # Report the mean ROC-AUC over 297-fold cross-validation on the training
    # data. NOTE(review): cv=297 presumably came from a search like the one
    # above - confirm, and check that every class has at least 297 samples.
    print("roc_auc", np.mean(cross_val_score(model, train_data, train_labers, cv=297, scoring='roc_auc')))
    化繁为简 大巧不工
  • 相关阅读:
    芯片手冊写错了,非常悲剧
    编程之美读书笔记1.8
    贪心算法
    codeforces Looksery Cup 2015 D. Haar Features
    NS3网络仿真(5): 数据包分析
    hdu1116 Play on Words--并查集
    程序猿生存定律-六个程序猿的故事(2)
    保存Activity的状态
    9月份入读哈工大计算机研究生了
    leetcode第一刷_Same Tree
  • 原文地址:https://www.cnblogs.com/mpeter/p/11172284.html
Copyright © 2011-2022 走看看