zoukankan      html  css  js  c++  java
  • 朴素贝叶斯分类器基本代码 && n折交叉优化 2

    这个代码基于上一个代码

    不同的是:读取了txt文件,改变了min_df与max_df的参数

    import re
    import pandas as pd
    import warnings
    import numpy as np
    from sklearn.metrics import roc_auc_score
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.naive_bayes import MultinomialNB as MNB #多项分布朴素贝叶斯公式
    from sklearn.naive_bayes import BernoulliNB as BNB
    from sklearn.model_selection  import cross_val_score
    warnings.filterwarnings("ignore")
    def proces(col2):
        """Tokenize raw text into lowercase ASCII-letter words.

        Every maximal run of the letters a-z / A-Z in ``col2`` becomes one
        token; digits, punctuation, whitespace and non-ASCII characters act
        only as separators and never appear in the output.

        Args:
            col2: The raw text to tokenize.

        Returns:
            A list of lowercased words, in their original order.
        """
        return [token.lower() for token in re.findall("[a-zA-Z]+", col2)]
    # Load the labelled training set. With header=None and names=[0, 1] the
    # file is parsed into column 0 (label) and column 1 (text).
    train = pd.read_table('sentimentLabel.txt', lineterminator='\n',
                          header=None, names=[0, 1])
    print(train.head(5))

    # Map the textual class names to the integer targets used by the model.
    class_mapping = {'Negative': 0, 'Positive': 1}
    train_labers = train[0].map(class_mapping)
    train_texts = train[1]

    # The test file is read and label-mapped the same way as the training file.
    test = pd.read_table('test.txt', lineterminator='\n',
                         header=None, names=[0, 1])
    test_labers = test[0].map(class_mapping)
    test_texts = test[1]

    # Normalize every document to space-separated lowercase words.
    train_data = [' '.join(proces(text)) for text in train_texts]
    test_data = [' '.join(proces(text)) for text in test_texts]

    # The vectorizer is fitted on train + test together, so the vocabulary and
    # IDF weights cover both sets.
    # NOTE(review): this lets test-set statistics influence the features;
    # confirm this is intended before reporting held-out scores.
    data_all = train_data + test_data
    count_vec = TfidfVectorizer(
        min_df=1,
        max_df=60,            # integer => absolute document-count cut-off
        analyzer='word',
        ngram_range=(1, 2),   # unigrams and bigrams
        use_idf=True,         # real booleans instead of the integer 1
        smooth_idf=True,
        sublinear_tf=True,
        stop_words='english',
    )
    length = len(train_data)
    data_all = count_vec.fit_transform(data_all)

    # Split the combined matrix back into its train and test rows.
    train_data = data_all[:length]
    test_data = data_all[length:]

    # Multinomial Naive Bayes with default hyper-parameters.
    model = MNB()
    model.fit(train_data, train_labers)
    # To score on the held-out test set instead:
    #     pred = model.predict(test_data)
    #     print("roc_auc", roc_auc_score(test_labers, pred))

    # Search that was used to pick the number of folds (kept for reference;
    # each candidate is scored once per iteration):
    #     MX = 0.7996632996632996
    #     MX_idx = 5
    #     for i in range(400, 500):
    #         score = np.mean(cross_val_score(model, train_data, train_labers,
    #                                         cv=i, scoring='roc_auc'))
    #         if MX < score:
    #             MX = score
    #             MX_idx = i
    #     print("roc_auc", MX, MX_idx)

    # Report the mean ROC AUC over 297-fold cross-validation on the training set.
    print("roc_auc", np.mean(cross_val_score(model, train_data, train_labers,
                                             cv=297, scoring='roc_auc')))
    化繁为简 大巧不工
  • 相关阅读:
    jdk版本切换
    Java开发中遇到的问题
    递归删除文件夹
    重写equals方法
    JSP基础
    js把变量转换成json数据
    myBatista批量查询和插入
    Jquery密码强度校验
    Linux配置外网访问mysql
    linux下开启、关闭、重启mysql服务命令
  • 原文地址:https://www.cnblogs.com/mpeter/p/11172284.html
Copyright © 2011-2022 走看看