zoukankan      html  css  js  c++  java
  • 朴素贝叶斯应用:垃圾邮件分类

    # Placeholder used when nltk is not available: no preprocessing is applied.
    def pre(text):
        """Return *text* unchanged (stand-in for a real preprocessing step)."""
        return text
    
    #读取数据
    import csv
    # with open(r'd:/SMSSpamCollectionjsn.txt',encoding = "utf-8")as file_path:
    # with open(r'C:\Users\Administrator\Desktop\SMSSpamCollection.csv','r',encoding='utf-8')as file_path:
    #     sms=file_path.read()
    # print(sms)
    # Load the tab-separated SMS corpus: field 0 of each row is stored as the
    # label, field 1 (after pre()) as the message text.
    file_path=r'd:/SMSSpamCollectionjsn.txt'
    sms_data=[]
    sms_label=[]
    # `with` closes the file even if a row fails to parse; newline='' is what
    # the csv module asks for so that it can handle line endings itself.
    # NOTE(review): csv.reader keeps its default quoting here, so a '"' inside
    # a message is treated as a quote character -- confirm that is intended.
    with open(file_path,'r',encoding="utf-8",newline='') as sms:
        reader=csv.reader(sms,delimiter='\t')
        for line in reader:
            # Skip blank or malformed rows instead of failing with IndexError.
            if len(line)<2:
                continue
            sms_label.append(line[0])
            sms_data.append(pre(line[1]))
    
    # Training and test sets: hold out 30% of the labelled messages for
    # testing, stratified by label, with a fixed seed for reproducibility.
    from sklearn.model_selection import train_test_split
    split_parts=train_test_split(
        sms_data,
        sms_label,
        test_size=0.3,
        random_state=0,
        stratify=sms_label,
    )
    x_train,x_test,y_train,y_test=split_parts
    # Sizes of the full corpus, the training part and the test part.
    print(len(sms_data),len(x_train),len(x_test))
    x_train
    
    
    # Vectorize the text: fit a TF-IDF model on the training messages
    # (unigrams and bigrams, English stop words removed, accents stripped,
    # terms must occur in at least 2 documents) and transform them.
    from sklearn.feature_extraction.text import TfidfVectorizer
    vectorizer=TfidfVectorizer(
        min_df=2,
        ngram_range=(1,2),
        stop_words='english',
        strip_accents='unicode',
    )
    x_train=vectorizer.fit_transform(x_train)
    # Read the shape straight from the matrix returned by fit_transform;
    # calling toarray() first would build a full dense copy only to discard it.
    x_train.shape
    
    
    
    (3898, 6649)
    
    
    # Project the test messages onto the vocabulary learned from the training set.
    x_test=vectorizer.transform(x_test)
    # Naive Bayes classifier: fit on the training vectors, then predict the test labels.
    from sklearn.naive_bayes import MultinomialNB
    nb_model=MultinomialNB()
    result=nb_model.fit(x_train,y_train)
    y_pred=result.predict(x_test)
    
    
    # Show the classification results: evaluate the predictions with a
    # confusion matrix and a report of precision, recall and F-score.
    from sklearn.metrics import classification_report, confusion_matrix
    print(y_pred.shape,y_pred)
    cm=confusion_matrix(y_test,y_pred)
    cr=classification_report(y_test,y_pred)
    print('nb_confusion_matrix:')
    print(cm)
    print('nb_classification_report:')
    print(cr)
    
    
    (1671,) ['ham' 'ham' 'ham' ... 'ham' 'spam' 'ham']
    nb_confusion_matrix:
    [[1447    0]
     [  48  176]]
    nb_classification_report:
                 precision    recall  f1-score   support
    
            ham       0.97      1.00      0.98      1447
           spam       1.00      0.79      0.88       224
    
    avg / total       0.97      0.97      0.97      1671
    
    
    
    # Feature ranking: print, side by side, the n terms with the lowest and
    # the n terms with the highest log-probability under the second class.

    # get_feature_names() was removed in scikit-learn 1.2; use
    # get_feature_names_out() when it exists and fall back otherwise.
    if hasattr(vectorizer,'get_feature_names_out'):
        feature_names=vectorizer.get_feature_names_out()  # vocabulary terms
    else:
        feature_names=vectorizer.get_feature_names()
    # MultinomialNB.coef_ / intercept_ were deprecated in 0.24 and removed in
    # 1.1. For a two-class model (ham/spam here) they were defined as
    # feature_log_prob_[1:] and class_log_prior_[1:], so these are the same values.
    xgailv=result.feature_log_prob_[1:]    # log P(x_i|y) for the second class (a likelihood, not a prior)
    intercept=result.class_log_prior_[1:]  # log P(y) for the second class
    # Pair each log-probability with its term and sort ascending.
    xgailv_with_fns=sorted(zip(xgailv[0],feature_names))

    n=10
    # Left columns: the n smallest values; right columns: the n largest (descending).
    top=zip(xgailv_with_fns[:n],xgailv_with_fns[:-(n+1):-1])
    for (coef_1,fn_1),(coef_2,fn_2) in top:
        print('\t%.4f\t%-15s\t%.4f\t%-15s' % (coef_1,fn_1,coef_2,fn_2))
    
    
    
    -9.1053	10 smth        	-6.1149	free           
    	-9.1053	15             	-6.3421	txt            
    	-9.1053	2go            	-6.4948	mobile         
    	-9.1053	2gthr          	-6.5769	text           
    	-9.1053	2gthr drinking 	-6.5780	claim          
    	-9.1053	2marrow        	-6.6015	stop           
    	-9.1053	2morrow        	-6.6108	ur             
    	-9.1053	2mrw           	-6.6352	reply          
    	-9.1053	2mrw luv       	-6.7198	www            
    	-9.1053	2nd ur         	-6.7481	prize    
    
    # Display the terms kept in the fitted vocabulary. get_feature_names() was
    # removed in scikit-learn 1.2, so prefer get_feature_names_out() when present.
    vectorizer.get_feature_names_out() if hasattr(vectorizer,'get_feature_names_out') else vectorizer.get_feature_names()
    

      

  • 相关阅读:
    /sbin/nologin 和 /bin/false 的区别
    lesson
    df 命令详解
    课后习题-7 阶段汇总
    javascript Window Navigator
    javascript 正则表达式
    linux crontab 的使用
    linux环境变量
    linux第一个C语言和sh脚本
    linux 文件常用操作
  • 原文地址:https://www.cnblogs.com/cc013/p/10059022.html
Copyright © 2011-2022 走看看