zoukankan      html  css  js  c++  java
  • 朴素贝叶斯应用:垃圾邮件分类

    import nltk
    # Fetch only the resources this script uses. A bare nltk.download() opens
    # the interactive downloader, which blocks any unattended run.
    # NOTE(review): recent NLTK releases may also require 'punkt_tab' for
    # tokenization -- confirm against the installed version.
    nltk.download('punkt', quiet=True)      # sentence / word tokenizer models
    nltk.download('stopwords', quiet=True)  # stop-word lists
    nltk.download('wordnet', quiet=True)    # data for WordNetLemmatizer
    from nltk.corpus import stopwords
    from nltk.stem import WordNetLemmatizer
    
    #预处理
    def preprocessing(text):
        """Normalize one message into a space-joined string of lemmas.

        The text is split into sentences and then into word tokens. Tokens
        are lower-cased, English stop words and tokens shorter than two
        characters are dropped, and the remaining tokens are lemmatized.

        Args:
            text: Raw message text.

        Returns:
            The surviving lemmas joined by single spaces (an empty string
            when nothing survives the filters).
        """
        # Fixed: the original called nltk.word_tokrnize, which does not exist.
        tokens = [
            word.lower()
            for sent in nltk.sent_tokenize(text)
            for word in nltk.word_tokenize(sent)
        ]
        # Lower-casing happens before the stop-word filter so capitalized stop
        # words ("The", "AND") are removed too; the original filtered first.
        # A set makes each membership test O(1) instead of a list scan.
        stops = set(stopwords.words('english'))
        lmtzr = WordNetLemmatizer()
        return ' '.join(
            lmtzr.lemmatize(token)
            for token in tokens
            if len(token) >= 2 and token not in stops
        )
    
    # Load the dataset: each row is split on a tab into a label (column 0)
    # and the message text (column 1).
    import csv
    # NOTE(review): the published path had lost its backslashes
    # ('C:UsersAdministratorDesktop...'). The separators below are restored
    # on the assumption that the file sat on the Desktop -- adjust to the
    # real location of SMSSpamCollectionjsn.txt.
    file_path = r'C:\Users\Administrator\Desktop\SMSSpamCollectionjsn.txt'
    sms_data = []
    sms_label = []
    # The context manager closes the file even if a row fails to process;
    # newline='' is what the csv module expects for files it reads.
    with open(file_path, 'r', encoding='utf-8', newline='') as sms:
        csv_reader = csv.reader(sms, delimiter='\t')
        for line in csv_reader:
            if len(line) < 2:
                # Blank or malformed row: there is no text column to index.
                continue
            sms_label.append(line[0])
            sms_data.append(preprocessing(line[1]))
    
    # Split into training and test sets: 30% of the messages are held out,
    # with a fixed seed and stratification on the labels.
    from sklearn.model_selection import train_test_split
    x_train, x_test, y_train, y_test = train_test_split(
        sms_data,
        sms_label,
        test_size=0.3,
        random_state=0,
        stratify=sms_label,
    )
    
    # Turn the preprocessed messages into TF-IDF feature vectors.
    from sklearn.feature_extraction.text import TfidfVectorizer
    vectorizer = TfidfVectorizer(
        min_df=2,
        ngram_range=(1, 2),
        stop_words='english',
        strip_accents='unicode',
        # Fixed: the original passed '12' (digits one-two), which is not a
        # valid norm; the intended value is 'l2' (letter L, digit two).
        norm='l2',
    )
    # Fit the vocabulary on the training messages only, then reuse it for
    # the test messages.
    X_train = vectorizer.fit_transform(x_train)
    X_test = vectorizer.transform(x_test)
    
    # Naive Bayes classifier.
    # Fixed: the original imported MultinomiaNB from sklearn.navie_bayes;
    # both names were misspelled and the import failed.
    from sklearn.naive_bayes import MultinomialNB
    clf = MultinomialNB().fit(X_train, y_train)
    
    # Evaluate the model: predict labels for the held-out messages.
    y_nb_pred = clf.predict(X_test)

    # Report the results.
    from sklearn.metrics import classification_report, confusion_matrix

    # Shape of the prediction array, followed by the predictions themselves.
    print(y_nb_pred.shape, y_nb_pred)

    # Confusion matrix of true versus predicted labels.
    print('nb_confusion_matrix:')
    cm = confusion_matrix(y_true=y_test, y_pred=y_nb_pred)
    print(cm)

    # Text report of the main classification metrics.
    print('nb_classification_report:')
    cr = classification_report(y_true=y_test, y_pred=y_nb_pred)
    print(cr)
  • 相关阅读:
    Unique Binary Search Trees——LeetCode
    Binary Tree Inorder Traversal ——LeetCode
    Maximum Product Subarray——LeetCode
    Remove Linked List Elements——LeetCode
    Maximum Subarray——LeetCode
    Validate Binary Search Tree——LeetCode
    Swap Nodes in Pairs——LeetCode
    Find Minimum in Rotated Sorted Array——LeetCode
    Linked List Cycle——LeetCode
    VR AR MR
  • 原文地址:https://www.cnblogs.com/hodafu/p/10037332.html
Copyright © 2011-2022 走看看