  • Machine Learning: Naive Bayes Spam Classification

    The code comes from https://www.cnblogs.com/huangyc/p/10327209.html ; I am only studying it briefly here.
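
    The classifier below implements standard naive Bayes: for a document w = (w1, ..., wn), it picks the class Ci maximizing p(Ci|w) ∝ p(Ci) * p(w1|Ci) * ... * p(wn|Ci), treating the word probabilities as conditionally independent given the class. fit() estimates these quantities from the training data, and predict() compares the two classes in log space.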

    1. 贝叶斯.py

    import numpy as np
    from word_utils import *
    
    
    
    class NaiveBayesBase(object):
    
        def __init__(self):
            pass
    
    
        def fit(self, trainMatrix, trainCategory):
            '''
            Train the naive Bayes classifier: estimate p(Ci) and the vocabulary-based p(w|Ci).
            Args:
                trainMatrix : training matrix, i.e. the documents after vectorization (one word vector per document)
                trainCategory : class label of each training document
            Return:
                p0Vect : probability vector for class 0 (p(w1|C0), p(w2|C0), ..., p(wn|C0))
                p1Vect : probability vector for class 1 (p(w1|C1), p(w2|C1), ..., p(wn|C1))
                pAbusive : probability that a document belongs to class 1
            '''
            numTrainDocs = len(trainMatrix)
            # vector length equals the vocabulary size
            numWords = len(trainMatrix[0])
            # p(C1): fraction of class-1 (abusive) documents
            self.pAbusive = sum(trainCategory) / float(numTrainDocs)
            # Later we compute p(w|Ci) = p(w1|Ci)*p(w2|Ci)*...*p(wn|Ci); if some word wj never appears in class Ci, then p(wj|Ci) = 0 and the whole product collapses to 0, which is clearly wrong.
            # So initialize every word count to 1 and the denominator (the total word count) to 2 (Laplace smoothing).
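            # (Hypothetical numbers for illustration: if 'stupid' occurs 3 times
            # across class-1 documents containing 19 words in total, the smoothed
            # estimate below works out to p('stupid'|C1) = (3 + 1) / (19 + 2).)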
            p0Num = np.ones(numWords)
            p1Num = np.ones(numWords)
            p0Denom = 2.0
            p1Denom = 2.0
            for i in range(numTrainDocs):
                if trainCategory[i] == 1:
                    p1Num += trainMatrix[i]
                    p1Denom += sum(trainMatrix[i])
                else:
                    p0Num += trainMatrix[i]
                    p0Denom += sum(trainMatrix[i])
            # p(wi | c1)
            # Take logs to avoid underflow (the product of many small probabilities rounds to 0.0; summing their logs does not)
            self.p1Vect = np.log(p1Num / p1Denom)
            # p(wi | c0)
            self.p0Vect = np.log(p0Num / p0Denom)
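            # Since log(a*b) = log(a) + log(b), the product over words becomes
            # a sum of these log-vectors plus log p(Ci) in predict() below.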
            return self
    
    
        def predict(self, testX):
            '''
            Naive Bayes classification: pick the class with the larger log posterior.
            Args:
                testX : document vector to classify (already converted to an array)
            Uses:
                p0Vect : p(w|C0)
                p1Vect : p(w|C1)
                pAbusive : p(C1)
            Return:
                1 : abusive document (log(p(w|C1)*p(C1)) = sum(log p(wi|C1)) + log p(C1))
                0 : non-abusive document (log(p(w|C0)*p(C0)) = sum(log p(wi|C0)) + log p(C0))
            '''
    
            p1 = np.sum(testX * self.p1Vect) + np.log(self.pAbusive)
            p0 = np.sum(testX * self.p0Vect) + np.log(1 - self.pAbusive)
            if p1 > p0:
                return 1
            else:
                return 0
    
    def loadDataSet():
        '''Data loading function; a small toy example.'''
        postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                       ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                       ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                       ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                       ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                       ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
        classVec = [0, 1, 0, 1, 0, 1]  # class labels of the 6 documents above: 1 = abusive text, 0 = normal speech
        return postingList, classVec
    
    
    def checkNB():
        '''Quick test: train on the toy data, then classify two new documents.'''
        listPosts, listClasses = loadDataSet()
        myVocabList = createVocabList(listPosts)
        trainMat = []
        for postDoc in listPosts:
            trainMat.append(setOfWord2Vec(myVocabList, postDoc))
    
        nb = NaiveBayesBase()
        nb.fit(np.array(trainMat), np.array(listClasses))
    
        testEntry1 = ['love', 'my', 'dalmation']
        thisDoc = np.array(setOfWord2Vec(myVocabList, testEntry1))
        print(testEntry1, 'classified as:', nb.predict(thisDoc))
    
        testEntry2 = ['stupid', 'garbage']
        thisDoc2 = np.array(setOfWord2Vec(myVocabList, testEntry2))
        print(testEntry2, 'classified as:', nb.predict(thisDoc2))
    
    
    if __name__ == "__main__":
        checkNB()
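    Running 贝叶斯.py directly calls checkNB(). Given the toy training data above, the console output should look like the following (an expected result sketched from the code, not a captured run):

    ['love', 'my', 'dalmation'] classified as: 0
    ['stupid', 'garbage'] classified as: 1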

    2. word_utils.py

    def createVocabList(dataSet):
        '''
        Build the list of unique words appearing across all documents.
        Args:
            dataSet: all documents
        Return:
            the list of unique words over all documents, i.e. the vocabulary
        '''
        vocabSet = set([])
        # take the union of the running vocabulary set and this document's words
        for document in dataSet:
            vocabSet = vocabSet | set(document)
        return list(vocabSet)
    
    
    # Bag-of-words model: how many times each word occurs in the document
    def bagOfWords2Vec(vocabList, inputSet):
        '''
        Convert an input document into a bag-of-words vector based on the vocabulary.
        Args:
            vocabList: vocabulary
            inputSet: current input document
        Return:
            returnVec: the document converted to a word-count vector
        Example:
            vocabList = ['I', 'love', 'python', 'and', 'machine', 'learning']
            inputSet = ['python', 'machine', 'learning', 'python', 'machine']
            returnVec = [0, 0, 2, 0, 2, 1]
            Same length as the vocabulary; each position holds how many times that
            word occurs in the document (0 if absent); words not in the vocabulary
            trigger a print.
        '''
        returnVec = [0] * len(vocabList)
        for word in inputSet:
            if word in vocabList:
                returnVec[vocabList.index(word)] += 1
            else:
                print("the word: %s is not in my vocabulary!" % word)
        return returnVec
    
    
    # Set-of-words model: whether each word occurs in the document (1 if present, 0 if not)
    def setOfWord2Vec(vocabList, inputSet):
        '''
        Convert an input document into a set-of-words vector based on the vocabulary.
        Args:
            vocabList: vocabulary
            inputSet: current input document
        Return:
            returnVec: the document converted to a 0/1 word-presence vector
        Example:
            vocabList = ['I', 'love', 'python', 'and', 'machine', 'learning']
            inputSet = ['python', 'machine', 'learning']
            returnVec = [0, 0, 1, 0, 1, 1]
            Same length as the vocabulary; 1 where the word occurs, 0 where it does
            not; words not in the vocabulary trigger a print.
        '''
        returnVec = [0] * len(vocabList)
        for word in inputSet:
            if word in vocabList:
                returnVec[vocabList.index(word)] = 1
            else:
                print("the word: %s is not in my vocabulary!" % word)
        return returnVec
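    A quick way to check the difference between the two vectorizers (a minimal sketch; the fixed-order vocabList below is assumed for illustration, since createVocabList builds the vocabulary from a set and so returns words in arbitrary order):

    from word_utils import bagOfWords2Vec, setOfWord2Vec

    vocabList = ['I', 'love', 'python', 'and', 'machine', 'learning']
    print(bagOfWords2Vec(vocabList, ['python', 'machine', 'learning', 'python', 'machine']))
    # -> [0, 0, 2, 0, 2, 1]  (word counts)
    print(setOfWord2Vec(vocabList, ['python', 'machine', 'learning']))
    # -> [0, 0, 1, 0, 1, 1]  (word presence)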