zoukankan html css js c++ java

朴素贝叶斯

从词向量计算概率


import numpy as np

def loadDataSet():
    """
    实验样本
    :return: 第一个变量是进行词条切分后的文档集合，第二个变量是一个类别标签的集合
    """
    postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                   ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                   ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                   ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                   ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                   ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classVec = [0, 1, 0, 1, 0, 1]  # 人工标记 【0，stupid愚蠢，0，stupid愚蠢worthless垃圾garbage一文不值，0，worthless垃圾stupid愚蠢】   #1 is abusive, 0 not
    return postingList, classVec

def createVocabList(dataSet):
    """
    创建一个包含所以文档中出现的不重复词的列表
    :param dataSet:
    :return:
    """
    vocabSet = set([])  #create empty set
    for document in dataSet:
        vocabSet = vocabSet | set(document)  # 并集 #union of the two sets
    return list(vocabSet)

def setOfWords2Vec(vocabList, inputSet):
    """

    :param vocabList: 词汇表
    :param inputSet: 某个文档
    :return: 词汇表长度的列表，1表示出现，0没有出现
    """
    returnVec = [0]*len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
        else: print("the word: %s is not in my Vocabulary!" % word)
    return returnVec

def trainNB0(trainMatrix, trainCategory):
    """
    分类器训练
    :param trainMatrix: 文档矩阵 训练集
    :param trainCategory: 文档类别标签向量
    :return:
    """
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory)/float(numTrainDocs)
    p0Num = np.zeros(numWords); p1Num = np.zeros(numWords)      #change to np.ones()
    # p0Num = np.ones(numWords); p1Num = np.ones(numWords)      #change to np.ones()
    p0Denom = 0.0; p1Denom = 0.0                        #change to 2.0
    # p0Denom = 2.0; p1Denom = 2.0                        #change to 2.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    p1Vect = p1Num/p1Denom          #change to np.log()
    # p1Vect = np.log(p1Num/p1Denom)          #change to np.log()
    p0Vect = p0Num/p0Denom         #change to np.log()
    # p0Vect = np.log(p0Num/p0Denom)          #change to np.log()
    return p0Vect, p1Vect, pAbusive

if __name__ == '__main__':
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    print(myVocabList)

    trainMat = []
    for postinDoc in listOPosts:
        trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
    print(trainMat)
    p0v, p1v, pAb = trainNB0(trainMat, listClasses)
    print(pAb)
    print(p0v)
    print(p1v)

'''
['love', 'take', 'cute', 'so', 'flea', 'posting', 'stop', 'help', 'mr', 'stupid', 'ate', 'garbage', 'has', 'I', 'problems', 'licks', 'worthless', 'is', 'how', 'not', 'maybe', 'dalmation', 'food', 'buying', 'please', 'him', 'park', 'quit', 'steak', 'my', 'dog', 'to']
[[0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1], [1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0], [0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1], [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0]]
0.5
[0.04166667 0.         0.04166667 0.04166667 0.04166667 0.
 0.04166667 0.04166667 0.04166667 0.         0.04166667 0.
 0.04166667 0.04166667 0.04166667 0.04166667 0.         0.04166667
 0.04166667 0.         0.         0.04166667 0.         0.
 0.04166667 0.08333333 0.         0.         0.04166667 0.125
 0.04166667 0.04166667]
[0.         0.05263158 0.         0.         0.         0.05263158
 0.05263158 0.         0.         0.15789474 0.         0.05263158
 0.         0.         0.         0.         0.10526316 0.
 0.         0.05263158 0.05263158 0.         0.05263158 0.05263158
 0.         0.05263158 0.05263158 0.05263158 0.         0.
 0.10526316 0.05263158]


cute 在类别0中出现1次，类别1中出现0次，对应的条件概率分别是0.04166667与0.

p1v中最大概率是0.15789474对应stupid，这意味stupid是最能表征类别1的单词

'''

修改分类器

计算多个概率的额乘积以获得文档属于某个类别的概率如果其中一个概率值为0，那么最后的乘积也为0，
为降低这种影像，可以将所以词的出现数初始化为1，将分母初始化为2
p0Num = np.ones(numWords); p1Num = np.ones(numWords)
p0Denom = 2.0; p1Denom = 2.0
下溢出问题，由于太多很小的数相乘造成的，通过求对数可以避免下溢出或浮点数舍入导致的错误
p1Vect = np.log(p1Num/p1Denom)
p0Vect = np.log(p0Num/p0Denom)

分类函数

def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """
    分类函数
    :param vec2Classify:
    :param p0Vec:
    :param p1Vec:
    :param pClass1:
    :return:
    """
    p1 = sum(vec2Classify * p1Vec) + np.log(pClass1)    #element-wise mult
    p0 = sum(vec2Classify * p0Vec) + np.log(1.0 - pClass1)
    if p1 > p0:
        return 1
    else:
        return 0

测试

def testingNB():
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = []
    for postinDoc in listOPosts:
        trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
    p0V, p1V, pAb = trainNB0(np.array(trainMat), np.array(listClasses))
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = np.array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as: ', classifyNB(thisDoc, p0V, p1V, pAb))
    testEntry = ['stupid', 'garbage']
    thisDoc = np.array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as: ', classifyNB(thisDoc, p0V, p1V, pAb))

查看全文

相关阅读:
Object.assign
js获取 some方法索引值
 Vue配置sass
spring MVC，controller中获得resuqest和response的方式
 CentOS7中启动Tomcat后，8080端口不能被外部访问的解决办法。
spring mvc 中 controller 路径配置
 Spring扫面路径配置不全导致异常 org.apache.ibatis.binding.BindingException: Invalid bound statement (not found): 的原因
 CentOS7中安装MySQL5.7
eclipse maven web
用Eclipse进行远程Debug代码

原文地址：https://www.cnblogs.com/fly-book/p/14215301.html