  • (3) Machine Learning in Action Notes: Naive Bayes

    Pros: still effective when the amount of data is small; handles multi-class problems
    Cons: sensitive to how the input data is prepared
    Suitable data type: nominal (categorical) data
     
     
    Turn a set of words into a set of numbers
    Use those numbers to compute probabilities
     
    Well-known application: filtering spam with naive Bayes
    Workflow:
    (1) Collect the data: text files are provided
    (2) Prepare the data: parse the text into token vectors
    (3) Analyze the data: inspect the tokens to make sure parsing is correct
    (4) Train the algorithm: use the trainNB0() function built earlier
    (5) Test the algorithm: use classifyNB() and build a new test function that computes the document error rate
    (6) Use the algorithm: build a complete program that classifies a set of documents and prints the misclassified ones
     
    Tokenizing text: split it with String.split() (a regular expression handles punctuation better)
    To estimate the classifier's error rate more accurately, average the error rate over several random train/test splits (see the sketch below)
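
    For reference, classifyNB() below works in log space: a document is labeled 1 when log p(w|1) + log p(1) > log p(w|0) + log p(0). Summing logs avoids the numerical underflow caused by multiplying many small word probabilities. A minimal sketch of that comparison on made-up numbers (the probabilities and the prior are illustrative, not taken from the dataset):

    import numpy as np

    # illustrative per-word conditional probabilities p(word | class)
    p_word_given_1 = np.array([0.05, 0.20, 0.001])
    p_word_given_0 = np.array([0.10, 0.01, 0.020])
    p_class1 = 0.4  # illustrative class prior

    doc = np.array([1, 1, 0])  # which vocabulary words appear in the document

    # sum of log-probabilities for the words present, plus the log prior
    log_p1 = np.sum(doc * np.log(p_word_given_1)) + np.log(p_class1)
    log_p0 = np.sum(doc * np.log(p_word_given_0)) + np.log(1.0 - p_class1)
    print(1 if log_p1 > log_p0 else 0)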

    ——————————————————————————————-

    A simple example: spam classification with naive Bayes

    An email text dataset is processed (converted into vectors)

    and fed through a naive Bayes classifier to decide whether each message is spam.

    The code implements a simple naive Bayes classifier and a text-to-vector converter.

    Detailed notes are in the comments; the dataset can be downloaded via the link in the original post.

    import numpy as np
    from functools import reduce
    
    # Prepare the data: build word vectors from text
    def loadDataSet():
        # tokenized posts
        postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                       ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                       ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                       ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                       ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                       ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
        # class label vector: 1 = abusive, 0 = not abusive
        classVec = [0, 1, 0, 1, 0, 1]
        # return the tokenized posts and the class label vector
        return postingList, classVec
    
    def createVocabList(dataSet):
        # collect the unique words
        vocabSet = set([])
        for document in dataSet:
            vocabSet = vocabSet | set(document)  # union of the two sets
    
        return list(vocabSet)
    
    # Mark which vocabulary words appear in the input document (inputSet)
    def setOfWords2Vec(vocabList,inputSet):
        # create a vector of zeros, one slot per vocabulary word
        returnVec = [0]*len(vocabList)
        for word in inputSet:
            if word in vocabList:
                returnVec[vocabList.index(word)] = 1  # set-of-words: record presence only
        # else: print("the word: %s is not in my Vocabulary!" % word)
    
        return returnVec
    
    listOPosts,listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    # print(myVocabList)
    # print(setOfWords2Vec(myVocabList,listOPosts[0]))
    
    # Train the algorithm: compute probabilities from the word vectors
    # Input: document matrix trainMatrix and the vector of class labels trainCategory
    def trainNB0(trainMatrix,trainCategory):
        numTrainDocs = len(trainMatrix)
        numWords = len(trainMatrix[0])  # number of words in the vocabulary
    
        # initialize the counts and probabilities
        pAbusive = sum(trainCategory)/float(numTrainDocs)  # prior probability that a document is abusive
        p0Num = np.ones(numWords)  # word counts for the non-abusive class
        p1Num = np.ones(numWords)  # word counts for the abusive class; both start at 1 (Laplace smoothing)
        # so that a single zero probability cannot wipe out the whole product
        p0Denom=2.0
        p1Denom=2.0
    
        for i in range(numTrainDocs):
            # P(1), the probability that a document is abusive, is pAbusive above;
            # for a two-class problem P(0) = 1 - P(1).
            # whenever a word appears in a document, its count for that class is incremented,
            # and the class's total word count grows by the document's word total

            if trainCategory[i]==1:  # accumulate counts for the abusive class
                p1Num += trainMatrix[i]  # add this document's word vector to the abusive word counts
                p1Denom += sum(trainMatrix[i])  # add this document's word total to the abusive denominator
            else:
                p0Num += trainMatrix[i]
                p0Denom += sum(trainMatrix[i])
    
        p1Vect = np.log(p1Num/p1Denom)  # take the log so classifyNB() can sum log-probabilities
        p0Vect = np.log(p0Num/p0Denom)
    
        return p0Vect,p1Vect,pAbusive
    
    trainMat=[]
    for postinDoc in listOPosts:
        trainMat.append(setOfWords2Vec(myVocabList,postinDoc))
        # mark which vocabulary words appear in post postinDoc;
        # each returned vector has the length of the vocabulary
    # print(trainMat)
    
    p0V,p1V,pAb=trainNB0(trainMat,listClasses)
    # print("0")
    # print(p0V)
    # print("1")
    # print(p1V)
    # print("A")
    # print(pAb)
    # print(myVocabList)
    
    # Naive Bayes classifier: takes the vector to classify plus the three probabilities computed by trainNB0()
    def classifyNB(vec2Classify,p0Vec,p1Vec,pClass1):
        p1 = sum(vec2Classify*p1Vec)+np.log(pClass1)
        p0 = sum(vec2Classify*p0Vec)+np.log(1.0-pClass1)
        if p1>p0:
            return 1
        else:
            return 0
    # a simple classification test
    def testingNB():
        listOposts,listClasses = loadDataSet()
        myVocabList = createVocabList(listOposts)
        trainMat=[]
    
        for postinDoc in listOposts:
            trainMat.append(setOfWords2Vec(myVocabList,postinDoc))
        p0V,p1V,pAb = trainNB0(np.array(trainMat),np.array(listClasses))
        testEntry=['love','my','dalmation']
    
        thisDoc = np.array(setOfWords2Vec(myVocabList,testEntry))
        print(testEntry,'classified as:',classifyNB(thisDoc,p0V,p1V,pAb))
        testEntry=['stupid','garbage']
        thisDoc = np.array(setOfWords2Vec(myVocabList,testEntry))
        print(testEntry,'classified as:',classifyNB(thisDoc,p0V,p1V,pAb))
    
    
    
    testingNB()
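
    # With the toy posts above, testingNB() should report the first entry as class 0 (not abusive)
    # and the second entry as class 1 (abusive).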
    
    # Bag-of-words model: every occurrence of a word increments its slot in the vector, instead of just setting it to 1 (see the comparison below)
    def bagOfWords2VecMN(vocabList,inputSet):
        returnVec = [0]*len(vocabList)
        for word in inputSet:
            if word in vocabList:
                returnVec[vocabList.index(word)]+=1
        return returnVec
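
    # A quick comparison of the two encodings on a made-up document ('dog' and 'stupid'
    # are in the toy vocabulary built above; the sample document is just an illustration):
    sample = ['dog', 'dog', 'stupid']               # 'dog' appears twice
    print(setOfWords2Vec(myVocabList, sample))      # the 'dog' slot holds 1
    print(bagOfWords2VecMN(myVocabList, sample))    # the 'dog' slot holds 2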
    
    # Application: filtering spam email

    # Tokenizing the text
    
    #test!
    # mySent = 'This book is the best book on python or M.L. I have laid eyes upon.'
    #
    # import re
    # regEx = re.compile(r'\W*')
    # listOfTokens = regEx.split(mySent)
    
    # Test: validate naive Bayes with random hold-out splits
    def textParse(bigString):
        import re
        listOfTokens = re.split(r'\W+', bigString)  # split on any run of non-word characters
        return [tok.lower() for tok in listOfTokens if len(tok) > 2]
        # keep only tokens longer than 2 characters, lower-cased
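
    # A quick sanity check on a sample sentence (the sentence is just an illustration):
    # print(textParse('This book is the best book on Python or M.L. I have laid eyes upon.'))
    # -> ['this', 'book', 'the', 'best', 'book', 'python', 'have', 'laid', 'eyes', 'upon']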
    
    
    
    # This function automates the naive Bayes spam filter: it loads the text files under spam/ and ham/ and parses each into a word list. (*1)
    # The probabilities the classifier needs are computed from the training documents only.
    # The Python variable trainingSet is a list of integers from 0 to 49. (*2)
    
    
    
    def spamTest():
        docList=[]
        classList=[]
        fullText=[]
        main_email=[]
        for i in range(1,26):
            #(*1)
            wordList = textParse(open('email/spam/%d.txt'%i).read())
            main_e=open('email/spam/%d.txt'%i).read()
            main_email.append(main_e)
            docList.append(wordList)
            fullText.extend(wordList)
            classList.append(1)
    
            wordList = textParse(open('email/ham/%d.txt'%i).read())
            docList.append(wordList)
            main_e = open('email/ham/%d.txt' % i).read()
            main_email.append(main_e)
            fullText.extend(wordList)
            classList.append(0)
    
    
    
        # build the vocabulary
        vocabList = createVocabList(docList)
        # print("vocabList built:")
        # print(vocabList)
        # print("=========================================================")
    
    
    
        # split off a test set (*2)
        trainingSet = list(range(50))
        testSet=[]
        for i in range(10): # randomly pick 10 documents for the test set
            randIndex = int(np.random.uniform(0,len(trainingSet)))  # draw a random index into the remaining pool
            testSet.append(trainingSet[randIndex])  # move that document's index into the test set
            del(trainingSet[randIndex])  # and remove it from the pool of training indices
    
    
        trainMat=[]
        trainClasses =[]
        for docIndex in trainingSet:
            # build a word vector for each training document
            trainMat.append(setOfWords2Vec(vocabList,docList[docIndex]))
            trainClasses.append(classList[docIndex])  # attach the matching label
        # train on the training set only
        #
        # print(trainMat)
        # print(trainClasses)
    
        p0V,p1V,pSpam=trainNB0(np.array(trainMat),np.array(trainClasses))
        # print(p0V)
    
        errorCount = 0
    
        for docIndex in testSet:
            # build the word vector for test document docIndex: mark which vocabulary words it contains
            # print("train")
            # print(docList[docIndex])
            wordVector = setOfWords2Vec(vocabList,docList[docIndex])
            # print(wordVector)
    
            if classifyNB(np.array(wordVector),p0V,p1V,pSpam)!=classList[docIndex]:
                errorCount +=1
                print(main_email[docIndex])
                print(classifyNB(np.array(wordVector), p0V, p1V, pSpam))
                print(classList[docIndex])
    
    
    
    
        print('the error rate is :',float(errorCount)/len(testSet))
    
    # spamTest()
    
    # Find the best-performing parameters over repeated random splits
    
    
    def findthebest_Data_test():
        docList = []
        classList = []
        fullText = []
        main_email = []
        for i in range(1, 26):
            # (*1)
            wordList = textParse(open('email/spam/%d.txt' % i).read())
            main_e = open('email/spam/%d.txt' % i).read()
            main_email.append(main_e)
            docList.append(wordList)
            fullText.extend(wordList)
            classList.append(1)
    
            wordList = textParse(open('email/ham/%d.txt' % i).read())
            docList.append(wordList)
            main_e = open('email/ham/%d.txt' % i).read()
            main_email.append(main_e)
            fullText.extend(wordList)
            classList.append(0)
    
        # build the vocabulary
        vocabList = createVocabList(docList)
        # print("vocabList built:")
        # print(vocabList)
        # print("=========================================================")
    
        # split off a test set (*2)
        trainingSet = list(range(50))
        testSet = []
        for i in range(10):  # randomly pick 10 documents for the test set
            randIndex = int(np.random.uniform(0, len(trainingSet)))  # draw a random index into the remaining pool
            testSet.append(trainingSet[randIndex])  # move that document's index into the test set
            del (trainingSet[randIndex])  # and remove it from the pool of training indices
    
        trainMat = []
        trainClasses = []
        for docIndex in trainingSet:
            # build a word vector for each training document
            trainMat.append(setOfWords2Vec(vocabList, docList[docIndex]))
            trainClasses.append(classList[docIndex])  # attach the matching label
        # train on the training set only
        #
        # print(trainMat)
        # print(trainClasses)
    
        p0V, p1V, pSpam = trainNB0(np.array(trainMat), np.array(trainClasses))
        # print(p0V)
    
        errorCount = 0
    
        for docIndex in testSet:
            # build the word vector for test document docIndex: mark which vocabulary words it contains
            # print("train")
            # print(docList[docIndex])
            wordVector = setOfWords2Vec(vocabList, docList[docIndex])
            # print(wordVector)
    
            if classifyNB(np.array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
                errorCount += 1
                # print(main_email[docIndex])
                # print(classifyNB(np.array(wordVector), p0V, p1V, pSpam))
                # print(classList[docIndex])
    
        # print('the error rate is :', float(errorCount) / len(testSet))
        error_rate=float(errorCount) / len(testSet)
        return p0V, p1V, pSpam,error_rate
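
    # As noted at the top, a more stable error estimate comes from averaging over several
    # random splits. A minimal sketch using the helper above, left commented out (like
    # spamTest()) because it needs the email/ dataset on disk; the number of runs is arbitrary:
    # runs = 10
    # rates = [findthebest_Data_test()[3] for _ in range(runs)]
    # print('average error rate over %d runs:' % runs, sum(rates) / runs)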
    
    def find_the_data():
        # keep the probability vectors and prior from the random split with the lowest error rate
        p0Num = np.ones(10)  # placeholder shape; overwritten by the best run's vectors
        p1Num = np.ones(10)
        PA = 0.0
        err=1
        for i in range(50):
            a,b,c,d=findthebest_Data_test()
            if d<err:
                err = d
                p0Num=a
                p1Num=b
                PA=c
    
    
        return p0Num,p1Num,PA
    
    
    
    def final_test():
        p0,p1,pA =find_the_data()
    
    
    
        docList = []
        classList = []
        fullText = []
        main_email = []
        for i in range(1, 26):
            # (*1)
            wordList = textParse(open('email/spam/%d.txt' % i).read())
            main_e = open('email/spam/%d.txt' % i).read()
            main_email.append(main_e)
            docList.append(wordList)
            fullText.extend(wordList)
            classList.append(1)
    
            wordList = textParse(open('email/ham/%d.txt' % i).read())
            docList.append(wordList)
            main_e = open('email/spam/%d.txt' % i).read()
            main_email.append(main_e)
            fullText.extend(wordList)
            classList.append(0)
    
        vocabList = createVocabList(docList)
    
        errorCount = 0
    
        for i in range(len(docList)):
            # build the word vector for document i: mark which vocabulary words it contains
            # print("train")
            # print(docList[docIndex])
            wordVector = setOfWords2Vec(vocabList, docList[i])
            # print(wordVector)
    
    
            if classifyNB(np.array(wordVector), p0, p1, pA) != classList[i]:
                errorCount += 1
                # print(main_email[i])
                # print(classifyNB(np.array(wordVector), p0, p1, pA))
                # print(classList[i])
    
        print('the error rate is :', float(errorCount) / len(docList))
    
    
    
    final_test()
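
    # Note that final_test() scores the chosen model on all 50 documents, including the ones
    # it was trained on, so this error rate is closer to a training error and will usually
    # look better than the hold-out estimate from spamTest().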
    

     

  • Original post: https://www.cnblogs.com/AKsnoopy/p/14085074.html