  • Classification with Naive Bayes

    Python implementation

     The key idea is the Naive Bayes conditional-independence assumption: p(w|c1) = p(w1|c1)*p(w2|c1)*p(w3|c1)*...*p(wn|c1)

    By Bayes' theorem, p(c1|w) = [p(w|c1)*p(c1)] / p(w), where p(w) is the sum of p(w|ci)*p(ci) over all classes i from 1 to n, so p(w) is the same constant for every class.

    Therefore it is enough to compute p(w|ci)*p(ci) for each class and compare the results. For intuition, see the short sketch below.
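
    A minimal sketch (not part of the original code) of the comparison the classifier below ends up doing in log space; the word likelihoods and priors are made-up numbers purely to illustrate the arithmetic:

    import numpy as np

    # hypothetical per-word likelihoods P(wi|c) and class priors P(c) -- illustrative values only
    log_p_w_c1 = np.log(np.array([0.05, 0.20, 0.10]))  # log P(wi | c1)
    log_p_w_c0 = np.log(np.array([0.15, 0.02, 0.08]))  # log P(wi | c0)
    log_p_c1, log_p_c0 = np.log(0.5), np.log(0.5)       # log priors

    doc = np.array([1, 0, 1])  # 0/1 word-presence vector for one document

    # compare log[p(w|c)*p(c)]; p(w) is identical for both classes, so it is dropped
    score1 = np.sum(doc * log_p_w_c1) + log_p_c1
    score0 = np.sum(doc * log_p_w_c0) + log_p_c0
    print(1 if score1 > score0 else 0)  # predicted class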

    def loadDataSet():
        postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                       ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                       ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                       ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                       ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                       ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
        classVec = [0, 1, 0, 1, 0, 1]  # 1 = abusive, 0 = normal
        return postingList, classVec
    
    
    # build a list of all unique words appearing in the documents
    def createVocabList(dataSet):
        vocabSet = set([])
        for document in dataSet:
            # take the union of the two sets
            vocabSet = vocabSet | set(document)
        return list(vocabSet)
    
    
    # convert a document into a 0/1 set-of-words vector over the vocabulary
    def setOfWords2Vec(vocabList, inputSet):
        returnVec = [0] * len(vocabList)
        for word in inputSet:
            if word in vocabList:
                # index of this word in the vocabulary
                returnVec[vocabList.index(word)] = 1
            else:
                print('the word:%s is not in my Vocabulary!' % word)
        return returnVec
    
    
    # bag-of-words model: count how many times each word occurs
    def bagOfWords2VecMN(vocabList, inputSet):
        returnVec = [0] * len(vocabList)
        for word in inputSet:
            if word in vocabList:
                returnVec[vocabList.index(word)] += 1
        return returnVec
    
    
    import numpy as np
    
    
    # train the Naive Bayes model: returns log P(w|c0), log P(w|c1) and the abusive-class prior
    def trainB0(trainMatrix, trainCategory):
        numTrainDocs = len(trainMatrix)
        numWords = len(trainMatrix[0])
        # fraction of training documents that are abusive (class 1)
        pAbusive = sum(trainCategory) / float(numTrainDocs)
        # P(x|c=0): with the independence assumption this is P(x1|c=0)*P(x2|c=0)*...
        # Counts start at 1 and denominators at 2.0 (Laplace smoothing) so that a
        # single unseen word does not zero out the whole product.
        # p0Num = np.zeros(numWords)
        p0Num = np.ones(numWords)
        # P(x|c=1), likewise
        # p1Num = np.zeros(numWords)
        p1Num = np.ones(numWords)
        # p0Denom = 0.0
        # p1Denom = 0.0
        p0Denom = 2.0
        p1Denom = 2.0
        for i in range(numTrainDocs):
            if trainCategory[i] == 1:
                p1Num += trainMatrix[i]
                p1Denom += sum(trainMatrix[i])
            else:
                p0Num += trainMatrix[i]
                p0Denom += sum(trainMatrix[i])
        # p1Vect = p1Num / p1Denom
        # p0Vect = p0Num / p0Denom
        # take logs so the many small probabilities do not underflow when multiplied
        p1Vect = np.log(p1Num / p1Denom)
        p0Vect = np.log(p0Num / p0Denom)
        return p0Vect, p1Vect, pAbusive
    
    
    # vec2Classify is the word vector of the document being classified
    def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
        p1 = sum(vec2Classify * p1Vec) + np.log(pClass1)
        p0 = sum(vec2Classify * p0Vec) + np.log(1.0 - pClass1)
        if p1 > p0:
            return 1
        else:
            return 0
    
    
    def textTrain():
        postingList, classVec = loadDataSet()
        dataSet = createVocabList(postingList)
        trainMatrix = []
        for postinDoc in postingList:
            # vector representation of each document, appended to trainMatrix
            trainMatrix.append(setOfWords2Vec(dataSet, postinDoc))
            # trainMatrix.append(bagOfWords2VecMN(dataSet, postinDoc))
        p0v, p1v, pAb = trainB0(trainMatrix, classVec)
        print(p0v)
        print(p1v)
        print(pAb)
    # textTrain()
    
    
    def testingNB():
        listOppsts, listClasses = loadDataSet()
        myVocabList = createVocabList(listOppsts)
        trainMat = []
        for postinDoc in listOppsts:
            trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
        p0v, p1v, pAb = trainB0(trainMat, listClasses)
        testEntry = ['love', 'my', 'dalmation']
        thisDoc = np.array(setOfWords2Vec(myVocabList, testEntry))
        print('%s classified as: %s' % (testEntry, classifyNB(thisDoc, p0v, p1v, pAb)))
        testEntry = ['stupid', 'garbage']
        thisDoc = np.array(setOfWords2Vec(myVocabList, testEntry))
        print('%s classified as: %s' % (testEntry, classifyNB(thisDoc, p0v, p1v, pAb)))
    
    
    def textParse(bigString):
        import re
        # split on any run of characters that are not letters, digits or underscore
        listOfTokens = re.split(r'\W+', bigString)
        # drop short tokens and lower-case the rest
        return [tok.lower() for tok in listOfTokens if len(tok) > 2]
    
    
    # automated test of the Naive Bayes spam classifier
    # hold-out cross-validation on a random train/test split
    def spamTest():
        docList = []
        classList = []
        fullText = []
        for i in range(1, 26):
            # spam examples (class 1)
            wordList = textParse(open('email/spam/%d.txt' % i).read())
            docList.append(wordList)
            fullText.extend(wordList)
            classList.append(1)
            # ham examples (class 0)
            wordList = textParse(open('email/ham/%d.txt' % i).read())
            docList.append(wordList)
            fullText.extend(wordList)
            classList.append(0)
        # build the vocabulary from all documents
        vocabList = createVocabList(docList)
        # indices of all 50 documents; becomes the training set after the split below
        trainingSet = list(range(50))
        testSet = []
        # randomly split off 10 documents as the test set
        for i in range(10):
            # pick a random index from the remaining training indices
            randIndex = int(np.random.uniform(0, len(trainingSet)))
            testSet.append(trainingSet[randIndex])
            del (trainingSet[randIndex])
        trainMat = []
        trainClasses = []
        for docIndex in trainingSet:
            trainMat.append(setOfWords2Vec(vocabList, docList[docIndex]))
            trainClasses.append(classList[docIndex])
        p0V, p1V, pSpam = trainB0(trainMat, trainClasses)
        errorCount = 0
        # classify each document in the held-out test set
        for docIndex in testSet:
            wordVector = setOfWords2Vec(vocabList, docList[docIndex])
            if classifyNB(np.array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
                errorCount += 1
        print("the error rate is:", float(errorCount) / len(testSet))
    
    
    import feedparser
    
    
    # ny = feedparser.parse('http://feed.cnblogs.com/blog/u/205667/rss')
    # print(ny['entries'][0]['summary'])
    
    
    # spamTest()
    # RSS-feed classifier and high-frequency word removal
    # count word frequencies and return the most frequent words (top 20 here)
    def calcMostFreq(vocabList, fullText):
        import operator
        freqDict = {}
        for token in vocabList:
            freqDict[token] = fullText.count(token)
        sortedFreq = sorted(freqDict.items(), key=operator.itemgetter(1), reverse=True)
        return sortedFreq[:20]
    
    # load stop words from a file
    def load_the_remov_words():
        rmwords = []
        # read in text mode so the stop words are strings, matching the vocabulary entries
        with open('removeWords.txt', 'r') as fr:
            for line in fr.readlines():
                strline = line.strip()
                rmwords.append(strline)
        return rmwords
    
    
    # load data from two RSS feeds, train the classifier and report the error rate
    def localWords(feed1, feed0):
        import feedparser
        # list of documents, each a list of words
        docList = []
        # class labels
        classList = []
        # all words from all documents, duplicates kept
        fullText = []
    
        # use the shorter feed so both classes contribute the same number of entries
        minLen = min(len(feed1['entries']), len(feed0['entries']))
        for i in range(minLen):
            # split the entry summary into a list of words
            wordList = textParse(feed1['entries'][i]['summary'])
            # add this document's word list
            docList.append(wordList)
            # keep all words in one flat list
            fullText.extend(wordList)
            # record the class label
            classList.append(1)
            # same for the second feed
            wordList = textParse(feed0['entries'][i]['summary'])
            docList.append(wordList)
            fullText.extend(wordList)
            classList.append(0)
        # build the vocabulary of unique words across all documents
        vocabList = createVocabList(docList)
        # get the most frequent words (top 20 here) as a list of (word, count) tuples
        top30Words = calcMostFreq(vocabList, fullText)
        # remove those high-frequency words from the vocabulary
        for pairW in top30Words:
            if pairW[0] in vocabList:
                vocabList.remove(pairW[0])
    
        # remove stop words
        rmwords = load_the_remov_words()
        for pairW in rmwords:
            if pairW in vocabList:
                vocabList.remove(pairW)
    
        # indices of all 2*minLen documents
        trainingSet = list(range(2 * minLen))
        # test set indices
        testSet = []
        # randomly pick 20 documents as the test set
        for i in range(20):
            # pick a random index from the remaining training indices
            randIndex = int(np.random.uniform(0, len(trainingSet)))
            # add it to the test set
            testSet.append(trainingSet[randIndex])
            # and remove it from the training set
            del (trainingSet[randIndex])
    
        trainMat = []
        trainClasses = []
        for docIndex in trainingSet:
            # bag-of-words vector for each training document
            trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
            # and its class label
            trainClasses.append(classList[docIndex])
    
        p0V, p1V, pSpam = trainB0(np.array(trainMat), np.array(trainClasses))
        errorCount = 0
        for docIndex in testSet:
            wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
            if classifyNB(np.array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
                errorCount += 1
        print('the error rate is:', float(errorCount) / len(testSet))
        return vocabList, p0V, p1V
    
    
    # print, in descending probability order, the words whose log probability exceeds a threshold
    def getTopWords(ny, sf):
        vocabList, p0V, p1V = localWords(ny, sf)
        topNY = []
        topSF = []
        for i in range(len(p0V)):
            if p0V[i] > -6.0:
                topSF.append((vocabList[i], p0V[i]))
            if p1V[i] > -6.0:
                topNY.append((vocabList[i], p1V[i]))
        sortedSF = sorted(topSF, key=lambda pair: pair[1], reverse=True)
        print("SF**" * 14)
        for item in sortedSF:
            print(item[0])
        sortedNF = sorted(topNY, key=lambda pair: pair[1], reverse=True)
        print("NF**" * 14)
        for item in sortedNF:
            print(item[0])
    
    
    ny = feedparser.parse('http://newyork.craigslist.org/stp/index.rss')
    sf = feedparser.parse('http://sfbay.craigslist.org/stp/index.rss')
    getTopWords(ny, sf)
    # vocabList, pSF, pNY = localWords(ny, sf)
    # vocabList, pSF, pNY = localWords(ny, sf)
    # print(vocabList)
    # print("-" * 20)
    # print(pSF)
    # print('------------')
    # print(pNY)

     Stop-words file (removeWords.txt, read by load_the_remov_words above)

         
