  • Worked examples of the naive Bayes algorithm

    Applications of Bayesian classification

    • Filtering spam email

      The best-known application of Bayesian classifiers is spam filtering. For a more detailed account, see the corresponding chapters of Hackers & Painters or The Beauty of Mathematics (数学之美); for the basic implementation of naive Bayes, see here.
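
      As a one-line refresher, the classifier scores a message $w = (w_1, \dots, w_n)$ against each class and picks the larger posterior; under the naive conditional-independence assumption,

          \hat{c} = \arg\max_{c \in \{\mathrm{spam},\,\mathrm{ham}\}} P(c) \prod_{i=1}^{n} P(w_i \mid c)

      and in practice one compares $\log P(c) + \sum_i \log P(w_i \mid c)$ instead, to avoid floating-point underflow; that is exactly what the code below does.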

      Dataset

      Two folders, one of ham (normal) email and one of spam, with 25 messages each.

      Testing method

      10 of the 50 messages are selected at random as test data; the remaining 40 are used for training.
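
      The listing below builds this split by repeatedly deleting a random index from the training list; an equivalent sketch using only the standard library:

      import random

      testSet = random.sample(range(50), 10)                    # 10 held-out message indices
      trainingSet = [i for i in range(50) if i not in testSet]  # the remaining 40 train the model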

      Implementation details


      1. First the raw text has to be turned into word vectors, which takes a little regular-expression work (see the tokenization sketch below).
      2. Because a random hold-out split is used for cross-validation, the results differ slightly from run to run.
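
      A minimal sketch of what the tokenizer below does (the sample string is made up): split on runs of non-word characters, lower-case everything, and drop tokens shorter than three characters.

      import re

      sample = 'Hi Peter, M.L. is fun! Visit http://example.com'
      tokens = [tok.lower() for tok in re.split(r'\W+', sample) if len(tok) > 2]
      print(tokens)  # ['peter', 'fun', 'visit', 'http', 'example', 'com']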
      #coding=utf-8
      from numpy import *

      # parse a long string into a list of lowercase tokens, dropping short ones
      def textParse(bigString):
          import re
          # split on runs of non-word characters
          # (fixes the original r'W*', which splits on the letter W)
          listOfTokens = re.split(r'\W+', bigString)
          return [tok.lower() for tok in listOfTokens if len(tok) > 2]

      # build a vocabulary list containing every word that appears in the data set
      def createVocabList(dataSet):
          vocabSet = set([])
          for document in dataSet:
              vocabSet = vocabSet | set(document)
          return list(vocabSet)

      # set-of-words model: 1 if the word occurs in the document, 0 otherwise
      def setOfWords2Vec(vocabList, inputSet):
          retVocabList = [0] * len(vocabList)
          for word in inputSet:
              if word in vocabList:
                  retVocabList[vocabList.index(word)] = 1
              else:
                  print('word', word, 'not in vocabulary')
          return retVocabList

      # alternative bag-of-words model: count occurrences instead of presence
      def bagOfWords2VecMN(vocabList, inputSet):
          returnVec = [0]*len(vocabList)
          for word in inputSet:
              if word in vocabList:
                  returnVec[vocabList.index(word)] += 1
          return returnVec

      def trainNB0(trainMatrix, trainCategory):
          numTrainDoc = len(trainMatrix)
          numWords = len(trainMatrix[0])
          pAbusive = sum(trainCategory)/float(numTrainDoc)
          # initialize counts to 1 and denominators to 2 (Laplace smoothing),
          # so no single zero probability wipes out the whole product
          p0Num = ones(numWords)
          p1Num = ones(numWords)
          p0Denom = 2.0
          p1Denom = 2.0
          for i in range(numTrainDoc):
              if trainCategory[i] == 1:
                  p1Num += trainMatrix[i]
                  p1Denom += sum(trainMatrix[i])
              else:
                  p0Num += trainMatrix[i]
                  p0Denom += sum(trainMatrix[i])
          # take logs for numerical stability; the raw product would underflow to zero
          p1Vect = log(p1Num/p1Denom)
          p0Vect = log(p0Num/p0Denom)
          return p0Vect, p1Vect, pAbusive

      def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
          # element-wise multiply, then sum: log P(c) + sum_i log P(w_i | c)
          p1 = sum(vec2Classify * p1Vec) + log(pClass1)
          p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
          if p1 > p0:
              return 1
          else:
              return 0

      def spamTest(spamFolder, hamFolder):
          docList = []
          classList = []
          fullText = []
          for i in range(1, 26):
              wordList = textParse(open(spamFolder+str(i)+'.txt').read())
              docList.append(wordList)
              fullText.extend(wordList)
              classList.append(1)
              wordList = textParse(open(hamFolder+str(i)+'.txt').read())
              docList.append(wordList)
              fullText.extend(wordList)
              classList.append(0)
          vocabList = createVocabList(docList)
          trainingSet = list(range(50))
          testSet = []
          # hold out 10 random messages for testing
          for i in range(10):
              randIndex = int(random.uniform(0, len(trainingSet)))
              testSet.append(trainingSet[randIndex])
              del(trainingSet[randIndex])
          trainMat = []
          trainClasses = []
          for docIndex in trainingSet:
              trainMat.append(setOfWords2Vec(vocabList, docList[docIndex]))
              #trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
              trainClasses.append(classList[docIndex])
          p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))
          errorCount = 0
          for docIndex in testSet:        # classify the held-out messages
              #wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
              wordVector = setOfWords2Vec(vocabList, docList[docIndex])
              if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
                  errorCount += 1
                  print('classification error', docList[docIndex])
          errorRate = float(errorCount)/len(testSet)
          print('the error rate is:', errorRate)
          return errorRate  # return the rate so callers can average over runs

      def main():
          spamTest('email/spam/', 'email/ham/')

      if __name__ == '__main__':
          main()
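
      Because the split is random, a single run's error rate is noisy. A quick way to get a steadier estimate (a sketch, relying on spamTest returning its error rate as in the listing above) is to average several runs:

      numRuns = 10
      total = 0.0
      for _ in range(numRuns):
          total += spamTest('email/spam/', 'email/ham/')
      print('average error rate over', numRuns, 'runs:', total/numRuns)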
     
    • Inferring regional tendencies from personal ads

      This example pulls posts from two regional boards of a classifieds site and asks whether their word usage shows regional patterns.

      Dataset

      The data are fetched over RSS with Python's feedparser package (see here for an introduction). Posts are pulled from two regional boards of the same site.
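
      A minimal sketch of what the fetch looks like (the feed URL is a placeholder; the 'summary' field is assumed to be present, as it is for typical RSS items):

      import feedparser

      feed = feedparser.parse('http://example.com/index.rss')
      print(len(feed['entries']))           # number of posts fetched
      print(feed['entries'][0]['summary'])  # raw text of the first post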

      Testing method

      Cross-validation, with the same random hold-out scheme as above.

      Implementation details


      1. Two kinds of words need special handling (in practice the two sets overlap heavily): the highest-frequency words, and the so-called stop words, i.e. words that occur very often but carry little meaning; stop-word lists for many languages can be found here. Removing them lets the results better reflect regional differences.
      2. The getTopWords function just summarizes the most probable features per class; it is not essential for learning naive Bayes.
      3. Apart from the data source, the implementation closely mirrors the spam example above.
      #coding=utf-8
      from numpy import *

      # parse a long string into a list of lowercase tokens, dropping short ones
      def textParse(bigString):
          import re
          # split on runs of non-word characters
          # (fixes the original r'W*', which splits on the letter W)
          listOfTokens = re.split(r'\W+', bigString)
          return [tok.lower() for tok in listOfTokens if len(tok) > 2]

      # build a vocabulary list containing every word that appears in the data set
      def createVocabList(dataSet):
          vocabSet = set([])
          for document in dataSet:
              vocabSet = vocabSet | set(document)
          return list(vocabSet)

      # set-of-words model: 1 if the word occurs in the document, 0 otherwise
      def setOfWords2Vec(vocabList, inputSet):
          retVocabList = [0] * len(vocabList)
          for word in inputSet:
              if word in vocabList:
                  retVocabList[vocabList.index(word)] = 1
              else:
                  print('word', word, 'not in vocabulary')
          return retVocabList

      # alternative bag-of-words model: count occurrences instead of presence
      def bagOfWords2VecMN(vocabList, inputSet):
          returnVec = [0]*len(vocabList)
          for word in inputSet:
              if word in vocabList:
                  returnVec[vocabList.index(word)] += 1
          return returnVec

      def trainNB0(trainMatrix, trainCategory):
          numTrainDoc = len(trainMatrix)
          numWords = len(trainMatrix[0])
          pAbusive = sum(trainCategory)/float(numTrainDoc)
          # initialize counts to 1 and denominators to 2 (Laplace smoothing),
          # so no single zero probability wipes out the whole product
          p0Num = ones(numWords)
          p1Num = ones(numWords)
          p0Denom = 2.0
          p1Denom = 2.0
          for i in range(numTrainDoc):
              if trainCategory[i] == 1:
                  p1Num += trainMatrix[i]
                  p1Denom += sum(trainMatrix[i])
              else:
                  p0Num += trainMatrix[i]
                  p0Denom += sum(trainMatrix[i])
          # take logs for numerical stability; the raw product would underflow to zero
          p1Vect = log(p1Num/p1Denom)
          p0Vect = log(p0Num/p0Denom)
          return p0Vect, p1Vect, pAbusive

      def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
          # element-wise multiply, then sum: log P(c) + sum_i log P(w_i | c)
          p1 = sum(vec2Classify * p1Vec) + log(pClass1)
          p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
          if p1 > p0:
              return 1
          else:
              return 0

      # read one stop word per line from a local file
      def stopWords():
          stopW = []
          f = open('stopwords.txt').readlines()
          for eachLine in f:
              stopW.append(eachLine[:-1])
          return stopW

      # return the 30 most frequent vocabulary words as (word, count) pairs
      def calcMostFreq(vocabList, fullText):
          import operator
          freqDict = {}
          for token in vocabList:
              freqDict[token] = fullText.count(token)
          sortedFreq = sorted(freqDict.items(), key=operator.itemgetter(1), reverse=True)
          return sortedFreq[:30]

      def localWords(rss1, rss0):
          import feedparser
          feed1 = feedparser.parse(rss1)
          feed0 = feedparser.parse(rss0)
          docList = []; classList = []; fullText = []
          minLen = min(len(feed1['entries']), len(feed0['entries']))
          for i in range(minLen):
              wordList = textParse(feed1['entries'][i]['summary'])
              docList.append(wordList)
              fullText.extend(wordList)
              classList.append(1)  # NY is class 1
              wordList = textParse(feed0['entries'][i]['summary'])
              docList.append(wordList)
              fullText.extend(wordList)
              classList.append(0)
          vocabList = createVocabList(docList)  # create vocabulary
          top30Words = calcMostFreq(vocabList, fullText)  # drop the 30 most frequent words
          for pairW in top30Words:
              if pairW[0] in vocabList: vocabList.remove(pairW[0])
          # drop stop words; stopW holds plain strings, so index with the word itself
          # (the original indexed pairW[0], which is just the first character)
          stopW = stopWords()
          for word in stopW:
              if word in vocabList:
                  vocabList.remove(word)
          trainingSet = list(range(2*minLen)); testSet = []  # create test set
          for i in range(20):
              randIndex = int(random.uniform(0, len(trainingSet)))
              testSet.append(trainingSet[randIndex])
              del(trainingSet[randIndex])
          trainMat = []; trainClasses = []
          for docIndex in trainingSet:  # train the classifier (get probs) with trainNB0
              trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
              trainClasses.append(classList[docIndex])
          p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))
          errorCount = 0
          for docIndex in testSet:  # classify the held-out items
              wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
              if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
                  errorCount += 1
          print('the error rate is:', float(errorCount)/len(testSet))
          return vocabList, p0V, p1V

      # list the words whose per-class log probability exceeds a threshold;
      # a feature summary only, not needed for the classifier itself
      def getTopWords(ny, sf):
          vocabList, p0V, p1V = localWords(ny, sf)
          topNY = []; topSF = []
          for i in range(len(p0V)):
              if p0V[i] > -6.0: topSF.append((vocabList[i], p0V[i]))
              if p1V[i] > -6.0: topNY.append((vocabList[i], p1V[i]))
          sortedSF = sorted(topSF, key=lambda pair: pair[1], reverse=True)
          print("SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**")
          for item in sortedSF:
              print(item[0])
          sortedNY = sorted(topNY, key=lambda pair: pair[1], reverse=True)
          print("NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**")
          for item in sortedNY:
              print(item[0])

      def main():
          #print(stopWords())
          localWords('http://newyork.craigslist.org/stp/index.rss',
                     'http://sfbay.craigslist.org/stp/index.rss')

      if __name__ == '__main__':
          main()
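
      To print the most indicative words per region instead of just the error rate, call getTopWords on the same two feeds (the -6.0 cutoff in the listing is just a heuristic log-probability threshold); as with the spam test, the random split means the word lists vary from run to run:

      getTopWords('http://newyork.craigslist.org/stp/index.rss',
                  'http://sfbay.craigslist.org/stp/index.rss')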

      Index of machine learning notes


