Introduction
We train a classifier on personal ads from two different cities and then observe how it performs. The goal is not to use the classifier for actual classification, but to inspect the words and their conditional probability values in order to discover content specific to each city.
import random
import operator
from numpy import array

def calcMostFreq(vocabList, fullText):
    """Count how often each vocabulary word occurs in fullText and
    return the 30 most frequent (word, count) pairs."""
    freqDict = {}
    for token in vocabList:
        freqDict[token] = fullText.count(token)
    sortedFreq = sorted(freqDict.items(), key=operator.itemgetter(1), reverse=True)
    return sortedFreq[:30]

def localWords(feed1, feed0):
    docList = []; classList = []; fullText = []
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):
        wordList = textParse(feed1['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)            # NY is class 1
        wordList = textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)            # SF is class 0
    vocabList = createVocabList(docList)            # create vocabulary
    top30Words = calcMostFreq(vocabList, fullText)  # remove the 30 most frequent words
    for pairW in top30Words:
        if pairW[0] in vocabList:
            vocabList.remove(pairW[0])
    trainingSet = list(range(2 * minLen)); testSet = []  # hold out 20 docs as a test set
    for i in range(20):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]
    trainMat = []; trainClasses = []
    for docIndex in trainingSet:       # train the classifier (get probs) with trainNB0
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))  # pSpam here is P(class 1)
    errorCount = 0
    for docIndex in testSet:           # classify the held-out items
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is:', float(errorCount) / len(testSet))
    return vocabList, p0V, p1V
The first function, calcMostFreq, iterates over every word in the vocabulary, counts how many times it appears in the full text, sorts the dictionary by frequency in descending order, and returns the 30 most frequent words.
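As a quick sanity check, here is a toy run of calcMostFreq with hypothetical tokens (on Python 3.7+, where dicts keep insertion order, ties stay in vocabulary order):

>>> vocab = ['dog', 'ran', 'the']
>>> full = ['the', 'dog', 'the', 'ran', 'the']
>>> calcMostFreq(vocab, full)
[('the', 3), ('dog', 1), ('ran', 1)]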
The second function, localWords, takes two RSS feeds as arguments. It is almost identical to the earlier spamTest function; the difference is that it reads RSS entries instead of files.
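localWords relies on the textParse helper defined earlier in the chapter for the spam example. A minimal sketch of it, assuming the same behavior (split on non-word characters, lowercase everything, drop very short tokens; \W+ replaces the original \W* so the pattern also behaves correctly on Python 3):

import re

def textParse(bigString):
    # split on runs of non-alphanumeric characters, lowercase everything,
    # and keep only tokens longer than two characters
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]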
The bag-of-words model improves on the set-of-words model for document classification, because it records how often each word appears rather than merely whether it appears.
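The difference is easy to see side by side. A sketch of the two vectorizers, assuming the definitions from earlier in the chapter: setOfWords2Vec records only presence or absence of a word, while bagOfWords2VecMN increments a count on every occurrence, which preserves more information in longer documents:

def setOfWords2Vec(vocabList, inputSet):
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1   # presence/absence only
    return returnVec

def bagOfWords2VecMN(vocabList, inputSet):
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1  # count every occurrence
    return returnVec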
def getTopWords(ny, sf):
    vocabList, p0V, p1V = localWords(ny, sf)
    topNY = []; topSF = []
    for i in range(len(p0V)):
        # p0V and p1V hold log conditional probabilities, hence the -6.0 cutoff
        if p0V[i] > -6.0: topSF.append((vocabList[i], p0V[i]))
        if p1V[i] > -6.0: topNY.append((vocabList[i], p1V[i]))
    sortedSF = sorted(topSF, key=lambda pair: pair[1], reverse=True)
    print("SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**")
    for item in sortedSF:
        print(item[0])
    sortedNY = sorted(topNY, key=lambda pair: pair[1], reverse=True)
    print("NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**")
    for item in sortedNY:
        print(item[0])
>>> import feedparser
>>> import bayes
>>> ny = feedparser.parse('http://newyork.craigslist.org/stp/index.rss')
>>> sf = feedparser.parse('http://sfbay.craigslist.org/stp/index.rss')
>>> vocabList, pSF, pNY = bayes.localWords(ny, sf)
the error rate is: 0.25
>>> vocabList, pSF, pNY = bayes.localWords(ny, sf)
the error rate is: 0.45
>>> vocabList, pSF, pNY = bayes.localWords(ny, sf)
the error rate is: 0.4
>>> vocabList, pSF, pNY = bayes.localWords(ny, sf)
the error rate is: 0.3
>>> vocabList, pSF, pNY = bayes.localWords(ny, sf)
KeyboardInterrupt
>>> vocabList, pSF, pNY = bayes.localWords(ny, sf)
the error rate is: 0.25
>>> bayes.getTopWords(ny, sf)
getTopWords takes the two RSS feeds as input, trains and tests the naive Bayes classifier via localWords, and gets back the conditional probability vectors. It then creates two lists for storage, collects every word whose log conditional probability exceeds the threshold (trainNB0 stores log probabilities, so -6.0 corresponds to roughly exp(-6) ≈ 0.0025), and sorts those words by their conditional probabilities in descending order.
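The fixed -6.0 cutoff is somewhat arbitrary. A minimal alternative sketch (getTopNWords is a hypothetical helper, not from the original code) that ranks words by their log conditional probabilities and takes the top n per city instead:

from numpy import argsort

def getTopNWords(ny, sf, n=10):
    vocabList, p0V, p1V = localWords(ny, sf)
    # argsort returns indices in ascending order, so take the last n
    # (largest log probabilities) and reverse to get descending order
    topSF = [vocabList[i] for i in argsort(p0V)[-n:][::-1]]
    topNY = [vocabList[i] for i in argsort(p1V)[-n:][::-1]]
    print('SF:', topSF)
    print('NY:', topNY)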