zoukankan      html  css  js  c++  java
  • 吴裕雄--天生自然python机器学习:使用朴素贝叶斯过滤垃圾邮件

    使用朴素贝叶斯解决一些现实生活中的问题时，需要先从文本内容得到字符串列表，然后生成词向量。

    准备数据:切分文本

    测试算法:使用朴素贝叶斯进行交叉验证

    文件解析及完整的垃圾邮件测试函数

    def createVocabList(dataSet):
        """Return a list of the distinct items found across all documents.

        dataSet is an iterable of documents, each itself an iterable of
        hashable items. The order of the returned list follows set iteration
        order and is therefore unspecified.
        """
        vocabulary = set()
        for doc in dataSet:
            # Merge this document's distinct items into the running vocabulary.
            vocabulary = vocabulary.union(set(doc))
        return list(vocabulary)
    
    def setOfWords2Vec(vocabList, inputSet):
        """Build a 0/1 presence vector for inputSet over vocabList.

        Position i of the result is 1 when vocabList[i] occurs in inputSet,
        otherwise 0. Words that are not in vocabList are reported with a
        printed message and otherwise ignored.
        """
        presence = [0 for _ in vocabList]
        for token in inputSet:
            if token not in vocabList:
                print("the word: %s is not in my Vocabulary!" % token)
                continue
            position = vocabList.index(token)
            presence[position] = 1
        return presence
    
    def bagOfWords2VecMN(vocabList, inputSet):
        """Build an occurrence-count vector for inputSet over vocabList.

        Position i of the result holds how many times vocabList[i] appears in
        inputSet. Words absent from vocabList are silently skipped.
        """
        counts = [0] * len(vocabList)
        # Lazily keep only the words that have a slot in the vocabulary.
        known = (w for w in inputSet if w in vocabList)
        for w in known:
            counts[vocabList.index(w)] += 1
        return counts
    
    def textParse(bigString):
        """Split a text into lowercase word tokens longer than two characters.

        The text is split on runs of non-word characters (the regex \\W+), so
        punctuation and whitespace act as separators. Tokens of length 0-2 are
        discarded and the rest are lowercased.

        bigString: the text to tokenize.
        Returns a list of lowercase tokens in order of appearance.
        """
        import re
        # r'\W+' (one or more non-word chars) avoids zero-width matches, which
        # would make re.split break the text between every single character.
        tokens = re.split(r'\W+', bigString)
        return [tok.lower() for tok in tokens if len(tok) > 2]
    
    def trainNB0(trainMatrix,trainCategory):
        """Estimate per-class log word probabilities and the class-1 prior.

        trainMatrix holds one count/presence vector per training document and
        trainCategory holds the matching labels (a label equal to 1 selects
        class 1; anything else is treated as class 0).

        Each class starts with a count of one per word and a denominator of
        2.0, so no word ends up with zero probability.

        Returns (p0Vect, p1Vect, pAbusive): the element-wise log of the
        smoothed word frequencies for class 0 and class 1, and the fraction
        sum(trainCategory) / number of documents.

        NOTE(review): ones, log and sum are presumably numpy's, via a
        star-import outside this block - confirm.
        """
        docCount = len(trainMatrix)
        vocabSize = len(trainMatrix[0])
        priorAbusive = sum(trainCategory)/float(docCount)
        # Index 0 accumulates class 0, index 1 accumulates class 1.
        wordCounts = [ones(vocabSize), ones(vocabSize)]
        totals = [2.0, 2.0]
        for idx in range(docCount):
            label = 1 if trainCategory[idx] == 1 else 0
            wordCounts[label] += trainMatrix[idx]
            totals[label] += sum(trainMatrix[idx])
        # Logs keep later products of many small probabilities from underflowing.
        p1Log = log(wordCounts[1]/totals[1])
        p0Log = log(wordCounts[0]/totals[0])
        return p0Log,p1Log,priorAbusive
    
    def aloneIndex(datasetLen):
        """Pick up to 10 distinct random indices for a held-out test set.

        datasetLen: either the number of items (an int) or a sized collection
            whose length gives the number of items; indices are drawn from
            range(that number).
        Returns a list of distinct indices, 10 of them when at least 10 are
        available, otherwise every available index in random order.

        The previous version read an undefined name instead of its argument,
        could return repeated indices, and never terminated when fewer than
        10 indices existed.
        """
        import random
        if isinstance(datasetLen, int):
            population = datasetLen
        else:
            population = len(datasetLen)
        # random.sample draws without replacement, so indices never repeat.
        return random.sample(range(population), min(10, population))
    
    def spamTest():
        """Run one hold-out evaluation of the naive Bayes spam classifier.

        Reads files 1.txt-25.txt from the spam and ham directories (spam is
        labelled 1, ham 0), builds a vocabulary, holds out the indices chosen
        by aloneIndex as a test set, trains on the remaining documents with
        trainNB0, classifies the held-out documents with classifyNB and
        prints each misclassified document and the resulting error rate.

        Relies on array and classifyNB, which are not defined in this block
        and must be provided elsewhere in the module.
        """
        docList = []
        classList = []
        # Raw strings keep the Windows backslashes literal instead of relying
        # on invalid escape sequences; the resulting paths are unchanged.
        sources = (
            (r'F:\machinelearninginaction\Ch04\email\spam\%d.txt', 1),
            (r'F:\machinelearninginaction\Ch04\email\ham\%d.txt', 0),
        )
        for i in range(1, 26):
            for pathPattern, label in sources:
                # The with-block closes each file as soon as it is read.
                with open(pathPattern % i) as mailFile:
                    wordList = textParse(mailFile.read())
                docList.append(wordList)
                classList.append(label)
        vocabList = createVocabList(docList)  # create vocabulary
        trainingSet = range(len(docList))
        # De-duplicate so a document is never scored twice.
        testSet = list(set(aloneIndex(trainingSet)))  # create test set
        heldOut = set(testSet)
        # Train only on documents outside the test set; each index appears
        # once, so held-out mail cannot leak into training.
        trainingSet = [idx for idx in trainingSet if idx not in heldOut]
        trainMat = []
        trainClasses = []
        for docIndex in trainingSet:  # train the classifier (get probs) trainNB0
            trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
            trainClasses.append(classList[docIndex])
        p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))
        errorCount = 0
        for docIndex in testSet:  # classify the held-out items
            wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
            if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
                errorCount += 1
                print("classification error",docList[docIndex])
        print('the error rate is: ',float(errorCount)/len(testSet))
        
    # Run the hold-out spam classification experiment once at module load.
    spamTest()

  • 相关阅读:
    css3——box-sizing属性
    HTML5存储--离线存储
    微信公众号爆出前端安全漏洞
    Js获取宽高度的归纳集锦总结
    Yii 2 修改 URL 模式为 PATH 模式,并隐藏index.php
    SQL 查询优化 索引优化
    linux提示语言包
    安装linux工作环境
    linux常用命令
    PHP解决抢购、秒杀、抢楼、抽奖等阻塞式高并发库存防控超量的思路方法
  • 原文地址:https://www.cnblogs.com/tszr/p/12041716.html
Copyright © 2011-2022 走看看