zoukankan      html  css  js  c++  java
  • bayes

    from numpy import *
    
    import time
    starttime = time.time()
    
    
    def loadDataSet():
        """Return the small built-in corpus together with its labels.

        Returns:
            A ``(postingList, classVec)`` pair: a list of six tokenised posts
            (each a list of words) and a list of six 0/1 labels in the same
            order as the posts.
        """
        sentences = (
            'my dog has flea problems help please',
            'maybe not take him to dog park stupid',
            'my dalmation is so cute I love him',
            'stop posting stupid worthless garbage',
            'mr licks ate my steak how to stop him',
            'quit buying worthless dog food stupid',
        )
        # Every sentence is single-space separated, so split() yields the tokens.
        posts = [sentence.split() for sentence in sentences]
        # Labels alternate 0, 1, 0, 1, 0, 1 across the six posts.
        labels = [0, 1] * 3
        return posts, labels
    
    def createVocabList(dataSet):
        """Collect every distinct token found in any document of dataSet.

        Args:
            dataSet: an iterable of documents, each an iterable of hashable
                tokens.

        Returns:
            A list containing each distinct token exactly once. The order is
            whatever the underlying set yields and is not otherwise defined.
        """
        seen = set()
        for tokens in dataSet:
            # Merge this document's tokens into the running vocabulary.
            seen = seen | set(tokens)
        return list(seen)
    
    def setOfWords2Vec(vocabList, inputSet):
        """Encode a document as a smoothed set-of-words vector over vocabList.

        Args:
            vocabList: list of vocabulary words; defines the vector's columns.
            inputSet: the words of the document to encode.

        Returns:
            A list with one float per entry of vocabList: 2.0 when the word
            occurs in inputSet and 1.0 when it does not (presence indicator
            plus one).

        Side effects:
            Prints one warning line for every word of inputSet that is not in
            vocabList. The previous version looped over the vocabulary and so
            printed this warning for vocabulary words absent from the document,
            which is the opposite of what the message says.
        """
        present = set(inputSet)
        known = set(vocabList)
        for word in inputSet:
            if word not in known:
                print("the word: %s is not in my Vocabulary!" % word)
        # Building the vector per column (rather than via vocabList.index)
        # also gives duplicated vocabulary entries a proper value.
        return [2.0 if word in present else 1.0 for word in vocabList]
    
    
    
    
    def txt2trainxy(filename1, filename2, root='email'):
        """Load two folders of text files and turn them into training data.

        Every file under ``root/filename1`` and ``root/filename2`` becomes one
        training document labelled with the name of the folder it came from.

        Args:
            filename1: name of the first class folder (also used as its label).
            filename2: name of the second class folder (also used as its label).
            root: directory that contains both class folders; defaults to
                ``'email'``, the location that used to be hard-coded.

        Returns:
            A tuple ``(trainxws, trainxwb, trainy, trainx, fulltext)``:
              * trainxws -- per document, 2.0 for each vocabulary word that
                occurs in it and 1.0 otherwise (set of words, plus one);
              * trainxwb -- per document, occurrence count plus 1.0 for each
                vocabulary word (bag of words, plus one);
              * trainy   -- the label of each document;
              * trainx   -- the token list of each document;
              * fulltext -- the vocabulary: every distinct token, in
                unspecified order, defining the columns of both matrices.
        """
        import os
        import re
        from collections import Counter

        # Split on runs of non-word characters. The pattern used to be
        # r'W*' (backslash missing), which only matches literal 'W' characters.
        splitter = re.compile(r'\W+')

        # step 1: loading data...
        print("step 1: loading data...")
        trainx = []
        trainy = []
        for label in (filename1, filename2):
            folder = os.path.join(root, label)
            for name in os.listdir(folder):
                # NOTE(review): only the first line of each file is tokenised,
                # exactly as before -- confirm whether the whole message was
                # meant. readline() returns '' for an empty file, where
                # readlines()[0] raised IndexError; the handle is now closed.
                with open(os.path.join(folder, name)) as handle:
                    first_line = handle.readline()
                pieces = splitter.split(first_line.lower())
                # Keep only tokens longer than two characters.
                trainx.append([piece for piece in pieces if len(piece) > 2])
                # One label per file; no string join/split round trip, so
                # folder names containing whitespace stay intact.
                trainy.append(label)

        fulltext = list(set(token for tokens in trainx for token in tokens))

        trainxws = []
        trainxwb = []
        for tokens in trainx:
            counts = Counter(tokens)  # built once per document
            # set of words
            trainxws.append([2.0 if word in counts else 1.0 for word in fulltext])
            # bag of words
            trainxwb.append([counts[word] + 1.0 for word in fulltext])

        return trainxws, trainxwb, trainy, trainx, fulltext
    
    def testx2vec(testx, fulltext):
        # set of words
        testxws = [list(set(testx)).count(strg) + 1.0 for strg in fulltext] #
        # bag of words 
        testxwb = [testx.count(strg) + 1.0 for strg in fulltext] #
        for word in testx:
            if word not in fulltext:
                print "the word: %s is not in my fulltext!" % word
        return testxws, testxwb
    
    def bayes(testx, trainx, trainy, fulltext):
        """Fit a naive Bayes model on the training data and classify testx.

        The model is rebuilt on every call; nothing is cached.

        Args:
            testx: feature vector to classify, one number per vocabulary word.
            trainx: list of training feature vectors, all the same length as
                testx.
            trainy: list with the label of each training vector.
            fulltext: vocabulary list; only used to print the highest-weighted
                words of each class.

        Returns:
            The label from trainy whose log prior plus weighted sum of log
            feature probabilities is largest.

        Raises:
            ValueError: if the training set is empty or trainx and trainy
                differ in length.

        Side effects:
            Prints progress lines, the five highest-probability words of every
            class followed by that class's label, the per-class log-likelihood
            sums, the per-class weighted probability sums and the chosen label.
        """
        # Local import so the arithmetic does not depend on a star import
        # shadowing the builtin sum().
        import numpy as np

        total = len(trainy)
        if total == 0 or len(trainx) != total:
            raise ValueError(
                "trainx and trainy must be non-empty and of equal length")

        print("---Getting Prob...")
        # Freeze the class order once so every per-class list below lines up.
        classes = list(set(trainy))
        IDs = [[row for row, label in enumerate(trainy) if label == item]
               for item in classes]
        labels = [trainy[ids[0]] for ids in IDs]
        logproby = np.array([np.log(len(ids) / float(total)) for ids in IDs])

        probx = []
        for ids in IDs:
            rows = np.array([trainx[row] for row in ids])
            # Per-word total of the class divided by the class grand total.
            # NOTE(review): the "+ 2.0" in the denominator is carried over
            # unchanged; confirm it is the intended smoothing term.
            probx.append(rows.sum(axis=0) / float(rows.sum() + 2.0))
        probx = np.array(probx)
        logprobx = np.log(probx)

        print("---Printing Prob...")
        # Report every class; indexing classes 0 and 1 directly used to raise
        # IndexError when the training data held a single class.
        for k, label in enumerate(labels):
            top = (-probx[k]).argsort()[:5]  # argsort is ascending, so negate
            print([fulltext[i] for i in top])
            print(label)

        # step 4: showing the result...
        print("---Showing the result...")
        sumlogpxws = np.sum(logprobx * testx, 1)
        sumlogpxyws = sumlogpxws + logproby
        print(sumlogpxws)
        print(np.sum(probx * testx, 1))
        bestyws = labels[sumlogpxyws.argmax()]
        print("---From set of words:  %s" % (bestyws,))
        return bestyws
        
    
    def main(test_size=20):
        """Run the spam/ham experiment end to end and print the results.

        Loads the 'spam' and 'ham' folders through txt2trainxy, holds out
        test_size randomly chosen documents, classifies each of them with
        bayes() fitted on the remaining documents, prints the error rate, and
        finally classifies one fixed example sentence.

        Args:
            test_size: number of documents to hold out for testing; defaults
                to 20, the value that used to be hard-coded.

        Raises:
            ValueError: if test_size is not between 1 and the number of
                documents minus one, i.e. it would leave either the test set
                or the training set empty.
        """
        import random

        # step 1: loading data...
        trainxws, _, trainy, _, fulltext = txt2trainxy('spam', 'ham')
        print(fulltext)

        # step 2: training...
        # Nothing happens here: bayes() fits the model on each call.
        print("step 2: training...")

        # step 3: testing...
        print("step 3: testing...")
        print("---Preparing testdata...")
        l = len(trainy)
        if not 0 < test_size < l:
            raise ValueError(
                "test_size must be between 1 and %d, got %r" % (l - 1, test_size))
        testid = random.sample(range(l), test_size)
        held_out = set(testid)  # constant-time membership for the split below
        testxxx = [trainxws[i] for i in testid]
        testyyy = [trainy[i] for i in testid]
        testtrainxws = [row for i, row in enumerate(trainxws)
                        if i not in held_out]
        testtrainy = [label for i, label in enumerate(trainy)
                      if i not in held_out]

        print("---Testing now...")
        errorcount = 0
        p = len(testid)
        for vector, expected in zip(testxxx, testyyy):
            if bayes(vector, testtrainxws, testtrainy, fulltext) != expected:
                errorcount += 1
        print(errorcount)
        print(p)
        print("---Errorrate is:  %s" % (errorcount / float(p),))

        # step 4: showing the result
        print("step 4: using...")
        testx = ['love', 'my', 'dalmation']
        print("the testx is:  %s" % (testx,))
        print("---Changing testx into vector...")
        testxws, _ = testx2vec(testx, fulltext)
        bayes(testxws, testtrainxws, testtrainy, fulltext)
    
    # Run the experiment only when executed as a script, so that importing
    # this module no longer reads the data folders and prints results.
    if __name__ == "__main__":
        main()
    
    
    """
    trainx, trainy = loadDataSet()
    fulltext = createVocabList(trainx)
    print fulltext
    print setOfWords2Vec(fulltext, trainx[0])
    trainxws = []
    for t in trainx:
        trainxws.append(setOfWords2Vec(fulltext, t))
    testEntry1 = ['love', 'my', 'dalmation']
    testEntry2 = ['stupid', 'garbage']
    bayes(testEntry1, trainxws, trainy, fulltext)
    
    """
  • 相关阅读:
    Uva 10779 collector's problem
    poj 2728 最优比率树(最小生成树问题)
    LA 3126 二分图匹配 最小路径覆盖
    poj 1149 最大流构图
    Step By Step(Java XML篇)
    Step By Step(Java 输入输出篇)
    Step By Step(Java 集合篇)
    Step By Step(Java 线程篇)
    Step By Step(Java 反射篇)
    Step By Step(Java 国际化篇)
  • 原文地址:https://www.cnblogs.com/monne/p/4249324.html
Copyright © 2011-2022 走看看