zoukankan      html  css  js  c++  java
  • naiveBayes_python

    #coding:utf-8
    from numpy import *
    import re
    def createlist(lst):
        """Collect every distinct token from all documents into one vocabulary list.

        lst is an iterable of token sequences (one per document). The result
        contains each token exactly once; its order follows set iteration
        order and is therefore unspecified.
        """
        vocabulary = set()
        for document in lst:
            # Union with the tokens of this document; duplicates collapse.
            vocabulary = vocabulary | set(document)
        return list(vocabulary)
    def word2vec(List,inputset):
        """Turn one tokenized document into a word-count vector over the vocabulary.

        List is the vocabulary (a sequence of hashable tokens); inputset is the
        document's tokens. Returns a list of len(List) integers where entry i
        is the number of times List[i] occurs in inputset. Tokens that are not
        in the vocabulary are ignored.
        """
        # Build the word -> position table once so each token costs a single
        # hash lookup instead of two linear scans (`in` + `.index`) of List.
        # setdefault keeps the FIRST position of a repeated vocabulary entry,
        # matching what List.index() returned.
        position = {}
        for idx, word in enumerate(List):
            position.setdefault(word, idx)
        vec = [0] * len(List)
        for word in inputset:
            idx = position.get(word)
            if idx is not None:
                vec[idx] += 1
        return vec
    def singprob(trainmatrix,label):
        """Estimate per-class word log-probabilities and the class-1 prior.

        trainmatrix holds one word-count vector per training document and
        label holds the matching class labels (1 for class 1, anything else
        is treated as class 0). Returns (p1vect, p0vect, pb1) where p1vect and
        p0vect are arrays of log p(word | class) and pb1 is the fraction of
        documents labelled 1.
        """
        doc_count = len(trainmatrix)
        vocab_size = len(trainmatrix[0])
        prior1 = sum(label) / float(doc_count)
        # Index 0 accumulates class 0, index 1 accumulates class 1.
        # Word counts start at 1 and totals at 2 so that a word unseen in a
        # class never produces log(0).
        word_counts = [ones(vocab_size), ones(vocab_size)]
        totals = [2, 2]
        for i, row in enumerate(trainmatrix):
            cls = 1 if label[i] == 1 else 0
            word_counts[cls] += row
            totals[cls] += sum(row)
        log_like1 = log(word_counts[1] / totals[1])
        log_like0 = log(word_counts[0] / totals[0])
        return log_like1, log_like0, prior1
    def classifier(vect,p1vect,p0vect,pb1):
        """Pick the more likely class for a word-count vector.

        Compares the two log scores
            sum(vect * log p(w|c)) + log p(c)
        for c = 1 (prior pb1) and c = 0 (prior 1 - pb1). Returns 1 when the
        class-1 score is strictly larger, otherwise 0 (ties go to class 0).
        """
        score1 = sum(vect * p1vect) + log(pb1)
        score0 = sum(vect * p0vect) + log(1 - pb1)
        return 1 if score1 > score0 else 0
    def testparse(str):
        reg=re.compile('W*')
        line=reg.split(str)
        List=[tt.lower for tt in line if len(tt)>2]
        return List
    def Test():
        """Run one random hold-out evaluation of the spam classifier.

        Reads email/spam/1.txt .. 25.txt (label 1) and email/ham/1.txt ..
        25.txt (label 0), holds out 10 randomly chosen documents as the test
        set, trains on the remaining 40 and prints the fraction of held-out
        documents that were misclassified.

        `random` here is numpy.random, brought into scope by the file's
        `from numpy import *`.
        """
        doc=[]
        label=[]
        for i in range(1,26):
            # Same interleaved order as before: spam i, then ham i.
            for pattern, cls in (("email/spam/%d.txt", 1), ("email/ham/%d.txt", 0)):
                # `with` closes the file handle instead of leaking it.
                with open(pattern % i) as handle:
                    doc.append(testparse(handle.read()))
                label.append(cls)
        doclist=createlist(doc)
        # list(...) so that entries can be deleted on Python 3 as well.
        trainingset=list(range(50))
        testset=[]
        for _ in range(10):
            index=int(random.uniform(0,len(trainingset)))
            # Store the document id found at that position, not the position
            # itself; otherwise held-out documents can repeat and can also
            # remain in the training set.
            testset.append(trainingset[index])
            del trainingset[index]
        trainmat=[]
        classlabel=[]
        for docindex in trainingset:
            trainmat.append(word2vec(doclist,doc[docindex]))
            classlabel.append(label[docindex])
        p1,p0,pb=singprob(trainmat,classlabel)
        error=0
        for testindex in testset:
            wordvect=word2vec(doclist,doc[testindex])
            if classifier(wordvect,p1,p0,pb)!=label[testindex]:
                error+=1
        # Parenthesised single-argument print works on Python 2 and 3.
        print(error/float(len(testset)))
    # Module-level call: runs one hold-out evaluation whenever this file is
    # executed or imported.
    Test()
  • 相关阅读:
    poj 2386 Lake Counting
    hdu 3998 Sequence
    hdu 1556 Color the ball
    synchronized和ReentrantLock的区别
    4种常用线程池
    java深浅拷贝
    ConcurrentHashMap总结
    List原理
    volatile关键字
    java关键字总结
  • 原文地址:https://www.cnblogs.com/semen/p/6962929.html
Copyright © 2011-2022 走看看