zoukankan      html  css  js  c++  java
  • naiveBayes_python

    #coding:utf-8
    from numpy import *
    import re
    def createlist(lst):
        """Build the vocabulary: a list of every unique token seen across
        all documents in lst (a list of token lists)."""
        vocabulary = set()
        for document in lst:
            vocabulary.update(document)
        return list(vocabulary)
    def word2vec(List, inputset):
        """Turn one document (inputset, a token list) into a bag-of-words
        count vector aligned with the vocabulary List; unknown tokens are
        silently skipped."""
        vec = [0] * len(List)
        for token in inputset:
            try:
                vec[List.index(token)] += 1
            except ValueError:
                # token not in the vocabulary -> contribute nothing
                pass
        return vec
    def singprob(trainmatrix, label):
        """Train the naive-Bayes model.

        Given a matrix of word-count vectors and their 0/1 labels, return
        (log P(w|c=1), log P(w|c=0), P(c=1)) with Laplace smoothing: word
        counts start at 1 and class totals at 2 so no probability is zero.
        """
        doc_count = len(trainmatrix)
        vocab_size = len(trainmatrix[0])
        pb1 = sum(label) / float(doc_count)  # prior P(class = 1)
        # Smoothed per-word counts and per-class totals.
        p1_counts = ones(vocab_size)
        p0_counts = ones(vocab_size)
        p1_total = 2
        p0_total = 2
        for row, cls in zip(trainmatrix, label):
            if cls == 1:
                p1_counts += row
                p1_total += sum(row)
            else:
                p0_counts += row
                p0_total += sum(row)
        # Log-space to avoid underflow when many small probabilities multiply.
        return log(p1_counts / p1_total), log(p0_counts / p0_total), pb1
    def classifier(vect, p1vect, p0vect, pb1):
        """Classify one count vector: compare the two log-posteriors
        sum_i n_i*log P(w_i|c) + log P(c) and return 1 iff class 1 scores
        strictly higher (ties fall to class 0, matching the original)."""
        score1 = sum(vect * p1vect) + log(pb1)
        score0 = sum(vect * p0vect) + log(1 - pb1)
        return 1 if score1 > score0 else 0
    def testparse(str):
        reg=re.compile('W*')
        line=reg.split(str)
        List=[tt.lower for tt in line if len(tt)>2]
        return List
    def Test():
        """Hold-out evaluation of the spam filter.

        Loads 25 spam (label 1) and 25 ham (label 0) emails from
        email/spam/ and email/ham/, holds out 10 random documents, trains
        on the remaining 40, and prints the error rate on the held-out set.

        Bug fixes versus the original:
          * testset stored the *position* `index` into the shrinking
            trainingset list instead of the document id trainingset[index],
            so test documents could also be trained on;
          * range(50) is not deletable on Python 3 — wrap in list();
          * print statement -> print() call (works on Python 2 and 3);
          * files are now closed via `with` instead of leaking handles.
        """
        doc = []
        label = []
        fulltext = []
        for i in range(1, 26):
            # Spam examples -> label 1.
            with open("email/spam/%d.txt" % i) as fh:
                wordlist = testparse(fh.read())
            doc.append(wordlist)
            fulltext.extend(wordlist)
            label.append(1)
            # Ham examples -> label 0.
            with open("email/ham/%d.txt" % i) as fh:
                wordlist = testparse(fh.read())
            doc.append(wordlist)
            fulltext.extend(wordlist)
            label.append(0)
        doclist = createlist(doc)
        # Randomly hold out 10 document ids for testing.
        trainingset = list(range(50))
        testset = []
        for i in range(10):
            index = int(random.uniform(0, len(trainingset)))
            # Store the document id being removed, not the list position.
            testset.append(trainingset[index])
            del trainingset[index]
        trainmat = []
        classlabel = []
        for docindex in trainingset:
            trainmat.append(word2vec(doclist, doc[docindex]))
            classlabel.append(label[docindex])
        p1, p0, pb = singprob(trainmat, classlabel)
        error = 0
        for testindex in testset:
            wordvect = word2vec(doclist, doc[testindex])
            if classifier(wordvect, p1, p0, pb) != label[testindex]:
                error += 1
        print(error / float(len(testset)))
    Test()
  • 相关阅读:
    Kafka 生产者 自定义分区策略
    同步互斥
    poj 1562 Oil Deposits(dfs)
    poj 2386 Lake Counting(dfs)
    poj 1915 KnightMoves(bfs)
    poj 1664 放苹果(dfs)
    poj 1543 Perfect Cubes (暴搜)
    poj 1166 The Clocks (暴搜)
    poj 3126 Prime Path(bfs)
    处理机调度
  • 原文地址:https://www.cnblogs.com/semen/p/6962929.html
Copyright © 2011-2022 走看看