zoukankan      html  css  js  c++  java
  • 机器学习(基于概率论的分类方法:朴素贝叶斯)

    概率论是许多机器学习算法的基础,因此本篇会用到一些概率论知识。估计概率的基本做法是:先统计数据集中某个特征取某个特定值的次数,再除以数据集的实例总数,就得到了取该值的概率。

    优点:在数据较少的情况下仍然有效,可以处理多类别问题

    缺点:对输入数据的准备方式比较敏感

    适用于标称型数据

    如果P1(X,Y)>P2(X,Y),那么属于类别1

    如果P2(X,Y)>P1(X,Y),那么属于类别2

    也就是说我们会选择高概率对应的类别。这就是贝叶斯决策理论的核心思想,即选择具有最高概率的决策

    朴素贝叶斯中的“朴素”指的是一个假设:各个特征之间相互独立。

    接下来插入具体代码(注意:下面的代码实现的是 Apriori 频繁项集挖掘与关联规则生成,而不是朴素贝叶斯分类器)

    from numpy import *
    
    def loadDataSet():
        """Return the hard-coded sample data: four transactions of integer items."""
        transactions = [
            [1, 3, 4],
            [2, 3, 5],
            [1, 2, 3, 5],
            [2, 5],
        ]
        return transactions
    
    def createC1(dataSet):
        """Build the candidate 1-itemsets for a collection of transactions.

        Parameters:
            dataSet: iterable of transactions, each an iterable of hashable,
                mutually orderable items.

        Returns:
            A list of single-item frozensets, one per distinct item found in
            dataSet, ordered by item value. frozenset is used so each
            candidate can serve as a dictionary key.
        """
        distinctItems = set()
        for transaction in dataSet:
            distinctItems.update(transaction)
        # Return a real list rather than map(): on Python 3 map() is a
        # one-shot iterator, and the candidates are iterated once per
        # transaction by scanD.
        return [frozenset([item]) for item in sorted(distinctItems)]
    
    def scanD(D, Ck, minSupport):
        """Count candidate itemsets over the transactions and filter by support.

        Parameters:
            D: iterable of transactions (anything accepted by
                frozenset.issubset, e.g. sets).
            Ck: iterable of candidate itemsets (frozensets).
            minSupport: minimum fraction of transactions that must contain a
                candidate for it to be kept.

        Returns:
            (retList, supportData): retList holds the candidates whose
            support is >= minSupport; supportData maps every candidate that
            occurred in at least one transaction to its support, including
            those below the threshold.
        """
        # Materialise both inputs so one-shot iterators (e.g. Python 3 map
        # objects) can be traversed repeatedly and measured with len().
        transactions = list(D)
        candidates = list(Ck)

        ssCnt = {}
        for tid in transactions:
            for can in candidates:
                if can.issubset(tid):
                    # dict.get replaces dict.has_key, which Python 3 removed.
                    ssCnt[can] = ssCnt.get(can, 0) + 1

        numItems = float(len(transactions))
        retList = []
        supportData = {}
        for key in ssCnt:
            support = ssCnt[key] / numItems
            if support >= minSupport:
                retList.append(key)
            supportData[key] = support
        # The original prepended each hit with insert(0, key); appending and
        # reversing once yields the same order without quadratic shifting.
        retList.reverse()
        return retList, supportData
    
    def aprioriGen(Lk, k): #creates Ck
        """Join itemsets of size k-1 into candidate itemsets of size k.

        Two itemsets are merged when their first k-2 items, taken in sorted
        order, are identical.

        Parameters:
            Lk: indexable sequence of frozensets, each of size k-1.
            k: size of the candidates to generate.

        Returns:
            A list with the union of every pair that shares the sorted
            (k-2)-item prefix.
        """
        # Sort BEFORE slicing: set iteration order is arbitrary, so slicing
        # the unsorted list (as the original did) could pick different
        # "first" items for two sets that do share a prefix, and the
        # candidate would be lost.
        prefixes = [sorted(itemset)[:k-2] for itemset in Lk]
        retList = []
        for i, prefix in enumerate(prefixes):
            for j in range(i + 1, len(prefixes)):
                if prefix == prefixes[j]:
                    retList.append(Lk[i] | Lk[j]) #set union
        return retList
    
    def apriori(dataSet, minSupport = 0.5):
        """Find all frequent itemsets in dataSet, level by level.

        Parameters:
            dataSet: sequence of transactions, each an iterable of items.
            minSupport: minimum support (fraction of transactions) for an
                itemset to be kept.

        Returns:
            (L, supportData): L[i] is the list of frequent itemsets of size
            i+1, and the last entry of L is always an empty list (it is what
            ends the loop). supportData maps each counted itemset to its
            support.
        """
        # Both collections are scanned once per level, so they must be real
        # lists; on Python 3 a bare map() would be exhausted after the first
        # pass and has no len().
        C1 = list(createC1(dataSet))
        D = [set(transaction) for transaction in dataSet]
        L1, supportData = scanD(D, C1, minSupport)
        L = [L1]
        k = 2
        while len(L[k-2]) > 0:
            Ck = aprioriGen(L[k-2], k)
            Lk, supK = scanD(D, Ck, minSupport) #scan DB to get Lk
            supportData.update(supK)
            L.append(Lk)
            k += 1
        return L, supportData
    
    def generateRules(L, supportData, minConf=0.7):  #supportData is a dict coming from scanD
        """Generate association rules from frequent itemsets.

        Parameters:
            L: list of lists of frequent itemsets, where L[i] holds the
                itemsets of size i+1 (the first value returned by apriori).
            supportData: dict mapping itemsets to their support.
            minConf: minimum confidence for a rule to be kept.

        Returns:
            A list of (antecedent, consequent, confidence) tuples.
        """
        bigRuleList = []
        for i in range(1, len(L)): #only get the sets with two or more items
            for freqSet in L[i]:
                H1 = [frozenset([item]) for item in freqSet]
                # Always test the single-item consequents first. The original
                # handed H1 straight to rulesFromConseq for itemsets of three
                # or more items, and that function merges consequents before
                # testing them, so rules such as {a, b} --> {c} were never
                # produced.
                H1 = calcConf(freqSet, H1, supportData, bigRuleList, minConf)
                # Larger consequents can only be built when at least two
                # single-item consequents survived.
                if i > 1 and len(H1) > 1:
                    rulesFromConseq(freqSet, H1, supportData, bigRuleList, minConf)
        return bigRuleList
    
    def calcConf(freqSet, H, supportData, brl, minConf=0.7):
        """Evaluate candidate consequents of freqSet and record the good rules.

        For each consequent in H, the confidence of the rule
        (freqSet - consequent) --> consequent is
        support(freqSet) / support(freqSet - consequent).

        Parameters:
            freqSet: the frequent itemset the rules are derived from.
            H: iterable of candidate consequents (frozensets).
            supportData: dict mapping itemsets to support; it must contain
                freqSet and every antecedent, otherwise KeyError is raised.
            brl: list that receives (antecedent, consequent, confidence)
                tuples for each rule kept; mutated in place.
            minConf: minimum confidence for a rule to be kept.

        Returns:
            The list of consequents whose rule met minConf.
        """
        prunedH = [] #create new list to return
        for conseq in H:
            antecedent = freqSet - conseq
            conf = supportData[freqSet] / supportData[antecedent] #calc confidence
            if conf >= minConf:
                # One pre-formatted string prints the same text under both
                # the Python 2 print statement and the Python 3 function.
                print('%s --> %s conf: %s' % (antecedent, conseq, conf))
                brl.append((antecedent, conseq, conf))
                prunedH.append(conseq)
        return prunedH
    
    def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.7):
        """Recursively grow rule consequents for freqSet.

        The consequents in H (all assumed to be the size of H[0]) are merged
        into consequents one item larger, the merged ones are scored with
        calcConf (which appends passing rules to brl), and the survivors are
        fed back in while more than one remains.

        Parameters:
            freqSet: the frequent itemset the rules are derived from.
            H: non-empty list of current consequents (frozensets).
            supportData: dict mapping itemsets to support.
            brl: list of rules, extended in place by calcConf.
            minConf: minimum confidence for a rule to be kept.
        """
        consequentSize = len(H[0])
        # Nothing to do once a larger consequent would leave no antecedent.
        if len(freqSet) <= consequentSize + 1:
            return
        mergedCandidates = aprioriGen(H, consequentSize + 1)
        survivors = calcConf(freqSet, mergedCandidates, supportData, brl, minConf)
        # At least two consequents are needed for another merge step.
        if len(survivors) > 1:
            rulesFromConseq(freqSet, survivors, supportData, brl, minConf)
                
    def pntRules(ruleList, itemMeaning):
        """Print each rule in readable form.

        Parameters:
            ruleList: iterable of (antecedent, consequent, confidence)
                tuples, where antecedent and consequent are iterables of
                item ids.
            itemMeaning: container indexed by item id that gives the label
                to print for each item.

        Every print call takes a single argument so the output is the same
        under the Python 2 print statement and the Python 3 print function.
        """
        for ruleTup in ruleList:
            antecedent, consequent, confidence = ruleTup[0], ruleTup[1], ruleTup[2]
            for item in antecedent:
                print(itemMeaning[item])
            print("           -------->")
            for item in consequent:
                print(itemMeaning[item])
            print("confidence: %f" % confidence)
            print("")       #print a blank line
            
                
    from time import sleep
    from votesmart import votesmart
    # Module-level side effect: sets the key attribute on the votesmart client
    # at import time.
    # NOTE(review): a literal API key is committed in source here. Consider
    # loading it from configuration or the environment, and rotating this one.
    votesmart.apikey = 'a7fa40adec6f4a77178799fae4441030'
    # Placeholder variant for readers who need to supply their own key:
    #votesmart.apikey = 'get your api key first'
    def getActionIds():
        """Collect House vote action ids for the bills in 'recent20bills.txt'.

        Each non-blank line of the file is tab-separated: field 0 is the
        integer bill number and field 1 is the bill title. For every bill,
        the votesmart API is queried and each action with level 'House' and
        stage 'Passage' or 'Amendment Vote' is recorded.

        Returns:
            (actionIdList, billTitleList): parallel lists, one entry per
            matching action, holding the action id and the bill title.

        A failure while fetching or processing one bill is reported on
        stdout and that bill is skipped (best effort).
        """
        actionIdList = []; billTitleList = []
        # The context manager closes the file; the original never did.
        with open('recent20bills.txt') as fr:
            for line in fr:
                if not line.strip():
                    continue  # a blank line would make int() raise outside the try
                billNum = int(line.split('\t')[0])
                try:
                    billDetail = votesmart.votes.getBill(billNum) #api call
                    for action in billDetail.actions:
                        # Parenthesised so the condition can span two lines;
                        # the original's line continuation had been lost,
                        # leaving a syntax error.
                        if (action.level == 'House' and
                                (action.stage == 'Passage' or action.stage == 'Amendment Vote')):
                            actionId = int(action.actionId)
                            print('bill: %d has actionId: %d' % (billNum, actionId))
                            actionIdList.append(actionId)
                            billTitleList.append(line.strip().split('\t')[1])
                except Exception:
                    # Exception rather than a bare except, so Ctrl-C and
                    # SystemExit still propagate.
                    print("problem getting bill %d" % billNum)
                sleep(1)                                      #delay to be polite
        return actionIdList, billTitleList
            
    def getTransList(actionIdList, billTitleList):
        """Build one transaction of item ids per politician from recorded votes.

        Item ids index into itemMeaning: 0 is 'Republican', 1 is
        'Democratic', and each bill title contributes a '<title> -- Nay' and
        a '<title> -- Yea' entry in that order. The vote ids advance by two
        per action id, so the labels only line up when actionIdList and
        billTitleList are parallel (presumably as returned by getActionIds;
        verify against the caller).

        Parameters:
            actionIdList: action ids to fetch votes for through the
                votesmart API.
            billTitleList: bill titles used to build the item labels.

        Returns:
            (transDict, itemMeaning): transDict maps a candidate name to the
            list of item ids for that politician (note: a dict, not a list
            of lists); itemMeaning is the list of labels.

        A failure while fetching or processing one action id is reported on
        stdout and that action is skipped (best effort).
        """
        itemMeaning = ['Republican', 'Democratic'] #list of what each item stands for
        for billTitle in billTitleList: #fill up itemMeaning list
            itemMeaning.append('%s -- Nay' % billTitle)
            itemMeaning.append('%s -- Yea' % billTitle)
        transDict = {} #items in each transaction (politician), keyed by name
        voteCount = 2
        for actionId in actionIdList:
            sleep(3)
            print('getting votes for actionId: %d' % actionId)
            try:
                voteList = votesmart.votes.getBillActionVotes(actionId)
                for vote in voteList:
                    # "not in" replaces dict.has_key, which Python 3 removed.
                    if vote.candidateName not in transDict:
                        transDict[vote.candidateName] = []
                        if vote.officeParties == 'Democratic':
                            transDict[vote.candidateName].append(1)
                        elif vote.officeParties == 'Republican':
                            transDict[vote.candidateName].append(0)
                    if vote.action == 'Nay':
                        transDict[vote.candidateName].append(voteCount)
                    elif vote.action == 'Yea':
                        transDict[vote.candidateName].append(voteCount + 1)
            except Exception:
                # Exception rather than a bare except, so Ctrl-C and
                # SystemExit still propagate.
                print("problem getting actionId: %d" % actionId)
            voteCount += 2
        return transDict, itemMeaning
  • 相关阅读:
    Final TFS 2008 Feature List
    来看看微软对测试是什么要求
    淘宝设计流程
    Disable try catch
    jQuery validate API
    iPhone手机开发平台入门介绍和教程
    VSSpeedster Speed up your VS 2010
    Where are the SDK tools? Where is ildasm?
    效率高的删除语句truncate table [tablename]
    修改Hosts去除各站广告
  • 原文地址:https://www.cnblogs.com/xzm123/p/8981313.html
Copyright © 2011-2022 走看看