  • Calling R from Python to visualize association rules

    First, of course, the R environment variables need to be configured (add the R paths to PATH):

    D:\R-3.5.1\bin\x64;
    D:\R-3.5.1\bin\x64\R.dll;
    D:\R-3.5.1;
    D:\ProgramData\Anaconda3\Lib\site-packages\rpy2;
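
    If editing the system PATH is inconvenient, the same paths can also be set from Python before rpy2 is imported. A minimal sketch, assuming the install locations shown above (adjust them to your own R version and Anaconda path):

    import os

    # Assumed install locations -- change them to match your machine.
    os.environ["R_HOME"] = r"D:\R-3.5.1"
    os.environ["R_USER"] = r"D:\ProgramData\Anaconda3\Lib\site-packages\rpy2"
    os.environ["PATH"] = r"D:\R-3.5.1\bin\x64;" + os.environ["PATH"]

    import rpy2.robjects as robjects  # import rpy2 only after the variables are set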

    Association rules can also be implemented in plain Python (without any package, written by hand as below), but the visualization is quite troublesome.

    #!/usr/bin/env python3
    # -*- coding: utf-8 -*-
    from pandas import read_csv
    
    
    
    def loadDataSet():
        dataset = read_csv("F:/goverment/Aprior/No Number.csv")
        data = dataset.values[:,:]
        Data=[]
        for line in data:
            ls=[]
            for i in line:
                ls.append(i)
            Data.append(ls)
        #print(Data)
        return Data
    
        '''
        return [['a', 'c', 'e'], ['b', 'd'], ['b', 'c'], ['a', 'b', 'c', 'd'], ['a', 'b'], ['b', 'c'], ['a', 'b'],
                ['a', 'b', 'c', 'e'], ['a', 'b', 'c'], ['a', 'c', 'e']]'''
    
    def createC1(dataSet):
        C1 = []
        for transaction in dataSet:
            for item in transaction:
                if not [item] in C1:
                    C1.append([item])
        C1.sort()
        # Map each candidate to a frozenset: frozensets are hashable, so they can be used as dictionary keys.
        return list(map(frozenset, C1))      
     
     
    # From candidate k-itemsets to frequent k-itemsets (computes support)
    def scanD(D, Ck, minSupport):
        ssCnt = {}
        for tid in D:
            for can in Ck:
                if can.issubset(tid):
                    if not can in ssCnt:
                        ssCnt[can] = 1
                    else:
                        ssCnt[can] += 1
        numItems = float(len(D))
        retList = []
        supportData = {}
        for key in ssCnt:
            support = ssCnt[key] / numItems
            if support >= minSupport:
                retList.insert(0, key)
                supportData[key] = support  
        return retList, supportData
     
     
    def calSupport(D, Ck, min_support):
        dict_sup = {}
        for i in D:
            for j in Ck:
                if j.issubset(i):
                    if not j in dict_sup:
                        dict_sup[j] = 1
                    else:
                        dict_sup[j] += 1
        sumCount = float(len(D))
        supportData = {}
        relist = []
        for i in dict_sup:
            temp_sup = dict_sup[i] / sumCount
            if temp_sup >= min_support:
                relist.append(i)
                supportData[i] = temp_sup  # support could be returned for all candidates here, or only for the frequent itemsets
        return relist, supportData
     
     
    # Candidate generation with improved pruning
    def aprioriGen(Lk, k):  # build candidate k-itemsets; Lk holds the frequent (k-1)-itemsets
        retList = []
        lenLk = len(Lk)
        for i in range(lenLk):
            for j in range(i + 1, lenLk):
                L1 = list(Lk[i])[:k - 2]
                L2 = list(Lk[j])[:k - 2]
                L1.sort()
                L2.sort()
                if L1 == L2:  # if the first k-2 items match, their union is a valid candidate; this avoids duplicates
                    # pruning: a is the candidate k-itemset, b collects all of its (k-1)-item subsets
                    a = Lk[i] | Lk[j]  # a为frozenset()集合
                    a1 = list(a)
                    b = []
                    # remove each element in turn to obtain every (k-1)-item subset and append it to b
                    for q in range(len(a1)):
                        t = [a1[q]]
                        tt = frozenset(set(a1) - set(t))
                        b.append(tt)
                    t = 0
                    for w in b:
                        # keep the candidate only if every (k-1)-item subset is itself frequent (i.e. appears in Lk)
                        if w in Lk:
                            t += 1
                    if t == len(b):
                        retList.append(b[0] | b[1])
        return retList
     
     
    def apriori(dataSet, minSupport=0.2):
        C1 = createC1(dataSet)
        D = list(map(set, dataSet))  # convert each transaction to a set
        L1, supportData = calSupport(D, C1, minSupport)
        L = [L1]  # wrap in a list so the frequent 1-itemsets form the first element
        k = 2
        while (len(L[k - 2]) > 0):
            Ck = aprioriGen(L[k - 2], k)
            Lk, supK = scanD(D, Ck, minSupport)  # scan DB to get Lk
            supportData.update(supK)
            L.append(Lk)  # the final iteration appends an empty list
            k += 1
        del L[-1]  # drop the trailing empty list
        return L, supportData  # L is a list of frequent itemsets: element 0 holds the 1-itemsets, element 1 the 2-itemsets, and so on
     
     
    # recursively generate all proper subsets of a set
    def getSubset(fromList, toList):
        for i in range(len(fromList)):
            t = [fromList[i]]
            tt = frozenset(set(fromList) - set(t))
            if not tt in toList:
                toList.append(tt)
                tt = list(tt)
                if len(tt) > 1:
                    getSubset(tt, toList)
     
     
    #def calcConf(freqSet, H, supportData, ruleList, minConf=0.7):
    def calcConf(freqSet, H, supportData, Rule, minConf=0.7):
        for conseq in H:
            conf = supportData[freqSet] / supportData[freqSet - conseq]  # confidence = support(A ∪ B) / support(A)
            # lift = P(A ∪ B) / (P(A) * P(B))
            lift = supportData[freqSet] / (supportData[conseq] * supportData[freqSet - conseq])
            
            ls=[]
            if conf >= minConf and lift > 3:
                for i in freqSet - conseq:
                    #print(i," ",end="")
                    ls.append(i)
                    ls.append(" ")
                #print('-->',end="")
                ls.append('-->')
                for i in conseq:
                    #print(i," ",end="")
                    ls.append(i)
                    ls.append(" ")
                #print('support:', round(supportData[freqSet - conseq]*100, 1), "%", '  confidence:', round(conf*100, 1), "%", '  lift:', round(lift, 2))
                #ls.append(' support: ')
                #ls.append(round(supportData[freqSet - conseq]*100, 1))
                #ls.append("% ")
                #ls.append(' confidence: ')
                ls.append( round(conf*100,1))
                ls.append("% ")
                #ls.append( round(lift, 2))
                #ls.append(round(lift, 2))
                
                #ruleList.append((freqSet - conseq, conseq, conf))
            if ls!=[]: 
                #print(len(ls))
                Rule.append(ls)
    # =============================================================================
    #     for line in Rule:
    #         for i in line:
    #             print(i,end="")
    #         print("")
    # =============================================================================
        return Rule
    # =============================================================================
    #             print(freqSet - conseq, '-->', conseq, 'support:', round(supportData[freqSet - conseq], 2), 'confidence:', round(conf, 3),
    #                   'lift:', round(lift, 2))
    # =============================================================================
                
     
    # generate association rules from the frequent itemsets
    def gen_rule(L, supportData, minConf=0.7):
        bigRuleList = []
        for i in range(1, len(L)):  # start from the 2-itemsets
            for freqSet in L[i]:  # freqSet iterates over every frequent k-itemset
                # enumerate all non-empty proper subsets of freqSet: 1-itemsets up to (k-1)-itemsets, as a list of frozensets
                H1 = list(freqSet)
                all_subset = []
                getSubset(H1, all_subset)  # generate all the subsets
                calcConf(freqSet, all_subset, supportData, bigRuleList, minConf)
        return bigRuleList
     
     
    if __name__ == '__main__':
        
        dataSet = loadDataSet()
        #print(dataSet)
        L, supportData = apriori(dataSet, minSupport=0.05)
        rule = gen_rule(L, supportData, minConf=0.5)
        for i in rule:
            for j in i:
                if j==',':
                    continue
                else:
                    print(j,end="")
            print("")
    
    '''
    The lift formula:

    lift = P(B|A) / P(B)

    is the lift of condition A with respect to event B. If the value equals 1, the two
    events are unrelated; if it is below 1, condition A (the occurrence of A) and event B
    are mutually exclusive. In data mining, a mined association rule is generally only
    considered valuable when its lift is greater than 3.
    '''
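
    As a quick sanity check, the functions above can be run on the small toy transaction list that is left commented out in loadDataSet. A sketch only (the toyData name is mine, and with such a tiny dataset the lift > 3 filter in calcConf may leave few or no rules):

    toyData = [['a', 'c', 'e'], ['b', 'd'], ['b', 'c'], ['a', 'b', 'c', 'd'], ['a', 'b'],
               ['b', 'c'], ['a', 'b'], ['a', 'b', 'c', 'e'], ['a', 'b', 'c'], ['a', 'c', 'e']]
    L, supportData = apriori(toyData, minSupport=0.2)  # frequent itemsets and their supports
    rules = gen_rule(L, supportData, minConf=0.5)      # rules as lists of tokens, see calcConf
    print(rules)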

    In the end it is simpler to do this in R. First install rpy2; see https://www.cnblogs.com/caiyishuai/p/9520214.html

    Two R packages also need to be installed:

    import rpy2.robjects as robjects
    b=('''
        install.packages("arules")
        install.packages("arulesViz")
    ''')
    robjects.r(b)
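
    When install.packages() is run through rpy2, it may pause to ask for a CRAN mirror, which is awkward in a non-interactive session. One possible workaround (repos is a standard install.packages argument; cloud.r-project.org is just one mirror choice) is to name the mirror explicitly:

    import rpy2.robjects as robjects

    # Passing repos= avoids the interactive mirror prompt when calling install.packages() from rpy2.
    robjects.r('''
        install.packages("arules",    repos = "https://cloud.r-project.org")
        install.packages("arulesViz", repos = "https://cloud.r-project.org")
    ''')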

    Then comes the main code:

    import rpy2.robjects as robjects
    
    a=('''Encoding("UTF-8")
    setwd("F:/goverment/Aprior")
    
    all_data<-read.csv("F:/goverment/Aprior/NewData.csv",header = T, # read every column as a factor
                       colClasses=c("factor","factor","factor","factor","factor","factor","factor","factor","factor","factor","factor","factor"))
    library(arules)
    rule=apriori(data=all_data[,c(1,4,5,6,7,8,9,10,12)], parameter = list(support=0.05,confidence=0.7,minlen=2,maxlen=10))       
       ''')
    robjects.r(a)
    
    robjects.r('''                        
    rule.subset<-subset(rule,lift>1)
    #inspect(rule.subset)
    rules.sorted<-sort(rule.subset,by="lift")
    subset.matrix<-is.subset(rules.sorted,rules.sorted)
    lower.tri(subset.matrix,diag=T)
    subset.matrix[lower.tri(subset.matrix,diag = T)]<-NA
    redundant<-colSums(subset.matrix,na.rm = T)>=1          # these five lines remove redundant rules (details can be found online); I wrote them down but did not actually apply them, because after pruning not a single rule was left
    which(redundant)
    rules.pruned<-rules.sorted[!redundant]
    #inspect(rules.pruned) # print the rules after redundancy removal
    ''')
    
    
    c=('''
    
    library(arulesViz)  # load the package
    
    jpeg(file="plot1.jpg")
    #inspect(rule.subset)
    plt<-plot(rule.subset,shading = "lift")  # scatter plot of the rules
    dev.off()
    
    
    subrules<-head(sort(rule.subset,by="lift"),50)
    #jpeg(file="plot2.jpg")
    plot(subrules,method = "graph")  # graph plot of the top rules
    #dev.off()
    
    rule.sorted <- sort(rule.subset, decreasing=TRUE,  by="lift") # sort by lift
    rules.write<-as(rule.sorted,"data.frame") # convert the rules to a data.frame
    write.csv(rules.write,"F:/goverment/Aprior/NewRules.csv",fileEncoding="UTF-8")
    ''')
    robjects.r(c)
    
    # read the saved rules back and collect them into a list
    from pandas import read_csv
    data_set = read_csv("F:/goverment/Aprior/NewRules.csv")
    data = data_set.values[:, :]
    rul = []
    for line in data:
        ls = []
        for j in line:
            try:
                j=float(j)
                if j>0 and j<=1:
                    j=str(round(j*100,2))+"%"
                    ls.append(j)
                else:
                    ls.append(round(j,2))
            except (ValueError, TypeError):  # j is not numeric (e.g. rule text), keep it as is
                ls.append(j)
        rul.append(ls)
    
    
    for line in rul:
        print(line)
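
    Writing NewRules.csv and reading it back with pandas works, but as an alternative sketch (assuming rpy2's pandas2ri converter is available in your rpy2 version) the sorted rules can be pulled into a pandas DataFrame directly, skipping the CSV round trip:

    import rpy2.robjects as robjects
    from rpy2.robjects import pandas2ri

    pandas2ri.activate()  # enable automatic R data.frame -> pandas conversion
    # rule.sorted was created in the R code above; as() turns the rules object into a data.frame.
    rules_df = robjects.r('as(rule.sorted, "data.frame")')
    print(rules_df.head())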
    
    
    
            