zoukankan      html  css  js  c++  java
  • Apriori 获取关联规则实现

    前言

    自己的一个Apriori 获取关联规则的python实现。具体原理不讲,代码添加了说明,还是很好理解的。

    数据预处理

    #最小置信度
    min_conf = 0.5
    #最小支持度
    min_sup = 2
    f=open('data.txt')
    dataset = f.readlines()
    print(dataset)
    
    ['T1	bread, cream, milk, tea
    ', 'T2	bread, cream, milk
    ', 'T3	cake, milk
    ', 'T4	milk, tea
    ', 'T5	bread, cake, milk
    ', 'T6	bread, tea
    ', 'T7	beer, milk, tea
    ', 'T8	bread, tea
    ', 'T9	bread, cream, milk, tea
    ', 'T10	bread, milk, tea']
    
    dataset = [data.replace('
    ','').split('	') for data in dataset]
    print(dataset)
    
    [['T1', 'bread, cream, milk, tea'], ['T2', 'bread, cream, milk'], ['T3', 'cake, milk'], ['T4', 'milk, tea'], ['T5', 'bread, cake, milk'], ['T6', 'bread, tea'], ['T7', 'beer, milk, tea'], ['T8', 'bread, tea'], ['T9', 'bread, cream, milk, tea'], ['T10', 'bread, milk, tea']]
    
    dataset = [tuple([data[0],sorted(data[1].replace(' ', '').split(','))]) for data in dataset]
    print(dataset)
    
    [('T1', ['bread', 'cream', 'milk', 'tea']), ('T2', ['bread', 'cream', 'milk']), ('T3', ['cake', 'milk']), ('T4', ['milk', 'tea']), ('T5', ['bread', 'cake', 'milk']), ('T6', ['bread', 'tea']), ('T7', ['beer', 'milk', 'tea']), ('T8', ['bread', 'tea']), ('T9', ['bread', 'cream', 'milk', 'tea']), ('T10', ['bread', 'milk', 'tea'])]
    
    terms = [term for data in dataset for term in data[1]]
    terms.sort()
    terms = [terms[i] for i in range(0,len(terms)) if i==0 or terms[i]!=terms[i-1]]
    print(terms)
    
    ['beer', 'bread', 'cake', 'cream', 'milk', 'tea']
    

    Aprior寻找频繁项集

    def is_sub_seq(P, T):
        '''判断P是否为T的子序列
        Parameters
        --------
        P: 一个有序序列
        T: 一个有序序列
        '''
        i, j = 0, 0
        while(i<len(P) and j<len(T)):
            if(P[i]==T[j]):
                i+=1
            j+=1
        return i==len(P)
    
    def Aprior_sieve(L):
        '''从一个项集组成的序列中中筛选出频繁项集
        Parameters
        ---
        L: 一个项集组成的序列
        
        Returns
        ---
        一个频繁项集和它支持度组成的序列
        '''
        L = [[l,0] for l in L]
        for l in L:
            for data in dataset:
                if(is_sub_seq(l[0], data[1])):
                    l[1] += 1
        L = [l for l in L if l[1]>=minsup]
        return L
        
    
    def Aprior_gen(L,k):
        '''通过k项集构造k+1项集
        Parameters
        ---
        L:一个频繁k项集和它支持度组成的序列
        k:频繁k项集的项数
        
        Returns
        ---
        一个k+1项集组成的序列
        '''
        print(k,":	",L)
        NL = []
        myset = {tuple(l[0]) for l in L}
        for i in range(0, len(L)):
            for j in range(i+1, len(L)):
                if(L[i][0][:k-1]==L[j][0][:k-1]):
                    nl = L[i][0].copy()
                    nl.append(L[j][0][k-1])
                    ok = True
                    for r in range(0, k-1):
                        tmp = nl.copy()
                        tmp.pop(r)
                        tmp = tuple(tmp)
                        if(tmp not in myset):
                            ok = False
                            break
                    if(ok):
                        NL.append(nl)
                else:
                    break
        return NL
                    
    
    L = [[term] for term in terms]
    L = Aprior_sieve(L)
    print(L)
    
    [[['bread'], 7], [['cake'], 2], [['cream'], 3], [['milk'], 8], [['tea'], 7]]
    
    Ans = []
    Ans.append(L)
    for i in range(1,len(terms)):
        L = Aprior_gen(Ans[i-1],i)
        L = Aprior_sieve(L)
        if(len(L)==0):
            break
        Ans.append(L)
    print(Ans)
    
    1 :	 [[['bread'], 7], [['cake'], 2], [['cream'], 3], [['milk'], 8], [['tea'], 7]]
    2 :	 [[['bread', 'cream'], 3], [['bread', 'milk'], 5], [['bread', 'tea'], 5], [['cake', 'milk'], 2], [['cream', 'milk'], 3], [['cream', 'tea'], 2], [['milk', 'tea'], 5]]
    3 :	 [[['bread', 'cream', 'milk'], 3], [['bread', 'cream', 'tea'], 2], [['bread', 'milk', 'tea'], 3], [['cream', 'milk', 'tea'], 2]]
    4 :	 [[['bread', 'cream', 'milk', 'tea'], 2]]
    [[[['bread'], 7], [['cake'], 2], [['cream'], 3], [['milk'], 8], [['tea'], 7]], [[['bread', 'cream'], 3], [['bread', 'milk'], 5], [['bread', 'tea'], 5], [['cake', 'milk'], 2], [['cream', 'milk'], 3], [['cream', 'tea'], 2], [['milk', 'tea'], 5]], [[['bread', 'cream', 'milk'], 3], [['bread', 'cream', 'tea'], 2], [['bread', 'milk', 'tea'], 3], [['cream', 'milk', 'tea'], 2]], [[['bread', 'cream', 'milk', 'tea'], 2]]]
    

    获取关联规则

    mydict = { tuple(l[0]):l[1] for i in range(0, len(Ans)) for l in Ans[i]}
    print(mydict)
    R=set()
    
    {('bread',): 7, ('cake',): 2, ('cream',): 3, ('milk',): 8, ('tea',): 7, ('bread', 'cream'): 3, ('bread', 'milk'): 5, ('bread', 'tea'): 5, ('cake', 'milk'): 2, ('cream', 'milk'): 3, ('cream', 'tea'): 2, ('milk', 'tea'): 5, ('bread', 'cream', 'milk'): 3, ('bread', 'cream', 'tea'): 2, ('bread', 'milk', 'tea'): 3, ('cream', 'milk', 'tea'): 2, ('bread', 'cream', 'milk', 'tea'): 2}
    
    def conf(rule):
        return mydict[rule[1]]/mydict[rule[0]]
    
    def gen_rule(X, Y):
        for item in Y:
            if item not in X:
                nX = X.copy()
                nX.append(item)
                nX.sort()
                rule = (tuple(nX),Y)
                if(rule not in R and conf(rule)>=min_conf):
                    R.add(rule)
                    gen_rule(nX, Y)
    
    for l in mydict.keys():
        gen_rule([],l)
    
    R = [(f, tuple(set(b)-set(f))) for f, b in R]
    R.sort()
    print(R)
    
    [(('bread',), ()), (('bread',), ('milk',)), (('bread',), ('tea',)), (('bread', 'cream'), ()), (('bread', 'cream'), ('milk',)), (('bread', 'cream'), ('milk', 'tea')), (('bread', 'cream'), ('tea',)), (('bread', 'cream', 'milk'), ()), (('bread', 'cream', 'milk'), ('tea',)), (('bread', 'cream', 'milk', 'tea'), ()), (('bread', 'cream', 'tea'), ()), (('bread', 'cream', 'tea'), ('milk',)), (('bread', 'milk'), ()), (('bread', 'tea'), ()), (('cake',), ()), (('cake',), ('milk',)), (('cake', 'milk'), ()), (('cream',), ()), (('cream',), ('bread',)), (('cream',), ('bread', 'milk')), (('cream',), ('bread', 'milk', 'tea')), (('cream',), ('bread', 'tea')), (('cream',), ('milk',)), (('cream',), ('milk', 'tea')), (('cream',), ('tea',)), (('cream', 'milk'), ()), (('cream', 'milk'), ('bread',)), (('cream', 'milk'), ('bread', 'tea')), (('cream', 'milk'), ('tea',)), (('cream', 'milk', 'tea'), ()), (('cream', 'milk', 'tea'), ('bread',)), (('cream', 'tea'), ()), (('cream', 'tea'), ('bread',)), (('cream', 'tea'), ('bread', 'milk')), (('cream', 'tea'), ('milk',)), (('milk',), ()), (('milk',), ('bread',)), (('milk',), ('tea',)), (('milk', 'tea'), ()), (('tea',), ()), (('tea',), ('bread',)), (('tea',), ('milk',))]
    
    print(len(mydict), len(R))
    
    17 42
    

    感想

    算法的原理还是比较简单的,但实现起来还是要花些功夫。另外使用python的一些特性可以极大简化代码实现,如列表解析(学到了一种多重循环的解析),容器转换。踩了一波语法特性的坑,比如copy,dict的键为容器的话只能用tuple

  • 相关阅读:
    Entity Framework 批量操作
    Tsak多线程学习记录
    .net webservice 动态更换地址
    .NET EF Core2.0 (DBFirst)使用与配置
    MVC发布IIS后提示未配置默认文档
    未能加载文件或程序集“Microsoft.Web.Infrastructure, Version=1.0.0.0, Culture=neutral, PublicKeyToken=31bf3856ad364e35”或它的某一个依赖项。系统找不到指定的文件。
    Redis教程
    使用git将项目上传到github
    Redis集群(一)搭建Cluster模式[超简单]
    Redis 常见5大数据类型结构,附录3个数据类型
  • 原文地址:https://www.cnblogs.com/Merodach/p/9040554.html
Copyright © 2011-2022 走看看