四、代码实现(python)
以下代码来自Peter Harrington《Machine Learning in Action》。
代码如下(保存为apriori.py)
# -*- coding: utf-8 -*-
# Apriori frequent-itemset mining and association-rule generation.
# Adapted from Peter Harrington, "Machine Learning in Action"; targets Python 3.
from numpy import *  # noqa: F401,F403 - kept from the original listing; unused below


def loadDataSet():
    """Return a small hard-coded list of transactions for demonstration."""
    return [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]


def createC1(dataSet):
    """Build C1, the candidate 1-itemsets.

    Args:
        dataSet: iterable of transactions, each an iterable of hashable,
            mutually comparable items.

    Returns:
        A list with one ``frozenset`` per distinct item, ordered by item.
        Frozensets are used so the itemsets can serve as dictionary keys.
    """
    items = set()
    for transaction in dataSet:
        items.update(transaction)
    # A real list (not a lazy ``map`` object) so callers can scan it repeatedly.
    return [frozenset([item]) for item in sorted(items)]


def scanD(D, Ck, minSupport):
    """Count candidate itemsets over the transactions and keep frequent ones.

    Args:
        D: iterable of transactions, each a ``set``.
        Ck: iterable of candidate itemsets (``frozenset`` objects).
        minSupport: minimum support, as a fraction of the transactions.

    Returns:
        A tuple ``(retList, supportData)``: ``retList`` lists the candidates
        whose support is at least ``minSupport``; ``supportData`` maps every
        candidate found in at least one transaction to its support.
    """
    # Materialise both inputs: D needs len(), and Ck is re-scanned once per
    # transaction, so one-shot iterators would otherwise be consumed early.
    D = list(D)
    Ck = list(Ck)

    ssCnt = {}
    for tid in D:
        for can in Ck:
            if can.issubset(tid):
                ssCnt[can] = ssCnt.get(can, 0) + 1

    numItems = float(len(D))
    retList = []
    supportData = {}
    for key, count in ssCnt.items():
        support = count / numItems
        if support >= minSupport:
            retList.append(key)
        supportData[key] = support
    # The original prepended each hit; reversing once gives the same order.
    retList.reverse()
    return retList, supportData


def aprioriGen(Lk, k):
    """Generate candidate k-itemsets from itemsets of size k-1.

    Two itemsets are merged when their first ``k - 2`` items, taken in sorted
    order, are equal.

    Args:
        Lk: sequence of ``frozenset`` itemsets, all of size ``k - 1``.
        k: size of the candidates to produce.

    Returns:
        A list of ``frozenset`` candidates.
    """
    Lk = list(Lk)
    # Sort BEFORE slicing: set iteration order is arbitrary, so slicing first
    # could compare different "prefixes" for itemsets that should merge.
    prefixes = [sorted(itemset)[:k - 2] for itemset in Lk]
    retList = []
    for i in range(len(Lk)):
        for j in range(i + 1, len(Lk)):
            if prefixes[i] == prefixes[j]:
                retList.append(Lk[i] | Lk[j])
    return retList


def apriori(dataSet, minSupport=0.5):
    """Find all frequent itemsets in ``dataSet``.

    Args:
        dataSet: list of transactions.
        minSupport: minimum support threshold (default 0.5).

    Returns:
        A tuple ``(L, supportData)``. ``L[i]`` is the list of frequent
        (i+1)-itemsets; the last entry of ``L`` is always an empty list, which
        is what ends the loop. ``supportData`` maps the counted candidate
        itemsets to their support.
    """
    C1 = createC1(dataSet)
    # Must be a list: the transactions are scanned once for every k.
    D = [set(transaction) for transaction in dataSet]
    L1, supportData = scanD(D, C1, minSupport)
    L = [L1]
    k = 2
    while len(L[k - 2]) > 0:
        Ck = aprioriGen(L[k - 2], k)
        Lk, supK = scanD(D, Ck, minSupport)
        supportData.update(supK)
        L.append(Lk)
        k += 1
    return L, supportData


def generateRules(L, supportData, minConf=0.7):
    """Derive association rules from the frequent itemsets.

    Args:
        L: list of frequent-itemset lists as returned by ``apriori``.
        supportData: mapping from itemset to support.
        minConf: minimum confidence threshold (default 0.7).

    Returns:
        A list of ``(antecedent, consequent, confidence)`` tuples.
    """
    bigRuleList = []
    # L[0] holds single items, which cannot be split into a rule.
    for i in range(1, len(L)):
        for freqSet in L[i]:
            H1 = [frozenset([item]) for item in freqSet]
            # Always evaluate single-item consequents first. The original
            # skipped this step for itemsets of three or more items, so rules
            # such as {3, 5} --> {2} were never reported.
            kept = calcConf(freqSet, H1, supportData, bigRuleList, minConf)
            # Only consequents that passed can be part of a larger passing
            # consequent, and at least two are needed to merge.
            if i > 1 and len(kept) > 1:
                rulesFromConseq(freqSet, kept, supportData, bigRuleList, minConf)
    return bigRuleList


def calcConf(freqSet, H, supportData, brl, minConf=0.7):
    """Evaluate the rules ``freqSet - conseq --> conseq`` for each item of H.

    Every rule that meets ``minConf`` is printed and appended to ``brl``.

    Args:
        freqSet: the frequent itemset the rules are built from.
        H: candidate consequents (``frozenset`` objects, subsets of freqSet).
        supportData: mapping from itemset to support.
        brl: list that receives ``(antecedent, consequent, confidence)``.
        minConf: minimum confidence threshold (default 0.7).

    Returns:
        The list of consequents whose rule met ``minConf``.
    """
    prunedH = []
    for conseq in H:
        antecedent = freqSet - conseq
        conf = supportData[freqSet] / supportData[antecedent]
        if conf >= minConf:
            print(antecedent, '-->', conseq, 'conf:', conf)
            brl.append((antecedent, conseq, conf))
            prunedH.append(conseq)
    return prunedH


def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.7):
    """Recursively build rules whose consequents are larger than those in H.

    The consequents in ``H`` (all of size m) are merged into candidates of
    size m+1, those are evaluated with ``calcConf``, and the survivors are
    merged again for as long as the antecedent would stay non-empty.

    Args:
        freqSet: the frequent itemset the rules are built from.
        H: consequents of equal size.
        supportData: mapping from itemset to support.
        brl: list that receives the generated rules.
        minConf: minimum confidence threshold (default 0.7).
    """
    if not H:
        # Nothing to merge; also avoids indexing an empty list below.
        return
    m = len(H[0])
    if len(freqSet) > (m + 1):
        Hmp1 = aprioriGen(H, m + 1)
        Hmp1 = calcConf(freqSet, Hmp1, supportData, brl, minConf)
        if len(Hmp1) > 1:
            rulesFromConseq(freqSet, Hmp1, supportData, brl, minConf)