一个很重要的思想:

如果某元素不是频繁的,那么包含该元素的超集也是不频繁的。

发现dataSet的频繁集:

importnumpyasnpimportpandasaspddefloadDataSet():return[[1,3,4],[2,3,5],[1,2,3,5],[2,5]]#提取数据集中所有的单独数据,参数是数据集defcreateC1(dataSet):C1=[]fortransactionindataSet:foritemintransaction:ifnot[item]inC1:C1.append([item])C1.sort()returnmap(frozenset,C1)#usefrozensetsowe#canuseitasakeyinadict#找出ck字典中所有的序列在D数据集中出现的次数以及由此计算出的频繁度defscanD(D,Ck,minSupport):ssCnt={}#找出ck字典中所有的序列在D数据集中出现的次数fortidinD:forcaninCk:ifcan.issubset(tid):ifnotssCnt.has_key(can):ssCnt[can]=1else:ssCnt[can]+=1numItems=float(len(D))retList=[]supportData={}#计算每个序列的频繁度forkeyinssCnt:support=ssCnt[key]/numItemsifsupport>=minSupport:retList.insert(0,key)supportData[key]=support#返回合格的序列和对于的支持度returnretList,supportData#Lk是k-1层的频繁度序列,这里是拼接处k层序列,在用上面的scanD函数计算出合格的序列defaprioriGen(Lk,k):#createsCkretList=[]lenLk=len(Lk)foriinrange(lenLk):forjinrange(i+1,lenLk):L1=list(Lk[i])[:k-2];L2=list(Lk[j])[:k-2]L1.sort();L2.sort()ifL1==L2:#iffirstk-2elementsareequalretList.append(Lk[i]|Lk[j])#setunionreturnretList#Apriori算法的主函数,用到以上的所有函数defapriori(dataSet,minSupport=0.5):C1=createC1(dataSet)D=map(set,dataSet)L1,supportData=scanD(D,C1,minSupport)L=[L1]k=2while(len(L[k-2])>0):Ck=aprioriGen(L[k-2],k)Lk,supK=scanD(D,Ck,minSupport)supportData.update(supK)L.append(Lk)k+=1#L是所有的合格的频繁度序列,supportData是一个字典,键是序列,值是支持度returnL,supportData

从找出的频繁集中找关联规则:

defgenerateRules(L,supportData,minConf=0.7):#supportDataisadictcomingfromscanDbigRuleList=[]foriinrange(1,len(L)):#onlygetthesetswithtwoormoreitemsforfreqSetinL[i]:H1=[frozenset([item])foriteminfreqSet]if(i>1):rulesFromConseq(freqSet,H1,supportData,bigRuleList,minConf)else:calcConf(freqSet,H1,supportData,bigRuleList,minConf)returnbigRuleListdefcalcConf(freqSet,H,supportData,brl,minConf=0.7):prunedH=[]#createnewlisttoreturnforconseqinH:conf=supportData[freqSet]/supportData[freqSet-conseq]#calcconfidenceifconf>=minConf:printfreqSet-conseq,'-->',conseq,'conf:',confbrl.append((freqSet-conseq,conseq,conf))prunedH.append(conseq)returnprunedHdefrulesFromConseq(freqSet,H,supportData,brl,minConf=0.7):m=len(H[0])if(len(freqSet)>(m+1)):#tryfurthermergingHmp1=aprioriGen(H,m+1)#createHm+1newcandidatesHmp1=calcConf(freqSet,Hmp1,supportData,brl,minConf)if(len(Hmp1)>1):#needatleasttwosetstomergerulesFromConseq(freqSet,Hmp1,supportData,brl,minConf)