The ID3 Algorithm

# coding=utf-8
from math import log
import operator

# Define a toy sample set.
def createDataSet():
    dataSet = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no']]
    labels = ['no surfacing', 'flippers']  # feature names; values are discrete
    return dataSet, labels

# Compute the Shannon entropy of the sample set.
def calcShannonEnt(dataSet):
    numEntries = len(dataSet)
    labelCounts = {}
    for featVec in dataSet:  # count the occurrences of each class label
        currentLabel = featVec[-1]
        if currentLabel not in labelCounts:
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        shannonEnt -= prob * log(prob, 2)  # log base 2
    return shannonEnt

# Collect the rows whose feature at `axis` equals `value`, with that column
# removed; used to evaluate the entropy of splitting on that feature.
def splitDataSet(dataSet, axis, value):
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            reducedFeatVec = featVec[:axis]  # chop out the axis used for splitting
            reducedFeatVec.extend(featVec[axis + 1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet

# Evaluate the entropy after splitting on each feature, and return the feature
# with the largest information gain.
def chooseBestFeatureToSplit(dataSet):
    numFeatures = len(dataSet[0]) - 1  # the last column holds the class labels
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):  # iterate over all the features
        featList = [example[i] for example in dataSet]  # all values of this feature
        uniqueVals = set(featList)  # the distinct values of this feature
        newEntropy = 0.0
        for value in uniqueVals:  # weighted entropy over every possible value
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy  # information gain = reduction in entropy
        if infoGain > bestInfoGain:  # keep the best gain seen so far
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature  # returns an index

# If a leaf still contains several classes, take a majority vote.
def majorityCnt(classList):
    classCount = {}
    for vote in classList:
        if vote not in classCount:
            classCount[vote] = 0
        classCount[vote] += 1
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]

# Build the tree recursively.
def createTree(dataSet, labels):
    classList = [example[-1] for example in dataSet]
    # If all samples share one class, return that class.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # If no features are left but the classes still differ, take the majority.
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    # Pick the best feature and key a dict on its name.
    bestFeat = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    # Remove the used feature, then branch on each of its distinct values.
    del labels[bestFeat]
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subLabels = labels[:]
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree

# Decision-tree classifier: inputTree is the tree, featLabels the feature
# names, testVec the vector to classify.
def classify(inputTree, featLabels, testVec):
    firstStr = list(inputTree.keys())[0]
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)
    key = testVec[featIndex]
    valueOfFeat = secondDict[key]
    if isinstance(valueOfFeat, dict):  # internal node: keep descending
        classLabel = classify(valueOfFeat, featLabels, testVec)
    else:  # leaf node: this is the class
        classLabel = valueOfFeat
    return classLabel

# Serialize the decision tree.
def storeTree(inputTree, filename):
    import pickle
    with open(filename, 'wb') as fw:  # pickle needs binary mode
        pickle.dump(inputTree, fw)

# Load the decision tree back.
def grabTree(filename):
    import pickle
    with open(filename, 'rb') as fr:
        return pickle.load(fr)

The input is a table: each row is a sample, the last column is the class label, and the remaining columns are feature values.
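
For concreteness, a short usage sketch with the toy data above (the printed tree and labels are what this data produces; the file name 'tree.pkl' is only illustrative, and a copy of labels is passed in because createTree deletes entries from it):

dataSet, labels = createDataSet()
myTree = createTree(dataSet, labels[:])  # pass a copy: createTree mutates labels
print(myTree)  # {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
print(classify(myTree, labels, [1, 0]))  # -> 'no'
print(classify(myTree, labels, [1, 1]))  # -> 'yes'
storeTree(myTree, 'tree.pkl')            # round-trip through pickle
assert grabTree('tree.pkl') == myTree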

ID3 is a fairly crude algorithm: it classifies categorical (nominal) features well, but it cannot handle numeric data directly.
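
One common workaround, sketched below under the assumption that we preprocess the data ourselves (discretizeColumn and its threshold are illustrative, not part of the code above), is to bin a numeric column into discrete categories before handing the table to ID3. C4.5 automates this idea by scanning candidate thresholds and keeping the one with the best information gain.

def discretizeColumn(dataSet, axis, threshold):
    # Hypothetical helper: map a numeric column to two categories so the
    # discrete-only ID3 code above can split on it.
    binned = []
    for featVec in dataSet:
        newVec = featVec[:]
        newVec[axis] = 'high' if featVec[axis] >= threshold else 'low'
        binned.append(newVec)
    return binned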

Moreover, when choosing a feature to split on, information gain is biased toward features with many distinct values, so the selection criterion needs improvement; C4.5's gain ratio is the standard fix.
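
A minimal sketch of that fix, reusing calcShannonEnt and splitDataSet from above (chooseBestFeatureByGainRatio is a hypothetical name, not part of the original code): dividing the gain by the "split information", the entropy of the partition itself, penalizes a feature like an ID column whose gain is high only because it shatters the data into many tiny subsets.

def chooseBestFeatureByGainRatio(dataSet):
    from math import log
    numFeatures = len(dataSet[0]) - 1
    baseEntropy = calcShannonEnt(dataSet)
    bestGainRatio = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        uniqueVals = set(example[i] for example in dataSet)
        newEntropy = 0.0
        splitInfo = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet)
            splitInfo -= prob * log(prob, 2)  # entropy of the partition itself
        if splitInfo == 0.0:
            continue  # feature takes a single value: useless for splitting
        gainRatio = (baseEntropy - newEntropy) / splitInfo  # penalize many-valued features
        if gainRatio > bestGainRatio:
            bestGainRatio = gainRatio
            bestFeature = i
    return bestFeature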