"""Naive Bayes text classifier demo on a small hand-labelled post data set."""
import numpy as np


def loadDataSet():
    """Return the sample data set.

    Returns:
        A ``(postingList, classVec)`` tuple: ``postingList`` is a list of
        tokenised posts (lists of words) and ``classVec`` holds one label
        per post (1 for the posts containing words such as 'stupid' or
        'worthless', 0 for the others).
    """
    postingList = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    classVec = [0, 1, 0, 1, 0, 1]
    return postingList, classVec


def createVocabList(dataSet):
    """Collect every distinct word in the sample documents into a vocabulary.

    Args:
        dataSet: iterable of documents, each an iterable of words.

    Returns:
        A list of the unique words. The order comes from set iteration and
        is therefore unspecified.
    """
    return list(set().union(*dataSet))


def _vocab_index(vocabList):
    """Map each vocabulary word to the position of its first occurrence."""
    index = {}
    for position, word in enumerate(vocabList):
        # setdefault keeps the first position, matching list.index().
        index.setdefault(word, position)
    return index


def setOfWords2Vec(vocabList, inputSet):
    """Convert a document into a presence vector (set-of-words model).

    Args:
        vocabList: list of vocabulary words.
        inputSet: iterable of words making up the document.

    Returns:
        A list as long as ``vocabList`` holding 1 where the vocabulary word
        occurs in ``inputSet`` and 0 elsewhere. Words missing from the
        vocabulary are reported on stdout and otherwise ignored.
    """
    index = _vocab_index(vocabList)
    retVocabList = [0] * len(vocabList)
    for word in inputSet:
        position = index.get(word)
        if position is None:
            print('word %s not in dict' % (word,))
        else:
            retVocabList[position] = 1
    return retVocabList


def bagOfWords2VecMN(vocabList, inputSet):
    """Convert a document into an occurrence-count vector (bag-of-words model).

    Args:
        vocabList: list of vocabulary words.
        inputSet: iterable of words making up the document.

    Returns:
        A list as long as ``vocabList`` holding how many times each
        vocabulary word occurs in ``inputSet``. Words missing from the
        vocabulary are silently ignored.
    """
    index = _vocab_index(vocabList)
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = index.get(word)
        if position is not None:
            returnVec[position] += 1
    return returnVec


def trainNB0(trainMatrix, trainCatergory):
    """Estimate per-class log word probabilities from the training vectors.

    Args:
        trainMatrix: sequence of equal-length document vectors (rows).
        trainCatergory: sequence of labels, one per row; rows labelled 1
            are accumulated into class 1, every other row into class 0.

    Returns:
        A ``(p0Vect, p1Vect, pAbusive)`` tuple: the log probability of each
        word given class 0, the same given class 1, and the fraction of
        training documents labelled 1.
    """
    numTrainDoc = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCatergory) / float(numTrainDoc)

    # Start counts at 1 and denominators at 2 so that no single word
    # probability is 0, which would zero out the whole product.
    p0Num = np.ones(numWords)
    p1Num = np.ones(numWords)
    p0Denom = 2.0
    p1Denom = 2.0

    for i, row in enumerate(trainMatrix):
        if trainCatergory[i] == 1:
            p1Num += row
            p1Denom += sum(row)
        else:
            p0Num += row
            p0Denom += sum(row)

    # Work in log space: multiplying many small probabilities would
    # underflow or be lost to floating-point rounding.
    p1Vect = np.log(p1Num / p1Denom)
    p0Vect = np.log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive


def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Classify a document vector with the trained log probabilities.

    The score of each class is the sum of the log word probabilities
    selected by ``vec2Classify`` plus the log of the class prior, i.e. the
    logarithm of the usual naive Bayes product.

    Args:
        vec2Classify: document vector (numpy array) to classify.
        p0Vec: log word probabilities for class 0.
        p1Vec: log word probabilities for class 1.
        pClass1: prior probability of class 1.

    Returns:
        1 if the class-1 score is strictly greater than the class-0 score,
        otherwise 0.
    """
    p1 = sum(vec2Classify * p1Vec) + np.log(pClass1)
    p0 = sum(vec2Classify * p0Vec) + np.log(1.0 - pClass1)
    return 1 if p1 > p0 else 0


def testingNB():
    """Train on the sample data set and print the class of two test posts."""
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = [setOfWords2Vec(myVocabList, postinDoc)
                for postinDoc in listOPosts]
    p0V, p1V, pAb = trainNB0(np.array(trainMat), np.array(listClasses))

    for testEntry in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
        thisDoc = np.array(setOfWords2Vec(myVocabList, testEntry))
        # Single-argument print gives the same output on Python 2 and 3.
        print('%s classified as: %s'
              % (testEntry, classifyNB(thisDoc, p0V, p1V, pAb)))


def main():
    """Run the demo."""
    testingNB()


if __name__ == '__main__':
    main()