通用文件处理:

import numpy as np


def loadDataSet(fileName, dotSplit):
    """Load a delimited numeric text file into feature and target matrices.

    The number of columns is taken from the first non-blank line. By default
    the last column of the data set is treated as the dependent variable;
    every column before it is a feature.

    Args:
        fileName: Path of the file to read.
        dotSplit: Separator between the fields of each line.

    Returns:
        A tuple ``(xMat, yMat)`` where ``xMat`` is an ``np.matrix`` with one
        row per data line and one column per feature, and ``yMat`` is an
        ``np.matrix`` column vector holding the last field of every line.

    Raises:
        ValueError: If a field cannot be converted to ``float``.
        IndexError: If a line has fewer fields than the first line.
    """
    dataMat = []
    labelMat = []
    numFeat = None
    # A single ``with`` block replaces the two unclosed ``open`` calls.
    with open(fileName) as fr:
        for line in fr:
            # Blank lines (e.g. a trailing newline at EOF) carry no data.
            if not line.strip():
                continue
            curLine = line.split(dotSplit)
            if numFeat is None:
                numFeat = len(curLine)
            dataMat.append([float(value) for value in curLine[:numFeat - 1]])
            labelMat.append(float(curLine[numFeat - 1]))
    # ``np.asmatrix`` is used instead of the ``np.mat`` alias, which is not
    # available in NumPy 2.x.
    xMat = np.asmatrix(dataMat)
    yMat = np.asmatrix(labelMat).T
    return xMat, yMat

这里是处理岭回归的实现:

import numpy as np


def ridgeRegres(xMat, yMat, lam=0.2):
    """Compute ridge-regression weights ``(X^T X + lam * I)^-1 X^T y``.

    Args:
        xMat: ``np.matrix`` of shape ``(m, n)`` holding the features.
        yMat: ``np.matrix`` column vector of shape ``(m, 1)`` with targets.
        lam: Penalty added to the diagonal of ``X^T X``.

    Returns:
        The ``(n, 1)`` weight matrix, or ``None`` (after printing
        ``"wrong"``) when the determinant of the penalised matrix is
        exactly zero.
    """
    xTx = xMat.T * xMat
    denom = xTx + np.eye(np.shape(xMat)[1]) * lam
    # Progress output kept from the original: the number of rows in xMat.
    print(np.shape(xMat)[0])
    # NOTE(review): an exact comparison with 0.0 only catches exactly
    # singular matrices; nearly singular ones still reach the inversion.
    if np.linalg.det(denom) == 0.0:
        print("wrong")
        return None
    return denom.I * (xMat.T * yMat)


def normalizing(xMat, yMat):
    """Centre the targets and centre/scale the features column by column.

    Args:
        xMat: ``np.matrix`` of features.
        yMat: ``np.matrix`` of targets.

    Returns:
        A tuple ``(x, y)``: ``x`` is ``xMat`` minus its column means, divided
        by its column variances; ``y`` is ``yMat`` minus its column mean.
    """
    yMean = np.mean(yMat, 0)
    y = yMat - yMean
    xMeans = np.mean(xMat, 0)
    xVar = np.var(xMat, 0)
    # NOTE(review): this divides by the variance rather than the standard
    # deviation; kept as-is so existing results do not change. A constant
    # column (zero variance) produces a division by zero here.
    x = (xMat - xMeans) / xVar
    return x, y


def ridgeTest(xM, yM):
    """Fit ridge regression for 30 penalties ``exp(i - 10)``, ``i = 0..29``.

    The inputs are passed through ``normalizing`` before fitting.

    Args:
        xM: ``np.matrix`` of features.
        yM: ``np.matrix`` column vector of targets.

    Returns:
        An ``ndarray`` of shape ``(30, n)``; row ``i`` holds the weights
        obtained with penalty ``exp(i - 10)``.
    """
    xMat, yMat = normalizing(xM, yM)
    numTestPts = 30
    wMat = np.zeros((numTestPts, np.shape(xMat)[1]))
    print(wMat)
    for i in range(numTestPts):
        ws = ridgeRegres(xMat, yMat, np.exp(i - 10))
        wMat[i, :] = ws.T
    return wMat

向前逐步回归:

import numpy as np


def rssError(yArr, yHatArr):
    """Return the residual sum of squares between two arrays.

    Args:
        yArr: Array of observed values.
        yHatArr: Array of predicted values, broadcastable against ``yArr``.

    Returns:
        The sum of the element-wise squared differences.
    """
    return ((yArr - yHatArr) ** 2).sum()


def stageWise(xM, yM, eps=0.01, numIt=100):
    """Run forward stagewise regression and record the weights per iteration.

    Starting from all-zero weights, every iteration tries moving each single
    weight by ``+eps`` and ``-eps`` and keeps the one move that yields the
    lowest residual sum of squares.

    Args:
        xM: ``np.matrix`` of shape ``(m, n)`` holding the features.
        yM: ``np.matrix`` column vector of shape ``(m, 1)`` with targets.
        eps: Step size applied to one weight per iteration.
        numIt: Number of iterations to run.

    Returns:
        An ``ndarray`` of shape ``(numIt, n)``; row ``i`` holds the weights
        after iteration ``i``.
    """
    m, n = np.shape(xM)
    returnMat = np.zeros((numIt, n))
    ws = np.zeros((n, 1))
    wsMax = ws.copy()
    for i in range(numIt):
        print(ws.T)
        # Reset the baseline on every pass. The previous code overwrote it
        # with each candidate's own error during the first pass and never
        # reset it afterwards, so the first pass could not select anything
        # and the search stalled whenever the best move was the last one
        # tried.
        lowestError = float("inf")
        for j in range(n):
            for sign in (-1, 1):
                wsTest = ws.copy()
                wsTest[j] += eps * sign
                yTest = xM * wsTest
                rssE = rssError(yM.A, yTest.A)
                if rssE < lowestError:
                    lowestError = rssE
                    print(lowestError)
                    wsMax = wsTest
        ws = wsMax.copy()
        returnMat[i, :] = ws.T
    return returnMat