以German信用数据为例的logistic regression算法在评分卡上的实践

2025-01-07 技术教程

以德国信用数据为例，用logistic regression算法做信用评分卡的原理性实现；本文只演示原理，因此并未考虑feature selection。

第一步：导入必要的库

import numpy as np
import pandas as pd

try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18 only ships the legacy module
    from sklearn.cross_validation import train_test_split

第二步：导入数据

# Load the German credit data: no header row, so column names are assigned
# explicitly right after reading.
# The fields of german.data are separated by a single space; an empty
# separator string is not a usable delimiter.
german = pd.read_csv('D:/CreditDatasets/german.data', sep=' ', header=None)
german.columns = [
    'Status_of_existing_checking_account',
    'Duration_in_month',
    'Credit_history',
    'Purpose',
    'Credit_amount',
    'Savings_account',
    'Present_employment_since',
    'Installment_rate',
    'Personal_status_and_sex',
    'Other_debtors',
    'Present_residence_since',
    'Property',
    'Age',
    'Other_installment_plans',
    'Housing',
    'Number_of_existing_credits',
    'Job',
    'Number_of_people',
    'Telephone',
    'foreign_worker',
    'default',
]

# Row counts per label value: label 1 is counted as "good", label 2 as "bad".
Grp = german.groupby('default')
_class_sizes = Grp.size()
total_good = _class_sizes[1]
total_bad = _class_sizes[2]

第三步：分别计算名义变量和数值变量的woe值，对取值较少的数值变量也用名义变量woe计算方法实现，其余数值变量均5等分

def _woe_iv(mask):
    """Return ``(woe, iv)`` for the rows of ``german`` selected by *mask*.

    Reads the module-level ``german``, ``total_good`` and ``total_bad``.

    ``woe = log(bad_ratio / good_ratio)`` and
    ``iv = (bad_ratio - good_ratio) * woe``, where each ratio is the count of
    that class inside the selection divided by the class total.
    """
    labels = german.loc[mask, 'default']
    good = float((labels == 1).sum())
    bad = float((labels == 2).sum())
    # A selection that lacks one of the classes would end in log(0) or a
    # division by zero; give the empty class a 0.5 pseudo-count so the
    # result stays finite. Selections containing both classes are untouched.
    if good == 0:
        good = 0.5
    if bad == 0:
        bad = 0.5
    good_ratio = good / total_good
    bad_ratio = bad / total_bad
    woe = np.log(bad_ratio / good_ratio)
    iv = (bad_ratio - good_ratio) * woe
    return woe, iv


def CalcWOE(VarName):
    """Build the WOE/IV table of a variable, one row per distinct value.

    Args:
        VarName: name of a column of the module-level ``german`` frame.

    Returns:
        DataFrame with columns ``variable``, ``class`` (the distinct value),
        ``woe`` and ``iv``.
    """
    # Rows are collected in a list and the frame is built once;
    # DataFrame.append no longer exists in pandas 2.x.
    rows = []
    for v in np.unique(german[VarName]):
        woe, iv = _woe_iv(german[VarName] == v)
        rows.append([VarName, v, woe, iv])
    return pd.DataFrame(rows, columns=['variable', 'class', 'woe', 'iv'])


# nominal variable woe
status_checking_account_woe = CalcWOE('Status_of_existing_checking_account')
Credit_history_woe = CalcWOE('Credit_history')
Purpose_woe = CalcWOE('Purpose')
Savings_account_woe = CalcWOE('Savings_account')
Present_employment_since_woe = CalcWOE('Present_employment_since')
Personal_status_and_sex_woe = CalcWOE('Personal_status_and_sex')
Other_debtors_woe = CalcWOE('Other_debtors')
Property_woe = CalcWOE('Property')
Other_installment_plans_woe = CalcWOE('Other_installment_plans')
Housing_woe = CalcWOE('Housing')
Job_woe = CalcWOE('Job')
Telephone_woe = CalcWOE('Telephone')
foreign_worker_woe = CalcWOE('foreign_worker')

# numeric variable woe, no binning
Installment_rate_woe = CalcWOE('Installment_rate')
Present_residence_since_woe = CalcWOE('Present_residence_since')
Number_of_existing_credits_woe = CalcWOE('Number_of_existing_credits')
Number_of_people_woe = CalcWOE('Number_of_people')


def CalcWOE_bin(VarName, N):
    """Build the WOE/IV table of a numeric variable cut into N equal-width bins.

    Args:
        VarName: name of a numeric column of the module-level ``german`` frame.
        N: number of equal-width bins between the column minimum and maximum.

    Returns:
        DataFrame with columns ``variable``, ``class+woe`` (a list
        ``[lower, upper, woe]`` per bin), ``woe`` and ``iv``.
    """
    values = german[VarName]
    # One shared edge array: neighbouring bins use the very same boundary
    # value and the last edge is exactly the column maximum.
    edges = np.linspace(values.min(), values.max(), N + 1)
    rows = []
    for i in range(N):
        lower = float(edges[i])
        upper = float(edges[i + 1])
        if i == 0:
            # Only the first bin is closed on the left so that the minimum
            # value is counted; every other bin is (lower, upper].
            in_bin = (values >= lower) & (values <= upper)
        else:
            in_bin = (values > lower) & (values <= upper)
        woe, iv = _woe_iv(in_bin)
        rows.append([VarName, [lower, upper, woe], woe, iv])
    return pd.DataFrame(rows, columns=['variable', 'class+woe', 'woe', 'iv'])


Duration_in_month_woe = CalcWOE_bin('Duration_in_month', 5)
Credit_amount_woe = CalcWOE_bin('Credit_amount', 5)
Age_woe = CalcWOE_bin('Age', 5)

第四步：用woe值替代原来的值

def ReplaceWOE(VarName, SourceDF, VarWOE):
    """Replace column *VarName* of *SourceDF* by its WOE values, in place.

    Args:
        VarName: column to convert.
        SourceDF: frame holding the column; it is modified and returned.
        VarWOE: table with a ``class`` column (raw values) and a ``woe``
            column (replacement values), as produced by ``CalcWOE``.

    Returns:
        The same ``SourceDF`` object. Values that do not appear in
        ``VarWOE['class']`` become NaN.
    """
    # Pair every class with the WOE on the same table row. Walking a dict
    # with a running counter only lines up when dict iteration follows
    # insertion order, which is guaranteed from Python 3.7 on only.
    woe_by_class = dict(zip(VarWOE['class'], VarWOE['woe']))
    SourceDF[VarName] = SourceDF[VarName].map(woe_by_class)
    return SourceDF


# Work on a copy so that the raw ``german`` frame keeps its original values.
german_woe = german.copy()

for _column, _table in [
    ('Status_of_existing_checking_account', status_checking_account_woe),
    ('Credit_history', Credit_history_woe),
    ('Purpose', Purpose_woe),
    ('Savings_account', Savings_account_woe),
    ('Present_employment_since', Present_employment_since_woe),
    ('Personal_status_and_sex', Personal_status_and_sex_woe),
    ('Other_debtors', Other_debtors_woe),
    ('Property', Property_woe),
    ('Other_installment_plans', Other_installment_plans_woe),
    ('Housing', Housing_woe),
    ('Job', Job_woe),
    ('Telephone', Telephone_woe),
    ('foreign_worker', foreign_worker_woe),
    ('Installment_rate', Installment_rate_woe),
    ('Present_residence_since', Present_residence_since_woe),
    ('Number_of_existing_credits', Number_of_existing_credits_woe),
    ('Number_of_people', Number_of_people_woe),
]:
    german_woe = ReplaceWOE(_column, german_woe, _table)


def ReplaceWOE_bin(VarName, SourceDF, VarWOE):
    """Replace a binned numeric column of *SourceDF* by its WOE values, in place.

    Args:
        VarName: column to convert.
        SourceDF: frame holding the column; it is modified and returned.
        VarWOE: table whose ``class+woe`` column holds one
            ``[lower, upper, woe]`` list per bin, as produced by
            ``CalcWOE_bin``.

    Returns:
        The same ``SourceDF`` object. A value that lies in no
        ``(lower, upper]`` interval becomes NaN.
    """
    bins = list(VarWOE['class+woe'])
    lowest = min(SourceDF[VarName])
    woe_by_value = {}
    for value in np.unique(SourceDF[VarName]):
        if value == lowest:
            # The column minimum sits on the open lower edge of the first
            # interval, so it is assigned the first bin's WOE explicitly.
            woe_by_value[value] = bins[0][2]
            continue
        for lower, upper, woe in bins:
            if lower < value <= upper:
                woe_by_value[value] = woe
    SourceDF[VarName] = SourceDF[VarName].map(woe_by_value)
    return SourceDF


for _column, _table in [
    ('Duration_in_month', Duration_in_month_woe),
    ('Credit_amount', Credit_amount_woe),
    ('Age', Age_woe),
]:
    german_woe = ReplaceWOE_bin(_column, german_woe, _table)

# Both helpers return the frame they were given, so the names used by the
# following steps all refer to the fully converted frame.
temp = temp1 = german_woe

第五步：将数据集拆分为训练集和测试集

# Model inputs: every column of the WOE-encoded frame except the last one.
X = temp1.loc[:, temp1.columns[:-1]]
# Target: the 'default' column shifted down by one (labels 1/2 become 0/1).
y = temp1['default'].sub(1)

# Hold out 10% of the rows for testing; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=0,
)

第六步：在训练集上应用logistic regression算法

# Import from the public package path: the ``sklearn.linear_model.logistic``
# submodule is an implementation detail that newer scikit-learn releases no
# longer expose, while ``sklearn.linear_model`` provides the class directly.
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression with default settings on the training split,
# then predict the labels of the held-out rows.
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
predictions = classifier.predict(X_test)

第七步：评估模型分类精度

from sklearn.metrics import accuracy_score
# Uncomment to display the accuracy on the held-out test rows:
# print('Accuracy:', accuracy_score(y_test, predictions))

# cross_val_score lives in sklearn.model_selection; the legacy
# sklearn.cross_validation module is only tried on installations that
# predate it.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score

# 5-fold cross-validation of the classifier on the training split.
scores = cross_val_score(classifier, X_train, y_train, cv=5)
# Uncomment to display the mean and the per-fold scores:
# print(np.mean(scores), scores)

第八步：创建评分卡

# Scorecard scaling: score = A - B * log(theta)
# Anchor points: P0 = A - B * log(theta0) and
#                P0 - PDO = A - B * log(2 * theta0),
# which give B = PDO / log(2) and A = P0 + B * log(theta0).
P0 = 600
PDO = 20
theta0 = 1.0 / 60
B = PDO / np.log(2)
A = P0 + B * np.log(theta0)

coef = classifier.coef_
beta0 = classifier.intercept_

# WOE tables listed in the same order as the feature columns the model was
# trained on, so the list position doubles as the index into coef[0].
woe_tables = [
    status_checking_account_woe,
    Duration_in_month_woe,
    Credit_history_woe,
    Purpose_woe,
    Credit_amount_woe,
    Savings_account_woe,
    Present_employment_since_woe,
    Installment_rate_woe,
    Personal_status_and_sex_woe,
    Other_debtors_woe,
    Present_residence_since_woe,
    Property_woe,
    Age_woe,
    Other_installment_plans_woe,
    Housing_woe,
    Number_of_existing_credits_woe,
    Job_woe,
    Number_of_people_woe,
    Telephone_woe,
    foreign_worker_woe,
]

# The constant part (A - B * intercept) is shared equally among the features,
# so that the per-feature scores of one applicant add up to
# A - B * (intercept + sum(coef_i * woe_i)).
n_features = len(woe_tables)
base_points = (A - B * beta0[0]) / n_features

for position, table in enumerate(woe_tables):
    table['score'] = base_points - B * coef[0][position] * table['woe']

初次用python实现，不当之处请不吝批评指正！