Lending Club Loan Prediction
Bank Loan Default Prediction with a Machine Learning Classification Model
Posted by Yun (Jessica) Yan on March 8, 2020
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.patches as patches
from sklearn.metrics import roc_curve, auc
pd.set_option('display.max_columns', 500)
loan = pd.read_csv("E:/Downloads/CreditNinja_ModelingChallenge/data.csv")
print(loan.shape)
loan.head()
I chose loan_status as the dependent variable and turned it into a binary classification problem by coding 'Default' as 1 and every other status as 0. Since we only want to focus on defaulting clients, there is no need to distinguish between 'Current' and 'Fully Paid'.
# create new dependent variable: default
loan.loc[loan['loan_status'] == 'Default', 'Default'] = 1
loan.loc[loan['loan_status'] != 'Default', 'Default'] = 0
# drop the original unique id and loan_status column
loan = loan.drop(['id', 'loan_status'], axis=1)
print(loan.shape)
loan.head()
My data cleaning methodology and findings are listed below. Problematic values related to the dti column had already been dropped in the second step. Because variables such as annual_inc and dti sit on very different scales, I decided to normalize the data to ensure the model performance.
First, there are several variables we would like to drop.
loan = loan.drop(['last_credit_pull_d','last_fico_range_high','last_fico_range_low','addr_state','issue_d','earliest_cr_line'], axis=1)
print(loan.shape)
loan.head()
# check the NAs in each column
isnull = loan.isnull().sum().sort_values(ascending=False)/len(loan)
print('col_count:',isnull.count())
isnull
# drop the columns with over 40% missing values (i.e. keep columns with at least 60% non-null values)
loan = loan.dropna(thresh=len(loan)*0.6, axis=1)
print(loan.shape)
loan.head()
loan.select_dtypes(include=[np.number]).describe().T
loan.select_dtypes(include=['object']).describe().T
# drop the remaining rows with missing values (mainly in the `emp_length` variable)
loan = loan.dropna(axis=0)
print(loan.shape)
loan.head()
The emp_length variable is an ordinal variable. Therefore, I assigned a number to each level to convert it into a numerical variable that can be fed to the model.
emp_length_map = {
"emp_length": {
"10+ years": 10,
"9 years": 9,
"8 years": 8,
"7 years": 7,
"6 years": 6,
"5 years": 5,
"4 years": 4,
"3 years": 3,
"2 years": 2,
"1 year": 1,
"< 1 year": 0
}
}
loan = loan.replace(emp_length_map)
print(loan.shape)
loan.head()
As for the four nominal variables, I used the get_dummies() function to conduct one-hot encoding.
n_columns = ["term","home_ownership", "verification_status", "purpose"]
dummy_df = pd.get_dummies(loan[n_columns],drop_first=True)
loan = pd.concat([loan, dummy_df], axis=1)
loan = loan.drop(n_columns, axis=1)
print(loan.shape)
loan.head()
columns = loan.columns
features = columns.drop('Default')
features_s = columns.drop('Default').drop(dummy_df.columns)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
loan[features_s] = sc.fit_transform(loan[features_s])
print(loan.shape)
loan.head()
loan.describe().T
loan.to_csv('loan_clean.csv')
Using Recursive Feature Elimination, I selected 20 out of 30 variables that are most relevant to the dependent variable.
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
x_val = loan[features]
y_val = loan['Default']
estimator = LogisticRegression(class_weight='balanced',solver='liblinear')
rfe = RFE(estimator=estimator,n_features_to_select=20).fit(x_val, y_val)
x_chosed = features[rfe.support_]
x_chosed
Based on the correlation heatmap, I dropped some variables that are highly correlated with other features in order to avoid multicollinearity and improve the model performance.
colormap = plt.cm.viridis
plt.figure(figsize=(12,12))
plt.title('Pearson Correlation of Features', y=1.05, size=15)
sns.heatmap(loan[x_chosed].corr(),linewidths=0.1,vmax=1.0, square=True, cmap=colormap, linecolor='white', annot=True)
drop_col = ['installment','fico_range_high','term_60 months','verification_status_Verified']
x_new = x_chosed.drop(drop_col)
x_new
# Partition the data
X = loan[x_new]
y = loan["Default"]
from sklearn.model_selection import train_test_split
train_X, test_X, train_y, test_y = train_test_split(X,y,train_size=0.8, random_state = 1)
print ('Train Features: ',train_X.shape ,
'Test Features: ',test_X.shape)
print ('Train Labels: ',train_y.shape ,
'Test Labels: ',test_y.shape)
Obviously, our dataset is not very balanced, since most borrowers repay their loans. My solution is to use the SMOTE() function to balance the training dataset.
# SMOTE
n_sample = train_y.shape[0]
n_nondefault = train_y[train_y == 0].shape[0]
n_default = train_y[train_y == 1].shape[0]
print('Observations: {}; Non-default: {:.2%}; Default: {:.2%}'.format(
    n_sample, n_nondefault / n_sample, n_default / n_sample))
print('Features: ', train_X.shape[1])
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=1)
train_X, train_y = sm.fit_resample(train_X, train_y)
print('After SMOTE: ')
n_sample = train_y.shape[0]
n_nondefault = train_y[train_y == 0].shape[0]
n_default = train_y[train_y == 1].shape[0]
print('Observations: {}; Non-default: {:.2%}; Default: {:.2%}'.format(
    n_sample, n_nondefault / n_sample, n_default / n_sample))
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='liblinear')
model.fit(train_X,train_y)
import sklearn.metrics as sklmetrics
predict_y=model.predict(test_X)
sklmetrics.accuracy_score(test_y, predict_y)
# Confusion Matrix
conf_mat = sklmetrics.confusion_matrix(test_y, predict_y, labels =[0,1])
conf_mat
from sklearn.metrics import roc_auc_score
# use predicted probabilities rather than hard class labels for the AUC
roc_auc = roc_auc_score(test_y, model.predict_proba(test_X)[:, 1])
print("Area under the ROC curve : %f" % roc_auc)
def plot_feature_importance_coeff(model, Xnames, cls_nm=None):
    imp_features = pd.DataFrame(np.column_stack((Xnames, model.coef_.ravel())), columns=['feature', 'importance'])
    imp_features[['importance']] = imp_features[['importance']].astype(float)
    imp_features[['abs_importance']] = imp_features[['importance']].abs()
    # Sort the features by the absolute value of the coefficient
    imp_features = imp_features.sort_values(by=['abs_importance'], ascending=[1])
    # Plot the signed coefficients as feature importances
    plt.figure()
    plt.title(cls_nm + " - Feature Importance")
    plt.barh(range(imp_features.shape[0]), imp_features['importance'],
             color="b", align="center")
    plt.yticks(range(imp_features.shape[0]), imp_features['feature'])
    plt.ylim([-1, imp_features.shape[0]])
    plt.xlabel('Importance')
    plt.ylabel('Feature')
    plt.tight_layout()
    plt.savefig(cls_nm + "_feature_imp.png", bbox_inches='tight')
    plt.show()
plot_feature_importance_coeff(model, X.columns, cls_nm="Logistic Regression")
We found that the purpose of the specific loan matters a great deal for its default probability. Also, the FICO score at application has a negative effect on the default probability. We should weigh these features more carefully when processing a loan application.
Moreover, we found that dti, the number of inquiries in the past 6 months, and the loan amount have a positive relationship with the default probability. This aligns with the business rules we created before the model runs: the heavier the financial burden, the higher the risk that the loan will default.
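For reference, the signed coefficients behind the plot can also be inspected as a plain table. This is a small sketch that simply reuses the fitted model and feature list from above.
# Tabulate the signed logistic regression coefficients shown in the plot above
coef_table = pd.DataFrame({'feature': X.columns, 'coefficient': model.coef_.ravel()})
print(coef_table.sort_values('coefficient', ascending=False))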
from sklearn.model_selection import GridSearchCV
param_grid = {'C': 10.**np.arange(-5, 5),
'penalty': [ 'l1', 'l2']}
grid_search = GridSearchCV(LogisticRegression(solver='liblinear'), param_grid, cv=10)
grid_search.fit(train_X, train_y)
print("Best parameters: {}".format(grid_search.best_params_))
print("Best cross-validation score: {:.5f}".format(grid_search.best_score_))
predict_y=grid_search.predict(test_X)
sklmetrics.accuracy_score(test_y, predict_y)
# Confusion Matrix
conf_mat = sklmetrics.confusion_matrix(test_y, predict_y, labels =[0,1])
conf_mat
We used the optimal hyperparameters (C=10.0, penalty='l2') found in the grid search above.
from sklearn.model_selection import cross_val_predict, KFold, cross_val_score
lr = LogisticRegression(C=10.0,penalty='l2',solver='liblinear')
kf = KFold(n_splits=10, shuffle=True)
cross_val_score(lr,train_X, train_y,cv=kf,scoring='accuracy').mean()
fig1 = plt.figure(figsize=[12,12])
ax1 = fig1.add_subplot(111,aspect = 'equal')
ax1.add_patch(
patches.Arrow(0.45,0.5,-0.25,0.25,width=0.3,color='green',alpha = 0.5)
)
ax1.add_patch(
patches.Arrow(0.5,0.45,0.25,-0.25,width=0.3,color='red',alpha = 0.5)
)
tprs = []
aucs = []
mean_fpr = np.linspace(0,1,100)
i = 1
for train, test in kf.split(train_X, train_y):
    prediction = lr.fit(train_X.iloc[train], train_y.iloc[train]).predict_proba(train_X.iloc[test])
    fpr, tpr, t = roc_curve(train_y.iloc[test], prediction[:, 1])
    tprs.append(np.interp(mean_fpr, fpr, tpr))
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)
    plt.plot(fpr, tpr, lw=2, alpha=0.3, label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))
    i = i + 1
plt.plot([0,1],[0,1],linestyle = '--',lw = 2,color = 'black')
mean_tpr = np.mean(tprs, axis=0)
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, color='blue',
label=r'Mean ROC (AUC = %0.2f )' % (mean_auc),lw=2, alpha=1)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC')
plt.legend(loc="lower right")
plt.text(0.32,0.7,'More accurate area',fontsize = 12)
plt.text(0.63,0.4,'Less accurate area',fontsize = 12)
plt.show()
The model accuracy improved to 61% after cross-validation. Also, we can see that the AUC is relatively stable across the different folds.
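As a quick sanity check on that stability claim, the per-fold AUC values collected in the aucs list above can be summarized directly (a minimal sketch):
# Summarize the spread of the per-fold AUC values from the loop above
print('Fold AUC: mean = {:.3f}, std = {:.3f}'.format(np.mean(aucs), np.std(aucs)))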
In addition, a penalty could be placed on false negatives, since classifying a borrower who will default as non-defaulting is more serious than the opposite error. This could further improve our results.
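One simple way to approximate such a penalty in scikit-learn is through the class_weight argument of LogisticRegression. The sketch below is illustrative only, and the 1:5 weighting is an assumed value rather than a tuned one.
# Illustrative sketch: weight the default class more heavily so that
# false negatives are penalized harder during training.
# The {0: 1, 1: 5} weighting is an assumption, not a tuned value.
lr_weighted = LogisticRegression(C=10.0, penalty='l2', solver='liblinear',
                                 class_weight={0: 1, 1: 5})
lr_weighted.fit(train_X, train_y)
print(sklmetrics.confusion_matrix(test_y, lr_weighted.predict(test_X), labels=[0, 1]))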
Are there any ways to derive additional variables that would improve the model's prediction accuracy?
What variables, if any, can be used to create business rules to decline a customer's application before the model runs?
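As a starting point for both questions, here is a hedged sketch: derive a loan-to-income ratio as an additional variable and use it as a simple pre-model decline rule. The loan_amnt column name and the 40% cut-off are assumptions for illustration, not validated choices.
# Illustrative sketch only -- the loan_amnt column and the 0.4 cut-off are assumptions.
raw = pd.read_csv("E:/Downloads/CreditNinja_ModelingChallenge/data.csv")
# Derived variable: requested loan amount relative to annual income
raw['loan_to_income'] = raw['loan_amnt'] / raw['annual_inc'].replace(0, np.nan)
# Example business rule applied before the model runs:
# flag applications where the loan exceeds 40% of annual income
declined = raw[raw['loan_to_income'] > 0.4]
print('Applications flagged by rule: {} of {}'.format(len(declined), len(raw)))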