Home Credit — Credit Risk Model Stability — EDA
In this blog post, we will conduct Exploratory Data Analysis (EDA) on the dataset related to Home Credit’s Credit Risk Model Stability.
As part of their credit risk assessment, Home Credit uses various data points to evaluate the creditworthiness of applicants. In this analysis, we will explore the dataset related to the stability of Home Credit’s credit risk model.
Dataset Overview
The dataset contains information about:
This dataset contains a large number of tables as a result of utilizing diverse data sources and the varying levels of data aggregation used while preparing the dataset. Note: All files listed below are found in both .csv and .parquet formats.
The data can be downloaded from the Kaggle competition page for Home Credit – Credit Risk Model Stability. Please read the data description there first to get an overview of the dataset.
Let’s dive into the code without delay.
import zipfile

import lightgbm as lgb
import numpy as np
import pandas as pd
import polars as pl
from google.colab import drive
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

# Correct function definition
def set_table_dtypes_pandas(df):
    """Cast every column whose name ends in 'P' or 'A' to float, in place.

    Args:
        df: DataFrame whose columns are selected by the last character of
            their name. The frame is modified in place.

    Returns:
        The same DataFrame object that was passed in, so the call can be
        written as ``df = set_table_dtypes_pandas(df)``.
    """
    for col_name in df.columns:
        # str.endswith (instead of col_name[-1]) tolerates an empty column
        # name; non-string labels are skipped rather than raising.
        if isinstance(col_name, str) and col_name.endswith(("P", "A")):
            df[col_name] = df[col_name].astype(float)
    return df
def convert_strings(df: pd.DataFrame) -> pd.DataFrame:
    """Convert string-like columns to ordered categoricals with an "Unknown" level.

    Every column whose dtype name is 'object', 'string' or 'str' is cast to
    a categorical whose categories are the values present in the column plus
    an extra "Unknown" category appended at the end.

    Args:
        df: DataFrame to convert. It is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    for col in df.columns:
        # 'str' is the name reported by the default string dtype in pandas 3.
        if df[col].dtype.name not in ("object", "string", "str"):
            continue

        as_category = df[col].astype("string").astype("category")
        categories = as_category.cat.categories.to_list()
        # Only append the sentinel when it is not already a real value:
        # CategoricalDtype rejects duplicate categories with a ValueError.
        if "Unknown" not in categories:
            categories.append("Unknown")

        df[col] = as_category.astype(
            pd.CategoricalDtype(categories=categories, ordered=True)
        )
    return df
# Mount Google Drive so the dataset archive is reachable from the Colab runtime.
drive.mount('/content/drive')

# Extract the dataset archive into /content/tmp. This replaces the
# notebook-only `!unzip <zip> -d "/content/tmp"` shell magic so the step is
# valid Python; note that extractall overwrites existing files without
# prompting.
with zipfile.ZipFile(
    "/content/drive/MyDrive/Dataset/credit/home-credit-credit-risk-model-stability.zip"
) as archive:
    archive.extractall("/content/tmp")

df_train_base = pd.read_csv("/content/tmp/csv_files/train/train_base.csv")

# Count missing values per column.
print(df_train_base.isnull().sum())
# Output:
# case_id          0
# date_decision    0
# MONTH            0
# WEEK_NUM         0
# target           0
# dtype: int64

print(df_train_base.head(5))

# Parse the decision date and derive calendar year / month columns from it.
df_train_base['date_decision'] = pd.to_datetime(df_train_base['date_decision'])
df_train_base['year'] = df_train_base['date_decision'].dt.year
df_train_base['month'] = df_train_base['date_decision'].dt.month

print(df_train_base.head(5))
print(df_train_base.describe())
df_train_appl_prev = pd.read_csv("/content/tmp/csv_files/train/train_applprev_1_0.csv")

# Load the test tables. Each one is read with low_memory=False and then passed
# through set_table_dtypes_pandas, which casts columns whose names end in
# 'P' or 'A' to float.
df_test_base = pd.read_csv("/content/tmp/csv_files/test/test_base.csv", low_memory=False)
df_test_base = set_table_dtypes_pandas(df_test_base)

test_static_0_0 = pd.read_csv("/content/tmp/csv_files/test/test_static_0_0.csv", low_memory=False)
test_static_0_0 = set_table_dtypes_pandas(test_static_0_0)

test_static_cb = pd.read_csv("/content/tmp/csv_files/test/test_static_cb_0.csv", low_memory=False)
test_static_cb = set_table_dtypes_pandas(test_static_cb)

test_person_1 = pd.read_csv("/content/tmp/csv_files/test/test_person_1.csv", low_memory=False)
test_person_1 = set_table_dtypes_pandas(test_person_1)

test_credit_bureau_b_2 = pd.read_csv("/content/tmp/csv_files/test/test_credit_bureau_b_2.csv", low_memory=False)
test_credit_bureau_b_2 = set_table_dtypes_pandas(test_credit_bureau_b_2)

# Merging all the tables together on 'case_id'
# Build the submission frame by left-joining the test feature tables onto the
# *test* base table. The join previously started from df_train_base, whose
# case_ids belong to the training set, so the test features would not line up.
# NOTE(review): selected_static_cols, selected_static_cb_cols,
# test_person_1_agg, test_person_1_feats_2 and test_credit_bureau_b_2_feats
# are not defined in the code shown here — confirm they are built beforehand.
data_submission = df_test_base.merge(
    test_static_0_0[['case_id'] + selected_static_cols], how='left', on='case_id'
).merge(
    test_static_cb[['case_id'] + selected_static_cb_cols], how='left', on='case_id'
).merge(
    test_person_1_agg, how='left', on='case_id'
).merge(
    test_person_1_feats_2, how='left', on='case_id'
).merge(
    test_credit_bureau_b_2_feats, how='left', on='case_id'
)

# We can use LGB model to train
# Wrap the train / validation splits as LightGBM datasets; the validation set
# references the training set.
# NOTE(review): X_train, y_train, X_valid and y_valid are not defined in the
# code shown here — presumably produced by train_test_split; confirm.
lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_valid = lgb.Dataset(X_valid, label=y_valid, reference=lgb_train)

params = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    "max_depth": 3,
    "num_leaves": 31,
    "learning_rate": 0.05,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "verbose": -1,
}

# The number of boosting rounds is passed through lgb.train's own
# num_boost_round argument rather than as an "n_estimators" entry in params.
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=1000,
    valid_sets=[lgb_valid],
    # Log the validation metric every 50 rounds; stop once it has not
    # improved for 5 consecutive rounds.
    callbacks=[lgb.log_evaluation(50), lgb.early_stopping(5)],
)

# This serves as a starting point for reference as I continue refining my
# final model. More details on the model will be shared soon.





