⭐ 1. Introduction & Overview
Your goal: predict the likelihood that a loan will be paid back (the binary target loan_paid_back).
🔹 2. Import Libraries & Set Up
In [5]:
# !/usr/bin/env python3
# -*- coding: utf-8 -*-
# =====================
# General utilities
# =====================
import json
import os
import pickle
import time
from collections import Counter
import scipy
# =====================
# Data handling & processing
# =====================
import numpy as np
import pandas as pd
from tqdm import tqdm
# =====================
# Visualization
# =====================
import matplotlib.pyplot as plt
import seaborn as sns
# =====================
# Machine Learning - Core scikit-learn
# =====================
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, ElasticNet
from sklearn.metrics import (
accuracy_score, f1_score, precision_score, recall_score,
mean_absolute_error, mean_squared_error, r2_score,
root_mean_squared_error, roc_auc_score
)
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler, RobustScaler
from sklearn.svm import SVC, SVR
# =====================
# Machine Learning - Tree Boosting & advanced
# =====================
import xgboost as xg
import lightgbm as lgb
import catboost
# =====================
# Deep Learning - TensorFlow / Keras
# =====================
import tensorflow as tf
from keras import regularizers
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import Dense, Dropout
from keras.models import Sequential
from keras.optimizers import Adam
# =====================
# Deep Learning - PyTorch
# =====================
# import torch
# import torch.nn as nn
# import torch.nn.functional as F
# import torch.optim as optim
# =====================
# Imbalanced data handling
# =====================
from imblearn.over_sampling import SMOTE
# =====================
# Optimization / AutoML
# =====================
import optuna
# =====================
# Feature importance & explainability
# =====================
import shap
# =====================
# Self Made Utilities
# =====================
from utils import *
# =====================
# Settings & reproducibility
# =====================
import warnings
warnings.filterwarnings("ignore")
SEED = 42
np.random.seed(SEED)
print("Libraries successfully loaded. Ready to go!")
Libraries successfully loaded. Ready to go!
In [6]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
In [7]:
train.head()
Out[7]:
| | id | annual_income | debt_to_income_ratio | credit_score | loan_amount | interest_rate | gender | marital_status | education_level | employment_status | loan_purpose | grade_subgrade | loan_paid_back |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 29367.99 | 0.084 | 736 | 2528.42 | 13.67 | Female | Single | High School | Self-employed | Other | C3 | 1.0 |
| 1 | 1 | 22108.02 | 0.166 | 636 | 4593.10 | 12.92 | Male | Married | Master's | Employed | Debt consolidation | D3 | 0.0 |
| 2 | 2 | 49566.20 | 0.097 | 694 | 17005.15 | 9.76 | Male | Single | High School | Employed | Debt consolidation | C5 | 1.0 |
| 3 | 3 | 46858.25 | 0.065 | 533 | 4682.48 | 16.10 | Female | Single | High School | Employed | Debt consolidation | F1 | 1.0 |
| 4 | 4 | 25496.70 | 0.053 | 665 | 12184.43 | 10.21 | Male | Married | High School | Employed | Other | D1 | 1.0 |
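Before plotting anything, a quick structural overview confirms the column dtypes and whether any values are missing; a minimal sketch:
In [ ]:
# Shapes, dtypes and missing-value counts for the raw frames
print("Train:", train.shape, "| Test:", test.shape)
print("Missing values in train:", train.isnull().sum().sum())
train.info()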
In [8]:
numerical_features = train.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = train.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()
features = train.columns.tolist()
target_col = 'loan_paid_back'
In [9]:
print("Numerical features:", numerical_features)
print("Categorical features:", categorical_features)
Numerical features: ['id', 'annual_income', 'debt_to_income_ratio', 'credit_score', 'loan_amount', 'interest_rate', 'loan_paid_back']
Categorical features: ['gender', 'marital_status', 'education_level', 'employment_status', 'loan_purpose', 'grade_subgrade']
🔹 3. Data Exploration
In [10]:
plot_cats(train)
In [11]:
plot_nums(train)
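plot_cats and plot_nums come from the local utils module (imported above via from utils import *) and their source is not shown in this notebook. A minimal sketch of what they plausibly do, assuming they mirror the plot_bool helper defined below; the bodies here are assumptions, not the actual utils implementation:
In [ ]:
# Hypothetical reimplementation of the utils plotting helpers (assumed behaviour)
def plot_cats(dataframe):
    cat_features = dataframe.select_dtypes(include=['object', 'category']).columns
    n_cols = 3
    n_rows = (len(cat_features) + n_cols - 1) // n_cols
    plt.figure(figsize=(5 * n_cols, 4 * n_rows))
    for i, feature in enumerate(cat_features):
        plt.subplot(n_rows, n_cols, i + 1)
        sns.countplot(x=feature, data=dataframe)
        plt.xticks(rotation=45)
        plt.title(f'Count Plot of {feature}')
    plt.tight_layout()
    plt.show()

def plot_nums(dataframe):
    num_features = dataframe.select_dtypes(include=['int64', 'float64']).columns
    n_cols = 3
    n_rows = (len(num_features) + n_cols - 1) // n_cols
    plt.figure(figsize=(5 * n_cols, 4 * n_rows))
    for i, feature in enumerate(num_features):
        plt.subplot(n_rows, n_cols, i + 1)
        sns.histplot(dataframe[feature], bins=50, kde=True)
        plt.title(f'Distribution of {feature}')
    plt.tight_layout()
    plt.show()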
In [12]:
def plot_bool(dataframe):
    bool_features = dataframe.select_dtypes(include=['bool']).columns
    num_features = len(bool_features)
    n_cols = 3
    n_rows = (num_features + n_cols - 1) // n_cols
    # If the frame has no bool columns, n_rows is 0 and this produces an empty figure
    plt.figure(figsize=(5 * n_cols, 4 * n_rows))
    for i, feature in enumerate(bool_features):
        plt.subplot(n_rows, n_cols, i + 1)
        sns.countplot(x=feature, data=dataframe)
        plt.title(f'Count Plot of {feature}')
        plt.xlabel(feature)
        plt.ylabel('Count')
    plt.tight_layout()
    plt.show()
In [13]:
plot_bool(train)
<Figure size 1500x0 with 0 Axes>
This dataset has no boolean columns, so the cell above produces an empty figure. The categorical features plotted earlier all show reasonable, fairly balanced distributions.
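The balance of the target itself is worth checking explicitly as well; a minimal check:
In [ ]:
# Share of each class in the training target
print(train['loan_paid_back'].value_counts(normalize=True).round(3))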
🔹 4. Feature Engineering
In [14]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
In [15]:
def create_enhanced_features(df):
    """
    Create interaction features based on loan domain knowledge.
    These capture relationships between income, debt, loan size and credit grade.
    """
    df_enhanced = df.copy()
    # 1. Credit utilization: high credit score combined with low debt-to-income
    if 'credit_score' in df.columns and 'debt_to_income_ratio' in df.columns:
        df_enhanced['credit_risk_score'] = df['credit_score'] * (1 - df['debt_to_income_ratio'])
    # 2. Loan burden ratio
    if 'loan_amount' in df.columns and 'annual_income' in df.columns:
        df_enhanced['loan_to_income'] = df['loan_amount'] / (df['annual_income'] + 1)
    # 3. Payment affordability
    if 'interest_rate' in df.columns and 'loan_amount' in df.columns:
        df_enhanced['total_interest_burden'] = df['loan_amount'] * df['interest_rate']
    # 4. Risk tier from subgrade (A1 = lowest risk, G5 = highest)
    if 'grade_subgrade' in df.columns:
        grade_map = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7}
        df_enhanced['grade_numeric'] = df['grade_subgrade'].str[0].map(grade_map)
        df_enhanced['subgrade_numeric'] = df['grade_subgrade'].str[1].astype(int)
        df_enhanced['risk_score'] = df_enhanced['grade_numeric'] * 5 + df_enhanced['subgrade_numeric']
        df_enhanced.drop(columns=['grade_numeric', 'subgrade_numeric'], inplace=True)
    return df_enhanced
# Apply enhancement
train_enhanced = create_enhanced_features(train)
test_enhanced = create_enhanced_features(test)
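A quick sanity check that the engineered columns came out numeric and NaN-free (a minimal sketch). Note that the encoding cells below start again from the original train/test frames, so as written train_enhanced and test_enhanced do not feed the final model.
In [ ]:
# Inspect the newly created interaction features
new_cols = ['credit_risk_score', 'loan_to_income', 'total_interest_burden', 'risk_score']
print(train_enhanced[new_cols].describe().round(3))
print("NaNs per engineered column:")
print(train_enhanced[new_cols].isnull().sum())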
In [16]:
numerical_features = train.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = train.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()
features = train.columns.tolist()
target_col = 'loan_paid_back'
In [17]:
# Combine for consistent encoding
combined = pd.concat([train[categorical_features], test[categorical_features]], axis=0)
# Encode all at once
combined_ohe = pd.get_dummies(combined, drop_first=True)
# Split back to train/test
ohe_train = combined_ohe.iloc[:len(train), :].reset_index(drop=True)
ohe_test = combined_ohe.iloc[len(train):, :].reset_index(drop=True)
# Final datasets
train_encoded = pd.concat([train[numerical_features].reset_index(drop=True), ohe_train], axis=1)
numerical_features.remove('loan_paid_back')  # the target only exists in train
test_encoded = pd.concat([test[numerical_features].reset_index(drop=True), ohe_test], axis=1)
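Because the dummies were built on the combined frame, train and test should now expose exactly the same feature columns; a minimal check:
In [ ]:
# Verify that the encoded train/test columns match (apart from the target)
feature_cols = set(train_encoded.columns) - {'loan_paid_back'}
assert feature_cols == set(test_encoded.columns), "train/test encoded columns differ"
print("Train encoded:", train_encoded.shape, "| Test encoded:", test_encoded.shape)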
🔹 5. Modelling
In [18]:
X = train_encoded.drop(columns=['id', 'loan_paid_back'])
y = train_encoded['loan_paid_back']
X_test = test_encoded.drop(columns=['id'])
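Before spending hours on Optuna, a cheap baseline gives a reference point for the tuned scores; a minimal sketch using the already imported LogisticRegression (this scaled-pipeline baseline is an addition here, not part of the original workflow):
In [ ]:
# Quick linear baseline: scaled features + logistic regression, 5-fold ROC-AUC
baseline = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
baseline_scores = cross_val_score(baseline, X, y, cv=5, scoring='roc_auc')
print(f"Logistic regression baseline ROC-AUC: {baseline_scores.mean():.4f} +/- {baseline_scores.std():.4f}")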
In [ ]:
def objective(trial):
    """Optuna objective function for LightGBM hyperparameter optimization"""
    # Hyperparameters to optimize
    params = {
        'n_estimators': 20000,  # Will use early stopping
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 31, 255),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 10.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 10.0),
        'random_state': 42,
        'verbosity': -1,
        'device': 'gpu',
        'gpu_platform_id': 0,
        'gpu_device_id': 0
    }
    # Cross-validation
    n_splits = 5  # Using 5 folds for faster optimization
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    oof_preds = np.zeros(len(X))
    for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]
        model = lgb.LGBMClassifier(**params)
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_val, y_val)],
            eval_metric="auc",
            callbacks=[
                lgb.early_stopping(100, verbose=False),
            ]
        )
        oof_preds[val_idx] = model.predict_proba(X_val)[:, 1]
    cv_auc = roc_auc_score(y, oof_preds)
    return cv_auc

# Create and run the study
print("Starting Optuna hyperparameter optimization...")
print("This will take some time depending on n_trials...\n")
study = optuna.create_study(
    direction='maximize',
    study_name='lightgbm_optimization',
    sampler=optuna.samplers.TPESampler(seed=42)
)
study.optimize(
    objective,
    n_trials=50,  # Adjust based on your time budget
    show_progress_bar=True
)

# Print results
print("\n" + "="*60)
print("OPTIMIZATION COMPLETE")
print("="*60)
print(f"\nBest CV ROC-AUC: {study.best_value:.6f}")
print("\nBest hyperparameters:")
for key, value in study.best_params.items():
    print(f"  {key}: {value}")
[I 2025-11-27 19:35:33,468] A new study created in memory with name: lightgbm_optimization
Starting Optuna hyperparameter optimization...
This will take some time depending on n_trials...
[I 2025-11-27 19:42:04,675] Trial 0 finished with value: 0.9235066856773901 and parameters: {'learning_rate': 0.023688639503640783, 'num_leaves': 244, 'max_depth': 12, 'min_child_samples': 62, 'subsample': 0.5780093202212182, 'colsample_bytree': 0.40919616423534183, 'reg_alpha': 0.5808361216819946, 'reg_lambda': 8.661761457749352}. Best is trial 0 with value: 0.9235066856773901.
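Once the study finishes (or is interrupted), its trial history and parameter importances can be inspected; a minimal sketch (the importance plot needs the optional plotly dependency):
In [ ]:
# Best trials so far and which hyperparameters mattered most
trials_df = study.trials_dataframe()
print(trials_df[['number', 'value', 'state']].sort_values('value', ascending=False).head())
optuna.visualization.plot_param_importances(study).show()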
In [ ]:
n_splits = 10
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
y_probs_lgbm = np.zeros(len(X_test))
oof_preds = np.zeros(len(X))
models = []
for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    print(f"\nTraining fold {fold + 1}/{n_splits} >>>")
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]
    lightgbm = lgb.LGBMClassifier(
        n_estimators=20000,
        learning_rate=0.06,
        num_leaves=100,
        max_depth=10,
        min_child_samples=9,
        subsample=0.8,
        colsample_bytree=0.5,
        reg_alpha=0.78,
        reg_lambda=3.0,
        random_state=42,
        verbosity=-1,
        device="gpu",
        gpu_platform_id=0,
        gpu_device_id=0
    )
    lightgbm.fit(
        X_train,
        y_train,
        eval_set=[(X_val, y_val)],
        eval_metric="auc",
        callbacks=[
            lgb.early_stopping(100),
            lgb.log_evaluation(period=500)
        ]
    )
    # Out-of-fold predictions for CV scoring, plus fold-averaged test predictions
    oof_preds[val_idx] = lightgbm.predict_proba(X_val)[:, 1]
    y_probs_lgbm += lightgbm.predict_proba(X_test)[:, 1] / n_splits
    models.append(lightgbm)
cv_auc = roc_auc_score(y, oof_preds)
print(f"\nCV ROC-AUC: {cv_auc:.4f}")
Training fold 1/10 >>>
Training until validation scores don't improve for 100 rounds
[500]	valid_0's auc: 0.92405	valid_0's binary_logloss: 0.243891
Early stopping, best iteration is:
[435]	valid_0's auc: 0.924072	valid_0's binary_logloss: 0.243883

Training fold 2/10 >>>
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[367]	valid_0's auc: 0.923904	valid_0's binary_logloss: 0.242444

Training fold 3/10 >>>
Training until validation scores don't improve for 100 rounds
[500]	valid_0's auc: 0.924101	valid_0's binary_logloss: 0.243216
Early stopping, best iteration is:
[533]	valid_0's auc: 0.924159	valid_0's binary_logloss: 0.243149

Training fold 4/10 >>>
Training until validation scores don't improve for 100 rounds
[500]	valid_0's auc: 0.924643	valid_0's binary_logloss: 0.241827
Early stopping, best iteration is:
[634]	valid_0's auc: 0.924762	valid_0's binary_logloss: 0.241688

Training fold 5/10 >>>
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[381]	valid_0's auc: 0.923735	valid_0's binary_logloss: 0.240376

Training fold 6/10 >>>
Training until validation scores don't improve for 100 rounds
[500]	valid_0's auc: 0.923851	valid_0's binary_logloss: 0.240405
Early stopping, best iteration is:
[559]	valid_0's auc: 0.923908	valid_0's binary_logloss: 0.240372

Training fold 7/10 >>>
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[336]	valid_0's auc: 0.920027	valid_0's binary_logloss: 0.246426

Training fold 8/10 >>>
Training until validation scores don't improve for 100 rounds
[500]	valid_0's auc: 0.922635	valid_0's binary_logloss: 0.244698
Early stopping, best iteration is:
[513]	valid_0's auc: 0.922654	valid_0's binary_logloss: 0.24467

Training fold 9/10 >>>
Training until validation scores don't improve for 100 rounds
[500]	valid_0's auc: 0.923948	valid_0's binary_logloss: 0.242522
Early stopping, best iteration is:
[637]	valid_0's auc: 0.924046	valid_0's binary_logloss: 0.242414

Training fold 10/10 >>>
Training until validation scores don't improve for 100 rounds
[500]	valid_0's auc: 0.923369	valid_0's binary_logloss: 0.243848
Early stopping, best iteration is:
[425]	valid_0's auc: 0.923433	valid_0's binary_logloss: 0.243792

CV ROC-AUC: 0.9235
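With the ten fold models in hand, averaging their split-based feature importances gives a rough picture of which columns drive the predictions; a minimal sketch:
In [ ]:
# Mean feature importance across the CV fold models
importances = pd.DataFrame({
    'feature': X.columns,
    'importance': np.mean([m.feature_importances_ for m in models], axis=0),
}).sort_values('importance', ascending=False)
print(importances.head(15))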
In [ ]:
output_lgbm = pd.DataFrame({
'id': test.id,
'loan_paid_back': y_probs_lgbm
})
output_lgbm.to_csv('attempt-lightgbm2.csv', index=False)
print("Your submission was successfully saved!")
Your submission was successfully saved!
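A quick look at the file before uploading catches shape or range problems early; a minimal check:
In [ ]:
# Sanity-check the saved submission
sub_check = pd.read_csv('attempt-lightgbm2.csv')
print(sub_check.shape)
print(sub_check['loan_paid_back'].describe())
assert sub_check['loan_paid_back'].between(0, 1).all(), "probabilities outside [0, 1]"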
In [ ]:
# Load both submissions
sub1 = pd.read_csv('attempt-lightgbm2.csv')
sub2 = pd.read_csv('best-public.csv')
# Simple average
ensemble = sub1.copy()
ensemble.iloc[:, 1:] = (sub1.iloc[:, 1:] + sub2.iloc[:, 1:]) / 2
ensemble.to_csv('ensemble_avg.csv', index=False)
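Blending helps most when the two submissions disagree at least somewhat; checking their correlation first is a cheap diagnostic (a minimal sketch, assuming best-public.csv uses the same loan_paid_back column as the notebook's own submission):
In [ ]:
# How similar are the two sets of predictions?
print("Pearson :", round(sub1['loan_paid_back'].corr(sub2['loan_paid_back']), 4))
print("Spearman:", round(sub1['loan_paid_back'].corr(sub2['loan_paid_back'], method='spearman'), 4))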
In [ ]:
import pandas as pd
sub1 = pd.read_csv('attempt-lightgbm1.csv')
sub2 = pd.read_csv('best-public.csv')
# Try different weight combinations
best_weights = []
for w1 in [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]:
    w2 = 1.0 - w1
    ensemble = sub1.copy()
    ensemble.iloc[:, 1:] = w1 * sub1.iloc[:, 1:] + w2 * sub2.iloc[:, 1:]
    ensemble.to_csv(f'ensemble_w{w1:.1f}.csv', index=False)
    print(f"Created ensemble with weights: {w1:.1f} / {w2:.1f}")
Created ensemble with weights: 0.0 / 1.0
Created ensemble with weights: 0.1 / 0.9
Created ensemble with weights: 0.2 / 0.8
Created ensemble with weights: 0.3 / 0.7
Created ensemble with weights: 0.4 / 0.6
Created ensemble with weights: 0.5 / 0.5
Created ensemble with weights: 0.6 / 0.4
Created ensemble with weights: 0.7 / 0.3
Created ensemble with weights: 0.8 / 0.2
Created ensemble with weights: 0.9 / 0.1
Created ensemble with weights: 1.0 / 0.0
In [ ]:
sub1 = pd.read_csv('attempt-lightgbm1.csv')
sub2 = pd.read_csv('best-public.csv')
ensemble = sub1.copy()
for col in sub1.columns[1:]:
    # Convert each model's scores to percentile ranks, then average the ranks
    rank1 = sub1[col].rank(pct=True)
    rank2 = sub2[col].rank(pct=True)
    ensemble[col] = (rank1 + rank2) / 2
ensemble.to_csv('ensemble_rank_avg.csv', index=False)
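Rank averaging replaces each model's raw probabilities with percentile ranks before averaging, which removes calibration differences between the two submissions; since ROC-AUC depends only on the ordering of the scores, this transformation does not hurt the metric. A minimal check that the blended values are still valid scores in [0, 1]:
In [ ]:
# Rank-averaged scores are percentile ranks, so they should stay within [0, 1]
print(ensemble['loan_paid_back'].min(), ensemble['loan_paid_back'].max())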