#11

Kaggle competition: [link]

Entry by Robin R.P.M. Kras

robkras.com

⭐ 1. Introduction & Overview

Your Goal: Predict the likelihood of loan payback.
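The target is the binary column loan_paid_back, and the models in this notebook are compared with ROC-AUC on predicted probabilities. A toy sketch of that contract (the submission layout matches the file written at the end of the notebook):

from sklearn.metrics import roc_auc_score

# predictions are probabilities in [0, 1]; ROC-AUC only cares about their ordering
y_true = [1, 0, 1, 1]
y_prob = [0.9, 0.2, 0.7, 0.4]
print(roc_auc_score(y_true, y_prob))  # 1.0 here: every positive outranks the single negative

# a submission pairs each test id with a predicted probability, e.g.
# pd.DataFrame({'id': test['id'], 'loan_paid_back': y_probs}).to_csv('submission.csv', index=False)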

🔹 2. Import Libraries & Set Up

In [5]:
# !/usr/bin/env python3
# -*- coding: utf-8 -*-

# =====================
# General utilities
# =====================
import json
import os
import pickle
import time
from collections import Counter
import scipy

# =====================
# Data handling & processing
# =====================
import numpy as np
import pandas as pd
from tqdm import tqdm

# =====================
# Visualization
# =====================
import matplotlib.pyplot as plt
import seaborn as sns

# =====================
# Machine Learning - Core scikit-learn
# =====================
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, ElasticNet
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score,
    mean_absolute_error, mean_squared_error, r2_score,
    root_mean_squared_error, roc_auc_score
)
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler, RobustScaler
from sklearn.svm import SVC, SVR

# =====================
# Machine Learning - Tree Boosting & advanced
# =====================
import xgboost as xg
import lightgbm as lgb
import catboost

# =====================
# Deep Learning - TensorFlow / Keras
# =====================
import tensorflow as tf
from keras import regularizers
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import Dense, Dropout
from keras.models import Sequential
from keras.optimizers import Adam

# =====================
# Deep Learning - PyTorch
# =====================
# import torch
# import torch.nn as nn
# import torch.nn.functional as F
# import torch.optim as optim

# =====================
# Imbalanced data handling
# =====================
from imblearn.over_sampling import SMOTE

# =====================
# Optimization / AutoML
# =====================
import optuna

# =====================
# Feature importance & explainability
# =====================
import shap

# =====================
# Self Made Utilities
# =====================
from utils import *

# =====================
# Settings & reproducibility
# =====================
import warnings
warnings.filterwarnings("ignore")

SEED = 42
np.random.seed(SEED)

print("Libraries successfully loaded. Ready to go!")
Libraries successfully loaded. Ready to go!
In [6]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
In [7]:
train.head()
Out[7]:
|   | id | annual_income | debt_to_income_ratio | credit_score | loan_amount | interest_rate | gender | marital_status | education_level | employment_status | loan_purpose | grade_subgrade | loan_paid_back |
|---|----|---------------|----------------------|--------------|-------------|---------------|--------|----------------|-----------------|-------------------|--------------|----------------|----------------|
| 0 | 0 | 29367.99 | 0.084 | 736 | 2528.42  | 13.67 | Female | Single  | High School | Self-employed | Other              | C3 | 1.0 |
| 1 | 1 | 22108.02 | 0.166 | 636 | 4593.10  | 12.92 | Male   | Married | Master's    | Employed      | Debt consolidation | D3 | 0.0 |
| 2 | 2 | 49566.20 | 0.097 | 694 | 17005.15 | 9.76  | Male   | Single  | High School | Employed      | Debt consolidation | C5 | 1.0 |
| 3 | 3 | 46858.25 | 0.065 | 533 | 4682.48  | 16.10 | Female | Single  | High School | Employed      | Debt consolidation | F1 | 1.0 |
| 4 | 4 | 25496.70 | 0.053 | 665 | 12184.43 | 10.21 | Male   | Married | High School | Employed      | Other              | D1 | 1.0 |
In [8]:
numerical_features = train.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = train.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()
features = train.columns.tolist()

target_col = 'loan_paid_back'
In [9]:
print("Numerical features:", numerical_features)
print("Categorical features:", categorical_features)
Numerical features: ['id', 'annual_income', 'debt_to_income_ratio', 'credit_score', 'loan_amount', 'interest_rate', 'loan_paid_back']
Categorical features: ['gender', 'marital_status', 'education_level', 'employment_status', 'loan_purpose', 'grade_subgrade']

🔹 3. Data Exploration

In [10]:
plot_cats(train)
[figure: output of plot_cats (categorical feature plots)]
In [11]:
plot_nums(train)
[figure: output of plot_nums (numerical feature distributions)]
In [12]:
def plot_bool(dataframe):
    bool_features = dataframe.select_dtypes(include=['bool']).columns
    num_features = len(bool_features)
    if num_features == 0:
        print("No boolean features to plot.")
        return

    n_cols = 3
    n_rows = (num_features + n_cols - 1) // n_cols

    plt.figure(figsize=(5 * n_cols, 4 * n_rows))

    for i, feature in enumerate(bool_features):
        plt.subplot(n_rows, n_cols, i + 1)
        sns.countplot(x=feature, data=dataframe)
        plt.title(f'Count Plot of {feature}')
        plt.xlabel(feature)
        plt.ylabel('Count')

    plt.tight_layout()
    plt.show()
In [13]:
plot_bool(train)

The train set contains no boolean columns, so plot_bool has nothing to draw here; the categorical and numerical distributions above all look well balanced.
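A quick numeric check backs up that impression; a small sketch using the train frame loaded above:

# class share of the target and of every categorical feature
print(train['loan_paid_back'].value_counts(normalize=True).round(3))

for col in train.select_dtypes(include=['object', 'category', 'bool']).columns:
    print(f"\n{col}:")
    print(train[col].value_counts(normalize=True).round(3))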

🔹 4. Feature Engineering

In [14]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
In [15]:
def create_enhanced_features(df):
    """
    Create interaction features based on loan-domain knowledge.
    These capture interactions between income, debt, credit score and the loan terms.
    """
    df_enhanced = df.copy()
    
    # Based on common loan domain knowledge:
    # 1. Credit utilization
    if 'credit_score' in df.columns and 'debt_to_income_ratio' in df.columns:
        df_enhanced['credit_risk_score'] = df['credit_score'] * (1 - df['debt_to_income_ratio'])
    
    # 2. Loan burden ratio
    if 'loan_amount' in df.columns and 'annual_income' in df.columns:
        df_enhanced['loan_to_income'] = df['loan_amount'] / (df['annual_income'] + 1)
    
    # 3. Payment affordability
    if 'interest_rate' in df.columns and 'loan_amount' in df.columns:
        df_enhanced['total_interest_burden'] = df['loan_amount'] * df['interest_rate']
    
    # 4. Risk tier from subgrade
    if 'grade_subgrade' in df.columns:
        grade_map = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7}
        df_enhanced['grade_numeric'] = df['grade_subgrade'].str[0].map(grade_map)
        df_enhanced['subgrade_numeric'] = df['grade_subgrade'].str[1].astype(int)
        df_enhanced['risk_score'] = df_enhanced['grade_numeric'] * 5 + df_enhanced['subgrade_numeric']
        df_enhanced.drop(columns=['grade_numeric', 'subgrade_numeric'], inplace=True)
    
    return df_enhanced

# Apply enhancement
train_enhanced = create_enhanced_features(train)
test_enhanced = create_enhanced_features(test)
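A quick look at the engineered columns confirms they came out as intended (column names taken from the function above). Note that the encoding cell below starts again from the raw train/test frames, so to actually train on these features, train_enhanced and test_enhanced would have to be substituted there:

new_cols = ['credit_risk_score', 'loan_to_income', 'total_interest_burden', 'risk_score']
print(train_enhanced[new_cols].describe().T)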
In [16]:
numerical_features = train.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = train.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()
features = train.columns.tolist()

target_col = 'loan_paid_back'
In [17]:
# Combine for consistent encoding
combined = pd.concat([train[categorical_features], test[categorical_features]], axis=0)

# Encode all at once
combined_ohe = pd.get_dummies(combined, drop_first=True)

# Split back to train/test
ohe_train = combined_ohe.iloc[:len(train), :].reset_index(drop=True)
ohe_test  = combined_ohe.iloc[len(train):, :].reset_index(drop=True)

# Final datasets
train_encoded = pd.concat([train[numerical_features].reset_index(drop=True), ohe_train], axis=1)

numerical_features.remove('loan_paid_back')  # test has no target column, so drop it before building test_encoded
test_encoded  = pd.concat([test[numerical_features].reset_index(drop=True),  ohe_test],  axis=1)
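Because train and test are one-hot encoded together and only split apart afterwards, their dummy columns are guaranteed to line up; a cheap assertion over the frames built above makes that explicit:

feature_cols = [c for c in train_encoded.columns if c != 'loan_paid_back']
assert list(test_encoded.columns) == feature_cols, "train/test feature columns are misaligned"
print(train_encoded.shape, test_encoded.shape)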

🔹 5. Modelling

In [18]:
X = train_encoded.drop(columns=['id', 'loan_paid_back'], axis=1)
y = train_encoded['loan_paid_back']

X_test = test_encoded.drop(columns=['id'], axis=1)
In [ ]:
def objective(trial):
    """Optuna objective function for LightGBM hyperparameter optimization"""
    
    # Hyperparameters to optimize
    params = {
        'n_estimators': 20000,  # Will use early stopping
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 31, 255),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 10.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 10.0),
        'random_state': 42,
        'verbosity': -1,
        'device': 'gpu',
        'gpu_platform_id': 0,
        'gpu_device_id': 0
    }
    
    # Cross-validation
    n_splits = 5  # Using 5 folds for faster optimization
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    
    oof_preds = np.zeros(len(X))
    
    for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]
        
        model = lgb.LGBMClassifier(**params)
        
        model.fit(
            X_train, 
            y_train, 
            eval_set=[(X_val, y_val)], 
            eval_metric="auc",
            callbacks=[
                lgb.early_stopping(100, verbose=False),
            ]
        )
        
        oof_preds[val_idx] = model.predict_proba(X_val)[:, 1]
    
    cv_auc = roc_auc_score(y, oof_preds)
    return cv_auc


# Create and run the study
print("Starting Optuna hyperparameter optimization...")
print("This will take some time depending on n_trials...\n")

study = optuna.create_study(
    direction='maximize',
    study_name='lightgbm_optimization',
    sampler=optuna.samplers.TPESampler(seed=42)
)

study.optimize(
    objective, 
    n_trials=50,  # Adjust based on your time budget
    show_progress_bar=True
)

# Print results
print("\n" + "="*60)
print("OPTIMIZATION COMPLETE")
print("="*60)
print(f"\nBest CV ROC-AUC: {study.best_value:.6f}")
print(f"\nBest hyperparameters:")
for key, value in study.best_params.items():
    print(f"  {key}: {value}")
[I 2025-11-27 19:35:33,468] A new study created in memory with name: lightgbm_optimization
Starting Optuna hyperparameter optimization...
This will take some time depending on n_trials...

  0%|          | 0/50 [00:00<?, ?it/s]
[I 2025-11-27 19:42:04,675] Trial 0 finished with value: 0.9235066856773901 and parameters: {'learning_rate': 0.023688639503640783, 'num_leaves': 244, 'max_depth': 12, 'min_child_samples': 62, 'subsample': 0.5780093202212182, 'colsample_bytree': 0.40919616423534183, 'reg_alpha': 0.5808361216819946, 'reg_lambda': 8.661761457749352}. Best is trial 0 with value: 0.9235066856773901.
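The final cross-validation run below uses hand-picked values; once the study has finished, study.best_params can be reused instead of retyping them. A minimal sketch, re-adding the fixed settings that were not part of the search space:

best_params = dict(study.best_params)
best_params.update({
    'n_estimators': 20000,   # early stopping picks the effective number of trees
    'random_state': 42,
    'verbosity': -1,
})
final_lgbm = lgb.LGBMClassifier(**best_params)  # then fit fold-by-fold as in the next cell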
In [ ]:
n_splits = 10
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

y_probs_lgbm = np.zeros(len(X_test))
oof_preds = np.zeros(len(X))
models = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    print(f"\nTraining fold {fold + 1}/{n_splits} >>>")
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    lightgbm = lgb.LGBMClassifier(
        n_estimators=20000,
        learning_rate=0.06,
        num_leaves=100,
        max_depth=10,
        min_child_samples=9,
        subsample=0.8,
        colsample_bytree=0.5,
        reg_alpha=0.78,
        reg_lambda=3.0,
        random_state=42,
        verbosity=-1,
        device="gpu",
        gpu_platform_id=0,
        gpu_device_id=0
    )

    lightgbm.fit(
        X_train, 
        y_train, 
        eval_set=[(X_val, y_val)], 
        eval_metric="auc",
        callbacks=[
            lgb.early_stopping(100),
            lgb.log_evaluation(period=500)
        ]
    )

    oof_preds[val_idx] = lightgbm.predict_proba(X_val)[:, 1]
    y_probs_lgbm += lightgbm.predict_proba(X_test)[:, 1] / n_splits
    models.append(lightgbm)

cv_auc = roc_auc_score(y, oof_preds)
print(f"\nCV ROC-AUC: {cv_auc:.4f}")
Training fold 1/10 >>>
Training until validation scores don't improve for 100 rounds
[500]	valid_0's auc: 0.92405	valid_0's binary_logloss: 0.243891
Early stopping, best iteration is:
[435]	valid_0's auc: 0.924072	valid_0's binary_logloss: 0.243883

Training fold 2/10 >>>
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[367]	valid_0's auc: 0.923904	valid_0's binary_logloss: 0.242444

Training fold 3/10 >>>
Training until validation scores don't improve for 100 rounds
[500]	valid_0's auc: 0.924101	valid_0's binary_logloss: 0.243216
Early stopping, best iteration is:
[533]	valid_0's auc: 0.924159	valid_0's binary_logloss: 0.243149

Training fold 4/10 >>>
Training until validation scores don't improve for 100 rounds
[500]	valid_0's auc: 0.924643	valid_0's binary_logloss: 0.241827
Early stopping, best iteration is:
[634]	valid_0's auc: 0.924762	valid_0's binary_logloss: 0.241688

Training fold 5/10 >>>
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[381]	valid_0's auc: 0.923735	valid_0's binary_logloss: 0.240376

Training fold 6/10 >>>
Training until validation scores don't improve for 100 rounds
[500]	valid_0's auc: 0.923851	valid_0's binary_logloss: 0.240405
Early stopping, best iteration is:
[559]	valid_0's auc: 0.923908	valid_0's binary_logloss: 0.240372

Training fold 7/10 >>>
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[336]	valid_0's auc: 0.920027	valid_0's binary_logloss: 0.246426

Training fold 8/10 >>>
Training until validation scores don't improve for 100 rounds
[500]	valid_0's auc: 0.922635	valid_0's binary_logloss: 0.244698
Early stopping, best iteration is:
[513]	valid_0's auc: 0.922654	valid_0's binary_logloss: 0.24467

Training fold 9/10 >>>
Training until validation scores don't improve for 100 rounds
[500]	valid_0's auc: 0.923948	valid_0's binary_logloss: 0.242522
Early stopping, best iteration is:
[637]	valid_0's auc: 0.924046	valid_0's binary_logloss: 0.242414

Training fold 10/10 >>>
Training until validation scores don't improve for 100 rounds
[500]	valid_0's auc: 0.923369	valid_0's binary_logloss: 0.243848
Early stopping, best iteration is:
[425]	valid_0's auc: 0.923433	valid_0's binary_logloss: 0.243792

CV ROC-AUC: 0.9235
In [ ]:
output_lgbm = pd.DataFrame({
    'id': test.id,
    'loan_paid_back': y_probs_lgbm
})

output_lgbm.to_csv('attempt-lightgbm2.csv', index=False)
print("Your submission was successfully saved!")
Your submission was successfully saved!
In [ ]:
# Load both submissions
sub1 = pd.read_csv('attempt-lightgbm2.csv')
sub2 = pd.read_csv('best-public.csv')

# Simple average
ensemble = sub1.copy()
ensemble.iloc[:, 1:] = (sub1.iloc[:, 1:] + sub2.iloc[:, 1:]) / 2
ensemble.to_csv('ensemble_avg.csv', index=False)
In [ ]:
import pandas as pd

sub1 = pd.read_csv('attempt-lightgbm1.csv')
sub2 = pd.read_csv('best-public.csv')

# Sweep over blend weights, writing one candidate submission per weighting;
# the candidates can then be compared, e.g. on the public leaderboard.
for w1 in [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]:
    w2 = 1.0 - w1
    ensemble = sub1.copy()
    ensemble.iloc[:, 1:] = w1 * sub1.iloc[:, 1:] + w2 * sub2.iloc[:, 1:]
    ensemble.to_csv(f'ensemble_w{w1:.1f}.csv', index=False)
    print(f"Created ensemble with weights: {w1:.1f} / {w2:.1f}")
Created ensemble with weights: 0.0 / 1.0
Created ensemble with weights: 0.1 / 0.9
Created ensemble with weights: 0.2 / 0.8
Created ensemble with weights: 0.3 / 0.7
Created ensemble with weights: 0.4 / 0.6
Created ensemble with weights: 0.5 / 0.5
Created ensemble with weights: 0.6 / 0.4
Created ensemble with weights: 0.7 / 0.3
Created ensemble with weights: 0.8 / 0.2
Created ensemble with weights: 0.9 / 0.1
Created ensemble with weights: 1.0 / 0.0
In [ ]:
sub1 = pd.read_csv('attempt-lightgbm1.csv')
sub2 = pd.read_csv('best-public.csv')

ensemble = sub1.copy()
for col in sub1.columns[1:]:  
    rank1 = sub1[col].rank(pct=True)
    rank2 = sub2[col].rank(pct=True)
    ensemble[col] = (rank1 + rank2) / 2

ensemble.to_csv('ensemble_rank_avg.csv', index=False)
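Averaging percentile ranks rather than raw probabilities makes the blend insensitive to calibration differences between the two submissions; since ROC-AUC depends only on the ordering of the predictions, and the ranked values stay within [0, 1], this is a safe way to combine models whose probability scales differ.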