⭐ 1. Introduction & Overview¶
Your goal: predict whether a client will subscribe to a bank term deposit.
🔹 2. Import Libraries & Set Up¶
In [32]:
# =============================================================================
# MACHINE LEARNING LIBRARIES - SIMPLE IMPORTS
# =============================================================================
# Set environment variable for scipy array API support
import os
os.environ['SCIPY_ARRAY_API'] = '1'
# Core Data Science Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Machine Learning
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
# Gradient Boosting Libraries
import xgboost as xgb
import lightgbm as lgb
# Deep Learning
#import torch
#import torch.nn as nn
#import torch.optim as optim
#import torchvision.transforms as transforms
import tensorflow as tf
from tensorflow import keras
# Visualization
import plotly.express as px
import plotly.graph_objects as go
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import HoverTool
# Computer Vision
import cv2
# Scientific Computing & Statistics
import scipy.stats as stats
from scipy import optimize
import statsmodels.api as sm
from statsmodels.tsa.seasonal import seasonal_decompose
# Image Processing
from PIL import Image, ImageDraw, ImageFont
# Sampling and Resampling - Try import, use alternatives if failed
try:
    from imblearn.over_sampling import SMOTE
    from imblearn.under_sampling import RandomUnderSampler
    IMBLEARN_AVAILABLE = True
except ImportError:
    print("imblearn not available, using sklearn class_weight='balanced' instead")
    IMBLEARN_AVAILABLE = False
# Utilities
import sys
import warnings
import datetime
from pathlib import Path
import pickle
import json
# Configuration
plt.rcParams['figure.figsize'] = (10, 6)
sns.set_palette("husl")
warnings.filterwarnings('ignore')
SEED = 42
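SEED is defined above but never passed anywhere; a minimal reproducibility sketch (assuming NumPy and Python's random are the only global RNGs that matter here) of applying it:

import random

def seed_everything(seed: int = SEED) -> None:
    # Seed Python's and NumPy's global RNGs so reruns give identical results
    random.seed(seed)
    np.random.seed(seed)

seed_everything()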
In [33]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
bank = pd.read_csv('bank-full.csv', delimiter=';')
In [34]:
train.head()
Out[34]:
|   | id | age | job | marital | education | default | balance | housing | loan | contact | day | month | duration | campaign | pdays | previous | poutcome | y |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 42 | technician | married | secondary | no | 7 | no | no | cellular | 25 | aug | 117 | 3 | -1 | 0 | unknown | 0 |
| 1 | 1 | 38 | blue-collar | married | secondary | no | 514 | no | no | unknown | 18 | jun | 185 | 1 | -1 | 0 | unknown | 0 |
| 2 | 2 | 36 | blue-collar | married | secondary | no | 602 | yes | no | unknown | 14 | may | 111 | 2 | -1 | 0 | unknown | 0 |
| 3 | 3 | 27 | student | single | secondary | no | 34 | yes | no | unknown | 28 | may | 10 | 2 | -1 | 0 | unknown | 0 |
| 4 | 4 | 26 | technician | married | secondary | no | 889 | yes | no | cellular | 3 | feb | 902 | 1 | -1 | 0 | unknown | 1 |
In [35]:
# Enhanced Feature Engineering for Bank Marketing (non-redundant)
import numpy as np
import pandas as pd

def create_features(df):
    df = df.copy()

    # Original many_no feature (captures a risk-aversion pattern)
    def many_no(x):
        if x['default'] == 'no' and x['housing'] == 'no' and x['loan'] == 'no':
            return 21
        if (x['default'] == 'no' and x['housing'] == 'no'
                or x['default'] == 'no' and x['loan'] == 'no'
                or x['housing'] == 'no' and x['loan'] == 'no'):
            return 7
        if x['default'] == 'no' or x['housing'] == 'no' or x['loan'] == 'no':
            return 3
        return 0

    df['many_no'] = df.apply(many_no, axis=1)
    df['balance_duration'] = df['balance'] * df['duration']    # financial capacity × engagement
    df['campaign_previous'] = df['campaign'] * df['previous']  # current effort × past history
    df['age_balance'] = df['age'] * df['balance']              # life stage × wealth
    df['contact_success_ratio'] = df['previous'] / (df['campaign'] + 1)
    df['days_since_contact'] = np.where(df['pdays'] == -1, 999, df['pdays'])
    df['age_group'] = pd.cut(df['age'], bins=[0, 25, 35, 50, 65, 100],
                             labels=[0, 1, 2, 3, 4]).astype(int)
    df['balance_category'] = pd.cut(df['balance'], bins=[-np.inf, 0, 1000, 5000, np.inf],
                                    labels=[0, 1, 2, 3]).astype(int)
    return df

bank = create_features(bank)
train = create_features(train)
test = create_features(test)
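The pd.cut calls in create_features map any value outside the bin edges to NaN, which the subsequent .astype(int) would reject; a quick hedged sanity check (purely illustrative, nothing downstream depends on it) that the raw columns stay inside those edges:

# Confirm age fits the (0, 100] bins and balance is finite, so pd.cut never yields NaN
for name, df_ in [("bank", bank), ("train", train), ("test", test)]:
    ok_age = df_["age"].between(1, 100).all()
    ok_balance = np.isfinite(df_["balance"]).all()
    print(f"{name}: age within bins={ok_age}, balance finite={ok_balance}")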
In [36]:
# Prepare feature matrix (X) and target vector (y) for training
X = train.drop(["y", "id"], axis=1)
y = train["y"]
# Prepare feature matrix (X_bank) and target vector (y_bank) for training
X_bank = bank.drop(["y"], axis=1)
y_bank = bank["y"].map({'yes': 1, 'no': 0})
# Prepare feature matrix (X_test) for testing
X_test = test.drop(["id"], axis=1)
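The target in this competition is strongly imbalanced (far more "no" than "yes"), which is one reason the evaluation below relies on ROC AUC rather than accuracy; a quick look at the class ratio in both sources (a sketch, nothing here feeds the models):

# Inspect class balance in the competition training data and the original bank-full data
print("train target ratio:")
print(y.value_counts(normalize=True).round(3))
print("bank-full target ratio:")
print(y_bank.value_counts(normalize=True).round(3))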
In [37]:
object_cols = train.select_dtypes(include="object").columns
from sklearn.preprocessing import LabelEncoder

for col_name in object_cols:
    le = LabelEncoder()
    X[col_name] = le.fit_transform(X[col_name])
    X_test[col_name] = le.transform(X_test[col_name])
    X_bank[col_name] = le.transform(X_bank[col_name])
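Fitting each LabelEncoder on the training split alone will raise a ValueError if X_test or X_bank contains a category that never appears in train; a hedged alternative sketch that would replace the loop above (not run after it), fitting each encoder on the union of all three frames:

# Alternative encoding: fit on the combined categories so .transform never meets an unseen label
# (assumes the three frames share the same object columns; replaces the loop above)
for col_name in object_cols:
    le = LabelEncoder()
    combined = pd.concat([X[col_name], X_test[col_name], X_bank[col_name]], axis=0)
    le.fit(combined.astype(str))
    X[col_name] = le.transform(X[col_name].astype(str))
    X_test[col_name] = le.transform(X_test[col_name].astype(str))
    X_bank[col_name] = le.transform(X_bank[col_name].astype(str))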
In [38]:
train.head(3)
Out[38]:
|   | id | age | job | marital | education | default | balance | housing | loan | contact | ... | poutcome | y | many_no | balance_duration | campaign_previous | age_balance | contact_success_ratio | days_since_contact | age_group | balance_category |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 42 | technician | married | secondary | no | 7 | no | no | cellular | ... | unknown | 0 | 21 | 819 | 0 | 294 | 0.0 | 999 | 2 | 1 |
| 1 | 1 | 38 | blue-collar | married | secondary | no | 514 | no | no | unknown | ... | unknown | 0 | 21 | 95090 | 0 | 19532 | 0.0 | 999 | 2 | 1 |
| 2 | 2 | 36 | blue-collar | married | secondary | no | 602 | yes | no | unknown | ... | unknown | 0 | 7 | 66822 | 0 | 21672 | 0.0 | 999 | 2 | 1 |
3 rows × 26 columns
In [39]:
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
# Use 10-fold stratified cross-validation
n_splits = 10
kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
y_probs = np.zeros(len(X_test))
models = []
for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    print(f"Training fold {fold + 1}/{n_splits} >>>")
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    # Augment the training fold with the original bank-full data
    X_train = pd.concat([X_train, X_bank])
    y_train = pd.concat([y_train, y_bank])

    model = lgb.LGBMClassifier(
        n_estimators=20000,
        learning_rate=0.06,
        num_leaves=100,
        max_depth=10,
        min_child_samples=9,
        subsample=0.8,
        colsample_bytree=0.5,
        reg_alpha=0.78,
        reg_lambda=3.0,
        max_bin=4523,
        random_state=42,
        verbosity=-1
    )
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_val, y_val)],
        callbacks=[
            lgb.early_stopping(100),
            lgb.log_evaluation(period=500)
        ]
    )
    models.append(model)

    # Average predictions across all folds
    y_probs += model.predict_proba(X_test)[:, 1] / n_splits
Training fold 1/10 >>> Training until validation scores don't improve for 100 rounds [500] valid_0's binary_logloss: 0.134942 [1000] valid_0's binary_logloss: 0.131812 [1500] valid_0's binary_logloss: 0.130558 [2000] valid_0's binary_logloss: 0.12992 [2500] valid_0's binary_logloss: 0.129507 Early stopping, best iteration is: [2605] valid_0's binary_logloss: 0.129407 Training fold 2/10 >>> Training until validation scores don't improve for 100 rounds [500] valid_0's binary_logloss: 0.137124 [1000] valid_0's binary_logloss: 0.133799 [1500] valid_0's binary_logloss: 0.132358 [2000] valid_0's binary_logloss: 0.131729 [2500] valid_0's binary_logloss: 0.13127 Early stopping, best iteration is: [2558] valid_0's binary_logloss: 0.131218 Training fold 3/10 >>> Training until validation scores don't improve for 100 rounds [500] valid_0's binary_logloss: 0.138993 [1000] valid_0's binary_logloss: 0.135548 [1500] valid_0's binary_logloss: 0.134042 [2000] valid_0's binary_logloss: 0.133397 Early stopping, best iteration is: [2179] valid_0's binary_logloss: 0.133221 Training fold 4/10 >>> Training until validation scores don't improve for 100 rounds [500] valid_0's binary_logloss: 0.13742 [1000] valid_0's binary_logloss: 0.133993 [1500] valid_0's binary_logloss: 0.132662 [2000] valid_0's binary_logloss: 0.132007 [2500] valid_0's binary_logloss: 0.131704 Early stopping, best iteration is: [2619] valid_0's binary_logloss: 0.131637 Training fold 5/10 >>> Training until validation scores don't improve for 100 rounds [500] valid_0's binary_logloss: 0.139034 [1000] valid_0's binary_logloss: 0.135711 [1500] valid_0's binary_logloss: 0.134509 [2000] valid_0's binary_logloss: 0.134 Early stopping, best iteration is: [2334] valid_0's binary_logloss: 0.133784 Training fold 6/10 >>> Training until validation scores don't improve for 100 rounds [500] valid_0's binary_logloss: 0.138064 [1000] valid_0's binary_logloss: 0.134963 [1500] valid_0's binary_logloss: 0.133569 Early stopping, best iteration is: [1754] valid_0's binary_logloss: 0.133191 Training fold 7/10 >>> Training until validation scores don't improve for 100 rounds [500] valid_0's binary_logloss: 0.135646 [1000] valid_0's binary_logloss: 0.132563 [1500] valid_0's binary_logloss: 0.130898 [2000] valid_0's binary_logloss: 0.130101 Early stopping, best iteration is: [2313] valid_0's binary_logloss: 0.129904 Training fold 8/10 >>> Training until validation scores don't improve for 100 rounds [500] valid_0's binary_logloss: 0.137441 [1000] valid_0's binary_logloss: 0.134486 [1500] valid_0's binary_logloss: 0.133291 [2000] valid_0's binary_logloss: 0.132774 [2500] valid_0's binary_logloss: 0.132539 Early stopping, best iteration is: [2412] valid_0's binary_logloss: 0.13253 Training fold 9/10 >>> Training until validation scores don't improve for 100 rounds [500] valid_0's binary_logloss: 0.137212 [1000] valid_0's binary_logloss: 0.1339 [1500] valid_0's binary_logloss: 0.132606 [2000] valid_0's binary_logloss: 0.132026 Early stopping, best iteration is: [2337] valid_0's binary_logloss: 0.131833 Training fold 10/10 >>> Training until validation scores don't improve for 100 rounds [500] valid_0's binary_logloss: 0.13835 [1000] valid_0's binary_logloss: 0.134681 [1500] valid_0's binary_logloss: 0.133202 [2000] valid_0's binary_logloss: 0.132491 [2500] valid_0's binary_logloss: 0.132217 Early stopping, best iteration is: [2804] valid_0's binary_logloss: 0.132031
- [2876] valid_0's binary_logloss: 0.131302
- [2804] valid_0's binary_logloss: 0.132031
In [40]:
from sklearn.metrics import roc_auc_score
best_auc = roc_auc_score(y, model.predict_proba(X)[:, 1])
print(f"Best AUC: {best_auc:.4f}")
Best AUC: 0.9892
- original feature set: best AUC 0.9879
- with the new engineered features: best AUC 0.9892
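The 0.9892 above is computed with the last fold's model on the full training set, so it is partly in-sample and optimistic; a hedged sketch of an out-of-fold estimate that reuses the fold models stored in models (the StratifiedKFold with a fixed random_state reproduces the same splits):

# Out-of-fold AUC: each fold's model scores only the rows it never trained on
oof_preds = np.zeros(len(X))
for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    oof_preds[val_idx] = models[fold].predict_proba(X.iloc[val_idx])[:, 1]
print(f"OOF AUC: {roc_auc_score(y, oof_preds):.4f}")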
In [41]:
output = pd.DataFrame({
    'id': test.id,
    'y': y_probs
})
output.to_csv('attempt-lightgbm7.csv', index=False)
print("Your submission was successfully saved!")
Your submission was successfully saved!
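Before uploading, a quick read-back of the file just written can confirm the row count and that the probabilities are in range; a small sketch:

# Sanity-check the saved submission: one row per test id, probabilities inside [0, 1]
check = pd.read_csv('attempt-lightgbm7.csv')
assert len(check) == len(test), "row count mismatch"
assert check['y'].between(0, 1).all(), "predictions outside [0, 1]"
print(check['y'].describe())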
Ensemble of LightGBM, XGBoost, CatBoost
In [42]:
import optuna
import catboost as cb
In [43]:
# XGBoost Hyperparameter Tuning - complementary to LightGBM/CatBoost
def objective_xgboost(trial):
    # Parameter space for robust regularization (different approach than LightGBM)
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 2000, 5000),
        'learning_rate': trial.suggest_float('learning_rate', 0.03, 0.08),
        'max_depth': trial.suggest_int('max_depth', 6, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 7),
        'subsample': trial.suggest_float('subsample', 0.7, 0.9),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 0.9),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.1, 2.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 1.0, 5.0),
        'gamma': trial.suggest_float('gamma', 0.0, 1.0),
        'random_state': 42,
        'eval_metric': 'logloss',
        'early_stopping_rounds': 100,
        'verbosity': 0
    }

    # 5-fold CV for faster tuning
    cv_scores = []
    kf_tune = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    for train_idx, val_idx in kf_tune.split(X, y):
        X_train_fold, y_train_fold = X.iloc[train_idx], y.iloc[train_idx]
        X_val_fold, y_val_fold = X.iloc[val_idx], y.iloc[val_idx]

        # Add bank data
        X_train_fold = pd.concat([X_train_fold, X_bank])
        y_train_fold = pd.concat([y_train_fold, y_bank])

        model = xgb.XGBClassifier(**params)
        model.fit(X_train_fold, y_train_fold, eval_set=[(X_val_fold, y_val_fold)], verbose=False)
        pred = model.predict_proba(X_val_fold)[:, 1]
        score = roc_auc_score(y_val_fold, pred)
        cv_scores.append(score)

    return np.mean(cv_scores)

print("Tuning XGBoost parameters...")
study_xgboost = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=42))
study_xgboost.optimize(objective_xgboost, n_trials=50)
best_xgboost_params = study_xgboost.best_params
print(f"Best XGBoost CV AUC: {study_xgboost.best_value:.4f}")
print(f"Best XGBoost params: {best_xgboost_params}")
[I 2025-08-07 13:01:26,444] A new study created in memory with name: no-name-d418a1b1-a8db-4ae7-9f4d-ecdf4f262018
Tuning XGBoost parameters...
[I 2025-08-07 13:02:52,437] Trial 0 finished with value: 0.9681654297437244 and parameters: {'n_estimators': 3123, 'learning_rate': 0.0775357153204958, 'max_depth': 9, 'min_child_weight': 5, 'subsample': 0.7312037280884873, 'colsample_bytree': 0.7311989040672405, 'reg_alpha': 0.21035886311957896, 'reg_lambda': 4.46470458309974, 'gamma': 0.6011150117432088}. Best is trial 0 with value: 0.9681654297437244. [I 2025-08-07 13:05:57,541] Trial 1 finished with value: 0.9685067764131802 and parameters: {'n_estimators': 4124, 'learning_rate': 0.03102922471479012, 'max_depth': 10, 'min_child_weight': 6, 'subsample': 0.7424678221356552, 'colsample_bytree': 0.73636499344142, 'reg_alpha': 0.4484685687215243, 'reg_lambda': 2.216968971838151, 'gamma': 0.5247564316322378}. Best is trial 1 with value: 0.9685067764131802. [I 2025-08-07 13:08:19,918] Trial 2 finished with value: 0.9684309696188743 and parameters: {'n_estimators': 3296, 'learning_rate': 0.04456145700990209, 'max_depth': 9, 'min_child_weight': 1, 'subsample': 0.7584289297070436, 'colsample_bytree': 0.7732723686587383, 'reg_alpha': 0.9665329700123682, 'reg_lambda': 4.140703845572054, 'gamma': 0.19967378215835974}. Best is trial 1 with value: 0.9685067764131802. [I 2025-08-07 13:12:01,230] Trial 3 finished with value: 0.968241902166777 and parameters: {'n_estimators': 3543, 'learning_rate': 0.05962072844310212, 'max_depth': 6, 'min_child_weight': 5, 'subsample': 0.7341048247374583, 'colsample_bytree': 0.7130103185970559, 'reg_alpha': 1.9028825207813331, 'reg_lambda': 4.862528132298237, 'gamma': 0.8083973481164611}. Best is trial 1 with value: 0.9685067764131802. [I 2025-08-07 13:15:14,937] Trial 4 finished with value: 0.968422817858561 and parameters: {'n_estimators': 2914, 'learning_rate': 0.03488360570031919, 'max_depth': 9, 'min_child_weight': 4, 'subsample': 0.7244076469689558, 'colsample_bytree': 0.799035382022254, 'reg_alpha': 0.16533819011891496, 'reg_lambda': 4.637281608315128, 'gamma': 0.2587799816000169}. Best is trial 1 with value: 0.9685067764131802. [I 2025-08-07 13:17:59,319] Trial 5 finished with value: 0.9683243602374161 and parameters: {'n_estimators': 3988, 'learning_rate': 0.04558555380447055, 'max_depth': 8, 'min_child_weight': 4, 'subsample': 0.7369708911051054, 'colsample_bytree': 0.8939169255529118, 'reg_alpha': 1.5727523643861177, 'reg_lambda': 4.757995766256757, 'gamma': 0.8948273504276488}. Best is trial 1 with value: 0.9685067764131802. [I 2025-08-07 13:20:30,376] Trial 6 finished with value: 0.9680618053962011 and parameters: {'n_estimators': 3794, 'learning_rate': 0.07609371175115584, 'max_depth': 6, 'min_child_weight': 2, 'subsample': 0.7090454577821076, 'colsample_bytree': 0.7650660661526528, 'reg_alpha': 0.8384868504100158, 'reg_lambda': 2.0853961270955836, 'gamma': 0.8287375091519293}. Best is trial 1 with value: 0.9685067764131802. [I 2025-08-07 13:23:34,919] Trial 7 finished with value: 0.9685565580431874 and parameters: {'n_estimators': 3070, 'learning_rate': 0.04404672548436904, 'max_depth': 8, 'min_child_weight': 1, 'subsample': 0.8604393961508079, 'colsample_bytree': 0.7149101287359542, 'reg_alpha': 1.975085179540983, 'reg_lambda': 4.08897907718663, 'gamma': 0.1987156815341724}. Best is trial 7 with value: 0.9685565580431874. 
[I 2025-08-07 13:25:08,307] Trial 8 finished with value: 0.9683656780752485 and parameters: {'n_estimators': 2016, 'learning_rate': 0.07077307142274171, 'max_depth': 9, 'min_child_weight': 6, 'subsample': 0.8542540693371892, 'colsample_bytree': 0.714808930346818, 'reg_alpha': 0.7810848842341179, 'reg_lambda': 1.4634762381005189, 'gamma': 0.8631034258755935}. Best is trial 7 with value: 0.9685565580431874. [I 2025-08-07 13:29:24,095] Trial 9 finished with value: 0.9682804122687492 and parameters: {'n_estimators': 3870, 'learning_rate': 0.046544901242632455, 'max_depth': 6, 'min_child_weight': 3, 'subsample': 0.7650366644053493, 'colsample_bytree': 0.8459212356676128, 'reg_alpha': 1.3113591955749049, 'reg_lambda': 4.548850970305306, 'gamma': 0.4722149251619493}. Best is trial 7 with value: 0.9685565580431874. [I 2025-08-07 13:32:04,390] Trial 10 finished with value: 0.9683976536882083 and parameters: {'n_estimators': 4836, 'learning_rate': 0.060598596139102734, 'max_depth': 7, 'min_child_weight': 1, 'subsample': 0.8962141652178608, 'colsample_bytree': 0.8380494147138291, 'reg_alpha': 1.948771745462287, 'reg_lambda': 3.496334108689462, 'gamma': 0.0397996060770148}. Best is trial 7 with value: 0.9685565580431874. [I 2025-08-07 13:35:16,822] Trial 11 finished with value: 0.9685690567785026 and parameters: {'n_estimators': 4712, 'learning_rate': 0.03125412937116301, 'max_depth': 10, 'min_child_weight': 7, 'subsample': 0.8125061405332253, 'colsample_bytree': 0.7486758045268418, 'reg_alpha': 0.5320224533652154, 'reg_lambda': 2.702958058813853, 'gamma': 0.46994321499695574}. Best is trial 11 with value: 0.9685690567785026. [I 2025-08-07 13:38:47,886] Trial 12 finished with value: 0.9685023742854725 and parameters: {'n_estimators': 4815, 'learning_rate': 0.03720456338607968, 'max_depth': 8, 'min_child_weight': 7, 'subsample': 0.821814772686749, 'colsample_bytree': 0.7572042288907423, 'reg_alpha': 1.2925215959863048, 'reg_lambda': 3.250426043779354, 'gamma': 0.345437871794682}. Best is trial 11 with value: 0.9685690567785026. [I 2025-08-07 13:41:24,331] Trial 13 finished with value: 0.9684950316469078 and parameters: {'n_estimators': 2550, 'learning_rate': 0.03937663157166836, 'max_depth': 10, 'min_child_weight': 7, 'subsample': 0.8154140805769239, 'colsample_bytree': 0.7051098989621086, 'reg_alpha': 0.5589543726626454, 'reg_lambda': 2.6614202484762868, 'gamma': 0.03778031229306575}. Best is trial 11 with value: 0.9685690567785026. [I 2025-08-07 13:44:31,507] Trial 14 finished with value: 0.9684448419012266 and parameters: {'n_estimators': 4334, 'learning_rate': 0.04982309189699997, 'max_depth': 7, 'min_child_weight': 2, 'subsample': 0.8561338035887187, 'colsample_bytree': 0.7944478816823999, 'reg_alpha': 1.6625870988287481, 'reg_lambda': 3.7577508159850352, 'gamma': 0.38383114510937844}. Best is trial 11 with value: 0.9685690567785026. [I 2025-08-07 13:48:57,692] Trial 15 finished with value: 0.9684619660268263 and parameters: {'n_estimators': 2748, 'learning_rate': 0.03163468929187697, 'max_depth': 7, 'min_child_weight': 3, 'subsample': 0.7897096113780152, 'colsample_bytree': 0.7436181591751374, 'reg_alpha': 1.212992579215986, 'reg_lambda': 1.062041527976909, 'gamma': 0.6423620561394954}. Best is trial 11 with value: 0.9685690567785026. 
[I 2025-08-07 13:51:30,045] Trial 16 finished with value: 0.9684402711488886 and parameters: {'n_estimators': 4418, 'learning_rate': 0.053597616885897405, 'max_depth': 8, 'min_child_weight': 5, 'subsample': 0.8527494789829706, 'colsample_bytree': 0.7031207411014159, 'reg_alpha': 0.4812031244022279, 'reg_lambda': 2.6542232604747626, 'gamma': 0.16527177610975427}. Best is trial 11 with value: 0.9685690567785026. [I 2025-08-07 13:53:53,403] Trial 17 finished with value: 0.968561409664099 and parameters: {'n_estimators': 2293, 'learning_rate': 0.041117317845056386, 'max_depth': 10, 'min_child_weight': 3, 'subsample': 0.8871933728473397, 'colsample_bytree': 0.8285694861181111, 'reg_alpha': 1.525765108083813, 'reg_lambda': 3.941375147628981, 'gamma': 0.6712369673505519}. Best is trial 11 with value: 0.9685690567785026. [I 2025-08-07 13:56:19,464] Trial 18 finished with value: 0.9685970618366178 and parameters: {'n_estimators': 2054, 'learning_rate': 0.04025387737768451, 'max_depth': 10, 'min_child_weight': 3, 'subsample': 0.8813483474949517, 'colsample_bytree': 0.8312087668843073, 'reg_alpha': 1.5084255565406828, 'reg_lambda': 2.9313945858455686, 'gamma': 0.6972987930890137}. Best is trial 18 with value: 0.9685970618366178. [I 2025-08-07 13:59:33,184] Trial 19 finished with value: 0.9685373914693688 and parameters: {'n_estimators': 2011, 'learning_rate': 0.03048632991768497, 'max_depth': 10, 'min_child_weight': 6, 'subsample': 0.825752469459229, 'colsample_bytree': 0.8698062277490901, 'reg_alpha': 1.1308969090374692, 'reg_lambda': 2.8146421430011497, 'gamma': 0.7157675415766318}. Best is trial 18 with value: 0.9685970618366178. [I 2025-08-07 14:02:03,942] Trial 20 finished with value: 0.968445555276028 and parameters: {'n_estimators': 4617, 'learning_rate': 0.035413075471851865, 'max_depth': 10, 'min_child_weight': 2, 'subsample': 0.7918855630714174, 'colsample_bytree': 0.817744961921466, 'reg_alpha': 0.706963858463172, 'reg_lambda': 2.1215665409160955, 'gamma': 0.9938427850497089}. Best is trial 18 with value: 0.9685970618366178. [I 2025-08-07 14:04:32,957] Trial 21 finished with value: 0.9685482132720742 and parameters: {'n_estimators': 2418, 'learning_rate': 0.04032891518735913, 'max_depth': 10, 'min_child_weight': 3, 'subsample': 0.8985088151036531, 'colsample_bytree': 0.8261442043181213, 'reg_alpha': 1.520283912164619, 'reg_lambda': 3.1558290948988725, 'gamma': 0.6990242621704733}. Best is trial 18 with value: 0.9685970618366178. [I 2025-08-07 14:07:24,146] Trial 22 finished with value: 0.9685705202072713 and parameters: {'n_estimators': 2299, 'learning_rate': 0.040462254876629716, 'max_depth': 9, 'min_child_weight': 3, 'subsample': 0.880164288279437, 'colsample_bytree': 0.7843544449087234, 'reg_alpha': 1.7404268484824685, 'reg_lambda': 3.701769326617604, 'gamma': 0.5488163533512334}. Best is trial 18 with value: 0.9685970618366178. [I 2025-08-07 14:10:39,757] Trial 23 finished with value: 0.9685967351693886 and parameters: {'n_estimators': 2232, 'learning_rate': 0.03541637056222954, 'max_depth': 9, 'min_child_weight': 4, 'subsample': 0.8760052469445377, 'colsample_bytree': 0.7837467057556148, 'reg_alpha': 1.7551103922407045, 'reg_lambda': 3.2730717745677778, 'gamma': 0.5180316543892582}. Best is trial 18 with value: 0.9685970618366178. 
[I 2025-08-07 14:13:01,762] Trial 24 finished with value: 0.9685375946923912 and parameters: {'n_estimators': 2278, 'learning_rate': 0.05069931042013509, 'max_depth': 9, 'min_child_weight': 4, 'subsample': 0.8756670794814093, 'colsample_bytree': 0.7824505524025784, 'reg_alpha': 1.7656927235231779, 'reg_lambda': 3.5048444092558335, 'gamma': 0.5550196256555672}. Best is trial 18 with value: 0.9685970618366178. [I 2025-08-07 14:14:59,782] Trial 25 finished with value: 0.9684086434707917 and parameters: {'n_estimators': 2686, 'learning_rate': 0.059634969996231725, 'max_depth': 9, 'min_child_weight': 4, 'subsample': 0.876294883888365, 'colsample_bytree': 0.8593406338373164, 'reg_alpha': 1.3942794805927359, 'reg_lambda': 3.5155657480075657, 'gamma': 0.4013971858561438}. Best is trial 18 with value: 0.9685970618366178. [I 2025-08-07 14:17:56,493] Trial 26 finished with value: 0.9685319947672966 and parameters: {'n_estimators': 2229, 'learning_rate': 0.03637106055995824, 'max_depth': 9, 'min_child_weight': 3, 'subsample': 0.8380521430564344, 'colsample_bytree': 0.8115035303145616, 'reg_alpha': 1.7037467588646507, 'reg_lambda': 3.0205556555689954, 'gamma': 0.7575431762403109}. Best is trial 18 with value: 0.9685970618366178. [I 2025-08-07 14:21:36,710] Trial 27 finished with value: 0.9685797020584769 and parameters: {'n_estimators': 2547, 'learning_rate': 0.041770924261068314, 'max_depth': 9, 'min_child_weight': 2, 'subsample': 0.8759480939746173, 'colsample_bytree': 0.7858858120192809, 'reg_alpha': 1.8141895329186388, 'reg_lambda': 3.752406525736708, 'gamma': 0.5982343179060122}. Best is trial 18 with value: 0.9685970618366178. [I 2025-08-07 14:24:20,756] Trial 28 finished with value: 0.9684419594015342 and parameters: {'n_estimators': 2564, 'learning_rate': 0.04815612813384653, 'max_depth': 9, 'min_child_weight': 2, 'subsample': 0.8385590482386824, 'colsample_bytree': 0.8127559344204559, 'reg_alpha': 1.8307260283340514, 'reg_lambda': 2.4091047612786354, 'gamma': 0.6079220082693564}. Best is trial 18 with value: 0.9685970618366178. [I 2025-08-07 14:27:00,771] Trial 29 finished with value: 0.9684586847392644 and parameters: {'n_estimators': 2902, 'learning_rate': 0.05420177949435838, 'max_depth': 9, 'min_child_weight': 2, 'subsample': 0.8695450968301509, 'colsample_bytree': 0.794336170393672, 'reg_alpha': 1.4682318043922964, 'reg_lambda': 4.359353824726454, 'gamma': 0.6002411647513862}. Best is trial 18 with value: 0.9685970618366178. [I 2025-08-07 14:28:50,267] Trial 30 finished with value: 0.9682874696461801 and parameters: {'n_estimators': 3310, 'learning_rate': 0.06585527130069006, 'max_depth': 10, 'min_child_weight': 5, 'subsample': 0.8405941026091306, 'colsample_bytree': 0.872161454205605, 'reg_alpha': 1.5916724619154088, 'reg_lambda': 1.8136773522298422, 'gamma': 0.7530878527989517}. Best is trial 18 with value: 0.9685970618366178. [I 2025-08-07 14:32:29,605] Trial 31 finished with value: 0.9685899528509596 and parameters: {'n_estimators': 2146, 'learning_rate': 0.04129994985805446, 'max_depth': 9, 'min_child_weight': 3, 'subsample': 0.8850986758981175, 'colsample_bytree': 0.7880362305909715, 'reg_alpha': 1.8087787931058803, 'reg_lambda': 3.3102680420664576, 'gamma': 0.5719406095472662}. Best is trial 18 with value: 0.9685970618366178. 
[I 2025-08-07 14:36:42,614] Trial 32 finished with value: 0.9686084599363983 and parameters: {'n_estimators': 2084, 'learning_rate': 0.04241228443916973, 'max_depth': 8, 'min_child_weight': 4, 'subsample': 0.8894733238422363, 'colsample_bytree': 0.7755272104546164, 'reg_alpha': 1.8455318772526679, 'reg_lambda': 3.297335548087981, 'gamma': 0.48321415961320585}. Best is trial 32 with value: 0.9686084599363983. [I 2025-08-07 14:40:34,707] Trial 33 finished with value: 0.968551640078298 and parameters: {'n_estimators': 2119, 'learning_rate': 0.04338840570520476, 'max_depth': 8, 'min_child_weight': 4, 'subsample': 0.8892626092582322, 'colsample_bytree': 0.7674550416274887, 'reg_alpha': 1.8877025633427411, 'reg_lambda': 3.232533496601635, 'gamma': 0.4904619715408989}. Best is trial 32 with value: 0.9686084599363983. [I 2025-08-07 14:44:38,604] Trial 34 finished with value: 0.9685770643353221 and parameters: {'n_estimators': 2139, 'learning_rate': 0.038241088360278556, 'max_depth': 8, 'min_child_weight': 5, 'subsample': 0.8648283507568529, 'colsample_bytree': 0.774651833441289, 'reg_alpha': 1.6539527315495488, 'reg_lambda': 2.439312360251772, 'gamma': 0.4168883638319408}. Best is trial 32 with value: 0.9686084599363983. [I 2025-08-07 14:48:27,799] Trial 35 finished with value: 0.9685951346600735 and parameters: {'n_estimators': 2422, 'learning_rate': 0.03347798379949189, 'max_depth': 8, 'min_child_weight': 4, 'subsample': 0.8882143847643476, 'colsample_bytree': 0.7286400425952747, 'reg_alpha': 1.3932743053643368, 'reg_lambda': 2.9085352944240426, 'gamma': 0.2745171391685712}. Best is trial 32 with value: 0.9686084599363983. [I 2025-08-07 14:54:33,093] Trial 36 finished with value: 0.9684932996227612 and parameters: {'n_estimators': 2383, 'learning_rate': 0.03357339868321989, 'max_depth': 7, 'min_child_weight': 4, 'subsample': 0.8990308670613424, 'colsample_bytree': 0.7281870439848215, 'reg_alpha': 1.426841843545473, 'reg_lambda': 2.9571353188970173, 'gamma': 0.29402528460586036}. Best is trial 32 with value: 0.9686084599363983. [I 2025-08-07 14:59:21,289] Trial 37 finished with value: 0.9685637807325058 and parameters: {'n_estimators': 2763, 'learning_rate': 0.03422300656050513, 'max_depth': 8, 'min_child_weight': 5, 'subsample': 0.8470292552950044, 'colsample_bytree': 0.7271902732118719, 'reg_alpha': 1.0168717021941491, 'reg_lambda': 2.42473824681267, 'gamma': 0.14387613651266906}. Best is trial 32 with value: 0.9686084599363983. [I 2025-08-07 15:03:57,400] Trial 38 finished with value: 0.9685204983598441 and parameters: {'n_estimators': 2460, 'learning_rate': 0.03332480855354848, 'max_depth': 8, 'min_child_weight': 4, 'subsample': 0.8689805955783403, 'colsample_bytree': 0.7552082826946688, 'reg_alpha': 0.3038203991932765, 'reg_lambda': 2.977853990759783, 'gamma': 0.2934095366949327}. Best is trial 32 with value: 0.9686084599363983. [I 2025-08-07 15:09:15,931] Trial 39 finished with value: 0.968476502910395 and parameters: {'n_estimators': 3535, 'learning_rate': 0.03768389782907463, 'max_depth': 7, 'min_child_weight': 5, 'subsample': 0.8870508385440137, 'colsample_bytree': 0.806852055005723, 'reg_alpha': 1.1240542693372433, 'reg_lambda': 4.247064792967203, 'gamma': 0.4431761270391086}. Best is trial 32 with value: 0.9686084599363983. 
[I 2025-08-07 15:12:54,845] Trial 40 finished with value: 0.9685076014200591 and parameters: {'n_estimators': 3114, 'learning_rate': 0.043522297347761606, 'max_depth': 8, 'min_child_weight': 4, 'subsample': 0.867150342933512, 'colsample_bytree': 0.7355065436663255, 'reg_alpha': 1.3462940653444355, 'reg_lambda': 3.3886384477538236, 'gamma': 0.33644244669160894}. Best is trial 32 with value: 0.9686084599363983. [I 2025-08-07 15:16:17,622] Trial 41 finished with value: 0.9685460451896585 and parameters: {'n_estimators': 2152, 'learning_rate': 0.04762410330034662, 'max_depth': 8, 'min_child_weight': 3, 'subsample': 0.8866180955645535, 'colsample_bytree': 0.7741620346345551, 'reg_alpha': 1.8695575045928476, 'reg_lambda': 3.2620665862348606, 'gamma': 0.5329880533144051}. Best is trial 32 with value: 0.9686084599363983. [I 2025-08-07 15:19:59,455] Trial 42 finished with value: 0.9685735224213274 and parameters: {'n_estimators': 2067, 'learning_rate': 0.036007975914862204, 'max_depth': 9, 'min_child_weight': 4, 'subsample': 0.8837510159741416, 'colsample_bytree': 0.8018471357933656, 'reg_alpha': 1.5986751297301085, 'reg_lambda': 2.8498277449693017, 'gamma': 0.1035947169887228}. Best is trial 32 with value: 0.9686084599363983. [I 2025-08-07 15:23:22,124] Trial 43 finished with value: 0.9684125545367672 and parameters: {'n_estimators': 2003, 'learning_rate': 0.04292813206599627, 'max_depth': 8, 'min_child_weight': 3, 'subsample': 0.7576374368586407, 'colsample_bytree': 0.8482233882320287, 'reg_alpha': 1.9891736607326924, 'reg_lambda': 3.118779879780776, 'gamma': 0.5115330697360837}. Best is trial 32 with value: 0.9686084599363983. [I 2025-08-07 15:27:25,906] Trial 44 finished with value: 0.9685951028089347 and parameters: {'n_estimators': 2212, 'learning_rate': 0.03869431968241978, 'max_depth': 8, 'min_child_weight': 3, 'subsample': 0.8942471672342232, 'colsample_bytree': 0.823759548165568, 'reg_alpha': 1.7604837999849499, 'reg_lambda': 3.9450024171593716, 'gamma': 0.2310154219068074}. Best is trial 32 with value: 0.9686084599363983. [I 2025-08-07 15:31:57,362] Trial 45 finished with value: 0.9684731902304702 and parameters: {'n_estimators': 2966, 'learning_rate': 0.045751892253505955, 'max_depth': 7, 'min_child_weight': 4, 'subsample': 0.8930602579182858, 'colsample_bytree': 0.8239655586433066, 'reg_alpha': 1.673608109766161, 'reg_lambda': 3.9842411922162886, 'gamma': 0.2374887759598658}. Best is trial 32 with value: 0.9686084599363983. [I 2025-08-07 15:36:09,316] Trial 46 finished with value: 0.9685340943544037 and parameters: {'n_estimators': 2643, 'learning_rate': 0.03856905320270151, 'max_depth': 8, 'min_child_weight': 4, 'subsample': 0.8992963809039051, 'colsample_bytree': 0.8362502434563537, 'reg_alpha': 1.2437210485609844, 'reg_lambda': 4.692988148459355, 'gamma': 0.2444166487449254}. Best is trial 32 with value: 0.9686084599363983. [I 2025-08-07 15:41:52,534] Trial 47 finished with value: 0.9684550945005057 and parameters: {'n_estimators': 3330, 'learning_rate': 0.032663322034005654, 'max_depth': 7, 'min_child_weight': 3, 'subsample': 0.8585221498006452, 'colsample_bytree': 0.8959247074152795, 'reg_alpha': 1.4862112189301173, 'reg_lambda': 3.7302889554221603, 'gamma': 0.8088903146556538}. Best is trial 32 with value: 0.9686084599363983. 
[I 2025-08-07 15:46:16,263] Trial 48 finished with value: 0.9685140387775778 and parameters: {'n_estimators': 2454, 'learning_rate': 0.03517406326288233, 'max_depth': 8, 'min_child_weight': 5, 'subsample': 0.8755600448653386, 'colsample_bytree': 0.8457341281507705, 'reg_alpha': 1.9289213274655754, 'reg_lambda': 1.9455145231215816, 'gamma': 0.09870018198996322}. Best is trial 32 with value: 0.9686084599363983. [I 2025-08-07 15:50:39,129] Trial 49 finished with value: 0.9680490165066125 and parameters: {'n_estimators': 2336, 'learning_rate': 0.030639974765641696, 'max_depth': 6, 'min_child_weight': 6, 'subsample': 0.8485940876094181, 'colsample_bytree': 0.7586569969544734, 'reg_alpha': 0.914079879556502, 'reg_lambda': 2.574204435311884, 'gamma': 0.3617514035297955}. Best is trial 32 with value: 0.9686084599363983.
Best XGBoost CV AUC: 0.9686 Best XGBoost params: {'n_estimators': 2084, 'learning_rate': 0.04241228443916973, 'max_depth': 8, 'min_child_weight': 4, 'subsample': 0.8894733238422363, 'colsample_bytree': 0.7755272104546164, 'reg_alpha': 1.8455318772526679, 'reg_lambda': 3.297335548087981, 'gamma': 0.48321415961320585}
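Each of these Optuna studies runs for hours, so it is worth persisting the best parameters in case the kernel restarts; a minimal sketch using the json module imported earlier (the filename is illustrative):

# Save the tuned XGBoost parameters so the study does not need to be rerun
with open('best_xgboost_params.json', 'w') as f:
    json.dump(best_xgboost_params, f, indent=2)
print("Saved best_xgboost_params.json")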
In [ ]:
# CatBoost Hyperparameter Tuning - optimized for diversity from LightGBM
def objective_catboost(trial):
    # Parameter space designed to be different from LightGBM
    params = {
        'iterations': trial.suggest_int('iterations', 2000, 5000),
        'learning_rate': trial.suggest_float('learning_rate', 0.03, 0.08),
        'depth': trial.suggest_int('depth', 6, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1.0, 10.0),
        'border_count': trial.suggest_int('border_count', 128, 255),
        'random_strength': trial.suggest_float('random_strength', 0.5, 2.0),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
        'random_state': 42,
        'verbose': False,
        'early_stopping_rounds': 100
    }

    # 5-fold CV for faster tuning
    cv_scores = []
    kf_tune = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    for train_idx, val_idx in kf_tune.split(X, y):
        X_train_fold, y_train_fold = X.iloc[train_idx], y.iloc[train_idx]
        X_val_fold, y_val_fold = X.iloc[val_idx], y.iloc[val_idx]

        # Add bank data
        X_train_fold = pd.concat([X_train_fold, X_bank])
        y_train_fold = pd.concat([y_train_fold, y_bank])

        model = cb.CatBoostClassifier(**params)
        model.fit(X_train_fold, y_train_fold, eval_set=(X_val_fold, y_val_fold), verbose=False)
        pred = model.predict_proba(X_val_fold)[:, 1]
        score = roc_auc_score(y_val_fold, pred)
        cv_scores.append(score)

    return np.mean(cv_scores)

print("Tuning CatBoost parameters...")
study_catboost = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=42))
study_catboost.optimize(objective_catboost, n_trials=50)
best_catboost_params = study_catboost.best_params
print(f"Best CatBoost CV AUC: {study_catboost.best_value:.4f}")
print(f"Best CatBoost params: {best_catboost_params}")
[I 2025-08-07 15:50:39,142] A new study created in memory with name: no-name-31037c8e-fb7c-4f8c-86cc-f3a82f5285aa
Tuning CatBoost parameters...
[I 2025-08-07 15:59:19,874] Trial 0 finished with value: 0.9667201844179077 and parameters: {'iterations': 3123, 'learning_rate': 0.0775357153204958, 'depth': 9, 'l2_leaf_reg': 6.387926357773329, 'border_count': 147, 'random_strength': 0.7339917805043039, 'bagging_temperature': 0.05808361216819946}. Best is trial 0 with value: 0.9667201844179077. [I 2025-08-07 16:10:40,762] Trial 1 finished with value: 0.9674417394062009 and parameters: {'iterations': 4599, 'learning_rate': 0.060055750587160436, 'depth': 9, 'l2_leaf_reg': 1.185260448662222, 'border_count': 252, 'random_strength': 1.7486639612006325, 'bagging_temperature': 0.21233911067827616}. Best is trial 1 with value: 0.9674417394062009. [I 2025-08-07 16:20:45,734] Trial 2 finished with value: 0.9664948735192249 and parameters: {'iterations': 2545, 'learning_rate': 0.03917022549267169, 'depth': 7, 'l2_leaf_reg': 5.72280788469014, 'border_count': 183, 'random_strength': 0.9368437102970628, 'bagging_temperature': 0.6118528947223795}. Best is trial 1 with value: 0.9674417394062009. [I 2025-08-07 16:30:03,526] Trial 3 finished with value: 0.9668437071626259 and parameters: {'iterations': 2418, 'learning_rate': 0.04460723242676091, 'depth': 7, 'l2_leaf_reg': 5.104629857953324, 'border_count': 228, 'random_strength': 0.7995106732375397, 'bagging_temperature': 0.5142344384136116}. Best is trial 1 with value: 0.9674417394062009. [I 2025-08-07 16:47:14,869] Trial 4 finished with value: 0.9666843667800308 and parameters: {'iterations': 3777, 'learning_rate': 0.032322520635999885, 'depth': 9, 'l2_leaf_reg': 2.5347171131856236, 'border_count': 136, 'random_strength': 1.92332830588, 'bagging_temperature': 0.9656320330745594}. Best is trial 1 with value: 0.9674417394062009. [I 2025-08-07 17:03:07,524] Trial 5 finished with value: 0.9669287849486056 and parameters: {'iterations': 4426, 'learning_rate': 0.04523068845866853, 'depth': 6, 'l2_leaf_reg': 7.158097238609412, 'border_count': 184, 'random_strength': 0.6830573522671682, 'bagging_temperature': 0.4951769101112702}. Best is trial 1 with value: 0.9674417394062009. [I 2025-08-07 17:11:19,099] Trial 6 finished with value: 0.9668431460298846 and parameters: {'iterations': 2103, 'learning_rate': 0.0754660201039391, 'depth': 7, 'l2_leaf_reg': 6.962700559185838, 'border_count': 167, 'random_strength': 1.2801020317667162, 'bagging_temperature': 0.5467102793432796}. Best is trial 1 with value: 0.9674417394062009. [I 2025-08-07 17:21:44,760] Trial 7 finished with value: 0.9675255790147533 and parameters: {'iterations': 2554, 'learning_rate': 0.07847923138822793, 'depth': 9, 'l2_leaf_reg': 9.455490474077703, 'border_count': 242, 'random_strength': 1.3968499682166278, 'bagging_temperature': 0.9218742350231168}. Best is trial 7 with value: 0.9675255790147533. [I 2025-08-07 17:29:52,473] Trial 8 finished with value: 0.9658251102966036 and parameters: {'iterations': 2265, 'learning_rate': 0.03979914312095726, 'depth': 6, 'l2_leaf_reg': 3.927972976869379, 'border_count': 177, 'random_strength': 0.9070235476608439, 'bagging_temperature': 0.8287375091519293}. Best is trial 7 with value: 0.9675255790147533. [I 2025-08-07 17:43:07,376] Trial 9 finished with value: 0.9674044399850678 and parameters: {'iterations': 3070, 'learning_rate': 0.04404672548436904, 'depth': 8, 'l2_leaf_reg': 2.2683180247728636, 'border_count': 230, 'random_strength': 0.6118259655196563, 'bagging_temperature': 0.9868869366005173}. Best is trial 7 with value: 0.9675255790147533. 
[I 2025-08-07 17:57:24,028] Trial 10 finished with value: 0.9671118153757383 and parameters: {'iterations': 3541, 'learning_rate': 0.06499375049607987, 'depth': 10, 'l2_leaf_reg': 9.59625943278804, 'border_count': 211, 'random_strength': 1.3628121002229092, 'bagging_temperature': 0.7303668952070097}. Best is trial 7 with value: 0.9675255790147533. [I 2025-08-07 18:14:59,081] Trial 11 finished with value: 0.9673944906871246 and parameters: {'iterations': 4929, 'learning_rate': 0.060101379931577215, 'depth': 10, 'l2_leaf_reg': 9.93149973182709, 'border_count': 249, 'random_strength': 1.6974075590594648, 'bagging_temperature': 0.1533681809513373}. Best is trial 7 with value: 0.9675255790147533. [I 2025-08-07 18:25:17,159] Trial 12 finished with value: 0.9673831300845113 and parameters: {'iterations': 4165, 'learning_rate': 0.0680821342737341, 'depth': 9, 'l2_leaf_reg': 1.1963612909754555, 'border_count': 249, 'random_strength': 1.5667091934615425, 'bagging_temperature': 0.2646596329637791}. Best is trial 7 with value: 0.9675255790147533. [I 2025-08-07 18:41:21,287] Trial 13 finished with value: 0.9674887704614047 and parameters: {'iterations': 4997, 'learning_rate': 0.05406449773780332, 'depth': 8, 'l2_leaf_reg': 8.316378003207923, 'border_count': 211, 'random_strength': 1.9883771855808856, 'bagging_temperature': 0.34279928826631634}. Best is trial 7 with value: 0.9675255790147533. [I 2025-08-07 18:54:12,018] Trial 14 finished with value: 0.967349560003362 and parameters: {'iterations': 2872, 'learning_rate': 0.05276040985150533, 'depth': 8, 'l2_leaf_reg': 8.461514548344311, 'border_count': 207, 'random_strength': 1.9886919905347793, 'bagging_temperature': 0.3276567589577757}. Best is trial 7 with value: 0.9675255790147533. [I 2025-08-07 19:11:17,085] Trial 15 finished with value: 0.967578450157388 and parameters: {'iterations': 3934, 'learning_rate': 0.052825853293157414, 'depth': 8, 'l2_leaf_reg': 8.272594310024477, 'border_count': 225, 'random_strength': 1.1201211495390695, 'bagging_temperature': 0.3565126982022847}. Best is trial 15 with value: 0.967578450157388.
In [ ]:
# Train finetuned CatBoost with the same CV structure as the LightGBM above
print("Training finetuned CatBoost...")
y_probs_cat_tuned = np.zeros(len(X_test))
catboost_models_tuned = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    print(f"CatBoost fold {fold + 1}/{n_splits}")
    X_train_fold, y_train_fold = X.iloc[train_idx], y.iloc[train_idx]
    X_val_fold, y_val_fold = X.iloc[val_idx], y.iloc[val_idx]

    # Add bank data, as in the LightGBM folds
    X_train_fold = pd.concat([X_train_fold, X_bank])
    y_train_fold = pd.concat([y_train_fold, y_bank])

    model = cb.CatBoostClassifier(**best_catboost_params)
    model.fit(
        X_train_fold,
        y_train_fold,
        eval_set=(X_val_fold, y_val_fold),
        verbose=500,
        use_best_model=True
    )
    catboost_models_tuned.append(model)
    y_probs_cat_tuned += model.predict_proba(X_test)[:, 1] / n_splits

print("CatBoost training completed!")
In [ ]:
# Train finetuned XGBoost with the same CV structure as the LightGBM above
print("Training finetuned XGBoost...")
y_probs_xgb_tuned = np.zeros(len(X_test))
xgboost_models_tuned = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    print(f"XGBoost fold {fold + 1}/{n_splits}")
    X_train_fold, y_train_fold = X.iloc[train_idx], y.iloc[train_idx]
    X_val_fold, y_val_fold = X.iloc[val_idx], y.iloc[val_idx]

    # Add bank data, as in the LightGBM folds
    X_train_fold = pd.concat([X_train_fold, X_bank])
    y_train_fold = pd.concat([y_train_fold, y_bank])

    model = xgb.XGBClassifier(**best_xgboost_params)
    model.fit(
        X_train_fold,
        y_train_fold,
        eval_set=[(X_val_fold, y_val_fold)],
        verbose=500
    )
    xgboost_models_tuned.append(model)
    y_probs_xgb_tuned += model.predict_proba(X_test)[:, 1] / n_splits

print("XGBoost training completed!")
In [ ]:
# Create ensemble from all 3 finetuned models
from scipy.optimize import minimize
from sklearn.metrics import roc_auc_score

print("=== CREATING OPTIMIZED 3-MODEL ENSEMBLE ===")

# Generate out-of-fold predictions for validation
def get_oof_predictions_simple(models_list, X_data, y_data, kf):
    """Generate out-of-fold predictions"""
    oof_preds = np.zeros(len(X_data))
    for fold, (train_idx, val_idx) in enumerate(kf.split(X_data, y_data)):
        model = models_list[fold]
        oof_preds[val_idx] = model.predict_proba(X_data.iloc[val_idx])[:, 1]
    return oof_preds

# Get OOF predictions for ensemble optimization
lgb_oof = get_oof_predictions_simple(models, X, y, kf)
cat_oof = get_oof_predictions_simple(catboost_models_tuned, X, y, kf)
xgb_oof = get_oof_predictions_simple(xgboost_models_tuned, X, y, kf)

# Individual model scores
lgb_score = roc_auc_score(y, lgb_oof)
cat_score = roc_auc_score(y, cat_oof)
xgb_score = roc_auc_score(y, xgb_oof)
print("Individual Model CV Scores:")
print(f"LightGBM: {lgb_score:.4f}")
print(f"CatBoost: {cat_score:.4f}")
print(f"XGBoost: {xgb_score:.4f}")
print()

# Optimize ensemble weights
def ensemble_loss(weights, *args):
    lgb_pred, cat_pred, xgb_pred, y_true = args
    weights = weights / weights.sum()  # Normalize
    ensemble_pred = weights[0] * lgb_pred + weights[1] * cat_pred + weights[2] * xgb_pred
    return -roc_auc_score(y_true, ensemble_pred)  # Negative because we minimize

result = minimize(
    ensemble_loss,
    x0=[1, 1, 1],  # Initial equal weights
    args=(lgb_oof, cat_oof, xgb_oof, y),
    bounds=[(0.01, 5), (0.01, 5), (0.01, 5)],
    method='L-BFGS-B'
)
optimal_weights = result.x / result.x.sum()
print("Optimal weights:")
print(f"  LightGBM: {optimal_weights[0]:.3f}")
print(f"  CatBoost: {optimal_weights[1]:.3f}")
print(f"  XGBoost: {optimal_weights[2]:.3f}")

# Create final ensemble predictions
ensemble_oof = (optimal_weights[0] * lgb_oof +
                optimal_weights[1] * cat_oof +
                optimal_weights[2] * xgb_oof)
ensemble_test = (optimal_weights[0] * y_probs +
                 optimal_weights[1] * y_probs_cat_tuned +
                 optimal_weights[2] * y_probs_xgb_tuned)

ensemble_score = roc_auc_score(y, ensemble_oof)
improvement = ensemble_score - lgb_score
print("\nEnsemble Performance:")
print(f"Ensemble CV AUC: {ensemble_score:.4f}")
print(f"Improvement: {improvement:+.4f} over best single model")
if improvement > 0:
    print("✅ Ensemble improves performance!")
else:
    print("⚠️ Ensemble doesn't improve - consider using LightGBM only")
In [ ]:
# Save final ensemble results
if improvement > 0:
    # Use ensemble if it improves
    final_predictions = ensemble_test
    method_used = "Weighted Ensemble"
    final_score = ensemble_score
else:
    # Use LightGBM if ensemble doesn't help
    final_predictions = y_probs
    method_used = "LightGBM Only"
    final_score = lgb_score

# Save main submission
final_submission = pd.DataFrame({
    'id': test.id,
    'y': final_predictions
})
final_submission.to_csv('final_3model_ensemble.csv', index=False)
print("Final submission saved as 'final_3model_ensemble.csv'")
print(f"Method used: {method_used}")
print(f"CV AUC: {final_score:.4f}")

# Also save all predictions for comparison
all_predictions = pd.DataFrame({
    'id': test.id,
    'lightgbm': y_probs,
    'catboost_tuned': y_probs_cat_tuned,
    'xgboost_tuned': y_probs_xgb_tuned,
    'weighted_ensemble': ensemble_test
})
all_predictions.to_csv('all_model_predictions.csv', index=False)
print("All model predictions saved as 'all_model_predictions.csv'")

print("\n🏆 FINAL RESULTS:")
print(f"{'='*50}")
print(f"Best method: {method_used}")
print(f"Final CV AUC: {final_score:.4f}")
if improvement > 0:
    print(f"Improvement: +{improvement:.4f} over LightGBM")
else:
    print("Ensemble didn't improve - using single model")
print(f"{'='*50}")