Guide file¶

Robin R.P.M. Kras

Start of Kaggle notebook¶

X¶

Kaggle competition: [link]

Entry by Robin R.P.M. Kras

Libraries¶

In [ ]:
# General
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# Machine Learning
import xgboost as xgb

from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    mean_absolute_error, mean_squared_error, root_mean_squared_error, r2_score,
)
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

from imblearn.over_sampling import SMOTE

# Feature Importance & Explainability
import shap

# Settings
import warnings
warnings.filterwarnings("ignore")

# Set random seed for reproducibility
SEED = 42
np.random.seed(SEED)

print("Libraries loaded. Ready to go!")

Quick Experimenting¶

In [ ]:
from sklearn.model_selection import KFold
from sklearn.metrics import root_mean_squared_error
from sklearn.linear_model import Ridge
import numpy as np

# Assumes `train`, `X`, and `y` are already defined
kf = KFold(n_splits=5, shuffle=True, random_state=42)
oof_predictions = np.zeros(len(train))

for train_idx, val_idx in kf.split(train):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model = "MODEL"()
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_val)
    oof_predictions[val_idx] = y_pred  # Store out-of-fold predictions
    
    print(f"Fold RMSE: {root_mean_squared_error(y_val, y_pred)}") # Change RMSE by accuracy/recall/f1-score for classification

final_rmse = root_mean_squared_error(y, oof_predictions)
print(f"Final Cross-Validation RMSE: {final_rmse}")

Quick data visualization¶

Histograms for numerical features

In [ ]:
# All numeric columns (floats and ints)
num_cols = train.select_dtypes(include=["float64", "int64"]).columns

cols_per_row = 3
num_plots = len(num_cols)
rows = (num_plots // cols_per_row) + (num_plots % cols_per_row > 0)

fig, axes = plt.subplots(rows, cols_per_row, figsize=(15, 5 * rows))
axes = axes.flatten()

for idx, col in enumerate(num_cols):
    sns.histplot(train[col], bins=50, kde=True, ax=axes[idx])
    axes[idx].set_title(f"Distribution of {col}")

for i in range(idx + 1, len(axes)):
    fig.delaxes(axes[i])

plt.tight_layout()
plt.show()

Pie charts

In [ ]:
categorical_features = train.select_dtypes(include=['object']).columns

num_features = len(categorical_features)
cols = 3 
rows = (num_features // cols) + (num_features % cols > 0) 

# Create subplots
fig, axes = plt.subplots(rows, cols, figsize=(15, rows * 5)) 
axes = axes.flatten()  

for i, feature in enumerate(categorical_features):
    train[feature].value_counts().plot.pie(
        autopct='%1.1f%%', ax=axes[i], startangle=90, cmap="viridis"
    )
    axes[i].set_title(feature)
    axes[i].set_ylabel("") 

# Hide any unused subplots
for j in range(i + 1, len(axes)):
    fig.delaxes(axes[j])

plt.tight_layout()
plt.show()

Heatmap of numerical features

In [ ]:
heatmap_train = train.select_dtypes(include=["float64", "int64"])

corr_matrix = heatmap_train.corr()

threshold = 0.75

high_corr_pairs = (
    corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)) 
    .stack()  
    .reset_index()
)

high_corr_pairs.columns = ["Feature 1", "Feature 2", "Correlation"]
high_corr_pairs = high_corr_pairs[high_corr_pairs["Correlation"].abs() > threshold]  

plt.figure(figsize=(30, 12))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm")
plt.title("Feature Correlation Matrix")
plt.show()

print("Highly correlated feature pairs (above threshold):")
print(high_corr_pairs)

Feature Engineering¶

Create new features from highly correlated features

In [ ]:
import itertools

def create_combination_features(df, features):
    """Add one pairwise-mean feature for every 2-combination of `features`."""
    for comb in itertools.combinations(features, 2):
        feature_name = "_".join(comb)
        df[feature_name] = df[list(comb)].mean(axis=1)

    return df
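
A minimal usage sketch, assuming the high_corr_pairs table built in the heatmap cell above; apply the same call to test so both frames end up with identical columns:

In [ ]:
# Collect every feature that appears in a highly correlated pair
high_corr_features = sorted(set(high_corr_pairs["Feature 1"]) | set(high_corr_pairs["Feature 2"]))

train = create_combination_features(train, high_corr_features)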

Model Training and Fine Tuning¶

GridSearch

In [ ]:
param_grid = {
    'x': [1, 2, 3],   # placeholder keys -- replace with the estimator's real
    'y': [4, 5, 6]    # hyperparameters, e.g. 'max_depth': [3, 5, 7]
}

# `model` is an unfitted estimator instance, e.g. xgb.XGBRegressor(random_state=SEED)
grid_search = GridSearchCV(model, param_grid, cv=5, n_jobs=-1)

# Fit params such as eval_set are forwarded to the estimator; they only work
# for estimators that accept them (e.g. XGBoost)
grid_search.fit(X_train, y_train,
                eval_set=[(X_train, y_train), (X_val, y_val)])

print("Best Parameters:", grid_search.best_params_)

best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
best_score = grid_search.best_score_

Metrics

In [ ]:
# X_train, X_val, y_train, y_val = train_test_split(X, y, ...)

y_pred = best_model.predict(X_val)

val_accuracy = accuracy_score(y_val, y_pred)
val_precision = precision_score(y_val, y_pred)
val_recall = recall_score(y_val, y_pred)
val_f1 = f1_score(y_val, y_pred)

# ROC AUC should be computed from predicted probabilities, not hard labels
y_proba = best_model.predict_proba(X_val)[:, 1]
val_auc = roc_auc_score(y_val, y_proba)

Cross validation

In [ ]:
from sklearn.model_selection import cross_val_score

# Set up k-fold cross-validation
k = 5  # 5 or 10 folds are common choices
kf = KFold(n_splits=k, shuffle=True, random_state=42)

# Any sklearn-compatible estimator works here (Random Forest as an example)
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(random_state=42)

# Perform cross-validation ('accuracy' suits classification; use a scorer
# such as 'neg_root_mean_squared_error' for regression)
cv_scores = cross_val_score(model, X, y, cv=kf, scoring='accuracy')

# Print results
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean CV score: {np.mean(cv_scores):.4f}")
print(f"Standard deviation: {np.std(cv_scores):.4f}")

SHAP¶

In [ ]:
# TreeExplainer covers tree-based models (XGBoost, Random Forest, ...)
explainer = shap.TreeExplainer(model)

shap_values = explainer.shap_values(X_val)

shap.summary_plot(shap_values, X_val)

# Force plots are interactive; initialize SHAP's JS renderer in the notebook first
shap.initjs()
shap.force_plot(explainer.expected_value, shap_values[0], X_val.iloc[0])
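
With classifiers, older SHAP releases return shap_values as a list holding one array per class; a minimal sketch for a binary model that plots the positive class:

In [ ]:
# Binary classifier: index 1 is the positive class
if isinstance(shap_values, list):
    shap.summary_plot(shap_values[1], X_val)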