Guide file¶

Robin R.P.M. Kras

Start of Kaggle notebook¶

X¶

Kaggle competition: [link]

Entry by Robin R.P.M. Kras

Libraries¶

In [ ]:
# General
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# Machine Learning
import xgboost as xgb

from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    mean_absolute_error, mean_squared_error, root_mean_squared_error, r2_score,
)
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

from imblearn.over_sampling import SMOTE

# Feature Importance & Explainability
import shap

# Settings
import warnings
warnings.filterwarnings("ignore")

# Set random seed for reproducibility
SEED = 42
np.random.seed(SEED)

print("Libraries loaded. Ready to go!")

Quick Experimenting¶

In [ ]:
from sklearn.model_selection import KFold
from sklearn.metrics import root_mean_squared_error
from sklearn.linear_model import Ridge
import numpy as np

# Assumes `train`, `X`, and `y` are already defined
kf = KFold(n_splits=5, shuffle=True, random_state=42)
oof_predictions = np.zeros(len(train))

for train_idx, val_idx in kf.split(train):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model = "MODEL"()
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_val)
    oof_predictions[val_idx] = y_pred  # Store out-of-fold predictions
    
    print(f"Fold RMSE: {root_mean_squared_error(y_val, y_pred)}") # Change RMSE by accuracy/recall/f1-score for classification

final_rmse = root_mean_squared_error(y, oof_predictions)
print(f"Final Cross-Validation RMSE: {final_rmse}")

Quick data visualization¶

Histograms for numerical features

In [ ]:
# All numeric columns (floats and ints)
num_cols = train.select_dtypes(include=["float64", "int64"]).columns

cols_per_row = 3
num_plots = len(num_cols)
rows = (num_plots // cols_per_row) + (num_plots % cols_per_row > 0)

fig, axes = plt.subplots(rows, cols_per_row, figsize=(15, 5 * rows))
axes = axes.flatten()

for idx, col in enumerate(num_cols):
    sns.histplot(train[col], bins=50, kde=True, ax=axes[idx])
    axes[idx].set_title(f"Distribution of {col}")

for i in range(idx + 1, len(axes)):
    fig.delaxes(axes[i])

plt.tight_layout()
plt.show()

Pie charts

In [ ]:
categorical_features = train.select_dtypes(include=['object']).columns

num_features = len(categorical_features)
cols = 3 
rows = (num_features // cols) + (num_features % cols > 0) 

# Create subplots
fig, axes = plt.subplots(rows, cols, figsize=(15, rows * 5)) 
axes = axes.flatten()  

for i, feature in enumerate(categorical_features):
    train[feature].value_counts().plot.pie(
        autopct='%1.1f%%', ax=axes[i], startangle=90, cmap="viridis"
    )
    axes[i].set_title(feature)
    axes[i].set_ylabel("") 

# Hide any unused subplots
for j in range(i + 1, len(axes)):
    fig.delaxes(axes[j])

plt.tight_layout()
plt.show()

Heatmap of numerical features

In [ ]:
heatmap_train = train.select_dtypes(include=["float64", "int64"])

corr_matrix = heatmap_train.corr()

threshold = 0.75

high_corr_pairs = (
    corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)) 
    .stack()  
    .reset_index()
)

high_corr_pairs.columns = ["Feature 1", "Feature 2", "Correlation"]
high_corr_pairs = high_corr_pairs[high_corr_pairs["Correlation"].abs() > threshold]  

plt.figure(figsize=(30, 12))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm")
plt.title("Feature Correlation Matrix")
plt.show()

print("Highly correlated feature pairs (above threshold):")
print(high_corr_pairs)

Feature Engineering¶

Create new features from highly correlated features

In [ ]:
import itertools

def create_combination_features(df, features):
    """Add one pairwise-mean feature for every 2-combination of `features`."""
    for comb in itertools.combinations(features, 2):
        feature_name = "_".join(comb)
        df[feature_name] = df[list(comb)].mean(axis=1)

    return df
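
A minimal usage sketch, assuming the high_corr_pairs table built in the heatmap cell above; apply the same call to test so both frames end up with identical columns:

In [ ]:
# Collect every feature that appears in a highly correlated pair
high_corr_features = sorted(set(high_corr_pairs["Feature 1"]) | set(high_corr_pairs["Feature 2"]))

train = create_combination_features(train, high_corr_features)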

Model Training and Fine Tuning¶

GridSearch

In [ ]:
param_grid = {
    'x': [1, 2, 3],   # placeholder keys -- replace with the estimator's real
    'y': [4, 5, 6]    # hyperparameters, e.g. 'max_depth': [3, 5, 7]
}

# `model` is an unfitted estimator instance, e.g. xgb.XGBRegressor(random_state=SEED)
grid_search = GridSearchCV(model, param_grid, cv=5, n_jobs=-1)

# Fit params such as eval_set are forwarded to the estimator; they only work
# for estimators that accept them (e.g. XGBoost)
grid_search.fit(X_train, y_train,
                eval_set=[(X_train, y_train), (X_val, y_val)])

print("Best Parameters:", grid_search.best_params_)

best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
best_score = grid_search.best_score_

Metrics

In [ ]:
# X_train, X_val, y_train, y_val = train_test_split(X, y, ...)

y_pred = best_model.predict(X_val)

val_accuracy = accuracy_score(y_val, y_pred)
val_precision = precision_score(y_val, y_pred)
val_recall = recall_score(y_val, y_pred)
val_f1 = f1_score(y_val, y_pred)

# ROC AUC should be computed from predicted probabilities, not hard labels
y_proba = best_model.predict_proba(X_val)[:, 1]
val_auc = roc_auc_score(y_val, y_proba)

Cross validation

In [ ]:
from sklearn.model_selection import cross_val_score

# Set up k-fold cross-validation
k = 5  # 5 or 10 folds are common choices
kf = KFold(n_splits=k, shuffle=True, random_state=42)

# Any sklearn-compatible estimator works here (Random Forest as an example)
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(random_state=42)

# Perform cross-validation ('accuracy' suits classification; use a scorer
# such as 'neg_root_mean_squared_error' for regression)
cv_scores = cross_val_score(model, X, y, cv=kf, scoring='accuracy')

# Print results
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean CV score: {np.mean(cv_scores):.4f}")
print(f"Standard deviation: {np.std(cv_scores):.4f}")

SHAP¶

In [ ]:
# TreeExplainer covers tree-based models (XGBoost, Random Forest, ...)
explainer = shap.TreeExplainer(model)

shap_values = explainer.shap_values(X_val)

shap.summary_plot(shap_values, X_val)

# Force plots are interactive; initialize SHAP's JS renderer in the notebook first
shap.initjs()
shap.force_plot(explainer.expected_value, shap_values[0], X_val.iloc[0])
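
With classifiers, older SHAP releases return shap_values as a list holding one array per class; a minimal sketch for a binary model that plots the positive class:

In [ ]:
# Binary classifier: index 1 is the positive class
if isinstance(shap_values, list):
    shap.summary_plot(shap_values[1], X_val)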