Guide file¶
Robin R.P.M. Kras
Start of Kaggle notebook¶
Libraries¶
In [ ]:
# General
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
# Machine Learning
import xgboost as xg
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import accuracy_score, f1_score, recall_score, mean_absolute_error, mean_squared_error, r2_score, root_mean_squared_error, roc_auc_score
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from imblearn.over_sampling import SMOTE
# Feature Importance & Explainability
import shap
# Settings
import warnings
warnings.filterwarnings("ignore")
# Set random seeds for reproducibility.
# Seeding NumPy alone does not cover TensorFlow, which keeps its own global
# generator, so both libraries are seeded from the same constant.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)
print("Libraries loaded. Ready to go!")
Quick Experimenting¶
In [ ]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.metrics import root_mean_squared_error
import numpy as np

# Out-of-fold (OOF) cross-validation template for a regression model.
# Assumes X (features) and y (target) were defined in an earlier cell and are
# pandas objects, since they are indexed with .iloc -- TODO confirm.

# Estimator class to evaluate. LinearRegression is only a runnable baseline:
# swap in any class whose instances expose fit/predict. (The previous
# placeholder was a string literal, which cannot be called.)
model_cls = LinearRegression

kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Size the OOF buffer and build the folds from X itself so the positions
# produced by kf.split always match the rows selected via X.iloc / y.iloc.
oof_predictions = np.zeros(len(X))

for train_idx, val_idx in kf.split(X):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # A fresh, unfitted model per fold so nothing leaks between folds.
    model = model_cls()
    model.fit(X_train, y_train)

    y_pred = model.predict(X_val)
    oof_predictions[val_idx] = y_pred  # Store out-of-fold predictions

    # Change RMSE by accuracy/recall/f1-score for classification
    print(f"Fold RMSE: {root_mean_squared_error(y_val, y_pred)}")

# Score all rows at once: every row was predicted exactly once, by the model
# that did not see it during training.
final_rmse = root_mean_squared_error(y, oof_predictions)
print(f"Final Cross-Validation RMSE: {final_rmse}")
Quick data visualization¶
Histograms for numerical features
In [ ]:
# Draw one histogram (with a KDE overlay) per float64/int64 column of `train`,
# arranged on a grid with `cols_per_row` plots per row.
float_cols = [
    col
    for col in train.columns
    if train[col].dtype == "float64" or train[col].dtype == "int64"
]

cols_per_row = 3
num_plots = len(float_cols)
rows = -(-num_plots // cols_per_row)  # ceiling division

if num_plots == 0:
    # A grid with zero rows cannot be created, so skip plotting entirely.
    print("No float64/int64 columns found in train; skipping histograms.")
else:
    fig, axes = plt.subplots(rows, cols_per_row, figsize=(15, 5 * rows))
    axes = axes.flatten()

    for ax, col in zip(axes, float_cols):
        sns.histplot(train[col], bins=50, kde=True, ax=ax)
        ax.set_title(f"Distribution of {col}")

    # Drop the unused axes of the last row. Slicing by the number of plots
    # avoids depending on a loop variable surviving the loop above.
    for ax in axes[num_plots:]:
        fig.delaxes(ax)

    plt.tight_layout()
    plt.show()
Pie charts
In [ ]:
# Draw one pie chart of value frequencies per object-dtype column of `train`,
# arranged on a grid with `cols` charts per row.
categorical_features = train.select_dtypes(include=['object']).columns

num_features = len(categorical_features)
cols = 3
rows = -(-num_features // cols)  # ceiling division

if num_features == 0:
    # A grid with zero rows cannot be created, so skip plotting entirely.
    print("No object columns found in train; skipping pie charts.")
else:
    # Create subplots
    fig, axes = plt.subplots(rows, cols, figsize=(15, rows * 5))
    axes = axes.flatten()

    for ax, feature in zip(axes, categorical_features):
        train[feature].value_counts().plot.pie(
            autopct='%1.1f%%', ax=ax, startangle=90, cmap="viridis"
        )
        ax.set_title(feature)
        ax.set_ylabel("")  # the default y-label only repeats the title

    # Hide any unused subplots. Slicing by the number of charts avoids
    # depending on a loop variable surviving the loop above.
    for ax in axes[num_features:]:
        fig.delaxes(ax)

    plt.tight_layout()
    plt.show()
Heatmap of numerical features
In [ ]:
# Correlation overview of the float64/int64 columns of `train`: list every
# pair of columns whose absolute correlation exceeds `threshold`, and draw the
# full correlation matrix as an annotated heatmap.
threshold = 0.75

heatmap_train = train.select_dtypes(include=["float64", "int64"])
corr_matrix = heatmap_train.corr()

# Keep only the cells strictly above the diagonal, so each pair is listed once
# and a column's correlation with itself is left out.
upper_mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
upper_triangle = corr_matrix.where(upper_mask)

# Long format: one row per remaining (column, column, correlation) cell.
high_corr_pairs = upper_triangle.stack().reset_index()
high_corr_pairs.columns = ["Feature 1", "Feature 2", "Correlation"]

is_strong = high_corr_pairs["Correlation"].abs() > threshold
high_corr_pairs = high_corr_pairs[is_strong]

plt.figure(figsize=(30, 12))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm")
plt.title("Feature Correlation Matrix")
plt.show()

print("Highly correlated feature pairs (above threshold):")
print(high_corr_pairs)
Feature Engineering¶
Create new features from highly correlated features
In [ ]:
import itertools


def create_combination_features(df, features):
    """Add the row-wise mean of every pair of ``features`` as a new column.

    For each unordered pair drawn from ``features``, a column named
    ``"<first>_<second>"`` is written to ``df`` holding the mean of the two
    source columns (``mean(axis=1)``).

    Args:
        df: DataFrame containing every column listed in ``features``. It is
            modified in place.
        features: Iterable of column labels to combine pairwise. Fewer than
            two labels produce no new columns.

    Returns:
        The same ``df`` object, with one extra column per pair.
    """
    for pair in itertools.combinations(features, 2):
        # Labels go through str() so non-string column labels (for example
        # integers) can still be joined into a column name.
        feature_name = "_".join(map(str, pair))
        df[feature_name] = df[list(pair)].mean(axis=1)
    return df
Model Training and Fine Tuning¶
GridSearch
In [ ]:
# Exhaustive hyper-parameter search with 5-fold cross-validation on all cores.
# 'x' and 'y' are placeholder names: replace them with real hyper-parameters
# of the estimator being tuned.
param_grid = dict(x=[1, 2, 3], y=[4, 5, 6])

# NOTE(review): `model` is called here, so it is expected to be an estimator
# class (or factory) rather than an instance -- confirm against earlier cells.
grid_search = GridSearchCV(model(), param_grid, cv=5, n_jobs=-1)

# eval_set is handed to fit() as an extra keyword argument; presumably the
# estimator is XGBoost-style and accepts it -- TODO confirm.
eval_pairs = [(X_train, y_train), (X_val, y_val)]
grid_search.fit(X_train, y_train, eval_set=eval_pairs)

# Keep the winning configuration around for the following cells.
best_params = grid_search.best_params_
best_score = grid_search.best_score_
best_model = grid_search.best_estimator_

print("Best Parameters:", best_params)
Metrics
In [ ]:
# precision_score is not among the names imported in the Libraries cell.
from sklearn.metrics import precision_score

### X_train, X_val, y_train, y_val = train test split ...
# Validation metrics for the tuned classifier `best_model`.
# Assumes a binary classification target: the precision/recall/F1 calls use
# their defaults and the AUC uses the second probability column -- TODO confirm.
y_pred = best_model.predict(X_val)

val_accuracy = accuracy_score(y_val, y_pred)
val_precision = precision_score(y_val, y_pred)
val_recall = recall_score(y_val, y_pred)
val_f1 = f1_score(y_val, y_pred)

# ROC AUC measures how well samples are ranked, so give it a continuous score
# (positive-class probability) instead of hard labels whenever the model can
# produce one; otherwise fall back to the predicted labels as before.
if hasattr(best_model, "predict_proba"):
    y_score = best_model.predict_proba(X_val)[:, 1]
else:
    y_score = y_pred
val_auc = roc_auc_score(y_val, y_score)
Cross validation
In [ ]:
# cross_val_score is not among the names imported in the Libraries cell, so it
# is imported here together with the other names this cell needs.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score

# Set up k-fold cross validation
k = 5  # Common choice is 5 or 10 folds
kf = KFold(n_splits=k, shuffle=True, random_state=42)

# If you're using a sklearn model (for example, Random Forest)
model = RandomForestClassifier(random_state=42)

# Perform cross-validation: one accuracy score per fold.
cv_scores = cross_val_score(model, X, y, cv=kf, scoring='accuracy')

# Print results
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean CV score: {np.mean(cv_scores):.4f}")
print(f"Standard deviation: {np.std(cv_scores):.4f}")
SHAP¶
In [ ]:
# Explain the predictions of `model` on the validation set with SHAP.
# NOTE(review): TreeExplainer presumably needs an already fitted tree-based
# model -- confirm that `model` has been trained before running this cell.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_val)

# Global view: feature contributions across all validation rows.
shap.summary_plot(shap_values, X_val)

# Local view for the first validation row.
# NOTE(review): assumes shap_values[0] is the contribution vector of row 0
# (a single 2-D array of SHAP values) -- verify for the model type in use.
base_value = explainer.expected_value
first_contributions = shap_values[0]
first_row = X_val.iloc[0]
shap.force_plot(base_value, first_contributions, first_row)