#10¶

Kaggle competition: [link]

Entry by Robin R.P.M. Kras

robkras.com

⭐ 1. Introduction & Overview¶

Your Goal: Predict the likelihood of accidents on different types of roads.

🔹 2. Import Libraries & Set Up¶

In [1]:
# !/usr/bin/env python3
# -*- coding: utf-8 -*-

# =====================
# General utilities
# =====================
import json
import os
import pickle
import time
from collections import Counter
import scipy

# =====================
# Data handling & processing
# =====================
import numpy as np
import pandas as pd
from tqdm import tqdm

# =====================
# Visualization
# =====================
import matplotlib.pyplot as plt
import seaborn as sns

# =====================
# Machine Learning - Core scikit-learn
# =====================
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, ElasticNet
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score,
    mean_absolute_error, mean_squared_error, r2_score,
    root_mean_squared_error, roc_auc_score
)
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler, RobustScaler
from sklearn.svm import SVC, SVR

# =====================
# Machine Learning - Tree Boosting & advanced
# =====================
import xgboost as xg
import lightgbm as lgb
import catboost

# =====================
# Deep Learning - TensorFlow / Keras
# =====================
import tensorflow as tf
from keras import regularizers
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import Dense, Dropout
from keras.models import Sequential
from keras.optimizers import Adam

# =====================
# Deep Learning - PyTorch
# =====================
# import torch
# import torch.nn as nn
# import torch.nn.functional as F
# import torch.optim as optim

# =====================
# Imbalanced data handling
# =====================
from imblearn.over_sampling import SMOTE

# =====================
# Optimization / AutoML
# =====================
import optuna

# =====================
# Feature importance & explainability
# =====================
import shap

# =====================
# Self Made Utilities
# =====================
from utils import *

# =====================
# Settings & reproducibility
# =====================
import warnings
warnings.filterwarnings("ignore")

SEED = 42
np.random.seed(SEED)

print("Libraries successfully loaded. Ready to go!")
WARNING:tensorflow:From c:\Users\robkr\anaconda3\envs\kaggle\lib\site-packages\keras\src\losses.py:2976: The name tf.losses.sparse_softmax_cross_entropy is deprecated. Please use tf.compat.v1.losses.sparse_softmax_cross_entropy instead.

Libraries successfully loaded. Ready to go!
In [2]:
train = pd.read_csv('train.csv')
train = train.drop(columns=['id'])
test = pd.read_csv('test.csv')
orig = pd.read_csv('original.csv')
orig = orig[train.columns]

complete = pd.concat([train, orig], axis=0).reset_index(drop=True)
In [3]:
train.head()
Out[3]:
road_type num_lanes curvature speed_limit lighting weather road_signs_present public_road time_of_day holiday school_season num_reported_accidents accident_risk
0 urban 2 0.06 35 daylight rainy False True afternoon False True 1 0.13
1 urban 4 0.99 35 daylight clear True False evening True True 0 0.35
2 rural 4 0.63 70 dim clear False True morning True False 2 0.30
3 highway 4 0.07 35 dim rainy True True morning False False 1 0.21
4 rural 1 0.58 60 daylight foggy False False evening True False 1 0.56
In [4]:
numerical_features = train.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = train.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()
features = train.columns.tolist()

target_col = 'accident_risk'
In [5]:
print("Numerical features:", numerical_features)
print("Categorical features:", categorical_features)
Numerical features: ['num_lanes', 'curvature', 'speed_limit', 'num_reported_accidents', 'accident_risk']
Categorical features: ['road_type', 'lighting', 'weather', 'road_signs_present', 'public_road', 'time_of_day', 'holiday', 'school_season']

🔹 3. Data Exploration¶

In [6]:
plot_cats(train)
No description has been provided for this image
In [7]:
plot_nums(train)
No description has been provided for this image
In [8]:
def plot_bool(dataframe):
    bool_features = dataframe.select_dtypes(include=['bool']).columns
    num_features = len(bool_features)
    n_cols = 3
    n_rows = (num_features + n_cols - 1) // n_cols

    plt.figure(figsize=(5 * n_cols, 4 * n_rows))

    for i, feature in enumerate(bool_features):
        plt.subplot(n_rows, n_cols, i + 1)
        sns.countplot(x=feature, data=dataframe)
        plt.title(f'Count Plot of {feature}')
        plt.xlabel(feature)
        plt.ylabel('Count')

    plt.tight_layout()
    plt.show()
In [9]:
plot_bool(train)
No description has been provided for this image

Good distributions among all of these, highly balanced

🔹 4. Feature Engineering¶

In [10]:
train.head(1)
Out[10]:
road_type num_lanes curvature speed_limit lighting weather road_signs_present public_road time_of_day holiday school_season num_reported_accidents accident_risk
0 urban 2 0.06 35 daylight rainy False True afternoon False True 1 0.13

def create_accident_risk_features(df, remove_correlated=True, threshold=0.85): """ Create engineered features for accident risk prediction while reducing multicollinearity by dropping highly correlated ones. """ df = df.copy()

# ===== INTERACTIONS =====
df['speed_curvature_interaction'] = df['speed_limit'] * df['curvature']
df['poor_visibility'] = ((df['weather'].isin(['rainy', 'foggy', 'snowy'])) &
                         (df['lighting'].isin(['dim', 'dark']))).astype(int)
df['speed_weather_risk'] = df['speed_limit'] * df['poor_visibility']
df['urban_complexity'] = ((df['road_type'] == 'urban') &
                          (df['num_lanes'] >= 3) &
                          (df['curvature'] > 0.5)).astype(int)
df['urban_rush'] = ((df['road_type'] == 'urban') &
                    (df['time_of_day'].isin(['morning', 'evening']))).astype(int)

# ===== WEATHER & LIGHTING =====
weather_severity_map = {'clear': 0, 'cloudy': 1, 'rainy': 2, 'foggy': 3, 'snowy': 3}
df['weather_severity'] = df['weather'].map(weather_severity_map)
df['adverse_weather'] = (df['weather_severity'] >= 2).astype(int)

lighting_score_map = {'daylight': 0, 'dim': 1, 'dark': 2}
df['lighting_score'] = df['lighting'].map(lighting_score_map)
df['poor_lighting'] = (df['lighting_score'] >= 1).astype(int)

# ===== TIME FEATURES =====
df['is_rush_hour'] = df['time_of_day'].isin(['morning', 'evening']).astype(int)
df['is_night'] = (df['time_of_day'] == 'night').astype(int)
df['school_traffic'] = (df['school_season'] & 
                        df['time_of_day'].isin(['morning', 'afternoon'])).astype(int)

# ===== ROAD FEATURES =====
df['high_curvature'] = (df['curvature'] > df['curvature'].quantile(0.75)).astype(int)
df['high_speed'] = (df['speed_limit'] >= 60).astype(int)
df['narrow_road'] = (df['num_lanes'] <= 2).astype(int)

# Encode road type (avoid full one-hot -> prevents collinearity)
df['is_highway'] = (df['road_type'] == 'highway').astype(int)
df['is_urban'] = (df['road_type'] == 'urban').astype(int)
# skip 'is_rural' — redundant

# ===== SAFETY =====
df['safety_infrastructure'] = df['road_signs_present'].astype(int) + df['public_road'].astype(int)
# Skip no_road_signs/private_road (negations = collinear)

# ===== NONLINEAR =====
df['speed_limit_squared'] = df['speed_limit'] ** 2
df['curvature_squared'] = df['curvature'] ** 2

# ===== RATIOS =====
df['speed_to_curvature'] = df['speed_limit'] / (df['curvature'] + 0.01)
df['lanes_to_accidents'] = df['num_lanes'] / (df['num_reported_accidents'] + 1)

# ===== OPTIONAL: remove correlated numeric features =====
if remove_correlated:
    num_df = df.select_dtypes(include=[np.number])
    corr_matrix = num_df.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
    df.drop(columns=to_drop, inplace=True)
    print(f"Removed {len(to_drop)} highly correlated features: {to_drop}")

return df

full = pd.concat([train, orig], axis=0).reset_index(drop=True) full = create_accident_risk_features(full)

test = create_accident_risk_features(test)

0.0556 with lots of features

In [11]:
# FEATURE ENGINEERING
def f(X):
    return \
    0.3 * X["curvature"] + \
    0.2 * (X["lighting"] == "night").astype(int) + \
    0.1 * (X["weather"] != "clear").astype(int) + \
    0.2 * (X["speed_limit"] >= 60).astype(int) + \
    0.1 * (X["num_reported_accidents"] > 2).astype(int)

def clip(f):
    def clip_f(X):
        sigma = 0.05
        mu = f(X)
        a, b = -mu/sigma, (1-mu)/sigma
        Phi_a, Phi_b = scipy.stats.norm.cdf(a), scipy.stats.norm.cdf(b)
        phi_a, phi_b = scipy.stats.norm.pdf(a), scipy.stats.norm.pdf(b)
        return mu*(Phi_b-Phi_a)+sigma*(phi_a-phi_b)+1-Phi_b
    return clip_f

tr = clip(f)(train)
ori = clip(f)(orig)
te = clip(f)(test)

train['score'] = tr
test['score'] = te
In [12]:
train.head()
Out[12]:
road_type num_lanes curvature speed_limit lighting weather road_signs_present public_road time_of_day holiday school_season num_reported_accidents accident_risk score
0 urban 2 0.06 35 daylight rainy False True afternoon False True 1 0.13 0.118153
1 urban 4 0.99 35 daylight clear True False evening True True 0 0.35 0.297000
2 rural 4 0.63 70 dim clear False True morning True False 2 0.30 0.389000
3 highway 4 0.07 35 dim rainy True True morning False False 1 0.21 0.121128
4 rural 1 0.58 60 daylight foggy False False evening True False 1 0.56 0.474000
In [13]:
# GET CAT COLS
categorical_cols = train.select_dtypes(include = 'object').columns

encoders = {}

# DO ENCODE FOR EACH CATEGORICAL COLS
for col in categorical_cols:
    train[col], uniques = train[col].factorize()
    encoders[col] = dict(zip(uniques, range(len(uniques))))

    test[col] = test[col].map(encoders[col])

train.head(4)
Out[13]:
road_type num_lanes curvature speed_limit lighting weather road_signs_present public_road time_of_day holiday school_season num_reported_accidents accident_risk score
0 0 2 0.06 35 0 0 False True 0 False True 1 0.13 0.118153
1 0 4 0.99 35 0 1 True False 1 True True 0 0.35 0.297000
2 1 4 0.63 70 1 1 False True 2 True False 2 0.30 0.389000
3 2 4 0.07 35 1 0 True True 2 False False 1 0.21 0.121128

🔹 5. Model Testing and Submission¶

In [14]:
X = full.drop(columns=['accident_risk'], axis=1)
y = full['accident_risk']

X_test = test.drop(columns=['id'], axis=1)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[14], line 1
----> 1 X = full.drop(columns=['accident_risk'], axis=1)
      2 y = full['accident_risk']
      4 X_test = test.drop(columns=['id'], axis=1)

NameError: name 'full' is not defined
In [ ]:
n_splits = 10
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
y_probs_lgbm = np.zeros(len(X_test))
models = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    print(f"Training fold {fold + 1}/{n_splits} >>>")
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    lightgbm = lgb.LGBMRegressor(
        n_estimators=20000,
        learning_rate=0.06,
        num_leaves=100,
        max_depth=10,
        min_child_samples=9,
        subsample=0.8,
        colsample_bytree=0.5,
        reg_alpha=0.78,
        reg_lambda=3.0,
        random_state=42,
        verbosity=-1,
        device="gpu",
        gpu_platform_id=0,
        gpu_device_id=0,
        predict_disable_shape_check=True
    )
    
    lightgbm.fit(
        X_train, 
        y_train, 
        eval_set=[(X_val, y_val)], 
        callbacks=[
            lgb.early_stopping(100),
            lgb.log_evaluation(period=500)
        ]
    )

    models.append(lightgbm)
    
    # Average predictions across all folds
    y_probs_lgbm += lightgbm.predict(X_test) / n_splits

best_rmse = root_mean_squared_error(y, lightgbm.predict(X))
print(f"\nBest RMSE: {best_rmse:.4f}")
Training fold 1/10 >>>
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[356]	valid_0's l2: 0.00321422
Training fold 2/10 >>>
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[274]	valid_0's l2: 0.00316808
Training fold 3/10 >>>
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[317]	valid_0's l2: 0.00311866
Training fold 4/10 >>>
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[369]	valid_0's l2: 0.00313769
Training fold 5/10 >>>
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[257]	valid_0's l2: 0.00314865
Training fold 6/10 >>>
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[352]	valid_0's l2: 0.00310538
Training fold 7/10 >>>
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[329]	valid_0's l2: 0.00309887
Training fold 8/10 >>>
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[227]	valid_0's l2: 0.00316374
Training fold 9/10 >>>
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[341]	valid_0's l2: 0.00311398
Training fold 10/10 >>>
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[335]	valid_0's l2: 0.00310523

Best RMSE: 0.0556
In [ ]:
output_lgbm = pd.DataFrame({
    'id': test.id,
    'accident_risk': y_probs_lgbm
})

output_lgbm.to_csv('attempt-lightgbm1.csv', index=False)
print("Your submission was successfully saved!")
Your submission was successfully saved!
In [ ]:
output_lgbm.head()
Out[ ]:
id accident_risk
0 517754 0.294408
1 517755 0.119755
2 517756 0.180206
3 517757 0.338537
4 517758 0.382397
In [27]:
import pandas as pd
import numpy as np

# Step 1: Load all submissions
submissions = {
    'my_best': pd.read_csv("my_best1.csv"),
    'best1': pd.read_csv("best2.csv"),
    'best2': pd.read_csv("best3.csv"),
    'best3': pd.read_csv("best4.csv")
}

# Step 2: Create ensemble dataframe starting with IDs from first submission
ensemble = pd.DataFrame({'id': submissions['my_best']['id']})

# Collect all predictions
for name, df in submissions.items():
    ensemble[name] = df.set_index('id')['accident_risk'].reindex(ensemble['id']).values

# Step 3: Blend the predictions
# Option 1: Simple average (equal weights)
ensemble["accident_risk"] = ensemble[['my_best', 'best1', 'best2', 'best3']].mean(axis=1)

# Option 2: Weighted average (adjust weights as needed)
# weights = [0.4, 0.2, 0.2, 0.2]  # Emphasize my_best
# ensemble["accident_risk"] = np.average(
#     ensemble[['my_best', 'best1', 'best2', 'best3']].values,
#     weights=weights,
#     axis=1
# )

# Option 3: If you want to use the 1.05/-0.05 style weighting for just 2 submissions:
# ensemble["accident_risk"] = (
#     1.05 * ensemble["my_best"] +
#     -0.05 * ensemble["best1"]
# )

# Step 4: Save final blended submission
final = ensemble[["id", "accident_risk"]]
final.to_csv("submission-blender.csv", index=False)
In [28]:
final.head()
Out[28]:
id accident_risk
0 517754 0.294976
1 517755 0.120976
2 517756 0.182933
3 517757 0.311282
4 517758 0.399501