⭐ 1. Introduction & Overview¶
Your Goal: Predict the likelihood of accidents on different types of roads.
🔹 2. Import Libraries & Set Up¶
In [1]:
# !/usr/bin/env python3
# -*- coding: utf-8 -*-
# =====================
# General utilities
# =====================
import json
import os
import pickle
import time
from collections import Counter
import scipy
# =====================
# Data handling & processing
# =====================
import numpy as np
import pandas as pd
from tqdm import tqdm
# =====================
# Visualization
# =====================
import matplotlib.pyplot as plt
import seaborn as sns
# =====================
# Machine Learning - Core scikit-learn
# =====================
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, ElasticNet
from sklearn.metrics import (
accuracy_score, f1_score, precision_score, recall_score,
mean_absolute_error, mean_squared_error, r2_score,
root_mean_squared_error, roc_auc_score
)
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler, RobustScaler
from sklearn.svm import SVC, SVR
# =====================
# Machine Learning - Tree Boosting & advanced
# =====================
import xgboost as xg
import lightgbm as lgb
import catboost
# =====================
# Deep Learning - TensorFlow / Keras
# =====================
import tensorflow as tf
from keras import regularizers
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import Dense, Dropout
from keras.models import Sequential
from keras.optimizers import Adam
# =====================
# Deep Learning - PyTorch
# =====================
# import torch
# import torch.nn as nn
# import torch.nn.functional as F
# import torch.optim as optim
# =====================
# Imbalanced data handling
# =====================
from imblearn.over_sampling import SMOTE
# =====================
# Optimization / AutoML
# =====================
import optuna
# =====================
# Feature importance & explainability
# =====================
import shap
# =====================
# Self Made Utilities
# =====================
from utils import *
# =====================
# Settings & reproducibility
# =====================
import warnings
warnings.filterwarnings("ignore")
SEED = 42
np.random.seed(SEED)
print("Libraries successfully loaded. Ready to go!")
WARNING:tensorflow:From c:\Users\robkr\anaconda3\envs\kaggle\lib\site-packages\keras\src\losses.py:2976: The name tf.losses.sparse_softmax_cross_entropy is deprecated. Please use tf.compat.v1.losses.sparse_softmax_cross_entropy instead. Libraries successfully loaded. Ready to go!
In [2]:
train = pd.read_csv('train.csv')
train = train.drop(columns=['id'])
test = pd.read_csv('test.csv')
orig = pd.read_csv('original.csv')
orig = orig[train.columns]
complete = pd.concat([train, orig], axis=0).reset_index(drop=True)
In [3]:
train.head()
Out[3]:
| road_type | num_lanes | curvature | speed_limit | lighting | weather | road_signs_present | public_road | time_of_day | holiday | school_season | num_reported_accidents | accident_risk | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | urban | 2 | 0.06 | 35 | daylight | rainy | False | True | afternoon | False | True | 1 | 0.13 |
| 1 | urban | 4 | 0.99 | 35 | daylight | clear | True | False | evening | True | True | 0 | 0.35 |
| 2 | rural | 4 | 0.63 | 70 | dim | clear | False | True | morning | True | False | 2 | 0.30 |
| 3 | highway | 4 | 0.07 | 35 | dim | rainy | True | True | morning | False | False | 1 | 0.21 |
| 4 | rural | 1 | 0.58 | 60 | daylight | foggy | False | False | evening | True | False | 1 | 0.56 |
In [4]:
numerical_features = train.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = train.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()
features = train.columns.tolist()
target_col = 'accident_risk'
In [5]:
print("Numerical features:", numerical_features)
print("Categorical features:", categorical_features)
Numerical features: ['num_lanes', 'curvature', 'speed_limit', 'num_reported_accidents', 'accident_risk'] Categorical features: ['road_type', 'lighting', 'weather', 'road_signs_present', 'public_road', 'time_of_day', 'holiday', 'school_season']
🔹 3. Data Exploration¶
In [6]:
plot_cats(train)
In [7]:
plot_nums(train)
In [8]:
def plot_bool(dataframe):
bool_features = dataframe.select_dtypes(include=['bool']).columns
num_features = len(bool_features)
n_cols = 3
n_rows = (num_features + n_cols - 1) // n_cols
plt.figure(figsize=(5 * n_cols, 4 * n_rows))
for i, feature in enumerate(bool_features):
plt.subplot(n_rows, n_cols, i + 1)
sns.countplot(x=feature, data=dataframe)
plt.title(f'Count Plot of {feature}')
plt.xlabel(feature)
plt.ylabel('Count')
plt.tight_layout()
plt.show()
In [9]:
plot_bool(train)
Good distributions among all of these, highly balanced
🔹 4. Feature Engineering¶
In [10]:
train.head(1)
Out[10]:
| road_type | num_lanes | curvature | speed_limit | lighting | weather | road_signs_present | public_road | time_of_day | holiday | school_season | num_reported_accidents | accident_risk | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | urban | 2 | 0.06 | 35 | daylight | rainy | False | True | afternoon | False | True | 1 | 0.13 |
def create_accident_risk_features(df, remove_correlated=True, threshold=0.85): """ Create engineered features for accident risk prediction while reducing multicollinearity by dropping highly correlated ones. """ df = df.copy()
# ===== INTERACTIONS =====
df['speed_curvature_interaction'] = df['speed_limit'] * df['curvature']
df['poor_visibility'] = ((df['weather'].isin(['rainy', 'foggy', 'snowy'])) &
(df['lighting'].isin(['dim', 'dark']))).astype(int)
df['speed_weather_risk'] = df['speed_limit'] * df['poor_visibility']
df['urban_complexity'] = ((df['road_type'] == 'urban') &
(df['num_lanes'] >= 3) &
(df['curvature'] > 0.5)).astype(int)
df['urban_rush'] = ((df['road_type'] == 'urban') &
(df['time_of_day'].isin(['morning', 'evening']))).astype(int)
# ===== WEATHER & LIGHTING =====
weather_severity_map = {'clear': 0, 'cloudy': 1, 'rainy': 2, 'foggy': 3, 'snowy': 3}
df['weather_severity'] = df['weather'].map(weather_severity_map)
df['adverse_weather'] = (df['weather_severity'] >= 2).astype(int)
lighting_score_map = {'daylight': 0, 'dim': 1, 'dark': 2}
df['lighting_score'] = df['lighting'].map(lighting_score_map)
df['poor_lighting'] = (df['lighting_score'] >= 1).astype(int)
# ===== TIME FEATURES =====
df['is_rush_hour'] = df['time_of_day'].isin(['morning', 'evening']).astype(int)
df['is_night'] = (df['time_of_day'] == 'night').astype(int)
df['school_traffic'] = (df['school_season'] &
df['time_of_day'].isin(['morning', 'afternoon'])).astype(int)
# ===== ROAD FEATURES =====
df['high_curvature'] = (df['curvature'] > df['curvature'].quantile(0.75)).astype(int)
df['high_speed'] = (df['speed_limit'] >= 60).astype(int)
df['narrow_road'] = (df['num_lanes'] <= 2).astype(int)
# Encode road type (avoid full one-hot -> prevents collinearity)
df['is_highway'] = (df['road_type'] == 'highway').astype(int)
df['is_urban'] = (df['road_type'] == 'urban').astype(int)
# skip 'is_rural' — redundant
# ===== SAFETY =====
df['safety_infrastructure'] = df['road_signs_present'].astype(int) + df['public_road'].astype(int)
# Skip no_road_signs/private_road (negations = collinear)
# ===== NONLINEAR =====
df['speed_limit_squared'] = df['speed_limit'] ** 2
df['curvature_squared'] = df['curvature'] ** 2
# ===== RATIOS =====
df['speed_to_curvature'] = df['speed_limit'] / (df['curvature'] + 0.01)
df['lanes_to_accidents'] = df['num_lanes'] / (df['num_reported_accidents'] + 1)
# ===== OPTIONAL: remove correlated numeric features =====
if remove_correlated:
num_df = df.select_dtypes(include=[np.number])
corr_matrix = num_df.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
df.drop(columns=to_drop, inplace=True)
print(f"Removed {len(to_drop)} highly correlated features: {to_drop}")
return df
full = pd.concat([train, orig], axis=0).reset_index(drop=True) full = create_accident_risk_features(full)
test = create_accident_risk_features(test)
0.0556 with lots of features
In [11]:
# FEATURE ENGINEERING
def f(X):
return \
0.3 * X["curvature"] + \
0.2 * (X["lighting"] == "night").astype(int) + \
0.1 * (X["weather"] != "clear").astype(int) + \
0.2 * (X["speed_limit"] >= 60).astype(int) + \
0.1 * (X["num_reported_accidents"] > 2).astype(int)
def clip(f):
def clip_f(X):
sigma = 0.05
mu = f(X)
a, b = -mu/sigma, (1-mu)/sigma
Phi_a, Phi_b = scipy.stats.norm.cdf(a), scipy.stats.norm.cdf(b)
phi_a, phi_b = scipy.stats.norm.pdf(a), scipy.stats.norm.pdf(b)
return mu*(Phi_b-Phi_a)+sigma*(phi_a-phi_b)+1-Phi_b
return clip_f
tr = clip(f)(train)
ori = clip(f)(orig)
te = clip(f)(test)
train['score'] = tr
test['score'] = te
In [12]:
train.head()
Out[12]:
| road_type | num_lanes | curvature | speed_limit | lighting | weather | road_signs_present | public_road | time_of_day | holiday | school_season | num_reported_accidents | accident_risk | score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | urban | 2 | 0.06 | 35 | daylight | rainy | False | True | afternoon | False | True | 1 | 0.13 | 0.118153 |
| 1 | urban | 4 | 0.99 | 35 | daylight | clear | True | False | evening | True | True | 0 | 0.35 | 0.297000 |
| 2 | rural | 4 | 0.63 | 70 | dim | clear | False | True | morning | True | False | 2 | 0.30 | 0.389000 |
| 3 | highway | 4 | 0.07 | 35 | dim | rainy | True | True | morning | False | False | 1 | 0.21 | 0.121128 |
| 4 | rural | 1 | 0.58 | 60 | daylight | foggy | False | False | evening | True | False | 1 | 0.56 | 0.474000 |
In [13]:
# GET CAT COLS
categorical_cols = train.select_dtypes(include = 'object').columns
encoders = {}
# DO ENCODE FOR EACH CATEGORICAL COLS
for col in categorical_cols:
train[col], uniques = train[col].factorize()
encoders[col] = dict(zip(uniques, range(len(uniques))))
test[col] = test[col].map(encoders[col])
train.head(4)
Out[13]:
| road_type | num_lanes | curvature | speed_limit | lighting | weather | road_signs_present | public_road | time_of_day | holiday | school_season | num_reported_accidents | accident_risk | score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 2 | 0.06 | 35 | 0 | 0 | False | True | 0 | False | True | 1 | 0.13 | 0.118153 |
| 1 | 0 | 4 | 0.99 | 35 | 0 | 1 | True | False | 1 | True | True | 0 | 0.35 | 0.297000 |
| 2 | 1 | 4 | 0.63 | 70 | 1 | 1 | False | True | 2 | True | False | 2 | 0.30 | 0.389000 |
| 3 | 2 | 4 | 0.07 | 35 | 1 | 0 | True | True | 2 | False | False | 1 | 0.21 | 0.121128 |
🔹 5. Model Testing and Submission¶
In [14]:
X = full.drop(columns=['accident_risk'], axis=1)
y = full['accident_risk']
X_test = test.drop(columns=['id'], axis=1)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[14], line 1 ----> 1 X = full.drop(columns=['accident_risk'], axis=1) 2 y = full['accident_risk'] 4 X_test = test.drop(columns=['id'], axis=1) NameError: name 'full' is not defined
In [ ]:
n_splits = 10
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
y_probs_lgbm = np.zeros(len(X_test))
models = []
for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
print(f"Training fold {fold + 1}/{n_splits} >>>")
X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]
lightgbm = lgb.LGBMRegressor(
n_estimators=20000,
learning_rate=0.06,
num_leaves=100,
max_depth=10,
min_child_samples=9,
subsample=0.8,
colsample_bytree=0.5,
reg_alpha=0.78,
reg_lambda=3.0,
random_state=42,
verbosity=-1,
device="gpu",
gpu_platform_id=0,
gpu_device_id=0,
predict_disable_shape_check=True
)
lightgbm.fit(
X_train,
y_train,
eval_set=[(X_val, y_val)],
callbacks=[
lgb.early_stopping(100),
lgb.log_evaluation(period=500)
]
)
models.append(lightgbm)
# Average predictions across all folds
y_probs_lgbm += lightgbm.predict(X_test) / n_splits
best_rmse = root_mean_squared_error(y, lightgbm.predict(X))
print(f"\nBest RMSE: {best_rmse:.4f}")
Training fold 1/10 >>> Training until validation scores don't improve for 100 rounds Early stopping, best iteration is: [356] valid_0's l2: 0.00321422 Training fold 2/10 >>> Training until validation scores don't improve for 100 rounds Early stopping, best iteration is: [274] valid_0's l2: 0.00316808 Training fold 3/10 >>> Training until validation scores don't improve for 100 rounds Early stopping, best iteration is: [317] valid_0's l2: 0.00311866 Training fold 4/10 >>> Training until validation scores don't improve for 100 rounds Early stopping, best iteration is: [369] valid_0's l2: 0.00313769 Training fold 5/10 >>> Training until validation scores don't improve for 100 rounds Early stopping, best iteration is: [257] valid_0's l2: 0.00314865 Training fold 6/10 >>> Training until validation scores don't improve for 100 rounds Early stopping, best iteration is: [352] valid_0's l2: 0.00310538 Training fold 7/10 >>> Training until validation scores don't improve for 100 rounds Early stopping, best iteration is: [329] valid_0's l2: 0.00309887 Training fold 8/10 >>> Training until validation scores don't improve for 100 rounds Early stopping, best iteration is: [227] valid_0's l2: 0.00316374 Training fold 9/10 >>> Training until validation scores don't improve for 100 rounds Early stopping, best iteration is: [341] valid_0's l2: 0.00311398 Training fold 10/10 >>> Training until validation scores don't improve for 100 rounds Early stopping, best iteration is: [335] valid_0's l2: 0.00310523 Best RMSE: 0.0556
In [ ]:
output_lgbm = pd.DataFrame({
'id': test.id,
'accident_risk': y_probs_lgbm
})
output_lgbm.to_csv('attempt-lightgbm1.csv', index=False)
print("Your submission was successfully saved!")
Your submission was successfully saved!
In [ ]:
output_lgbm.head()
Out[ ]:
| id | accident_risk | |
|---|---|---|
| 0 | 517754 | 0.294408 |
| 1 | 517755 | 0.119755 |
| 2 | 517756 | 0.180206 |
| 3 | 517757 | 0.338537 |
| 4 | 517758 | 0.382397 |
In [27]:
import pandas as pd
import numpy as np
# Step 1: Load all submissions
submissions = {
'my_best': pd.read_csv("my_best1.csv"),
'best1': pd.read_csv("best2.csv"),
'best2': pd.read_csv("best3.csv"),
'best3': pd.read_csv("best4.csv")
}
# Step 2: Create ensemble dataframe starting with IDs from first submission
ensemble = pd.DataFrame({'id': submissions['my_best']['id']})
# Collect all predictions
for name, df in submissions.items():
ensemble[name] = df.set_index('id')['accident_risk'].reindex(ensemble['id']).values
# Step 3: Blend the predictions
# Option 1: Simple average (equal weights)
ensemble["accident_risk"] = ensemble[['my_best', 'best1', 'best2', 'best3']].mean(axis=1)
# Option 2: Weighted average (adjust weights as needed)
# weights = [0.4, 0.2, 0.2, 0.2] # Emphasize my_best
# ensemble["accident_risk"] = np.average(
# ensemble[['my_best', 'best1', 'best2', 'best3']].values,
# weights=weights,
# axis=1
# )
# Option 3: If you want to use the 1.05/-0.05 style weighting for just 2 submissions:
# ensemble["accident_risk"] = (
# 1.05 * ensemble["my_best"] +
# -0.05 * ensemble["best1"]
# )
# Step 4: Save final blended submission
final = ensemble[["id", "accident_risk"]]
final.to_csv("submission-blender.csv", index=False)
In [28]:
final.head()
Out[28]:
| id | accident_risk | |
|---|---|---|
| 0 | 517754 | 0.294976 |
| 1 | 517755 | 0.120976 |
| 2 | 517756 | 0.182933 |
| 3 | 517757 | 0.311282 |
| 4 | 517758 | 0.399501 |