#4¶

Kaggle competition: [link]

Entry by Robin R.P.M. Kras

⭐ 1. Introduction & Overview¶

Your Goal: Your goal is to predict rainfall for each day of the year.

šŸ”¹ 2. Import Libraries & Set Up¶

InĀ [3]:
# General
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning
import xgboost as xg

from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error, r2_score, root_mean_squared_error, roc_auc_score
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

from imblearn.over_sampling import SMOTE

# Feature Importance & Explainability
import shap

# Settings
import warnings
warnings.filterwarnings("ignore")

# Set random seed for reproducibility
SEED = 42
np.random.seed(SEED)

print("Libraries loaded. Ready to go!")
Libraries loaded. Ready to go!
c:\Users\robkr\AppData\Local\Programs\Python\Python39\lib\site-packages\tqdm\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm

šŸ”¹ 3. Load & Explore Data¶

InĀ [4]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
InĀ [5]:
train.head()
Out[5]:
id day pressure maxtemp temparature mintemp dewpoint humidity cloud sunshine winddirection windspeed rainfall
0 0 1 1017.4 21.2 20.6 19.9 19.4 87.0 88.0 1.1 60.0 17.2 1
1 1 2 1019.5 16.2 16.9 15.8 15.4 95.0 91.0 0.0 50.0 21.9 1
2 2 3 1024.1 19.4 16.1 14.6 9.3 75.0 47.0 8.3 70.0 18.1 1
3 3 4 1013.4 18.1 17.8 16.9 16.8 95.0 95.0 0.0 60.0 35.6 1
4 4 5 1021.8 21.3 18.4 15.2 9.6 52.0 45.0 3.6 40.0 24.8 0
InĀ [6]:
train.shape
Out[6]:
(2190, 13)
InĀ [7]:
train.isnull().sum()
Out[7]:
id               0
day              0
pressure         0
maxtemp          0
temparature      0
mintemp          0
dewpoint         0
humidity         0
cloud            0
sunshine         0
winddirection    0
windspeed        0
rainfall         0
dtype: int64
InĀ [8]:
# Quick summary of dataset
train.describe()
train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2190 entries, 0 to 2189
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             2190 non-null   int64  
 1   day            2190 non-null   int64  
 2   pressure       2190 non-null   float64
 3   maxtemp        2190 non-null   float64
 4   temparature    2190 non-null   float64
 5   mintemp        2190 non-null   float64
 6   dewpoint       2190 non-null   float64
 7   humidity       2190 non-null   float64
 8   cloud          2190 non-null   float64
 9   sunshine       2190 non-null   float64
 10  winddirection  2190 non-null   float64
 11  windspeed      2190 non-null   float64
 12  rainfall       2190 non-null   int64  
dtypes: float64(10), int64(3)
memory usage: 222.5 KB

šŸ”¹ 4. Data Visualization & EDA¶

InĀ [9]:
float_cols = [col for col in train.columns if train[col].dtype == "float64"]

cols_per_row = 3
num_plots = len(float_cols)
rows = (num_plots // cols_per_row) + (num_plots % cols_per_row > 0) 

fig, axes = plt.subplots(rows, cols_per_row, figsize=(15, 5 * rows)) 
axes = axes.flatten()  

for idx, col in enumerate(float_cols):
    sns.histplot(train[col], bins=50, kde=True, ax=axes[idx])
    axes[idx].set_title(f"Distribution of {col}")

for i in range(idx + 1, len(axes)):
    fig.delaxes(axes[i])

plt.tight_layout()
plt.show()
No description has been provided for this image
InĀ [10]:
heatmap_train = train.select_dtypes(include=["float64", "int64"])

corr_matrix = heatmap_train.corr()

threshold = 0.8

high_corr_pairs = (
    corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)) 
    .stack()  
    .reset_index()
)

high_corr_pairs.columns = ["Feature 1", "Feature 2", "Correlation"]
high_corr_pairs = high_corr_pairs[high_corr_pairs["Correlation"].abs() > threshold]  

plt.figure(figsize=(30, 12))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm")
plt.title("Feature Correlation Matrix")
plt.show()

print("Highly correlated feature pairs (above threshold):")
print(high_corr_pairs)
No description has been provided for this image
Highly correlated feature pairs (above threshold):
      Feature 1    Feature 2  Correlation
23     pressure      maxtemp    -0.800499
24     pressure  temparature    -0.816531
25     pressure      mintemp    -0.814453
26     pressure     dewpoint    -0.817008
33      maxtemp  temparature     0.982932
34      maxtemp      mintemp     0.965529
35      maxtemp     dewpoint     0.906703
42  temparature      mintemp     0.987150
43  temparature     dewpoint     0.933617
50      mintemp     dewpoint     0.941342
68        cloud     sunshine    -0.805128
InĀ [11]:
l1 = high_corr_pairs['Feature 1'].tolist()
l2 = high_corr_pairs['Feature 2'].tolist()
interesting_features = list(set(l1+l2))

print(interesting_features)
['sunshine', 'pressure', 'cloud', 'mintemp', 'dewpoint', 'temparature', 'maxtemp']

šŸ”¹ 5. Feature Engineering¶

InĀ [12]:
train['humidity_cloud_interaction'] = train['humidity'] * train['cloud']
train['humidity_sunshine_interaction'] = train['humidity'] * train['sunshine']
train['cloud_sunshine_ratio'] = train['cloud'] / (train['sunshine'] + 1e-5)
train['relative_dryness'] = 100 - train['humidity']
train['sunshine_percentage'] = train['sunshine'] / (train['sunshine'] + train['cloud'] + 1e-5)
train['weather_index'] = (0.4 * train['humidity']) + (0.3 * train['cloud']) - (0.3 * train['sunshine'])

test['humidity_cloud_interaction'] = test['humidity'] * test['cloud']
test['humidity_sunshine_interaction'] = test['humidity'] * test['sunshine']
test['cloud_sunshine_ratio'] = test['cloud'] / (test['sunshine'] + 1e-5)
test['relative_dryness'] = 100 - test['humidity']
test['sunshine_percentage'] = test['sunshine'] / (test['sunshine'] + test['cloud'] + 1e-5)
test['weather_index'] = (0.4 * test['humidity']) + (0.3 * test['cloud']) - (0.3 * test['sunshine'])
InĀ [13]:
# Test set contains an instance of null
test['winddirection'].fillna(test['winddirection'].median(), inplace=True)
InĀ [14]:
test.head()
Out[14]:
id day pressure maxtemp temparature mintemp dewpoint humidity cloud sunshine winddirection windspeed humidity_cloud_interaction humidity_sunshine_interaction cloud_sunshine_ratio relative_dryness sunshine_percentage weather_index
0 2190 1 1019.5 17.5 15.8 12.7 14.9 96.0 99.0 0.0 50.0 24.3 9504.0 0.0 9.900000e+06 4.0 0.000000 68.10
1 2191 2 1016.5 17.5 16.5 15.8 15.1 97.0 99.0 0.0 50.0 35.3 9603.0 0.0 9.900000e+06 3.0 0.000000 68.50
2 2192 3 1023.9 11.2 10.4 9.4 8.9 86.0 96.0 0.0 40.0 16.9 8256.0 0.0 9.600000e+06 14.0 0.000000 63.20
3 2193 4 1022.9 20.6 17.3 15.2 9.5 75.0 45.0 7.1 20.0 50.6 3375.0 532.5 6.338019e+00 25.0 0.136276 41.37
4 2194 5 1022.2 16.1 13.8 6.4 4.3 68.0 49.0 9.2 20.0 19.4 3332.0 625.6 5.326081e+00 32.0 0.158076 39.14

Experiment¶

InĀ [16]:
X = train.drop(columns=['rainfall'], errors='ignore')
y = train['rainfall']
InĀ [26]:
from sklearn.model_selection import KFold
from sklearn.metrics import root_mean_squared_error
import numpy as np

kf = KFold(n_splits=5, shuffle=True, random_state=42)
oof_predictions = np.zeros(len(train))

for train_idx, val_idx in kf.split(train):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model = xg.XGBRegressor()
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_val)
    oof_predictions[val_idx] = y_pred
    
    print(f"Fold RMSE: {root_mean_squared_error(y_val, y_pred)}")

final_rmse = root_mean_squared_error(y, oof_predictions)
print(f"Final Cross-Validation RMSE: {final_rmse}")
Fold RMSE: 0.3665955066680908
Fold RMSE: 0.34317660331726074
Fold RMSE: 0.3477436602115631
Fold RMSE: 0.3161795735359192
Fold RMSE: 0.34899771213531494
Final Cross-Validation RMSE: 0.34492231409811286

šŸ”¹ 6. Model Selection¶

InĀ [21]:
X = train.drop(columns=['rainfall'], errors='ignore')
X_test = test

y = train['rainfall']
InĀ [22]:
model = Ridge()
model.fit(X, y)
predictions = model.predict(X_test)

output = pd.DataFrame({'id': test.id, 'rainfall': predictions})
output.to_csv('submission_ridge.csv', index=False)
print("Your submission was successfully saved!")
Your submission was successfully saved!

šŸ”¹ 7. Keras!¶

InĀ [23]:
X_train = train.drop(columns=['day','rainfall'], errors='ignore')
X_test = test.drop(columns=['day'])

y_train = train['rainfall']
InĀ [24]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
InĀ [25]:
from tensorflow.keras.callbacks import EarlyStopping
early_stopping = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)
InĀ [26]:
model = Sequential([
    Dense(128, activation='relu', kernel_initializer='he_normal', input_shape=(X_train_scaled.shape[1],)),
    Dropout(0.3),
    Dense(64, activation='relu', kernel_initializer='he_normal', input_shape=(X_train_scaled.shape[1],)),
    Dropout(0.3),
    Dense(32, activation='relu', kernel_initializer='he_normal'),
    Dropout(0.2),
    Dense(16, activation='relu', kernel_initializer='he_normal'),
    Dense(1, activation='sigmoid') 
])
InĀ [27]:
optimizer = Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
InĀ [28]:
history = model.fit(X_train_scaled, y_train, epochs=200, batch_size=32, validation_split=0.2, 
                    callbacks=[early_stopping], verbose=1)
Epoch 1/200
55/55 [==============================] - 3s 8ms/step - loss: 0.5329 - accuracy: 0.7409 - val_loss: 0.3354 - val_accuracy: 0.8539
Epoch 2/200
55/55 [==============================] - 0s 6ms/step - loss: 0.4070 - accuracy: 0.8288 - val_loss: 0.3486 - val_accuracy: 0.8653
Epoch 3/200
55/55 [==============================] - 0s 5ms/step - loss: 0.3865 - accuracy: 0.8447 - val_loss: 0.3285 - val_accuracy: 0.8744
Epoch 4/200
55/55 [==============================] - 0s 5ms/step - loss: 0.3825 - accuracy: 0.8436 - val_loss: 0.3326 - val_accuracy: 0.8767
Epoch 5/200
55/55 [==============================] - 0s 5ms/step - loss: 0.3689 - accuracy: 0.8499 - val_loss: 0.3258 - val_accuracy: 0.8767
Epoch 6/200
55/55 [==============================] - 0s 5ms/step - loss: 0.3654 - accuracy: 0.8556 - val_loss: 0.3278 - val_accuracy: 0.8721
Epoch 7/200
55/55 [==============================] - 0s 4ms/step - loss: 0.3769 - accuracy: 0.8447 - val_loss: 0.3281 - val_accuracy: 0.8767
Epoch 8/200
55/55 [==============================] - 0s 4ms/step - loss: 0.3666 - accuracy: 0.8619 - val_loss: 0.3229 - val_accuracy: 0.8790
Epoch 9/200
55/55 [==============================] - 0s 4ms/step - loss: 0.3565 - accuracy: 0.8573 - val_loss: 0.3341 - val_accuracy: 0.8699
Epoch 10/200
55/55 [==============================] - 0s 4ms/step - loss: 0.3565 - accuracy: 0.8590 - val_loss: 0.3299 - val_accuracy: 0.8744
Epoch 11/200
55/55 [==============================] - 0s 4ms/step - loss: 0.3573 - accuracy: 0.8596 - val_loss: 0.3282 - val_accuracy: 0.8767
Epoch 12/200
55/55 [==============================] - 0s 4ms/step - loss: 0.3468 - accuracy: 0.8607 - val_loss: 0.3272 - val_accuracy: 0.8767
Epoch 13/200
55/55 [==============================] - 0s 4ms/step - loss: 0.3529 - accuracy: 0.8539 - val_loss: 0.3270 - val_accuracy: 0.8790
Epoch 14/200
55/55 [==============================] - 0s 5ms/step - loss: 0.3546 - accuracy: 0.8624 - val_loss: 0.3277 - val_accuracy: 0.8813
Epoch 15/200
55/55 [==============================] - 0s 4ms/step - loss: 0.3506 - accuracy: 0.8670 - val_loss: 0.3251 - val_accuracy: 0.8744
Epoch 16/200
55/55 [==============================] - 0s 4ms/step - loss: 0.3427 - accuracy: 0.8624 - val_loss: 0.3241 - val_accuracy: 0.8744
Epoch 17/200
55/55 [==============================] - 0s 4ms/step - loss: 0.3464 - accuracy: 0.8647 - val_loss: 0.3263 - val_accuracy: 0.8813
Epoch 18/200
55/55 [==============================] - 0s 4ms/step - loss: 0.3415 - accuracy: 0.8624 - val_loss: 0.3237 - val_accuracy: 0.8813
Epoch 19/200
55/55 [==============================] - 0s 4ms/step - loss: 0.3437 - accuracy: 0.8624 - val_loss: 0.3219 - val_accuracy: 0.8858
Epoch 20/200
55/55 [==============================] - 0s 4ms/step - loss: 0.3427 - accuracy: 0.8659 - val_loss: 0.3206 - val_accuracy: 0.8813
Epoch 21/200
55/55 [==============================] - 0s 4ms/step - loss: 0.3387 - accuracy: 0.8687 - val_loss: 0.3250 - val_accuracy: 0.8744
Epoch 22/200
55/55 [==============================] - 0s 4ms/step - loss: 0.3349 - accuracy: 0.8676 - val_loss: 0.3224 - val_accuracy: 0.8744
Epoch 23/200
55/55 [==============================] - 0s 5ms/step - loss: 0.3420 - accuracy: 0.8619 - val_loss: 0.3239 - val_accuracy: 0.8813
Epoch 24/200
55/55 [==============================] - 0s 5ms/step - loss: 0.3364 - accuracy: 0.8647 - val_loss: 0.3181 - val_accuracy: 0.8767
Epoch 25/200
55/55 [==============================] - 0s 5ms/step - loss: 0.3320 - accuracy: 0.8647 - val_loss: 0.3197 - val_accuracy: 0.8767
Epoch 26/200
55/55 [==============================] - 0s 4ms/step - loss: 0.3370 - accuracy: 0.8676 - val_loss: 0.3214 - val_accuracy: 0.8767
Epoch 27/200
55/55 [==============================] - 0s 5ms/step - loss: 0.3387 - accuracy: 0.8756 - val_loss: 0.3247 - val_accuracy: 0.8767
Epoch 28/200
55/55 [==============================] - 0s 5ms/step - loss: 0.3358 - accuracy: 0.8647 - val_loss: 0.3247 - val_accuracy: 0.8767
Epoch 29/200
55/55 [==============================] - 0s 5ms/step - loss: 0.3291 - accuracy: 0.8716 - val_loss: 0.3254 - val_accuracy: 0.8721
Epoch 30/200
55/55 [==============================] - 0s 5ms/step - loss: 0.3285 - accuracy: 0.8716 - val_loss: 0.3288 - val_accuracy: 0.8744
Epoch 31/200
55/55 [==============================] - 0s 5ms/step - loss: 0.3340 - accuracy: 0.8699 - val_loss: 0.3239 - val_accuracy: 0.8721
Epoch 32/200
55/55 [==============================] - 0s 5ms/step - loss: 0.3225 - accuracy: 0.8727 - val_loss: 0.3324 - val_accuracy: 0.8721
Epoch 33/200
55/55 [==============================] - 0s 4ms/step - loss: 0.3272 - accuracy: 0.8779 - val_loss: 0.3338 - val_accuracy: 0.8699
Epoch 34/200
55/55 [==============================] - 0s 5ms/step - loss: 0.3321 - accuracy: 0.8704 - val_loss: 0.3364 - val_accuracy: 0.8653
Epoch 35/200
55/55 [==============================] - 0s 4ms/step - loss: 0.3259 - accuracy: 0.8624 - val_loss: 0.3298 - val_accuracy: 0.8699
Epoch 36/200
55/55 [==============================] - 0s 5ms/step - loss: 0.3283 - accuracy: 0.8704 - val_loss: 0.3297 - val_accuracy: 0.8699
Epoch 37/200
55/55 [==============================] - 0s 4ms/step - loss: 0.3362 - accuracy: 0.8670 - val_loss: 0.3296 - val_accuracy: 0.8790
Epoch 38/200
55/55 [==============================] - 0s 5ms/step - loss: 0.3264 - accuracy: 0.8716 - val_loss: 0.3341 - val_accuracy: 0.8699
Epoch 39/200
55/55 [==============================] - 0s 5ms/step - loss: 0.3332 - accuracy: 0.8710 - val_loss: 0.3249 - val_accuracy: 0.8699
Epoch 40/200
55/55 [==============================] - 0s 4ms/step - loss: 0.3229 - accuracy: 0.8744 - val_loss: 0.3262 - val_accuracy: 0.8699
Epoch 41/200
55/55 [==============================] - 0s 5ms/step - loss: 0.3196 - accuracy: 0.8761 - val_loss: 0.3260 - val_accuracy: 0.8721
Epoch 42/200
55/55 [==============================] - 0s 5ms/step - loss: 0.3247 - accuracy: 0.8767 - val_loss: 0.3372 - val_accuracy: 0.8653
Epoch 43/200
55/55 [==============================] - 0s 5ms/step - loss: 0.3176 - accuracy: 0.8733 - val_loss: 0.3229 - val_accuracy: 0.8721
Epoch 44/200
55/55 [==============================] - 0s 5ms/step - loss: 0.3180 - accuracy: 0.8733 - val_loss: 0.3247 - val_accuracy: 0.8699
InĀ [29]:
predictions_keras = model.predict(X_test_scaled).flatten()

output = pd.DataFrame({'id': test.id, 'rainfall': predictions_keras})
output.to_csv('submission_keras.csv', index=False)
print("Your submission was successfully saved!")
23/23 [==============================] - 0s 1ms/step
Your submission was successfully saved!
InĀ [30]:
output.head()
Out[30]:
id rainfall
0 2190 0.993476
1 2191 0.996944
2 2192 0.970677
3 2193 0.228944
4 2194 0.074985

šŸ”¹ 8. kNN KFolds¶

InĀ [31]:
RMV = ['rainfall','id']
FEATURES = [c for c in train.columns if not c in RMV]
InĀ [32]:
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBRegressor, XGBClassifier
import xgboost
print("Using XGBoost version",xgboost.__version__)
Using XGBoost version 2.1.4
InĀ [33]:
%%time
FOLDS = 5
kf = KFold(n_splits=FOLDS, shuffle=True, random_state=777)
    
oof_knn = np.zeros(len(train))
pred_knn = np.zeros(len(test))

for i, (train_index, test_index) in enumerate(kf.split(train)):

    print("#"*25)
    print(f"### Fold {i+1}")
    print("#"*25)
    
    x_train = train.loc[train_index,FEATURES].copy()
    y_train = train.loc[train_index,"rainfall"]    
    x_valid = train.loc[test_index,FEATURES].copy()
    y_valid = train.loc[test_index,"rainfall"]
    x_test = test[FEATURES].copy()

    for c in FEATURES:
        m = x_train[c].mean()
        s = x_train[c].std()
        x_train[c] = (x_train[c]-m)/s
        x_valid[c] = (x_valid[c]-m)/s
        x_test[c] = (x_test[c]-m)/s
        x_test[c] = x_test[c].fillna(0)
        x_train[c] = x_train[c].fillna(0)

    model = KNeighborsClassifier(n_neighbors=101, p=1)
    model.fit(x_train.values, y_train.values)

    # INFER OOF
    oof_knn[test_index] = model.predict_proba(x_valid.values)[:,1]
    # INFER TEST
    pred_knn += model.predict_proba(x_test.values)[:,1]

# COMPUTE AVERAGE TEST PREDS
pred_knn /= FOLDS
#########################
### Fold 1
#########################
#########################
### Fold 2
#########################
#########################
### Fold 3
#########################
#########################
### Fold 4
#########################
#########################
### Fold 5
#########################
CPU times: total: 2.03 s
Wall time: 572 ms
InĀ [34]:
best_public = pd.read_csv("best_public.csv")
display(best_public.head())
best_public = best_public.rainfall.values
id rainfall
0 2190 0.960959
1 2191 0.946575
2 2192 0.994521
3 2193 0.089041
4 2194 0.020548
InĀ [35]:
from scipy.stats import rankdata

sub = pd.read_csv("sample_submission.csv")
sub.rainfall = -0.067 * rankdata(pred_knn) + 1.067 * rankdata(best_public)
sub.rainfall = rankdata( sub.rainfall ) / len(sub)
print(sub.shape)
sub.to_csv(f"submission_knn.csv",index=False)
sub.head()
(730, 2)
Out[35]:
id rainfall
0 2190 0.962329
1 2191 0.943151
2 2192 0.993151
3 2193 0.097260
4 2194 0.020548