ā 1. Introduction & OverviewĀ¶
Your Goal: Your goal is to predict rainfall for each day of the year.
š¹ 2. Import Libraries & Set UpĀ¶
InĀ [3]:
# General
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Machine Learning
import xgboost as xg
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error, r2_score, root_mean_squared_error, roc_auc_score
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from imblearn.over_sampling import SMOTE
# Feature Importance & Explainability
import shap
# Settings
import warnings
warnings.filterwarnings("ignore")
# Set random seed for reproducibility
SEED = 42
np.random.seed(SEED)
print("Libraries loaded. Ready to go!")
Libraries loaded. Ready to go!
c:\Users\robkr\AppData\Local\Programs\Python\Python39\lib\site-packages\tqdm\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html from .autonotebook import tqdm as notebook_tqdm
š¹ 3. Load & Explore DataĀ¶
InĀ [4]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
InĀ [5]:
train.head()
Out[5]:
id | day | pressure | maxtemp | temparature | mintemp | dewpoint | humidity | cloud | sunshine | winddirection | windspeed | rainfall | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 1 | 1017.4 | 21.2 | 20.6 | 19.9 | 19.4 | 87.0 | 88.0 | 1.1 | 60.0 | 17.2 | 1 |
1 | 1 | 2 | 1019.5 | 16.2 | 16.9 | 15.8 | 15.4 | 95.0 | 91.0 | 0.0 | 50.0 | 21.9 | 1 |
2 | 2 | 3 | 1024.1 | 19.4 | 16.1 | 14.6 | 9.3 | 75.0 | 47.0 | 8.3 | 70.0 | 18.1 | 1 |
3 | 3 | 4 | 1013.4 | 18.1 | 17.8 | 16.9 | 16.8 | 95.0 | 95.0 | 0.0 | 60.0 | 35.6 | 1 |
4 | 4 | 5 | 1021.8 | 21.3 | 18.4 | 15.2 | 9.6 | 52.0 | 45.0 | 3.6 | 40.0 | 24.8 | 0 |
InĀ [6]:
train.shape
Out[6]:
(2190, 13)
InĀ [7]:
train.isnull().sum()
Out[7]:
id 0 day 0 pressure 0 maxtemp 0 temparature 0 mintemp 0 dewpoint 0 humidity 0 cloud 0 sunshine 0 winddirection 0 windspeed 0 rainfall 0 dtype: int64
InĀ [8]:
# Quick summary of dataset
train.describe()
train.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2190 entries, 0 to 2189 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 2190 non-null int64 1 day 2190 non-null int64 2 pressure 2190 non-null float64 3 maxtemp 2190 non-null float64 4 temparature 2190 non-null float64 5 mintemp 2190 non-null float64 6 dewpoint 2190 non-null float64 7 humidity 2190 non-null float64 8 cloud 2190 non-null float64 9 sunshine 2190 non-null float64 10 winddirection 2190 non-null float64 11 windspeed 2190 non-null float64 12 rainfall 2190 non-null int64 dtypes: float64(10), int64(3) memory usage: 222.5 KB
š¹ 4. Data Visualization & EDAĀ¶
InĀ [9]:
float_cols = [col for col in train.columns if train[col].dtype == "float64"]
cols_per_row = 3
num_plots = len(float_cols)
rows = (num_plots // cols_per_row) + (num_plots % cols_per_row > 0)
fig, axes = plt.subplots(rows, cols_per_row, figsize=(15, 5 * rows))
axes = axes.flatten()
for idx, col in enumerate(float_cols):
sns.histplot(train[col], bins=50, kde=True, ax=axes[idx])
axes[idx].set_title(f"Distribution of {col}")
for i in range(idx + 1, len(axes)):
fig.delaxes(axes[i])
plt.tight_layout()
plt.show()
InĀ [10]:
heatmap_train = train.select_dtypes(include=["float64", "int64"])
corr_matrix = heatmap_train.corr()
threshold = 0.8
high_corr_pairs = (
corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
.stack()
.reset_index()
)
high_corr_pairs.columns = ["Feature 1", "Feature 2", "Correlation"]
high_corr_pairs = high_corr_pairs[high_corr_pairs["Correlation"].abs() > threshold]
plt.figure(figsize=(30, 12))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm")
plt.title("Feature Correlation Matrix")
plt.show()
print("Highly correlated feature pairs (above threshold):")
print(high_corr_pairs)
Highly correlated feature pairs (above threshold): Feature 1 Feature 2 Correlation 23 pressure maxtemp -0.800499 24 pressure temparature -0.816531 25 pressure mintemp -0.814453 26 pressure dewpoint -0.817008 33 maxtemp temparature 0.982932 34 maxtemp mintemp 0.965529 35 maxtemp dewpoint 0.906703 42 temparature mintemp 0.987150 43 temparature dewpoint 0.933617 50 mintemp dewpoint 0.941342 68 cloud sunshine -0.805128
InĀ [11]:
l1 = high_corr_pairs['Feature 1'].tolist()
l2 = high_corr_pairs['Feature 2'].tolist()
interesting_features = list(set(l1+l2))
print(interesting_features)
['sunshine', 'pressure', 'cloud', 'mintemp', 'dewpoint', 'temparature', 'maxtemp']
š¹ 5. Feature EngineeringĀ¶
InĀ [12]:
train['humidity_cloud_interaction'] = train['humidity'] * train['cloud']
train['humidity_sunshine_interaction'] = train['humidity'] * train['sunshine']
train['cloud_sunshine_ratio'] = train['cloud'] / (train['sunshine'] + 1e-5)
train['relative_dryness'] = 100 - train['humidity']
train['sunshine_percentage'] = train['sunshine'] / (train['sunshine'] + train['cloud'] + 1e-5)
train['weather_index'] = (0.4 * train['humidity']) + (0.3 * train['cloud']) - (0.3 * train['sunshine'])
test['humidity_cloud_interaction'] = test['humidity'] * test['cloud']
test['humidity_sunshine_interaction'] = test['humidity'] * test['sunshine']
test['cloud_sunshine_ratio'] = test['cloud'] / (test['sunshine'] + 1e-5)
test['relative_dryness'] = 100 - test['humidity']
test['sunshine_percentage'] = test['sunshine'] / (test['sunshine'] + test['cloud'] + 1e-5)
test['weather_index'] = (0.4 * test['humidity']) + (0.3 * test['cloud']) - (0.3 * test['sunshine'])
InĀ [13]:
# Test set contains an instance of null
test['winddirection'].fillna(test['winddirection'].median(), inplace=True)
InĀ [14]:
test.head()
Out[14]:
id | day | pressure | maxtemp | temparature | mintemp | dewpoint | humidity | cloud | sunshine | winddirection | windspeed | humidity_cloud_interaction | humidity_sunshine_interaction | cloud_sunshine_ratio | relative_dryness | sunshine_percentage | weather_index | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2190 | 1 | 1019.5 | 17.5 | 15.8 | 12.7 | 14.9 | 96.0 | 99.0 | 0.0 | 50.0 | 24.3 | 9504.0 | 0.0 | 9.900000e+06 | 4.0 | 0.000000 | 68.10 |
1 | 2191 | 2 | 1016.5 | 17.5 | 16.5 | 15.8 | 15.1 | 97.0 | 99.0 | 0.0 | 50.0 | 35.3 | 9603.0 | 0.0 | 9.900000e+06 | 3.0 | 0.000000 | 68.50 |
2 | 2192 | 3 | 1023.9 | 11.2 | 10.4 | 9.4 | 8.9 | 86.0 | 96.0 | 0.0 | 40.0 | 16.9 | 8256.0 | 0.0 | 9.600000e+06 | 14.0 | 0.000000 | 63.20 |
3 | 2193 | 4 | 1022.9 | 20.6 | 17.3 | 15.2 | 9.5 | 75.0 | 45.0 | 7.1 | 20.0 | 50.6 | 3375.0 | 532.5 | 6.338019e+00 | 25.0 | 0.136276 | 41.37 |
4 | 2194 | 5 | 1022.2 | 16.1 | 13.8 | 6.4 | 4.3 | 68.0 | 49.0 | 9.2 | 20.0 | 19.4 | 3332.0 | 625.6 | 5.326081e+00 | 32.0 | 0.158076 | 39.14 |
ExperimentĀ¶
InĀ [16]:
X = train.drop(columns=['rainfall'], errors='ignore')
y = train['rainfall']
InĀ [26]:
from sklearn.model_selection import KFold
from sklearn.metrics import root_mean_squared_error
import numpy as np
kf = KFold(n_splits=5, shuffle=True, random_state=42)
oof_predictions = np.zeros(len(train))
for train_idx, val_idx in kf.split(train):
X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
model = xg.XGBRegressor()
model.fit(X_train, y_train)
y_pred = model.predict(X_val)
oof_predictions[val_idx] = y_pred
print(f"Fold RMSE: {root_mean_squared_error(y_val, y_pred)}")
final_rmse = root_mean_squared_error(y, oof_predictions)
print(f"Final Cross-Validation RMSE: {final_rmse}")
Fold RMSE: 0.3665955066680908 Fold RMSE: 0.34317660331726074 Fold RMSE: 0.3477436602115631 Fold RMSE: 0.3161795735359192 Fold RMSE: 0.34899771213531494 Final Cross-Validation RMSE: 0.34492231409811286
š¹ 6. Model SelectionĀ¶
InĀ [21]:
X = train.drop(columns=['rainfall'], errors='ignore')
X_test = test
y = train['rainfall']
InĀ [22]:
model = Ridge()
model.fit(X, y)
predictions = model.predict(X_test)
output = pd.DataFrame({'id': test.id, 'rainfall': predictions})
output.to_csv('submission_ridge.csv', index=False)
print("Your submission was successfully saved!")
Your submission was successfully saved!
š¹ 7. Keras!Ā¶
InĀ [23]:
X_train = train.drop(columns=['day','rainfall'], errors='ignore')
X_test = test.drop(columns=['day'])
y_train = train['rainfall']
InĀ [24]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
InĀ [25]:
from tensorflow.keras.callbacks import EarlyStopping
early_stopping = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)
InĀ [26]:
model = Sequential([
Dense(128, activation='relu', kernel_initializer='he_normal', input_shape=(X_train_scaled.shape[1],)),
Dropout(0.3),
Dense(64, activation='relu', kernel_initializer='he_normal', input_shape=(X_train_scaled.shape[1],)),
Dropout(0.3),
Dense(32, activation='relu', kernel_initializer='he_normal'),
Dropout(0.2),
Dense(16, activation='relu', kernel_initializer='he_normal'),
Dense(1, activation='sigmoid')
])
InĀ [27]:
optimizer = Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
InĀ [28]:
history = model.fit(X_train_scaled, y_train, epochs=200, batch_size=32, validation_split=0.2,
callbacks=[early_stopping], verbose=1)
Epoch 1/200 55/55 [==============================] - 3s 8ms/step - loss: 0.5329 - accuracy: 0.7409 - val_loss: 0.3354 - val_accuracy: 0.8539 Epoch 2/200 55/55 [==============================] - 0s 6ms/step - loss: 0.4070 - accuracy: 0.8288 - val_loss: 0.3486 - val_accuracy: 0.8653 Epoch 3/200 55/55 [==============================] - 0s 5ms/step - loss: 0.3865 - accuracy: 0.8447 - val_loss: 0.3285 - val_accuracy: 0.8744 Epoch 4/200 55/55 [==============================] - 0s 5ms/step - loss: 0.3825 - accuracy: 0.8436 - val_loss: 0.3326 - val_accuracy: 0.8767 Epoch 5/200 55/55 [==============================] - 0s 5ms/step - loss: 0.3689 - accuracy: 0.8499 - val_loss: 0.3258 - val_accuracy: 0.8767 Epoch 6/200 55/55 [==============================] - 0s 5ms/step - loss: 0.3654 - accuracy: 0.8556 - val_loss: 0.3278 - val_accuracy: 0.8721 Epoch 7/200 55/55 [==============================] - 0s 4ms/step - loss: 0.3769 - accuracy: 0.8447 - val_loss: 0.3281 - val_accuracy: 0.8767 Epoch 8/200 55/55 [==============================] - 0s 4ms/step - loss: 0.3666 - accuracy: 0.8619 - val_loss: 0.3229 - val_accuracy: 0.8790 Epoch 9/200 55/55 [==============================] - 0s 4ms/step - loss: 0.3565 - accuracy: 0.8573 - val_loss: 0.3341 - val_accuracy: 0.8699 Epoch 10/200 55/55 [==============================] - 0s 4ms/step - loss: 0.3565 - accuracy: 0.8590 - val_loss: 0.3299 - val_accuracy: 0.8744 Epoch 11/200 55/55 [==============================] - 0s 4ms/step - loss: 0.3573 - accuracy: 0.8596 - val_loss: 0.3282 - val_accuracy: 0.8767 Epoch 12/200 55/55 [==============================] - 0s 4ms/step - loss: 0.3468 - accuracy: 0.8607 - val_loss: 0.3272 - val_accuracy: 0.8767 Epoch 13/200 55/55 [==============================] - 0s 4ms/step - loss: 0.3529 - accuracy: 0.8539 - val_loss: 0.3270 - val_accuracy: 0.8790 Epoch 14/200 55/55 [==============================] - 0s 5ms/step - loss: 0.3546 - accuracy: 0.8624 - val_loss: 0.3277 - val_accuracy: 0.8813 Epoch 15/200 55/55 [==============================] - 0s 4ms/step - loss: 0.3506 - accuracy: 0.8670 - val_loss: 0.3251 - val_accuracy: 0.8744 Epoch 16/200 55/55 [==============================] - 0s 4ms/step - loss: 0.3427 - accuracy: 0.8624 - val_loss: 0.3241 - val_accuracy: 0.8744 Epoch 17/200 55/55 [==============================] - 0s 4ms/step - loss: 0.3464 - accuracy: 0.8647 - val_loss: 0.3263 - val_accuracy: 0.8813 Epoch 18/200 55/55 [==============================] - 0s 4ms/step - loss: 0.3415 - accuracy: 0.8624 - val_loss: 0.3237 - val_accuracy: 0.8813 Epoch 19/200 55/55 [==============================] - 0s 4ms/step - loss: 0.3437 - accuracy: 0.8624 - val_loss: 0.3219 - val_accuracy: 0.8858 Epoch 20/200 55/55 [==============================] - 0s 4ms/step - loss: 0.3427 - accuracy: 0.8659 - val_loss: 0.3206 - val_accuracy: 0.8813 Epoch 21/200 55/55 [==============================] - 0s 4ms/step - loss: 0.3387 - accuracy: 0.8687 - val_loss: 0.3250 - val_accuracy: 0.8744 Epoch 22/200 55/55 [==============================] - 0s 4ms/step - loss: 0.3349 - accuracy: 0.8676 - val_loss: 0.3224 - val_accuracy: 0.8744 Epoch 23/200 55/55 [==============================] - 0s 5ms/step - loss: 0.3420 - accuracy: 0.8619 - val_loss: 0.3239 - val_accuracy: 0.8813 Epoch 24/200 55/55 [==============================] - 0s 5ms/step - loss: 0.3364 - accuracy: 0.8647 - val_loss: 0.3181 - val_accuracy: 0.8767 Epoch 25/200 55/55 [==============================] - 0s 5ms/step - loss: 0.3320 - accuracy: 0.8647 - val_loss: 0.3197 - val_accuracy: 0.8767 Epoch 26/200 55/55 [==============================] - 0s 4ms/step - loss: 0.3370 - accuracy: 0.8676 - val_loss: 0.3214 - val_accuracy: 0.8767 Epoch 27/200 55/55 [==============================] - 0s 5ms/step - loss: 0.3387 - accuracy: 0.8756 - val_loss: 0.3247 - val_accuracy: 0.8767 Epoch 28/200 55/55 [==============================] - 0s 5ms/step - loss: 0.3358 - accuracy: 0.8647 - val_loss: 0.3247 - val_accuracy: 0.8767 Epoch 29/200 55/55 [==============================] - 0s 5ms/step - loss: 0.3291 - accuracy: 0.8716 - val_loss: 0.3254 - val_accuracy: 0.8721 Epoch 30/200 55/55 [==============================] - 0s 5ms/step - loss: 0.3285 - accuracy: 0.8716 - val_loss: 0.3288 - val_accuracy: 0.8744 Epoch 31/200 55/55 [==============================] - 0s 5ms/step - loss: 0.3340 - accuracy: 0.8699 - val_loss: 0.3239 - val_accuracy: 0.8721 Epoch 32/200 55/55 [==============================] - 0s 5ms/step - loss: 0.3225 - accuracy: 0.8727 - val_loss: 0.3324 - val_accuracy: 0.8721 Epoch 33/200 55/55 [==============================] - 0s 4ms/step - loss: 0.3272 - accuracy: 0.8779 - val_loss: 0.3338 - val_accuracy: 0.8699 Epoch 34/200 55/55 [==============================] - 0s 5ms/step - loss: 0.3321 - accuracy: 0.8704 - val_loss: 0.3364 - val_accuracy: 0.8653 Epoch 35/200 55/55 [==============================] - 0s 4ms/step - loss: 0.3259 - accuracy: 0.8624 - val_loss: 0.3298 - val_accuracy: 0.8699 Epoch 36/200 55/55 [==============================] - 0s 5ms/step - loss: 0.3283 - accuracy: 0.8704 - val_loss: 0.3297 - val_accuracy: 0.8699 Epoch 37/200 55/55 [==============================] - 0s 4ms/step - loss: 0.3362 - accuracy: 0.8670 - val_loss: 0.3296 - val_accuracy: 0.8790 Epoch 38/200 55/55 [==============================] - 0s 5ms/step - loss: 0.3264 - accuracy: 0.8716 - val_loss: 0.3341 - val_accuracy: 0.8699 Epoch 39/200 55/55 [==============================] - 0s 5ms/step - loss: 0.3332 - accuracy: 0.8710 - val_loss: 0.3249 - val_accuracy: 0.8699 Epoch 40/200 55/55 [==============================] - 0s 4ms/step - loss: 0.3229 - accuracy: 0.8744 - val_loss: 0.3262 - val_accuracy: 0.8699 Epoch 41/200 55/55 [==============================] - 0s 5ms/step - loss: 0.3196 - accuracy: 0.8761 - val_loss: 0.3260 - val_accuracy: 0.8721 Epoch 42/200 55/55 [==============================] - 0s 5ms/step - loss: 0.3247 - accuracy: 0.8767 - val_loss: 0.3372 - val_accuracy: 0.8653 Epoch 43/200 55/55 [==============================] - 0s 5ms/step - loss: 0.3176 - accuracy: 0.8733 - val_loss: 0.3229 - val_accuracy: 0.8721 Epoch 44/200 55/55 [==============================] - 0s 5ms/step - loss: 0.3180 - accuracy: 0.8733 - val_loss: 0.3247 - val_accuracy: 0.8699
InĀ [29]:
predictions_keras = model.predict(X_test_scaled).flatten()
output = pd.DataFrame({'id': test.id, 'rainfall': predictions_keras})
output.to_csv('submission_keras.csv', index=False)
print("Your submission was successfully saved!")
23/23 [==============================] - 0s 1ms/step Your submission was successfully saved!
InĀ [30]:
output.head()
Out[30]:
id | rainfall | |
---|---|---|
0 | 2190 | 0.993476 |
1 | 2191 | 0.996944 |
2 | 2192 | 0.970677 |
3 | 2193 | 0.228944 |
4 | 2194 | 0.074985 |
š¹ 8. kNN KFoldsĀ¶
InĀ [31]:
RMV = ['rainfall','id']
FEATURES = [c for c in train.columns if not c in RMV]
InĀ [32]:
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBRegressor, XGBClassifier
import xgboost
print("Using XGBoost version",xgboost.__version__)
Using XGBoost version 2.1.4
InĀ [33]:
%%time
FOLDS = 5
kf = KFold(n_splits=FOLDS, shuffle=True, random_state=777)
oof_knn = np.zeros(len(train))
pred_knn = np.zeros(len(test))
for i, (train_index, test_index) in enumerate(kf.split(train)):
print("#"*25)
print(f"### Fold {i+1}")
print("#"*25)
x_train = train.loc[train_index,FEATURES].copy()
y_train = train.loc[train_index,"rainfall"]
x_valid = train.loc[test_index,FEATURES].copy()
y_valid = train.loc[test_index,"rainfall"]
x_test = test[FEATURES].copy()
for c in FEATURES:
m = x_train[c].mean()
s = x_train[c].std()
x_train[c] = (x_train[c]-m)/s
x_valid[c] = (x_valid[c]-m)/s
x_test[c] = (x_test[c]-m)/s
x_test[c] = x_test[c].fillna(0)
x_train[c] = x_train[c].fillna(0)
model = KNeighborsClassifier(n_neighbors=101, p=1)
model.fit(x_train.values, y_train.values)
# INFER OOF
oof_knn[test_index] = model.predict_proba(x_valid.values)[:,1]
# INFER TEST
pred_knn += model.predict_proba(x_test.values)[:,1]
# COMPUTE AVERAGE TEST PREDS
pred_knn /= FOLDS
######################### ### Fold 1 ######################### ######################### ### Fold 2 ######################### ######################### ### Fold 3 ######################### ######################### ### Fold 4 ######################### ######################### ### Fold 5 ######################### CPU times: total: 2.03 s Wall time: 572 ms
InĀ [34]:
best_public = pd.read_csv("best_public.csv")
display(best_public.head())
best_public = best_public.rainfall.values
id | rainfall | |
---|---|---|
0 | 2190 | 0.960959 |
1 | 2191 | 0.946575 |
2 | 2192 | 0.994521 |
3 | 2193 | 0.089041 |
4 | 2194 | 0.020548 |
InĀ [35]:
from scipy.stats import rankdata
sub = pd.read_csv("sample_submission.csv")
sub.rainfall = -0.067 * rankdata(pred_knn) + 1.067 * rankdata(best_public)
sub.rainfall = rankdata( sub.rainfall ) / len(sub)
print(sub.shape)
sub.to_csv(f"submission_knn.csv",index=False)
sub.head()
(730, 2)
Out[35]:
id | rainfall | |
---|---|---|
0 | 2190 | 0.962329 |
1 | 2191 | 0.943151 |
2 | 2192 | 0.993151 |
3 | 2193 | 0.097260 |
4 | 2194 | 0.020548 |