#4¶

Kaggle competition: https://www.kaggle.com/competitions/playground-series-s5e3

Entry by Robin P.M. Kras

In [1]:
import pandas as pd
In [2]:
train = pd.read_csv(r'train.csv')
test = pd.read_csv(r'test.csv')
In [3]:
print(train.shape, test.shape)
(2190, 13) (730, 12)

Preprocessing¶

In [4]:
# removing potential whitespace
train.columns = train.columns.str.strip()
test.columns = test.columns.str.strip()
In [5]:
print(f"Train set, null count: \n{train.isnull().sum()}")
print("\n")
print(f"Test set, null count: \n{test.isnull().sum()}")
Train set, null count: 
id               0
day              0
pressure         0
maxtemp          0
temparature      0
mintemp          0
dewpoint         0
humidity         0
cloud            0
sunshine         0
winddirection    0
windspeed        0
rainfall         0
dtype: int64


Test set, null count: 
id               0
day              0
pressure         0
maxtemp          0
temparature      0
mintemp          0
dewpoint         0
humidity         0
cloud            0
sunshine         0
winddirection    1
windspeed        0
dtype: int64
In [6]:
# Test set contains an instance of null
test = test.fillna(0)
# If wind direction is equal to null, it must mean that there was no wind. Since there is a singular instance of null, this simple approach will be suitable

Some feature engineering on variables that likely are related or have an effect on each other. Found in someone else's submission ;)

In [7]:
train['humidity_cloud_interaction'] = train['humidity'] * train['cloud']
train['humidity_sunshine_interaction'] = train['humidity'] * train['sunshine']
train['cloud_sunshine_ratio'] = train['cloud'] / (train['sunshine'] + 1e-5)
train['relative_dryness'] = 100 - train['humidity']
train['sunshine_percentage'] = train['sunshine'] / (train['sunshine'] + train['cloud'] + 1e-5)
train['weather_index'] = (0.4 * train['humidity']) + (0.3 * train['cloud']) - (0.3 * train['sunshine'])

test['humidity_cloud_interaction'] = test['humidity'] * test['cloud']
test['humidity_sunshine_interaction'] = test['humidity'] * test['sunshine']
test['cloud_sunshine_ratio'] = test['cloud'] / (test['sunshine'] + 1e-5)
test['relative_dryness'] = 100 - test['humidity']
test['sunshine_percentage'] = test['sunshine'] / (test['sunshine'] + test['cloud'] + 1e-5)
test['weather_index'] = (0.4 * test['humidity']) + (0.3 * test['cloud']) - (0.3 * test['sunshine'])

Model Selection¶

In [8]:
from sklearn.linear_model import Ridge

X = train.drop(columns=['rainfall'], errors='ignore')
y = train['rainfall']

X_test = pd.get_dummies(test)

model = Ridge()
model.fit(X, y)

predictions = model.predict(X_test)

output = pd.DataFrame({'id': test.id, 'rainfall': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")
Your submission was successfully saved!
c:\Users\robkr\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\linear_model\_ridge.py:216: LinAlgWarning: Ill-conditioned matrix (rcond=2.6643e-17): result may not be accurate.
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
In [9]:
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt

fpr, tpr, _ = roc_curve(y, model.predict(X))  # Use training data predictions
auc_score = roc_auc_score(y, model.predict(X))

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='blue', lw=2, label=f'ROC curve (AUC = {auc_score:.2f})')
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc='lower right')
plt.show()

print(f"AUC Score: {auc_score:.4f}")
No description has been provided for this image
AUC Score: 0.8951

Results are not bad.