⭐ 1. Introduction & Overview
Your Goal: Predict the three most likely fertilizers for each test row, given its weather, soil, and crop measurements; predictions are submitted as a ranked top-3 list and scored with MAP@3.
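For reference, every submission built later in this notebook writes one row per test id containing the three most likely fertilizer names, best first, joined by spaces. A minimal illustration of that output format (values copied from the submission preview further down):

# Expected output shape: top-3 fertilizer names per test id,
# ordered from most to least likely and separated by spaces.
import pandas as pd
example_sub = pd.DataFrame({
    'id': [750000, 750001],
    'Fertilizer Name': ['DAP 10-26-26 28-28', '17-17-17 20-20 10-26-26']
})
print(example_sub)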
🔹 2. Import Libraries & Set Up
In [1]:
# General
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
# Machine Learning
import xgboost as xg
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, mean_absolute_error, mean_squared_error, r2_score, root_mean_squared_error, roc_auc_score
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import make_pipeline
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from imblearn.over_sampling import SMOTE
# Feature Importance & Explainability
import shap
# Settings
import warnings
warnings.filterwarnings("ignore")
# Set random seed for reproducibility
SEED = 42
np.random.seed(SEED)
print("Libraries loaded. Ready to go!")
Libraries loaded. Ready to go!
🔹 3. Load & Explore Data
In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
print(f"Train shape: {train.shape}, Test shape: {test.shape}")
Train shape: (750000, 10), Test shape: (250000, 9)
In [3]:
train.head()
Out[3]:
 | id | Temparature | Humidity | Moisture | Soil Type | Crop Type | Nitrogen | Potassium | Phosphorous | Fertilizer Name |
---|---|---|---|---|---|---|---|---|---|---
0 | 0 | 37 | 70 | 36 | Clayey | Sugarcane | 36 | 4 | 5 | 28-28 |
1 | 1 | 27 | 69 | 65 | Sandy | Millets | 30 | 6 | 18 | 28-28 |
2 | 2 | 29 | 63 | 32 | Sandy | Millets | 24 | 12 | 16 | 17-17-17 |
3 | 3 | 35 | 62 | 54 | Sandy | Barley | 39 | 12 | 4 | 10-26-26 |
4 | 4 | 35 | 58 | 43 | Red | Paddy | 37 | 2 | 16 | DAP |
In [4]:
fertilizers = train['Fertilizer Name'].unique()
for i in fertilizers:
    print(i)
28-28 17-17-17 10-26-26 DAP 20-20 14-35-14 Urea
In [5]:
train.isnull().sum()
Out[5]:
id 0 Temparature 0 Humidity 0 Moisture 0 Soil Type 0 Crop Type 0 Nitrogen 0 Potassium 0 Phosphorous 0 Fertilizer Name 0 dtype: int64
In [6]:
test.isnull().sum()
Out[6]:
id 0 Temparature 0 Humidity 0 Moisture 0 Soil Type 0 Crop Type 0 Nitrogen 0 Potassium 0 Phosphorous 0 dtype: int64
In [7]:
train.columns = train.columns.str.replace(' ', '_')
test.columns = test.columns.str.replace(' ', '_')
In [8]:
CATS = []
NUMS = []
In [9]:
FEATURES = []
In [10]:
for col in train.columns:
    FEATURES.append(col)  # note: this includes 'id' and the target column as well
In [11]:
for c in FEATURES:
    if train[c].dtype == "object":
        CATS.append(c)
for c in FEATURES:
    if c not in CATS:
        NUMS.append(c)
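As an aside, the same categorical/numeric split can be written more compactly with pandas' select_dtypes; a small sketch, not used below:

# Equivalent, more compact split of the columns by dtype (sketch only).
cats_alt = train.select_dtypes(include="object").columns.tolist()
nums_alt = train.select_dtypes(exclude="object").columns.tolist()
print(cats_alt)
print(nums_alt)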
🔹 4. Data Visualization & EDA
In [12]:
def plot_nums(dataframe):
    # Histogram (with KDE) for every numeric column, arranged three plots per row.
    float_cols = [col for col in dataframe.columns if dataframe[col].dtype == "float64" or dataframe[col].dtype == "int64"]
    cols_per_row = 3
    num_plots = len(float_cols)
    rows = (num_plots // cols_per_row) + (num_plots % cols_per_row > 0)
    fig, axes = plt.subplots(rows, cols_per_row, figsize=(15, 5 * rows))
    axes = axes.flatten()
    for idx, col in enumerate(float_cols):
        sns.histplot(dataframe[col], bins=50, kde=True, ax=axes[idx])
        axes[idx].set_title(f"Distribution of {col}")
    for i in range(idx + 1, len(axes)):
        fig.delaxes(axes[i])
    plt.tight_layout()
    plt.show()
In [13]:
if False:
    plot_nums(train.drop(columns=['id', 'Fertilizer_Name']))
🔹 5. Feature Engineering
In [14]:
train.dtypes
Out[14]:
id int64 Temparature int64 Humidity int64 Moisture int64 Soil_Type object Crop_Type object Nitrogen int64 Potassium int64 Phosphorous int64 Fertilizer_Name object dtype: object
In [15]:
# train['Climate'] = (train['Humidity'] + train['Temparature'] + train['Moisture']) / 3
# test['Climate'] = (test['Humidity'] + test['Temparature'] + test['Moisture']) / 3
In [16]:
# Label-encode the categorical columns (including the target). Fit each encoder on
# train and reuse it to transform test so the integer codes match across both frames.
if 'Fertilizer_Name' not in CATS:
    CATS.append('Fertilizer_Name')
encoders = {}
for col in CATS:
    le = LabelEncoder()
    train[col] = le.fit_transform(train[col])
    encoders[col] = le
    if col in test.columns:
        test[col] = le.transform(test[col])
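A quick way to confirm the train/test codes stayed aligned, assuming the encoders dict from the cell above:

# The mapping learned on train is the same one applied to test,
# so printing it for one column is enough to sanity-check the encoding.
soil_le = encoders['Soil_Type']
print(dict(zip(soil_le.classes_, soil_le.transform(soil_le.classes_))))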
In [17]:
train.head()
Out[17]:
 | id | Temparature | Humidity | Moisture | Soil_Type | Crop_Type | Nitrogen | Potassium | Phosphorous | Fertilizer_Name |
---|---|---|---|---|---|---|---|---|---|---
0 | 0 | 37 | 70 | 36 | 1 | 8 | 36 | 4 | 5 | 4 |
1 | 1 | 27 | 69 | 65 | 4 | 4 | 30 | 6 | 18 | 4 |
2 | 2 | 29 | 63 | 32 | 4 | 4 | 24 | 12 | 16 | 2 |
3 | 3 | 35 | 62 | 54 | 4 | 0 | 39 | 12 | 4 | 0 |
4 | 4 | 35 | 58 | 43 | 3 | 6 | 37 | 2 | 16 | 5 |
🔹 6. XGBoost, KFold, CV
In [18]:
from lightgbm import LGBMClassifier
In [19]:
X = train.drop(columns=['id', 'Fertilizer_Name'], errors='ignore')
X_test = test.drop(columns=['id'], errors='ignore')
y = train["Fertilizer_Name"]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, shuffle=True, random_state=SEED)
def quick_eval(model, X_train, y_train):
    # Quick baseline comparison: holdout accuracy plus 5-fold CV accuracy.
    model.fit(X_train, y_train)
    predictions_val = model.predict(X_val)
    kf = KFold(n_splits=5, shuffle=True, random_state=SEED)
    cv_scores = cross_val_score(model, X_train, y_train, cv=kf, scoring='accuracy')
    val_acc = accuracy_score(y_val, predictions_val)
    print(f"*** {model.__class__.__name__} ***")
    print(f"Validation accuracy: {val_acc:.4f}")
    print(f"Mean CV accuracy: {np.mean(cv_scores):.4f}")
    print(f"CV Std Dev: {np.std(cv_scores):.4f}")
    stars = len(model.__class__.__name__) + 8
    print("*" * stars)
    print("\n")
# quick_eval(xg.XGBClassifier(), X_train, y_train)
# quick_eval(LogisticRegression(), X_train, y_train)
# quick_eval(LGBMClassifier(), X_train, y_train)
# Disabled quick-submission stub (single-prediction template left over from an earlier setup).
if False:
    model.fit(X, y)
    predictions = model.predict(X_test)
    sub1 = pd.read_csv("sample_submission.csv")
    sub1['Fertilizer Name'] = predictions
    sub1.to_csv("xgboost.csv", index=False)
    print("Sub shape:", sub1.shape)
    sub1.head()
In [20]:
def mapk(actual, predicted, k=3):
    """
    Mean average precision at k with a single true label per row,
    i.e. the mean reciprocal rank of the true label within the top-k list.
    actual: array-like of true labels
    predicted: array-like of lists of predicted labels (top k, best first)
    """
    score = 0.0
    for a, p in zip(actual, predicted):
        p = list(p)[:k]
        score += 1.0 / (p.index(a) + 1) if a in p else 0.0
    return score / len(actual)
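A quick sanity check of the helper on made-up labels:

# mapk sanity check with illustrative labels (not real data).
actual = [4, 2, 0]
predicted = [[4, 1, 2],   # true label at rank 1 -> contributes 1.0
             [1, 2, 0],   # true label at rank 2 -> contributes 0.5
             [3, 5, 6]]   # true label missing   -> contributes 0.0
print(mapk(actual, predicted, k=3))  # (1.0 + 0.5 + 0.0) / 3 = 0.5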
In [21]:
train.head()
Out[21]:
 | id | Temparature | Humidity | Moisture | Soil_Type | Crop_Type | Nitrogen | Potassium | Phosphorous | Fertilizer_Name |
---|---|---|---|---|---|---|---|---|---|---
0 | 0 | 37 | 70 | 36 | 1 | 8 | 36 | 4 | 5 | 4 |
1 | 1 | 27 | 69 | 65 | 4 | 4 | 30 | 6 | 18 | 4 |
2 | 2 | 29 | 63 | 32 | 4 | 4 | 24 | 12 | 16 | 2 |
3 | 3 | 35 | 62 | 54 | 4 | 0 | 39 | 12 | 4 | 0 |
4 | 4 | 35 | 58 | 43 | 3 | 6 | 37 | 2 | 16 | 5 |
Experimenting with additional engineered features (the block below is currently disabled).
In [22]:
if False:
    # Deviation of each row's moisture from the mean moisture of its soil type
    soil_moisture_mean = train.groupby('Soil_Type')['Moisture'].transform('mean')
    train['Moisture_Deviation'] = train['Moisture'] - soil_moisture_mean
    soil_moisture_mean_test = test.groupby('Soil_Type')['Moisture'].transform('mean')
    test['Moisture_Deviation'] = test['Moisture'] - soil_moisture_mean_test

    # Weighted climate index
    train['Climate_Index'] = (
        0.3 * train['Temparature'] +
        0.1 * train['Humidity'] +
        0.6 * train['Moisture']
    )
    test['Climate_Index'] = (
        0.3 * test['Temparature'] +
        0.1 * test['Humidity'] +
        0.6 * test['Moisture']
    )

    # Polynomial and interaction terms
    train['Nitrogen_Sq'] = train['Nitrogen'] ** 2
    train['Temp_Moisture'] = train['Temparature'] * train['Moisture']
    test['Nitrogen_Sq'] = test['Nitrogen'] ** 2
    test['Temp_Moisture'] = test['Temparature'] * test['Moisture']

    # Nutrient ratios (+1 in the denominator to avoid division by zero)
    train['N_K_ratio'] = train['Nitrogen'] / (train['Potassium'] + 1)
    train['N_P_ratio'] = train['Nitrogen'] / (train['Phosphorous'] + 1)
    train['K_P_ratio'] = train['Potassium'] / (train['Phosphorous'] + 1)
    test['N_K_ratio'] = test['Nitrogen'] / (test['Potassium'] + 1)
    test['N_P_ratio'] = test['Nitrogen'] / (test['Phosphorous'] + 1)
    test['K_P_ratio'] = test['Potassium'] / (test['Phosphorous'] + 1)
In [23]:
X = train.drop(columns=['id', 'Fertilizer_Name'], errors='ignore')
X_test = test.drop(columns=['id'], errors='ignore')
y = train["Fertilizer_Name"]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, shuffle=True, random_state=SEED)
In [24]:
X_train.head()
Out[24]:
 | Temparature | Humidity | Moisture | Soil_Type | Crop_Type | Nitrogen | Potassium | Phosphorous |
---|---|---|---|---|---|---|---|---
453635 | 28 | 51 | 47 | 2 | 2 | 20 | 17 | 24 |
11651 | 33 | 62 | 30 | 4 | 0 | 7 | 0 | 6 |
431999 | 38 | 59 | 41 | 2 | 6 | 24 | 11 | 42 |
529211 | 26 | 52 | 57 | 2 | 10 | 27 | 17 | 19 |
110925 | 37 | 61 | 35 | 0 | 6 | 25 | 14 | 16 |
In [26]:
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer
def mapk_eval(y_true, y_pred_proba, k=3):
    # Turn class probabilities into top-k class indices (best first) and score with MAP@k.
    top_k = np.argsort(y_pred_proba, axis=1)[:, -k:][:, ::-1]
    return mapk(y_true, top_k.tolist(), k=k)

def objective(trial):
    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 100, 400),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 5),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 5),
        'random_state': SEED,
        'use_label_encoder': False,
        'eval_metric': 'mlogloss'
    }
    model = xg.XGBClassifier(**params)
    model.fit(X_train, y_train)
    y_pred_proba = model.predict_proba(X_val)
    score = mapk_eval(y_val.values, y_pred_proba, k=3)
    return score
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30)
print("Best MAP@3:", study.best_value)
print("Best params:", study.best_params)
[I 2025-06-11 17:41:09,298] A new study created in memory with name: no-name-dd5b1a35-a095-4ac2-a92c-8fe87d4dee67 [I 2025-06-11 17:41:36,577] Trial 0 finished with value: 0.3231222222222909 and parameters: {'max_depth': 8, 'learning_rate': 0.06326287765150974, 'n_estimators': 162, 'subsample': 0.6821233517625884, 'colsample_bytree': 0.9638359504388128, 'gamma': 2.4792001687350584, 'reg_alpha': 1.0189751290331739, 'reg_lambda': 4.768573075328976}. Best is trial 0 with value: 0.3231222222222909. [I 2025-06-11 17:41:55,650] Trial 1 finished with value: 0.31536555555561735 and parameters: {'max_depth': 6, 'learning_rate': 0.16841103012214031, 'n_estimators': 129, 'subsample': 0.8081429739265287, 'colsample_bytree': 0.8205096527873423, 'gamma': 3.3963004754342645, 'reg_alpha': 4.3330543885150945, 'reg_lambda': 2.569170589281376}. Best is trial 0 with value: 0.3231222222222909. [I 2025-06-11 17:42:17,485] Trial 2 finished with value: 0.32085333333340355 and parameters: {'max_depth': 6, 'learning_rate': 0.16527399809340892, 'n_estimators': 142, 'subsample': 0.8894209972128635, 'colsample_bytree': 0.6185529510142461, 'gamma': 2.02614326399173, 'reg_alpha': 1.4271732057664281, 'reg_lambda': 1.998810820263441}. Best is trial 0 with value: 0.3231222222222909. [I 2025-06-11 17:42:40,289] Trial 3 finished with value: 0.3224188888889598 and parameters: {'max_depth': 4, 'learning_rate': 0.2230595671108154, 'n_estimators': 139, 'subsample': 0.7703989917310565, 'colsample_bytree': 0.7062399714577376, 'gamma': 0.2054877113843251, 'reg_alpha': 0.3159713492248678, 'reg_lambda': 1.425548894820482}. Best is trial 0 with value: 0.3231222222222909. [I 2025-06-11 17:43:21,373] Trial 4 finished with value: 0.3284222222222986 and parameters: {'max_depth': 8, 'learning_rate': 0.11729039181348787, 'n_estimators': 292, 'subsample': 0.7868763316394609, 'colsample_bytree': 0.9479985598060687, 'gamma': 1.837477551754601, 'reg_alpha': 0.3794920771432825, 'reg_lambda': 0.87934828545284}. Best is trial 4 with value: 0.3284222222222986. [I 2025-06-11 17:44:03,017] Trial 5 finished with value: 0.3200177777778431 and parameters: {'max_depth': 9, 'learning_rate': 0.19739168479199648, 'n_estimators': 372, 'subsample': 0.9157227648549711, 'colsample_bytree': 0.7188536462639663, 'gamma': 2.816024478482395, 'reg_alpha': 3.837768348497872, 'reg_lambda': 3.3131651684887404}. Best is trial 4 with value: 0.3284222222222986. [I 2025-06-11 17:44:19,037] Trial 6 finished with value: 0.31562111111117375 and parameters: {'max_depth': 5, 'learning_rate': 0.12010698415562772, 'n_estimators': 111, 'subsample': 0.9810691798381612, 'colsample_bytree': 0.9140888906728758, 'gamma': 1.915539709422021, 'reg_alpha': 2.243296498062493, 'reg_lambda': 4.661617348426917}. Best is trial 4 with value: 0.3284222222222986. [I 2025-06-11 17:45:08,842] Trial 7 finished with value: 0.3032555555556048 and parameters: {'max_depth': 4, 'learning_rate': 0.01099005409111613, 'n_estimators': 381, 'subsample': 0.6727749070349011, 'colsample_bytree': 0.9134688747252768, 'gamma': 0.3710408507772206, 'reg_alpha': 0.29748030502547484, 'reg_lambda': 4.423774568327871}. Best is trial 4 with value: 0.3284222222222986. [I 2025-06-11 17:45:40,582] Trial 8 finished with value: 0.32276222222229256 and parameters: {'max_depth': 6, 'learning_rate': 0.059865987470583186, 'n_estimators': 221, 'subsample': 0.7582158086417364, 'colsample_bytree': 0.8404512979236067, 'gamma': 1.9623022408098918, 'reg_alpha': 1.206373500960709, 'reg_lambda': 1.419453652982462}. 
Best is trial 4 with value: 0.3284222222222986. [I 2025-06-11 17:46:23,280] Trial 9 finished with value: 0.3264944444445166 and parameters: {'max_depth': 8, 'learning_rate': 0.027302788769481147, 'n_estimators': 229, 'subsample': 0.7529154197261488, 'colsample_bytree': 0.6529186522939491, 'gamma': 1.1240570771678426, 'reg_alpha': 0.6121402665107212, 'reg_lambda': 4.099642876119921}. Best is trial 4 with value: 0.3284222222222986. [I 2025-06-11 17:46:53,197] Trial 10 finished with value: 0.3119122222222805 and parameters: {'max_depth': 10, 'learning_rate': 0.2998616713509893, 'n_estimators': 310, 'subsample': 0.616720220647669, 'colsample_bytree': 0.9900324135783889, 'gamma': 4.829883864621441, 'reg_alpha': 2.86151565825943, 'reg_lambda': 0.2342337106652529}. Best is trial 4 with value: 0.3284222222222986. [I 2025-06-11 17:47:31,022] Trial 11 finished with value: 0.3343077777778579 and parameters: {'max_depth': 8, 'learning_rate': 0.10141428721696402, 'n_estimators': 272, 'subsample': 0.8380054019273669, 'colsample_bytree': 0.6006088545378645, 'gamma': 1.0967787479159925, 'reg_alpha': 0.07143352213340626, 'reg_lambda': 0.004677646038029781}. Best is trial 11 with value: 0.3343077777778579. [I 2025-06-11 17:48:10,152] Trial 12 finished with value: 0.33292222222230466 and parameters: {'max_depth': 8, 'learning_rate': 0.1164193628787389, 'n_estimators': 296, 'subsample': 0.8469524872130173, 'colsample_bytree': 0.756160270571199, 'gamma': 1.126997178635461, 'reg_alpha': 1.889708207869898, 'reg_lambda': 0.04509237921231746}. Best is trial 11 with value: 0.3343077777778579. [I 2025-06-11 17:48:50,703] Trial 13 finished with value: 0.3327977777778562 and parameters: {'max_depth': 10, 'learning_rate': 0.1079885059779757, 'n_estimators': 282, 'subsample': 0.8642956813550425, 'colsample_bytree': 0.758807855846258, 'gamma': 0.929319860398242, 'reg_alpha': 2.101194782635285, 'reg_lambda': 0.0720654324119212}. Best is trial 11 with value: 0.3343077777778579. [I 2025-06-11 17:49:35,590] Trial 14 finished with value: 0.3316377777778581 and parameters: {'max_depth': 7, 'learning_rate': 0.08973884386953482, 'n_estimators': 346, 'subsample': 0.8401850492620782, 'colsample_bytree': 0.6129635658250652, 'gamma': 1.1026587783031196, 'reg_alpha': 3.097882505594469, 'reg_lambda': 0.7020390385613303}. Best is trial 11 with value: 0.3343077777778579. [I 2025-06-11 17:50:00,539] Trial 15 finished with value: 0.3167311111111737 and parameters: {'max_depth': 9, 'learning_rate': 0.14071780808295184, 'n_estimators': 252, 'subsample': 0.9188314560970854, 'colsample_bytree': 0.6782619837824262, 'gamma': 3.9184671621017677, 'reg_alpha': 1.848825814992794, 'reg_lambda': 0.7976018481325824}. Best is trial 11 with value: 0.3343077777778579. [I 2025-06-11 17:50:21,306] Trial 16 finished with value: 0.3270488888889637 and parameters: {'max_depth': 7, 'learning_rate': 0.2453457898102919, 'n_estimators': 189, 'subsample': 0.9904395333844096, 'colsample_bytree': 0.7700252445735166, 'gamma': 0.6994091133884861, 'reg_alpha': 3.57368344372309, 'reg_lambda': 2.7727524033511157}. Best is trial 11 with value: 0.3343077777778579. [I 2025-06-11 17:51:04,023] Trial 17 finished with value: 0.3311422222223003 and parameters: {'max_depth': 9, 'learning_rate': 0.07983509022972851, 'n_estimators': 318, 'subsample': 0.8352748994437167, 'colsample_bytree': 0.8578981187981797, 'gamma': 1.3823289286604317, 'reg_alpha': 4.634757821379262, 'reg_lambda': 1.7479536120917665}. Best is trial 11 with value: 0.3343077777778579. 
[I 2025-06-11 17:51:49,971] Trial 18 finished with value: 0.32899888888896345 and parameters: {'max_depth': 7, 'learning_rate': 0.046557345482992424, 'n_estimators': 265, 'subsample': 0.722134013823966, 'colsample_bytree': 0.7570135557682693, 'gamma': 0.018843173301881677, 'reg_alpha': 1.721014203810751, 'reg_lambda': 0.42030265083030405}. Best is trial 11 with value: 0.3343077777778579. [I 2025-06-11 17:52:24,791] Trial 19 finished with value: 0.3151444444445073 and parameters: {'max_depth': 3, 'learning_rate': 0.14308686291690068, 'n_estimators': 336, 'subsample': 0.9417566294561698, 'colsample_bytree': 0.6545149117279865, 'gamma': 1.5231548436377371, 'reg_alpha': 0.9197089319250962, 'reg_lambda': 1.1776366740093365}. Best is trial 11 with value: 0.3343077777778579. [I 2025-06-11 17:52:48,544] Trial 20 finished with value: 0.32070777777784526 and parameters: {'max_depth': 8, 'learning_rate': 0.1905085685264455, 'n_estimators': 227, 'subsample': 0.8222165062977569, 'colsample_bytree': 0.6002849729898877, 'gamma': 2.6446219490853693, 'reg_alpha': 2.7049477362855354, 'reg_lambda': 3.354669886543953}. Best is trial 11 with value: 0.3343077777778579. [I 2025-06-11 17:53:33,877] Trial 21 finished with value: 0.3350211111111929 and parameters: {'max_depth': 10, 'learning_rate': 0.11248115536930409, 'n_estimators': 278, 'subsample': 0.8668532639254519, 'colsample_bytree': 0.7773126206273325, 'gamma': 0.8089080347514379, 'reg_alpha': 2.287551543519361, 'reg_lambda': 0.21171740386923416}. Best is trial 21 with value: 0.3350211111111929. [I 2025-06-11 17:54:20,902] Trial 22 finished with value: 0.3345766666667466 and parameters: {'max_depth': 10, 'learning_rate': 0.09391669624087032, 'n_estimators': 271, 'subsample': 0.8761041182170525, 'colsample_bytree': 0.7868708543511961, 'gamma': 0.671527886064389, 'reg_alpha': 2.413656615251328, 'reg_lambda': 0.10315529276170246}. Best is trial 21 with value: 0.3350211111111929. [I 2025-06-11 17:55:11,412] Trial 23 finished with value: 0.33531888888896916 and parameters: {'max_depth': 10, 'learning_rate': 0.08058832222297987, 'n_estimators': 268, 'subsample': 0.8878145949016356, 'colsample_bytree': 0.7945209987411344, 'gamma': 0.6025081681020137, 'reg_alpha': 3.1628674276138575, 'reg_lambda': 0.5475825279790727}. Best is trial 23 with value: 0.33531888888896916. [I 2025-06-11 17:56:02,814] Trial 24 finished with value: 0.33481222222230106 and parameters: {'max_depth': 10, 'learning_rate': 0.07632962492284197, 'n_estimators': 247, 'subsample': 0.8828823700576881, 'colsample_bytree': 0.8061121219495095, 'gamma': 0.5619959296860586, 'reg_alpha': 3.207741677387607, 'reg_lambda': 0.5402723006951022}. Best is trial 23 with value: 0.33531888888896916. [I 2025-06-11 17:56:46,908] Trial 25 finished with value: 0.33146000000007747 and parameters: {'max_depth': 10, 'learning_rate': 0.044158678870159264, 'n_estimators': 196, 'subsample': 0.9539433757198061, 'colsample_bytree': 0.8740205213763044, 'gamma': 0.4914488530822796, 'reg_alpha': 3.3348708028359657, 'reg_lambda': 0.6339574584605174}. Best is trial 23 with value: 0.33531888888896916. [I 2025-06-11 17:57:32,428] Trial 26 finished with value: 0.3348277777778593 and parameters: {'max_depth': 9, 'learning_rate': 0.07596125416611232, 'n_estimators': 203, 'subsample': 0.8933575475802615, 'colsample_bytree': 0.8069483676293034, 'gamma': 0.014269500839304783, 'reg_alpha': 3.93767993130073, 'reg_lambda': 1.1052745516298361}. Best is trial 23 with value: 0.33531888888896916. 
[I 2025-06-11 17:58:15,779] Trial 27 finished with value: 0.3302011111111886 and parameters: {'max_depth': 9, 'learning_rate': 0.04148758411418811, 'n_estimators': 189, 'subsample': 0.9094973578765564, 'colsample_bytree': 0.7229847571564367, 'gamma': 0.001878902388947612, 'reg_alpha': 4.0162576166031805, 'reg_lambda': 1.9076972350531038}. Best is trial 23 with value: 0.33531888888896916. [I 2025-06-11 17:58:42,085] Trial 28 finished with value: 0.3272244444445181 and parameters: {'max_depth': 9, 'learning_rate': 0.1426550580298173, 'n_estimators': 211, 'subsample': 0.9513131954283298, 'colsample_bytree': 0.8272066434365707, 'gamma': 1.5056017008099156, 'reg_alpha': 4.933690282059899, 'reg_lambda': 1.0899001839176081}. Best is trial 23 with value: 0.33531888888896916. [I 2025-06-11 17:59:22,774] Trial 29 finished with value: 0.33271777777785644 and parameters: {'max_depth': 10, 'learning_rate': 0.0695946642352586, 'n_estimators': 172, 'subsample': 0.9086247963279899, 'colsample_bytree': 0.8934964105380269, 'gamma': 0.30599557927289756, 'reg_alpha': 4.090575478084377, 'reg_lambda': 2.1752726368794715}. Best is trial 23 with value: 0.33531888888896916.
Best MAP@3: 0.33531888888896916 Best params: {'max_depth': 10, 'learning_rate': 0.08058832222297987, 'n_estimators': 268, 'subsample': 0.8878145949016356, 'colsample_bytree': 0.7945209987411344, 'gamma': 0.6025081681020137, 'reg_alpha': 3.1628674276138575, 'reg_lambda': 0.5475825279790727}
Experiment notes:
- Best MAP@3: 0.33507222222230404 (Optuna, XGBClassifier, extra feature: Moisture_Deviation)
- Best MAP@3: 0.3415511111111975 (Optuna, XGBClassifier, no extra features)
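For clarity, the top-3 extraction used in mapk_eval and in every submission cell is just an argsort slice over the class-probability matrix; a minimal illustration with toy probabilities:

# argsort sorts ascending, so the last three columns are the three largest
# probabilities; reversing puts the most likely class first.
probs_demo = np.array([[0.10, 0.05, 0.40, 0.20, 0.25]])
top3_demo = np.argsort(probs_demo, axis=1)[:, -3:][:, ::-1]
print(top3_demo)  # [[2 4 3]]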
In [27]:
og_train = pd.read_csv('train.csv')
og_test = pd.read_csv('test.csv')
In [28]:
fertilizer_encoder = LabelEncoder()
original_fertilizer_names = og_train['Fertilizer Name'].unique()
fertilizer_encoder.fit(original_fertilizer_names)
Out[28]:
LabelEncoder()
In [29]:
# Refit the tuned model on the full training data and build the top-3 submission.
submission1 = pd.read_csv("sample_submission.csv")  # template (columns only)
model = xg.XGBClassifier(**study.best_params)
model.fit(X, y)
probs = model.predict_proba(X_test)
top3_indices = np.argsort(probs, axis=1)[:, -3:][:, ::-1]
top3_fertilizers = [' '.join(fertilizer_encoder.inverse_transform(row)) for row in top3_indices]
submission = pd.DataFrame({
    'ID': test['id'],
    'Fertilizer_Name': top3_fertilizers
})
submission.to_csv('submission500.csv', index=False)
In [30]:
submission
Out[30]:
 | ID | Fertilizer_Name |
---|---|---
0 | 750000 | DAP 10-26-26 28-28 |
1 | 750001 | 17-17-17 20-20 10-26-26 |
2 | 750002 | 20-20 28-28 10-26-26 |
3 | 750003 | 14-35-14 DAP Urea |
4 | 750004 | 20-20 10-26-26 28-28 |
... | ... | ... |
249995 | 999995 | 17-17-17 28-28 20-20 |
249996 | 999996 | 10-26-26 14-35-14 17-17-17 |
249997 | 999997 | DAP 14-35-14 Urea |
249998 | 999998 | 10-26-26 17-17-17 28-28 |
249999 | 999999 | 14-35-14 17-17-17 20-20 |
250000 rows × 2 columns
In [31]:
from sklearn.model_selection import StratifiedKFold
from collections import Counter
In [32]:
# 1. Compute class weights globally (can also be computed per fold)
counter_full = Counter(y)
max_count_full = max(counter_full.values())
class_weights_full = {cls: max_count_full / count for cls, count in counter_full.items()}

# 2. Stratified CV
kfold = StratifiedKFold(n_splits=15, shuffle=True, random_state=42)
fold_accuracies = []
oof_preds = np.zeros((X.shape[0], len(np.unique(y))))

for fold, (train_idx, val_idx) in enumerate(kfold.split(X, y), 1):
    print(f"\n================ Fold {fold} ================")
    X_tr, X_va = X.iloc[train_idx], X.iloc[val_idx]
    y_tr, y_va = y.iloc[train_idx], y.iloc[val_idx]

    # 3. Compute per-instance weights (inverse class frequency within the fold)
    counter_fold = Counter(y_tr)
    max_count_fold = max(counter_fold.values())
    sample_weights = y_tr.map(lambda cls: max_count_fold / counter_fold[cls])

    # 4. Instantiate XGBoost model
    XGB_model = xg.XGBClassifier(
        max_depth=12,
        colsample_bytree=0.467,
        subsample=0.86,
        n_estimators=4000,
        learning_rate=0.03,
        gamma=0.26,
        max_delta_step=4,
        reg_alpha=2.7,
        reg_lambda=1.4,
        objective='multi:softprob',
        random_state=13,
        enable_categorical=True,
        tree_method='hist',
        device='cuda',
        early_stopping_rounds=150,
    )

    # 5. Fit with early stopping on the fold's validation split
    XGB_model.fit(
        X_tr,
        y_tr,
        sample_weight=sample_weights,
        eval_set=[(X_va, y_va)],
        verbose=200,
    )

    val_labels = XGB_model.predict(X_va)
    val_probas = XGB_model.predict_proba(X_va)
    oof_preds[val_idx] = val_probas

    acc = accuracy_score(y_va, val_labels)
    fold_accuracies.append(acc)
    print(f"✅ Fold {fold} Accuracy: {acc:.4f}")

# 6. Final CV metrics
print("\n🎯 Mean CV Accuracy:", np.mean(fold_accuracies))
print("📈 Std CV Accuracy:", np.std(fold_accuracies))

# Get top-3 predicted class indices from the out-of-fold probabilities
top3_preds = np.argsort(oof_preds, axis=1)[:, ::-1][:, :3]
================ Fold 1 ================ [0] validation_0-mlogloss:1.94566 [200] validation_0-mlogloss:1.91935 [400] validation_0-mlogloss:1.90982 [600] validation_0-mlogloss:1.90566 [800] validation_0-mlogloss:1.90391 [1000] validation_0-mlogloss:1.90314 [1200] validation_0-mlogloss:1.90276 [1400] validation_0-mlogloss:1.90284 [1423] validation_0-mlogloss:1.90287 ✅ Fold 1 Accuracy: 0.2154 ================ Fold 2 ================ [0] validation_0-mlogloss:1.94569 [200] validation_0-mlogloss:1.91987 [400] validation_0-mlogloss:1.91058 [600] validation_0-mlogloss:1.90659 [800] validation_0-mlogloss:1.90491 [1000] validation_0-mlogloss:1.90431 [1200] validation_0-mlogloss:1.90422 [1243] validation_0-mlogloss:1.90421 ✅ Fold 2 Accuracy: 0.2124 ================ Fold 3 ================ [0] validation_0-mlogloss:1.94564 [200] validation_0-mlogloss:1.91894 [400] validation_0-mlogloss:1.90917 [600] validation_0-mlogloss:1.90455 [800] validation_0-mlogloss:1.90255 [1000] validation_0-mlogloss:1.90177 [1200] validation_0-mlogloss:1.90141 [1400] validation_0-mlogloss:1.90139 [1414] validation_0-mlogloss:1.90137 ✅ Fold 3 Accuracy: 0.2161 ================ Fold 4 ================ [0] validation_0-mlogloss:1.94566
--------------------------------------------------------------------------- KeyboardInterrupt Traceback (most recent call last) Cell In[32], line 42 23 XGB_model = xg.XGBClassifier( 24 max_depth=12, 25 colsample_bytree=0.467, (...) 38 early_stopping_rounds=150, 39 ) 41 # 5. Fit with early stopping ---> 42 XGB_model.fit( 43 X_tr, 44 y_tr, 45 sample_weight=sample_weights, 46 eval_set=[(X_va, y_va)], 47 verbose=200, 48 ) 50 val_labels = XGB_model.predict(X_va) 51 val_probas = XGB_model.predict_proba(X_va) File c:\Users\robkr\anaconda3\Lib\site-packages\xgboost\core.py:729, in require_keyword_args.<locals>.throw_if.<locals>.inner_f(*args, **kwargs) 727 for k, arg in zip(sig.parameters, args): 728 kwargs[k] = arg --> 729 return func(**kwargs) File c:\Users\robkr\anaconda3\Lib\site-packages\xgboost\sklearn.py:1682, in XGBClassifier.fit(self, X, y, sample_weight, base_margin, eval_set, verbose, xgb_model, sample_weight_eval_set, base_margin_eval_set, feature_weights) 1660 model, metric, params, feature_weights = self._configure_fit( 1661 xgb_model, params, feature_weights 1662 ) 1663 train_dmatrix, evals = _wrap_evaluation_matrices( 1664 missing=self.missing, 1665 X=X, (...) 1679 feature_types=self.feature_types, 1680 ) -> 1682 self._Booster = train( 1683 params, 1684 train_dmatrix, 1685 self.get_num_boosting_rounds(), 1686 evals=evals, 1687 early_stopping_rounds=self.early_stopping_rounds, 1688 evals_result=evals_result, 1689 obj=obj, 1690 custom_metric=metric, 1691 verbose_eval=verbose, 1692 xgb_model=model, 1693 callbacks=self.callbacks, 1694 ) 1696 if not callable(self.objective): 1697 self.objective = params["objective"] File c:\Users\robkr\anaconda3\Lib\site-packages\xgboost\core.py:729, in require_keyword_args.<locals>.throw_if.<locals>.inner_f(*args, **kwargs) 727 for k, arg in zip(sig.parameters, args): 728 kwargs[k] = arg --> 729 return func(**kwargs) File c:\Users\robkr\anaconda3\Lib\site-packages\xgboost\training.py:183, in train(params, dtrain, num_boost_round, evals, obj, maximize, early_stopping_rounds, evals_result, verbose_eval, xgb_model, callbacks, custom_metric) 181 if cb_container.before_iteration(bst, i, dtrain, evals): 182 break --> 183 bst.update(dtrain, iteration=i, fobj=obj) 184 if cb_container.after_iteration(bst, i, dtrain, evals): 185 break File c:\Users\robkr\anaconda3\Lib\site-packages\xgboost\core.py:2247, in Booster.update(self, dtrain, iteration, fobj) 2243 self._assign_dmatrix_features(dtrain) 2245 if fobj is None: 2246 _check_call( -> 2247 _LIB.XGBoosterUpdateOneIter( 2248 self.handle, ctypes.c_int(iteration), dtrain.handle 2249 ) 2250 ) 2251 else: 2252 pred = self.predict(dtrain, output_margin=True, training=True) KeyboardInterrupt:
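The per-fold sample weights above are plain inverse-frequency weights: the rarest class gets the largest weight. A toy check with hypothetical counts:

# Toy check of the inverse-frequency weighting used inside the CV loop.
from collections import Counter
import pandas as pd
y_toy = pd.Series([0, 0, 0, 1, 1, 2])   # hypothetical labels; class 0 is most common
counts = Counter(y_toy)
max_count = max(counts.values())
print(y_toy.map(lambda cls: max_count / counts[cls]).tolist())  # [1.0, 1.0, 1.0, 1.5, 1.5, 3.0]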
In [ ]:
map3_score = mapk(
    y.values.tolist(),
    top3_preds.tolist(),
    k=3
)
print(f"\n📊 Mean Average Precision @3 (MAP@3): {map3_score:.5f}")
📊 Mean Average Precision @3 (MAP@3): 0.35309
In [ ]:
preds1 = model.predict_proba(X_test)
preds2 = XGB_model.predict_proba(X_test)
In [ ]:
# Weighted blend of the CV-trained model (preds2) and the Optuna-tuned model (preds1)
ensemble_test_preds = (0.2 * preds2 + 0.8 * preds1)
submission2 = pd.read_csv("sample_submission.csv")
top3_indices = np.argsort(ensemble_test_preds, axis=1)[:, -3:][:, ::-1]
top3_fertilizers = [' '.join(fertilizer_encoder.inverse_transform(row)) for row in top3_indices]
submission2 = pd.DataFrame({
    'ID': test['id'],
    'Fertilizer Name': top3_fertilizers
})
submission2.to_csv('submissionensemble10.csv', index=False)
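One sanity check worth running on a probability blend: with weights that sum to 1, every blended row should still sum to roughly 1.

# Optional check on the blend above: weights sum to 1 and rows stay normalized.
assert np.isclose(0.2 + 0.8, 1.0)
print(np.allclose(ensemble_test_preds.sum(axis=1), 1.0, atol=1e-4))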
In [ ]:
submission2.head()
Out[ ]:
 | ID | Fertilizer_Name |
---|---|---
0 | 750000 | DAP 10-26-26 28-28 |
1 | 750001 | 17-17-17 20-20 10-26-26 |
2 | 750002 | 20-20 28-28 10-26-26 |
3 | 750003 | 14-35-14 DAP 17-17-17 |
4 | 750004 | 20-20 Urea 10-26-26 |
In [ ]:
train.head()
Out[ ]:
 | id | Temparature | Humidity | Moisture | Soil_Type | Crop_Type | Nitrogen | Potassium | Phosphorous | Fertilizer_Name |
---|---|---|---|---|---|---|---|---|---|---
0 | 0 | 37 | 70 | 36 | 1 | 8 | 36 | 4 | 5 | 4 |
1 | 1 | 27 | 69 | 65 | 4 | 4 | 30 | 6 | 18 | 4 |
2 | 2 | 29 | 63 | 32 | 4 | 4 | 24 | 12 | 16 | 2 |
3 | 3 | 35 | 62 | 54 | 4 | 0 | 39 | 12 | 4 | 0 |
4 | 4 | 35 | 58 | 43 | 3 | 6 | 37 | 2 | 16 | 5 |
In [ ]:
new_train = pd.read_csv('train.csv')
new_test = pd.read_csv('test.csv')
In [ ]:
new_train.head()
Out[ ]:
 | id | Temparature | Humidity | Moisture | Soil Type | Crop Type | Nitrogen | Potassium | Phosphorous | Fertilizer Name |
---|---|---|---|---|---|---|---|---|---|---
0 | 0 | 37 | 70 | 36 | Clayey | Sugarcane | 36 | 4 | 5 | 28-28 |
1 | 1 | 27 | 69 | 65 | Sandy | Millets | 30 | 6 | 18 | 28-28 |
2 | 2 | 29 | 63 | 32 | Sandy | Millets | 24 | 12 | 16 | 17-17-17 |
3 | 3 | 35 | 62 | 54 | Sandy | Barley | 39 | 12 | 4 | 10-26-26 |
4 | 4 | 35 | 58 | 43 | Red | Paddy | 37 | 2 | 16 | DAP |
In [ ]:
X = new_train.drop(columns=['id', 'Fertilizer Name'], errors='ignore')
X_test = new_test.drop(columns=['id'], errors='ignore')
y = new_train["Fertilizer Name"]
In [ ]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, shuffle=True, random_state=SEED)
In [ ]:
from catboost import CatBoostClassifier, Pool

# Define categorical features (CatBoost can consume the raw string columns directly)
cat_features = ['Soil Type', 'Crop Type']  # add other categorical columns if any

# Create CatBoost Pools with the categorical features declared
train_pool = Pool(X_train, y_train, cat_features=cat_features)
val_pool = Pool(X_val, y_val, cat_features=cat_features)

# Initialize and train the CatBoost model
cat_model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    loss_function='MultiClass',
    eval_metric='MultiClass',
    random_seed=SEED,
    gpu_ram_part=0.95,   # fraction of GPU memory CatBoost is allowed to use
    task_type="GPU"      # GPU acceleration
)

# Fit with early stopping on the validation pool
cat_model.fit(
    train_pool,
    eval_set=val_pool,
    early_stopping_rounds=50,
    verbose=200
)

# Get class probabilities for the test set and the top-3 class indices
probs = cat_model.predict_proba(X_test)
top3_indices = np.argsort(probs, axis=1)[:, -3:][:, ::-1]
0: learn: 1.9450360 test: 1.9450892 best: 1.9450892 (0) total: 20.2ms remaining: 20.2s 200: learn: 1.9218504 test: 1.9294902 best: 1.9294902 (200) total: 3.88s remaining: 15.4s 400: learn: 1.9116783 test: 1.9254933 best: 1.9254933 (400) total: 7.71s remaining: 11.5s 600: learn: 1.9036608 test: 1.9233704 best: 1.9233704 (600) total: 11.4s remaining: 7.54s 800: learn: 1.8966321 test: 1.9221321 best: 1.9221321 (800) total: 15.1s remaining: 3.76s 999: learn: 1.8897356 test: 1.9212942 best: 1.9212942 (999) total: 18.8s remaining: 0us bestTest = 1.921294167 bestIteration = 999
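One caveat worth noting: the CatBoost model was fit on the raw fertilizer-name strings, so the column order of its probabilities follows cat_model.classes_, whereas preds1 and preds2 follow the LabelEncoder's ordering. A defensive sketch that decodes CatBoost's top-3 from its own class list, assuming the two orderings are not guaranteed to match:

# Decode CatBoost's top-3 with its own class ordering instead of fertilizer_encoder.
# (Blending raw probability matrices still assumes identical column order across models.)
classes_cat = np.array(cat_model.classes_)
top3_idx_cat = np.argsort(probs, axis=1)[:, -3:][:, ::-1]
print([' '.join(classes_cat[row]) for row in top3_idx_cat[:3]])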
In [ ]:
# Three-way blend: CV XGBoost (preds2), Optuna XGBoost (preds1), CatBoost (preds3)
preds3 = cat_model.predict_proba(X_test)
ensemble_test_preds = (0.33 * preds2 + 0.33 * preds1 + 0.34 * preds3)
submission3 = pd.read_csv("sample_submission.csv")
top3_indices = np.argsort(ensemble_test_preds, axis=1)[:, -3:][:, ::-1]
top3_fertilizers = [' '.join(fertilizer_encoder.inverse_transform(row)) for row in top3_indices]
submission3 = pd.DataFrame({
    'ID': test['id'],
    'Fertilizer Name': top3_fertilizers
})
submission3.to_csv('submissionensemble_cats_xgb_optuna.csv', index=False)
In [ ]:
submission3
Out[ ]:
 | ID | Fertilizer_Name |
---|---|---
0 | 750000 | DAP 10-26-26 28-28 |
1 | 750001 | 17-17-17 20-20 10-26-26 |
2 | 750002 | 20-20 28-28 10-26-26 |
3 | 750003 | 14-35-14 DAP 17-17-17 |
4 | 750004 | 20-20 Urea 10-26-26 |
... | ... | ... |
249995 | 999995 | Urea 17-17-17 28-28 |
249996 | 999996 | 14-35-14 10-26-26 Urea |
249997 | 999997 | DAP Urea 14-35-14 |
249998 | 999998 | 10-26-26 28-28 17-17-17 |
249999 | 999999 | 14-35-14 17-17-17 20-20 |
250000 rows × 2 columns
In [ ]:
print("TODO: submit submission2 and submission3")
TODO: submit submission2 and submission3
In [ ]:
# Export the CV-trained XGBoost predictions (preds2) on their own as a separate submission.
ensemble_test_preds = preds2
submission5 = pd.read_csv("sample_submission.csv")
top3_indices = np.argsort(ensemble_test_preds, axis=1)[:, -3:][:, ::-1]
top3_fertilizers = [' '.join(fertilizer_encoder.inverse_transform(row)) for row in top3_indices]
submission5 = pd.DataFrame({
    'ID': test['id'],
    'Fertilizer Name': top3_fertilizers
})
submission5.to_csv('submisX.csv', index=False)