#6

Kaggle competition: [link]

Entry by Robin R.P.M. Kras

⭐ 1. Introduction & Overview

Your goal: predict the top 3 best-suited fertilizers for a given combination of soil, crop, and weather conditions.
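Submissions list up to three fertilizers per row and are scored with MAP@3, the metric used throughout this notebook: a row earns 1/rank of the first correct guess, or 0 if none of the three match. A tiny illustration of that scoring rule (the labels here are just examples):

# Illustration of the MAP@3 scoring rule for a single row (example labels only)
def row_score(true_label, top3):
    return 1.0 / (top3.index(true_label) + 1) if true_label in top3 else 0.0

print(row_score("28-28", ["28-28", "DAP", "Urea"]))   # 1.0  -> correct at rank 1
print(row_score("28-28", ["DAP", "28-28", "Urea"]))   # 0.5  -> correct at rank 2
print(row_score("28-28", ["DAP", "Urea", "20-20"]))   # 0.0  -> not in the top 3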

🔹 2. Import Libraries & Set Up

In [1]:
# General
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# Machine Learning
import xgboost as xg

from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, mean_absolute_error, mean_squared_error, r2_score, root_mean_squared_error, roc_auc_score
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import make_pipeline

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

from imblearn.over_sampling import SMOTE

# Feature Importance & Explainability
import shap

# Settings
import warnings
warnings.filterwarnings("ignore")

# Set random seed for reproducibility
SEED = 42
np.random.seed(SEED)

print("Libraries loaded. Ready to go!")
Libraries loaded. Ready to go!

🔹 3. Load & Explore Data

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

print(f"Train shape: {train.shape}, Test shape: {test.shape}")
Train shape: (750000, 10), Test shape: (250000, 9)
In [3]:
train.head()
Out[3]:
id Temparature Humidity Moisture Soil Type Crop Type Nitrogen Potassium Phosphorous Fertilizer Name
0 0 37 70 36 Clayey Sugarcane 36 4 5 28-28
1 1 27 69 65 Sandy Millets 30 6 18 28-28
2 2 29 63 32 Sandy Millets 24 12 16 17-17-17
3 3 35 62 54 Sandy Barley 39 12 4 10-26-26
4 4 35 58 43 Red Paddy 37 2 16 DAP
In [4]:
fertilizers = train['Fertilizer Name'].unique()
for i in fertilizers:
    print(i)
28-28
17-17-17
10-26-26
DAP
20-20
14-35-14
Urea
In [5]:
train.isnull().sum()
Out[5]:
id                 0
Temparature        0
Humidity           0
Moisture           0
Soil Type          0
Crop Type          0
Nitrogen           0
Potassium          0
Phosphorous        0
Fertilizer Name    0
dtype: int64
In [6]:
test.isnull().sum()
Out[6]:
id             0
Temparature    0
Humidity       0
Moisture       0
Soil Type      0
Crop Type      0
Nitrogen       0
Potassium      0
Phosphorous    0
dtype: int64
In [7]:
train.columns = train.columns.str.replace(' ', '_')
test.columns = test.columns.str.replace(' ', '_')
In [8]:
CATS = []
NUMS = []
In [9]:
FEATURES = []
In [10]:
for col in train.columns:
    FEATURES.append(col)
In [11]:
for c in FEATURES:
    if train[c].dtype == "object":
        CATS.append(c)

for c in FEATURES:
    if c not in CATS:
        NUMS.append(c)
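For reference, pandas can build the same two lists in one step with select_dtypes; this is just an equivalent sketch, not what the notebook ran:

# Equivalent one-liners (sketch): object columns as CATS, everything else as NUMS
CATS = train.select_dtypes(include="object").columns.tolist()
NUMS = [c for c in train.columns if c not in CATS]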

🔹 4. Data Visualization & EDA

In [12]:
def plot_nums(dataframe):
    float_cols = [col for col in dataframe.columns if dataframe[col].dtype == "float64" or dataframe[col].dtype == "int64"]

    cols_per_row = 3
    num_plots = len(float_cols)
    rows = (num_plots // cols_per_row) + (num_plots % cols_per_row > 0) 

    fig, axes = plt.subplots(rows, cols_per_row, figsize=(15, 5 * rows)) 
    axes = axes.flatten()  

    for idx, col in enumerate(float_cols):
        sns.histplot(dataframe[col], bins=50, kde=True, ax=axes[idx])
        axes[idx].set_title(f"Distribution of {col}")

    for i in range(idx + 1, len(axes)):
        fig.delaxes(axes[i])

    plt.tight_layout()
    plt.show()
In [13]:
if False:
    plot_nums(train.drop(columns=['id', 'Fertilizer_Name']))

🔹 5. Feature Engineering

In [14]:
train.dtypes
Out[14]:
id                  int64
Temparature         int64
Humidity            int64
Moisture            int64
Soil_Type          object
Crop_Type          object
Nitrogen            int64
Potassium           int64
Phosphorous         int64
Fertilizer_Name    object
dtype: object
In [15]:
# train['Climate'] = (train['Humidity'] + train['Temparature'] + train['Moisture']) / 3
# test['Climate'] = (test['Humidity'] + test['Temparature'] + test['Moisture']) / 3
In [16]:
# 'Fertilizer_Name' already sits in CATS (it has object dtype), so it gets encoded here too
encoders = {}

for col in CATS:
    le = LabelEncoder()
    train[col] = le.fit_transform(train[col])
    if col in test.columns:
        # Reuse the encoder fitted on train so test receives consistent codes
        test[col] = le.transform(test[col])
    encoders[col] = le
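As a quick sanity check, the encoders dict kept above exposes the label-to-integer mapping for the target (output not shown):

# Inspect the Fertilizer_Name -> integer mapping produced above
target_le = encoders['Fertilizer_Name']
print(dict(zip(target_le.classes_, target_le.transform(target_le.classes_))))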
In [17]:
train.head()
Out[17]:
id Temparature Humidity Moisture Soil_Type Crop_Type Nitrogen Potassium Phosphorous Fertilizer_Name
0 0 37 70 36 1 8 36 4 5 4
1 1 27 69 65 4 4 30 6 18 4
2 2 29 63 32 4 4 24 12 16 2
3 3 35 62 54 4 0 39 12 4 0
4 4 35 58 43 3 6 37 2 16 5

🔹 6. XGBoost, KFold, CV

In [18]:
from lightgbm import LGBMClassifier
In [19]:
X = train.drop(columns=['id', 'Fertilizer_Name'], errors='ignore')
X_test = test.drop(columns=['id'], errors='ignore')

y = train["Fertilizer_Name"]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, shuffle=True, random_state=SEED)

def quick_eval(model, X_train, y_train):
    model.fit(X_train, y_train)

    predictions_val = model.predict(X_val)

    kf = KFold(n_splits=5, shuffle=True, random_state=SEED)

    # Multiclass target, so score with accuracy rather than a regression metric
    cv_scores = cross_val_score(model, X_train, y_train, cv=kf, scoring='accuracy')

    val_acc = accuracy_score(y_val, predictions_val)
    print(f"*** {model.__class__.__name__} ***")
    print(f"Validation accuracy: {val_acc:.4f}")
    print(f"Mean CV accuracy: {np.mean(cv_scores):.4f}")
    print(f"CV Std Dev: {np.std(cv_scores):.4f}")

    stars = len(model.__class__.__name__) + 8
    print("*" * stars)
    print("\n")

# quick_eval(xg.XGBClassifier(), X_train, y_train)
# quick_eval(LogisticRegression(), X_train, y_train)
# quick_eval(LGBMClassifier(), X_train, y_train)

# (Disabled) quick single-label baseline submission using one of the models above
if False:
    model = xg.XGBClassifier()
    model.fit(X, y)
    predictions = model.predict(X_test)

    sub1 = pd.read_csv("sample_submission.csv")

    # Note: predictions are encoded labels; decode them before submitting
    sub1['Fertilizer Name'] = predictions

    sub1.to_csv("xgboost.csv", index=False)

    print("Sub shape:", sub1.shape)
    sub1.head()
In [20]:
def mapk(actual, predicted, k=3):
    """
    Mean average precision at k, assuming a single relevant label per row.
    actual: array-like of true labels
    predicted: array-like of lists of predicted labels (top k, best first)
    """
    score = 0.0
    for a, p in zip(actual, predicted):
        p = list(p)[:k]
        # 1/rank of the first (and only) correct label, 0 if it is not in the top k
        score += 1.0 / (p.index(a) + 1) if a in p else 0.0
    return score / len(actual)
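A quick sanity check of mapk on toy values (purely illustrative):

# First row correct at rank 1, second at rank 2, third missed: (1.0 + 0.5 + 0.0) / 3 = 0.5
toy_actual = [0, 1, 2]
toy_predicted = [[0, 3, 4], [3, 1, 4], [3, 4, 5]]
print(mapk(toy_actual, toy_predicted, k=3))  # 0.5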
In [21]:
train.head()
Out[21]:
id Temparature Humidity Moisture Soil_Type Crop_Type Nitrogen Potassium Phosphorous Fertilizer_Name
0 0 37 70 36 1 8 36 4 5 4
1 1 27 69 65 4 4 30 6 18 4
2 2 29 63 32 4 4 24 12 16 2
3 3 35 62 54 4 0 39 12 4 0
4 4 35 58 43 3 6 37 2 16 5

Experimenting with extra engineered features (kept disabled below)

In [22]:
if False:
    ###
    soil_moisture_mean = train.groupby('Soil_Type')['Moisture'].transform('mean')
    train['Moisture_Deviation'] = train['Moisture'] - soil_moisture_mean

    soil_moisture_mean_test = test.groupby('Soil_Type')['Moisture'].transform('mean')
    test['Moisture_Deviation'] = test['Moisture'] - soil_moisture_mean_test
    ###

    train['Climate_Index'] = (
        0.3 * train['Temparature'] +
        0.1 * train['Humidity'] +
        0.6 * train['Moisture']
    )

    test['Climate_Index'] = (
        0.3 * test['Temparature'] +
        0.1 * test['Humidity'] +
        0.6 * test['Moisture']
    )

    train['Nitrogen_Sq'] = train['Nitrogen'] ** 2
    train['Temp_Moisture'] = train['Temparature'] * train['Moisture']

    test['Nitrogen_Sq'] = test['Nitrogen'] ** 2
    test['Temp_Moisture'] = test['Temparature'] * test['Moisture']

    train['N_K_ratio'] = train['Nitrogen'] / (train['Potassium'] + 1)
    train['N_P_ratio'] = train['Nitrogen'] / (train['Phosphorous'] + 1)
    train['K_P_ratio'] = train['Potassium'] / (train['Phosphorous'] + 1)

    test['N_K_ratio'] = test['Nitrogen'] / (test['Potassium'] + 1)
    test['N_P_ratio'] = test['Nitrogen'] / (test['Phosphorous'] + 1)
    test['K_P_ratio'] = test['Potassium'] / (test['Phosphorous'] + 1)
In [23]:
X = train.drop(columns=['id', 'Fertilizer_Name'], errors='ignore')
X_test = test.drop(columns=['id'], errors='ignore')

y = train["Fertilizer_Name"]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, shuffle=True, random_state=SEED)
In [24]:
X_train.head()
Out[24]:
Temparature Humidity Moisture Soil_Type Crop_Type Nitrogen Potassium Phosphorous
453635 28 51 47 2 2 20 17 24
11651 33 62 30 4 0 7 0 6
431999 38 59 41 2 6 24 11 42
529211 26 52 57 2 10 27 17 19
110925 37 61 35 0 6 25 14 16
In [26]:
import optuna

def mapk_eval(y_true, y_pred_proba, k=3):
    top_k = np.argsort(y_pred_proba, axis=1)[:, -k:][:, ::-1]
    return mapk(y_true, top_k.tolist(), k=k)

def objective(trial):
    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 100, 400),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 5),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 5),
        'random_state': SEED,
        'use_label_encoder': False,
        'eval_metric': 'mlogloss'
    }
    model = xg.XGBClassifier(**params)
    model.fit(X_train, y_train)
    y_pred_proba = model.predict_proba(X_val)
    score = mapk_eval(y_val.values, y_pred_proba, k=3)
    return score

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30)

print("Best MAP@3:", study.best_value)
print("Best params:", study.best_params)
[I 2025-06-11 17:41:09,298] A new study created in memory with name: no-name-dd5b1a35-a095-4ac2-a92c-8fe87d4dee67
[I 2025-06-11 17:41:36,577] Trial 0 finished with value: 0.3231222222222909 and parameters: {'max_depth': 8, 'learning_rate': 0.06326287765150974, 'n_estimators': 162, 'subsample': 0.6821233517625884, 'colsample_bytree': 0.9638359504388128, 'gamma': 2.4792001687350584, 'reg_alpha': 1.0189751290331739, 'reg_lambda': 4.768573075328976}. Best is trial 0 with value: 0.3231222222222909.
[I 2025-06-11 17:41:55,650] Trial 1 finished with value: 0.31536555555561735 and parameters: {'max_depth': 6, 'learning_rate': 0.16841103012214031, 'n_estimators': 129, 'subsample': 0.8081429739265287, 'colsample_bytree': 0.8205096527873423, 'gamma': 3.3963004754342645, 'reg_alpha': 4.3330543885150945, 'reg_lambda': 2.569170589281376}. Best is trial 0 with value: 0.3231222222222909.
[I 2025-06-11 17:42:17,485] Trial 2 finished with value: 0.32085333333340355 and parameters: {'max_depth': 6, 'learning_rate': 0.16527399809340892, 'n_estimators': 142, 'subsample': 0.8894209972128635, 'colsample_bytree': 0.6185529510142461, 'gamma': 2.02614326399173, 'reg_alpha': 1.4271732057664281, 'reg_lambda': 1.998810820263441}. Best is trial 0 with value: 0.3231222222222909.
[I 2025-06-11 17:42:40,289] Trial 3 finished with value: 0.3224188888889598 and parameters: {'max_depth': 4, 'learning_rate': 0.2230595671108154, 'n_estimators': 139, 'subsample': 0.7703989917310565, 'colsample_bytree': 0.7062399714577376, 'gamma': 0.2054877113843251, 'reg_alpha': 0.3159713492248678, 'reg_lambda': 1.425548894820482}. Best is trial 0 with value: 0.3231222222222909.
[I 2025-06-11 17:43:21,373] Trial 4 finished with value: 0.3284222222222986 and parameters: {'max_depth': 8, 'learning_rate': 0.11729039181348787, 'n_estimators': 292, 'subsample': 0.7868763316394609, 'colsample_bytree': 0.9479985598060687, 'gamma': 1.837477551754601, 'reg_alpha': 0.3794920771432825, 'reg_lambda': 0.87934828545284}. Best is trial 4 with value: 0.3284222222222986.
[I 2025-06-11 17:44:03,017] Trial 5 finished with value: 0.3200177777778431 and parameters: {'max_depth': 9, 'learning_rate': 0.19739168479199648, 'n_estimators': 372, 'subsample': 0.9157227648549711, 'colsample_bytree': 0.7188536462639663, 'gamma': 2.816024478482395, 'reg_alpha': 3.837768348497872, 'reg_lambda': 3.3131651684887404}. Best is trial 4 with value: 0.3284222222222986.
[I 2025-06-11 17:44:19,037] Trial 6 finished with value: 0.31562111111117375 and parameters: {'max_depth': 5, 'learning_rate': 0.12010698415562772, 'n_estimators': 111, 'subsample': 0.9810691798381612, 'colsample_bytree': 0.9140888906728758, 'gamma': 1.915539709422021, 'reg_alpha': 2.243296498062493, 'reg_lambda': 4.661617348426917}. Best is trial 4 with value: 0.3284222222222986.
[I 2025-06-11 17:45:08,842] Trial 7 finished with value: 0.3032555555556048 and parameters: {'max_depth': 4, 'learning_rate': 0.01099005409111613, 'n_estimators': 381, 'subsample': 0.6727749070349011, 'colsample_bytree': 0.9134688747252768, 'gamma': 0.3710408507772206, 'reg_alpha': 0.29748030502547484, 'reg_lambda': 4.423774568327871}. Best is trial 4 with value: 0.3284222222222986.
[I 2025-06-11 17:45:40,582] Trial 8 finished with value: 0.32276222222229256 and parameters: {'max_depth': 6, 'learning_rate': 0.059865987470583186, 'n_estimators': 221, 'subsample': 0.7582158086417364, 'colsample_bytree': 0.8404512979236067, 'gamma': 1.9623022408098918, 'reg_alpha': 1.206373500960709, 'reg_lambda': 1.419453652982462}. Best is trial 4 with value: 0.3284222222222986.
[I 2025-06-11 17:46:23,280] Trial 9 finished with value: 0.3264944444445166 and parameters: {'max_depth': 8, 'learning_rate': 0.027302788769481147, 'n_estimators': 229, 'subsample': 0.7529154197261488, 'colsample_bytree': 0.6529186522939491, 'gamma': 1.1240570771678426, 'reg_alpha': 0.6121402665107212, 'reg_lambda': 4.099642876119921}. Best is trial 4 with value: 0.3284222222222986.
[I 2025-06-11 17:46:53,197] Trial 10 finished with value: 0.3119122222222805 and parameters: {'max_depth': 10, 'learning_rate': 0.2998616713509893, 'n_estimators': 310, 'subsample': 0.616720220647669, 'colsample_bytree': 0.9900324135783889, 'gamma': 4.829883864621441, 'reg_alpha': 2.86151565825943, 'reg_lambda': 0.2342337106652529}. Best is trial 4 with value: 0.3284222222222986.
[I 2025-06-11 17:47:31,022] Trial 11 finished with value: 0.3343077777778579 and parameters: {'max_depth': 8, 'learning_rate': 0.10141428721696402, 'n_estimators': 272, 'subsample': 0.8380054019273669, 'colsample_bytree': 0.6006088545378645, 'gamma': 1.0967787479159925, 'reg_alpha': 0.07143352213340626, 'reg_lambda': 0.004677646038029781}. Best is trial 11 with value: 0.3343077777778579.
[I 2025-06-11 17:48:10,152] Trial 12 finished with value: 0.33292222222230466 and parameters: {'max_depth': 8, 'learning_rate': 0.1164193628787389, 'n_estimators': 296, 'subsample': 0.8469524872130173, 'colsample_bytree': 0.756160270571199, 'gamma': 1.126997178635461, 'reg_alpha': 1.889708207869898, 'reg_lambda': 0.04509237921231746}. Best is trial 11 with value: 0.3343077777778579.
[I 2025-06-11 17:48:50,703] Trial 13 finished with value: 0.3327977777778562 and parameters: {'max_depth': 10, 'learning_rate': 0.1079885059779757, 'n_estimators': 282, 'subsample': 0.8642956813550425, 'colsample_bytree': 0.758807855846258, 'gamma': 0.929319860398242, 'reg_alpha': 2.101194782635285, 'reg_lambda': 0.0720654324119212}. Best is trial 11 with value: 0.3343077777778579.
[I 2025-06-11 17:49:35,590] Trial 14 finished with value: 0.3316377777778581 and parameters: {'max_depth': 7, 'learning_rate': 0.08973884386953482, 'n_estimators': 346, 'subsample': 0.8401850492620782, 'colsample_bytree': 0.6129635658250652, 'gamma': 1.1026587783031196, 'reg_alpha': 3.097882505594469, 'reg_lambda': 0.7020390385613303}. Best is trial 11 with value: 0.3343077777778579.
[I 2025-06-11 17:50:00,539] Trial 15 finished with value: 0.3167311111111737 and parameters: {'max_depth': 9, 'learning_rate': 0.14071780808295184, 'n_estimators': 252, 'subsample': 0.9188314560970854, 'colsample_bytree': 0.6782619837824262, 'gamma': 3.9184671621017677, 'reg_alpha': 1.848825814992794, 'reg_lambda': 0.7976018481325824}. Best is trial 11 with value: 0.3343077777778579.
[I 2025-06-11 17:50:21,306] Trial 16 finished with value: 0.3270488888889637 and parameters: {'max_depth': 7, 'learning_rate': 0.2453457898102919, 'n_estimators': 189, 'subsample': 0.9904395333844096, 'colsample_bytree': 0.7700252445735166, 'gamma': 0.6994091133884861, 'reg_alpha': 3.57368344372309, 'reg_lambda': 2.7727524033511157}. Best is trial 11 with value: 0.3343077777778579.
[I 2025-06-11 17:51:04,023] Trial 17 finished with value: 0.3311422222223003 and parameters: {'max_depth': 9, 'learning_rate': 0.07983509022972851, 'n_estimators': 318, 'subsample': 0.8352748994437167, 'colsample_bytree': 0.8578981187981797, 'gamma': 1.3823289286604317, 'reg_alpha': 4.634757821379262, 'reg_lambda': 1.7479536120917665}. Best is trial 11 with value: 0.3343077777778579.
[I 2025-06-11 17:51:49,971] Trial 18 finished with value: 0.32899888888896345 and parameters: {'max_depth': 7, 'learning_rate': 0.046557345482992424, 'n_estimators': 265, 'subsample': 0.722134013823966, 'colsample_bytree': 0.7570135557682693, 'gamma': 0.018843173301881677, 'reg_alpha': 1.721014203810751, 'reg_lambda': 0.42030265083030405}. Best is trial 11 with value: 0.3343077777778579.
[I 2025-06-11 17:52:24,791] Trial 19 finished with value: 0.3151444444445073 and parameters: {'max_depth': 3, 'learning_rate': 0.14308686291690068, 'n_estimators': 336, 'subsample': 0.9417566294561698, 'colsample_bytree': 0.6545149117279865, 'gamma': 1.5231548436377371, 'reg_alpha': 0.9197089319250962, 'reg_lambda': 1.1776366740093365}. Best is trial 11 with value: 0.3343077777778579.
[I 2025-06-11 17:52:48,544] Trial 20 finished with value: 0.32070777777784526 and parameters: {'max_depth': 8, 'learning_rate': 0.1905085685264455, 'n_estimators': 227, 'subsample': 0.8222165062977569, 'colsample_bytree': 0.6002849729898877, 'gamma': 2.6446219490853693, 'reg_alpha': 2.7049477362855354, 'reg_lambda': 3.354669886543953}. Best is trial 11 with value: 0.3343077777778579.
[I 2025-06-11 17:53:33,877] Trial 21 finished with value: 0.3350211111111929 and parameters: {'max_depth': 10, 'learning_rate': 0.11248115536930409, 'n_estimators': 278, 'subsample': 0.8668532639254519, 'colsample_bytree': 0.7773126206273325, 'gamma': 0.8089080347514379, 'reg_alpha': 2.287551543519361, 'reg_lambda': 0.21171740386923416}. Best is trial 21 with value: 0.3350211111111929.
[I 2025-06-11 17:54:20,902] Trial 22 finished with value: 0.3345766666667466 and parameters: {'max_depth': 10, 'learning_rate': 0.09391669624087032, 'n_estimators': 271, 'subsample': 0.8761041182170525, 'colsample_bytree': 0.7868708543511961, 'gamma': 0.671527886064389, 'reg_alpha': 2.413656615251328, 'reg_lambda': 0.10315529276170246}. Best is trial 21 with value: 0.3350211111111929.
[I 2025-06-11 17:55:11,412] Trial 23 finished with value: 0.33531888888896916 and parameters: {'max_depth': 10, 'learning_rate': 0.08058832222297987, 'n_estimators': 268, 'subsample': 0.8878145949016356, 'colsample_bytree': 0.7945209987411344, 'gamma': 0.6025081681020137, 'reg_alpha': 3.1628674276138575, 'reg_lambda': 0.5475825279790727}. Best is trial 23 with value: 0.33531888888896916.
[I 2025-06-11 17:56:02,814] Trial 24 finished with value: 0.33481222222230106 and parameters: {'max_depth': 10, 'learning_rate': 0.07632962492284197, 'n_estimators': 247, 'subsample': 0.8828823700576881, 'colsample_bytree': 0.8061121219495095, 'gamma': 0.5619959296860586, 'reg_alpha': 3.207741677387607, 'reg_lambda': 0.5402723006951022}. Best is trial 23 with value: 0.33531888888896916.
[I 2025-06-11 17:56:46,908] Trial 25 finished with value: 0.33146000000007747 and parameters: {'max_depth': 10, 'learning_rate': 0.044158678870159264, 'n_estimators': 196, 'subsample': 0.9539433757198061, 'colsample_bytree': 0.8740205213763044, 'gamma': 0.4914488530822796, 'reg_alpha': 3.3348708028359657, 'reg_lambda': 0.6339574584605174}. Best is trial 23 with value: 0.33531888888896916.
[I 2025-06-11 17:57:32,428] Trial 26 finished with value: 0.3348277777778593 and parameters: {'max_depth': 9, 'learning_rate': 0.07596125416611232, 'n_estimators': 203, 'subsample': 0.8933575475802615, 'colsample_bytree': 0.8069483676293034, 'gamma': 0.014269500839304783, 'reg_alpha': 3.93767993130073, 'reg_lambda': 1.1052745516298361}. Best is trial 23 with value: 0.33531888888896916.
[I 2025-06-11 17:58:15,779] Trial 27 finished with value: 0.3302011111111886 and parameters: {'max_depth': 9, 'learning_rate': 0.04148758411418811, 'n_estimators': 189, 'subsample': 0.9094973578765564, 'colsample_bytree': 0.7229847571564367, 'gamma': 0.001878902388947612, 'reg_alpha': 4.0162576166031805, 'reg_lambda': 1.9076972350531038}. Best is trial 23 with value: 0.33531888888896916.
[I 2025-06-11 17:58:42,085] Trial 28 finished with value: 0.3272244444445181 and parameters: {'max_depth': 9, 'learning_rate': 0.1426550580298173, 'n_estimators': 211, 'subsample': 0.9513131954283298, 'colsample_bytree': 0.8272066434365707, 'gamma': 1.5056017008099156, 'reg_alpha': 4.933690282059899, 'reg_lambda': 1.0899001839176081}. Best is trial 23 with value: 0.33531888888896916.
[I 2025-06-11 17:59:22,774] Trial 29 finished with value: 0.33271777777785644 and parameters: {'max_depth': 10, 'learning_rate': 0.0695946642352586, 'n_estimators': 172, 'subsample': 0.9086247963279899, 'colsample_bytree': 0.8934964105380269, 'gamma': 0.30599557927289756, 'reg_alpha': 4.090575478084377, 'reg_lambda': 2.1752726368794715}. Best is trial 23 with value: 0.33531888888896916.
Best MAP@3: 0.33531888888896916
Best params: {'max_depth': 10, 'learning_rate': 0.08058832222297987, 'n_estimators': 268, 'subsample': 0.8878145949016356, 'colsample_bytree': 0.7945209987411344, 'gamma': 0.6025081681020137, 'reg_alpha': 3.1628674276138575, 'reg_lambda': 0.5475825279790727}

Best MAP@3: 0.33507222222230404 (Optuna-tuned XGBClassifier, extra feature: Moisture_Deviation)

Best MAP@3: 0.3415511111111975 (Optuna-tuned XGBClassifier, no extra features)

In [27]:
og_train = pd.read_csv('train.csv')
og_test = pd.read_csv('test.csv')
In [28]:
fertilizer_encoder = LabelEncoder()
original_fertilizer_names = og_train['Fertilizer Name'].unique()
fertilizer_encoder.fit(original_fertilizer_names)
Out[28]:
LabelEncoder()
In [29]:
submission1 = pd.read_csv("sample_submission.csv")

model = xg.XGBClassifier(**study.best_params)
model.fit(X, y)

probs = model.predict_proba(X_test)
top3_indices = np.argsort(probs, axis=1)[:, -3:][:, ::-1]

top3_fertilizers = [' '.join(fertilizer_encoder.inverse_transform(row)) for row in top3_indices]

submission = pd.DataFrame({
    'ID': test['id'],
    'Fertilizer_Name': top3_fertilizers
})

submission.to_csv('submission500.csv', index=False)
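The argsort slice above is what turns the probability matrix into top-3 class indices, highest probability first; a tiny illustration with made-up numbers:

# Made-up probabilities (2 rows, 4 classes) to show the top-3 index selection
toy_probs = np.array([[0.05, 0.60, 0.25, 0.10],
                      [0.55, 0.05, 0.15, 0.25]])
print(np.argsort(toy_probs, axis=1)[:, -3:][:, ::-1])
# [[1 2 3]
#  [0 3 2]]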
In [30]:
submission
Out[30]:
ID Fertilizer_Name
0 750000 DAP 10-26-26 28-28
1 750001 17-17-17 20-20 10-26-26
2 750002 20-20 28-28 10-26-26
3 750003 14-35-14 DAP Urea
4 750004 20-20 10-26-26 28-28
... ... ...
249995 999995 17-17-17 28-28 20-20
249996 999996 10-26-26 14-35-14 17-17-17
249997 999997 DAP 14-35-14 Urea
249998 999998 10-26-26 17-17-17 28-28
249999 999999 14-35-14 17-17-17 20-20

250000 rows × 2 columns

In [31]:
from sklearn.model_selection import StratifiedKFold
from collections import Counter
In [32]:
# 1. Compute class weights globally (for reference; per-fold weights are recomputed inside the loop)
counter_full = Counter(y)
max_count_full = max(counter_full.values())
class_weights_full = {cls: max_count_full / count for cls, count in counter_full.items()}

# 2. Stratified CV
kfold = StratifiedKFold(n_splits=15, shuffle=True, random_state=42)
fold_accuracies = []
oof_preds = np.zeros((X.shape[0], len(np.unique(y))))

for fold, (train_idx, val_idx) in enumerate(kfold.split(X, y), 1):
    print(f"\n================ Fold {fold} ================")

    X_tr, X_va = X.iloc[train_idx], X.iloc[val_idx]
    y_tr, y_va = y.iloc[train_idx], y.iloc[val_idx]

    # 3. Compute per-instance weights
    counter_fold = Counter(y_tr)
    max_count_fold = max(counter_fold.values())
    sample_weights = y_tr.map(lambda cls: max_count_fold / counter_fold[cls])

    # 4. Instantiate XGBoost model
    XGB_model = xg.XGBClassifier(
        max_depth=12,
        colsample_bytree=0.467,
        subsample=0.86,
        n_estimators=4000,
        learning_rate=0.03,
        gamma=0.26,
        max_delta_step=4,
        reg_alpha=2.7,
        reg_lambda=1.4,
        objective='multi:softprob',
        random_state=13,
        enable_categorical=True,
        tree_method='hist',     
        device='cuda',
        early_stopping_rounds=150,        
    )

    # 5. Fit with early stopping
    XGB_model.fit(
        X_tr,
        y_tr,
        sample_weight=sample_weights,
        eval_set=[(X_va, y_va)],
        verbose=200,
    )

    val_labels = XGB_model.predict(X_va)
    val_probas = XGB_model.predict_proba(X_va)

    oof_preds[val_idx] = val_probas
    acc = accuracy_score(y_va, val_labels)
    fold_accuracies.append(acc)
    print(f"✅ Fold {fold} Accuracy: {acc:.4f}")

# 6. Final CV metrics
print("\n🎯 Mean CV Accuracy:", np.mean(fold_accuracies))
print("📈 Std CV Accuracy:", np.std(fold_accuracies))

# Get Top-3 predicted class indices
top3_preds = np.argsort(oof_preds, axis=1)[:, ::-1][:, :3]
================ Fold 1 ================
[0]	validation_0-mlogloss:1.94566
[200]	validation_0-mlogloss:1.91935
[400]	validation_0-mlogloss:1.90982
[600]	validation_0-mlogloss:1.90566
[800]	validation_0-mlogloss:1.90391
[1000]	validation_0-mlogloss:1.90314
[1200]	validation_0-mlogloss:1.90276
[1400]	validation_0-mlogloss:1.90284
[1423]	validation_0-mlogloss:1.90287
✅ Fold 1 Accuracy: 0.2154

================ Fold 2 ================
[0]	validation_0-mlogloss:1.94569
[200]	validation_0-mlogloss:1.91987
[400]	validation_0-mlogloss:1.91058
[600]	validation_0-mlogloss:1.90659
[800]	validation_0-mlogloss:1.90491
[1000]	validation_0-mlogloss:1.90431
[1200]	validation_0-mlogloss:1.90422
[1243]	validation_0-mlogloss:1.90421
✅ Fold 2 Accuracy: 0.2124

================ Fold 3 ================
[0]	validation_0-mlogloss:1.94564
[200]	validation_0-mlogloss:1.91894
[400]	validation_0-mlogloss:1.90917
[600]	validation_0-mlogloss:1.90455
[800]	validation_0-mlogloss:1.90255
[1000]	validation_0-mlogloss:1.90177
[1200]	validation_0-mlogloss:1.90141
[1400]	validation_0-mlogloss:1.90139
[1414]	validation_0-mlogloss:1.90137
✅ Fold 3 Accuracy: 0.2161

================ Fold 4 ================
[0]	validation_0-mlogloss:1.94566
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
Cell In[32], line 42
     23 XGB_model = xg.XGBClassifier(
     24     max_depth=12,
     25     colsample_bytree=0.467,
   (...)
     38     early_stopping_rounds=150,        
     39 )
     41 # 5. Fit with early stopping
---> 42 XGB_model.fit(
     43     X_tr,
     44     y_tr,
     45     sample_weight=sample_weights,
     46     eval_set=[(X_va, y_va)],
     47     verbose=200,
     48 )
     50 val_labels = XGB_model.predict(X_va)
     51 val_probas = XGB_model.predict_proba(X_va)

File c:\Users\robkr\anaconda3\Lib\site-packages\xgboost\core.py:729, in require_keyword_args.<locals>.throw_if.<locals>.inner_f(*args, **kwargs)
    727 for k, arg in zip(sig.parameters, args):
    728     kwargs[k] = arg
--> 729 return func(**kwargs)

File c:\Users\robkr\anaconda3\Lib\site-packages\xgboost\sklearn.py:1682, in XGBClassifier.fit(self, X, y, sample_weight, base_margin, eval_set, verbose, xgb_model, sample_weight_eval_set, base_margin_eval_set, feature_weights)
   1660 model, metric, params, feature_weights = self._configure_fit(
   1661     xgb_model, params, feature_weights
   1662 )
   1663 train_dmatrix, evals = _wrap_evaluation_matrices(
   1664     missing=self.missing,
   1665     X=X,
   (...)
   1679     feature_types=self.feature_types,
   1680 )
-> 1682 self._Booster = train(
   1683     params,
   1684     train_dmatrix,
   1685     self.get_num_boosting_rounds(),
   1686     evals=evals,
   1687     early_stopping_rounds=self.early_stopping_rounds,
   1688     evals_result=evals_result,
   1689     obj=obj,
   1690     custom_metric=metric,
   1691     verbose_eval=verbose,
   1692     xgb_model=model,
   1693     callbacks=self.callbacks,
   1694 )
   1696 if not callable(self.objective):
   1697     self.objective = params["objective"]

File c:\Users\robkr\anaconda3\Lib\site-packages\xgboost\core.py:729, in require_keyword_args.<locals>.throw_if.<locals>.inner_f(*args, **kwargs)
    727 for k, arg in zip(sig.parameters, args):
    728     kwargs[k] = arg
--> 729 return func(**kwargs)

File c:\Users\robkr\anaconda3\Lib\site-packages\xgboost\training.py:183, in train(params, dtrain, num_boost_round, evals, obj, maximize, early_stopping_rounds, evals_result, verbose_eval, xgb_model, callbacks, custom_metric)
    181 if cb_container.before_iteration(bst, i, dtrain, evals):
    182     break
--> 183 bst.update(dtrain, iteration=i, fobj=obj)
    184 if cb_container.after_iteration(bst, i, dtrain, evals):
    185     break

File c:\Users\robkr\anaconda3\Lib\site-packages\xgboost\core.py:2247, in Booster.update(self, dtrain, iteration, fobj)
   2243 self._assign_dmatrix_features(dtrain)
   2245 if fobj is None:
   2246     _check_call(
-> 2247         _LIB.XGBoosterUpdateOneIter(
   2248             self.handle, ctypes.c_int(iteration), dtrain.handle
   2249         )
   2250     )
   2251 else:
   2252     pred = self.predict(dtrain, output_margin=True, training=True)

KeyboardInterrupt: 
In [ ]:
map3_score = mapk(
    y.values.tolist(), 
    top3_preds.tolist(),  
    k=3
)
print(f"\n📊 Mean Average Precision @3 (MAP@3): {map3_score:.5f}")
📊 Mean Average Precision @3 (MAP@3): 0.35309
In [ ]:
preds1 = model.predict_proba(X_test)
preds2 = XGB_model.predict_proba(X_test)
In [ ]:
ensemble_test_preds = (0.2 * preds2 + 0.8 * preds1)

submission2 = pd.read_csv("sample_submission.csv")

top3_indices = np.argsort(ensemble_test_preds, axis=1)[:, -3:][:, ::-1]

top3_fertilizers = [' '.join(fertilizer_encoder.inverse_transform(row)) for row in top3_indices]

submission2 = pd.DataFrame({
    'ID': test['id'],
    'Fertilizer Name': top3_fertilizers
})

submission2.to_csv('submissionensemble10.csv', index=False)
In [ ]:
submission2.head()
Out[ ]:
ID Fertilizer_Name
0 750000 DAP 10-26-26 28-28
1 750001 17-17-17 20-20 10-26-26
2 750002 20-20 28-28 10-26-26
3 750003 14-35-14 DAP 17-17-17
4 750004 20-20 Urea 10-26-26
In [ ]:
train.head()
Out[ ]:
id Temparature Humidity Moisture Soil_Type Crop_Type Nitrogen Potassium Phosphorous Fertilizer_Name
0 0 37 70 36 1 8 36 4 5 4
1 1 27 69 65 4 4 30 6 18 4
2 2 29 63 32 4 4 24 12 16 2
3 3 35 62 54 4 0 39 12 4 0
4 4 35 58 43 3 6 37 2 16 5
In [ ]:
new_train = pd.read_csv('train.csv')
new_test = pd.read_csv('test.csv')
In [ ]:
new_train.head()
Out[ ]:
id Temparature Humidity Moisture Soil Type Crop Type Nitrogen Potassium Phosphorous Fertilizer Name
0 0 37 70 36 Clayey Sugarcane 36 4 5 28-28
1 1 27 69 65 Sandy Millets 30 6 18 28-28
2 2 29 63 32 Sandy Millets 24 12 16 17-17-17
3 3 35 62 54 Sandy Barley 39 12 4 10-26-26
4 4 35 58 43 Red Paddy 37 2 16 DAP
In [ ]:
X = new_train.drop(columns=['id', 'Fertilizer Name'], errors='ignore')
X_test = new_test.drop(columns=['id'], errors='ignore')

y = new_train["Fertilizer Name"]
In [ ]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, shuffle=True, random_state=SEED)
In [ ]:
from catboost import CatBoostClassifier, Pool

# Define categorical features
cat_features = ['Soil Type', 'Crop Type']  # Add other categorical columns if any

# Create CatBoost Pool with specified categorical features
train_pool = Pool(X_train, y_train, cat_features=cat_features)
val_pool = Pool(X_val, y_val, cat_features=cat_features)

# Initialize and train CatBoost model
cat_model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    loss_function='MultiClass',
    eval_metric='MultiClass',
    random_seed=SEED,
    gpu_ram_part=0.95,  # Fraction of GPU memory CatBoost may use
    task_type="GPU"     # Use GPU acceleration
)

# Fit the model
cat_model.fit(
    train_pool,
    eval_set=val_pool,
    early_stopping_rounds=50,
    verbose=200
)

# Get predictions
probs = cat_model.predict_proba(X_test)
top3_indices = np.argsort(probs, axis=1)[:, -3:][:, ::-1]
0:	learn: 1.9450360	test: 1.9450892	best: 1.9450892 (0)	total: 20.2ms	remaining: 20.2s
200:	learn: 1.9218504	test: 1.9294902	best: 1.9294902 (200)	total: 3.88s	remaining: 15.4s
400:	learn: 1.9116783	test: 1.9254933	best: 1.9254933 (400)	total: 7.71s	remaining: 11.5s
600:	learn: 1.9036608	test: 1.9233704	best: 1.9233704 (600)	total: 11.4s	remaining: 7.54s
800:	learn: 1.8966321	test: 1.9221321	best: 1.9221321 (800)	total: 15.1s	remaining: 3.76s
999:	learn: 1.8897356	test: 1.9212942	best: 1.9212942 (999)	total: 18.8s	remaining: 0us
bestTest = 1.921294167
bestIteration = 999
In [ ]:
preds3 = cat_model.predict_proba(X_test)

ensemble_test_preds = (0.33 * preds2 + 0.33 * preds1 + 0.34 * preds3)

submission3 = pd.read_csv("sample_submission.csv")

top3_indices = np.argsort(ensemble_test_preds, axis=1)[:, -3:][:, ::-1]

top3_fertilizers = [' '.join(fertilizer_encoder.inverse_transform(row)) for row in top3_indices]

submission3 = pd.DataFrame({
    'ID': test['id'],
    'Fertilizer Name': top3_fertilizers
})

submission3.to_csv('submissionensemble_cats_xgb_optuna.csv', index=False)
In [ ]:
submission3
Out[ ]:
ID Fertilizer_Name
0 750000 DAP 10-26-26 28-28
1 750001 17-17-17 20-20 10-26-26
2 750002 20-20 28-28 10-26-26
3 750003 14-35-14 DAP 17-17-17
4 750004 20-20 Urea 10-26-26
... ... ...
249995 999995 Urea 17-17-17 28-28
249996 999996 14-35-14 10-26-26 Urea
249997 999997 DAP Urea 14-35-14
249998 999998 10-26-26 28-28 17-17-17
249999 999999 14-35-14 17-17-17 20-20

250000 rows × 2 columns

In [ ]:
print("TODO: submit submission2 and submission3")
TODO: submit submission2 and submission3
In [ ]:
# Submission using only the fold-trained XGBoost model's probabilities (preds2)
ensemble_test_preds = preds2

top3_indices = np.argsort(ensemble_test_preds, axis=1)[:, -3:][:, ::-1]

top3_fertilizers = [' '.join(fertilizer_encoder.inverse_transform(row)) for row in top3_indices]

submission5 = pd.DataFrame({
    'ID': test['id'],
    'Fertilizer Name': top3_fertilizers
})

submission5.to_csv('submisX.csv', index=False)