#6

Kaggle competition: [link]

Entry by Robin R.P.M. Kras

⭐ 1. Introduction & Overview

Your goal: predict the top 3 best-suited fertilizers for a given combination of soil, crop, and weather conditions.
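Submissions list up to three fertilizers per row and are scored with MAP@3, the metric used throughout this notebook: a row earns 1/rank of the first correct guess, or 0 if none of the three match. A tiny illustration of that scoring rule (the labels here are just examples):

# Illustration of the MAP@3 scoring rule for a single row (example labels only)
def row_score(true_label, top3):
    return 1.0 / (top3.index(true_label) + 1) if true_label in top3 else 0.0

print(row_score("28-28", ["28-28", "DAP", "Urea"]))   # 1.0  -> correct at rank 1
print(row_score("28-28", ["DAP", "28-28", "Urea"]))   # 0.5  -> correct at rank 2
print(row_score("28-28", ["DAP", "Urea", "20-20"]))   # 0.0  -> not in the top 3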

🔹 2. Import Libraries & Set Up

In [1]:
# General
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# Machine Learning
import xgboost as xg

from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, mean_absolute_error, mean_squared_error, r2_score, root_mean_squared_error, roc_auc_score
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import make_pipeline

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

from imblearn.over_sampling import SMOTE

# Feature Importance & Explainability
import shap

# Settings
import warnings
warnings.filterwarnings("ignore")

# Set random seed for reproducibility
SEED = 42
np.random.seed(SEED)

print("Libraries loaded. Ready to go!")
Libraries loaded. Ready to go!

🔹 3. Load & Explore Data

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

print(f"Train shape: {train.shape}, Test shape: {test.shape}")
Train shape: (750000, 10), Test shape: (250000, 9)
In [3]:
train.head()
Out[3]:
id Temparature Humidity Moisture Soil Type Crop Type Nitrogen Potassium Phosphorous Fertilizer Name
0 0 37 70 36 Clayey Sugarcane 36 4 5 28-28
1 1 27 69 65 Sandy Millets 30 6 18 28-28
2 2 29 63 32 Sandy Millets 24 12 16 17-17-17
3 3 35 62 54 Sandy Barley 39 12 4 10-26-26
4 4 35 58 43 Red Paddy 37 2 16 DAP
In [4]:
fertilizers = train['Fertilizer Name'].unique()
for i in fertilizers:
    print(i)
28-28
17-17-17
10-26-26
DAP
20-20
14-35-14
Urea
In [5]:
train.isnull().sum()
Out[5]:
id                 0
Temparature        0
Humidity           0
Moisture           0
Soil Type          0
Crop Type          0
Nitrogen           0
Potassium          0
Phosphorous        0
Fertilizer Name    0
dtype: int64
In [6]:
test.isnull().sum()
Out[6]:
id             0
Temparature    0
Humidity       0
Moisture       0
Soil Type      0
Crop Type      0
Nitrogen       0
Potassium      0
Phosphorous    0
dtype: int64
In [7]:
train.columns = train.columns.str.replace(' ', '_')
test.columns = test.columns.str.replace(' ', '_')
In [8]:
CATS = []
NUMS = []
In [9]:
FEATURES = []
In [10]:
for col in train.columns:
    FEATURES.append(col)
In [11]:
for c in FEATURES:
    if train[c].dtype == "object":
        CATS.append(c)

for c in FEATURES:
    if c not in CATS:
        NUMS.append(c)
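For reference, pandas can build the same two lists in one step with select_dtypes; this is just an equivalent sketch, not what the notebook ran:

# Equivalent one-liners (sketch): object columns as CATS, everything else as NUMS
CATS = train.select_dtypes(include="object").columns.tolist()
NUMS = [c for c in train.columns if c not in CATS]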

🔹 4. Data Visualization & EDA

In [12]:
def plot_nums(dataframe):
    float_cols = [col for col in dataframe.columns if dataframe[col].dtype == "float64" or dataframe[col].dtype == "int64"]

    cols_per_row = 3
    num_plots = len(float_cols)
    rows = (num_plots // cols_per_row) + (num_plots % cols_per_row > 0) 

    fig, axes = plt.subplots(rows, cols_per_row, figsize=(15, 5 * rows)) 
    axes = axes.flatten()  

    for idx, col in enumerate(float_cols):
        sns.histplot(dataframe[col], bins=50, kde=True, ax=axes[idx])
        axes[idx].set_title(f"Distribution of {col}")

    for i in range(idx + 1, len(axes)):
        fig.delaxes(axes[i])

    plt.tight_layout()
    plt.show()
In [13]:
if False:
    plot_nums(train.drop(columns=['id', 'Fertilizer_Name']))

🔹 5. Feature Engineering

In [14]:
train.dtypes
Out[14]:
id                  int64
Temparature         int64
Humidity            int64
Moisture            int64
Soil_Type          object
Crop_Type          object
Nitrogen            int64
Potassium           int64
Phosphorous         int64
Fertilizer_Name    object
dtype: object
In [15]:
# train['Climate'] = (train['Humidity'] + train['Temparature'] + train['Moisture']) / 3
# test['Climate'] = (test['Humidity'] + test['Temparature'] + test['Moisture']) / 3
In [16]:
# 'Fertilizer_Name' already sits in CATS (it has object dtype), so it gets encoded here too
encoders = {}

for col in CATS:
    le = LabelEncoder()
    train[col] = le.fit_transform(train[col])
    if col in test.columns:
        # Reuse the encoder fitted on train so test receives consistent codes
        test[col] = le.transform(test[col])
    encoders[col] = le
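As a quick sanity check, the encoders dict kept above exposes the label-to-integer mapping for the target (output not shown):

# Inspect the Fertilizer_Name -> integer mapping produced above
target_le = encoders['Fertilizer_Name']
print(dict(zip(target_le.classes_, target_le.transform(target_le.classes_))))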
In [17]:
train.head()
Out[17]:
id Temparature Humidity Moisture Soil_Type Crop_Type Nitrogen Potassium Phosphorous Fertilizer_Name
0 0 37 70 36 1 8 36 4 5 4
1 1 27 69 65 4 4 30 6 18 4
2 2 29 63 32 4 4 24 12 16 2
3 3 35 62 54 4 0 39 12 4 0
4 4 35 58 43 3 6 37 2 16 5

🔹 6. XGBoost, KFold, CV

In [18]:
from lightgbm import LGBMClassifier
In [19]:
X = train.drop(columns=['id', 'Fertilizer_Name'], errors='ignore')
X_test = test.drop(columns=['id'], errors='ignore')

y = train["Fertilizer_Name"]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, shuffle=True, random_state=SEED)

def quick_eval(model, X_train, y_train):
    model.fit(X_train, y_train)

    predictions_val = model.predict(X_val)

    kf = KFold(n_splits=5, shuffle=True, random_state=SEED)

    # Multiclass target, so score with accuracy rather than a regression metric
    cv_scores = cross_val_score(model, X_train, y_train, cv=kf, scoring='accuracy')

    val_acc = accuracy_score(y_val, predictions_val)
    print(f"*** {model.__class__.__name__} ***")
    print(f"Validation accuracy: {val_acc:.4f}")
    print(f"Mean CV accuracy: {np.mean(cv_scores):.4f}")
    print(f"CV Std Dev: {np.std(cv_scores):.4f}")

    stars = len(model.__class__.__name__) + 8
    print("*" * stars)
    print("\n")

# quick_eval(xg.XGBClassifier(), X_train, y_train)
# quick_eval(LogisticRegression(), X_train, y_train)
# quick_eval(LGBMClassifier(), X_train, y_train)

# (Disabled) quick single-label baseline submission using one of the models above
if False:
    model = xg.XGBClassifier()
    model.fit(X, y)
    predictions = model.predict(X_test)

    sub1 = pd.read_csv("sample_submission.csv")

    # Note: predictions are encoded labels; decode them before submitting
    sub1['Fertilizer Name'] = predictions

    sub1.to_csv("xgboost.csv", index=False)

    print("Sub shape:", sub1.shape)
    sub1.head()
In [20]:
def mapk(actual, predicted, k=3):
    """
    Mean average precision at k, assuming a single relevant label per row.
    actual: array-like of true labels
    predicted: array-like of lists of predicted labels (top k, best first)
    """
    score = 0.0
    for a, p in zip(actual, predicted):
        p = list(p)[:k]
        # 1/rank of the first (and only) correct label, 0 if it is not in the top k
        score += 1.0 / (p.index(a) + 1) if a in p else 0.0
    return score / len(actual)
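A quick sanity check of mapk on toy values (purely illustrative):

# First row correct at rank 1, second at rank 2, third missed: (1.0 + 0.5 + 0.0) / 3 = 0.5
toy_actual = [0, 1, 2]
toy_predicted = [[0, 3, 4], [3, 1, 4], [3, 4, 5]]
print(mapk(toy_actual, toy_predicted, k=3))  # 0.5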
In [21]:
train.head()
Out[21]:
id Temparature Humidity Moisture Soil_Type Crop_Type Nitrogen Potassium Phosphorous Fertilizer_Name
0 0 37 70 36 1 8 36 4 5 4
1 1 27 69 65 4 4 30 6 18 4
2 2 29 63 32 4 4 24 12 16 2
3 3 35 62 54 4 0 39 12 4 0
4 4 35 58 43 3 6 37 2 16 5

Experimenting with extra engineered features (kept disabled below)

In [22]:
if False:
    ###
    soil_moisture_mean = train.groupby('Soil_Type')['Moisture'].transform('mean')
    train['Moisture_Deviation'] = train['Moisture'] - soil_moisture_mean

    soil_moisture_mean_test = test.groupby('Soil_Type')['Moisture'].transform('mean')
    test['Moisture_Deviation'] = test['Moisture'] - soil_moisture_mean_test
    ###

    train['Climate_Index'] = (
        0.3 * train['Temparature'] +
        0.1 * train['Humidity'] +
        0.6 * train['Moisture']
    )

    test['Climate_Index'] = (
        0.3 * test['Temparature'] +
        0.1 * test['Humidity'] +
        0.6 * test['Moisture']
    )

    train['Nitrogen_Sq'] = train['Nitrogen'] ** 2
    train['Temp_Moisture'] = train['Temparature'] * train['Moisture']

    test['Nitrogen_Sq'] = test['Nitrogen'] ** 2
    test['Temp_Moisture'] = test['Temparature'] * test['Moisture']

    train['N_K_ratio'] = train['Nitrogen'] / (train['Potassium'] + 1)
    train['N_P_ratio'] = train['Nitrogen'] / (train['Phosphorous'] + 1)
    train['K_P_ratio'] = train['Potassium'] / (train['Phosphorous'] + 1)

    test['N_K_ratio'] = test['Nitrogen'] / (test['Potassium'] + 1)
    test['N_P_ratio'] = test['Nitrogen'] / (test['Phosphorous'] + 1)
    test['K_P_ratio'] = test['Potassium'] / (test['Phosphorous'] + 1)
In [23]:
X = train.drop(columns=['id', 'Fertilizer_Name'], errors='ignore')
X_test = test.drop(columns=['id'], errors='ignore')

y = train["Fertilizer_Name"]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, shuffle=True, random_state=SEED)
In [24]:
X_train.head()
Out[24]:
Temparature Humidity Moisture Soil_Type Crop_Type Nitrogen Potassium Phosphorous
453635 28 51 47 2 2 20 17 24
11651 33 62 30 4 0 7 0 6
431999 38 59 41 2 6 24 11 42
529211 26 52 57 2 10 27 17 19
110925 37 61 35 0 6 25 14 16
In [26]:
import optuna

def mapk_eval(y_true, y_pred_proba, k=3):
    top_k = np.argsort(y_pred_proba, axis=1)[:, -k:][:, ::-1]
    return mapk(y_true, top_k.tolist(), k=k)

def objective(trial):
    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 100, 400),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 5),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 5),
        'random_state': SEED,
        'use_label_encoder': False,
        'eval_metric': 'mlogloss'
    }
    model = xg.XGBClassifier(**params)
    model.fit(X_train, y_train)
    y_pred_proba = model.predict_proba(X_val)
    score = mapk_eval(y_val.values, y_pred_proba, k=3)
    return score

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30)

print("Best MAP@3:", study.best_value)
print("Best params:", study.best_params)
[I 2025-06-11 17:41:09,298] A new study created in memory with name: no-name-dd5b1a35-a095-4ac2-a92c-8fe87d4dee67
[I 2025-06-11 17:41:36,577] Trial 0 finished with value: 0.3231222222222909 and parameters: {'max_depth': 8, 'learning_rate': 0.06326287765150974, 'n_estimators': 162, 'subsample': 0.6821233517625884, 'colsample_bytree': 0.9638359504388128, 'gamma': 2.4792001687350584, 'reg_alpha': 1.0189751290331739, 'reg_lambda': 4.768573075328976}. Best is trial 0 with value: 0.3231222222222909.
[I 2025-06-11 17:41:55,650] Trial 1 finished with value: 0.31536555555561735 and parameters: {'max_depth': 6, 'learning_rate': 0.16841103012214031, 'n_estimators': 129, 'subsample': 0.8081429739265287, 'colsample_bytree': 0.8205096527873423, 'gamma': 3.3963004754342645, 'reg_alpha': 4.3330543885150945, 'reg_lambda': 2.569170589281376}. Best is trial 0 with value: 0.3231222222222909.
[I 2025-06-11 17:42:17,485] Trial 2 finished with value: 0.32085333333340355 and parameters: {'max_depth': 6, 'learning_rate': 0.16527399809340892, 'n_estimators': 142, 'subsample': 0.8894209972128635, 'colsample_bytree': 0.6185529510142461, 'gamma': 2.02614326399173, 'reg_alpha': 1.4271732057664281, 'reg_lambda': 1.998810820263441}. Best is trial 0 with value: 0.3231222222222909.
[I 2025-06-11 17:42:40,289] Trial 3 finished with value: 0.3224188888889598 and parameters: {'max_depth': 4, 'learning_rate': 0.2230595671108154, 'n_estimators': 139, 'subsample': 0.7703989917310565, 'colsample_bytree': 0.7062399714577376, 'gamma': 0.2054877113843251, 'reg_alpha': 0.3159713492248678, 'reg_lambda': 1.425548894820482}. Best is trial 0 with value: 0.3231222222222909.
[I 2025-06-11 17:43:21,373] Trial 4 finished with value: 0.3284222222222986 and parameters: {'max_depth': 8, 'learning_rate': 0.11729039181348787, 'n_estimators': 292, 'subsample': 0.7868763316394609, 'colsample_bytree': 0.9479985598060687, 'gamma': 1.837477551754601, 'reg_alpha': 0.3794920771432825, 'reg_lambda': 0.87934828545284}. Best is trial 4 with value: 0.3284222222222986.
[I 2025-06-11 17:44:03,017] Trial 5 finished with value: 0.3200177777778431 and parameters: {'max_depth': 9, 'learning_rate': 0.19739168479199648, 'n_estimators': 372, 'subsample': 0.9157227648549711, 'colsample_bytree': 0.7188536462639663, 'gamma': 2.816024478482395, 'reg_alpha': 3.837768348497872, 'reg_lambda': 3.3131651684887404}. Best is trial 4 with value: 0.3284222222222986.
[I 2025-06-11 17:44:19,037] Trial 6 finished with value: 0.31562111111117375 and parameters: {'max_depth': 5, 'learning_rate': 0.12010698415562772, 'n_estimators': 111, 'subsample': 0.9810691798381612, 'colsample_bytree': 0.9140888906728758, 'gamma': 1.915539709422021, 'reg_alpha': 2.243296498062493, 'reg_lambda': 4.661617348426917}. Best is trial 4 with value: 0.3284222222222986.
[I 2025-06-11 17:45:08,842] Trial 7 finished with value: 0.3032555555556048 and parameters: {'max_depth': 4, 'learning_rate': 0.01099005409111613, 'n_estimators': 381, 'subsample': 0.6727749070349011, 'colsample_bytree': 0.9134688747252768, 'gamma': 0.3710408507772206, 'reg_alpha': 0.29748030502547484, 'reg_lambda': 4.423774568327871}. Best is trial 4 with value: 0.3284222222222986.
[I 2025-06-11 17:45:40,582] Trial 8 finished with value: 0.32276222222229256 and parameters: {'max_depth': 6, 'learning_rate': 0.059865987470583186, 'n_estimators': 221, 'subsample': 0.7582158086417364, 'colsample_bytree': 0.8404512979236067, 'gamma': 1.9623022408098918, 'reg_alpha': 1.206373500960709, 'reg_lambda': 1.419453652982462}. Best is trial 4 with value: 0.3284222222222986.
[I 2025-06-11 17:46:23,280] Trial 9 finished with value: 0.3264944444445166 and parameters: {'max_depth': 8, 'learning_rate': 0.027302788769481147, 'n_estimators': 229, 'subsample': 0.7529154197261488, 'colsample_bytree': 0.6529186522939491, 'gamma': 1.1240570771678426, 'reg_alpha': 0.6121402665107212, 'reg_lambda': 4.099642876119921}. Best is trial 4 with value: 0.3284222222222986.
[I 2025-06-11 17:46:53,197] Trial 10 finished with value: 0.3119122222222805 and parameters: {'max_depth': 10, 'learning_rate': 0.2998616713509893, 'n_estimators': 310, 'subsample': 0.616720220647669, 'colsample_bytree': 0.9900324135783889, 'gamma': 4.829883864621441, 'reg_alpha': 2.86151565825943, 'reg_lambda': 0.2342337106652529}. Best is trial 4 with value: 0.3284222222222986.
[I 2025-06-11 17:47:31,022] Trial 11 finished with value: 0.3343077777778579 and parameters: {'max_depth': 8, 'learning_rate': 0.10141428721696402, 'n_estimators': 272, 'subsample': 0.8380054019273669, 'colsample_bytree': 0.6006088545378645, 'gamma': 1.0967787479159925, 'reg_alpha': 0.07143352213340626, 'reg_lambda': 0.004677646038029781}. Best is trial 11 with value: 0.3343077777778579.
[I 2025-06-11 17:48:10,152] Trial 12 finished with value: 0.33292222222230466 and parameters: {'max_depth': 8, 'learning_rate': 0.1164193628787389, 'n_estimators': 296, 'subsample': 0.8469524872130173, 'colsample_bytree': 0.756160270571199, 'gamma': 1.126997178635461, 'reg_alpha': 1.889708207869898, 'reg_lambda': 0.04509237921231746}. Best is trial 11 with value: 0.3343077777778579.
[I 2025-06-11 17:48:50,703] Trial 13 finished with value: 0.3327977777778562 and parameters: {'max_depth': 10, 'learning_rate': 0.1079885059779757, 'n_estimators': 282, 'subsample': 0.8642956813550425, 'colsample_bytree': 0.758807855846258, 'gamma': 0.929319860398242, 'reg_alpha': 2.101194782635285, 'reg_lambda': 0.0720654324119212}. Best is trial 11 with value: 0.3343077777778579.
[I 2025-06-11 17:49:35,590] Trial 14 finished with value: 0.3316377777778581 and parameters: {'max_depth': 7, 'learning_rate': 0.08973884386953482, 'n_estimators': 346, 'subsample': 0.8401850492620782, 'colsample_bytree': 0.6129635658250652, 'gamma': 1.1026587783031196, 'reg_alpha': 3.097882505594469, 'reg_lambda': 0.7020390385613303}. Best is trial 11 with value: 0.3343077777778579.
[I 2025-06-11 17:50:00,539] Trial 15 finished with value: 0.3167311111111737 and parameters: {'max_depth': 9, 'learning_rate': 0.14071780808295184, 'n_estimators': 252, 'subsample': 0.9188314560970854, 'colsample_bytree': 0.6782619837824262, 'gamma': 3.9184671621017677, 'reg_alpha': 1.848825814992794, 'reg_lambda': 0.7976018481325824}. Best is trial 11 with value: 0.3343077777778579.
[I 2025-06-11 17:50:21,306] Trial 16 finished with value: 0.3270488888889637 and parameters: {'max_depth': 7, 'learning_rate': 0.2453457898102919, 'n_estimators': 189, 'subsample': 0.9904395333844096, 'colsample_bytree': 0.7700252445735166, 'gamma': 0.6994091133884861, 'reg_alpha': 3.57368344372309, 'reg_lambda': 2.7727524033511157}. Best is trial 11 with value: 0.3343077777778579.
[I 2025-06-11 17:51:04,023] Trial 17 finished with value: 0.3311422222223003 and parameters: {'max_depth': 9, 'learning_rate': 0.07983509022972851, 'n_estimators': 318, 'subsample': 0.8352748994437167, 'colsample_bytree': 0.8578981187981797, 'gamma': 1.3823289286604317, 'reg_alpha': 4.634757821379262, 'reg_lambda': 1.7479536120917665}. Best is trial 11 with value: 0.3343077777778579.
[I 2025-06-11 17:51:49,971] Trial 18 finished with value: 0.32899888888896345 and parameters: {'max_depth': 7, 'learning_rate': 0.046557345482992424, 'n_estimators': 265, 'subsample': 0.722134013823966, 'colsample_bytree': 0.7570135557682693, 'gamma': 0.018843173301881677, 'reg_alpha': 1.721014203810751, 'reg_lambda': 0.42030265083030405}. Best is trial 11 with value: 0.3343077777778579.
[I 2025-06-11 17:52:24,791] Trial 19 finished with value: 0.3151444444445073 and parameters: {'max_depth': 3, 'learning_rate': 0.14308686291690068, 'n_estimators': 336, 'subsample': 0.9417566294561698, 'colsample_bytree': 0.6545149117279865, 'gamma': 1.5231548436377371, 'reg_alpha': 0.9197089319250962, 'reg_lambda': 1.1776366740093365}. Best is trial 11 with value: 0.3343077777778579.
[I 2025-06-11 17:52:48,544] Trial 20 finished with value: 0.32070777777784526 and parameters: {'max_depth': 8, 'learning_rate': 0.1905085685264455, 'n_estimators': 227, 'subsample': 0.8222165062977569, 'colsample_bytree': 0.6002849729898877, 'gamma': 2.6446219490853693, 'reg_alpha': 2.7049477362855354, 'reg_lambda': 3.354669886543953}. Best is trial 11 with value: 0.3343077777778579.
[I 2025-06-11 17:53:33,877] Trial 21 finished with value: 0.3350211111111929 and parameters: {'max_depth': 10, 'learning_rate': 0.11248115536930409, 'n_estimators': 278, 'subsample': 0.8668532639254519, 'colsample_bytree': 0.7773126206273325, 'gamma': 0.8089080347514379, 'reg_alpha': 2.287551543519361, 'reg_lambda': 0.21171740386923416}. Best is trial 21 with value: 0.3350211111111929.
[I 2025-06-11 17:54:20,902] Trial 22 finished with value: 0.3345766666667466 and parameters: {'max_depth': 10, 'learning_rate': 0.09391669624087032, 'n_estimators': 271, 'subsample': 0.8761041182170525, 'colsample_bytree': 0.7868708543511961, 'gamma': 0.671527886064389, 'reg_alpha': 2.413656615251328, 'reg_lambda': 0.10315529276170246}. Best is trial 21 with value: 0.3350211111111929.
[I 2025-06-11 17:55:11,412] Trial 23 finished with value: 0.33531888888896916 and parameters: {'max_depth': 10, 'learning_rate': 0.08058832222297987, 'n_estimators': 268, 'subsample': 0.8878145949016356, 'colsample_bytree': 0.7945209987411344, 'gamma': 0.6025081681020137, 'reg_alpha': 3.1628674276138575, 'reg_lambda': 0.5475825279790727}. Best is trial 23 with value: 0.33531888888896916.
[I 2025-06-11 17:56:02,814] Trial 24 finished with value: 0.33481222222230106 and parameters: {'max_depth': 10, 'learning_rate': 0.07632962492284197, 'n_estimators': 247, 'subsample': 0.8828823700576881, 'colsample_bytree': 0.8061121219495095, 'gamma': 0.5619959296860586, 'reg_alpha': 3.207741677387607, 'reg_lambda': 0.5402723006951022}. Best is trial 23 with value: 0.33531888888896916.
[I 2025-06-11 17:56:46,908] Trial 25 finished with value: 0.33146000000007747 and parameters: {'max_depth': 10, 'learning_rate': 0.044158678870159264, 'n_estimators': 196, 'subsample': 0.9539433757198061, 'colsample_bytree': 0.8740205213763044, 'gamma': 0.4914488530822796, 'reg_alpha': 3.3348708028359657, 'reg_lambda': 0.6339574584605174}. Best is trial 23 with value: 0.33531888888896916.
[I 2025-06-11 17:57:32,428] Trial 26 finished with value: 0.3348277777778593 and parameters: {'max_depth': 9, 'learning_rate': 0.07596125416611232, 'n_estimators': 203, 'subsample': 0.8933575475802615, 'colsample_bytree': 0.8069483676293034, 'gamma': 0.014269500839304783, 'reg_alpha': 3.93767993130073, 'reg_lambda': 1.1052745516298361}. Best is trial 23 with value: 0.33531888888896916.
[I 2025-06-11 17:58:15,779] Trial 27 finished with value: 0.3302011111111886 and parameters: {'max_depth': 9, 'learning_rate': 0.04148758411418811, 'n_estimators': 189, 'subsample': 0.9094973578765564, 'colsample_bytree': 0.7229847571564367, 'gamma': 0.001878902388947612, 'reg_alpha': 4.0162576166031805, 'reg_lambda': 1.9076972350531038}. Best is trial 23 with value: 0.33531888888896916.
[I 2025-06-11 17:58:42,085] Trial 28 finished with value: 0.3272244444445181 and parameters: {'max_depth': 9, 'learning_rate': 0.1426550580298173, 'n_estimators': 211, 'subsample': 0.9513131954283298, 'colsample_bytree': 0.8272066434365707, 'gamma': 1.5056017008099156, 'reg_alpha': 4.933690282059899, 'reg_lambda': 1.0899001839176081}. Best is trial 23 with value: 0.33531888888896916.
[I 2025-06-11 17:59:22,774] Trial 29 finished with value: 0.33271777777785644 and parameters: {'max_depth': 10, 'learning_rate': 0.0695946642352586, 'n_estimators': 172, 'subsample': 0.9086247963279899, 'colsample_bytree': 0.8934964105380269, 'gamma': 0.30599557927289756, 'reg_alpha': 4.090575478084377, 'reg_lambda': 2.1752726368794715}. Best is trial 23 with value: 0.33531888888896916.
Best MAP@3: 0.33531888888896916
Best params: {'max_depth': 10, 'learning_rate': 0.08058832222297987, 'n_estimators': 268, 'subsample': 0.8878145949016356, 'colsample_bytree': 0.7945209987411344, 'gamma': 0.6025081681020137, 'reg_alpha': 3.1628674276138575, 'reg_lambda': 0.5475825279790727}

Best MAP@3: 0.33507222222230404 (Optuna-tuned XGBClassifier, extra feature: Moisture_Deviation)

Best MAP@3: 0.3415511111111975 (Optuna-tuned XGBClassifier, no extra features)

In [27]:
og_train = pd.read_csv('train.csv')
og_test = pd.read_csv('test.csv')
In [28]:
fertilizer_encoder = LabelEncoder()
original_fertilizer_names = og_train['Fertilizer Name'].unique()
fertilizer_encoder.fit(original_fertilizer_names)
Out[28]:
LabelEncoder()
In [29]:
submission1 = pd.read_csv("sample_submission.csv")

model = xg.XGBClassifier(**study.best_params)
model.fit(X, y)

probs = model.predict_proba(X_test)
top3_indices = np.argsort(probs, axis=1)[:, -3:][:, ::-1]

top3_fertilizers = [' '.join(fertilizer_encoder.inverse_transform(row)) for row in top3_indices]

submission = pd.DataFrame({
    'ID': test['id'],
    'Fertilizer_Name': top3_fertilizers
})

submission.to_csv('submission500.csv', index=False)
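The argsort slice above is what turns the probability matrix into top-3 class indices, highest probability first; a tiny illustration with made-up numbers:

# Made-up probabilities (2 rows, 4 classes) to show the top-3 index selection
toy_probs = np.array([[0.05, 0.60, 0.25, 0.10],
                      [0.55, 0.05, 0.15, 0.25]])
print(np.argsort(toy_probs, axis=1)[:, -3:][:, ::-1])
# [[1 2 3]
#  [0 3 2]]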
In [30]:
submission
Out[30]:
ID Fertilizer_Name
0 750000 DAP 10-26-26 28-28
1 750001 17-17-17 20-20 10-26-26
2 750002 20-20 28-28 10-26-26
3 750003 14-35-14 DAP Urea
4 750004 20-20 10-26-26 28-28
... ... ...
249995 999995 17-17-17 28-28 20-20
249996 999996 10-26-26 14-35-14 17-17-17
249997 999997 DAP 14-35-14 Urea
249998 999998 10-26-26 17-17-17 28-28
249999 999999 14-35-14 17-17-17 20-20

250000 rows × 2 columns

In [31]:
from sklearn.model_selection import StratifiedKFold
from collections import Counter
In [32]:
# 1. Compute class weights globally (for reference; per-fold weights are recomputed inside the loop)
counter_full = Counter(y)
max_count_full = max(counter_full.values())
class_weights_full = {cls: max_count_full / count for cls, count in counter_full.items()}

# 2. Stratified CV
kfold = StratifiedKFold(n_splits=15, shuffle=True, random_state=42)
fold_accuracies = []
oof_preds = np.zeros((X.shape[0], len(np.unique(y))))

for fold, (train_idx, val_idx) in enumerate(kfold.split(X, y), 1):
    print(f"\n================ Fold {fold} ================")

    X_tr, X_va = X.iloc[train_idx], X.iloc[val_idx]
    y_tr, y_va = y.iloc[train_idx], y.iloc[val_idx]

    # 3. Compute per-instance weights
    counter_fold = Counter(y_tr)
    max_count_fold = max(counter_fold.values())
    sample_weights = y_tr.map(lambda cls: max_count_fold / counter_fold[cls])

    # 4. Instantiate XGBoost model
    XGB_model = xg.XGBClassifier(
        max_depth=12,
        colsample_bytree=0.467,
        subsample=0.86,
        n_estimators=4000,
        learning_rate=0.03,
        gamma=0.26,
        max_delta_step=4,
        reg_alpha=2.7,
        reg_lambda=1.4,
        objective='multi:softprob',
        random_state=13,
        enable_categorical=True,
        tree_method='hist',     
        device='cuda',
        early_stopping_rounds=150,        
    )

    # 5. Fit with early stopping
    XGB_model.fit(
        X_tr,
        y_tr,
        sample_weight=sample_weights,
        eval_set=[(X_va, y_va)],
        verbose=200,
    )

    val_labels = XGB_model.predict(X_va)
    val_probas = XGB_model.predict_proba(X_va)

    oof_preds[val_idx] = val_probas
    acc = accuracy_score(y_va, val_labels)
    fold_accuracies.append(acc)
    print(f"✅ Fold {fold} Accuracy: {acc:.4f}")

# 6. Final CV metrics
print("\n🎯 Mean CV Accuracy:", np.mean(fold_accuracies))
print("📈 Std CV Accuracy:", np.std(fold_accuracies))

# Get Top-3 predicted class indices
top3_preds = np.argsort(oof_preds, axis=1)[:, ::-1][:, :3]
================ Fold 1 ================
[0]	validation_0-mlogloss:1.94566
[200]	validation_0-mlogloss:1.91935
[400]	validation_0-mlogloss:1.90982
[600]	validation_0-mlogloss:1.90566
[800]	validation_0-mlogloss:1.90391
[1000]	validation_0-mlogloss:1.90314
[1200]	validation_0-mlogloss:1.90276
[1400]	validation_0-mlogloss:1.90284
[1423]	validation_0-mlogloss:1.90287
✅ Fold 1 Accuracy: 0.2154

================ Fold 2 ================
[0]	validation_0-mlogloss:1.94569
[200]	validation_0-mlogloss:1.91987
[400]	validation_0-mlogloss:1.91058
[600]	validation_0-mlogloss:1.90659
[800]	validation_0-mlogloss:1.90491
[1000]	validation_0-mlogloss:1.90431
[1200]	validation_0-mlogloss:1.90422
[1243]	validation_0-mlogloss:1.90421
✅ Fold 2 Accuracy: 0.2124

================ Fold 3 ================
[0]	validation_0-mlogloss:1.94564
[200]	validation_0-mlogloss:1.91894
[400]	validation_0-mlogloss:1.90917
[600]	validation_0-mlogloss:1.90455
[800]	validation_0-mlogloss:1.90255
[1000]	validation_0-mlogloss:1.90177
[1200]	validation_0-mlogloss:1.90141
[1400]	validation_0-mlogloss:1.90139
[1414]	validation_0-mlogloss:1.90137
✅ Fold 3 Accuracy: 0.2161

================ Fold 4 ================
[0]	validation_0-mlogloss:1.94566
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
Cell In[32], line 42
     23 XGB_model = xg.XGBClassifier(
     24     max_depth=12,
     25     colsample_bytree=0.467,
   (...)
     38     early_stopping_rounds=150,        
     39 )
     41 # 5. Fit with early stopping
---> 42 XGB_model.fit(
     43     X_tr,
     44     y_tr,
     45     sample_weight=sample_weights,
     46     eval_set=[(X_va, y_va)],
     47     verbose=200,
     48 )
     50 val_labels = XGB_model.predict(X_va)
     51 val_probas = XGB_model.predict_proba(X_va)

File c:\Users\robkr\anaconda3\Lib\site-packages\xgboost\core.py:729, in require_keyword_args.<locals>.throw_if.<locals>.inner_f(*args, **kwargs)
    727 for k, arg in zip(sig.parameters, args):
    728     kwargs[k] = arg
--> 729 return func(**kwargs)

File c:\Users\robkr\anaconda3\Lib\site-packages\xgboost\sklearn.py:1682, in XGBClassifier.fit(self, X, y, sample_weight, base_margin, eval_set, verbose, xgb_model, sample_weight_eval_set, base_margin_eval_set, feature_weights)
   1660 model, metric, params, feature_weights = self._configure_fit(
   1661     xgb_model, params, feature_weights
   1662 )
   1663 train_dmatrix, evals = _wrap_evaluation_matrices(
   1664     missing=self.missing,
   1665     X=X,
   (...)
   1679     feature_types=self.feature_types,
   1680 )
-> 1682 self._Booster = train(
   1683     params,
   1684     train_dmatrix,
   1685     self.get_num_boosting_rounds(),
   1686     evals=evals,
   1687     early_stopping_rounds=self.early_stopping_rounds,
   1688     evals_result=evals_result,
   1689     obj=obj,
   1690     custom_metric=metric,
   1691     verbose_eval=verbose,
   1692     xgb_model=model,
   1693     callbacks=self.callbacks,
   1694 )
   1696 if not callable(self.objective):
   1697     self.objective = params["objective"]

File c:\Users\robkr\anaconda3\Lib\site-packages\xgboost\core.py:729, in require_keyword_args.<locals>.throw_if.<locals>.inner_f(*args, **kwargs)
    727 for k, arg in zip(sig.parameters, args):
    728     kwargs[k] = arg
--> 729 return func(**kwargs)

File c:\Users\robkr\anaconda3\Lib\site-packages\xgboost\training.py:183, in train(params, dtrain, num_boost_round, evals, obj, maximize, early_stopping_rounds, evals_result, verbose_eval, xgb_model, callbacks, custom_metric)
    181 if cb_container.before_iteration(bst, i, dtrain, evals):
    182     break
--> 183 bst.update(dtrain, iteration=i, fobj=obj)
    184 if cb_container.after_iteration(bst, i, dtrain, evals):
    185     break

File c:\Users\robkr\anaconda3\Lib\site-packages\xgboost\core.py:2247, in Booster.update(self, dtrain, iteration, fobj)
   2243 self._assign_dmatrix_features(dtrain)
   2245 if fobj is None:
   2246     _check_call(
-> 2247         _LIB.XGBoosterUpdateOneIter(
   2248             self.handle, ctypes.c_int(iteration), dtrain.handle
   2249         )
   2250     )
   2251 else:
   2252     pred = self.predict(dtrain, output_margin=True, training=True)

KeyboardInterrupt: 
In [ ]:
map3_score = mapk(
    y.values.tolist(), 
    top3_preds.tolist(),  
    k=3
)
print(f"\n📊 Mean Average Precision @3 (MAP@3): {map3_score:.5f}")
📊 Mean Average Precision @3 (MAP@3): 0.35309
In [ ]:
preds1 = model.predict_proba(X_test)
preds2 = XGB_model.predict_proba(X_test)
In [ ]:
ensemble_test_preds = (0.2 * preds2 + 0.8 * preds1)

submission2 = pd.read_csv("sample_submission.csv")

top3_indices = np.argsort(ensemble_test_preds, axis=1)[:, -3:][:, ::-1]

top3_fertilizers = [' '.join(fertilizer_encoder.inverse_transform(row)) for row in top3_indices]

submission2 = pd.DataFrame({
    'ID': test['id'],
    'Fertilizer Name': top3_fertilizers
})

submission2.to_csv('submissionensemble10.csv', index=False)
In [ ]:
submission2.head()
Out[ ]:
ID Fertilizer_Name
0 750000 DAP 10-26-26 28-28
1 750001 17-17-17 20-20 10-26-26
2 750002 20-20 28-28 10-26-26
3 750003 14-35-14 DAP 17-17-17
4 750004 20-20 Urea 10-26-26
In [ ]:
train.head()
Out[ ]:
id Temparature Humidity Moisture Soil_Type Crop_Type Nitrogen Potassium Phosphorous Fertilizer_Name
0 0 37 70 36 1 8 36 4 5 4
1 1 27 69 65 4 4 30 6 18 4
2 2 29 63 32 4 4 24 12 16 2
3 3 35 62 54 4 0 39 12 4 0
4 4 35 58 43 3 6 37 2 16 5
In [ ]:
new_train = pd.read_csv('train.csv')
new_test = pd.read_csv('test.csv')
In [ ]:
new_train.head()
Out[ ]:
id Temparature Humidity Moisture Soil Type Crop Type Nitrogen Potassium Phosphorous Fertilizer Name
0 0 37 70 36 Clayey Sugarcane 36 4 5 28-28
1 1 27 69 65 Sandy Millets 30 6 18 28-28
2 2 29 63 32 Sandy Millets 24 12 16 17-17-17
3 3 35 62 54 Sandy Barley 39 12 4 10-26-26
4 4 35 58 43 Red Paddy 37 2 16 DAP
In [ ]:
X = new_train.drop(columns=['id', 'Fertilizer Name'], errors='ignore')
X_test = new_test.drop(columns=['id'], errors='ignore')

y = new_train["Fertilizer Name"]
In [ ]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, shuffle=True, random_state=SEED)
In [ ]:
from catboost import CatBoostClassifier, Pool

# Define categorical features
cat_features = ['Soil Type', 'Crop Type']  # Add other categorical columns if any

# Create CatBoost Pool with specified categorical features
train_pool = Pool(X_train, y_train, cat_features=cat_features)
val_pool = Pool(X_val, y_val, cat_features=cat_features)

# Initialize and train CatBoost model
cat_model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    loss_function='MultiClass',
    eval_metric='MultiClass',
    random_seed=SEED,
    gpu_ram_part=0.95,  # Fraction of GPU memory CatBoost may use
    task_type="GPU"     # Use GPU acceleration
)

# Fit the model
cat_model.fit(
    train_pool,
    eval_set=val_pool,
    early_stopping_rounds=50,
    verbose=200
)

# Get predictions
probs = cat_model.predict_proba(X_test)
top3_indices = np.argsort(probs, axis=1)[:, -3:][:, ::-1]
0:	learn: 1.9450360	test: 1.9450892	best: 1.9450892 (0)	total: 20.2ms	remaining: 20.2s
200:	learn: 1.9218504	test: 1.9294902	best: 1.9294902 (200)	total: 3.88s	remaining: 15.4s
400:	learn: 1.9116783	test: 1.9254933	best: 1.9254933 (400)	total: 7.71s	remaining: 11.5s
600:	learn: 1.9036608	test: 1.9233704	best: 1.9233704 (600)	total: 11.4s	remaining: 7.54s
800:	learn: 1.8966321	test: 1.9221321	best: 1.9221321 (800)	total: 15.1s	remaining: 3.76s
999:	learn: 1.8897356	test: 1.9212942	best: 1.9212942 (999)	total: 18.8s	remaining: 0us
bestTest = 1.921294167
bestIteration = 999
In [ ]:
preds3 = cat_model.predict_proba(X_test)

ensemble_test_preds = (0.33 * preds2 + 0.33 * preds1 + 0.34 * preds3)

submission3 = pd.read_csv("sample_submission.csv")

top3_indices = np.argsort(ensemble_test_preds, axis=1)[:, -3:][:, ::-1]

top3_fertilizers = [' '.join(fertilizer_encoder.inverse_transform(row)) for row in top3_indices]

submission3 = pd.DataFrame({
    'ID': test['id'],
    'Fertilizer Name': top3_fertilizers
})

submission3.to_csv('submissionensemble_cats_xgb_optuna.csv', index=False)
In [ ]:
submission3
Out[ ]:
ID Fertilizer_Name
0 750000 DAP 10-26-26 28-28
1 750001 17-17-17 20-20 10-26-26
2 750002 20-20 28-28 10-26-26
3 750003 14-35-14 DAP 17-17-17
4 750004 20-20 Urea 10-26-26
... ... ...
249995 999995 Urea 17-17-17 28-28
249996 999996 14-35-14 10-26-26 Urea
249997 999997 DAP Urea 14-35-14
249998 999998 10-26-26 28-28 17-17-17
249999 999999 14-35-14 17-17-17 20-20

250000 rows × 2 columns

In [ ]:
print("TODO: submit submission2 and submission3")
TODO: submit submission2 and submission3
In [ ]:
# Submission using only the fold-trained XGBoost model's probabilities (preds2)
ensemble_test_preds = preds2

top3_indices = np.argsort(ensemble_test_preds, axis=1)[:, -3:][:, ::-1]

top3_fertilizers = [' '.join(fertilizer_encoder.inverse_transform(row)) for row in top3_indices]

submission5 = pd.DataFrame({
    'ID': test['id'],
    'Fertilizer Name': top3_fertilizers
})

submission5.to_csv('submisX.csv', index=False)