#7¶

Kaggle competition: [link]

Entry by Robin R.P.M. Kras

⚠️ Disclaimer: I forgot to select my best-performing submissions as my final entries... again. The final ranking therefore doesn't reflect the model performance achieved during development.

⭐ 1. Introduction & Overview¶

Your goal is to predict whether a person is an Introvert or an Extrovert, given their social behavior and personality traits.

🔹 2. Import Libraries & Set Up¶

In [87]:
# General
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# Machine Learning
import xgboost as xg

from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import make_pipeline

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

from imblearn.over_sampling import SMOTE

import optuna

# Feature Importance & Explainability
import shap

# Settings
import warnings
warnings.filterwarnings("ignore")

# Set random seed for reproducibility
SEED = 42
np.random.seed(SEED)

print("Libraries loaded. Ready to go!")
Libraries loaded. Ready to go!

🔹 3. Load & Explore Data¶

In [88]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

print(f"Train shape: {train.shape}, Test shape: {test.shape}")
Train shape: (18524, 9), Test shape: (6175, 8)
In [89]:
train.head()
Out[89]:
id Time_spent_Alone Stage_fear Social_event_attendance Going_outside Drained_after_socializing Friends_circle_size Post_frequency Personality
0 0 0.0 No 6.0 4.0 No 15.0 5.0 Extrovert
1 1 1.0 No 7.0 3.0 No 10.0 8.0 Extrovert
2 2 6.0 Yes 1.0 0.0 NaN 3.0 0.0 Introvert
3 3 3.0 No 7.0 3.0 No 11.0 5.0 Extrovert
4 4 1.0 No 4.0 4.0 No 13.0 NaN Extrovert
In [90]:
train['Personality'].value_counts()
Out[90]:
Personality
Extrovert    13699
Introvert     4825
Name: count, dtype: int64
In [91]:
def plot_nums(dataframe):
    float_cols = dataframe.select_dtypes(include=["float64", "int64"]).columns

    cols_per_row = 3
    num_plots = len(float_cols)
    rows = (num_plots // cols_per_row) + (num_plots % cols_per_row > 0) 

    fig, axes = plt.subplots(rows, cols_per_row, figsize=(15, 5 * rows)) 
    axes = axes.flatten()  

    for idx, col in enumerate(float_cols):
        sns.histplot(dataframe[col], bins=50, kde=True, ax=axes[idx])
        axes[idx].set_title(f"Distribution of {col}")

    for i in range(idx + 1, len(axes)):
        fig.delaxes(axes[i])

    plt.tight_layout()
    plt.show()
In [92]:
def plot_cats(dataframe):
    categorical_features = dataframe.select_dtypes(include=['object']).columns

    num_features = len(categorical_features)
    cols = 3 
    rows = (num_features // cols) + (num_features % cols > 0) 

    # Create subplots
    fig, axes = plt.subplots(rows, cols, figsize=(15, rows * 5)) 
    axes = axes.flatten()  

    for i, feature in enumerate(categorical_features):
        dataframe[feature].value_counts().plot.pie(
            autopct='%1.1f%%', ax=axes[i], startangle=90, cmap="viridis"
        )
        axes[i].set_title(feature)
        axes[i].set_ylabel("") 

    # Hide any unused subplots
    for j in range(i + 1, len(axes)):
        fig.delaxes(axes[j])

    plt.tight_layout()
    plt.show()
In [93]:
plot_cats(train)
plot_nums(train)
[Figure: pie charts of the categorical feature distributions]
[Figure: histograms of the numeric feature distributions]
In [94]:
def heatmap_nums(dataframe): 
    heatmap_train = dataframe.select_dtypes(include=["float64", "int64"])

    corr_matrix = heatmap_train.corr()

    threshold = 0.75

    high_corr_pairs = (
        corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)) 
        .stack()  
        .reset_index()
    )

    high_corr_pairs.columns = ["Feature 1", "Feature 2", "Correlation"]
    high_corr_pairs = high_corr_pairs[high_corr_pairs["Correlation"].abs() > threshold]  

    plt.figure(figsize=(30, 12))
    sns.heatmap(corr_matrix, annot=True, cmap="coolwarm")
    plt.title("Feature Correlation Matrix")
    plt.show()

    print("Highly correlated feature pairs (above threshold):")
    print(high_corr_pairs)
In [95]:
heatmap_nums(train)
[Figure: correlation heatmap of the numeric features]
Highly correlated feature pairs (above threshold):
Empty DataFrame
Columns: [Feature 1, Feature 2, Correlation]
Index: []

🔹 4. Model Testing¶

In [96]:
sampler = SMOTE(random_state=SEED)
In [97]:
# Prepare features and target
X = train.drop(['id', 'Personality'], axis=1)
y = train['Personality']
X_test = test.drop(['id'], axis=1)

# Convert categorical variables to numeric (one encoder per column, fit on train only)
for col in ['Stage_fear', 'Drained_after_socializing']:
    col_le = LabelEncoder()
    X[col] = col_le.fit_transform(X[col].fillna('Missing'))
    X_test[col] = col_le.transform(X_test[col].fillna('Missing'))

# Fill remaining NaN values with the training medians (avoids peeking at the test set)
train_medians = X.median()
X = X.fillna(train_medians)
X_test = X_test.fillna(train_medians)

# Encode the target; keep this encoder for inverse_transform at submission time
le = LabelEncoder()
y = le.fit_transform(y)

# Apply SMOTE to balance the classes
X_resampled, y_resampled = sampler.fit_resample(X, y)

print("Original dataset shape:", dict(pd.Series(y).value_counts()))
print("Resampled dataset shape:", dict(pd.Series(y_resampled).value_counts()))
Original dataset shape: {0: 13699, 1: 4825}
Resampled dataset shape: {0: 13699, 1: 13699}
In [98]:
# Note: splitting after SMOTE lets synthetic samples leak across the split, so validation scores may be optimistic
X_train, X_val, y_train, y_val = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=SEED)
In [99]:
def objective(trial):
    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 100, 400),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 5),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 5),
        'random_state': SEED,
        'eval_metric': 'logloss'
    }
    model = xg.XGBClassifier(**params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    score = accuracy_score(y_val, y_pred)
    return score

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30)

print("Best accuracy:", study.best_value)
print("Best params:", study.best_params)
[I 2025-07-05 17:01:09,719] A new study created in memory with name: no-name-b11b954b-1cb6-45de-a4a8-1225d5bf0048
[I 2025-07-05 17:01:09,868] Trial 0 finished with value: 0.9625912408759124 and parameters: {'max_depth': 4, 'learning_rate': 0.15469994161927286, 'n_estimators': 191, 'subsample': 0.7072837617386846, 'colsample_bytree': 0.8946921943985664, 'gamma': 3.5368730705776326, 'reg_alpha': 4.424778275626718, 'reg_lambda': 4.155240383232749}. Best is trial 0 with value: 0.9625912408759124.
[I 2025-07-05 17:01:10,086] Trial 1 finished with value: 0.9666058394160584 and parameters: {'max_depth': 9, 'learning_rate': 0.22509357441847086, 'n_estimators': 194, 'subsample': 0.9643530922947416, 'colsample_bytree': 0.6798486909708186, 'gamma': 1.1451908024558515, 'reg_alpha': 2.395055316764743, 'reg_lambda': 4.333580836688855}. Best is trial 1 with value: 0.9666058394160584.
[I 2025-07-05 17:01:10,364] Trial 2 finished with value: 0.9673357664233576 and parameters: {'max_depth': 8, 'learning_rate': 0.05720724119830814, 'n_estimators': 148, 'subsample': 0.9560791716688567, 'colsample_bytree': 0.9211938248411107, 'gamma': 1.1207647189914405, 'reg_alpha': 2.5993735812702945, 'reg_lambda': 0.9781864370277438}. Best is trial 2 with value: 0.9673357664233576.
[I 2025-07-05 17:01:10,778] Trial 3 finished with value: 0.9695255474452554 and parameters: {'max_depth': 10, 'learning_rate': 0.25399783808411536, 'n_estimators': 393, 'subsample': 0.8578810417196061, 'colsample_bytree': 0.8063758291672654, 'gamma': 1.6942523150358495, 'reg_alpha': 1.7526888160521448, 'reg_lambda': 2.016950574143822}. Best is trial 3 with value: 0.9695255474452554.
[I 2025-07-05 17:01:10,965] Trial 4 finished with value: 0.9702554744525548 and parameters: {'max_depth': 10, 'learning_rate': 0.2764778938228978, 'n_estimators': 150, 'subsample': 0.8641456858632555, 'colsample_bytree': 0.936100745717214, 'gamma': 1.331507541360839, 'reg_alpha': 2.36713814482066, 'reg_lambda': 1.7770820738061832}. Best is trial 4 with value: 0.9702554744525548.
[I 2025-07-05 17:01:11,433] Trial 5 finished with value: 0.9614963503649635 and parameters: {'max_depth': 3, 'learning_rate': 0.11443363150912457, 'n_estimators': 400, 'subsample': 0.6576885430527835, 'colsample_bytree': 0.807433236720453, 'gamma': 3.836453501715482, 'reg_alpha': 4.117966492418683, 'reg_lambda': 3.6296943473131726}. Best is trial 4 with value: 0.9702554744525548.
[I 2025-07-05 17:01:11,719] Trial 6 finished with value: 0.9642335766423358 and parameters: {'max_depth': 4, 'learning_rate': 0.1223737584621103, 'n_estimators': 381, 'subsample': 0.7655600843016693, 'colsample_bytree': 0.704154927011539, 'gamma': 3.362676339941957, 'reg_alpha': 1.7060743423773843, 'reg_lambda': 3.5921260685311225}. Best is trial 4 with value: 0.9702554744525548.
[I 2025-07-05 17:01:11,887] Trial 7 finished with value: 0.9658759124087591 and parameters: {'max_depth': 10, 'learning_rate': 0.2411366745844033, 'n_estimators': 111, 'subsample': 0.8544366213341472, 'colsample_bytree': 0.76471675712951, 'gamma': 2.055505946626313, 'reg_alpha': 4.0902147899887105, 'reg_lambda': 1.1991756654013497}. Best is trial 4 with value: 0.9702554744525548.
[I 2025-07-05 17:01:12,047] Trial 8 finished with value: 0.9636861313868613 and parameters: {'max_depth': 8, 'learning_rate': 0.19993158675853762, 'n_estimators': 175, 'subsample': 0.862302895866639, 'colsample_bytree': 0.8866119224708692, 'gamma': 4.999220868460518, 'reg_alpha': 4.695931909335332, 'reg_lambda': 4.984128947907852}. Best is trial 4 with value: 0.9702554744525548.
[I 2025-07-05 17:01:12,521] Trial 9 finished with value: 0.9702554744525548 and parameters: {'max_depth': 4, 'learning_rate': 0.1100822276115018, 'n_estimators': 319, 'subsample': 0.6738515967809205, 'colsample_bytree': 0.9970081185483793, 'gamma': 0.3026419507240802, 'reg_alpha': 2.883003284874105, 'reg_lambda': 1.9958361618187115}. Best is trial 4 with value: 0.9702554744525548.
[I 2025-07-05 17:01:12,976] Trial 10 finished with value: 0.9739051094890511 and parameters: {'max_depth': 6, 'learning_rate': 0.29974492047756524, 'n_estimators': 245, 'subsample': 0.7740550812232525, 'colsample_bytree': 0.6127569756722933, 'gamma': 0.3013544351998254, 'reg_alpha': 0.09778473967933055, 'reg_lambda': 0.18212543949800386}. Best is trial 10 with value: 0.9739051094890511.
[I 2025-07-05 17:01:13,396] Trial 11 finished with value: 0.9744525547445255 and parameters: {'max_depth': 6, 'learning_rate': 0.29955525655248033, 'n_estimators': 257, 'subsample': 0.7716062534264041, 'colsample_bytree': 0.6071793964875558, 'gamma': 0.022320233912947518, 'reg_alpha': 0.33417881972951546, 'reg_lambda': 0.35145705168398567}. Best is trial 11 with value: 0.9744525547445255.
[I 2025-07-05 17:01:13,727] Trial 12 finished with value: 0.9748175182481752 and parameters: {'max_depth': 6, 'learning_rate': 0.29425645949624674, 'n_estimators': 260, 'subsample': 0.788132612581613, 'colsample_bytree': 0.6007653357213613, 'gamma': 0.06325369540979286, 'reg_alpha': 0.014995679499138603, 'reg_lambda': 0.18405278692331148}. Best is trial 12 with value: 0.9748175182481752.
[I 2025-07-05 17:01:14,097] Trial 13 finished with value: 0.9751824817518249 and parameters: {'max_depth': 6, 'learning_rate': 0.19993133942721536, 'n_estimators': 276, 'subsample': 0.7251177654643216, 'colsample_bytree': 0.6079517527565417, 'gamma': 0.15967881788558375, 'reg_alpha': 0.060784190738781975, 'reg_lambda': 0.40006734610382677}. Best is trial 13 with value: 0.9751824817518249.
[I 2025-07-05 17:01:14,402] Trial 14 finished with value: 0.972992700729927 and parameters: {'max_depth': 7, 'learning_rate': 0.18152364239422583, 'n_estimators': 287, 'subsample': 0.7128773668939916, 'colsample_bytree': 0.6717605106665164, 'gamma': 0.6784136955885791, 'reg_alpha': 0.9405732238602988, 'reg_lambda': 0.8574595821331065}. Best is trial 13 with value: 0.9751824817518249.
[I 2025-07-05 17:01:14,654] Trial 15 finished with value: 0.9677007299270073 and parameters: {'max_depth': 5, 'learning_rate': 0.0525052095966049, 'n_estimators': 339, 'subsample': 0.6034830804626845, 'colsample_bytree': 0.7357548861944907, 'gamma': 2.5524678023194465, 'reg_alpha': 1.0261299132141382, 'reg_lambda': 0.06455621731941186}. Best is trial 13 with value: 0.9751824817518249.
[I 2025-07-05 17:01:14,860] Trial 16 finished with value: 0.968978102189781 and parameters: {'max_depth': 7, 'learning_rate': 0.20693661757339799, 'n_estimators': 248, 'subsample': 0.815304530361486, 'colsample_bytree': 0.6440215148991338, 'gamma': 2.3519600665817073, 'reg_alpha': 0.9410352332520142, 'reg_lambda': 2.92127948994743}. Best is trial 13 with value: 0.9751824817518249.
[I 2025-07-05 17:01:15,205] Trial 17 finished with value: 0.960948905109489 and parameters: {'max_depth': 5, 'learning_rate': 0.013191062007580079, 'n_estimators': 295, 'subsample': 0.7297634371565164, 'colsample_bytree': 0.6014517889379469, 'gamma': 0.7009451110587395, 'reg_alpha': 0.4450878978061383, 'reg_lambda': 1.4354369138065575}. Best is trial 13 with value: 0.9751824817518249.
[I 2025-07-05 17:01:15,429] Trial 18 finished with value: 0.9645985401459855 and parameters: {'max_depth': 5, 'learning_rate': 0.16423583670676817, 'n_estimators': 221, 'subsample': 0.9165925835180988, 'colsample_bytree': 0.6521530604718263, 'gamma': 2.9428140962928073, 'reg_alpha': 3.2556067523060768, 'reg_lambda': 2.6606033133585982}. Best is trial 13 with value: 0.9751824817518249.
[I 2025-07-05 17:01:15,813] Trial 19 finished with value: 0.9731751824817518 and parameters: {'max_depth': 8, 'learning_rate': 0.2548470304525325, 'n_estimators': 345, 'subsample': 0.6387799167106778, 'colsample_bytree': 0.7385780965951254, 'gamma': 0.6665536891723872, 'reg_alpha': 1.4967539055129342, 'reg_lambda': 0.7154342378364447}. Best is trial 13 with value: 0.9751824817518249.
[I 2025-07-05 17:01:15,972] Trial 20 finished with value: 0.9666058394160584 and parameters: {'max_depth': 6, 'learning_rate': 0.14701046408550664, 'n_estimators': 281, 'subsample': 0.799944695038173, 'colsample_bytree': 0.8433960857658866, 'gamma': 4.410295931662153, 'reg_alpha': 0.49506298949289096, 'reg_lambda': 0.543474282737727}. Best is trial 13 with value: 0.9751824817518249.
[I 2025-07-05 17:01:16,220] Trial 21 finished with value: 0.9731751824817518 and parameters: {'max_depth': 6, 'learning_rate': 0.29938966813455814, 'n_estimators': 266, 'subsample': 0.7498699827225355, 'colsample_bytree': 0.6270920775038457, 'gamma': 0.05549118088329549, 'reg_alpha': 0.03971892658754927, 'reg_lambda': 0.3847047038942656}. Best is trial 13 with value: 0.9751824817518249.
[I 2025-07-05 17:01:16,471] Trial 22 finished with value: 0.9739051094890511 and parameters: {'max_depth': 7, 'learning_rate': 0.2749755531702343, 'n_estimators': 229, 'subsample': 0.8102004398529764, 'colsample_bytree': 0.6973055702726525, 'gamma': 0.003420570824474889, 'reg_alpha': 0.5345826388739257, 'reg_lambda': 0.05386348150462175}. Best is trial 13 with value: 0.9751824817518249.
[I 2025-07-05 17:01:16,708] Trial 23 finished with value: 0.9737226277372263 and parameters: {'max_depth': 6, 'learning_rate': 0.226959761321406, 'n_estimators': 321, 'subsample': 0.689895428412021, 'colsample_bytree': 0.6091097929446148, 'gamma': 0.6658544190311231, 'reg_alpha': 0.020984207392783442, 'reg_lambda': 1.4177084951010406}. Best is trial 13 with value: 0.9751824817518249.
[I 2025-07-05 17:01:16,910] Trial 24 finished with value: 0.9706204379562043 and parameters: {'max_depth': 5, 'learning_rate': 0.27485510945938413, 'n_estimators': 265, 'subsample': 0.7409863521421621, 'colsample_bytree': 0.6492577747945761, 'gamma': 1.5107627474955871, 'reg_alpha': 1.2074492344173193, 'reg_lambda': 0.5053785225682893}. Best is trial 13 with value: 0.9751824817518249.
[I 2025-07-05 17:01:17,100] Trial 25 finished with value: 0.972992700729927 and parameters: {'max_depth': 7, 'learning_rate': 0.2790743438420724, 'n_estimators': 217, 'subsample': 0.7741522381710476, 'colsample_bytree': 0.7079824060594946, 'gamma': 0.9107551122606206, 'reg_alpha': 0.532561334754736, 'reg_lambda': 1.0308372625179252}. Best is trial 13 with value: 0.9751824817518249.
[I 2025-07-05 17:01:17,517] Trial 26 finished with value: 0.972992700729927 and parameters: {'max_depth': 6, 'learning_rate': 0.2019496793115241, 'n_estimators': 298, 'subsample': 0.8241555724713282, 'colsample_bytree': 0.6359198832364679, 'gamma': 0.33408180085253003, 'reg_alpha': 0.683602396491966, 'reg_lambda': 1.6267248502178864}. Best is trial 13 with value: 0.9751824817518249.
[I 2025-07-05 17:01:17,673] Trial 27 finished with value: 0.9698905109489051 and parameters: {'max_depth': 5, 'learning_rate': 0.25126973990067164, 'n_estimators': 259, 'subsample': 0.7831921107079309, 'colsample_bytree': 0.6047559567093184, 'gamma': 1.8431325247106136, 'reg_alpha': 1.296541374323569, 'reg_lambda': 2.2575115542673303}. Best is trial 13 with value: 0.9751824817518249.
[I 2025-07-05 17:01:17,915] Trial 28 finished with value: 0.9728102189781022 and parameters: {'max_depth': 3, 'learning_rate': 0.2354323559633792, 'n_estimators': 314, 'subsample': 0.9216074280627132, 'colsample_bytree': 0.669442323929521, 'gamma': 0.058654054050326934, 'reg_alpha': 0.23173260203061435, 'reg_lambda': 0.41788190128449454}. Best is trial 13 with value: 0.9751824817518249.
[I 2025-07-05 17:01:18,107] Trial 29 finished with value: 0.970985401459854 and parameters: {'max_depth': 4, 'learning_rate': 0.15030775283564718, 'n_estimators': 199, 'subsample': 0.7044931943987416, 'colsample_bytree': 0.7655984142539816, 'gamma': 0.36950632967326275, 'reg_alpha': 1.9495951483918534, 'reg_lambda': 1.2119217944187735}. Best is trial 13 with value: 0.9751824817518249.
Best accuracy: 0.9751824817518249
Best params: {'max_depth': 6, 'learning_rate': 0.19993133942721536, 'n_estimators': 276, 'subsample': 0.7251177654643216, 'colsample_bytree': 0.6079517527565417, 'gamma': 0.15967881788558375, 'reg_alpha': 0.060784190738781975, 'reg_lambda': 0.40006734610382677}

Best accuracy: 0.9752

In [100]:

model = xg.XGBClassifier(**study.best_params)
model.fit(X, y)

predictions = model.predict(X_test)

submission = pd.DataFrame({
    'id': test['id'],
    'Personality': predictions
})

accuracy = accuracy_score(y, model.predict(X))
print(f"Model accuracy on training data: {accuracy:.4f}")

submission['Personality'] = le.inverse_transform(submission['Personality'])

submission.to_csv('submission1.csv', index=False)
Model accuracy on training data: 0.9738
In [101]:
submission.head()
Out[101]:
id Personality
0 18524 Extrovert
1 18525 Introvert
2 18526 Extrovert
3 18527 Extrovert
4 18528 Introvert
In [102]:
from sklearn.ensemble import StackingClassifier

# Define base models
base_estimators = [
    ('rf', RandomForestClassifier(random_state=SEED)),
    ('xgb', xg.XGBClassifier(random_state=SEED, eval_metric='logloss'))
]

# Meta-model
meta_model = LogisticRegression(random_state=SEED)

# Create stacking classifier
stack = StackingClassifier(
    estimators=base_estimators,
    final_estimator=meta_model,
    cv=5,
    n_jobs=-1,
    passthrough=True
)

# Fit on resampled data
stack.fit(X_train, y_train)

# Evaluate on validation set
val_preds = stack.predict(X_val)
val_acc = accuracy_score(y_val, val_preds)
print(f"Stacked model validation accuracy: {val_acc:.4f}")
Stacked model validation accuracy: 0.9728
In [103]:

stack.fit(X, y)

predictions = stack.predict(X_test)

submission2 = pd.DataFrame({
    'id': test['id'],
    'Personality': predictions
})

accuracy = accuracy_score(y, stack.predict(X))
print(f"Model accuracy on training data: {accuracy:.4f}")

submission2['Personality'] = le.inverse_transform(submission2['Personality'])

submission2.to_csv('submission2.csv', index=False)
Model accuracy on training data: 0.9740
In [104]:
# Suggestions to potentially increase accuracy:

# 1. Feature Engineering:
#    - Create new features from existing ones, such as ratios, sums, or interactions.
#    - Example: Social_activity_score = Social_event_attendance + Going_outside + Friends_circle_size

train['Social_activity_score'] = (
    train['Social_event_attendance'].fillna(0) +
    train['Going_outside'].fillna(0) +
    train['Friends_circle_size'].fillna(0)
)
test['Social_activity_score'] = (
    test['Social_event_attendance'].fillna(0) +
    test['Going_outside'].fillna(0) +
    test['Friends_circle_size'].fillna(0)
)

# 2. Binning/Numerical Categorization:
#    - Bin continuous features into categories (e.g., low/medium/high).
#    - Example: Bin 'Time_spent_Alone' into quantiles.

# Compute quantile bin edges on train and apply the same edges to test for consistency
train['Alone_bin'], alone_bins = pd.qcut(
    train['Time_spent_Alone'].fillna(-1), q=4, labels=False, retbins=True, duplicates='drop'
)
test['Alone_bin'] = pd.cut(
    test['Time_spent_Alone'].fillna(-1), bins=alone_bins, labels=False, include_lowest=True
)

# 3. Interaction Features:
#    - Multiply or combine features that may interact.
#    - Example: Stage_fear * Drained_after_socializing

train['Fear_Drained'] = (
    train['Stage_fear'].fillna('Missing').astype(str) + "_" +
    train['Drained_after_socializing'].fillna('Missing').astype(str)
)
test['Fear_Drained'] = (
    test['Stage_fear'].fillna('Missing').astype(str) + "_" +
    test['Drained_after_socializing'].fillna('Missing').astype(str)
)

# 4. Outlier Handling:
#    - Cap/floor extreme values in numerical features.

for col in ['Time_spent_Alone', 'Social_event_attendance', 'Going_outside', 'Friends_circle_size', 'Post_frequency']:
    # Use train quantiles for both frames to avoid test-set leakage
    low, high = train[col].quantile(0.01), train[col].quantile(0.99)
    train[col] = train[col].clip(lower=low, upper=high)
    test[col] = test[col].clip(lower=low, upper=high)

# 5. Model Ensembling:
#    - Try blending/averaging predictions from multiple models (e.g., XGBoost, RandomForest, Neural Network); see the sketch below.

# 6. Hyperparameter Tuning:
#    - Use Optuna or GridSearchCV for more models, not just XGBoost; see the sketch below.

# 7. Feature Selection:
#    - Use feature importance from models or SHAP to drop uninformative features; see the sketch below.

# After adding new features, remember to update your preprocessing and model training code to include them.
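
# Minimal sketches for suggestions 5-7. These are illustrative, not tuned: they
# reuse the X_train/X_val split from the SMOTE section and the fitted XGBoost
# `model` from above; rf_blend, xgb_blend and rf_search are placeholder names.

# 5. Blending: average predicted probabilities from two different models
rf_blend = RandomForestClassifier(random_state=SEED).fit(X_train, y_train)
xgb_blend = xg.XGBClassifier(random_state=SEED, eval_metric='logloss').fit(X_train, y_train)
blend_proba = 0.5 * rf_blend.predict_proba(X_val)[:, 1] + 0.5 * xgb_blend.predict_proba(X_val)[:, 1]
blend_acc = accuracy_score(y_val, (blend_proba >= 0.5).astype(int))

# 6. Hyperparameter tuning for a second model family with GridSearchCV
rf_search = GridSearchCV(
    RandomForestClassifier(random_state=SEED),
    param_grid={'n_estimators': [200, 400], 'max_depth': [None, 10]},
    scoring='accuracy', cv=3, n_jobs=-1,
).fit(X_train, y_train)

# 7. SHAP importances for the tuned XGBoost model; features with the smallest
#    mean |SHAP| values are candidates for removal
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_val)
mean_abs_shap = np.abs(shap_values).mean(axis=0)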
In [105]:
train.head()
Out[105]:
id Time_spent_Alone Stage_fear Social_event_attendance Going_outside Drained_after_socializing Friends_circle_size Post_frequency Personality Social_activity_score Alone_bin Fear_Drained
0 0 0.0 No 6.0 4.0 No 15.0 5.0 Extrovert 25.0 0 No_No
1 1 1.0 No 7.0 3.0 No 10.0 8.0 Extrovert 20.0 0 No_No
2 2 6.0 Yes 1.0 0.0 NaN 3.0 0.0 Introvert 4.0 3 Yes_Missing
3 3 3.0 No 7.0 3.0 No 11.0 5.0 Extrovert 21.0 2 No_No
4 4 1.0 No 4.0 4.0 No 13.0 NaN Extrovert 21.0 0 No_No
In [111]:
# Update feature set to include new engineered features
feature_cols = [
    'Time_spent_Alone', 'Stage_fear', 'Social_event_attendance', 'Going_outside',
    'Drained_after_socializing', 'Friends_circle_size', 'Post_frequency',
    'Social_activity_score', 'Alone_bin'
]

# Encode the new categorical interaction feature
X_new = train[feature_cols].copy()
X_test_new = test[feature_cols].copy()

# Encode 'Fear_Drained' with its own LabelEncoder
fear_le = LabelEncoder()
X_new['Fear_Drained'] = fear_le.fit_transform(train['Fear_Drained'].fillna('Missing'))
X_test_new['Fear_Drained'] = fear_le.transform(test['Fear_Drained'].fillna('Missing'))

print(X_new.dtypes)

# Encode the remaining categorical columns in place, one encoder per column
for col in ['Stage_fear', 'Drained_after_socializing']:
    col_le = LabelEncoder()
    X_new[col] = col_le.fit_transform(X_new[col].fillna('Missing'))
    X_test_new[col] = col_le.transform(X_test_new[col].fillna('Missing'))

# Fill any remaining NaNs with the training medians
train_medians_new = X_new.median()
X_new = X_new.fillna(train_medians_new)
X_test_new = X_test_new.fillna(train_medians_new)

# Encode target with a dedicated encoder so predictions can be inverse-transformed later
target_le = LabelEncoder()
y_new = target_le.fit_transform(train['Personality'])

# Balance classes with SMOTE
X_resampled_new, y_resampled_new = sampler.fit_resample(X_new, y_new)

# Train/test split
X_train_new, X_val_new, y_train_new, y_val_new = train_test_split(
    X_resampled_new, y_resampled_new, test_size=0.2, random_state=SEED
)

# Retrain XGBoost with best params from Optuna
model_new = xg.XGBClassifier(**study.best_params)
model_new.fit(X_train_new, y_train_new)
val_preds_new = model_new.predict(X_val_new)
val_acc_new = accuracy_score(y_val_new, val_preds_new)
print(f"Validation accuracy with new features: {val_acc_new:.4f}")

# Train on all data and predict for submission
model_new.fit(X_new, y_new)
predictions_new = model_new.predict(X_test_new)
submission_new = pd.DataFrame({
    'id': test['id'],
    'Personality': target_le.inverse_transform(predictions_new)
})
submission_new.to_csv('submission_new_features.csv', index=False)
submission_new.head()
Time_spent_Alone             float64
Stage_fear                    object
Social_event_attendance      float64
Going_outside                float64
Drained_after_socializing     object
Friends_circle_size          float64
Post_frequency               float64
Social_activity_score        float64
Alone_bin                      int64
Fear_Drained                   int32
dtype: object