⭐ 1. Introduction & Overview¶
Your Goal: Predict whether a person is an Introvert or an Extrovert from their social behavior and personality traits.
🔹 2. Import Libraries & Set Up¶
In [87]:
# General
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
# Machine Learning
import xgboost as xg
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, mean_absolute_error, mean_squared_error, r2_score, root_mean_squared_error, roc_auc_score
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import make_pipeline
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from imblearn.over_sampling import SMOTE
import optuna
# Feature Importance & Explainability
import shap
# Settings
import warnings
warnings.filterwarnings("ignore")
# Set random seed for reproducibility
SEED = 42
np.random.seed(SEED)
print("Libraries loaded. Ready to go!")
Libraries loaded. Ready to go!
🔹 3. Load & Explore Data¶
In [88]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
print(f"Train shape: {train.shape}, Test shape: {test.shape}")
Train shape: (18524, 9), Test shape: (6175, 8)
In [89]:
train.head()
Out[89]:
|   | id | Time_spent_Alone | Stage_fear | Social_event_attendance | Going_outside | Drained_after_socializing | Friends_circle_size | Post_frequency | Personality |
|---|----|------------------|------------|-------------------------|---------------|---------------------------|---------------------|----------------|-------------|
| 0 | 0 | 0.0 | No | 6.0 | 4.0 | No | 15.0 | 5.0 | Extrovert |
| 1 | 1 | 1.0 | No | 7.0 | 3.0 | No | 10.0 | 8.0 | Extrovert |
| 2 | 2 | 6.0 | Yes | 1.0 | 0.0 | NaN | 3.0 | 0.0 | Introvert |
| 3 | 3 | 3.0 | No | 7.0 | 3.0 | No | 11.0 | 5.0 | Extrovert |
| 4 | 4 | 1.0 | No | 4.0 | 4.0 | No | 13.0 | NaN | Extrovert |
In [90]:
train['Personality'].value_counts()
Out[90]:
Personality
Extrovert    13699
Introvert     4825
Name: count, dtype: int64
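Note the class imbalance: Extroverts outnumber Introverts roughly 3:1, so a trivial majority-class baseline would already reach about 13699 / 18524 ≈ 0.74 accuracy. Any model below should be judged against that floor, which is also why SMOTE is used later to rebalance the training data.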
In [91]:
def plot_nums(dataframe):
    float_cols = [col for col in dataframe.columns if dataframe[col].dtype == "float64" or dataframe[col].dtype == "int64"]
    cols_per_row = 3
    num_plots = len(float_cols)
    rows = (num_plots // cols_per_row) + (num_plots % cols_per_row > 0)
    fig, axes = plt.subplots(rows, cols_per_row, figsize=(15, 5 * rows))
    axes = axes.flatten()
    for idx, col in enumerate(float_cols):
        sns.histplot(dataframe[col], bins=50, kde=True, ax=axes[idx])
        axes[idx].set_title(f"Distribution of {col}")
    for i in range(idx + 1, len(axes)):
        fig.delaxes(axes[i])
    plt.tight_layout()
    plt.show()
In [92]:
def plot_cats(dataframe):
    categorical_features = dataframe.select_dtypes(include=['object']).columns
    num_features = len(categorical_features)
    cols = 3
    rows = (num_features // cols) + (num_features % cols > 0)
    # Create subplots
    fig, axes = plt.subplots(rows, cols, figsize=(15, rows * 5))
    axes = axes.flatten()
    for i, feature in enumerate(categorical_features):
        dataframe[feature].value_counts().plot.pie(
            autopct='%1.1f%%', ax=axes[i], startangle=90, cmap="viridis"
        )
        axes[i].set_title(feature)
        axes[i].set_ylabel("")
    # Hide any unused subplots
    for j in range(i + 1, len(axes)):
        fig.delaxes(axes[j])
    plt.tight_layout()
    plt.show()
In [93]:
plot_cats(train)
plot_nums(train)
In [94]:
def heatmap_nums(dataframe):
    heatmap_train = dataframe.select_dtypes(include=["float64", "int64"])
    corr_matrix = heatmap_train.corr()
    threshold = 0.75
    high_corr_pairs = (
        corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
        .stack()
        .reset_index()
    )
    high_corr_pairs.columns = ["Feature 1", "Feature 2", "Correlation"]
    high_corr_pairs = high_corr_pairs[high_corr_pairs["Correlation"].abs() > threshold]
    plt.figure(figsize=(30, 12))
    sns.heatmap(corr_matrix, annot=True, cmap="coolwarm")
    plt.title("Feature Correlation Matrix")
    plt.show()
    print("Highly correlated feature pairs (above threshold):")
    print(high_corr_pairs)
In [95]:
heatmap_nums(train)
Highly correlated feature pairs (above threshold):
Empty DataFrame
Columns: [Feature 1, Feature 2, Correlation]
Index: []
🔹 4. Model Testing¶
In [96]:
sampler = SMOTE(random_state=SEED)
In [97]:
# Prepare features and target
X = train.drop(['id', 'Personality'], axis=1)
y = train['Personality']
X_test = test.drop(['id'], axis=1)
# Convert the Yes/No columns to numeric, using one encoder per column so the
# mapping fitted on train is the same one applied to test
le_stage = LabelEncoder()
X['Stage_fear'] = le_stage.fit_transform(X['Stage_fear'].fillna('Missing'))
X_test['Stage_fear'] = le_stage.transform(X_test['Stage_fear'].fillna('Missing'))
le_drained = LabelEncoder()
X['Drained_after_socializing'] = le_drained.fit_transform(X['Drained_after_socializing'].fillna('Missing'))
X_test['Drained_after_socializing'] = le_drained.transform(X_test['Drained_after_socializing'].fillna('Missing'))
# Fill remaining NaN values with median (note: the test set is imputed with its own
# medians here; filling it with the train medians would keep the two sets consistent)
X = X.fillna(X.median())
X_test = X_test.fillna(X_test.median())
# Encode the target; keep `le` fitted on Personality so it can be used later
# with inverse_transform on the predictions
le = LabelEncoder()
y = le.fit_transform(y)
# Apply SMOTE to balance the classes
X_resampled, y_resampled = sampler.fit_resample(X, y)
print("Original dataset shape:", dict(pd.Series(y).value_counts()))
print("Resampled dataset shape:", dict(pd.Series(y_resampled).value_counts()))
Original dataset shape: {0: 13699, 1: 4825}
Resampled dataset shape: {0: 13699, 1: 13699}
In [98]:
X_train, X_val, y_train, y_val = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=SEED)
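A caveat on the two cells above: SMOTE is applied to the full dataset before the train/validation split, so synthetic neighbours of validation rows can end up in the training fold and inflate the validation score. A minimal sketch of the more conservative order (split first, then oversample only the training fold), reusing the X, y and sampler defined above:
X_tr_raw, X_val_raw, y_tr_raw, y_val_raw = train_test_split(
    X, y, test_size=0.2, random_state=SEED, stratify=y
)
# Oversample only the training fold; the validation fold keeps its natural class ratio
X_tr_bal, y_tr_bal = sampler.fit_resample(X_tr_raw, y_tr_raw)
print("Balanced training fold:", dict(pd.Series(y_tr_bal).value_counts()))
print("Untouched validation fold:", dict(pd.Series(y_val_raw).value_counts()))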
In [99]:
def objective(trial):
    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 100, 400),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 5),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 5),
        'random_state': SEED,
        'use_label_encoder': False,
        'eval_metric': 'logloss'
    }
    model = xg.XGBClassifier(**params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    score = accuracy_score(y_val, y_pred)
    return score
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30)
print("Best accuracy:", study.best_value)
print("Best params:", study.best_params)
[I 2025-07-05 17:01:09,719] A new study created in memory with name: no-name-b11b954b-1cb6-45de-a4a8-1225d5bf0048 [I 2025-07-05 17:01:09,868] Trial 0 finished with value: 0.9625912408759124 and parameters: {'max_depth': 4, 'learning_rate': 0.15469994161927286, 'n_estimators': 191, 'subsample': 0.7072837617386846, 'colsample_bytree': 0.8946921943985664, 'gamma': 3.5368730705776326, 'reg_alpha': 4.424778275626718, 'reg_lambda': 4.155240383232749}. Best is trial 0 with value: 0.9625912408759124. [I 2025-07-05 17:01:10,086] Trial 1 finished with value: 0.9666058394160584 and parameters: {'max_depth': 9, 'learning_rate': 0.22509357441847086, 'n_estimators': 194, 'subsample': 0.9643530922947416, 'colsample_bytree': 0.6798486909708186, 'gamma': 1.1451908024558515, 'reg_alpha': 2.395055316764743, 'reg_lambda': 4.333580836688855}. Best is trial 1 with value: 0.9666058394160584. [I 2025-07-05 17:01:10,364] Trial 2 finished with value: 0.9673357664233576 and parameters: {'max_depth': 8, 'learning_rate': 0.05720724119830814, 'n_estimators': 148, 'subsample': 0.9560791716688567, 'colsample_bytree': 0.9211938248411107, 'gamma': 1.1207647189914405, 'reg_alpha': 2.5993735812702945, 'reg_lambda': 0.9781864370277438}. Best is trial 2 with value: 0.9673357664233576. [I 2025-07-05 17:01:10,778] Trial 3 finished with value: 0.9695255474452554 and parameters: {'max_depth': 10, 'learning_rate': 0.25399783808411536, 'n_estimators': 393, 'subsample': 0.8578810417196061, 'colsample_bytree': 0.8063758291672654, 'gamma': 1.6942523150358495, 'reg_alpha': 1.7526888160521448, 'reg_lambda': 2.016950574143822}. Best is trial 3 with value: 0.9695255474452554. [I 2025-07-05 17:01:10,965] Trial 4 finished with value: 0.9702554744525548 and parameters: {'max_depth': 10, 'learning_rate': 0.2764778938228978, 'n_estimators': 150, 'subsample': 0.8641456858632555, 'colsample_bytree': 0.936100745717214, 'gamma': 1.331507541360839, 'reg_alpha': 2.36713814482066, 'reg_lambda': 1.7770820738061832}. Best is trial 4 with value: 0.9702554744525548. [I 2025-07-05 17:01:11,433] Trial 5 finished with value: 0.9614963503649635 and parameters: {'max_depth': 3, 'learning_rate': 0.11443363150912457, 'n_estimators': 400, 'subsample': 0.6576885430527835, 'colsample_bytree': 0.807433236720453, 'gamma': 3.836453501715482, 'reg_alpha': 4.117966492418683, 'reg_lambda': 3.6296943473131726}. Best is trial 4 with value: 0.9702554744525548. [I 2025-07-05 17:01:11,719] Trial 6 finished with value: 0.9642335766423358 and parameters: {'max_depth': 4, 'learning_rate': 0.1223737584621103, 'n_estimators': 381, 'subsample': 0.7655600843016693, 'colsample_bytree': 0.704154927011539, 'gamma': 3.362676339941957, 'reg_alpha': 1.7060743423773843, 'reg_lambda': 3.5921260685311225}. Best is trial 4 with value: 0.9702554744525548. [I 2025-07-05 17:01:11,887] Trial 7 finished with value: 0.9658759124087591 and parameters: {'max_depth': 10, 'learning_rate': 0.2411366745844033, 'n_estimators': 111, 'subsample': 0.8544366213341472, 'colsample_bytree': 0.76471675712951, 'gamma': 2.055505946626313, 'reg_alpha': 4.0902147899887105, 'reg_lambda': 1.1991756654013497}. Best is trial 4 with value: 0.9702554744525548. [I 2025-07-05 17:01:12,047] Trial 8 finished with value: 0.9636861313868613 and parameters: {'max_depth': 8, 'learning_rate': 0.19993158675853762, 'n_estimators': 175, 'subsample': 0.862302895866639, 'colsample_bytree': 0.8866119224708692, 'gamma': 4.999220868460518, 'reg_alpha': 4.695931909335332, 'reg_lambda': 4.984128947907852}. 
Best is trial 4 with value: 0.9702554744525548. [I 2025-07-05 17:01:12,521] Trial 9 finished with value: 0.9702554744525548 and parameters: {'max_depth': 4, 'learning_rate': 0.1100822276115018, 'n_estimators': 319, 'subsample': 0.6738515967809205, 'colsample_bytree': 0.9970081185483793, 'gamma': 0.3026419507240802, 'reg_alpha': 2.883003284874105, 'reg_lambda': 1.9958361618187115}. Best is trial 4 with value: 0.9702554744525548. [I 2025-07-05 17:01:12,976] Trial 10 finished with value: 0.9739051094890511 and parameters: {'max_depth': 6, 'learning_rate': 0.29974492047756524, 'n_estimators': 245, 'subsample': 0.7740550812232525, 'colsample_bytree': 0.6127569756722933, 'gamma': 0.3013544351998254, 'reg_alpha': 0.09778473967933055, 'reg_lambda': 0.18212543949800386}. Best is trial 10 with value: 0.9739051094890511. [I 2025-07-05 17:01:13,396] Trial 11 finished with value: 0.9744525547445255 and parameters: {'max_depth': 6, 'learning_rate': 0.29955525655248033, 'n_estimators': 257, 'subsample': 0.7716062534264041, 'colsample_bytree': 0.6071793964875558, 'gamma': 0.022320233912947518, 'reg_alpha': 0.33417881972951546, 'reg_lambda': 0.35145705168398567}. Best is trial 11 with value: 0.9744525547445255. [I 2025-07-05 17:01:13,727] Trial 12 finished with value: 0.9748175182481752 and parameters: {'max_depth': 6, 'learning_rate': 0.29425645949624674, 'n_estimators': 260, 'subsample': 0.788132612581613, 'colsample_bytree': 0.6007653357213613, 'gamma': 0.06325369540979286, 'reg_alpha': 0.014995679499138603, 'reg_lambda': 0.18405278692331148}. Best is trial 12 with value: 0.9748175182481752. [I 2025-07-05 17:01:14,097] Trial 13 finished with value: 0.9751824817518249 and parameters: {'max_depth': 6, 'learning_rate': 0.19993133942721536, 'n_estimators': 276, 'subsample': 0.7251177654643216, 'colsample_bytree': 0.6079517527565417, 'gamma': 0.15967881788558375, 'reg_alpha': 0.060784190738781975, 'reg_lambda': 0.40006734610382677}. Best is trial 13 with value: 0.9751824817518249. [I 2025-07-05 17:01:14,402] Trial 14 finished with value: 0.972992700729927 and parameters: {'max_depth': 7, 'learning_rate': 0.18152364239422583, 'n_estimators': 287, 'subsample': 0.7128773668939916, 'colsample_bytree': 0.6717605106665164, 'gamma': 0.6784136955885791, 'reg_alpha': 0.9405732238602988, 'reg_lambda': 0.8574595821331065}. Best is trial 13 with value: 0.9751824817518249. [I 2025-07-05 17:01:14,654] Trial 15 finished with value: 0.9677007299270073 and parameters: {'max_depth': 5, 'learning_rate': 0.0525052095966049, 'n_estimators': 339, 'subsample': 0.6034830804626845, 'colsample_bytree': 0.7357548861944907, 'gamma': 2.5524678023194465, 'reg_alpha': 1.0261299132141382, 'reg_lambda': 0.06455621731941186}. Best is trial 13 with value: 0.9751824817518249. [I 2025-07-05 17:01:14,860] Trial 16 finished with value: 0.968978102189781 and parameters: {'max_depth': 7, 'learning_rate': 0.20693661757339799, 'n_estimators': 248, 'subsample': 0.815304530361486, 'colsample_bytree': 0.6440215148991338, 'gamma': 2.3519600665817073, 'reg_alpha': 0.9410352332520142, 'reg_lambda': 2.92127948994743}. Best is trial 13 with value: 0.9751824817518249. [I 2025-07-05 17:01:15,205] Trial 17 finished with value: 0.960948905109489 and parameters: {'max_depth': 5, 'learning_rate': 0.013191062007580079, 'n_estimators': 295, 'subsample': 0.7297634371565164, 'colsample_bytree': 0.6014517889379469, 'gamma': 0.7009451110587395, 'reg_alpha': 0.4450878978061383, 'reg_lambda': 1.4354369138065575}. Best is trial 13 with value: 0.9751824817518249. 
[I 2025-07-05 17:01:15,429] Trial 18 finished with value: 0.9645985401459855 and parameters: {'max_depth': 5, 'learning_rate': 0.16423583670676817, 'n_estimators': 221, 'subsample': 0.9165925835180988, 'colsample_bytree': 0.6521530604718263, 'gamma': 2.9428140962928073, 'reg_alpha': 3.2556067523060768, 'reg_lambda': 2.6606033133585982}. Best is trial 13 with value: 0.9751824817518249. [I 2025-07-05 17:01:15,813] Trial 19 finished with value: 0.9731751824817518 and parameters: {'max_depth': 8, 'learning_rate': 0.2548470304525325, 'n_estimators': 345, 'subsample': 0.6387799167106778, 'colsample_bytree': 0.7385780965951254, 'gamma': 0.6665536891723872, 'reg_alpha': 1.4967539055129342, 'reg_lambda': 0.7154342378364447}. Best is trial 13 with value: 0.9751824817518249. [I 2025-07-05 17:01:15,972] Trial 20 finished with value: 0.9666058394160584 and parameters: {'max_depth': 6, 'learning_rate': 0.14701046408550664, 'n_estimators': 281, 'subsample': 0.799944695038173, 'colsample_bytree': 0.8433960857658866, 'gamma': 4.410295931662153, 'reg_alpha': 0.49506298949289096, 'reg_lambda': 0.543474282737727}. Best is trial 13 with value: 0.9751824817518249. [I 2025-07-05 17:01:16,220] Trial 21 finished with value: 0.9731751824817518 and parameters: {'max_depth': 6, 'learning_rate': 0.29938966813455814, 'n_estimators': 266, 'subsample': 0.7498699827225355, 'colsample_bytree': 0.6270920775038457, 'gamma': 0.05549118088329549, 'reg_alpha': 0.03971892658754927, 'reg_lambda': 0.3847047038942656}. Best is trial 13 with value: 0.9751824817518249. [I 2025-07-05 17:01:16,471] Trial 22 finished with value: 0.9739051094890511 and parameters: {'max_depth': 7, 'learning_rate': 0.2749755531702343, 'n_estimators': 229, 'subsample': 0.8102004398529764, 'colsample_bytree': 0.6973055702726525, 'gamma': 0.003420570824474889, 'reg_alpha': 0.5345826388739257, 'reg_lambda': 0.05386348150462175}. Best is trial 13 with value: 0.9751824817518249. [I 2025-07-05 17:01:16,708] Trial 23 finished with value: 0.9737226277372263 and parameters: {'max_depth': 6, 'learning_rate': 0.226959761321406, 'n_estimators': 321, 'subsample': 0.689895428412021, 'colsample_bytree': 0.6091097929446148, 'gamma': 0.6658544190311231, 'reg_alpha': 0.020984207392783442, 'reg_lambda': 1.4177084951010406}. Best is trial 13 with value: 0.9751824817518249. [I 2025-07-05 17:01:16,910] Trial 24 finished with value: 0.9706204379562043 and parameters: {'max_depth': 5, 'learning_rate': 0.27485510945938413, 'n_estimators': 265, 'subsample': 0.7409863521421621, 'colsample_bytree': 0.6492577747945761, 'gamma': 1.5107627474955871, 'reg_alpha': 1.2074492344173193, 'reg_lambda': 0.5053785225682893}. Best is trial 13 with value: 0.9751824817518249. [I 2025-07-05 17:01:17,100] Trial 25 finished with value: 0.972992700729927 and parameters: {'max_depth': 7, 'learning_rate': 0.2790743438420724, 'n_estimators': 217, 'subsample': 0.7741522381710476, 'colsample_bytree': 0.7079824060594946, 'gamma': 0.9107551122606206, 'reg_alpha': 0.532561334754736, 'reg_lambda': 1.0308372625179252}. Best is trial 13 with value: 0.9751824817518249. [I 2025-07-05 17:01:17,517] Trial 26 finished with value: 0.972992700729927 and parameters: {'max_depth': 6, 'learning_rate': 0.2019496793115241, 'n_estimators': 298, 'subsample': 0.8241555724713282, 'colsample_bytree': 0.6359198832364679, 'gamma': 0.33408180085253003, 'reg_alpha': 0.683602396491966, 'reg_lambda': 1.6267248502178864}. Best is trial 13 with value: 0.9751824817518249. 
[I 2025-07-05 17:01:17,673] Trial 27 finished with value: 0.9698905109489051 and parameters: {'max_depth': 5, 'learning_rate': 0.25126973990067164, 'n_estimators': 259, 'subsample': 0.7831921107079309, 'colsample_bytree': 0.6047559567093184, 'gamma': 1.8431325247106136, 'reg_alpha': 1.296541374323569, 'reg_lambda': 2.2575115542673303}. Best is trial 13 with value: 0.9751824817518249. [I 2025-07-05 17:01:17,915] Trial 28 finished with value: 0.9728102189781022 and parameters: {'max_depth': 3, 'learning_rate': 0.2354323559633792, 'n_estimators': 314, 'subsample': 0.9216074280627132, 'colsample_bytree': 0.669442323929521, 'gamma': 0.058654054050326934, 'reg_alpha': 0.23173260203061435, 'reg_lambda': 0.41788190128449454}. Best is trial 13 with value: 0.9751824817518249. [I 2025-07-05 17:01:18,107] Trial 29 finished with value: 0.970985401459854 and parameters: {'max_depth': 4, 'learning_rate': 0.15030775283564718, 'n_estimators': 199, 'subsample': 0.7044931943987416, 'colsample_bytree': 0.7655984142539816, 'gamma': 0.36950632967326275, 'reg_alpha': 1.9495951483918534, 'reg_lambda': 1.2119217944187735}. Best is trial 13 with value: 0.9751824817518249.
Best accuracy: 0.9751824817518249
Best params: {'max_depth': 6, 'learning_rate': 0.19993133942721536, 'n_estimators': 276, 'subsample': 0.7251177654643216, 'colsample_bytree': 0.6079517527565417, 'gamma': 0.15967881788558375, 'reg_alpha': 0.060784190738781975, 'reg_lambda': 0.40006734610382677}
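Each Optuna trial above is scored on a single 80/20 split, so the reported best accuracy is somewhat noisy and optimistic. Since KFold and cross_val_score are already imported, a sketch of a cross-validated objective (shown here with a reduced search space; the full space from the cell above could be reused) might look like this:
def objective_cv(trial):
    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 100, 400),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'random_state': SEED,
        'eval_metric': 'logloss',
    }
    model = xg.XGBClassifier(**params)
    cv = KFold(n_splits=5, shuffle=True, random_state=SEED)
    # The mean accuracy over 5 folds is a steadier signal than a single split
    scores = cross_val_score(model, X_resampled, y_resampled, cv=cv, scoring='accuracy')
    return scores.mean()

# study_cv = optuna.create_study(direction='maximize')
# study_cv.optimize(objective_cv, n_trials=30)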
In [100]:
submission1 = pd.read_csv("sample_submission.csv")
model = xg.XGBClassifier(**study.best_params)
model.fit(X, y)
predictions = model.predict(X_test)
submission = pd.DataFrame({
    'id': test['id'],
    'Personality': predictions
})
accuracy = accuracy_score(y, model.predict(X))
print(f"Model accuracy on training data: {accuracy:.4f}")
submission['Personality'] = le.inverse_transform(submission['Personality'])
submission.to_csv('submission1.csv', index=False)
Model accuracy on training data: 0.9738
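shap was imported in the setup cell but never used; a minimal sketch of how the tuned model could be explained with SHAP, assuming model and X from the cell above are still in memory:
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)
# Global summary: which features push predictions toward Introvert (class 1) vs. Extrovert (class 0)
shap.summary_plot(shap_values, X)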
In [101]:
submission.head()
Out[101]:
|   | id | Personality |
|---|----|-------------|
| 0 | 18524 | Extrovert |
| 1 | 18525 | Introvert |
| 2 | 18526 | Extrovert |
| 3 | 18527 | Extrovert |
| 4 | 18528 | Introvert |
In [102]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xg
# Define base models
base_estimators = [
    ('rf', RandomForestClassifier(random_state=SEED)),
    ('xgb', xg.XGBClassifier(random_state=SEED, use_label_encoder=False, eval_metric='logloss'))
]
# Meta-model
meta_model = LogisticRegression(random_state=SEED)
# Create stacking classifier
stack = StackingClassifier(
    estimators=base_estimators,
    final_estimator=meta_model,
    cv=5,
    n_jobs=-1,
    passthrough=True
)
# Fit on resampled data
stack.fit(X_train, y_train)
# Evaluate on validation set
val_preds = stack.predict(X_val)
val_acc = accuracy_score(y_val, val_preds)
print(f"Stacked model validation accuracy: {val_acc:.4f}")
Stacked model validation accuracy: 0.9728
In [103]:
submission2 = pd.read_csv("sample_submission.csv")
stack.fit(X, y)
predictions = stack.predict(X_test)
submission2 = pd.DataFrame({
    'id': test['id'],
    'Personality': predictions
})
accuracy = accuracy_score(y, stack.predict(X))
print(f"Model accuracy on training data: {accuracy:.4f}")
submission2['Personality'] = le.inverse_transform(submission2['Personality'])
submission2.to_csv('submission2.csv', index=False)
Model accuracy on training data: 0.9740
In [104]:
# Suggestions to potentially increase accuracy:
# 1. Feature Engineering:
# - Create new features from existing ones, such as ratios, sums, or interactions.
# - Example: Social_activity_score = Social_event_attendance + Going_outside + Friends_circle_size
train['Social_activity_score'] = (
    train['Social_event_attendance'].fillna(0) +
    train['Going_outside'].fillna(0) +
    train['Friends_circle_size'].fillna(0)
)
test['Social_activity_score'] = (
    test['Social_event_attendance'].fillna(0) +
    test['Going_outside'].fillna(0) +
    test['Friends_circle_size'].fillna(0)
)
# 2. Binning/Numerical Categorization:
#    - Bin continuous features into categories (e.g., low/medium/high).
#    - Example: Bin 'Time_spent_Alone' into quantiles. The bin edges are learned on
#      train and reused on test so the categories mean the same thing in both sets.
train['Alone_bin'], alone_bins = pd.qcut(
    train['Time_spent_Alone'].fillna(-1), q=4, labels=False, retbins=True, duplicates='drop'
)
test['Alone_bin'] = pd.cut(
    test['Time_spent_Alone'].fillna(-1).clip(alone_bins[0], alone_bins[-1]),
    bins=alone_bins, labels=False, include_lowest=True
)
# 3. Interaction Features:
# - Multiply or combine features that may interact.
# - Example: Stage_fear * Drained_after_socializing
train['Fear_Drained'] = (
    train['Stage_fear'].fillna('Missing').astype(str) + "_" +
    train['Drained_after_socializing'].fillna('Missing').astype(str)
)
test['Fear_Drained'] = (
    test['Stage_fear'].fillna('Missing').astype(str) + "_" +
    test['Drained_after_socializing'].fillna('Missing').astype(str)
)
# 4. Outlier Handling:
#    - Cap/floor extreme values in numerical features (bounds taken from train and
#      applied to both sets so they stay consistent).
for col in ['Time_spent_Alone', 'Social_event_attendance', 'Going_outside', 'Friends_circle_size', 'Post_frequency']:
    lower, upper = train[col].quantile(0.01), train[col].quantile(0.99)
    train[col] = train[col].clip(lower=lower, upper=upper)
    test[col] = test[col].clip(lower=lower, upper=upper)
# 5. Model Ensembling:
#    - Try blending/averaging predictions from multiple models (e.g., XGBoost,
#      RandomForest, Neural Network); a small blending sketch follows this cell.
# 6. Hyperparameter Tuning:
#    - Use Optuna or GridSearchCV for more models, not just XGBoost.
# 7. Feature Selection:
#    - Use feature importance from models or SHAP to drop uninformative features.
# After adding new features, remember to update your preprocessing and model training code to include them.
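To make suggestion 5 concrete, here is a minimal blending sketch. It assumes the fitted model (tuned XGBoost) and stack (stacking classifier) from the earlier cells are still in memory, and the 50/50 weights are arbitrary placeholders that would need tuning:
# Average the predicted probability of class 1 (Introvert) from two already-fitted models
xgb_proba = model.predict_proba(X_test)[:, 1]
stack_proba = stack.predict_proba(X_test)[:, 1]
blend_proba = 0.5 * xgb_proba + 0.5 * stack_proba
blend_preds = (blend_proba >= 0.5).astype(int)

submission_blend = pd.DataFrame({
    'id': test['id'],
    'Personality': le.inverse_transform(blend_preds)
})
submission_blend.to_csv('submission_blend.csv', index=False)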
In [105]:
train.head()
Out[105]:
|   | id | Time_spent_Alone | Stage_fear | Social_event_attendance | Going_outside | Drained_after_socializing | Friends_circle_size | Post_frequency | Personality | Social_activity_score | Alone_bin | Fear_Drained |
|---|----|------------------|------------|-------------------------|---------------|---------------------------|---------------------|----------------|-------------|-----------------------|-----------|--------------|
| 0 | 0 | 0.0 | No | 6.0 | 4.0 | No | 15.0 | 5.0 | Extrovert | 25.0 | 0 | No_No |
| 1 | 1 | 1.0 | No | 7.0 | 3.0 | No | 10.0 | 8.0 | Extrovert | 20.0 | 0 | No_No |
| 2 | 2 | 6.0 | Yes | 1.0 | 0.0 | NaN | 3.0 | 0.0 | Introvert | 4.0 | 3 | Yes_Missing |
| 3 | 3 | 3.0 | No | 7.0 | 3.0 | No | 11.0 | 5.0 | Extrovert | 21.0 | 2 | No_No |
| 4 | 4 | 1.0 | No | 4.0 | 4.0 | No | 13.0 | NaN | Extrovert | 21.0 | 0 | No_No |
In [111]:
# Update feature set to include new engineered features
feature_cols = [
    'Time_spent_Alone', 'Stage_fear', 'Social_event_attendance', 'Going_outside',
    'Drained_after_socializing', 'Friends_circle_size', 'Post_frequency',
    'Social_activity_score', 'Alone_bin'
]
X_new = train[feature_cols].copy()
X_test_new = test[feature_cols].copy()
# Encode the new categorical interaction feature 'Fear_Drained' using LabelEncoder
le_fd = LabelEncoder()
X_new['Fear_Drained'] = le_fd.fit_transform(train['Fear_Drained'].fillna('Missing'))
X_test_new['Fear_Drained'] = le_fd.transform(test['Fear_Drained'].fillna('Missing'))
print(X_new.dtypes)
# Encode the remaining Yes/No columns in place (assigning back to the columns
# rather than overwriting the whole frame)
for col in ['Stage_fear', 'Drained_after_socializing']:
    le_col = LabelEncoder()
    X_new[col] = le_col.fit_transform(X_new[col].fillna('Missing'))
    X_test_new[col] = le_col.transform(X_test_new[col].fillna('Missing'))
# Fill any remaining NaNs with median
X_new = X_new.fillna(X_new.median())
X_test_new = X_test_new.fillna(X_test_new.median())
# Encode target
y_new = le.fit_transform(train['Personality'])
# Balance classes with SMOTE
X_resampled_new, y_resampled_new = sampler.fit_resample(X_new, y_new)
# Train/test split
X_train_new, X_val_new, y_train_new, y_val_new = train_test_split(
    X_resampled_new, y_resampled_new, test_size=0.2, random_state=SEED
)
# Retrain XGBoost with best params from Optuna
model_new = xg.XGBClassifier(**study.best_params)
model_new.fit(X_train_new, y_train_new)
val_preds_new = model_new.predict(X_val_new)
val_acc_new = accuracy_score(y_val_new, val_preds_new)
print(f"Validation accuracy with new features: {val_acc_new:.4f}")
# Train on all data and predict for submission
model_new.fit(X_new, y_new)
predictions_new = model_new.predict(X_test_new)
submission_new = pd.DataFrame({
    'id': test['id'],
    'Personality': le.inverse_transform(predictions_new)
})
submission_new.to_csv('submission_new_features.csv', index=False)
submission_new.head()
Time_spent_Alone             float64
Stage_fear                    object
Social_event_attendance      float64
Going_outside                float64
Drained_after_socializing     object
Friends_circle_size          float64
Post_frequency               float64
Social_activity_score        float64
Alone_bin                      int64
Fear_Drained                   int32
dtype: object
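As a quick, optional sanity check on the engineered features, the two submissions can be compared directly (assuming both submission and submission_new are still in memory); a large disagreement rate would suggest the new features change the model's behaviour substantially:
# Fraction of test rows where the baseline and new-feature submissions agree
agreement = (submission['Personality'].values == submission_new['Personality'].values).mean()
print(f"Agreement between baseline and new-feature submissions: {agreement:.3%}")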