#1¶
Kaggle competition: https://www.kaggle.com/competitions/spaceship-titanic/
Entry by Robin P.M. Kras
In [ ]:
import pandas as pd
In [26]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
In [27]:
train.head()
Out[27]:
PassengerId | HomePlanet | CryoSleep | Cabin | Destination | Age | VIP | RoomService | FoodCourt | ShoppingMall | Spa | VRDeck | Name | Transported | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0001_01 | Europa | False | B/0/P | TRAPPIST-1e | 39.0 | False | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Maham Ofracculy | False |
1 | 0002_01 | Earth | False | F/0/S | TRAPPIST-1e | 24.0 | False | 109.0 | 9.0 | 25.0 | 549.0 | 44.0 | Juanna Vines | True |
2 | 0003_01 | Europa | False | A/0/S | TRAPPIST-1e | 58.0 | True | 43.0 | 3576.0 | 0.0 | 6715.0 | 49.0 | Altark Susent | False |
3 | 0003_02 | Europa | False | A/0/S | TRAPPIST-1e | 33.0 | False | 0.0 | 1283.0 | 371.0 | 3329.0 | 193.0 | Solam Susent | False |
4 | 0004_01 | Earth | False | F/1/S | TRAPPIST-1e | 16.0 | False | 303.0 | 70.0 | 151.0 | 565.0 | 2.0 | Willy Santantines | True |
In [28]:
test.head()
Out[28]:
PassengerId | HomePlanet | CryoSleep | Cabin | Destination | Age | VIP | RoomService | FoodCourt | ShoppingMall | Spa | VRDeck | Name | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0013_01 | Earth | True | G/3/S | TRAPPIST-1e | 27.0 | False | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Nelly Carsoning |
1 | 0018_01 | Earth | False | F/4/S | TRAPPIST-1e | 19.0 | False | 0.0 | 9.0 | 0.0 | 2823.0 | 0.0 | Lerome Peckers |
2 | 0019_01 | Europa | True | C/0/S | 55 Cancri e | 31.0 | False | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Sabih Unhearfus |
3 | 0021_01 | Europa | False | C/1/S | TRAPPIST-1e | 38.0 | False | 0.0 | 6652.0 | 0.0 | 181.0 | 585.0 | Meratz Caltilter |
4 | 0023_01 | Earth | False | F/5/S | TRAPPIST-1e | 20.0 | False | 10.0 | 0.0 | 635.0 | 0.0 | 0.0 | Brence Harperez |
In [29]:
# Remove unwanted or unrelated columns:
train.drop(columns=['HomePlanet','Name','Destination', 'Cabin'], inplace=True)
test.drop(columns=['HomePlanet','Name','Destination', 'Cabin'], inplace=True)
In [30]:
train.head()
Out[30]:
PassengerId | CryoSleep | Age | VIP | RoomService | FoodCourt | ShoppingMall | Spa | VRDeck | Transported | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 0001_01 | False | 39.0 | False | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | False |
1 | 0002_01 | False | 24.0 | False | 109.0 | 9.0 | 25.0 | 549.0 | 44.0 | True |
2 | 0003_01 | False | 58.0 | True | 43.0 | 3576.0 | 0.0 | 6715.0 | 49.0 | False |
3 | 0003_02 | False | 33.0 | False | 0.0 | 1283.0 | 371.0 | 3329.0 | 193.0 | False |
4 | 0004_01 | False | 16.0 | False | 303.0 | 70.0 | 151.0 | 565.0 | 2.0 | True |
In [31]:
test.head()
Out[31]:
PassengerId | CryoSleep | Age | VIP | RoomService | FoodCourt | ShoppingMall | Spa | VRDeck | |
---|---|---|---|---|---|---|---|---|---|
0 | 0013_01 | True | 27.0 | False | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
1 | 0018_01 | False | 19.0 | False | 0.0 | 9.0 | 0.0 | 2823.0 | 0.0 |
2 | 0019_01 | True | 31.0 | False | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
3 | 0021_01 | False | 38.0 | False | 0.0 | 6652.0 | 0.0 | 181.0 | 585.0 |
4 | 0023_01 | False | 20.0 | False | 10.0 | 0.0 | 635.0 | 0.0 | 0.0 |
Preprocessing¶
In [32]:
# Removing potential whitespace
train.columns = train.columns.str.strip()
test.columns = test.columns.str.strip()
In [33]:
print(f"Train set, null count: \n{train.isnull().sum()}")
print("\n")
print(f"Test set, null count: \n{test.isnull().sum()}")
Train set, null count: PassengerId 0 CryoSleep 217 Age 179 VIP 203 RoomService 181 FoodCourt 183 ShoppingMall 208 Spa 183 VRDeck 188 Transported 0 dtype: int64 Test set, null count: PassengerId 0 CryoSleep 93 Age 91 VIP 93 RoomService 82 FoodCourt 106 ShoppingMall 98 Spa 101 VRDeck 80 dtype: int64
In [34]:
train.shape
Out[34]:
(8693, 10)
In [35]:
test.shape
Out[35]:
(4277, 9)
In [36]:
test.head()
Out[36]:
PassengerId | CryoSleep | Age | VIP | RoomService | FoodCourt | ShoppingMall | Spa | VRDeck | |
---|---|---|---|---|---|---|---|---|---|
0 | 0013_01 | True | 27.0 | False | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
1 | 0018_01 | False | 19.0 | False | 0.0 | 9.0 | 0.0 | 2823.0 | 0.0 |
2 | 0019_01 | True | 31.0 | False | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
3 | 0021_01 | False | 38.0 | False | 0.0 | 6652.0 | 0.0 | 181.0 | 585.0 |
4 | 0023_01 | False | 20.0 | False | 10.0 | 0.0 | 635.0 | 0.0 | 0.0 |
In [37]:
from sklearn.impute import SimpleImputer
numeric_cols = train.select_dtypes(include=['number']).columns
imputer = SimpleImputer(strategy='mean')
train[numeric_cols] = imputer.fit_transform(train[numeric_cols])
test[numeric_cols] = imputer.transform(test[numeric_cols])
In [38]:
print(f"Train set, null count: \n{train.isnull().sum()}")
print("\n")
print(f"Test set, null count: \n{test.isnull().sum()}")
Train set, null count: PassengerId 0 CryoSleep 217 Age 0 VIP 203 RoomService 0 FoodCourt 0 ShoppingMall 0 Spa 0 VRDeck 0 Transported 0 dtype: int64 Test set, null count: PassengerId 0 CryoSleep 93 Age 0 VIP 93 RoomService 0 FoodCourt 0 ShoppingMall 0 Spa 0 VRDeck 0 dtype: int64
In [39]:
train = train.fillna(False)
test = test.fillna(False)
C:\Users\robkr\AppData\Local\Temp\ipykernel_12540\826194202.py:1: FutureWarning: Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)` train = train.fillna(False) C:\Users\robkr\AppData\Local\Temp\ipykernel_12540\826194202.py:2: FutureWarning: Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)` test = test.fillna(False)
In [40]:
test.head()
Out[40]:
PassengerId | CryoSleep | Age | VIP | RoomService | FoodCourt | ShoppingMall | Spa | VRDeck | |
---|---|---|---|---|---|---|---|---|---|
0 | 0013_01 | True | 27.0 | False | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
1 | 0018_01 | False | 19.0 | False | 0.0 | 9.0 | 0.0 | 2823.0 | 0.0 |
2 | 0019_01 | True | 31.0 | False | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
3 | 0021_01 | False | 38.0 | False | 0.0 | 6652.0 | 0.0 | 181.0 | 585.0 |
4 | 0023_01 | False | 20.0 | False | 10.0 | 0.0 | 635.0 | 0.0 | 0.0 |
In [41]:
columns_to_convert = ['CryoSleep', 'VIP']
for column in columns_to_convert:
train[column] = train[column].map({True: 1, False: 0, 'Yes': 1, 'No': 0}).astype(int)
test[column] = test[column].map({True: 1, False: 0, 'Yes': 1, 'No': 0}).astype(int)
In [42]:
test.head()
Out[42]:
PassengerId | CryoSleep | Age | VIP | RoomService | FoodCourt | ShoppingMall | Spa | VRDeck | |
---|---|---|---|---|---|---|---|---|---|
0 | 0013_01 | 1 | 27.0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
1 | 0018_01 | 0 | 19.0 | 0 | 0.0 | 9.0 | 0.0 | 2823.0 | 0.0 |
2 | 0019_01 | 1 | 31.0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
3 | 0021_01 | 0 | 38.0 | 0 | 0.0 | 6652.0 | 0.0 | 181.0 | 585.0 |
4 | 0023_01 | 0 | 20.0 | 0 | 10.0 | 0.0 | 635.0 | 0.0 | 0.0 |
Feature Engineering¶
In [43]:
train['TotalMeanSpending'] = (train['RoomService'] + train['FoodCourt'] + train['ShoppingMall'] + train['Spa'] + train['VRDeck']) / 5
test['TotalMeanSpending'] = (train['RoomService'] + train['FoodCourt'] + train['ShoppingMall'] + train['Spa'] + train['VRDeck']) / 5
Training and Testing¶
In [44]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
In [45]:
X = train.drop(columns=['PassengerId', 'Transported'])
y = train['Transported']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
In [46]:
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train)
X_val = imputer.transform(X_val)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
In [47]:
model = LogisticRegression(max_iter=500)
model.fit(X_train_scaled, y_train)
X_test = test.drop(columns=['PassengerId'], errors='ignore')
X_test = imputer.transform(X_test)
X_test_scaled = scaler.transform(X_test)
In [48]:
test_predictions = model.predict(X_test_scaled)
submission_df = pd.DataFrame({
'PassengerId': test['PassengerId'],
'Transported': test_predictions
})
submission_df.to_csv('submission.csv', index=False)
print(submission_df.head())
PassengerId Transported 0 0013_01 True 1 0018_01 False 2 0019_01 False 3 0021_01 True 4 0023_01 True
Final score of 76.011%. Not great.