#1¶

Kaggle competition: https://www.kaggle.com/competitions/spaceship-titanic/

Entry by Robin P.M. Kras

In [ ]:
import pandas as pd
In [26]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
In [27]:
train.head()
Out[27]:
PassengerId HomePlanet CryoSleep Cabin Destination Age VIP RoomService FoodCourt ShoppingMall Spa VRDeck Name Transported
0 0001_01 Europa False B/0/P TRAPPIST-1e 39.0 False 0.0 0.0 0.0 0.0 0.0 Maham Ofracculy False
1 0002_01 Earth False F/0/S TRAPPIST-1e 24.0 False 109.0 9.0 25.0 549.0 44.0 Juanna Vines True
2 0003_01 Europa False A/0/S TRAPPIST-1e 58.0 True 43.0 3576.0 0.0 6715.0 49.0 Altark Susent False
3 0003_02 Europa False A/0/S TRAPPIST-1e 33.0 False 0.0 1283.0 371.0 3329.0 193.0 Solam Susent False
4 0004_01 Earth False F/1/S TRAPPIST-1e 16.0 False 303.0 70.0 151.0 565.0 2.0 Willy Santantines True
In [28]:
test.head()
Out[28]:
PassengerId HomePlanet CryoSleep Cabin Destination Age VIP RoomService FoodCourt ShoppingMall Spa VRDeck Name
0 0013_01 Earth True G/3/S TRAPPIST-1e 27.0 False 0.0 0.0 0.0 0.0 0.0 Nelly Carsoning
1 0018_01 Earth False F/4/S TRAPPIST-1e 19.0 False 0.0 9.0 0.0 2823.0 0.0 Lerome Peckers
2 0019_01 Europa True C/0/S 55 Cancri e 31.0 False 0.0 0.0 0.0 0.0 0.0 Sabih Unhearfus
3 0021_01 Europa False C/1/S TRAPPIST-1e 38.0 False 0.0 6652.0 0.0 181.0 585.0 Meratz Caltilter
4 0023_01 Earth False F/5/S TRAPPIST-1e 20.0 False 10.0 0.0 635.0 0.0 0.0 Brence Harperez
In [29]:
# Remove unwanted or unrelated columns:
train.drop(columns=['HomePlanet','Name','Destination', 'Cabin'], inplace=True)
test.drop(columns=['HomePlanet','Name','Destination', 'Cabin'], inplace=True)
In [30]:
train.head()
Out[30]:
PassengerId CryoSleep Age VIP RoomService FoodCourt ShoppingMall Spa VRDeck Transported
0 0001_01 False 39.0 False 0.0 0.0 0.0 0.0 0.0 False
1 0002_01 False 24.0 False 109.0 9.0 25.0 549.0 44.0 True
2 0003_01 False 58.0 True 43.0 3576.0 0.0 6715.0 49.0 False
3 0003_02 False 33.0 False 0.0 1283.0 371.0 3329.0 193.0 False
4 0004_01 False 16.0 False 303.0 70.0 151.0 565.0 2.0 True
In [31]:
test.head()
Out[31]:
PassengerId CryoSleep Age VIP RoomService FoodCourt ShoppingMall Spa VRDeck
0 0013_01 True 27.0 False 0.0 0.0 0.0 0.0 0.0
1 0018_01 False 19.0 False 0.0 9.0 0.0 2823.0 0.0
2 0019_01 True 31.0 False 0.0 0.0 0.0 0.0 0.0
3 0021_01 False 38.0 False 0.0 6652.0 0.0 181.0 585.0
4 0023_01 False 20.0 False 10.0 0.0 635.0 0.0 0.0

Preprocessing¶

In [32]:
# Removing potential whitespace
train.columns = train.columns.str.strip()
test.columns = test.columns.str.strip()
In [33]:
print(f"Train set, null count: \n{train.isnull().sum()}")
print("\n")
print(f"Test set, null count: \n{test.isnull().sum()}")
Train set, null count: 
PassengerId       0
CryoSleep       217
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Transported       0
dtype: int64


Test set, null count: 
PassengerId       0
CryoSleep        93
Age              91
VIP              93
RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
dtype: int64
In [34]:
train.shape
Out[34]:
(8693, 10)
In [35]:
test.shape
Out[35]:
(4277, 9)
In [36]:
test.head()
Out[36]:
PassengerId CryoSleep Age VIP RoomService FoodCourt ShoppingMall Spa VRDeck
0 0013_01 True 27.0 False 0.0 0.0 0.0 0.0 0.0
1 0018_01 False 19.0 False 0.0 9.0 0.0 2823.0 0.0
2 0019_01 True 31.0 False 0.0 0.0 0.0 0.0 0.0
3 0021_01 False 38.0 False 0.0 6652.0 0.0 181.0 585.0
4 0023_01 False 20.0 False 10.0 0.0 635.0 0.0 0.0
In [37]:
from sklearn.impute import SimpleImputer

numeric_cols = train.select_dtypes(include=['number']).columns

imputer = SimpleImputer(strategy='mean')
train[numeric_cols] = imputer.fit_transform(train[numeric_cols])
test[numeric_cols] = imputer.transform(test[numeric_cols])
In [38]:
print(f"Train set, null count: \n{train.isnull().sum()}")
print("\n")
print(f"Test set, null count: \n{test.isnull().sum()}")
Train set, null count: 
PassengerId       0
CryoSleep       217
Age               0
VIP             203
RoomService       0
FoodCourt         0
ShoppingMall      0
Spa               0
VRDeck            0
Transported       0
dtype: int64


Test set, null count: 
PassengerId      0
CryoSleep       93
Age              0
VIP             93
RoomService      0
FoodCourt        0
ShoppingMall     0
Spa              0
VRDeck           0
dtype: int64
In [39]:
train = train.fillna(False)
test = test.fillna(False)
C:\Users\robkr\AppData\Local\Temp\ipykernel_12540\826194202.py:1: FutureWarning: Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
  train = train.fillna(False)
C:\Users\robkr\AppData\Local\Temp\ipykernel_12540\826194202.py:2: FutureWarning: Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
  test = test.fillna(False)
In [40]:
test.head()
Out[40]:
PassengerId CryoSleep Age VIP RoomService FoodCourt ShoppingMall Spa VRDeck
0 0013_01 True 27.0 False 0.0 0.0 0.0 0.0 0.0
1 0018_01 False 19.0 False 0.0 9.0 0.0 2823.0 0.0
2 0019_01 True 31.0 False 0.0 0.0 0.0 0.0 0.0
3 0021_01 False 38.0 False 0.0 6652.0 0.0 181.0 585.0
4 0023_01 False 20.0 False 10.0 0.0 635.0 0.0 0.0
In [41]:
columns_to_convert = ['CryoSleep', 'VIP']

for column in columns_to_convert:
    train[column] = train[column].map({True: 1, False: 0, 'Yes': 1, 'No': 0}).astype(int)
    test[column] = test[column].map({True: 1, False: 0, 'Yes': 1, 'No': 0}).astype(int)
In [42]:
test.head()
Out[42]:
PassengerId CryoSleep Age VIP RoomService FoodCourt ShoppingMall Spa VRDeck
0 0013_01 1 27.0 0 0.0 0.0 0.0 0.0 0.0
1 0018_01 0 19.0 0 0.0 9.0 0.0 2823.0 0.0
2 0019_01 1 31.0 0 0.0 0.0 0.0 0.0 0.0
3 0021_01 0 38.0 0 0.0 6652.0 0.0 181.0 585.0
4 0023_01 0 20.0 0 10.0 0.0 635.0 0.0 0.0

Feature Engineering¶

In [43]:
train['TotalMeanSpending'] = (train['RoomService'] + train['FoodCourt'] + train['ShoppingMall'] + train['Spa'] + train['VRDeck']) / 5
test['TotalMeanSpending'] = (train['RoomService'] + train['FoodCourt'] + train['ShoppingMall'] + train['Spa'] + train['VRDeck']) / 5

Training and Testing¶

In [44]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
In [45]:
X = train.drop(columns=['PassengerId', 'Transported']) 
y = train['Transported']  

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
In [46]:
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train)
X_val = imputer.transform(X_val)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)  
In [47]:
model = LogisticRegression(max_iter=500)
model.fit(X_train_scaled, y_train)

X_test = test.drop(columns=['PassengerId'], errors='ignore')  
X_test = imputer.transform(X_test)  
X_test_scaled = scaler.transform(X_test)  
In [48]:
test_predictions = model.predict(X_test_scaled)

submission_df = pd.DataFrame({
    'PassengerId': test['PassengerId'],  
    'Transported': test_predictions      
})

submission_df.to_csv('submission.csv', index=False)

print(submission_df.head())
  PassengerId  Transported
0     0013_01         True
1     0018_01        False
2     0019_01        False
3     0021_01         True
4     0023_01         True

Final score of 76.011%. Not great.