import pandas as pd

# Load the raw training and test sets (paths are relative to the working directory).
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Removing potential whitespace from the header names. This has to happen
# before any name-based column selection: a padded name such as ' Cabin'
# would otherwise make the drop below raise a KeyError.
train.columns = train.columns.str.strip()
test.columns = test.columns.str.strip()

train.head()

test.head()

# Remove unwanted or unrelated columns:
unused_columns = ['HomePlanet', 'Name', 'Destination', 'Cabin']
train.drop(columns=unused_columns, inplace=True)
test.drop(columns=unused_columns, inplace=True)

train.head()

test.head()

# Report how many values are missing in each remaining column of both frames.
print(f"Train set, null count: \n{train.isnull().sum()}")
print("\n")
print(f"Test set, null count: \n{test.isnull().sum()}")

Train set, null count: 
PassengerId       0
CryoSleep       217
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Transported       0
dtype: int64


Test set, null count: 
PassengerId       0
CryoSleep        93
Age              91
VIP              93
RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
dtype: int64

# Inspect the (rows, columns) dimensions of the training frame after the column drop.
train.shape

(8693, 10)

# Inspect the (rows, columns) dimensions of the test frame after the column drop.
test.shape

(4277, 9)

test.head()

from sklearn.impute import SimpleImputer

# The numeric columns are discovered on the training frame and the same
# selection is reused for the test frame.
numeric_cols = train.select_dtypes(include=['number']).columns

# Learn the per-column means from the training data only, then apply those
# same means to both frames so the test set never influences the fill values.
imputer = SimpleImputer(strategy='mean').fit(train[numeric_cols])
train[numeric_cols] = imputer.transform(train[numeric_cols])
test[numeric_cols] = imputer.transform(test[numeric_cols])

# Re-check the per-column missing-value counts after imputation.
train_nulls = train.isnull().sum()
test_nulls = test.isnull().sum()
print(f"Train set, null count: \n{train_nulls}")
print("\n")
print(f"Test set, null count: \n{test_nulls}")

Train set, null count: 
PassengerId       0
CryoSleep       217
Age               0
VIP             203
RoomService       0
FoodCourt         0
ShoppingMall      0
Spa               0
VRDeck            0
Transported       0
dtype: int64


Test set, null count: 
PassengerId      0
CryoSleep       93
Age              0
VIP             93
RoomService      0
FoodCourt        0
ShoppingMall     0
Spa              0
VRDeck           0
dtype: int64

# After the numeric imputation, the only columns that still report missing
# values are the True/False flags CryoSleep and VIP, so fill exactly those
# (missing -> False). Going through pandas' nullable 'boolean' dtype yields a
# plain bool column without relying on the silent object-dtype downcasting
# that a frame-wide fillna(False) triggers (deprecated, emits a FutureWarning).
flag_columns = ['CryoSleep', 'VIP']
for frame in (train, test):
    for flag in flag_columns:
        frame[flag] = frame[flag].astype('boolean').fillna(False).astype(bool)

C:\Users\robkr\AppData\Local\Temp\ipykernel_12540\826194202.py:1: FutureWarning: Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
  train = train.fillna(False)
C:\Users\robkr\AppData\Local\Temp\ipykernel_12540\826194202.py:2: FutureWarning: Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
  test = test.fillna(False)

test.head()

columns_to_convert = ['CryoSleep', 'VIP']

# Encode the flag columns as 0/1 integers. The mapping accepts both boolean
# values and 'Yes'/'No' strings; any other value maps to NaN and makes the
# astype(int) cast raise instead of passing through silently.
flag_encoding = {True: 1, False: 0, 'Yes': 1, 'No': 0}
for column in columns_to_convert:
    train[column] = train[column].map(flag_encoding).astype(int)
    test[column] = test[column].map(flag_encoding).astype(int)

test.head()

# Per-row mean of the five spending columns, computed from each frame's OWN
# columns. The test feature was previously derived from the training frame,
# which (through index alignment) gave test passengers the spending of
# unrelated training rows.
spending_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
for frame in (train, test):
    # skipna=False keeps the NaN-propagating behaviour of plain `+` addition.
    frame['TotalMeanSpending'] = (
        frame[spending_columns].sum(axis=1, skipna=False) / len(spending_columns)
    )

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

# Split the training frame into features and target; PassengerId and the
# target itself are excluded from the feature matrix.
X = train.drop(columns=['PassengerId', 'Transported'])
y = train['Transported']

# Hold out 20% of the labelled rows for validation (fixed seed for repeatability).
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the imputer and the scaler on the training split only, then reuse the
# fitted transformers on the validation and test data.
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train)
X_val = imputer.transform(X_val)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

model = LogisticRegression(max_iter=500)
model.fit(X_train_scaled, y_train)

# Actually use the held-out split: report the mean accuracy on it so the
# model's quality is known before a submission is written.
val_accuracy = model.score(X_val_scaled, y_val)
print(f"Validation accuracy: {val_accuracy:.4f}")

# Select the test features by name, in the training column order. The fitted
# transformers and model work on positional arrays, so a missing or reordered
# column must fail loudly (KeyError) rather than silently misalign features.
X_test = test[X.columns]
X_test = imputer.transform(X_test)
X_test_scaled = scaler.transform(X_test)

test_predictions = model.predict(X_test_scaled)

# Pair every test passenger id with its predicted label and write the file.
submission_df = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Transported': test_predictions
})

submission_df.to_csv('submission.csv', index=False)

print(submission_df.head())

  PassengerId  Transported
0     0013_01         True
1     0018_01        False
2     0019_01        False
3     0021_01         True
4     0023_01         True

	PassengerId	HomePlanet	CryoSleep	Cabin	Destination	Age	VIP	RoomService	FoodCourt	ShoppingMall	Spa	VRDeck	Name	Transported
0	0001_01	Europa	False	B/0/P	TRAPPIST-1e	39.0	False	0.0	0.0	0.0	0.0	0.0	Maham Ofracculy	False
1	0002_01	Earth	False	F/0/S	TRAPPIST-1e	24.0	False	109.0	9.0	25.0	549.0	44.0	Juanna Vines	True
2	0003_01	Europa	False	A/0/S	TRAPPIST-1e	58.0	True	43.0	3576.0	0.0	6715.0	49.0	Altark Susent	False
3	0003_02	Europa	False	A/0/S	TRAPPIST-1e	33.0	False	0.0	1283.0	371.0	3329.0	193.0	Solam Susent	False
4	0004_01	Earth	False	F/1/S	TRAPPIST-1e	16.0	False	303.0	70.0	151.0	565.0	2.0	Willy Santantines	True

	PassengerId	HomePlanet	CryoSleep	Cabin	Destination	Age	VIP	RoomService	FoodCourt	ShoppingMall	Spa	VRDeck	Name
0	0013_01	Earth	True	G/3/S	TRAPPIST-1e	27.0	False	0.0	0.0	0.0	0.0	0.0	Nelly Carsoning
1	0018_01	Earth	False	F/4/S	TRAPPIST-1e	19.0	False	0.0	9.0	0.0	2823.0	0.0	Lerome Peckers
2	0019_01	Europa	True	C/0/S	55 Cancri e	31.0	False	0.0	0.0	0.0	0.0	0.0	Sabih Unhearfus
3	0021_01	Europa	False	C/1/S	TRAPPIST-1e	38.0	False	0.0	6652.0	0.0	181.0	585.0	Meratz Caltilter
4	0023_01	Earth	False	F/5/S	TRAPPIST-1e	20.0	False	10.0	0.0	635.0	0.0	0.0	Brence Harperez

	PassengerId	CryoSleep	Age	VIP	RoomService	FoodCourt	ShoppingMall	Spa	VRDeck	Transported
0	0001_01	False	39.0	False	0.0	0.0	0.0	0.0	0.0	False
1	0002_01	False	24.0	False	109.0	9.0	25.0	549.0	44.0	True
2	0003_01	False	58.0	True	43.0	3576.0	0.0	6715.0	49.0	False
3	0003_02	False	33.0	False	0.0	1283.0	371.0	3329.0	193.0	False
4	0004_01	False	16.0	False	303.0	70.0	151.0	565.0	2.0	True

	PassengerId	CryoSleep	Age	VIP	RoomService	FoodCourt	ShoppingMall	Spa	VRDeck
0	0013_01	True	27.0	False	0.0	0.0	0.0	0.0	0.0
1	0018_01	False	19.0	False	0.0	9.0	0.0	2823.0	0.0
2	0019_01	True	31.0	False	0.0	0.0	0.0	0.0	0.0
3	0021_01	False	38.0	False	0.0	6652.0	0.0	181.0	585.0
4	0023_01	False	20.0	False	10.0	0.0	635.0	0.0	0.0

	PassengerId	CryoSleep	Age	VIP	RoomService	FoodCourt	ShoppingMall	Spa	VRDeck
0	0013_01	True	27.0	False	0.0	0.0	0.0	0.0	0.0
1	0018_01	False	19.0	False	0.0	9.0	0.0	2823.0	0.0
2	0019_01	True	31.0	False	0.0	0.0	0.0	0.0	0.0
3	0021_01	False	38.0	False	0.0	6652.0	0.0	181.0	585.0
4	0023_01	False	20.0	False	10.0	0.0	635.0	0.0	0.0

#1

Preprocessing

Feature Engineering

Training and Testing