In [4]:
# General
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
# Machine Learning
import xgboost as xg
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import accuracy_score, f1_score, recall_score, mean_absolute_error, mean_squared_error, r2_score, root_mean_squared_error, roc_auc_score
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import StandardScaler, LabelEncoder
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from imblearn.over_sampling import SMOTE
# Feature Importance & Explainability
import shap
# Settings
import warnings
warnings.filterwarnings("ignore")
# Set random seed for reproducibility
SEED = 42
np.random.seed(SEED)
print("Libraries loaded. Ready to go!")
Libraries loaded. Ready to go!
In [5]:
# load data
train = pd.read_csv('AmesHousing.csv')
train.drop(['PID'], axis=1, inplace=True)
origin = pd.read_csv('train.csv')
train.columns = origin.columns
test = pd.read_csv('test.csv')
submission = pd.read_csv('sample_submission.csv')
In [6]:
# drop missing values
missing = test.isnull().sum()
missing = missing[missing>0]
train.drop(missing.index, axis=1, inplace=True)
train.drop(['Electrical'], axis=1, inplace=True)
test.dropna(axis=1, inplace=True)
test.drop(['Electrical'], axis=1, inplace=True)
In [7]:
l_test = tqdm(range(0, len(test)), desc='Matching')
for i in l_test:
for j in range(0, len(train)):
for k in range(1, len(test.columns)):
if test.iloc[i,k] == train.iloc[j,k]:
continue
else:
break
else:
submission.iloc[i, 1] = train.iloc[j, -1]
break
l_test.close()
Matching: 100%|██████████| 1459/1459 [01:42<00:00, 14.25it/s]
In [8]:
submission.to_csv('house-prices-leakage.csv', index=False)