⭐ 1. Introduction & Overview
Your Goal: Your task is to predict the listening time of a podcast episode.
🔹 2. Import Libraries & Set Up
In [1]:
# General
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning
import xgboost as xg
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.metrics import root_mean_squared_error  # needs scikit-learn >= 1.4
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

# Settings
import warnings
warnings.filterwarnings("ignore")

# Set random seed for reproducibility
SEED = 42
np.random.seed(SEED)

print("Libraries loaded. Ready to go!")
Libraries loaded. Ready to go!
🔹 3. Load & Explore Data
In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
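Before exploring, a quick size check on the two splits can save confusion later (a small sketch; it assumes nothing beyond the two frames just loaded):
In [ ]:
# How many rows and columns are we working with?
print("train:", train.shape)
print("test:", test.shape)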
In [3]:
train.head()
Out[3]:
 | id | Podcast_Name | Episode_Title | Episode_Length_minutes | Genre | Host_Popularity_percentage | Publication_Day | Publication_Time | Guest_Popularity_percentage | Number_of_Ads | Episode_Sentiment | Listening_Time_minutes |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | Mystery Matters | Episode 98 | NaN | True Crime | 74.81 | Thursday | Night | NaN | 0.0 | Positive | 31.41998 |
1 | 1 | Joke Junction | Episode 26 | 119.80 | Comedy | 66.95 | Saturday | Afternoon | 75.95 | 2.0 | Negative | 88.01241 |
2 | 2 | Study Sessions | Episode 16 | 73.90 | Education | 69.97 | Tuesday | Evening | 8.97 | 0.0 | Negative | 44.92531 |
3 | 3 | Digital Digest | Episode 45 | 67.17 | Technology | 57.22 | Monday | Morning | 78.70 | 2.0 | Positive | 46.27824 |
4 | 4 | Mind & Body | Episode 86 | 110.51 | Health | 80.07 | Monday | Afternoon | 58.68 | 3.0 | Neutral | 75.61031 |
In [5]:
train.dtypes
Out[5]:
id                               int64
Podcast_Name                    object
Episode_Title                   object
Episode_Length_minutes         float64
Genre                           object
Host_Popularity_percentage     float64
Publication_Day                 object
Publication_Time                object
Guest_Popularity_percentage    float64
Number_of_Ads                  float64
Episode_Sentiment               object
Listening_Time_minutes         float64
dtype: object
In [4]:
FEATURES = train.drop(columns=['id', 'Listening_Time_minutes']).columns.tolist()
TARGET = 'Listening_Time_minutes'
CATS = []
NUM = []
In [5]:
# Split the feature list by dtype: object columns are categorical, the rest numerical
for c in FEATURES:
    if train[c].dtype == "object":
        CATS.append(c)
    else:
        NUM.append(c)
In [8]:
print(f"FEATURES: {FEATURES}\n")
print(f"CATEGORICAL FEATURES: {CATS}\n")
print(f"NUMERICAL FEATURES: {NUM}\n")
FEATURES: ['Podcast_Name', 'Episode_Title', 'Episode_Length_minutes', 'Genre', 'Host_Popularity_percentage', 'Publication_Day', 'Publication_Time', 'Guest_Popularity_percentage', 'Number_of_Ads', 'Episode_Sentiment']

CATEGORICAL FEATURES: ['Podcast_Name', 'Episode_Title', 'Genre', 'Publication_Day', 'Publication_Time', 'Episode_Sentiment']

NUMERICAL FEATURES: ['Episode_Length_minutes', 'Host_Popularity_percentage', 'Guest_Popularity_percentage', 'Number_of_Ads']
In [6]:
train.isnull().sum()
Out[6]:
id                                  0
Podcast_Name                        0
Episode_Title                       0
Episode_Length_minutes          87093
Genre                               0
Host_Popularity_percentage          0
Publication_Day                     0
Publication_Time                    0
Guest_Popularity_percentage    146030
Number_of_Ads                       1
Episode_Sentiment                   0
Listening_Time_minutes              0
dtype: int64
In [10]:
test.isnull().sum()
Out[10]:
id                                 0
Podcast_Name                       0
Episode_Title                      0
Episode_Length_minutes         28736
Genre                              0
Host_Popularity_percentage         0
Publication_Day                    0
Publication_Time                   0
Guest_Popularity_percentage    48832
Number_of_Ads                      0
Episode_Sentiment                  0
dtype: int64
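Raw counts are easier to judge as fractions of each split. A quick sketch using the frames already loaded:
In [ ]:
# Share of missing values per column (only columns with any missing shown)
for name, df in [("train", train), ("test", test)]:
    frac = df.isnull().mean()
    print(name)
    print(frac[frac > 0].round(4), "\n")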
🔹 4. Data Visualization & EDA
In [11]:
def plot_nums(dataframe):
    # Histogram + KDE for every numeric column, laid out three plots per row
    float_cols = [col for col in dataframe.columns if dataframe[col].dtype == "float64" or dataframe[col].dtype == "int64"]
    cols_per_row = 3
    num_plots = len(float_cols)
    rows = (num_plots // cols_per_row) + (num_plots % cols_per_row > 0)
    fig, axes = plt.subplots(rows, cols_per_row, figsize=(15, 5 * rows))
    axes = axes.flatten()
    for idx, col in enumerate(float_cols):
        sns.histplot(dataframe[col], bins=50, kde=True, ax=axes[idx])
        axes[idx].set_title(f"Distribution of {col}")
    # Remove any unused subplots in the last row
    for i in range(idx + 1, len(axes)):
        fig.delaxes(axes[i])
    plt.tight_layout()
    plt.show()
plot_nums(train)
In [12]:
def heatmap_nums(dataframe):
    heatmap_train = dataframe.select_dtypes(include=["float64", "int64"])
    corr_matrix = heatmap_train.corr()
    # Report feature pairs whose absolute correlation exceeds this threshold
    threshold = 0.75
    high_corr_pairs = (
        corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
        .stack()
        .reset_index()
    )
    high_corr_pairs.columns = ["Feature 1", "Feature 2", "Correlation"]
    high_corr_pairs = high_corr_pairs[high_corr_pairs["Correlation"].abs() > threshold]
    plt.figure(figsize=(30, 12))
    sns.heatmap(corr_matrix, annot=True, cmap="coolwarm")
    plt.title("Feature Correlation Matrix")
    plt.show()
    print("Highly correlated feature pairs (above threshold):")
    print(high_corr_pairs)

heatmap_nums(train)
Highly correlated feature pairs (above threshold):
                Feature 1               Feature 2  Correlation
8  Episode_Length_minutes  Listening_Time_minutes     0.916749
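With Episode_Length_minutes correlating at roughly 0.92 with the target, it is worth looking at that relationship directly. A minimal sketch (sampling rows so the scatter stays readable; seaborn drops the NaNs):
In [ ]:
# Scatter of the dominant feature against the target, on a random sample
sample = train.sample(10_000, random_state=SEED)
sns.scatterplot(data=sample, x="Episode_Length_minutes", y="Listening_Time_minutes", alpha=0.2)
plt.title("Listening time vs. episode length")
plt.show()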
🔹 5. Feature Engineering
In [7]:
train.isnull().sum()
Out[7]:
id                                  0
Podcast_Name                        0
Episode_Title                       0
Episode_Length_minutes          87093
Genre                               0
Host_Popularity_percentage          0
Publication_Day                     0
Publication_Time                    0
Guest_Popularity_percentage    146030
Number_of_Ads                       1
Episode_Sentiment                   0
Listening_Time_minutes              0
dtype: int64
In [8]:
test.isnull().sum()
Out[8]:
id                                 0
Podcast_Name                       0
Episode_Title                      0
Episode_Length_minutes         28736
Genre                              0
Host_Popularity_percentage         0
Publication_Day                    0
Publication_Time                   0
Guest_Popularity_percentage    48832
Number_of_Ads                      0
Episode_Sentiment                  0
dtype: int64
In [9]:
# Numeric NaNs: fill with the train means (the same means are applied to test)
train[NUM] = train[NUM].fillna(train[NUM].mean())
test[NUM] = test[NUM].fillna(train[NUM].mean())

# Categorical NaNs: fill with the train modes
train[CATS] = train[CATS].apply(lambda x: x.fillna(x.mode()[0]))
test[CATS] = test[CATS].apply(lambda x: x.fillna(train[x.name].mode()[0]))
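Mean/mode imputation as above can also be expressed with scikit-learn's SimpleImputer, which keeps the "fit on train, apply to test" discipline explicit. A sketch of the equivalent alternative (instead of, not on top of, the cell above):
In [ ]:
from sklearn.impute import SimpleImputer

# Fit the imputation statistics on train only, then reuse them on test
num_imputer = SimpleImputer(strategy="mean")
cat_imputer = SimpleImputer(strategy="most_frequent")
train[NUM] = num_imputer.fit_transform(train[NUM])
test[NUM] = num_imputer.transform(test[NUM])
train[CATS] = cat_imputer.fit_transform(train[CATS])
test[CATS] = cat_imputer.transform(test[CATS])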
In [10]:
train.isnull().sum()
Out[10]:
id                             0
Podcast_Name                   0
Episode_Title                  0
Episode_Length_minutes         0
Genre                          0
Host_Popularity_percentage     0
Publication_Day                0
Publication_Time               0
Guest_Popularity_percentage    0
Number_of_Ads                  0
Episode_Sentiment              0
Listening_Time_minutes         0
dtype: int64
In [11]:
# Cast numeric features to int32. Note this truncates the decimals (e.g. 119.80 -> 119)
# and is applied to train only, so test keeps float64 columns (visible in test.head() below).
for c in NUM:
    train[c] = train[c].astype("int32")
In [12]:
train.sample(3)
Out[12]:
 | id | Podcast_Name | Episode_Title | Episode_Length_minutes | Genre | Host_Popularity_percentage | Publication_Day | Publication_Time | Guest_Popularity_percentage | Number_of_Ads | Episode_Sentiment | Listening_Time_minutes |
---|---|---|---|---|---|---|---|---|---|---|---|---|
404846 | 404846 | Daily Digest | Episode 74 | 46 | News | 58 | Wednesday | Night | 81 | 2 | Positive | 20.91979 |
580313 | 580313 | Joke Junction | Episode 30 | 45 | Comedy | 76 | Wednesday | Night | 45 | 0 | Positive | 42.66054 |
552086 | 552086 | Business Briefs | Episode 16 | 54 | Business | 96 | Saturday | Afternoon | 52 | 3 | Positive | 43.30069 |
In [19]:
train.dtypes
Out[19]:
id                               int64
Podcast_Name                    object
Episode_Title                   object
Episode_Length_minutes           int32
Genre                           object
Host_Popularity_percentage       int32
Publication_Day                 object
Publication_Time                object
Guest_Popularity_percentage      int32
Number_of_Ads                    int32
Episode_Sentiment               object
Listening_Time_minutes         float64
dtype: object
In [13]:
print(CATS)
['Podcast_Name', 'Episode_Title', 'Genre', 'Publication_Day', 'Publication_Time', 'Episode_Sentiment']
In [14]:
# Grouped for reference only; the encoding below treats every column in CATS the same way
NOMINAL = ['Publication_Day', 'Publication_Time', 'Episode_Sentiment']
ORDINAL = ['Podcast_Name', 'Genre', 'Episode_Title']
In [15]:
# Fit one encoder per column on the union of train and test values, so both splits
# share a single mapping (fitting separately on test would assign inconsistent codes)
for col in CATS:
    le = LabelEncoder()
    le.fit(pd.concat([train[col], test[col]]))
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])
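LabelEncoder is really meant for 1-D targets; for feature columns, scikit-learn's OrdinalEncoder does the same mapping column-wise and can be told what to do with categories seen only in test. A sketch of that alternative (instead of, not in addition to, the loop above):
In [ ]:
from sklearn.preprocessing import OrdinalEncoder

# Fit on train; categories unseen at fit time are mapped to -1 instead of raising
enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
train[CATS] = enc.fit_transform(train[CATS])
test[CATS] = enc.transform(test[CATS])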
In [23]:
train.head()
Out[23]:
 | id | Podcast_Name | Episode_Title | Episode_Length_minutes | Genre | Host_Popularity_percentage | Publication_Day | Publication_Time | Guest_Popularity_percentage | Number_of_Ads | Episode_Sentiment | Listening_Time_minutes |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 34 | 98 | 64 | 9 | 74 | 4 | 3 | 52 | 0 | 2 | 31.41998 |
1 | 1 | 24 | 19 | 119 | 1 | 66 | 2 | 0 | 75 | 2 | 0 | 88.01241 |
2 | 2 | 40 | 8 | 73 | 2 | 69 | 5 | 1 | 8 | 0 | 0 | 44.92531 |
3 | 3 | 10 | 40 | 67 | 8 | 57 | 1 | 2 | 78 | 2 | 2 | 46.27824 |
4 | 4 | 31 | 85 | 110 | 3 | 80 | 1 | 0 | 58 | 3 | 1 | 75.61031 |
In [16]:
test.head()
Out[16]:
 | id | Podcast_Name | Episode_Title | Episode_Length_minutes | Genre | Host_Popularity_percentage | Publication_Day | Publication_Time | Guest_Popularity_percentage | Number_of_Ads | Episode_Sentiment |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 750000 | 11 | 71 | 78.96 | 2 | 38.11 | 2 | 1 | 53.330000 | 1.0 | 1 |
1 | 750001 | 36 | 16 | 27.87 | 5 | 71.29 | 3 | 2 | 52.236449 | 0.0 | 1 |
2 | 750002 | 24 | 3 | 69.10 | 1 | 67.89 | 0 | 1 | 97.510000 | 0.0 | 2 |
3 | 750003 | 4 | 71 | 115.39 | 1 | 23.40 | 3 | 2 | 51.750000 | 2.0 | 2 |
4 | 750004 | 27 | 46 | 72.32 | 4 | 58.10 | 6 | 2 | 11.300000 | 2.0 | 1 |
🔹 6. XGBoost, KFold, CV
In [17]:
FEATURES = train.columns.tolist()
FEATURES.remove('id')
FEATURES.remove('Listening_Time_minutes')  # the target should not sit in the feature list
In [26]:
X = train.drop(columns=['id', 'Listening_Time_minutes'])
X_test = test.drop(columns=['id', 'Listening_Time_minutes'], errors='ignore')
y = train["Listening_Time_minutes"]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, shuffle=True, random_state=SEED)

def quick_eval(model, X_train, y_train):
    # Uses the global X_val / y_val hold-out defined above
    model.fit(X_train, y_train)
    predictions_val = model.predict(X_val)
    kf = KFold(n_splits=5, shuffle=True, random_state=SEED)
    # NB: MSLE is undefined for negative predictions, so cross_val_score returns
    # nan whenever a model's CV predictions dip below zero (see the output below)
    cv_scores = cross_val_score(model, X_train, y_train, cv=kf, scoring='neg_mean_squared_log_error')
    rmse = root_mean_squared_error(y_val, predictions_val)
    print(f"*** {model.__class__.__name__} ***")
    print(f"Root mean squared error (val): {rmse}")
    print(f"Mean CV MSLE: {-np.mean(cv_scores):.4f}")
    print(f"CV Std Dev: {np.std(cv_scores):.4f}")
    stars = len(model.__class__.__name__) + 8
    print("*" * stars)
    print("\n")

quick_eval(xg.XGBRegressor(), X_train, y_train)
quick_eval(CatBoostRegressor(silent=True), X_train, y_train)
quick_eval(LGBMRegressor(verbose=0), X_train, y_train)

# Disabled: quick single-model submission path (expects a fitted `model`)
if False:
    model.fit(X, y)
    predictions = model.predict(X_test)
    sub1 = pd.read_csv("sample_submission.csv")
    sub1.Listening_Time_minutes = predictions
    sub1.to_csv("xgboost.csv", index=False)
    print("Sub shape:", sub1.shape)
    sub1.head()
*** XGBRegressor ***
Root mean squared error (val): 13.00044741913928
Mean CV MSLE: nan
CV Std Dev: nan
********************


*** CatBoostRegressor ***
Root mean squared error (val): 13.000353001312508
Mean CV MSLE: nan
CV Std Dev: nan
*************************


*** LGBMRegressor ***
Root mean squared error (val): 13.065073378579445
Mean CV MSLE: 0.1761
CV Std Dev: 0.0008
*********************
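The nan MSLE values for XGBRegressor and CatBoostRegressor are not a scoring bug: mean squared log error is undefined for negative predictions, and both models produce a few negative predictions inside the CV folds. Since everything in this notebook is judged by RMSE anyway, a scorer that matches it directly avoids the problem. A minimal sketch, reusing X_train / y_train from the cell above:
In [ ]:
# RMSE-based cross-validation; no nan issue because RMSE is defined for any real prediction
kf = KFold(n_splits=5, shuffle=True, random_state=SEED)
cv_rmse = -cross_val_score(xg.XGBRegressor(), X_train, y_train, cv=kf,
                           scoring='neg_root_mean_squared_error')
print(f"Mean CV RMSE: {cv_rmse.mean():.4f} +/- {cv_rmse.std():.4f}")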
In [27]:
model_catboost = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    loss_function='RMSE',
    eval_metric='RMSE',
    verbose=0,
    random_seed=SEED
)

quick_eval(model_catboost, X_train, y_train)
*** CatBoostRegressor ***
Root mean squared error (val): 13.003884298930313
Mean CV MSLE: nan
CV Std Dev: nan
*************************
In [28]:
if False:
    from sklearn.model_selection import RandomizedSearchCV

    param_dist = {
        'n_estimators': [100, 300, 500],
        'learning_rate': [0.01, 0.05, 0.1, 0.2],
        'max_depth': [3, 5, 7, 9],
        'min_child_weight': [1, 3, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'gamma': [0, 0.1, 0.2],
        'reg_alpha': [0, 0.1, 0.5],
        'reg_lambda': [1, 1.5, 2]
    }

    xgb = xg.XGBRegressor(random_state=42)
    random_search = RandomizedSearchCV(
        xgb,
        param_distributions=param_dist,
        n_iter=50,
        scoring='neg_mean_squared_error',
        cv=5,
        verbose=1,
        random_state=42,
        n_jobs=-1
    )
    random_search.fit(X_train, y_train)
    print("Best params:", random_search.best_params_)
    xgb_best_params_ = random_search.best_params_
In [18]:
# Best parameters found by the RandomizedSearchCV above, hard-coded so the search does not have to be re-run
xgb_best_params_ = {'subsample': 1.0, 'reg_lambda': 1, 'reg_alpha': 0.5, 'n_estimators': 500, 'min_child_weight': 1, 'max_depth': 9, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 0.6}
🔹 7. XGBoost + Fine-Tuning
In [28]:
X = train.drop(columns=['id', 'Listening_Time_minutes'])
X_test = test.drop(columns=['id', 'Listening_Time_minutes'], errors='ignore')
y = train["Listening_Time_minutes"]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=SEED)

model = xg.XGBRegressor(**xgb_best_params_)
model.fit(X_train, y_train)

# quick_eval is defined in the Section 6 cell; after a kernel restart that cell
# must be re-run first, otherwise this call raises the NameError shown below.
quick_eval(model, X_train, y_train)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[28], line 11
---> 11 quick_eval(model, X_train, y_train)

NameError: name 'quick_eval' is not defined
In [31]:
# Refit on the full training set, then predict on the test set for submission
model.fit(X, y)
predictions = model.predict(X_test)
sub1 = pd.read_csv("sample_submission.csv")
sub1.Listening_Time_minutes = predictions
sub1.to_csv("submission1.csv", index=False)
print("Sub shape:", sub1.shape)
sub1.head()
Sub shape: (250000, 2)
Out[31]:
 | id | Listening_Time_minutes |
---|---|---|
0 | 750000 | 53.295830 |
1 | 750001 | 18.408051 |
2 | 750002 | 46.309479 |
3 | 750003 | 76.650085 |
4 | 750004 | 47.720146 |
🔹 8. Model Averaging: XGBoost, LinearRegression, RandomForest
In [32]:
FOLDS = 5
kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)

oof_pred = np.zeros(len(train))
xgb_pred = np.zeros(len(test))

for train_idx, val_idx in kf.split(train):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    xg_model = xg.XGBRegressor(**xgb_best_params_)
    xg_model.fit(X_train, y_train)
    y_pred = xg_model.predict(X_val)
    oof_pred[val_idx] = y_pred
    xgb_pred += xg_model.predict(X_test)
    print(f"Fold RMSE: {root_mean_squared_error(y_val, y_pred)}")  # swap RMSE for accuracy/recall/F1 when adapting this loop to classification

final_rmse = root_mean_squared_error(y, oof_pred)
print(f"Final Cross-Validation RMSE: {final_rmse}")
xgb_pred /= FOLDS
Fold RMSE: 12.828010543133558
Fold RMSE: 12.8868670697745
Fold RMSE: 12.884075027703568
Fold RMSE: 12.87833924047012
Fold RMSE: 12.830077576774563
Final Cross-Validation RMSE: 12.861501459059626
In [33]:
%time
# NB: a bare %time line magic times only its own (empty) statement -- hence the
# 0 ns readings below; use %%time as the cell's first line to time the whole loop.
kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)

oof_pred = np.zeros(len(train))
lr_pred = np.zeros(len(test))

for train_idx, val_idx in kf.split(train):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    lr_model = LinearRegression()
    lr_model.fit(X_train, y_train)
    y_pred = lr_model.predict(X_val)
    oof_pred[val_idx] = y_pred
    lr_pred += lr_model.predict(X_test)
    print(f"Fold RMSE: {root_mean_squared_error(y_val, y_pred)}")

final_rmse = root_mean_squared_error(y, oof_pred)
print(f"Final Cross-Validation RMSE: {final_rmse}")
lr_pred /= FOLDS
CPU times: total: 0 ns
Wall time: 0 ns
Fold RMSE: 13.360517747661406
Fold RMSE: 13.41513548139313
Fold RMSE: 13.388394145730924
Fold RMSE: 13.406016297805706
Fold RMSE: 13.364590591376729
Final Cross-Validation RMSE: 13.386948471726468
In [34]:
kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)

oof_pred = np.zeros(len(train))
rf_pred = np.zeros(len(test))

for train_idx, val_idx in kf.split(train):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    rf_model = RandomForestRegressor(n_estimators=50, n_jobs=-1, random_state=42)
    rf_model.fit(X_train, y_train)
    y_pred = rf_model.predict(X_val)
    oof_pred[val_idx] = y_pred
    rf_pred += rf_model.predict(X_test)
    print(f"Fold RMSE: {root_mean_squared_error(y_val, y_pred)}")

final_rmse = root_mean_squared_error(y, oof_pred)
print(f"Final Cross-Validation RMSE: {final_rmse}")
rf_pred /= FOLDS
Fold RMSE: 12.820272853502713
Fold RMSE: 12.864690073880709
Fold RMSE: 12.870606446700181
Fold RMSE: 12.86273183937078
Fold RMSE: 12.8293064078577
Final Cross-Validation RMSE: 12.849537970832767
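The three fold loops above differ only in the model they construct. A small helper removes the duplication; this is a sketch (the name kfold_oof is ours, not from the original), assuming the same X, y, X_test, and FOLDS defined above:
In [ ]:
def kfold_oof(make_model, X, y, X_test, folds=FOLDS, seed=42):
    """KFold CV: return out-of-fold predictions and fold-averaged test predictions."""
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
    oof = np.zeros(len(X))
    test_pred = np.zeros(len(X_test))
    for train_idx, val_idx in kf.split(X):
        model = make_model()  # fresh model per fold
        model.fit(X.iloc[train_idx], y.iloc[train_idx])
        oof[val_idx] = model.predict(X.iloc[val_idx])
        test_pred += model.predict(X_test) / folds
    print(f"OOF RMSE: {root_mean_squared_error(y, oof):.6f}")
    return oof, test_pred

# Usage, e.g.: oof_xgb, xgb_pred = kfold_oof(lambda: xg.XGBRegressor(**xgb_best_params_), X, y, X_test)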
In [35]:
sub2 = pd.read_csv("sample_submission.csv")
best_public = pd.read_csv("best_public_2.csv")
best_public_preds = best_public.Listening_Time_minutes.values

# Weighted blend of XGBoost, an external public submission, and RandomForest
# (the LinearRegression fold predictions above are computed but not used here)
sub2.Listening_Time_minutes = 0.3 * xgb_pred + 0.4 * best_public_preds + 0.3 * rf_pred
sub2.to_csv("ensemble1.csv", index=False)
print("Sub shape:", sub2.shape)
sub2.head()
Sub shape: (250000, 2)
Out[35]:
 | id | Listening_Time_minutes |
---|---|---|
0 | 750000 | 54.120394 |
1 | 750001 | 19.712607 |
2 | 750002 | 49.018499 |
3 | 750003 | 75.467411 |
4 | 750004 | 45.882480 |
🔹 9. Model Stacking
In [19]:
X = train.drop(columns=['id', 'Listening_Time_minutes'])
X_test = test.drop(columns=['id', 'Listening_Time_minutes'], errors='ignore')
y = train["Listening_Time_minutes"]
In [ ]:
if False:
    rf = RandomForestRegressor(random_state=42)
    param_grid = {
        'n_estimators': [150],
        'max_depth': [3, 5, 7],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
    }
    grid_search = GridSearchCV(
        estimator=rf,
        param_grid=param_grid,
        cv=3,
        n_jobs=-1,
        verbose=10,
        scoring='neg_mean_squared_error'
    )
    grid_search.fit(X, y)
    print(f"Best hyperparameters found: {grid_search.best_params_}")
    best_rf = grid_search.best_estimator_
    rf_best_params_ = grid_search.best_params_
Fitting 3 folds for each of 27 candidates, totalling 81 fits
Best hyperparameters found: {'max_depth': 7, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 150}
In [36]:
def generate_stack(models, X, y, X_test, SEED=42, folds=5):
    n_models = len(models)
    meta_features_train = np.zeros((len(X), n_models))
    meta_features_test = np.zeros((len(X_test), n_models))
    kf = KFold(n_splits=folds, shuffle=True, random_state=SEED)
    for i, model in enumerate(models):
        oof = np.zeros(len(X))
        preds = np.zeros(len(X_test))
        for train_idx, val_idx in kf.split(X):
            model.fit(X.iloc[train_idx], y.iloc[train_idx])
            oof[val_idx] = model.predict(X.iloc[val_idx])
            preds += model.predict(X_test) / folds
        meta_features_train[:, i] = oof
        meta_features_test[:, i] = preds
    # Level-2 model fit on the out-of-fold predictions of the base models
    meta_model = LinearRegression()
    meta_model.fit(meta_features_train, y)
    return meta_model, meta_features_train, meta_features_test

X = train.drop(columns=['id', 'Listening_Time_minutes'])
X_test = test.drop(columns=['id', 'Listening_Time_minutes'], errors='ignore')
y = train["Listening_Time_minutes"]

models = [
    xg.XGBRegressor(**xgb_best_params_),
    LinearRegression(),
    RandomForestRegressor()
]

meta_model, meta_features_train, meta_features_test = generate_stack(models, X, y, X_test)
In [37]:
meta_model
Out[37]:
LinearRegression()
In [38]:
stacked_preds = meta_model.predict(meta_features_train)
final_rmse = root_mean_squared_error(y, stacked_preds)
print(f"Final Cross-Validation RMSE: {final_rmse:.4f}")
Final Cross-Validation RMSE: 12.6743
Final Cross-Validation RMSE: 12.6753 with meta-model = LinearRegression
Final Cross-Validation RMSE: 12.7772 with meta-model = XGB (best params) and an untuned RandomForest
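A caveat on the numbers above: the meta-model is evaluated on the same meta-features it was fit on, so the RMSE is mildly optimistic. Cross-validating the meta-model over the out-of-fold matrix gives a more honest estimate; a small sketch, reusing meta_features_train and y:
In [ ]:
# Cross-validate the level-2 model on the OOF meta-features
kf = KFold(n_splits=5, shuffle=True, random_state=SEED)
scores = -cross_val_score(LinearRegression(), meta_features_train, y, cv=kf,
                          scoring='neg_root_mean_squared_error')
print(f"Meta-model CV RMSE: {scores.mean():.4f} +/- {scores.std():.4f}")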
In [40]:
sub3 = pd.read_csv("sample_submission.csv")
final_preds = meta_model.predict(meta_features_test)
sub3.Listening_Time_minutes = final_preds
sub3.to_csv("stackingzzz.csv", index=False)
print("Sub shape:", sub3.shape)
sub3.head()
Sub shape: (250000, 2)
Out[40]:
 | id | Listening_Time_minutes |
---|---|---|
0 | 750000 | 53.288106 |
1 | 750001 | 18.968049 |
2 | 750002 | 48.712395 |
3 | 750003 | 73.825208 |
4 | 750004 | 43.750943 |
Submit this!!!
🔹 10. Model Averaging with Best Public Score
In [ ]:
d1 = pd.read_csv('submission3.csv')['Listening_Time_minutes'].values
d2 = pd.read_csv('submission1.csv')['Listening_Time_minutes'].values

# Extrapolating blend: the weights sum to 1, with a small negative weight on d2
# nudging the result slightly beyond d1
x_Listening_Time_minutes = d2 * -0.01 + d1 * 1.01
sub4 = pd.read_csv("sample_submission.csv")
sub4.Listening_Time_minutes = x_Listening_Time_minutes
sub4.to_csv('ensemble2.csv', index=False)
sub4
Out[ ]:
 | id | Listening_Time_minutes |
---|---|---|
0 | 750000 | 53.926783 |
1 | 750001 | 19.251744 |
2 | 750002 | 47.882719 |
3 | 750003 | 73.604575 |
4 | 750004 | 44.573747 |
... | ... | ... |
249995 | 999995 | 12.356622 |
249996 | 999996 | 60.152370 |
249997 | 999997 | 7.304994 |
249998 | 999998 | 75.179575 |
249999 | 999999 | 59.607581 |
250000 rows × 2 columns
In [32]:
x1 = pd.read_csv('submission_stacked.csv')['Listening_Time_minutes'].values
x2 = pd.read_csv('submission1.csv')['Listening_Time_minutes'].values
x3 = pd.read_csv("submission3.csv")['Listening_Time_minutes'].values
x4 = pd.read_csv("stackingzzz.csv")['Listening_Time_minutes'].values  # fixed: was misspelled "satckingzzz.csv", which raised FileNotFoundError

# Simple equal-weight average of the four submissions
x_Listening_Time_minutes = x1 * 0.25 + x2 * 0.25 + x3 * 0.25 + x4 * 0.25
sub4 = pd.read_csv("sample_submission.csv")
sub4.Listening_Time_minutes = x_Listening_Time_minutes
sub4.to_csv('ultra.csv', index=False)
sub4