#5

Kaggle competition: [link]

Entry by Robin R.P.M. Kras

⭐ 1. Introduction & Overview

Your goal: predict the listening time of a podcast episode. Performance throughout this notebook is measured with root mean squared error (RMSE).

🔹 2. Import Libraries & Set Up

In [1]:
# General
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# Machine Learning
import xgboost as xg

from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, mean_absolute_error, mean_squared_error, r2_score, root_mean_squared_error, roc_auc_score
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import StandardScaler, LabelEncoder

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

import lightgbm as lgb
from lightgbm import LGBMRegressor
from catboost import CatBoostClassifier, CatBoostRegressor, Pool

from imblearn.over_sampling import SMOTE

# Feature Importance & Explainability
import shap

# Settings
import warnings
warnings.filterwarnings("ignore")

# Set random seed for reproducibility
SEED = 42
np.random.seed(SEED)

print("Libraries loaded. Ready to go!")
Libraries loaded. Ready to go!

🔹 3. Load & Explore Data

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
In [3]:
train.head()
Out[3]:
id Podcast_Name Episode_Title Episode_Length_minutes Genre Host_Popularity_percentage Publication_Day Publication_Time Guest_Popularity_percentage Number_of_Ads Episode_Sentiment Listening_Time_minutes
0 0 Mystery Matters Episode 98 NaN True Crime 74.81 Thursday Night NaN 0.0 Positive 31.41998
1 1 Joke Junction Episode 26 119.80 Comedy 66.95 Saturday Afternoon 75.95 2.0 Negative 88.01241
2 2 Study Sessions Episode 16 73.90 Education 69.97 Tuesday Evening 8.97 0.0 Negative 44.92531
3 3 Digital Digest Episode 45 67.17 Technology 57.22 Monday Morning 78.70 2.0 Positive 46.27824
4 4 Mind & Body Episode 86 110.51 Health 80.07 Monday Afternoon 58.68 3.0 Neutral 75.61031
In [5]:
train.dtypes
Out[5]:
id                               int64
Podcast_Name                    object
Episode_Title                   object
Episode_Length_minutes         float64
Genre                           object
Host_Popularity_percentage     float64
Publication_Day                 object
Publication_Time                object
Guest_Popularity_percentage    float64
Number_of_Ads                  float64
Episode_Sentiment               object
Listening_Time_minutes         float64
dtype: object
In [4]:
FEATURES = train.drop(columns=['id', 'Listening_Time_minutes']).columns.tolist()
TARGET = 'Listening_Time_minutes'
CATS = []
NUM = []
In [5]:
for c in FEATURES:
    if train[c].dtype == "object":
        CATS.append(c)

for c in FEATURES:
    if c not in CATS:
        NUM.append(c)
In [8]:
print(f"FEATURES: {FEATURES}\n")
print(f"CATEGORICAL FEATURES: {CATS}\n")
print(f"NUMERICAL FEATURES: {NUM}\n")
FEATURES: ['Podcast_Name', 'Episode_Title', 'Episode_Length_minutes', 'Genre', 'Host_Popularity_percentage', 'Publication_Day', 'Publication_Time', 'Guest_Popularity_percentage', 'Number_of_Ads', 'Episode_Sentiment']

CATEGORICAL FEATURES: ['Podcast_Name', 'Episode_Title', 'Genre', 'Publication_Day', 'Publication_Time', 'Episode_Sentiment']

NUMERICAL FEATURES: ['Episode_Length_minutes', 'Host_Popularity_percentage', 'Guest_Popularity_percentage', 'Number_of_Ads']

In [6]:
train.isnull().sum()
Out[6]:
id                                  0
Podcast_Name                        0
Episode_Title                       0
Episode_Length_minutes          87093
Genre                               0
Host_Popularity_percentage          0
Publication_Day                     0
Publication_Time                    0
Guest_Popularity_percentage    146030
Number_of_Ads                       1
Episode_Sentiment                   0
Listening_Time_minutes              0
dtype: int64
In [10]:
test.isnull().sum()
Out[10]:
id                                 0
Podcast_Name                       0
Episode_Title                      0
Episode_Length_minutes         28736
Genre                              0
Host_Popularity_percentage         0
Publication_Day                    0
Publication_Time                   0
Guest_Popularity_percentage    48832
Number_of_Ads                      0
Episode_Sentiment                  0
dtype: int64

🔹 4. Data Visualization & EDA

In [11]:
def plot_nums(dataframe):
    float_cols = [col for col in dataframe.columns if dataframe[col].dtype == "float64" or dataframe[col].dtype == "int64"]

    cols_per_row = 3
    num_plots = len(float_cols)
    rows = (num_plots // cols_per_row) + (num_plots % cols_per_row > 0) 

    fig, axes = plt.subplots(rows, cols_per_row, figsize=(15, 5 * rows)) 
    axes = axes.flatten()  

    for idx, col in enumerate(float_cols):
        sns.histplot(dataframe[col], bins=50, kde=True, ax=axes[idx])
        axes[idx].set_title(f"Distribution of {col}")

    for i in range(idx + 1, len(axes)):
        fig.delaxes(axes[i])

    plt.tight_layout()
    plt.show()

plot_nums(train)
[Figure: distribution histograms (with KDE) for each numeric column of train]
In [12]:
def heatmap_nums(dataframe): 
    heatmap_train = dataframe.select_dtypes(include=["float64", "int64"])

    corr_matrix = heatmap_train.corr()

    # Correlation threshold above which pairs are reported
    threshold = 0.75

    high_corr_pairs = (
        corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)) 
        .stack()  
        .reset_index()
    )

    high_corr_pairs.columns = ["Feature 1", "Feature 2", "Correlation"]
    high_corr_pairs = high_corr_pairs[high_corr_pairs["Correlation"].abs() > threshold]  

    plt.figure(figsize=(30, 12))
    sns.heatmap(corr_matrix, annot=True, cmap="coolwarm")
    plt.title("Feature Correlation Matrix")
    plt.show()

    print("Highly correlated feature pairs (above threshold):")
    print(high_corr_pairs)

heatmap_nums(train)
[Figure: correlation heatmap of the numeric features]
Highly correlated feature pairs (above threshold):
                Feature 1               Feature 2  Correlation
8  Episode_Length_minutes  Listening_Time_minutes     0.916749

🔹 5. Feature Engineering

In [7]:
train.isnull().sum()
Out[7]:
id                                  0
Podcast_Name                        0
Episode_Title                       0
Episode_Length_minutes          87093
Genre                               0
Host_Popularity_percentage          0
Publication_Day                     0
Publication_Time                    0
Guest_Popularity_percentage    146030
Number_of_Ads                       1
Episode_Sentiment                   0
Listening_Time_minutes              0
dtype: int64
In [8]:
test.isnull().sum()
Out[8]:
id                                 0
Podcast_Name                       0
Episode_Title                      0
Episode_Length_minutes         28736
Genre                              0
Host_Popularity_percentage         0
Publication_Day                    0
Publication_Time                   0
Guest_Popularity_percentage    48832
Number_of_Ads                      0
Episode_Sentiment                  0
dtype: int64
In [9]:
# Fill numeric NaNs with the train-set means (also applied to test, to avoid leakage)
train[NUM] = train[NUM].fillna(train[NUM].mean())
test[NUM] = test[NUM].fillna(train[NUM].mean())
# Fill categorical NaNs with the train-set mode of each column
train[CATS] = train[CATS].apply(lambda x: x.fillna(x.mode()[0]))
test[CATS] = test[CATS].apply(lambda x: x.fillna(train[x.name].mode()[0]))
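
The same imputation can be written with scikit-learn's SimpleImputer, which makes it explicit that test is always filled with statistics learned from train. A minimal alternative sketch, assuming the NUM and CATS lists defined above:

from sklearn.impute import SimpleImputer

# Numeric columns: fill NaNs with the train-set mean of each column
num_imputer = SimpleImputer(strategy="mean")
train[NUM] = num_imputer.fit_transform(train[NUM])
test[NUM] = num_imputer.transform(test[NUM])   # reuses the train means

# Categorical columns: fill NaNs with the train-set mode of each column
cat_imputer = SimpleImputer(strategy="most_frequent")
train[CATS] = cat_imputer.fit_transform(train[CATS])
test[CATS] = cat_imputer.transform(test[CATS])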
In [10]:
train.isnull().sum()
Out[10]:
id                             0
Podcast_Name                   0
Episode_Title                  0
Episode_Length_minutes         0
Genre                          0
Host_Popularity_percentage     0
Publication_Day                0
Publication_Time               0
Guest_Popularity_percentage    0
Number_of_Ads                  0
Episode_Sentiment              0
Listening_Time_minutes         0
dtype: int64
In [11]:
# Truncate the (imputed) numeric columns to int32; note that test is left as
# float, so train and test dtypes diverge after this step
for c in NUM:
    train[c] = train[c].astype("int32")
In [12]:
train.sample(3)
Out[12]:
id Podcast_Name Episode_Title Episode_Length_minutes Genre Host_Popularity_percentage Publication_Day Publication_Time Guest_Popularity_percentage Number_of_Ads Episode_Sentiment Listening_Time_minutes
404846 404846 Daily Digest Episode 74 46 News 58 Wednesday Night 81 2 Positive 20.91979
580313 580313 Joke Junction Episode 30 45 Comedy 76 Wednesday Night 45 0 Positive 42.66054
552086 552086 Business Briefs Episode 16 54 Business 96 Saturday Afternoon 52 3 Positive 43.30069
In [19]:
train.dtypes
Out[19]:
id                               int64
Podcast_Name                    object
Episode_Title                   object
Episode_Length_minutes           int32
Genre                           object
Host_Popularity_percentage       int32
Publication_Day                 object
Publication_Time                object
Guest_Popularity_percentage      int32
Number_of_Ads                    int32
Episode_Sentiment               object
Listening_Time_minutes         float64
dtype: object
In [13]:
print(CATS)
['Podcast_Name', 'Episode_Title', 'Genre', 'Publication_Day', 'Publication_Time', 'Episode_Sentiment']
In [14]:
NOMINAL = ['Publication_Day', 'Publication_Time', 'Episode_Sentiment']
ORDINAL = ['Podcast_Name', 'Genre', 'Episode_Title']  # note: these groupings are not used below; all CATS are label-encoded
In [15]:
le = LabelEncoder()

# NOTE: fitting the encoder on train and test separately can assign the same
# category different integers in each split (see the sketch below)
for col in CATS:
    train[col] = le.fit_transform(train[col])
    test[col] = le.fit_transform(test[col])
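
Fitting a fresh LabelEncoder per split, as above, maps the same category to different integers in train and test whenever the sets of observed values differ. A safer alternative sketch, assuming the raw (pre-encoded) columns, fits each encoder once on the union of both splits:

for col in CATS:
    le = LabelEncoder()
    # Learn one mapping from all values seen in either split
    le.fit(pd.concat([train[col], test[col]]))
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])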
In [23]:
train.head()
Out[23]:
id Podcast_Name Episode_Title Episode_Length_minutes Genre Host_Popularity_percentage Publication_Day Publication_Time Guest_Popularity_percentage Number_of_Ads Episode_Sentiment Listening_Time_minutes
0 0 34 98 64 9 74 4 3 52 0 2 31.41998
1 1 24 19 119 1 66 2 0 75 2 0 88.01241
2 2 40 8 73 2 69 5 1 8 0 0 44.92531
3 3 10 40 67 8 57 1 2 78 2 2 46.27824
4 4 31 85 110 3 80 1 0 58 3 1 75.61031
In [16]:
test.head()
Out[16]:
id Podcast_Name Episode_Title Episode_Length_minutes Genre Host_Popularity_percentage Publication_Day Publication_Time Guest_Popularity_percentage Number_of_Ads Episode_Sentiment
0 750000 11 71 78.96 2 38.11 2 1 53.330000 1.0 1
1 750001 36 16 27.87 5 71.29 3 2 52.236449 0.0 1
2 750002 24 3 69.10 1 67.89 0 1 97.510000 0.0 2
3 750003 4 71 115.39 1 23.40 3 2 51.750000 2.0 2
4 750004 27 46 72.32 4 58.10 6 2 11.300000 2.0 1

🔹 6. XGBoost, KFold, CV

In [17]:
FEATURES = train.columns.tolist()
FEATURES.remove('id')
FEATURES.remove('Listening_Time_minutes')  # keep the target out of the feature list
In [26]:
X = train.drop(columns=['id', 'Listening_Time_minutes'])
X_test = test.drop(columns=['id', 'Listening_Time_minutes'], errors='ignore')

y = train["Listening_Time_minutes"]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, shuffle=True, random_state=SEED)

def quick_eval(model, X_train, y_train):  # uses X_val / y_val from the enclosing cell scope
    model.fit(X_train, y_train)

    predictions_val = model.predict(X_val)

    kf = KFold(n_splits=5, shuffle=True, random_state=SEED)

    # MSLE takes a log of the predictions, so cross_val_score returns nan
    # for any model that predicts a negative value on some fold
    cv_scores = cross_val_score(model, X_train, y_train, cv=kf, scoring='neg_mean_squared_log_error')

    rmse = root_mean_squared_error(y_val, predictions_val)
    print(f"*** {model.__class__.__name__} ***")
    print(f"Root mean squared error (val): {rmse}")
    print(f"Mean CV MSLE: {-np.mean(cv_scores):.4f}")
    print(f"CV Std Dev: {np.std(cv_scores):.4f}")

    stars = len(model.__class__.__name__) + 8
    print("*" * stars)
    print("\n")

quick_eval(xg.XGBRegressor(), X_train, y_train)
quick_eval(CatBoostRegressor(silent=True), X_train, y_train)
quick_eval(LGBMRegressor(verbose=0), X_train, y_train)

if False:
    model.fit(X, y) 
    predictions = model.predict(X_test)

    sub1 = pd.read_csv("sample_submission.csv")

    sub1.Listening_Time_minutes = predictions 

    sub1.to_csv("xgboost.csv", index=False)

    print("Sub shape:", sub1.shape)
    sub1.head()
*** XGBRegressor ***
Root mean squared error (val): 13.00044741913928
Mean CV MSLE: nan
CV Std Dev: nan
********************


*** CatBoostRegressor ***
Root mean squared error (val): 13.000353001312508
Mean CV MSLE: nan
CV Std Dev: nan
*************************


*** LGBMRegressor ***
Root mean squared error (val): 13.065073378579445
Mean CV MSLE: 0.1761
CV Std Dev: 0.0008
*********************
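
The nan CV scores above come from 'neg_mean_squared_log_error': MSLE takes a log of the predictions, so it is undefined whenever a regressor predicts a negative value. A minimal workaround sketch using a custom scorer that clips predictions at zero (clipped_msle is an illustrative name, not part of the notebook):

from sklearn.metrics import make_scorer, mean_squared_log_error

def clipped_msle(y_true, y_pred):
    # Clip negatives so the internal log1p stays defined
    return mean_squared_log_error(y_true, np.clip(y_pred, 0, None))

# greater_is_better=False makes cross_val_score return negated values,
# matching the behavior of the built-in 'neg_*' scorers
msle_scorer = make_scorer(clipped_msle, greater_is_better=False)
# cv_scores = cross_val_score(model, X_train, y_train, cv=kf, scoring=msle_scorer)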


In [27]:
model_catboost = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    loss_function='RMSE',
    eval_metric='RMSE',
    verbose=0,
    random_seed=SEED
)

quick_eval(model_catboost, X_train, y_train)
*** CatBoostRegressor ***
Root mean squared error (val): 13.003884298930313
Mean CV MSLE: nan
CV Std Dev: nan
*************************


In [28]:
if False:
    from sklearn.model_selection import RandomizedSearchCV
    import numpy as np

    param_dist = {
        'n_estimators': [100, 300, 500],
        'learning_rate': [0.01, 0.05, 0.1, 0.2],
        'max_depth': [3, 5, 7, 9],
        'min_child_weight': [1, 3, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'gamma': [0, 0.1, 0.2],
        'reg_alpha': [0, 0.1, 0.5],
        'reg_lambda': [1, 1.5, 2]
    }

    xgb = xg.XGBRegressor(random_state=42)
    random_search = RandomizedSearchCV(
        xgb,
        param_distributions=param_dist,
        n_iter=50,
        scoring='neg_mean_squared_error',
        cv=5,
        verbose=1,
        random_state=42,
        n_jobs=-1
    )

    random_search.fit(X_train, y_train)
    print("Best params:", random_search.best_params_)
    xgb_best_params_ = random_search.best_params_
In [18]:
# Skip the search above and reuse the best parameters it previously found

xgb_best_params_ = {'subsample': 1.0, 'reg_lambda': 1, 'reg_alpha': 0.5, 'n_estimators': 500, 'min_child_weight': 1, 'max_depth': 9, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 0.6}

🔹 7. XGBoost + Fine-Tuning

In [28]:
X = train.drop(columns=['id', 'Listening_Time_minutes'])
X_test = test.drop(columns=['id', 'Listening_Time_minutes'], errors='ignore')

y = train["Listening_Time_minutes"]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=SEED)

model = xg.XGBRegressor(**xgb_best_params_)
model.fit(X_train, y_train)

quick_eval(model, X_train, y_train)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[28], line 11
      8 model = xg.XGBRegressor(**xgb_best_params_)
      9 model.fit(X_train, y_train)
---> 11 quick_eval(model, X_train, y_train)

NameError: name 'quick_eval' is not defined

The kernel was evidently restarted before this run, so quick_eval (defined in Section 6) was no longer in scope; re-running that definition cell first resolves the error. The model fit itself completed.
In [31]:
model.fit(X, y)
predictions = model.predict(X_test)

sub1 = pd.read_csv("sample_submission.csv")

sub1.Listening_Time_minutes = predictions 

sub1.to_csv("submission1.csv", index=False)

print("Sub shape:", sub1.shape)
sub1.head()
Sub shape: (250000, 2)
Out[31]:
id Listening_Time_minutes
0 750000 53.295830
1 750001 18.408051
2 750002 46.309479
3 750003 76.650085
4 750004 47.720146

🔹 8. Model Averaging: XGBoost, LinearRegression, RandomForest

In [32]:
FOLDS=5

kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)

oof_pred = np.zeros(len(train))
xgb_pred = np.zeros(len(test))

for train_idx, val_idx in kf.split(train):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    xg_model = xg.XGBRegressor(**xgb_best_params_)
    xg_model.fit(X_train, y_train)
    
    y_pred = xg_model.predict(X_val)
    oof_pred[val_idx] = y_pred
    xgb_pred += xg_model.predict(X_test)
    
    print(f"Fold RMSE: {root_mean_squared_error(y_val, y_pred)}") # Change RMSE by accuracy/recall/f1-score for classification

final_rmse = root_mean_squared_error(y, oof_pred)
print(f"Final Cross-Validation RMSE: {final_rmse}")

xgb_pred /= FOLDS
Fold RMSE: 12.828010543133558
Fold RMSE: 12.8868670697745
Fold RMSE: 12.884075027703568
Fold RMSE: 12.87833924047012
Fold RMSE: 12.830077576774563
Final Cross-Validation RMSE: 12.861501459059626
In [33]:
%time

kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)

oof_pred = np.zeros(len(train))
lr_pred = np.zeros(len(test))

for train_idx, val_idx in kf.split(train):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    lr_model = LinearRegression()
    lr_model.fit(X_train, y_train)
    
    y_pred = lr_model.predict(X_val)
    oof_pred[val_idx] = y_pred 
    lr_pred += lr_model.predict(X_test)
    
    print(f"Fold RMSE: {root_mean_squared_error(y_val, y_pred)}") # Change RMSE by accuracy/recall/f1-score for classification

final_rmse = root_mean_squared_error(y, oof_pred)
print(f"Final Cross-Validation RMSE: {final_rmse}")

lr_pred /= FOLDS
CPU times: total: 0 ns
Wall time: 0 ns
Fold RMSE: 13.360517747661406
Fold RMSE: 13.41513548139313
Fold RMSE: 13.388394145730924
Fold RMSE: 13.406016297805706
Fold RMSE: 13.364590591376729
Final Cross-Validation RMSE: 13.386948471726468
In [34]:
kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)

oof_pred = np.zeros(len(train))
rf_pred = np.zeros(len(test))

for train_idx, val_idx in kf.split(train):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    rf_model = RandomForestRegressor(n_estimators=50, n_jobs=-1, random_state=42)
    rf_model.fit(X_train, y_train)
    
    y_pred = rf_model.predict(X_val)
    oof_pred[val_idx] = y_pred  
    rf_pred += rf_model.predict(X_test)
    
    print(f"Fold RMSE: {root_mean_squared_error(y_val, y_pred)}") # Change RMSE by accuracy/recall/f1-score for classification

final_rmse = root_mean_squared_error(y, oof_pred)
print(f"Final Cross-Validation RMSE: {final_rmse}")

rf_pred /= FOLDS
Fold RMSE: 12.820272853502713
Fold RMSE: 12.864690073880709
Fold RMSE: 12.870606446700181
Fold RMSE: 12.86273183937078
Fold RMSE: 12.8293064078577
Final Cross-Validation RMSE: 12.849537970832767
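
The three cross-validation cells above repeat the same fold loop; it can be factored into a single helper. A sketch under the same setup (kfold_oof and make_model are illustrative names, not part of the notebook):

def kfold_oof(make_model, X, y, X_test, folds=FOLDS, seed=42):
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
    oof = np.zeros(len(X))
    test_pred = np.zeros(len(X_test))
    for train_idx, val_idx in kf.split(X):
        model = make_model()  # fresh model per fold
        model.fit(X.iloc[train_idx], y.iloc[train_idx])
        oof[val_idx] = model.predict(X.iloc[val_idx])
        test_pred += model.predict(X_test) / folds  # average test predictions across folds
    print(f"OOF RMSE: {root_mean_squared_error(y, oof):.6f}")
    return oof, test_pred

# e.g. rf_oof, rf_pred = kfold_oof(lambda: RandomForestRegressor(n_estimators=50, n_jobs=-1, random_state=42), X, y, X_test)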
In [35]:
sub2 = pd.read_csv("sample_submission.csv")

best_public = pd.read_csv("best_public_2.csv")

best_public_preds = best_public.Listening_Time_minutes.values

sub2.Listening_Time_minutes = 0.3 * xgb_pred + 0.4 * best_public_preds + 0.3 * rf_pred

sub2.to_csv("ensemble1.csv", index=False)

print("Sub shape:", sub2.shape)
sub2.head()
Sub shape: (250000, 2)
Out[35]:
id Listening_Time_minutes
0 750000 54.120394
1 750001 19.712607
2 750002 49.018499
3 750003 75.467411
4 750004 45.882480
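
The 0.3/0.4/0.3 weights above are hand-picked. When each model's out-of-fold predictions are kept, blend weights can instead be tuned against the training target. A hedged sketch (xgb_oof and rf_oof are assumed to have been saved from the loops above; best_public has no OOF predictions, so it is left out here):

from scipy.optimize import minimize

def blend_rmse(w, oofs, y_true):
    blend = sum(wi * oof for wi, oof in zip(w, oofs))
    return root_mean_squared_error(y_true, blend)

res = minimize(blend_rmse, x0=[0.5, 0.5], args=([xgb_oof, rf_oof], y),
               bounds=[(0, 1), (0, 1)], method="L-BFGS-B")
print("Tuned blend weights:", res.x)  # weights are not forced to sum to 1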

🔹 9. Model Stacking

In [19]:
X = train.drop(columns=['id', 'Listening_Time_minutes'])
X_test = test.drop(columns=['id', 'Listening_Time_minutes'], errors='ignore')

y = train["Listening_Time_minutes"]
In [ ]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

if False:
    rf = RandomForestRegressor(random_state=42)

    param_grid = {
        'n_estimators': [150],
        'max_depth': [3, 5, 7],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
    }

    grid_search = GridSearchCV(
        estimator=rf,
        param_grid=param_grid,
        cv=3, 
        n_jobs=-1,
        verbose=10,
        scoring='neg_mean_squared_error'
    )

    grid_search.fit(X, y)

    print(f"Best hyperparameters found: {grid_search.best_params_}")

    best_rf = grid_search.best_estimator_
    rf_best_params_ = grid_search.best_params_
Fitting 3 folds for each of 27 candidates, totalling 81 fits
Best hyperparameters found: {'max_depth': 7, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 150}

In [36]:
def generate_stack(models, X, y, X_test, SEED=42, folds=5):
    n_models = len(models)
    meta_features_train = np.zeros((len(X), n_models))  
    meta_features_test = np.zeros((len(X_test), n_models))
    kf = KFold(n_splits=folds, shuffle=True, random_state=SEED)

    for i, model in enumerate(models):
        oof = np.zeros(len(X))
        preds = np.zeros(len(X_test))
        
        for train_idx, val_idx in kf.split(X):
            model.fit(X.iloc[train_idx], y.iloc[train_idx])
            oof[val_idx] = model.predict(X.iloc[val_idx])
            preds += model.predict(X_test) / folds
        
        meta_features_train[:, i] = oof
        meta_features_test[:, i] = preds

    meta_model = LinearRegression()
    meta_model.fit(meta_features_train, y)
    return meta_model, meta_features_train, meta_features_test

X = train.drop(columns=['id', 'Listening_Time_minutes'])
X_test = test.drop(columns=['id', 'Listening_Time_minutes'], errors='ignore')
y = train["Listening_Time_minutes"]

models = [
    xg.XGBRegressor(**xgb_best_params_),
    LinearRegression(),
    RandomForestRegressor()
]

meta_model, meta_features_train, meta_features_test = generate_stack(models, X, y, X_test)
In [37]:
meta_model
Out[37]:
LinearRegression()
In [38]:
stacked_preds = meta_model.predict(meta_features_train)

final_rmse = root_mean_squared_error(y, stacked_preds)
print(f"Final Cross-Validation RMSE: {final_rmse:.4f}")
Final Cross-Validation RMSE: 12.6743
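
Note that this RMSE is computed on the same meta-features the meta-model was fit on, so it is mildly optimistic. A quick hedged check cross-validates the meta-model on the OOF meta-features instead:

cv = cross_val_score(LinearRegression(), meta_features_train, y,
                     cv=KFold(n_splits=5, shuffle=True, random_state=SEED),
                     scoring="neg_root_mean_squared_error")
print(f"Meta-model CV RMSE: {-cv.mean():.4f}")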

Final Cross-Validation RMSE: 12.6753 with meta-model = LinearRegression

Final Cross-Validation RMSE: 12.7772 with meta-model = XGB (best params) and an untuned RandomForest

In [40]:
sub3 = pd.read_csv("sample_submission.csv")

final_preds = meta_model.predict(meta_features_test)

sub3.Listening_Time_minutes = final_preds

sub3.to_csv("stackingzzz.csv", index=False)

print("Sub shape:", sub3.shape)
sub3.head()
Sub shape: (250000, 2)
Out[40]:
id Listening_Time_minutes
0 750000 53.288106
1 750001 18.968049
2 750002 48.712395
3 750003 73.825208
4 750004 43.750943

Submit this!!!

🔹 10. Model Averaging with the Best Public Score

In [ ]:
import pandas as pd

d1 = pd.read_csv('submission3.csv')['Listening_Time_minutes'].values
d2 = pd.read_csv('submission1.csv')['Listening_Time_minutes'].values
# The small negative weight on d2 extrapolates slightly beyond d1, away from submission1
x_Listening_Time_minutes = d2 * -0.01 + d1 * 1.01

sub4 = pd.read_csv("sample_submission.csv")
    
sub4.Listening_Time_minutes = x_Listening_Time_minutes

sub4.to_csv('ensemble2.csv', index=False)
sub4
Out[ ]:
id Listening_Time_minutes
0 750000 53.926783
1 750001 19.251744
2 750002 47.882719
3 750003 73.604575
4 750004 44.573747
... ... ...
249995 999995 12.356622
249996 999996 60.152370
249997 999997 7.304994
249998 999998 75.179575
249999 999999 59.607581

250000 rows × 2 columns

In [32]:
import pandas as pd

x1 = pd.read_csv('submission_stacked.csv')['Listening_Time_minutes'].values
x2 = pd.read_csv('submission1.csv')['Listening_Time_minutes'].values
x3 = pd.read_csv("submission3.csv")['Listening_Time_minutes'].values
x4 = pd.read_csv("stackingzzz.csv")['Listening_Time_minutes'].values
x_Listening_Time_minutes = x1 * 0.25 + x2 * 0.25 + x3 * 0.25 + x4 * 0.25

sub4 = pd.read_csv("sample_submission.csv")
    
sub4.Listening_Time_minutes = x_Listening_Time_minutes

sub4.to_csv('ultra.csv', index=False)
sub4