Free entry¶

Kaggle dataset: [link]

By Robin R.P.M. Kras

⭐ 1. Introduction & Overview¶

Credit card companies need to recognize fraudulent transactions so that customers are not charged for items they did not purchase. This notebook tackles that problem on the Kaggle credit card fraud dataset, in which features V1-V28 are principal components from a PCA transformation and only Time and Amount remain in their original units.

🔹 2. Import Libraries & Set Up¶

In [26]:
# General
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# Machine Learning
import xgboost as xgb

from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, mean_absolute_error, mean_squared_error, r2_score, root_mean_squared_error, roc_auc_score
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

from imblearn.over_sampling import SMOTE

# Feature Importance & Explainability
import shap

# Settings
import warnings
warnings.filterwarnings("ignore")

# Set random seed for reproducibility
SEED = 42
np.random.seed(SEED)

print("Libraries loaded. Ready to go!")
Libraries loaded. Ready to go!

🔹 3. Load & Explore Data¶

In [27]:
data = pd.read_csv('creditcard.csv')
In [28]:
data.head()
Out[28]:
Time V1 V2 V3 V4 V5 V6 V7 V8 V9 ... V21 V22 V23 V24 V25 V26 V27 V28 Amount Class
0 0.0 -1.359807 -0.072781 2.536347 1.378155 -0.338321 0.462388 0.239599 0.098698 0.363787 ... -0.018307 0.277838 -0.110474 0.066928 0.128539 -0.189115 0.133558 -0.021053 149.62 0
1 0.0 1.191857 0.266151 0.166480 0.448154 0.060018 -0.082361 -0.078803 0.085102 -0.255425 ... -0.225775 -0.638672 0.101288 -0.339846 0.167170 0.125895 -0.008983 0.014724 2.69 0
2 1.0 -1.358354 -1.340163 1.773209 0.379780 -0.503198 1.800499 0.791461 0.247676 -1.514654 ... 0.247998 0.771679 0.909412 -0.689281 -0.327642 -0.139097 -0.055353 -0.059752 378.66 0
3 1.0 -0.966272 -0.185226 1.792993 -0.863291 -0.010309 1.247203 0.237609 0.377436 -1.387024 ... -0.108300 0.005274 -0.190321 -1.175575 0.647376 -0.221929 0.062723 0.061458 123.50 0
4 2.0 -1.158233 0.877737 1.548718 0.403034 -0.407193 0.095921 0.592941 -0.270533 0.817739 ... -0.009431 0.798278 -0.137458 0.141267 -0.206010 0.502292 0.219422 0.215153 69.99 0

5 rows × 31 columns

In [29]:
data.describe()
Out[29]:
Time V1 V2 V3 V4 V5 V6 V7 V8 V9 ... V21 V22 V23 V24 V25 V26 V27 V28 Amount Class
count 284807.000000 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 ... 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 284807.000000 284807.000000
mean 94813.859575 1.168375e-15 3.416908e-16 -1.379537e-15 2.074095e-15 9.604066e-16 1.487313e-15 -5.556467e-16 1.213481e-16 -2.406331e-15 ... 1.654067e-16 -3.568593e-16 2.578648e-16 4.473266e-15 5.340915e-16 1.683437e-15 -3.660091e-16 -1.227390e-16 88.349619 0.001727
std 47488.145955 1.958696e+00 1.651309e+00 1.516255e+00 1.415869e+00 1.380247e+00 1.332271e+00 1.237094e+00 1.194353e+00 1.098632e+00 ... 7.345240e-01 7.257016e-01 6.244603e-01 6.056471e-01 5.212781e-01 4.822270e-01 4.036325e-01 3.300833e-01 250.120109 0.041527
min 0.000000 -5.640751e+01 -7.271573e+01 -4.832559e+01 -5.683171e+00 -1.137433e+02 -2.616051e+01 -4.355724e+01 -7.321672e+01 -1.343407e+01 ... -3.483038e+01 -1.093314e+01 -4.480774e+01 -2.836627e+00 -1.029540e+01 -2.604551e+00 -2.256568e+01 -1.543008e+01 0.000000 0.000000
25% 54201.500000 -9.203734e-01 -5.985499e-01 -8.903648e-01 -8.486401e-01 -6.915971e-01 -7.682956e-01 -5.540759e-01 -2.086297e-01 -6.430976e-01 ... -2.283949e-01 -5.423504e-01 -1.618463e-01 -3.545861e-01 -3.171451e-01 -3.269839e-01 -7.083953e-02 -5.295979e-02 5.600000 0.000000
50% 84692.000000 1.810880e-02 6.548556e-02 1.798463e-01 -1.984653e-02 -5.433583e-02 -2.741871e-01 4.010308e-02 2.235804e-02 -5.142873e-02 ... -2.945017e-02 6.781943e-03 -1.119293e-02 4.097606e-02 1.659350e-02 -5.213911e-02 1.342146e-03 1.124383e-02 22.000000 0.000000
75% 139320.500000 1.315642e+00 8.037239e-01 1.027196e+00 7.433413e-01 6.119264e-01 3.985649e-01 5.704361e-01 3.273459e-01 5.971390e-01 ... 1.863772e-01 5.285536e-01 1.476421e-01 4.395266e-01 3.507156e-01 2.409522e-01 9.104512e-02 7.827995e-02 77.165000 0.000000
max 172792.000000 2.454930e+00 2.205773e+01 9.382558e+00 1.687534e+01 3.480167e+01 7.330163e+01 1.205895e+02 2.000721e+01 1.559499e+01 ... 2.720284e+01 1.050309e+01 2.252841e+01 4.584549e+00 7.519589e+00 3.517346e+00 3.161220e+01 3.384781e+01 25691.160000 1.000000

8 rows × 31 columns

In [30]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     284807 non-null  float64
 22  V22     284807 non-null  float64
 23  V23     284807 non-null  float64
 24  V24     284807 non-null  float64
 25  V25     284807 non-null  float64
 26  V26     284807 non-null  float64
 27  V27     284807 non-null  float64
 28  V28     284807 non-null  float64
 29  Amount  284807 non-null  float64
 30  Class   284807 non-null  int64  
dtypes: float64(30), int64(1)
memory usage: 67.4 MB

🔹 4. Data Visualization & EDA¶

In [31]:
numeric_cols = data.select_dtypes(include=["float64", "int64"]).columns

cols_per_row = 3
num_plots = len(numeric_cols)
rows = (num_plots // cols_per_row) + (num_plots % cols_per_row > 0)

fig, axes = plt.subplots(rows, cols_per_row, figsize=(15, 5 * rows))
axes = axes.flatten()

for idx, col in enumerate(numeric_cols):
    sns.histplot(data[col], bins=50, kde=True, ax=axes[idx])
    axes[idx].set_title(f"Distribution of {col}")

for i in range(idx + 1, len(axes)):
    fig.delaxes(axes[i])

plt.tight_layout()
plt.show()
[Figure: grid of histograms with KDE overlays, one per numeric feature]
In [32]:
plt.figure(figsize=(6, 4))
sns.countplot(data=data, x='Class')
plt.title('Fraud vs Non-Fraud Transactions')
plt.xlabel('Class (0: Non-Fraud, 1: Fraud)')
plt.ylabel('Count')
plt.show()
[Figure: count plot of fraud vs non-fraud transactions]
In [33]:
plt.figure(figsize=(8, 6))
sns.boxplot(data=data, x='Class', y='Amount')
plt.title('Transaction Amount, fraud vs non-fraud')
plt.xlabel('Class')
plt.ylabel('Transaction Amount')
plt.show()
[Figure: box plot of transaction amount by class]
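The amounts are heavily right-skewed (the median transaction is 22.00 while the maximum is 25,691.16), which compresses most of the box plot against the axis. A log-scaled variant of the same plot can be easier to read; symlog is used in this sketch because zero-amount transactions exist and a plain log scale would drop them:

plt.figure(figsize=(8, 6))
sns.boxplot(data=data, x='Class', y='Amount')
plt.yscale('symlog')  # symlog tolerates the Amount == 0 rows
plt.title('Transaction Amount, fraud vs non-fraud (log scale)')
plt.xlabel('Class')
plt.ylabel('Transaction Amount')
plt.show()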

🔹 5. Feature Engineering¶

In [34]:
print(data.isnull().sum())
Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64
In [35]:
print(data.Class.value_counts())
Class
0    284315
1       492
Name: count, dtype: int64
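Only 492 of 284,807 transactions (roughly 0.17%) are fraudulent, so plain accuracy is a near-useless metric on this data. The imbalance can be quantified directly:

fraud_rate = data['Class'].mean()  # Class is 0/1, so the mean is the fraud fraction
print(f"Fraud rate: {fraud_rate:.4%}")                              # ~0.1727%
print(f"Imbalance ratio: {(1 - fraud_rate) / fraud_rate:.0f} : 1")  # ~578 : 1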
In [37]:
X = data.drop(columns=['Class'])
y = data['Class']

# Stratified split keeps the ~0.17% fraud rate identical in train and validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=SEED, stratify=y)

# Oversample the minority class on the training split only
smote = SMOTE(sampling_strategy='auto', random_state=SEED)
X_res, y_res = smote.fit_resample(X_train, y_train)

k = 5  # common choice is 5 or 10 folds
kf = KFold(n_splits=k, shuffle=True, random_state=SEED)

model = xgb.XGBClassifier()

# Perform cross-validation on the resampled training data
cv_scores = cross_val_score(model, X_res, y_res, cv=kf, scoring='accuracy')

# Print results
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean CV score: {np.mean(cv_scores):.4f}")
print(f"Standard deviation: {np.std(cv_scores):.4f}")

model.fit(X_res, y_res)

# Evaluate on the untouched validation split
predictions = model.predict(X_val)

val_accuracy = accuracy_score(y_val, predictions)
val_precision = precision_score(y_val, predictions)
val_recall = recall_score(y_val, predictions)
val_f1 = f1_score(y_val, predictions)
val_probs = model.predict_proba(X_val)[:, 1]
val_auc = roc_auc_score(y_val, val_probs)

print("accuracy: ", val_accuracy)
print("precision: ", val_precision)
print("recall: ", val_recall)
print("f1: ", val_f1)
print("auc: ", val_auc)
Cross-validation scores: [0.99989009 0.99982414 0.99989009 0.99982414 0.99990108]
Mean CV score: 0.9999
Standard deviation: 0.0000
accuracy:  0.9993504441557529
precision:  0.7904761904761904
recall:  0.8469387755102041
f1:  0.8177339901477833
auc:  0.9831145633548861
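One caveat on the cross-validation above: SMOTE was applied before the folds were drawn, so synthetic points interpolated from a given minority sample can land in both the training and scoring folds of a split. That leakage, plus scoring a balanced dataset with accuracy, explains the near-perfect CV numbers; the held-out validation metrics are the more trustworthy ones. A leakage-free sketch, assuming imblearn's pipeline so that SMOTE is re-fit on each fold's training portion only:

from imblearn.pipeline import Pipeline as ImbPipeline

# SMOTE is applied inside each fold, never to the fold used for scoring
pipeline = ImbPipeline([
    ('smote', SMOTE(random_state=SEED)),
    ('clf', xgb.XGBClassifier()),
])

# Score with ROC AUC, since the scoring folds keep the original imbalance
cv_auc = cross_val_score(pipeline, X_train, y_train, cv=kf, scoring='roc_auc')
print(f"Leakage-free CV AUC: {cv_auc.mean():.4f} +/- {cv_auc.std():.4f}")

An alternative that avoids resampling altogether is XGBoost's scale_pos_weight parameter, which up-weights the positive class in the loss instead of generating synthetic rows.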