Resampling for class imbalance

Load the dataset and create a stratified train/test split

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Location of the wdbc.data file in the UCI machine-learning repository.
data_url = ('https://archive.ics.uci.edu/ml/'
            'machine-learning-databases'
            '/breast-cancer-wisconsin/wdbc.data')

# The file is read without a header row, so columns are addressed by
# their integer position.
df = pd.read_csv(data_url, header=None)

# Column 1 holds the class label; columns 2 onward are used as features.
X = df.loc[:, 2:].values

# Encode the raw labels as integer class ids.
le = LabelEncoder()
y = le.fit_transform(df.loc[:, 1].values)

# Hold out 20% of the rows, stratified on y, with a fixed seed so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=1
)

# Report how many examples each class has in the full dataset.
print('Prior to imbalance')
for label in (0, 1):
    print(f'Class {label}: {X[y == label].shape[0]}')
Prior to imbalance
Class 0: 357
Class 1: 212

Create an imbalanced dataset

# Build an artificially imbalanced dataset: keep every class 0 example
# but only the first 40 class 1 examples.
is_class0 = y == 0
is_class1 = y == 1
n_class1_kept = 40

X_imb = np.concatenate((X[is_class0], X[is_class1][:n_class1_kept]), axis=0)
y_imb = np.concatenate((y[is_class0], y[is_class1][:n_class1_kept]), axis=0)

# Report the class counts of the imbalanced dataset.
print('After creating imbalance')
for label in (0, 1):
    print(f'Class {label}: {X_imb[y_imb == label].shape[0]}')
After creating imbalance
Class 0: 357
Class 1: 40

Upsample the minority class

from sklearn.utils import resample

# Select the class 1 rows, and use the class 0 count as the target size
# so both classes end up with the same number of examples.
class1_mask = y_imb == 1
n_class0 = X_imb[y_imb == 0].shape[0]

print('Number of class 1 examples before:', X_imb[class1_mask].shape[0])

# Draw class 1 rows with replacement until there are n_class0 of them;
# the fixed random_state makes the draw reproducible.
X_upsampled, y_upsampled = resample(
    X_imb[class1_mask],
    y_imb[class1_mask],
    replace=True,
    n_samples=n_class0,
    random_state=123,
)

print('Number of class 1 examples after:', X_upsampled.shape[0])
Number of class 1 examples before: 40
Number of class 1 examples after: 357