More than 1 year has passed since last update.

不均衡二値分類でundersampling + baggingは最適戦略か?

機械学習における不均衡二値分類において、undersampling + baggingがよい戦略だというのは様々なところで言われています。



  • Windows 10 Pro
  • Python 3.6.13
  • Jupyter Lab 6.4.3


  • LightGBM
  • CV
  • しきい値: 0.5

dataset 1


import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb

# Load the credit-card dataset and normalise the column names to lower case.
df = pd.read_csv('./input/creditcard.csv').rename(columns=str.lower)

# Class distribution noted by the author:
# 0    284315
# 1       492

# Target vector first, then the feature matrix (every column except time/class).
y = df['class'].values
X = df.drop(columns=['time', 'class']).values


# stratified kfold split
kf = StratifiedKFold(n_splits=5, shuffle=True)
# out-of-fold hard predictions (0/1), filled in one fold at a time
oof = np.zeros(len(y))

# cv iterate through splits
for train_index, eval_index in kf.split(X, y):
    X_train, X_eval = X[train_index], X[eval_index]
    y_train, y_eval = y[train_index], y[eval_index]
    # prepare datasets
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_eval, y_eval, reference=lgb_train)

    # LightGBM hyperparameters
    lgbm_params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'verbose': 0,
    }

    model = lgb.train(
        lgbm_params,
        lgb_train,
        # validation data for the model
        valid_sets=[lgb_eval],
        # train up to 10000 rounds
        num_boost_round=10000,
        # if the score doesn't improve for 10 rounds, stop training
        callbacks=[lgb.early_stopping(stopping_rounds=10)],
    )

    # predict holdout with the trained model
    y_pred_proba = model.predict(X_eval, num_iteration=model.best_iteration)
    # threshold the probabilities at 0.5 to get hard labels
    oof[eval_index] = (y_pred_proba > 0.5).astype(int)

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
# Header row first, then one value per metric in the same order.
print('accuracy_score, precision_score, recall_score, f1_score, roc_auc_score')
score_funcs = [accuracy_score, precision_score, recall_score, f1_score, roc_auc_score]
# Every metric is evaluated on the thresholded 0/1 out-of-fold labels in `oof`.
scores = list(map(lambda score_func: round(score_func(y, oof), 3), score_funcs))
print(', '.join(str(score) for score in scores))

accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
0.999, 0.679, 0.835, 0.749, 0.917



from sklearn.utils.class_weight import compute_sample_weight
# Excerpt of the per-fold change for the "set weight" experiment: per-sample
# weights are computed with class_weight='balanced' from the training labels
# and passed to the LightGBM training Dataset.
train_weight = compute_sample_weight(class_weight='balanced', y=y_train).astype('float32')
lgb_train = lgb.Dataset(X_train, y_train, weight=train_weight)


from sklearn.utils.class_weight import compute_sample_weight

# stratified kfold split
kf = StratifiedKFold(n_splits=5, shuffle=True)
# out-of-fold hard predictions (0/1), filled in one fold at a time
oof = np.zeros(len(y))

# cv iterate through splits
for train_index, eval_index in kf.split(X, y):
    X_train, X_eval = X[train_index], X[eval_index]
    y_train, y_eval = y[train_index], y[eval_index]

    # prepare datasets: only the training set carries the balanced sample weights
    train_weight = compute_sample_weight(class_weight='balanced', y=y_train).astype('float32')
    lgb_train = lgb.Dataset(X_train, y_train, weight=train_weight)
    lgb_eval = lgb.Dataset(X_eval, y_eval, reference=lgb_train)

    # LightGBM hyperparameters
    lgbm_params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'verbose': 0,
    }

    model = lgb.train(
        lgbm_params,
        lgb_train,
        # validation data for the model
        valid_sets=[lgb_eval],
        # train up to 10000 rounds
        num_boost_round=10000,
        # if the score doesn't improve for 10 rounds, stop training
        callbacks=[lgb.early_stopping(stopping_rounds=10)],
    )

    # predict holdout with the trained model
    y_pred_proba = model.predict(X_eval, num_iteration=model.best_iteration)
    # threshold the probabilities at 0.5 to get hard labels
    oof[eval_index] = (y_pred_proba > 0.5).astype(int)

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
# Header row first, then one value per metric in the same order.
print('accuracy_score, precision_score, recall_score, f1_score, roc_auc_score')
score_funcs = [accuracy_score, precision_score, recall_score, f1_score, roc_auc_score]
# Every metric is evaluated on the thresholded 0/1 out-of-fold labels in `oof`.
scores = list(map(lambda score_func: round(score_func(y, oof), 3), score_funcs))
print(', '.join(str(score) for score in scores))

accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
1.0, 0.881, 0.825, 0.852, 0.913



from imblearn.under_sampling import RandomUnderSampler
# Excerpt of the per-fold change for the "undersampling" experiment: the
# training fold is resampled with RandomUnderSampler (fixed seed, sampling
# with replacement) before the LightGBM Dataset is built.
rus = RandomUnderSampler(random_state=0, replacement=True)
X_train, y_train = rus.fit_resample(X_train, y_train)


from imblearn.under_sampling import RandomUnderSampler

# stratified kfold split
kf = StratifiedKFold(n_splits=5, shuffle=True)
# out-of-fold hard predictions (0/1), filled in one fold at a time
oof = np.zeros(len(y))

# cv iterate through splits
for train_index, eval_index in kf.split(X, y):
    X_train, X_eval = X[train_index], X[eval_index]
    y_train, y_eval = y[train_index], y[eval_index]

    # prepare datasets: only the training fold is resampled, the holdout is untouched
    rus = RandomUnderSampler(random_state=0, replacement=True)
    X_train, y_train = rus.fit_resample(X_train, y_train)
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_eval, y_eval, reference=lgb_train)

    # LightGBM hyperparameters
    lgbm_params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'verbose': 0,
    }

    model = lgb.train(
        lgbm_params,
        lgb_train,
        # validation data for the model
        valid_sets=[lgb_eval],
        # train up to 10000 rounds
        num_boost_round=10000,
        # if the score doesn't improve for 10 rounds, stop training
        callbacks=[lgb.early_stopping(stopping_rounds=10)],
    )

    # predict holdout with the trained model
    y_pred_proba = model.predict(X_eval, num_iteration=model.best_iteration)
    # threshold the probabilities at 0.5 to get hard labels
    oof[eval_index] = (y_pred_proba > 0.5).astype(int)

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
# Header row first, then one value per metric in the same order.
print('accuracy_score, precision_score, recall_score, f1_score, roc_auc_score')
score_funcs = [accuracy_score, precision_score, recall_score, f1_score, roc_auc_score]
# Every metric is evaluated on the thresholded 0/1 out-of-fold labels in `oof`.
scores = list(map(lambda score_func: round(score_func(y, oof), 3), score_funcs))
print(', '.join(str(score) for score in scores))

accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
0.97, 0.05, 0.913, 0.094, 0.941



from imblearn.over_sampling import RandomOverSampler
# Excerpt of the per-fold change for the "oversampling" experiment: the
# training fold is resampled with RandomOverSampler (fixed seed) before the
# LightGBM Dataset is built.
ros = RandomOverSampler(random_state=0)
X_train, y_train = ros.fit_resample(X_train, y_train)


from imblearn.over_sampling import RandomOverSampler

# stratified kfold split
kf = StratifiedKFold(n_splits=5, shuffle=True)
# out-of-fold hard predictions (0/1), filled in one fold at a time
oof = np.zeros(len(y))

# cv iterate through splits
for train_index, eval_index in kf.split(X, y):
    X_train, X_eval = X[train_index], X[eval_index]
    y_train, y_eval = y[train_index], y[eval_index]

    # prepare datasets: only the training fold is resampled, the holdout is untouched
    ros = RandomOverSampler(random_state=0)
    X_train, y_train = ros.fit_resample(X_train, y_train)
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_eval, y_eval, reference=lgb_train)

    # LightGBM hyperparameters
    lgbm_params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'verbose': 0,
    }

    model = lgb.train(
        lgbm_params,
        lgb_train,
        # validation data for the model
        valid_sets=[lgb_eval],
        # train up to 10000 rounds
        num_boost_round=10000,
        # if the score doesn't improve for 10 rounds, stop training
        callbacks=[lgb.early_stopping(stopping_rounds=10)],
    )

    # predict holdout with the trained model
    y_pred_proba = model.predict(X_eval, num_iteration=model.best_iteration)
    # threshold the probabilities at 0.5 to get hard labels
    oof[eval_index] = (y_pred_proba > 0.5).astype(int)

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
# Header row first, then one value per metric in the same order.
print('accuracy_score, precision_score, recall_score, f1_score, roc_auc_score')
score_funcs = [accuracy_score, precision_score, recall_score, f1_score, roc_auc_score]
# Every metric is evaluated on the thresholded 0/1 out-of-fold labels in `oof`.
scores = list(map(lambda score_func: round(score_func(y, oof), 3), score_funcs))
print(', '.join(str(score) for score in scores))

accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
0.999, 0.879, 0.823, 0.85, 0.911



from imblearn.over_sampling import SMOTE 
# Excerpt of the per-fold change for the "SMOTE" experiment: the training
# fold is resampled with SMOTE (fixed seed) before the LightGBM Dataset is built.
sm = SMOTE(random_state=0)
X_train, y_train = sm.fit_resample(X_train, y_train)


from imblearn.over_sampling import SMOTE 

# stratified kfold split
kf = StratifiedKFold(n_splits=5, shuffle=True)
# out-of-fold hard predictions (0/1), filled in one fold at a time
oof = np.zeros(len(y))

# cv iterate through splits
for train_index, eval_index in kf.split(X, y):
    X_train, X_eval = X[train_index], X[eval_index]
    y_train, y_eval = y[train_index], y[eval_index]

    # prepare datasets: only the training fold is resampled, the holdout is untouched
    sm = SMOTE(random_state=0)
    X_train, y_train = sm.fit_resample(X_train, y_train)
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_eval, y_eval, reference=lgb_train)

    # LightGBM hyperparameters
    lgbm_params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'verbose': 0,
    }

    model = lgb.train(
        lgbm_params,
        lgb_train,
        # validation data for the model
        valid_sets=[lgb_eval],
        # train up to 10000 rounds
        num_boost_round=10000,
        # if the score doesn't improve for 10 rounds, stop training
        callbacks=[lgb.early_stopping(stopping_rounds=10)],
    )

    # predict holdout with the trained model
    y_pred_proba = model.predict(X_eval, num_iteration=model.best_iteration)
    # threshold the probabilities at 0.5 to get hard labels
    oof[eval_index] = (y_pred_proba > 0.5).astype(int)

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
# Header row first, then one value per metric in the same order.
print('accuracy_score, precision_score, recall_score, f1_score, roc_auc_score')
score_funcs = [accuracy_score, precision_score, recall_score, f1_score, roc_auc_score]
# Every metric is evaluated on the thresholded 0/1 out-of-fold labels in `oof`.
scores = list(map(lambda score_func: round(score_func(y, oof), 3), score_funcs))
print(', '.join(str(score) for score in scores))

accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
0.999, 0.732, 0.854, 0.788, 0.927



from imblearn.over_sampling import ADASYN 
# Excerpt of the per-fold change for the "ADASYN" experiment: the training
# fold is resampled with ADASYN (fixed seed) before the LightGBM Dataset is built.
ad = ADASYN(random_state=0)
X_train, y_train = ad.fit_resample(X_train, y_train)


from imblearn.over_sampling import ADASYN 

# stratified kfold split
kf = StratifiedKFold(n_splits=5, shuffle=True)
# out-of-fold hard predictions (0/1), filled in one fold at a time
oof = np.zeros(len(y))

# cv iterate through splits
for train_index, eval_index in kf.split(X, y):
    X_train, X_eval = X[train_index], X[eval_index]
    y_train, y_eval = y[train_index], y[eval_index]

    # prepare datasets: only the training fold is resampled, the holdout is untouched
    ad = ADASYN(random_state=0)
    X_train, y_train = ad.fit_resample(X_train, y_train)
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_eval, y_eval, reference=lgb_train)

    # LightGBM hyperparameters
    lgbm_params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'verbose': 0,
    }

    model = lgb.train(
        lgbm_params,
        lgb_train,
        # validation data for the model
        valid_sets=[lgb_eval],
        # train up to 10000 rounds
        num_boost_round=10000,
        # if the score doesn't improve for 10 rounds, stop training
        callbacks=[lgb.early_stopping(stopping_rounds=10)],
    )

    # predict holdout with the trained model
    y_pred_proba = model.predict(X_eval, num_iteration=model.best_iteration)
    # threshold the probabilities at 0.5 to get hard labels
    oof[eval_index] = (y_pred_proba > 0.5).astype(int)

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
# Header row first, then one value per metric in the same order.
print('accuracy_score, precision_score, recall_score, f1_score, roc_auc_score')
score_funcs = [accuracy_score, precision_score, recall_score, f1_score, roc_auc_score]
# Every metric is evaluated on the thresholded 0/1 out-of-fold labels in `oof`.
scores = list(map(lambda score_func: round(score_func(y, oof), 3), score_funcs))
print(', '.join(str(score) for score in scores))

accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
0.999, 0.73, 0.813, 0.769, 0.906

undersampling + bagging

いよいよ本題です。baggingで抽出したサンプルをundersamplingしてモデルを構築し、テストセットで予測し確率を出します。それを全てのbaggingの抜き取りに対してsoft votingし最終的な確率を出します。このプロセスをcvの各iterationごとに行います。

from imblearn.under_sampling import RandomUnderSampler

def undersample_bagg(X_train, X_eval, y_train, y_eval):
    """Train one LightGBM model on a random undersample and score the holdout.

    Each call draws a fresh undersample (no fixed seed), so repeated calls
    yield different models whose outputs can be averaged by the caller.

    Args:
        X_train: Training-fold features.
        X_eval: Holdout-fold features to predict.
        y_train: Training-fold labels.
        y_eval: Holdout-fold labels; used only as the early-stopping
            validation target.

    Returns:
        The output of ``model.predict`` for ``X_eval`` at the best iteration.
    """
    # prepare datasets: only the training fold is resampled
    rus = RandomUnderSampler(replacement=True)
    X_train, y_train = rus.fit_resample(X_train, y_train)
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_eval, y_eval, reference=lgb_train)

    # LightGBM hyperparameters
    lgbm_params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'verbose': 0,
    }

    model = lgb.train(
        lgbm_params,
        lgb_train,
        # validation data for the model
        valid_sets=[lgb_eval],
        # train up to 10000 rounds
        num_boost_round=10000,
        # if the score doesn't improve for 10 rounds, stop training
        callbacks=[lgb.early_stopping(stopping_rounds=10)],
    )

    # predict holdout with the trained model
    y_pred_proba = model.predict(X_eval, num_iteration=model.best_iteration)

    return y_pred_proba

# number of undersampled models averaged per fold
n_bagging = 10

# out-of-fold hard predictions (0/1) and the shuffled stratified 5-fold splitter
oof = np.zeros(len(y))
kf = StratifiedKFold(n_splits=5, shuffle=True)

for train_index, eval_index in kf.split(X, y):
    X_train, y_train = X[train_index], y[train_index]
    X_eval, y_eval = X[eval_index], y[eval_index]

    # one holdout prediction per independently undersampled model
    preds = []
    for _ in range(n_bagging):
        preds.append(undersample_bagg(X_train, X_eval, y_train, y_eval))

    # soft voting: arithmetic mean of the bagged predictions
    y_pred_proba = sum(preds) / n_bagging

    # threshold at 0.5 to get hard labels
    oof[eval_index] = (y_pred_proba > 0.5).astype(int)

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
# Header row first, then one value per metric in the same order.
print('accuracy_score, precision_score, recall_score, f1_score, roc_auc_score')
score_funcs = [accuracy_score, precision_score, recall_score, f1_score, roc_auc_score]
# Every metric is evaluated on the thresholded 0/1 out-of-fold labels in `oof`.
scores = list(map(lambda score_func: round(score_func(y, oof), 3), score_funcs))
print(', '.join(str(score) for score in scores))

accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
0.974, 0.058, 0.911, 0.109, 0.942

undersampling + bagging + calibration


この不均衡データのダウンサンプリングによって、サンプル選択バイアスが生じることが Calibrating Probability with Undersampling for Unbalanced Classification という論文で説明されています。
予測確率が重要な場合 *1 は特に、このバイアスの影響を除去しなければなりません。


p = \frac{p_s}{p_s + \frac{(1 - p_s)}{\beta}}


from imblearn.under_sampling import RandomUnderSampler

def calibrate(prob, beta):
    """Map a probability from an undersampled model back to the original scale.

    Computes ``prob / (prob + (1 - prob) / beta)``.

    Args:
        prob: Predicted probability (scalar or NumPy array) from the model
            trained on undersampled data.
        beta: Undersampling rate used as the correction factor.

    Returns:
        The calibrated probability, same shape as ``prob``.
    """
    negative_term = (1 - prob) / beta
    return prob / (prob + negative_term)

def undersample_bagg(X_train, X_eval, y_train, y_eval):
    """Train on a random undersample and return calibrated holdout predictions.

    Each call draws a fresh undersample (no fixed seed), so repeated calls
    yield different models whose outputs can be averaged by the caller.

    Args:
        X_train: Training-fold features.
        X_eval: Holdout-fold features to predict.
        y_train: Training-fold labels (1 = minority class, 0 = majority class).
        y_eval: Holdout-fold labels; used only as the early-stopping
            validation target.

    Returns:
        The output of ``model.predict`` for ``X_eval`` after ``calibrate`` has
        been applied with the fold's undersampling rate.
    """
    # undersampling rate (beta): positives / negatives of the training fold,
    # measured BEFORE resampling. Dividing by the negatives of the whole
    # dataset would also count the holdout fold and make beta too small.
    us_rate = np.sum(y_train == 1) / np.sum(y_train == 0)

    # prepare datasets: only the training fold is resampled
    rus = RandomUnderSampler(replacement=True)
    X_train, y_train = rus.fit_resample(X_train, y_train)
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_eval, y_eval, reference=lgb_train)

    # LightGBM hyperparameters
    lgbm_params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'verbose': 0,
    }

    model = lgb.train(
        lgbm_params,
        lgb_train,
        # validation data for the model
        valid_sets=[lgb_eval],
        # train up to 10000 rounds
        num_boost_round=10000,
        # if the score doesn't improve for 10 rounds, stop training
        callbacks=[lgb.early_stopping(stopping_rounds=10)],
    )

    # predict holdout with the trained model
    y_pred_proba = model.predict(X_eval, num_iteration=model.best_iteration)
    # calibrate probability
    y_pred_proba = calibrate(y_pred_proba, us_rate)

    return y_pred_proba

# number of undersampled models averaged per fold
n_bagging = 10

# out-of-fold hard predictions (0/1) and the shuffled stratified 5-fold splitter
oof = np.zeros(len(y))
kf = StratifiedKFold(n_splits=5, shuffle=True)

for train_index, eval_index in kf.split(X, y):
    X_train, y_train = X[train_index], y[train_index]
    X_eval, y_eval = X[eval_index], y[eval_index]

    # one holdout prediction per independently undersampled model
    preds = []
    for _ in range(n_bagging):
        preds.append(undersample_bagg(X_train, X_eval, y_train, y_eval))

    # soft voting: arithmetic mean of the bagged predictions
    y_pred_proba = sum(preds) / n_bagging

    # threshold at 0.5 to get hard labels
    oof[eval_index] = (y_pred_proba > 0.5).astype(int)

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
# Header row first, then one value per metric in the same order.
print('accuracy_score, precision_score, recall_score, f1_score, roc_auc_score')
score_funcs = [accuracy_score, precision_score, recall_score, f1_score, roc_auc_score]
# Every metric is evaluated on the thresholded 0/1 out-of-fold labels in `oof`.
scores = list(map(lambda score_func: round(score_func(y, oof), 3), score_funcs))
print(', '.join(str(score) for score in scores))

accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
0.999, 0.852, 0.772, 0.81, 0.886

undersampling + calibration


from imblearn.under_sampling import RandomUnderSampler

def calibrate(prob, beta):
    """Map a probability from an undersampled model back to the original scale.

    Computes ``prob / (prob + (1 - prob) / beta)``.

    Args:
        prob: Predicted probability (scalar or NumPy array) from the model
            trained on undersampled data.
        beta: Undersampling rate used as the correction factor.

    Returns:
        The calibrated probability, same shape as ``prob``.
    """
    negative_term = (1 - prob) / beta
    return prob / (prob + negative_term)

# stratified kfold split (shuffled, like every other experiment in this comparison)
kf = StratifiedKFold(n_splits=5, shuffle=True)
# out-of-fold hard predictions (0/1), filled in one fold at a time
oof = np.zeros(len(y))

# cv iterate through splits
for train_index, eval_index in kf.split(X, y):
    X_train, X_eval = X[train_index], X[eval_index]
    y_train, y_eval = y[train_index], y[eval_index]

    # undersampling rate (beta): positives / negatives of the training fold,
    # measured BEFORE resampling. Dividing by the negatives of the whole
    # dataset would also count the holdout fold and make beta too small.
    us_rate = np.sum(y_train == 1) / np.sum(y_train == 0)

    # prepare datasets: only the training fold is resampled
    rus = RandomUnderSampler(replacement=True)
    X_train, y_train = rus.fit_resample(X_train, y_train)
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_eval, y_eval, reference=lgb_train)

    # LightGBM hyperparameters
    lgbm_params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'verbose': 0,
    }

    model = lgb.train(
        lgbm_params,
        lgb_train,
        # validation data for the model
        valid_sets=[lgb_eval],
        # train up to 10000 rounds
        num_boost_round=10000,
        # if the score doesn't improve for 10 rounds, stop training
        callbacks=[lgb.early_stopping(stopping_rounds=10)],
    )

    # predict holdout with the trained model
    y_pred_proba = model.predict(X_eval, num_iteration=model.best_iteration)
    # calibrate probability
    y_pred_proba = calibrate(y_pred_proba, us_rate)

    # threshold the calibrated probabilities at 0.5 to get hard labels
    oof[eval_index] = (y_pred_proba > 0.5).astype(int)

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
# Header row first, then one value per metric in the same order.
print('accuracy_score, precision_score, recall_score, f1_score, roc_auc_score')
score_funcs = [accuracy_score, precision_score, recall_score, f1_score, roc_auc_score]
# Every metric is evaluated on the thresholded 0/1 out-of-fold labels in `oof`.
scores = list(map(lambda score_func: round(score_func(y, oof), 3), score_funcs))
print(', '.join(str(score) for score in scores))

accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
0.999, 0.776, 0.695, 0.733, 0.847



condition accuracy_score precision_score recall_score f1_score roc_auc_score
no treatment 0.999 0.716 0.823 0.766 0.911
set weight 0.999 0.86 0.825 0.842 0.912
undersampling 0.968 0.048 0.925 0.092 0.947
oversampling 0.999 0.836 0.821 0.829 0.91
SMOTE 0.999 0.632 0.85 0.725 0.924
ADASYN 0.999 0.679 0.827 0.746 0.913
undersampling + bagging 0.973 0.056 0.909 0.105 0.941
undersampling + bagging + calib 0.999 0.846 0.77 0.806 0.885
undersampling + calib 0.999 0.817 0.642 0.719 0.821


  • accuracyは全て>0.97と非常に高い(不均衡データなので当たり前)
  • SMOTE, ADASYNはoversamplingに比べてrecallは最大0.03程度上回るが、precisionは0.8台から0.6台へと大きく下がる
  • undersamplingおよびundersampling + baggingは、recallは>0.9と非常に高いがprecisionが0.05程度と大幅に低くなる
  • だが、両者は確率の補正を加えるとprecisionは0.8台、recallが0.6~0.8となる。しかしこれはweightのオプションを付けた状態に比べてprecision, recallともに劣る。



dataset 2


import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb

# Load the cancer dataset, lower-case the headers, turn the '?' placeholders
# into NaN and coerce every column to a numeric dtype.
df = (
    pd.read_csv('./input/cancer.csv')
    .rename(columns=str.lower)
    .replace('?', np.nan)
    .apply(pd.to_numeric)
)

# 0    803
# 1     55

# impute nan
# Fill each column's missing values with (column minimum - 1), i.e. a value
# below every observed one (min - 1 for a GBDT model). A single fillna keyed
# by column avoids chained assignment (df[col][mask] = ...), which raises
# SettingWithCopyWarning and does not modify df under pandas copy-on-write.
df = df.fillna(df.min(skipna=True) - 1)




condition accuracy_score precision_score recall_score f1_score roc_auc_score
no treatment 0.955 0.648 0.636 0.642 0.806
set weight 0.962 0.653 0.855 0.74 0.912
undersampling 0.955 0.6 0.873 0.711 0.916
oversampling 0.956 0.623 0.782 0.694 0.875
SMOTE 0.959 0.661 0.745 0.701 0.86
ADASYN 0.957 0.636 0.764 0.694 0.867
undersampling + bagging 0.962 0.649 0.873 0.744 0.92
undersampling + bagging + calib 0.95 0.688 0.4 0.506 0.694
undersampling + calib 0.952 0.706 0.436 0.539 0.712


  • データセットによって、各不均衡データ向け処理の結果が変わることがある
  • accuracyは全て>0.95と非常に高い(不均衡データなので当たり前)
  • SMOTE, ADASYNはoversamplingに比べてrecallは最大0.04程度下回るが、precisionは0.04程度上がる。
  • undersamplingおよびundersampling + baggingは、recallは0.873と高く、precisionも0.6~0.65程度である。
  • だが、両者は確率の補正を加えるとprecisionが最大0.1程度上昇するが、recallが0.4台に低下する



手法の優劣がデータセットによって異なるため、一貫した結果が得られませんでした。と言っても2データセットしか試していませんが。undersampling + baggingに関しては、全てのデータセットで全てのスコアで優れた結果が出る、といった明確な結論が出ればよかったのですが、そうはなりませんでした。ただ一貫した傾向としては、recallに関して高めの値が出るがprecisionではそうでもないこと、確率の補正を行うとrecallは下がるがprecisionは上がることが挙げられると思います。理想としてはデータセットごとに全ての手法を試して目的に沿ったスコアが良い手法を選択することがありますが、かなりの労力がかかりますので、スコアが安定しているように見えるweightオプションを決め打ちで設定してしまうのも手かと思います。



