My Personal LightGBM Template

Posted at 2019-10-11


This is a collection of boilerplate code for training LightGBM models, put together partly for my own reference.

Intended audience

・Readers who already know LightGBM
・Readers who want to use Optuna with LightGBM
・Readers who roughly know how to write this but find it tedious to write from scratch every time

Contents

  1. A simple model
  2. With early stopping
  3. Parameter search with Optuna + early stopping

Evaluation

Since accuracy can vary significantly depending on how the data is split into train and test sets,
the final score is taken as the average over k-fold splits.

Data

Here we create dummy data as follows.

# Create a dummy dataset

import pandas as pd
from sklearn.datasets import make_classification


data = make_classification(n_samples=10000, n_features=100, n_classes=2)

X = pd.DataFrame(data[0])
y = pd.Series(data[1])  # a 1-d Series avoids column-vector warnings downstream

Evaluation function

AUC is computed from the predicted probabilities, while precision and recall use the 0/1 labels obtained by thresholding.


from sklearn import metrics

def get_evaluate(y_test, predict_proba, predict):

    # AUC is computed from the predicted probabilities; thresholded labels
    # would collapse the ROC curve to a single point and understate the AUC
    fpr, tpr, thr_arr = metrics.roc_curve(y_test, predict_proba)

    auc = metrics.auc(fpr, tpr)
    precision = metrics.precision_score(y_test, predict)
    recall = metrics.recall_score(y_test, predict)

    return auc, precision, recall
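
A quick sanity check of get_evaluate with hand-made values (the numbers below are purely illustrative):

import numpy as np

y_true = np.array([0, 1, 1, 0, 1])
proba = np.array([0.2, 0.9, 0.6, 0.4, 0.3])
labels = (proba >= 0.5).astype(int)

auc, precision, recall = get_evaluate(y_true, proba, labels)
print(auc, precision, recall)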

1. A simple model


import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn.model_selection import StratifiedKFold

import lightgbm as lgb

# number of k-fold splits
k = 5
skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=0)

lgbm_params = {'objective': 'binary'}

auc_list = []
precision_list = []
recall_list = []

# split with k-fold
for train_index, test_index in skf.split(X, y):

    X_train = X.iloc[train_index]
    y_train = y.iloc[train_index]
    X_test = X.iloc[test_index]
    y_test = y.iloc[test_index]

    # create the LightGBM dataset
    lgb_train = lgb.Dataset(X_train, y_train)

    # train
    model = lgb.train(lgbm_params, lgb_train)

    predict_proba = model.predict(X_test, num_iteration=model.best_iteration)
    # label as 1 if predict_proba is 0.5 or above
    predict = [0 if i < 0.5 else 1 for i in predict_proba]

    auc, precision, recall = get_evaluate(y_test, predict_proba, predict)

    print('AUC:{}, precision:{}, recall:{}'.format(auc, precision, recall))

    auc_list.append(auc)
    precision_list.append(precision)
    recall_list.append(recall)

# average over the k folds
print('Kfold average AUC:{}, precision:{}, recall:{}'.format(np.mean(auc_list), 
                                                             np.mean(precision_list), 
                                                             np.mean(recall_list)))
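
As an aside, since model.predict returns a NumPy array, the list-comprehension thresholding can also be written in vectorized form:

import numpy as np

# vectorized alternative to the list comprehension above
predict_proba = np.array([0.2, 0.7, 0.5, 0.1])  # stand-in for model.predict(...)
predict = (predict_proba >= 0.5).astype(int)
print(predict)  # [0 1 1 0]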

2. With early stopping

import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn.model_selection import train_test_split, StratifiedKFold

import lightgbm as lgb


# number of k-fold splits
k = 5
skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=0)

lgbm_params = {'objective': 'binary'}

auc_list = []
precision_list = []
recall_list = []

# train with k-fold
for train_index, test_index in skf.split(X, y):

    X_train = X.iloc[train_index]
    y_train = y.iloc[train_index]
    X_test = X.iloc[test_index]
    y_test = y.iloc[test_index]

    # here, valid is used for early stopping and eval as the hold-out for final scoring; choose the split sizes to suit your dataset
    X_eval, X_valid, y_eval, y_valid = train_test_split(X_test, y_test, random_state=90, 
                                                        shuffle=True, stratify=y_test, test_size=0.3)

    # create the LightGBM dataset
    lgb_train = lgb.Dataset(X_train, y_train)

    # validation set for early stopping
    lgb_valid = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

    model = lgb.train(lgbm_params, 
                      lgb_train,
                      valid_sets=lgb_valid,
                      num_boost_round=100000,
                      early_stopping_rounds=10)

    # score on the hold-out (eval) split; valid was used for early stopping,
    # so scoring on it would leak information
    predict_proba = model.predict(X_eval, num_iteration=model.best_iteration)
    predict = [0 if i < 0.5 else 1 for i in predict_proba]

    auc, precision, recall = get_evaluate(y_eval, predict_proba, predict)

    print('AUC:{}, precision:{}, recall:{}'.format(auc, precision, recall))

    auc_list.append(auc)
    precision_list.append(precision)
    recall_list.append(recall)

# average over the k folds
print('Kfold average AUC:{}, precision:{}, recall:{}'.format(np.mean(auc_list), 
                                                             np.mean(precision_list), 
                                                             np.mean(recall_list)))
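
One caveat for newer environments: the early_stopping_rounds argument was removed from lgb.train in LightGBM 4.x, where early stopping is configured through callbacks instead. A minimal sketch of the equivalent call (same variable names as above):

# LightGBM >= 4.0: early stopping via callbacks
model = lgb.train(lgbm_params,
                  lgb_train,
                  valid_sets=[lgb_valid],
                  num_boost_round=100000,
                  callbacks=[lgb.early_stopping(stopping_rounds=10),
                             lgb.log_evaluation(period=100)])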

3. Parameter search with Optuna + early stopping

Imports

# optuna

import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn.model_selection import train_test_split, StratifiedKFold

import optuna
import lightgbm as lgb

Parameter search

This should be essentially the official Optuna sample.
If you want to search additional parameters, pass them into param the same way.


def objective(trial):

    # X_train / y_train come from the enclosing k-fold loop further below
    train_x, test_x, train_y, test_y = train_test_split(X_train, y_train, test_size=0.25, 
                                                        shuffle=True, stratify=y_train)
    dtrain = lgb.Dataset(train_x, label=train_y)

    param = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'verbosity': -1,
        'boosting_type': trial.suggest_categorical('boosting', ['gbdt', 'dart', 'goss']),
        'num_leaves': trial.suggest_int('num_leaves', 10, 1000),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-8, 1.0)
    }

    if param['boosting_type'] == 'dart':
        param['drop_rate'] = trial.suggest_loguniform('drop_rate', 1e-8, 1.0)
        param['skip_drop'] = trial.suggest_loguniform('skip_drop', 1e-8, 1.0)
    if param['boosting_type'] == 'goss':
        param['top_rate'] = trial.suggest_uniform('top_rate', 0.0, 1.0)
        param['other_rate'] = trial.suggest_uniform('other_rate', 0.0, 1.0 - param['top_rate'])

    gbm = lgb.train(param, dtrain)
    preds = gbm.predict(test_x)
    pred_labels = np.rint(preds)

    accuracy = metrics.accuracy_score(test_y, pred_labels)

    return accuracy
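
For reference, suggest_loguniform and suggest_uniform have since been deprecated and removed in Optuna 3.x; on recent versions the same search space is written with suggest_float. The param block inside objective would become:

# Optuna >= 3.0: suggest_float replaces suggest_loguniform / suggest_uniform
param = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'verbosity': -1,
    'boosting_type': trial.suggest_categorical('boosting', ['gbdt', 'dart', 'goss']),
    'num_leaves': trial.suggest_int('num_leaves', 10, 1000),
    'learning_rate': trial.suggest_float('learning_rate', 1e-8, 1.0, log=True)
}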

Training

Train with the parameters found by the search.


# number of k-fold splits
k = 5
skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=0)

lgbm_params = {'objective': 'binary'}

auc_list = []
precision_list = []
recall_list = []

for train_index, test_index in skf.split(X, y):

    X_train = X.iloc[train_index]
    y_train = y.iloc[train_index]
    X_test = X.iloc[test_index]
    y_test = y.iloc[test_index]

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=100)

    print('Number of finished trials: {}'.format(len(study.trials)))

    print('Best trial:')
    trial = study.best_trial

    print('  Value: {}'.format(trial.value))

    print('  Params: ')
    for key, value in trial.params.items():
        print('    {}: {}'.format(key, value))    

    # parameters found by the Optuna search
    trial.params['objective'] = 'binary'
    lgbm_params = trial.params

    # here, valid is used for early stopping and eval as the hold-out for final scoring; choose the split sizes to suit your dataset
    X_eval, X_valid, y_eval, y_valid = train_test_split(X_test, y_test, random_state=90, 
                                                        shuffle=True, stratify=y_test, test_size=0.3)

    # create the LightGBM dataset
    lgb_train = lgb.Dataset(X_train, y_train)

    # validation set for early stopping
    lgb_valid = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

    model = lgb.train(lgbm_params, 
                      lgb_train,
                      valid_sets=lgb_valid,
                      num_boost_round=100000,
                      early_stopping_rounds=10)

    # score on the hold-out (eval) split; valid was used for early stopping,
    # so scoring on it would leak information
    predict_proba = model.predict(X_eval, num_iteration=model.best_iteration)
    predict = [0 if i < 0.5 else 1 for i in predict_proba]

    auc, precision, recall = get_evaluate(y_eval, predict_proba, predict)

    print('AUC:{}, precision:{}, recall:{}'.format(auc, precision, recall))

    auc_list.append(auc)
    precision_list.append(precision)
    recall_list.append(recall)

# average over the k folds
print('Kfold average AUC:{}, precision:{}, recall:{}'.format(np.mean(auc_list), 
                                                             np.mean(precision_list), 
                                                             np.mean(recall_list)))
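
As an alternative to hand-writing the objective above, Optuna also provides a stepwise LightGBM tuner (optuna.integration.lightgbm; in recent Optuna releases it is packaged separately as optuna-integration). A rough sketch, assuming that integration is installed and X, y, skf are defined as above:

import lightgbm as lgb
import optuna.integration.lightgbm as olgb

# stepwise tuning of the main LightGBM parameters with cross-validation
params = {'objective': 'binary', 'metric': 'binary_logloss', 'verbosity': -1}
tuner = olgb.LightGBMTunerCV(params, lgb.Dataset(X, y), folds=skf,
                             num_boost_round=1000)
tuner.run()
print(tuner.best_params, tuner.best_score)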

References

LightGBMを使ってみる (CUBE SUGAR CONTAINER)
https://blog.amedama.jp/entry/2018/05/01/081842

LightGBM official documentation
https://lightgbm.readthedocs.io/en/latest/

optuna
https://github.com/pfnet/optuna
