LightGBMの俺用テンプレート
LightGBMモデルを学習する際の、テンプレ的なコードを自分用も兼ねてまとめました。
対象
・LightGBMについては知っている方
・LightGBMでoptuna使いたい方
・書き方はなんとなくわかるけど毎回1から書くのが面倒な方
お品書き
- シンプルなモデル
- early_stopping版
- optunaでパラメータ探索 + early_stopping版
評価方法
trainとtestの分割によって精度が大きく変わる場合を考慮して、
kfoldの平均値を最終的な精度としています。
データ
ここでは下記のようにダミーデータを作っています。
# Build a dummy binary-classification dataset.
import pandas as pd
from sklearn.datasets import make_classification

# Fix: pin random_state so the fold scores reported below are reproducible.
data = make_classification(n_samples=10000, n_features=100, n_classes=2,
                           random_state=0)
X = pd.DataFrame(data[0])
# Fix: keep the target 1-D. sklearn/LightGBM expect a label vector, and a
# (n, 1) DataFrame triggers shape warnings downstream; Series supports the
# same .iloc indexing used by the k-fold loops.
y = pd.Series(data[1])
評価に使う関数
from sklearn import metrics


def get_evaluate(y_test, predict, predict_proba=None):
    """Return (auc, precision, recall) for a binary-classification result.

    Args:
        y_test: true binary labels.
        predict: predicted hard labels (0/1); used for precision and recall.
        predict_proba: optional predicted scores/probabilities. When given,
            ROC AUC is computed from the scores — the correct input for AUC.
            When omitted, AUC falls back to the hard labels (the original
            behaviour), which collapses the ROC curve to a single operating
            point and understates AUC.

    Returns:
        Tuple of (auc, precision, recall) as floats.
    """
    scores = predict if predict_proba is None else predict_proba
    fpr, tpr, thr_arr = metrics.roc_curve(y_test, scores)
    auc = metrics.auc(fpr, tpr)
    precision = metrics.precision_score(y_test, predict)
    recall = metrics.recall_score(y_test, predict)
    return auc, precision, recall
1. シンプルなモデル
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb

# Number of cross-validation folds.
k = 5
skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=0)
lgbm_params = {'objective': 'binary'}
auc_list = []
precision_list = []
recall_list = []
# Train and evaluate one model per fold.
for train_index, test_index in skf.split(X, y):
    X_train = X.iloc[train_index]
    y_train = y.iloc[train_index]
    X_test = X.iloc[test_index]
    y_test = y.iloc[test_index]
    # Build the LightGBM dataset.
    lgb_train = lgb.Dataset(X_train, y_train)
    # Train (no early stopping in this simple variant).
    model = lgb.train(lgbm_params, lgb_train)
    predict_proba = model.predict(X_test, num_iteration=model.best_iteration)
    # Hard labels at the 0.5 threshold, for precision/recall.
    predict = [0 if i < 0.5 else 1 for i in predict_proba]
    # Fix: ROC AUC must be computed from the predicted scores, not the
    # thresholded labels — labels collapse the ROC curve to one point.
    auc = metrics.roc_auc_score(y_test, predict_proba)
    _, precision, recall = get_evaluate(y_test, predict)
    print('AUC:{}, precision:{}, recall:{}'.format(auc, precision, recall))
    auc_list.append(auc)
    precision_list.append(precision)
    recall_list.append(recall)
# Report the mean across folds as the final score.
print('Kfold平均 AUC:{}, precision:{}, recall:{}'.format(np.mean(auc_list),
                                                        np.mean(precision_list),
                                                        np.mean(recall_list)))
2. early_stopping版
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split, StratifiedKFold
import lightgbm as lgb
from tqdm import tqdm_notebook as tqdm

# Number of cross-validation folds.
k = 5
skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=0)
lgbm_params = {'objective': 'binary'}
auc_list = []
precision_list = []
recall_list = []
# Train and evaluate one model per fold.
for train_index, test_index in skf.split(X, y):
    X_train = X.iloc[train_index]
    y_train = y.iloc[train_index]
    X_test = X.iloc[test_index]
    y_test = y.iloc[test_index]
    # valid: early-stopping evaluation, eval: hold-out scoring.
    # Tune the split sizes to your dataset.
    X_eval, X_valid, y_eval, y_valid = train_test_split(X_test, y_test, random_state=90,
                                                        shuffle=True, stratify=y_test,
                                                        test_size=0.3)
    # Build the LightGBM datasets.
    lgb_train = lgb.Dataset(X_train, y_train)
    # Validation set consumed by early stopping.
    lgb_valid = lgb.Dataset(X_valid, y_valid, reference=lgb_train)
    # Fix: the early_stopping_rounds keyword was removed in LightGBM 4;
    # the callback form below also works on 3.3+.
    model = lgb.train(lgbm_params,
                      lgb_train,
                      valid_sets=lgb_valid,
                      num_boost_round=100000,
                      callbacks=[lgb.early_stopping(stopping_rounds=10)])
    # Fix: score on the hold-out split only. The original predicted on the
    # whole X_test, which includes X_valid used for early stopping, so the
    # reported metrics leaked the validation data.
    predict_proba = model.predict(X_eval, num_iteration=model.best_iteration)
    predict = [0 if i < 0.5 else 1 for i in predict_proba]
    # Fix: ROC AUC from scores, not thresholded labels.
    auc = metrics.roc_auc_score(y_eval, predict_proba)
    _, precision, recall = get_evaluate(y_eval, predict)
    print('AUC:{}, precision:{}, recall:{}'.format(auc, precision, recall))
    auc_list.append(auc)
    precision_list.append(precision)
    recall_list.append(recall)
# Report the mean across folds as the final score.
print('Kfold平均 AUC:{}, precision:{}, recall:{}'.format(np.mean(auc_list),
                                                        np.mean(precision_list),
                                                        np.mean(recall_list)))
3. optunaでパラメータ探索 + early_stopping版
インポート
# optuna
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split, StratifiedKFold
import optuna
import lightgbm as lgb
パラメータ探索処理
これは公式サンプルそのままだったはず・・・
他にも探索したい場合は、同じようにparamsに渡してやる。
def objective(trial):
    """Optuna objective: train LightGBM on a sub-split of the current fold's
    training data and return hold-out accuracy (the study maximises this).

    NOTE(review): relies on the globals X_train / y_train being assigned by
    the surrounding cross-validation loop before study.optimize() is called.
    """
    train_x, test_x, train_y, test_y = train_test_split(X_train, y_train, test_size=0.25,
                                                        shuffle=True, stratify=y_train)
    dtrain = lgb.Dataset(train_x, label=train_y)
    param = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'verbosity': -1,
        'boosting_type': trial.suggest_categorical('boosting', ['gbdt', 'dart', 'goss']),
        'num_leaves': trial.suggest_int('num_leaves', 10, 1000),
        # Fix: suggest_loguniform/suggest_uniform were removed in Optuna 3;
        # suggest_float covers both (log=True gives the log-uniform prior).
        'learning_rate': trial.suggest_float('learning_rate', 1e-8, 1.0, log=True)
    }
    # DART-only dropout parameters.
    if param['boosting_type'] == 'dart':
        param['drop_rate'] = trial.suggest_float('drop_rate', 1e-8, 1.0, log=True)
        param['skip_drop'] = trial.suggest_float('skip_drop', 1e-8, 1.0, log=True)
    # GOSS-only sampling rates; top_rate + other_rate must not exceed 1.0,
    # hence the dependent upper bound on other_rate.
    if param['boosting_type'] == 'goss':
        param['top_rate'] = trial.suggest_float('top_rate', 0.0, 1.0)
        param['other_rate'] = trial.suggest_float('other_rate', 0.0, 1.0 - param['top_rate'])
    gbm = lgb.train(param, dtrain)
    preds = gbm.predict(test_x)
    # Round scores to 0/1 for the accuracy metric.
    pred_labels = np.rint(preds)
    accuracy = metrics.accuracy_score(test_y, pred_labels)
    return accuracy
学習
探索したパラメータを使って学習
# Number of cross-validation folds.
k = 5
skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=0)
lgbm_params = {'objective': 'binary'}
auc_list = []
precision_list = []
recall_list = []
for train_index, test_index in skf.split(X, y):
    X_train = X.iloc[train_index]
    y_train = y.iloc[train_index]
    X_test = X.iloc[test_index]
    y_test = y.iloc[test_index]
    # Search hyper-parameters on this fold's training data (objective reads
    # the X_train / y_train globals assigned just above).
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=100)
    print('Number of finished trials: {}'.format(len(study.trials)))
    print('Best trial:')
    trial = study.best_trial
    print(' Value: {}'.format(trial.value))
    print(' Params: ')
    for key, value in trial.params.items():
        print(' {}: {}'.format(key, value))
    # Parameters found by optuna.
    # Fix: copy before adding the fixed objective so the study's stored
    # best_trial.params are not mutated in place.
    lgbm_params = dict(trial.params)
    lgbm_params['objective'] = 'binary'
    # valid: early-stopping evaluation, eval: hold-out scoring.
    # Tune the split sizes to your dataset.
    X_eval, X_valid, y_eval, y_valid = train_test_split(X_test, y_test, random_state=90,
                                                        shuffle=True, stratify=y_test,
                                                        test_size=0.3)
    # Build the LightGBM datasets.
    lgb_train = lgb.Dataset(X_train, y_train)
    # Validation set consumed by early stopping.
    lgb_valid = lgb.Dataset(X_valid, y_valid, reference=lgb_train)
    # Fix: the early_stopping_rounds keyword was removed in LightGBM 4;
    # the callback form below also works on 3.3+.
    model = lgb.train(lgbm_params,
                      lgb_train,
                      valid_sets=lgb_valid,
                      num_boost_round=100000,
                      callbacks=[lgb.early_stopping(stopping_rounds=10)])
    # Fix: score on the hold-out split only. The original predicted on the
    # whole X_test, which includes X_valid used for early stopping, so the
    # reported metrics leaked the validation data.
    predict_proba = model.predict(X_eval, num_iteration=model.best_iteration)
    predict = [0 if i < 0.5 else 1 for i in predict_proba]
    # Fix: ROC AUC from scores, not thresholded labels.
    auc = metrics.roc_auc_score(y_eval, predict_proba)
    _, precision, recall = get_evaluate(y_eval, predict)
    print('AUC:{}, precision:{}, recall:{}'.format(auc, precision, recall))
    auc_list.append(auc)
    precision_list.append(precision)
    recall_list.append(recall)
# Report the mean across folds as the final score.
print('Kfold平均 AUC:{}, precision:{}, recall:{}'.format(np.mean(auc_list),
                                                        np.mean(precision_list),
                                                        np.mean(recall_list)))
参考
LightGBMを使ってみる CUBE SUGAR CONTAINER
https://blog.amedama.jp/entry/2018/05/01/081842
LightGBM 公式ドキュメント
https://lightgbm.readthedocs.io/en/latest/