Boruta wouldn't run with LightGBM or XGBoost out of the box, so I rewrote parts of it to make them work. For the usual approach with plain sklearn estimators, see:
https://qiita.com/studio_haneya/items/bdb25b19baaf43d867d7
## Environment

- Windows 10
- Python 3.6.7
- scikit-learn 0.21.3
- lightgbm 2.3.0
- xgboost 0.90
## What I changed

BorutaPy is designed to work with any sklearn estimator that exposes `feature_importances_`, so RandomForest and GradientBoosting can be used as-is. The sklearn wrappers of LightGBM and XGBoost look sklearn-like, but they differ in a few small ways, so they fail out of the box. I therefore subclassed `BorutaPy` and rewrote the parts that break.
When you try to use LightGBM with Boruta, the following two differences cause problems:

| | boruta (sklearn) | lgb/xgb |
|---|---|---|
| `random_state` | `np.random.RandomState` | `int` |
| no `max_depth` limit | `None` | `-1` |
In sklearn you can pass the seed as an `np.random.RandomState()`, but LightGBM/XGBoost only accept a plain `int`. In addition, when the `max_depth` limit is removed, sklearn stores `None` in the params while LightGBM stores `-1`, so the `n_estimators` value that Boruta derives from `max_depth` is computed incorrectly (it goes negative, in fact). Fixing these two points is enough to make it run.
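To see both differences concretely, here is a minimal check (a sketch based on the versions listed above; the formula is the one Boruta uses in `_get_tree_num`, shown in the full code below):

```python
import numpy as np
import lightgbm as lgb

# LightGBM reports "no depth limit" as -1, where sklearn would use None
depth = lgb.LGBMClassifier().get_params()['max_depth']
print(depth)  # -1

# BorutaPy's original check only catches None, so -1 slips through
# and the derived tree count goes negative:
n_feat = 500
if depth is None:  # never true for LightGBM's -1
    depth = 10
print(int(100 * (n_feat * 2) / (np.sqrt(n_feat * 2) * depth)))  # -3162

# Likewise, np.random.RandomState(42) is a valid random_state for sklearn
# estimators, but in these versions LGBMClassifier/XGBClassifier expect an int.
```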
## Using LightGBM with Boruta

Below is sample code that makes LightGBM work. I subclass `BorutaPy` and rewrite a few pieces, but because `_fit()` converts `random_state` into an `np.random.RandomState()` before calling `self.estimator.fit()`, I couldn't think of anything short of copying the whole method, so the code ended up rather long. If you know a better way, please let me know.
(Fixed 2019-11-02: the random state value was not being applied; this is now corrected.)
```python
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from boruta import BorutaPy
import lightgbm as lgb
import xgboost as xgb
from sklearn.utils import check_random_state


class BorutaPyForLGB(BorutaPy):
    def __init__(self, estimator, n_estimators=1000, perc=100, alpha=0.05,
                 two_step=True, max_iter=100, random_state=None, verbose=0):
        super().__init__(estimator, n_estimators, perc, alpha,
                         two_step, max_iter, random_state, verbose)
        # LightGBM/XGBoost want an int seed, so keep one around separately
        # (the upper bound must stay within np.random.randint's int range)
        if random_state is None:
            self.random_state_input = np.random.randint(0, 2**31 - 1)
        elif isinstance(random_state, int):
            self.random_state_input = random_state
        else:
            raise TypeError('random_state must be int or None')

    def _get_tree_num(self, n_feat):
        depth = self.estimator.get_params()['max_depth']
        # treat LightGBM's -1 ("no limit") the same as sklearn's None
        if (depth is None) or (depth <= 0):
            depth = 10
        f_repr = 100
        multi = ((n_feat * 2) / (np.sqrt(n_feat * 2) * depth))
        n_estimators = int(multi * f_repr)
        return n_estimators

    def _fit(self, X, y):
        # check input params
        self._check_params(X, y)
        self.random_state = check_random_state(self.random_state)

        # setup variables for Boruta
        n_sample, n_feat = X.shape
        _iter = 1
        # holds the decision about each feature:
        # 0 - default state = tentative in original code
        # 1 - accepted in original code
        # -1 - rejected in original code
        dec_reg = np.zeros(n_feat, dtype=np.int)
        # counts how many times a given feature was more important than
        # the best of the shadow features
        hit_reg = np.zeros(n_feat, dtype=np.int)
        # these record the history of the iterations
        imp_history = np.zeros(n_feat, dtype=np.float)
        sha_max_history = []

        # set n_estimators
        if self.n_estimators != 'auto':
            self.estimator.set_params(n_estimators=self.n_estimators)

        # main feature selection loop
        while np.any(dec_reg == 0) and _iter < self.max_iter:
            # find optimal number of trees and depth
            if self.n_estimators == 'auto':
                # number of features that aren't rejected
                not_rejected = np.where(dec_reg >= 0)[0].shape[0]
                n_tree = self._get_tree_num(not_rejected)
                self.estimator.set_params(n_estimators=n_tree)

            # make sure we start with a new tree in each iteration
            # (pass the int seed instead of an np.random.RandomState)
            self.estimator.set_params(random_state=self.random_state_input)

            # add shadow attributes, shuffle them and train estimator, get imps
            cur_imp = self._add_shadows_get_imps(X, y, dec_reg)

            # get the threshold of shadow importances we will use for rejection
            imp_sha_max = np.percentile(cur_imp[1], self.perc)

            # record importance history
            sha_max_history.append(imp_sha_max)
            imp_history = np.vstack((imp_history, cur_imp[0]))

            # register which feature is more imp than the max of shadows
            hit_reg = self._assign_hits(hit_reg, cur_imp, imp_sha_max)

            # based on hit_reg we check if a feature is doing better than
            # expected by chance
            dec_reg = self._do_tests(dec_reg, hit_reg, _iter)

            # print out confirmed features
            if self.verbose > 0 and _iter < self.max_iter:
                self._print_results(dec_reg, _iter, 0)
            if _iter < self.max_iter:
                _iter += 1

        # we automatically apply R package's rough fix for tentative ones
        confirmed = np.where(dec_reg == 1)[0]
        tentative = np.where(dec_reg == 0)[0]
        # ignore the first row of zeros
        tentative_median = np.median(imp_history[1:, tentative], axis=0)
        # which tentative to keep
        tentative_confirmed = np.where(tentative_median
                                       > np.median(sha_max_history))[0]
        tentative = tentative[tentative_confirmed]

        # basic result variables
        self.n_features_ = confirmed.shape[0]
        self.support_ = np.zeros(n_feat, dtype=np.bool)
        self.support_[confirmed] = 1
        self.support_weak_ = np.zeros(n_feat, dtype=np.bool)
        self.support_weak_[tentative] = 1

        # ranking, confirmed variables are rank 1
        self.ranking_ = np.ones(n_feat, dtype=np.int)
        # tentative variables are rank 2
        self.ranking_[tentative] = 2
        # selected = confirmed and tentative
        selected = np.hstack((confirmed, tentative))
        # all rejected features are sorted by importance history
        not_selected = np.setdiff1d(np.arange(n_feat), selected)
        # large importance values should rank higher = lower ranks -> *(-1)
        imp_history_rejected = imp_history[1:, not_selected] * -1

        # update rank for not_selected features
        if not_selected.shape[0] > 0:
            # calculate ranks in each iteration, then median of ranks across feats
            iter_ranks = self._nanrankdata(imp_history_rejected, axis=1)
            rank_medians = np.nanmedian(iter_ranks, axis=0)
            ranks = self._nanrankdata(rank_medians, axis=0)

            # set smallest rank to 3 if there are tentative feats
            if tentative.shape[0] > 0:
                ranks = ranks - np.min(ranks) + 3
            else:
                # and 2 otherwise
                ranks = ranks - np.min(ranks) + 2
            self.ranking_[not_selected] = ranks
        else:
            # all are selected, thus we set feature supports to True
            self.support_ = np.ones(n_feat, dtype=np.bool)

        # notify user
        if self.verbose > 0:
            self._print_results(dec_reg, _iter, 1)
        return self
```
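On the question of a shorter way: the only reason `_fit()` has to be copied wholesale is the single line that calls `self.estimator.set_params(random_state=...)` with an `np.random.RandomState`. One possibly shorter alternative (an untested sketch, not what this article uses) would be a thin wrapper around the estimator that coerces the seed to an int; combined with the small `_get_tree_num` override above, the stock `_fit()` could then be left alone. `IntSeedEstimator` is a hypothetical name:

```python
import numpy as np


class IntSeedEstimator:
    """Hypothetical wrapper: forwards everything to the wrapped estimator,
    but converts an np.random.RandomState seed to an int in set_params()."""

    def __init__(self, estimator):
        self.estimator = estimator

    def get_params(self, deep=True):
        return self.estimator.get_params(deep=deep)

    def set_params(self, **params):
        rs = params.get('random_state')
        if isinstance(rs, np.random.RandomState):
            # LightGBM/XGBoost want an int, so draw one from the RandomState
            params['random_state'] = rs.randint(0, 2**31 - 1)
        self.estimator.set_params(**params)
        return self

    def fit(self, X, y):
        self.estimator.fit(X, y)
        return self

    @property
    def feature_importances_(self):
        return self.estimator.feature_importances_
```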
With that in place, we can run the following. The code is based on:
https://github.com/masakiaota/blog/blob/master/boruta/Madalon_Data_Set.ipynb
```python
def main():
    # load the data
    data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/madelon/MADELON/madelon_train.data'
    label_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/madelon/MADELON/madelon_train.labels'
    X_data = pd.read_csv(data_url, sep=" ", header=None)
    y_data = pd.read_csv(label_url, sep=" ", header=None)
    data = X_data.iloc[:, 0:500]
    data['target'] = y_data[0]
    y = data['target']
    X = data.drop(columns='target')
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

    # train on all features
    model = lgb.LGBMClassifier(objective='binary',
                               num_leaves=23,
                               learning_rate=0.1,
                               n_estimators=100)
    model.fit(X_train.values, y_train.values)
    y_test_pred = model.predict(X_test.values)
    print(confusion_matrix(y_test.values, y_test_pred, labels=model.classes_), '\n')
    print('SCORE with ALL Features: %1.2f\n' % accuracy_score(y_test, y_test_pred))

    # feature selection with Boruta (using the modified BorutaPy above)
    model = lgb.LGBMClassifier(objective='binary',
                               num_leaves=23,
                               learning_rate=0.1,
                               n_estimators=100)
    feat_selector = BorutaPyForLGB(model, n_estimators='auto', two_step=False, verbose=2, random_state=42)
    feat_selector.fit(X_train.values, y_train.values)
    print(X_train.columns[feat_selector.support_])

    # extract the selected features
    X_train_selected = X_train.iloc[:, feat_selector.support_]
    X_test_selected = X_test.iloc[:, feat_selector.support_]
    print(X_test_selected.head())

    # train with the selected features only
    model = lgb.LGBMClassifier(objective='binary',
                               num_leaves=23,
                               learning_rate=0.1,
                               n_estimators=100)
    model.fit(X_train_selected.values, y_train.values)
    y_test_pred = model.predict(X_test_selected.values)
    print(confusion_matrix(y_test.values, y_test_pred, labels=model.classes_), '\n')
    print('SCORE with selected Features: %1.2f\n' % accuracy_score(y_test, y_test_pred))


if __name__ == '__main__':
    main()
```
Running this gives the results below. The selection seems to be working: Madelon is constructed so that only 20 of its 500 features carry signal, so confirming 12 of them looks plausible, and the accuracy improves.
```
[[192  57]
 [ 49 202]]

SCORE with ALL Features: 0.79

Index([48, 105, 153, 241, 318, 336, 338, 378, 442, 453, 472, 475], dtype='object')

[[212  37]
 [ 34 217]]

SCORE with selected Features: 0.86
```
## Using XGBoost with Boruta

The `BorutaPyForLGB()` we built above can also be used with xgboost. The code below mirrors the LightGBM example, with `xgb.XGBClassifier` swapped in as the estimator.
```python
def main():
    # load the data
    data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/madelon/MADELON/madelon_train.data'
    label_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/madelon/MADELON/madelon_train.labels'
    X_data = pd.read_csv(data_url, sep=" ", header=None)
    y_data = pd.read_csv(label_url, sep=" ", header=None)
    data = X_data.iloc[:, 0:500]
    data['target'] = y_data[0]
    y = data['target']
    X = data.drop(columns='target')
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

    # train on all features
    model = xgb.XGBClassifier(objective='binary:logistic',
                              learning_rate=0.1,
                              n_estimators=100)
    model.fit(X_train.values, y_train.values)
    y_test_pred = model.predict(X_test.values)
    print(confusion_matrix(y_test.values, y_test_pred, labels=model.classes_), '\n')
    print('SCORE with ALL Features: %1.2f\n' % accuracy_score(y_test, y_test_pred))

    # feature selection with Boruta (using the modified BorutaPy above)
    model = xgb.XGBClassifier(objective='binary:logistic',
                              learning_rate=0.1,
                              n_estimators=100)
    feat_selector = BorutaPyForLGB(model, n_estimators='auto', two_step=False, verbose=2, random_state=42)
    feat_selector.fit(X_train.values, y_train.values)
    print(X_train.columns[feat_selector.support_])

    # extract the selected features
    X_train_selected = X_train.iloc[:, feat_selector.support_]
    X_test_selected = X_test.iloc[:, feat_selector.support_]
    print(X_test_selected.head())

    # train with the selected features only
    model = xgb.XGBClassifier(objective='binary:logistic',
                              learning_rate=0.1,
                              n_estimators=100)
    model.fit(X_train_selected.values, y_train.values)
    y_test_pred = model.predict(X_test_selected.values)
    print(confusion_matrix(y_test.values, y_test_pred, labels=model.classes_), '\n')
    print('SCORE with selected Features: %1.2f\n' % accuracy_score(y_test, y_test_pred))


if __name__ == '__main__':
    main()
```
It does run, but here things don't go well: accuracy actually drops. Boruta seems to have pruned away far too many features, and I'm not sure why.
```
[[182  67]
 [ 75 176]]

SCORE with ALL Features: 0.72

Index([28, 378, 451, 475], dtype='object')

[[148 101]
 [109 142]]

SCORE with selected Features: 0.58
```
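If you hit the same over-pruning, two knobs that BorutaPy actually provides may be worth trying (untested here): lower `perc`, which sets the percentile of shadow importances used as the acceptance threshold (100 is the strictest), or also keep the tentative features recorded in `support_weak_`. A sketch, assuming the same `model` and training data as in `main()` above:

```python
# less strict threshold: use the 80th percentile of shadow importances
feat_selector = BorutaPyForLGB(model, n_estimators='auto', perc=80,
                               two_step=False, verbose=2, random_state=42)
feat_selector.fit(X_train.values, y_train.values)

# keep confirmed plus tentative features
keep = feat_selector.support_ | feat_selector.support_weak_
X_train_selected = X_train.iloc[:, keep]
```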
And that's about it. If you know a more compact way to write this, please let me know. Let's try!