はじめに
本記事ではDay1、Day2に引き続き、KaggleのコンペのTabular Playground Seriesで学んだことの備忘録を挙げていきます。
Tabular Playground Series
Day1では自分でデータ分析に挑戦した結果を、Day2では他のKagglerの人が実際に行ったデータの可視化、分析を、Day3では上位入賞者(厳密にいうとまだ開催期間中です)のデータのモデリングを中心に取り扱います。今回取り扱う記事は以下のものです。動作環境はJupyter Notebookです。
⚡Catboost with Optuna Starter [TPS-06]
[Simple NN Implementation for Beginners](https://www.kaggle.com/boss0ayush/simple-nn-implementation-for-beginners-top-7)
Decision Forest fed by Neural Network
⚡Catboost with Optuna Starter [TPS-06]
概要
この記事では主にOptunaと呼ばれるハイパーパラメータの調整を自動的に行ってくれるライブラリを用いてCatboostのハイパーパラメータをチューニングしています。OptunaはCatBoostに限らず、Scikit-learn、PyTorch、Keras、TensorFlow、Chainerなど大体の有名ライブラリで使用可能そうです。
GBDTの参考記事
公式
GBDT系の機械学習モデルであるXGBoost, LightGBM, CatBoostを動かしてみる。
XGBoostパラメータのまとめとランダムサーチ実装
PythonでCatBoostの解説
Optunaの参考記事
・github
・Optunaを使って関数最適化をしてみる
・公式リファレンス
・catboostとOptunaでハイパーパラメータ自動最適化
・Kerasでハイパーパラメータを自動調整したいならOptuna
・Optunaでハイパーパラメータの自動チューニング -Pytorch Lightning編-
・optunaのvisualization機能で探索結果を図示してみる
ライブラリやデータの確認
データの読み込み
data = pd.read_csv("train.csv")
ライブラリのインストール
# pip install catboost==0.24.2 — if you do not install a recent version,
# optuna may not work correctly with it.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.preprocessing import OneHotEncoder
import category_encoders as ce
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
from sklearn import model_selection
import lightgbm as lgbm
import xgboost as xgb
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import StackingClassifier, VotingClassifier
import optuna
import tqdm
import warnings
import sklearn.exceptions
# Silence the noisy warning categories so the notebook output stays readable.
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=RuntimeWarning)
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings("ignore", category=sklearn.exceptions.UndefinedMetricWarning)
訓練データとテストデータの読み込み
# Load the competition's training and test data.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
一応データの中身を表示しておきます。
train
目標変数(target)をカテゴリカル変数から数値データに
# Encode the target classes as integer codes (LabelEncoder yields 0..8
# for the 9 classes).
le = LabelEncoder()
train['target'] = le.fit_transform(train['target'])
train.columns
cols = list(train.columns)
cols.remove("target")  # drop columns that are not model features
cols.remove("id")
train['target']
# Same feature selection expressed as a comprehension.
not_features = ['id', 'target']
features = [column for column in train.columns if column not in not_features]
print(features)
数値データの標準化
# Standardise every feature column: subtract the mean and divide by the
# standard deviation (z-scoring, not min-max scaling). The scaler is fit
# on the training data only and then applied to the test data.
std = StandardScaler()
train[features] = std.fit_transform(train[features])
test[features] = std.transform(test[features])
# Feature matrix / target vector used by the Optuna objective below.
X = train.drop(columns=['target', 'id'])
Y = train['target']
optunaの設定
def objective(trial, data=X, target=Y):
    """Optuna objective: train a CatBoostClassifier with trial-suggested
    hyper-parameters and return the multi-class log loss on a hold-out split.

    NOTE(review): `data=X, target=Y` bind the module-level frames at
    definition time — fine for this notebook, but worth knowing.
    """
    # Hold out 20% of the data for evaluation.
    X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=42)
    # CatBoost hyper-parameter search space.
    # trial.suggest_int / trial.suggest_uniform sample a value from the given
    # [low, high] range. NOTE(review): suggest_uniform is deprecated in newer
    # Optuna in favour of suggest_float.
    params = {'iterations': trial.suggest_int("iterations", 4000, 25000),  # max number of trees built
              # iterations to continue after the optimum (overfitting detector wait)
              'od_wait': trial.suggest_int('od_wait', 500, 2300),
              'loss_function': 'MultiClass',
              'task_type': "GPU",
              'eval_metric': 'MultiClass',  # used for overfitting detection and best-model selection
              'leaf_estimation_method': 'Newton',
              'bootstrap_type': 'Bernoulli',
              'learning_rate': trial.suggest_uniform('learning_rate', 0.02, 1),
              'reg_lambda': trial.suggest_uniform('reg_lambda', 1e-5, 100),
              'subsample': trial.suggest_uniform('subsample', 0, 1),
              # randomness added to the split score; helps against overfitting
              'random_strength': trial.suggest_uniform('random_strength', 10, 50),
              'depth': trial.suggest_int('depth', 1, 15),  # tree depth
              # minimum number of training samples per leaf
              'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 30),
              'leaf_estimation_iterations': trial.suggest_int('leaf_estimation_iterations', 1, 15),
              }
    # Build the model (CatBoost / CatBoostClassifier / CatBoostRegressor exist;
    # this is a classification task).
    model = CatBoostClassifier(**params)
    # Fit with early stopping on the hold-out set.
    model.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=100, verbose=False)
    # Predict class probabilities on the hold-out set.
    y_preds = model.predict_proba(X_test)
    # Competition evaluation metric.
    log_loss_multi = log_loss(y_test, y_preds)
    return log_loss_multi
# Run the Optuna search: minimise the multi-class log loss over 100 trials.
OPTUNA_OPTIMIZATION = True
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)
print('Number of finished trials:', len(study.trials))
print('Best trial: score {}, params {}'.format(study.best_trial.value, study.best_trial.params))
# Visualise the search: optimisation history, parameter importances, slice
# and parallel-coordinate plots. `display` is the IPython/Jupyter helper.
if OPTUNA_OPTIMIZATION:
    display(optuna.visualization.plot_optimization_history(study))
    display(optuna.visualization.plot_param_importances(study))
    display(optuna.visualization.plot_slice(study))
    display(optuna.visualization.plot_parallel_coordinate(study))
catboostのモデル
# Build the final CatBoost model from the best Optuna trial and evaluate it
# with stratified K-fold cross-validation; test predictions are averaged
# over the folds.
cat_params = study.best_trial.params
# Re-attach the fixed (non-searched) parameters used during the search.
cat_params['loss_function'] = 'MultiClass'
cat_params['eval_metric'] = 'MultiClass'
cat_params['bootstrap_type'] = 'Bernoulli'
cat_params['leaf_estimation_method'] = 'Newton'
cat_params['random_state'] = 42
cat_params['task_type'] = 'GPU'
test_preds = None
# Stratified K-fold CV: each fold keeps approximately the same class
# distribution as the full data set.
# FIX: keep the fold count in one place so the final averaging divisor
# cannot silently disagree with n_splits (the original divided by a
# hard-coded 10).
N_SPLITS = 10
kf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)
# kf.split yields (train-index, validation-index) pairs.
for fold, (tr_index, val_index) in enumerate(kf.split(X.values, Y.values)):
    print("-" * 50)
    print(f"Fold {fold + 1}")
    x_train, x_val = X.values[tr_index], X.values[val_index]
    y_train, y_val = Y.values[tr_index], Y.values[val_index]
    eval_set = [(x_val, y_val)]
    model = CatBoostClassifier(**cat_params)
    model.fit(x_train, y_train, eval_set=eval_set, verbose=False)
    train_preds = model.predict(x_train)
    val_preds = model.predict_proba(x_val)
    print(log_loss(y_val, val_preds))
    # Accumulate test-set probabilities fold by fold.
    if test_preds is None:
        test_preds = model.predict_proba(test[cols].values)
    else:
        test_preds += model.predict_proba(test[cols].values)
    print("-" * 50)
# Average the accumulated probabilities over the folds.
test_preds /= N_SPLITS
提出
# Submission: fill the nine class-probability columns (Class_1 .. Class_9)
# from the fold-averaged test predictions.
submission = pd.read_csv("sample_submission.csv")
for class_idx in range(9):
    submission[f'Class_{class_idx + 1}'] = test_preds[:, class_idx]
submission.head()
このモデルだと提出用データでは1.74927の評価で250/749位くらいになるらしいです。
[Simple NN Implementation for Beginners]
くわしい解説はやめておきます。
ライブラリの読み込み
import numpy as np
import pandas as pd
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn import preprocessing
from sklearn.metrics import log_loss
import gc
import tensorflow as tf
import tensorflow_addons as tfa
import tensorflow.keras.backend as K
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import activations,callbacks
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import categorical_crossentropy
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import initializers
from keras.models import Model
ファイルの読み込み
# Load the competition files; the submission frame is indexed by id.
train = pd.read_csv('train.csv')
test = pd.read_csv("test.csv")
submission = pd.read_csv("sample_submission.csv")
submission = submission.set_index('id')
カテゴリカル変数の変換
targets = pd.get_dummies(train['target'])
モデルのための行列を作成
## Setting Up the metrics for Model
def custom_metric(y_true, y_pred):
    """Categorical cross-entropy with predictions clipped away from 0/1."""
    y_pred = K.clip(y_pred, 1e-15, 1-1e-15)
    # `cce` is defined below at module level; it is resolved at call time.
    loss = K.mean(cce(y_true, y_pred))
    return loss

cce = tf.keras.losses.CategoricalCrossentropy()
# Stop when the validation metric stops improving; restore the best weights.
es = tf.keras.callbacks.EarlyStopping(
    monitor='val_custom_metric', min_delta=1e-05, patience=5, verbose=0,
    mode='min', baseline=None, restore_best_weights=True)
# Shrink the learning rate when the validation metric plateaus.
plateau = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_custom_metric', factor=0.7, patience=2, verbose=0,
    mode='min')
モデルの作成
## Model Creation
def conv_model():
    """Build the Keras model: embedding → pointwise 1D conv → stacked
    weight-normalised dense blocks with concatenated skip connections →
    9-way softmax output.
    """
    conv_inputs = layers.Input(shape = (75))
    #----------- Embedding layers ----------------------
    # Embedding layer: maps integer indices to 7-dim vectors.
    # (input_dim=354 is presumably the number of distinct feature values —
    # TODO confirm against the data.)
    embed = layers.Embedding (input_dim = 354,
                              output_dim = 7,
                              embeddings_regularizer='l2')(conv_inputs)
    #----------- Convolution layers ----------------------
    # Pointwise (kernel size 1) convolution over the embedded features.
    embed = layers.Conv1D(12,1,activation = 'relu')(embed)
    embed = layers.Flatten()(embed)
    hidden = layers.Dropout(0.3)(embed)
    #----------- Residual blocks layers ----------------------
    # WeightNormalization: weight reparameterisation said to train faster /
    # more accurately than BatchNormalization in some settings.
    hidden = tfa.layers.WeightNormalization(
        layers.Dense(
            units=32,
            activation ='selu',
            kernel_initializer = "lecun_normal"))(hidden)
    # Concatenate earlier activations (dense/residual-style connections).
    output = layers.Dropout(0.3)(layers.Concatenate()([embed, hidden]))
    output = tfa.layers.WeightNormalization(
        layers.Dense(
            units = 32,
            activation='relu',
            kernel_initializer = "lecun_normal"))(output)
    output = layers.Dropout(0.4)(layers.Concatenate()([embed, hidden, output]))
    output = tfa.layers.WeightNormalization(
        layers.Dense(
            units = 32,
            activation = 'elu',
            kernel_initializer = "lecun_normal"))(output)
    #----------- Final layer -----------------------
    # Output: probability distribution over the 9 classes.
    conv_outputs = layers.Dense(
        units = 9,
        activation ='softmax',
        kernel_initializer ="lecun_normal")(output)
    #----------- Model instantiation ---------------
    model = Model(conv_inputs,conv_outputs)
    return model
モデルの訓練
## Training the Model
import warnings
warnings.filterwarnings('ignore')
# Out-of-fold and averaged test predictions for the conv/embedding NN
# (9 class columns each).
oof_NN_a = np.zeros((train.shape[0],9))
pred_NN_a = np.zeros((test.shape[0],9))
N_FOLDS = 20
SEED = 2021
EPOCH = 50
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)
# Stratify on the last column — assumes `target` is the final column.
for fold, (tr_idx, ts_idx) in enumerate(skf.split(train,train.iloc[:,-1])):
    print(f"\n ====== TRAINING FOLD {fold} =======\n")
    # iloc[:, 1:-1] assumes id is the first column and target the last —
    # TODO confirm against the CSV layout.
    X_train = train.iloc[:,1:-1].iloc[tr_idx]
    y_train = targets.iloc[tr_idx]
    X_test = train.iloc[:,1:-1].iloc[ts_idx]
    y_test = targets.iloc[ts_idx]
    K.clear_session()
    #================= NN CONV MODEL training =========
    print("\n-----Convolution model Training----\n")
    model_conv = conv_model()
    model_conv.compile(loss='categorical_crossentropy',
                       optimizer = keras.optimizers.Adam(learning_rate=2e-4),
                       metrics=custom_metric)
    model_conv.fit(X_train, y_train,
                   batch_size = 256, epochs = EPOCH,
                   validation_data=(X_test, y_test),
                   callbacks=[es, plateau],
                   verbose = 0)
    #============== Convolution Model prediction ==========
    pred_a = model_conv.predict(X_test)
    oof_NN_a[ts_idx] += pred_a
    score_NN_a = log_loss(y_test, pred_a)
    print(f"\nFOLD {fold} Score convolution model: {score_NN_a}\n")
    # Average the test predictions across folds.
    pred_NN_a += model_conv.predict(test.iloc[:,1:]) / N_FOLDS
# Overall out-of-fold score.
score_a = log_loss(targets, oof_NN_a)
print(f"\n=== FINAL SCORE CONVOLUTION MODEL : {score_a}===\n")
モデルの保存
## Saving the predicted values
# Keep the fold-averaged test predictions under a stable name.
pred_embedding = pred_NN_a
pred_embedding
提出の準備
## Preparing for Submission
# Fill the nine class-probability columns (Class_1 .. Class_9) from the
# averaged NN predictions.
submission = pd.read_csv("sample_submission.csv")
for class_idx in range(9):
    submission[f'Class_{class_idx + 1}'] = pred_embedding[:, class_idx]
Decision Forest fed by Neural Network
モデルの概要
ニューラルネットワークの埋め込み層を決定木の入力に使うモデルです。Kerasの公式にも同じような理論は置いてありました。詳しい解説はやめておきます。
Classification with Neural Decision Forests
ライブラリのインポート
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn import preprocessing
from sklearn.metrics import log_loss
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import activations,callbacks
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import categorical_crossentropy
from tensorflow.keras import initializers
from keras.models import Model
データの読み込みとカテゴリカル変数の離散化
# Load the data and one-hot encode the target (9 columns).
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
target = pd.get_dummies(train['target'])
kerasモデルを使った決定木1
class Decision_Tree(keras.Model):
    """A single soft (differentiable) decision tree.

    Each internal node outputs a sigmoid "go left" probability, and each
    leaf holds a trainable class-distribution logit (`pi`). The output is
    the leaf-reach probabilities times the softmaxed leaf distributions.
    """

    def __init__(self, depth, num_features, used_features_rate, num_classes):
        super(Decision_Tree, self).__init__()
        self.depth = depth
        self.num_leaves = 2 ** depth
        self.num_classes = num_classes
        # Randomly select a subset of the input features for this tree;
        # rows of an identity matrix act as a selection mask.
        num_used_features = int(num_features * used_features_rate)
        one_hot = np.eye(num_features)
        sampled_feature_indicies = np.random.choice(
            np.arange(num_features), num_used_features, replace=False
        )
        self.used_features_mask = one_hot[sampled_feature_indicies]
        # Trainable leaf logits: one class distribution per leaf.
        self.pi = tf.Variable(
            initial_value = tf.random_normal_initializer()(
                shape = [self.num_leaves, self.num_classes]
            ),
            dtype="float32",
            trainable=True,
        )
        # One dense layer produces the routing probability of every node.
        self.decision_fn = layers.Dense(
            units=self.num_leaves,
            activation="sigmoid",
            name="decision"
        )

    def call(self, features):
        batch_size = tf.shape(features)[0]
        # Project the input onto this tree's sampled feature subset.
        features = tf.matmul(
            features,
            self.used_features_mask,
            transpose_b=True
        )
        # decisions[..., 0] = P(left branch), decisions[..., 1] = P(right).
        decisions = tf.expand_dims(
            self.decision_fn(features),
            axis=2
        )
        decisions = layers.concatenate(
            [decisions, 1 - decisions],
            axis=2
        )
        # mu holds the probability of reaching each node at the current
        # level; walk the tree level by level, multiplying routing
        # probabilities along each path.
        mu = tf.ones([batch_size, 1, 1])
        begin_idx = 1
        end_idx = 2
        for level in range(self.depth):
            mu = tf.reshape(mu, [batch_size, -1, 1])
            mu = tf.tile(mu, (1, 1, 2))
            level_decisions = decisions[:, begin_idx:end_idx, :]
            mu = mu * level_decisions
            begin_idx = end_idx
            end_idx = begin_idx + 2 ** (level + 1)
        # Combine leaf-reach probabilities with softmaxed leaf distributions.
        mu = tf.reshape(mu, [batch_size, self.num_leaves])
        probabilities = keras.activations.softmax(self.pi)
        outputs = tf.matmul(mu, probabilities)
        return outputs
kerasモデルを使った決定木2
class Decision_Forest(keras.Model):
    """An ensemble of `num_trees` Decision_Tree models whose class
    probabilities are averaged."""

    def __init__(self, num_trees, depth, num_features, used_features_rate, num_classes):
        super(Decision_Forest, self).__init__()
        self.ensemble = []
        self.num_classes = num_classes
        for _ in range(num_trees):
            self.ensemble.append(
                Decision_Tree(depth,
                              num_features,
                              used_features_rate,
                              self.num_classes)
            )

    def call(self, inputs):
        batch_size = tf.shape(inputs)[0]
        # FIX: use the value stored on the instance; the original read the
        # module-level global `num_classes`, which only worked because the
        # surrounding script happened to define it.
        outputs = tf.zeros([batch_size, self.num_classes])
        # Average the class probabilities over all trees.
        for tree in self.ensemble:
            outputs += tree(inputs)
        outputs /= len(self.ensemble)
        return outputs
# Forest hyper-parameters.
num_trees = 20
depth = 5
used_features_rate = 0.5  # fraction of input features each tree may use
num_classes = 9
num_features = 20  # matches the width of the NN layer (API) fed into the forest
forest_model = Decision_Forest(
    num_trees,
    depth,
    num_features,
    used_features_rate,
    num_classes
)
metrics = [tf.keras.metrics.CategoricalCrossentropy()]
loss = tf.keras.losses.CategoricalCrossentropy()
# Early stopping / learning-rate reduction driven by the validation loss.
es = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', min_delta=0.0000001, patience=2, verbose=0,
    mode='min', baseline=None, restore_best_weights=True)
plateau = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss', factor=0.5, patience=2, verbose=0,
    mode='min', min_delta=0.0000001, cooldown=0, min_lr=10e-7)
モデルの訓練
# Train the plain NN (model_embedding) and the NN+decision-forest model
# (model_forest) with stratified K-fold CV; collect out-of-fold and
# fold-averaged test predictions for both.
N_FOLDS = 10
SEED = 2021
oof_embedding = np.zeros((train.shape[0],9))
pred_embedding = np.zeros((test.shape[0],9))
oof_forest = np.zeros((train.shape[0],9))
pred_forest = np.zeros((test.shape[0],9))
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)
# Stratify on the last column — assumes `target` is the final column.
for fold, (tr_idx, ts_idx) in enumerate(skf.split(train,train.iloc[:,-1])):
    print(f"\n===== TRAINING FOLD {fold} =====\n")
    # iloc[:, 1:-1] assumes id is the first column and target the last —
    # TODO confirm against the CSV layout.
    X_train = train.iloc[:,1:-1].iloc[tr_idx]
    y_train = target.iloc[tr_idx]
    X_test = train.iloc[:,1:-1].iloc[ts_idx]
    y_test = target.iloc[ts_idx]
    #----------NN Model definition ----------
    inp = layers.Input(shape = (75,))
    x = layers.Embedding(400, 8, input_length = 256)(inp)
    x = layers.Flatten()(x)
    # API is the layer whose activations feed the decision forest:
    API = layers.Dense(20,
                       activation='relu',
                       kernel_initializer='random_uniform',
                       bias_initializer=initializers.Constant(0.1))(x)
    x = layers.Dropout(0.3)(API)
    x = layers.Dense(50, activation='relu')(x)
    # NOTE(review): two consecutive Dropout layers — the original author was
    # unsure why; it effectively acts as a single stronger dropout.
    x = layers.Dropout(0.3)(x)
    x = layers.Dropout(0.3)(x)
    x = layers.Dense(20, activation = 'relu')(x)
    output = layers.Dense(9, activation = 'softmax')(x)
    #----------Models instantiation ---------
    model_embedding = Model(inp,output)
    model_embedding_without_head = tf.keras.models.Model(inputs=model_embedding.inputs,outputs=API)
    model_forest = Model(inp,forest_model(API))
    #----------NN Model training ------------
    model_embedding.compile(tf.keras.optimizers.Adam(learning_rate=0.0001),
                            loss = loss ,
                            metrics = metrics)
    model_embedding.fit(X_train,y_train,
                        validation_data=(X_test,y_test),
                        epochs=50,
                        verbose=0,
                        batch_size = 256,
                        callbacks=[es,plateau])
    #----------NN Model prediction------------
    oof_embedding[ts_idx] = model_embedding.predict(X_test)
    score_embedding = log_loss(y_test, oof_embedding[ts_idx])
    print(f"\nFOLD {fold} Score for NN model {score_embedding}\n")
    pred_embedding += model_embedding.predict(test.iloc[:,1:]) / N_FOLDS
    #----------Model forest training -----------
    model_forest.compile(tf.keras.optimizers.Adam(learning_rate=0.001),
                         loss = loss,
                         metrics = metrics)
    model_forest.fit(X_train,y_train,
                     validation_data = (X_test,y_test),
                     batch_size = 256,
                     epochs = 50,
                     verbose = 0,
                     callbacks = [es,plateau])
    #----------Model forest prediction------------
    oof_forest[ts_idx] = model_forest.predict(X_test)
    score_forest = log_loss(y_test, oof_forest[ts_idx])
    print(f"\nFOLD {fold} Score for decision forest : {score_forest}\n")
    pred_forest += model_forest.predict(test.iloc[:,1:]) / N_FOLDS
# Overall out-of-fold scores for both models.
score_embedding = log_loss(target, oof_embedding)
print(f"\n=== FINAL SCORE FOR NN MODEL : {score_embedding}===\n")
score_forest = log_loss(target, oof_forest)
print(f"\n=== FINAL SCORE FOR DECISION FOREST : {score_forest}===\n")
感想と近況報告
・ClassをLabelEncoder()で離散変数に分けてましたけど、one-hotエンコーディングで0か1に分けた方がモデルの出力が連続数に依存しないのでより精度が高くなりそう。
・基本的にKaggleではOptunaを用いて精度を高める手法が主流っぽくて、その中でもCatBoostは親和性が高いとか
・最近統計的因果推論の勉強しているが、k-foldでただ分割するより層化k分割法を用いた方がより理論的なモデルの構築ができると思った。(データ間の共変量を意識しているって意味で)
・optunaで最適化されたcatboostが一番評価性能いいと思ったが、実際は単純なニューラルネットワークが最もよい精度をだしていてすごく不思議だった。
・モチベーションが薄いのもあるが、実際に複雑なモデルでKaggleをやろうとは現状考えてないので、最後二つはただコードを置くだけになってしまった。(次回以降そのことも含め記事の構成を考えておきます。)
※これ以降日記なのでスキップしていいです。
結局月曜日位からoptunaでモデル構築して以降、Kaggleに対するモチベがやや低下して執筆終えたのが金曜とかなり紆余曲折してしまった。他学部のゼミとかインターン考えると、どうしてもKaggleで実績作らないといけないなぁとか思いながら手を加えれてないのがつらいです。明日はKaggleやるって決めてるので頑張りたい所存...LAMPとか画像検出の研究もしたいが...