Last updated at Posted at 2019-09-26


ハイパーパラメータの最適化は機械学習をするうえで避けては通れない道ですが、なかなか時間を取られます。ライブラリとしてはscikit-learnのgrid-searchなどをはじめとしていろいろありますが、今回はOptunaというPreferred Networksのライブラリを使用してみます。
Preferred Networksといえば深層学習フレームワークのChainerが有名ですが、ベイズ最適化を活用したOptunaというハイパーパラメータ最適化ライブラリも作成されています。Optunaを使ってKaggleで準優勝したとか。




  1. Classifierクラスを作成します(既存のClassifierを使う分にはSkip可。ここを工夫すれば非常に自由度が高い最適化を行えます。例えば、前処理部分の最適化やどの学習器を使えばいいのかなどがわかります。)
  2. どういった最適化を行うか、といったObjectiveクラスを作成します。
  3. 訓練と評価をします。


0. インストール

pip install optuna

1. Classifierクラスの作成


method 説明
_fit_and_predict_core fit及びpredictのときに呼ばれる予測のコア部分
fit fitさせます(雑)
predict [データ数]行 × [次元数]列の特徴量行列 X を引数にして、データ数分の予測ラベルを返す
predict_proba [データ数]行 × [次元数]列の特徴量行列 X を引数にして、各データがそれぞれのクラスに所属する確率を返す



class NoScaler(BaseEstimator, TransformerMixin):
    """Identity transformer: a drop-in "no scaling" option.

    It exposes the same ``fit`` / ``transform`` methods as the scalers used
    by ``Classifier`` so that "no preprocessing" can be selected like any
    other scaler.
    """

    def __init__(self):
        # No state to set up; the body was missing, which made the class
        # definition a syntax error.
        pass

    def fit(self, x, y=None):
        """Do nothing and return ``self`` (nothing is learned from ``x``)."""
        return self

    def transform(self, x, y=None):
        """Return ``x`` unchanged."""
        return x

class Classifier:
    """A scaler plus a scikit-learn classifier, both selected by a params dict.

    Keys read from ``params``:
      * ``'standardize'``: ``'StandardScaler'``, ``'MinMaxScaler'`` or
        ``'NoScaler'``.
      * ``'classifier_name'``: ``'RandomForest'`` or ``'SVC'``.
      * ``'classifier_params'``: keyword arguments forwarded to the chosen
        classifier's constructor.
    """

    def __init__(self, params):
        """Build the scaler and the classifier described by ``params``.

        Raises:
            ValueError: if ``'standardize'`` or ``'classifier_name'`` holds a
                value this class does not know. Previously this left the
                attribute unset and failed later with ``AttributeError``.
        """
        self.params = params

        scaler_name = params['standardize']
        if scaler_name == 'StandardScaler':
            self.standardizer = StandardScaler()
        elif scaler_name == 'MinMaxScaler':
            self.standardizer = MinMaxScaler()
        elif scaler_name == 'NoScaler':
            self.standardizer = NoScaler()
        else:
            raise ValueError('unsupported standardize option: %r' % (scaler_name,))

        classifier_name = params['classifier_name']
        if classifier_name == 'RandomForest':
            self.classifier = RandomForestClassifier(**params['classifier_params'])
        elif classifier_name == 'SVC':
            self.classifier = SVC(**params['classifier_params'])
        else:
            raise ValueError('unsupported classifier_name: %r' % (classifier_name,))

    def _fit_and_predict_core(self, x, y=None, fitting=False, proba=False):
        """Shared pipeline behind ``fit``, ``predict`` and ``predict_proba``.

        Args:
            x: feature matrix passed to the scaler and then the classifier.
            y: training labels; when ``None`` a prediction is returned.
            fitting: when true, fit the scaler and the classifier on ``x``.
            proba: when predicting, call ``predict_proba`` instead of
                ``predict``.

        Returns:
            The classifier's prediction when ``y`` is ``None``, else ``None``.
        """
        if fitting:
            # Scaling statistics are learned from the training data only.
            self.standardizer.fit(x)
        # The transformed array must be kept: transform() does not modify
        # ``x`` in place for the scalers used here.
        x = self.standardizer.transform(x)

        if fitting:
            self.classifier.fit(x, y)
        if y is None:
            if proba:
                return self.classifier.predict_proba(x)
            return self.classifier.predict(x)
        return None

    def fit(self, x, y):
        """Fit the scaler and the classifier on ``(x, y)`` and return ``self``."""
        self._fit_and_predict_core(x, y, fitting=True)
        return self

    def predict(self, x):
        """Return the classifier's predicted labels for the scaled ``x``."""
        pred_y = self._fit_and_predict_core(x)
        return pred_y

    def predict_proba(self, x):
        """Return the classifier's ``predict_proba`` output for the scaled ``x``."""
        pred_y = self._fit_and_predict_core(x, proba=True)
        return pred_y

2. Objectiveクラスを作成する

paramsは辞書型として、前処理、学習器をtrial.suggest_categoricalメソッドを使って提案し、それぞれkey=standardize, classifier_nameとします。また、学習器のハイパーパラメータをkey=classifier_paramsに格納します。


class Objective:
    """Optuna objective: cross-validated macro F1 of a sampled ``Classifier``.

    An instance is callable with an Optuna trial and also remembers the best
    score seen so far together with the full params dict that produced it.
    """

    def __init__(self, x, y, label_index):
        """Store the data set.

        Args:
            x: features; indexed with ``.iloc`` (a pandas object is expected).
            y: labels; indexed with ``.iloc`` (a pandas object is expected).
            label_index: lookup used to map integer predictions back to
                labels (indexed with an integer array).
        """
        self.x = x
        self.y = y
        self.label_index = label_index
        self.best_score = 0
        self.best_params = None

    def __call__(self, trial):
        """Run one trial and return its macro F1 over 5 stratified folds."""
        x = self.x
        y = self.y
        params = self.generate_params(trial, x)
        classifier = Classifier(params)
        skf = StratifiedKFold(n_splits=5, random_state=2019, shuffle=True)
        ts = []
        ys = []
        for train, test in skf.split(x, y):
            train_x = x.iloc[train].values
            train_y = y.iloc[train].values
            test_x = x.iloc[test].values
            test_y = y.iloc[test].values
            classifier.fit(train_x, train_y)
            # ``np.int`` was removed from NumPy; the builtin ``int`` is the
            # documented replacement.
            pred_y = classifier.predict(test_x).reshape(-1).astype(int)
            # Collect every fold's truth and prediction; without this the
            # concatenation below has nothing to join.
            ts.append(test_y)
            ys.append(pred_y)
        y_true = self.label_index[np.concatenate(ts)]
        y_pred = self.label_index[np.concatenate(ys)]
        score = metrics.f1_score(y_true, y_pred, average='macro')
        if self.best_score < score:
            self.best_score = score
            self.best_params = params
        return score

    def generate_params(self, trial, x):
        """Sample a ``Classifier`` params dict from ``trial``.

        Args:
            trial: object providing ``suggest_categorical`` and
                ``suggest_loguniform``.
            x: unused; kept for interface compatibility.

        Returns:
            dict with keys ``'standardize'``, ``'classifier_name'`` and
            ``'classifier_params'``.

        Raises:
            RuntimeError: if the sampled classifier name is not supported.
        """
        params = {}
        params['standardize'] = trial.suggest_categorical(
            'standardize', ['NoScaler', 'StandardScaler', 'MinMaxScaler'])
        params['classifier_name'] = trial.suggest_categorical(
            'classifier_name', ['RandomForest', 'SVC'])
        classifier_params = {}
        if params['classifier_name'] == 'SVC':
            classifier_params['kernel'] = trial.suggest_categorical(
                'svc_kernel', ['linear', 'rbf'])
            classifier_params['C'] = trial.suggest_loguniform('svc_c', 1e-10, 1e10)
            if classifier_params['kernel'] == 'rbf':
                classifier_params['gamma'] = trial.suggest_categorical(
                    'svc_gamma', ['auto', 'scale'])
            else:
                # Fixed value for the linear kernel; previously this line
                # also ran for 'rbf' and overwrote the suggestion.
                classifier_params['gamma'] = 'auto'
        elif params['classifier_name'] == 'RandomForest':
            classifier_params['n_estimators'] = trial.suggest_categorical(
                'rf_n_estimators', [5, 10, 20, 30, 50, 100])
            # NOTE(review): 'auto' is rejected by recent scikit-learn
            # releases -- confirm against the installed version.
            classifier_params['max_features'] = trial.suggest_categorical(
                'rf_max_features', ['auto', 0.2, 0.4, 0.6, 0.8])
            # Sampled on a log scale as a float, then truncated to an int.
            classifier_params['max_depth'] = int(
                trial.suggest_loguniform('rf_max_depth', 2, 32))
            classifier_params['n_jobs'] = -1
        else:
            raise RuntimeError('unspport classifier', params['classifier_name'])
        params['classifier_params'] = classifier_params
        return params

3. 訓練と評価


import numpy as np
import pandas as pd
import optuna
from sklearn import datasets, metrics
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.base import BaseEstimator, TransformerMixin

# Load the wine data set and wrap features/labels in pandas objects
# (Objective indexes them with ``.iloc``).
wine = datasets.load_wine()
wine_df = pd.DataFrame(wine.data, columns=wine.feature_names)
wine_label = pd.Series(wine.target)
# factorize returns (integer codes, unique labels); only the unique labels
# (label_index) are used below.
label, label_index = pd.factorize(wine_label)



# Run 30 trials, maximizing the value returned by the objective.
objective = Objective(x=wine_df, y=wine_label, label_index=label_index)
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30)


[I 2019-09-25 11:27:31,892] Finished trial#0 resulted in value: 0.9624935249935249. Current best value is 0.9624935249935249 with parameters: {'standardize': 'StandardScaler', 'classifier_name': 'SVC', 'svc_kernel': 'linear', 'svc_c': 0.025216155955058838, 'max_iter': 10000000}.
[I 2019-09-25 11:27:32,482] Finished trial#1 resulted in value: 0.9388331603827972. Current best value is 0.9624935249935249 with parameters: {'standardize': 'StandardScaler', 'classifier_name': 'SVC', 'svc_kernel': 'linear', 'svc_c': 0.025216155955058838, 'max_iter': 10000000}.
[I 2019-09-25 11:27:32,578] Finished trial#2 resulted in value: 0.37333515781791643. Current best value is 0.9624935249935249 with parameters: {'standardize': 'StandardScaler', 'classifier_name': 'SVC', 'svc_kernel': 'linear', 'svc_c': 0.025216155955058838, 'max_iter': 10000000}.
[I 2019-09-25 11:27:33,823] Finished trial#3 resulted in value: 0.9780237518736309. Current best value is 0.9780237518736309 with parameters: {'standardize': 'MinMaxScaler', 'classifier_name': 'RandomForest', 'rf_n_estimators': 30, 'rf_max_features': 'auto', 'rf_max_depth': 7.949794224946728}.
[I 2019-09-25 11:27:35,034] Finished trial#4 resulted in value: 0.9457183021463584. Current best value is 0.9780237518736309 with parameters: {'standardize': 'MinMaxScaler', 'classifier_name': 'RandomForest', 'rf_n_estimators': 30, 'rf_max_features': 'auto', 'rf_max_depth': 7.949794224946728}.
[I 2019-09-25 11:27:36,241] Finished trial#5 resulted in value: 0.9547126096815538. Current best value is 0.9780237518736309 with parameters: {'standardize': 'MinMaxScaler', 'classifier_name': 'RandomForest', 'rf_n_estimators': 30, 'rf_max_features': 'auto', 'rf_max_depth': 7.949794224946728}.
[I 2019-09-25 11:27:37,607] Finished trial#6 resulted in value: 0.9832019138590686. Current best value is 0.9832019138590686 with parameters: {'standardize': 'MinMaxScaler', 'classifier_name': 'RandomForest', 'rf_n_estimators': 100, 'rf_max_features': 0.2, 'rf_max_depth': 24.44778629409185}.
[I 2019-09-25 11:27:38,818] Finished trial#7 resulted in value: 0.9663462486443852. Current best value is 0.9832019138590686 with parameters: {'standardize': 'MinMaxScaler', 'classifier_name': 'RandomForest', 'rf_n_estimators': 100, 'rf_max_features': 0.2, 'rf_max_depth': 24.44778629409185}.
[I 2019-09-25 11:27:40,047] Finished trial#8 resulted in value: 0.9228335043843066. Current best value is 0.9832019138590686 with parameters: {'standardize': 'MinMaxScaler', 'classifier_name': 'RandomForest', 'rf_n_estimators': 100, 'rf_max_features': 0.2, 'rf_max_depth': 24.44778629409185}.
/usr/local/lib/python3.7/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
[I 2019-09-25 11:27:40,153] Finished trial#9 resulted in value: 0.19009370816599733. Current best value is 0.9832019138590686 with parameters: {'standardize': 'MinMaxScaler', 'classifier_name': 'RandomForest', 'rf_n_estimators': 100, 'rf_max_features': 0.2, 'rf_max_depth': 24.44778629409185}.
[I 2019-09-25 11:27:41,437] Finished trial#10 resulted in value: 0.9890005486730775. Current best value is 0.9890005486730775 with parameters: {'standardize': 'NoScaler', 'classifier_name': 'RandomForest', 'rf_n_estimators': 50, 'rf_max_features': 0.2, 'rf_max_depth': 8.269534929659784}.
[I 2019-09-25 11:27:42,673] Finished trial#11 resulted in value: 0.967066267066267. Current best value is 0.9890005486730775 with parameters: {'standardize': 'NoScaler', 'classifier_name': 'RandomForest', 'rf_n_estimators': 50, 'rf_max_features': 0.2, 'rf_max_depth': 8.269534929659784}.
[I 2019-09-25 11:27:43,901] Finished trial#12 resulted in value: 0.9774360337020184. Current best value is 0.9890005486730775 with parameters: {'standardize': 'NoScaler', 'classifier_name': 'RandomForest', 'rf_n_estimators': 50, 'rf_max_features': 0.2, 'rf_max_depth': 8.269534929659784}.
[I 2019-09-25 11:27:45,155] Finished trial#13 resulted in value: 0.9664591260232905. Current best value is 0.9890005486730775 with parameters: {'standardize': 'NoScaler', 'classifier_name': 'RandomForest', 'rf_n_estimators': 50, 'rf_max_features': 0.2, 'rf_max_depth': 8.269534929659784}.
[I 2019-09-25 11:27:46,446] Finished trial#14 resulted in value: 0.9722206731660932. Current best value is 0.9890005486730775 with parameters: {'standardize': 'NoScaler', 'classifier_name': 'RandomForest', 'rf_n_estimators': 50, 'rf_max_features': 0.2, 'rf_max_depth': 8.269534929659784}.
[I 2019-09-25 11:27:47,819] Finished trial#15 resulted in value: 0.972829939395289. Current best value is 0.9890005486730775 with parameters: {'standardize': 'NoScaler', 'classifier_name': 'RandomForest', 'rf_n_estimators': 50, 'rf_max_features': 0.2, 'rf_max_depth': 8.269534929659784}.
[I 2019-09-25 11:27:49,108] Finished trial#16 resulted in value: 0.9832019138590686. Current best value is 0.9890005486730775 with parameters: {'standardize': 'NoScaler', 'classifier_name': 'RandomForest', 'rf_n_estimators': 50, 'rf_max_features': 0.2, 'rf_max_depth': 8.269534929659784}.
[I 2019-09-25 11:27:50,493] Finished trial#17 resulted in value: 0.9832019138590686. Current best value is 0.9890005486730775 with parameters: {'standardize': 'NoScaler', 'classifier_name': 'RandomForest', 'rf_n_estimators': 50, 'rf_max_features': 0.2, 'rf_max_depth': 8.269534929659784}.
[I 2019-09-25 11:27:51,778] Finished trial#18 resulted in value: 0.9832019138590686. Current best value is 0.9890005486730775 with parameters: {'standardize': 'NoScaler', 'classifier_name': 'RandomForest', 'rf_n_estimators': 50, 'rf_max_features': 0.2, 'rf_max_depth': 8.269534929659784}.
[I 2019-09-25 11:27:53,159] Finished trial#19 resulted in value: 0.9890005486730775. Current best value is 0.9890005486730775 with parameters: {'standardize': 'NoScaler', 'classifier_name': 'RandomForest', 'rf_n_estimators': 50, 'rf_max_features': 0.2, 'rf_max_depth': 8.269534929659784}.
[I 2019-09-25 11:27:53,766] Finished trial#20 resulted in value: 0.9388331603827972. Current best value is 0.9890005486730775 with parameters: {'standardize': 'NoScaler', 'classifier_name': 'RandomForest', 'rf_n_estimators': 50, 'rf_max_features': 0.2, 'rf_max_depth': 8.269534929659784}.
[I 2019-09-25 11:27:55,011] Finished trial#21 resulted in value: 0.9890005486730775. Current best value is 0.9890005486730775 with parameters: {'standardize': 'NoScaler', 'classifier_name': 'RandomForest', 'rf_n_estimators': 50, 'rf_max_features': 0.2, 'rf_max_depth': 8.269534929659784}.
[I 2019-09-25 11:27:56,243] Finished trial#22 resulted in value: 0.966985815823735. Current best value is 0.9890005486730775 with parameters: {'standardize': 'NoScaler', 'classifier_name': 'RandomForest', 'rf_n_estimators': 50, 'rf_max_features': 0.2, 'rf_max_depth': 8.269534929659784}.
[I 2019-09-25 11:27:57,447] Finished trial#23 resulted in value: 0.9444828778969215. Current best value is 0.9890005486730775 with parameters: {'standardize': 'NoScaler', 'classifier_name': 'RandomForest', 'rf_n_estimators': 50, 'rf_max_features': 0.2, 'rf_max_depth': 8.269534929659784}.
[I 2019-09-25 11:27:58,670] Finished trial#24 resulted in value: 0.961842823264262. Current best value is 0.9890005486730775 with parameters: {'standardize': 'NoScaler', 'classifier_name': 'RandomForest', 'rf_n_estimators': 50, 'rf_max_features': 0.2, 'rf_max_depth': 8.269534929659784}.
[I 2019-09-25 11:28:00,039] Finished trial#25 resulted in value: 0.9722588647110046. Current best value is 0.9890005486730775 with parameters: {'standardize': 'NoScaler', 'classifier_name': 'RandomForest', 'rf_n_estimators': 50, 'rf_max_features': 0.2, 'rf_max_depth': 8.269534929659784}.
[I 2019-09-25 11:28:01,333] Finished trial#26 resulted in value: 0.9780237518736309. Current best value is 0.9890005486730775 with parameters: {'standardize': 'NoScaler', 'classifier_name': 'RandomForest', 'rf_n_estimators': 50, 'rf_max_features': 0.2, 'rf_max_depth': 8.269534929659784}.
[I 2019-09-25 11:28:02,564] Finished trial#27 resulted in value: 0.9670016460192324. Current best value is 0.9890005486730775 with parameters: {'standardize': 'NoScaler', 'classifier_name': 'RandomForest', 'rf_n_estimators': 50, 'rf_max_features': 0.2, 'rf_max_depth': 8.269534929659784}.
[I 2019-09-25 11:28:03,786] Finished trial#28 resulted in value: 0.9456877801649823. Current best value is 0.9890005486730775 with parameters: {'standardize': 'NoScaler', 'classifier_name': 'RandomForest', 'rf_n_estimators': 50, 'rf_max_features': 0.2, 'rf_max_depth': 8.269534929659784}.
[I 2019-09-25 11:28:03,891] Finished trial#29 resulted in value: 0.7024265061979896. Current best value is 0.9890005486730775 with parameters: {'standardize': 'NoScaler', 'classifier_name': 'RandomForest', 'rf_n_estimators': 50, 'rf_max_features': 0.2, 'rf_max_depth': 8.269534929659784}.



# Re-evaluate the best parameters with a fresh 5-fold split (different seed
# from the one used inside Objective).
skf = StratifiedKFold(n_splits=5, random_state=44, shuffle=True)
x = wine_df
y = wine_label

best_score = objective.best_score
print('best score:', best_score)
best_params = objective.best_params
print('best params:', best_params)

ts = []
ys = []

for train, test in skf.split(x, y):
    train_y = y.iloc[train].values
    test_y = y.iloc[test].values
    train_x = x.iloc[train]
    test_x = x.iloc[test]
    model = Classifier(best_params)
    model.fit(train_x, train_y)
    # ``np.int`` was removed from NumPy; the builtin ``int`` replaces it.
    predict = model.predict(test_x).reshape(-1).astype(int)
    # Collect each fold's truth and prediction; without this the
    # concatenation below fails on empty lists.
    ts.append(test_y)
    ys.append(predict)
y_true = label_index[np.concatenate(ts)]
y_pred = label_index[np.concatenate(ys)]
print(metrics.confusion_matrix(y_true=y_true, y_pred=y_pred))
print(metrics.classification_report(y_true=y_true, y_pred=y_pred))
FrozenTrial(number=10, state=<TrialState.COMPLETE: 1>, value=0.9890005486730775, datetime_start=datetime.datetime(2019, 9, 25, 11, 27, 40, 155549), datetime_complete=datetime.datetime(2019, 9, 25, 11, 27, 41, 434767), params={'standardize': 'NoScaler', 'classifier_name': 'RandomForest', 'rf_n_estimators': 50, 'rf_max_features': 0.2, 'rf_max_depth': 8.269534929659784}, distributions={'standardize': CategoricalDistribution(choices=('NoScaler', 'StandardScaler', 'MinMaxScaler')), 'classifier_name': CategoricalDistribution(choices=('RandomForest', 'SVC')), 'rf_n_estimators': CategoricalDistribution(choices=(5, 10, 20, 30, 50, 100)), 'rf_max_features': CategoricalDistribution(choices=('auto', 0.2, 0.4, 0.6, 0.8)), 'rf_max_depth': LogUniformDistribution(low=2, high=32)}, user_attrs={}, system_attrs={'_number': 10}, intermediate_values={}, params_in_internal_repr={'standardize': 0, 'classifier_name': 0, 'rf_n_estimators': 4, 'rf_max_features': 1, 'rf_max_depth': 8.269534929659784}, trial_id=10)
best score: 0.9890005486730775
best params: {'standardize': 'NoScaler', 'classifier_name': 'RandomForest', 'classifier_params': {'n_estimators': 50, 'max_features': 0.2, 'max_depth': 8, 'n_jobs': -1}}
[[59  0  0]
 [ 1 68  2]
 [ 0  0 48]]
              precision    recall  f1-score   support

           0       0.98      1.00      0.99        59
           1       1.00      0.96      0.98        71
           2       0.96      1.00      0.98        48

    accuracy                           0.98       178
   macro avg       0.98      0.99      0.98       178
weighted avg       0.98      0.98      0.98       178




