More than 1 year has passed since last update.

Support Vector回帰の高速パラメータチューニングをscikit-learn APIで実装してみる

Posted at 2023-11-11

サポートベクトル回帰はデータ数がさほど多くないときでもそれなりの汎化性能を持つことがあり、Material infomatics界隈ではよく使われる手法の1つです。
よく使われますが、以下のようなトレードオフ関係の欠点を持ち、グリッドサーチによるハイパーパラメータ調整と絡めると今ひとつ使いにくいという側面もあります。

ハイパーパラメータの対して結果の変動がシビア
ハイパーパラメータの組み合わせによっては計算に時間がかかる

この問題に対して、逐次的にハイパーパラメータを決定することで計算時間を短縮できる方法がありますが、この方法はGridSearchCVで実装できず、scikit-learn APIに組み込めないという問題があります。
そこで、この方法をscikit-learn APIに組み込むために、インターフェースを作ってみます。
scikit-learn API準拠の自作回帰器は、以下の条件を満たす必要があります。

BaseEstimatorとRegressorMixinを継承する
fitとpredictメソッドをつける

その他のルールは本家のドキュメントに書いてありますが、上記2つで最低限動くようになるはずです。

実装

早速実装してみます。

import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn.base import BaseEstimator, RegressorMixin


class FastTuneSupportVectorRegressor(BaseEstimator, RegressorMixin):
    def __init__(self):
        self.X = None
        self.y = None
        self.cv = None
        self.C = None
        self.epsilon = None
        self.gamma = None
        self.scoring = None
        self.best_estimator_ = None
        self.best_params_ = None

    def fit(self,
            X,
            y,
            cv,
            C=2 ** np.arange(-5, 11, dtype=float),
            epsilon=2 ** np.arange(-10, 1, dtype=float),
            gamma=2 ** np.arange(-20, 11, dtype=float),
            scoring="neg_mean_squared_error",
            ):
        # set variables
        if isinstance(X, pd.DataFrame):
            self.X = X.values
            self.feature_names_in_ = X.columns
        else:
            self.X = X
        if isinstance(y, (pd.Series, pd.DataFrame)):
            self.y = y.values
        else:
            self.y = y
        self.cv = cv
        self.C = C
        self.epsilon = epsilon
        self.gamma = gamma
        self.scoring = scoring
        self.best_estimator_ = None
        self.best_params_ = dict()

        # initial optimize gamma
        self.best_params_["gamma"] = self._get_gamma_at_maximum_gram_matrix_variance()

        # optimizate epsilon
        gscv = GridSearchCV(
            SVR(C=3, gamma=self.best_params_["gamma"]),
            {"epsilon": self.epsilon},
            cv=self.cv,
            scoring=self.scoring,
            n_jobs=-1
        )
        gscv.fit(self.X, self.y)
        self.best_params_["epsilon"] = gscv.best_params_["epsilon"]

        # optimize C
        gscv = GridSearchCV(
            SVR(gamma=self.best_params_["gamma"], epsilon=self.best_params_["epsilon"]),
            {"C": self.C},
            cv=self.cv,
            scoring=self.scoring,
            n_jobs=-1
        )
        gscv.fit(self.X, self.y)
        self.best_params_["C"] = gscv.best_params_["C"]

        # optimize gamma with cv
        gscv = GridSearchCV(
            SVR(gamma=self.best_params_["epsilon"], C=self.best_params_["C"]),
            {"gamma": self.gamma},
            cv=self.cv,
            scoring=self.scoring,
            n_jobs=-1
        )
        gscv.fit(self.X, self.y)
        self.best_params_["gamma"] = gscv.best_params_["gamma"]

        # learning with best params
        self.best_estimator_ = SVR(**self.best_params_)
        self.best_estimator_.fit(self.X, self.y)

        return self

    def predict(self, X):
        return self.best_estimator_.predict(X)

    def _get_gamma_at_maximum_gram_matrix_variance(self):
        variance_of_gram_matrix = []
        for gamma in self.gamma:
            gram_matrix = np.exp(-gamma * ((self.X[:, np.newaxis] - self.X) ** 2).sum(axis=2))
            variance_of_gram_matrix.append(gram_matrix.var(ddof=1))
        best_gamma = self.gamma[np.where(variance_of_gram_matrix == np.max(variance_of_gram_matrix))[0][0]]

        return best_gamma

何をしているかについては前出のリンクを踏んで確認してください。
さて、これでscikit-learn APIとなったので、pipelineなどに組み込んでも動くはずです。
過去に執筆した記事でも使用したlogSのデータセットを使用して、動作を確認してみます。

from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, KFold
from matplotlib import pyplot as plt
import seaborn as sns
plt.rcParams["axes.grid"] = True

url = "https://datachemeng.com/wp-content/uploads/2017/07/logSdataset1290.csv"
df_in = pd.read_csv(url, index_col=0)

x = df_in.iloc[:, 1:]
y = df_in.iloc[:, 0]

keep_index = list(map(lambda x: not x, x.duplicated()))
x = x.loc[keep_index, :]
y = y[keep_index]

x = x.drop("Ipc", axis=1)
x = x.loc[:, x.columns[x.var() != 0]]

train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.3, random_state=11)

params = {
    "fasttunesupportvectorregressor__cv": KFold(n_splits=5, shuffle=True, random_state=42),
    "fasttunesupportvectorregressor__C": 2 ** np.arange(-5, 11, dtype=float),
    "fasttunesupportvectorregressor__epsilon": 2 ** np.arange(-10, 1, dtype=float),
    "fasttunesupportvectorregressor__gamma": 2 ** np.arange(-20, 11, dtype=float),
    }
pipe = make_pipeline(
    VarianceThreshold(),
    StandardScaler(),
    FastTuneSupportVectorRegressor()
    )
pipe.fit(train_x, train_y, **params)
pred_train = pipe.predict(train_x)
pred_test = pipe.predict(test_x)

def calc_metrics(observed, predicted):
    print("R2:      ", r2_score(observed, predicted))
    print("MAE:     ", mean_absolute_error(observed, predicted))
    print("RMSE:    ", mean_squared_error(observed, predicted, squared=False))

print("----------Training score----------")
calc_metrics(train_y, pred_train)
print("----------Test score----------")
calc_metrics(test_y, pred_test)

ここまで問題なく実行が確認できました。
TrainingとTestに対する評価指標も無事に計算できました。

----------Training score----------
R2:       0.9504631109016163
MAE:      0.29089283790362347
RMSE:     0.4538156615531091
----------Test score----------
R2:       0.9174454139418793
MAE:      0.4199984618853553
RMSE:     0.5801650346550581

最後に実測値と予測値の関係性を見てみましょう。

    def yy_plot(observed, predicted, fold=None):
        fig, ax = plt.subplots()

        plot_args = {
            "x": observed,
            "y": predicted,
            "ax": ax
        }
        if fold is not None:
            fold = fold.astype("category")
            plot_args["hue"] = fold
            plot_args["palette"] = "deep"

        sns.scatterplot(**plot_args)
        ax.set_xlabel("Observed")
        ax.set_ylabel("Predicted")

        axis_min = min(*ax.set_xlim(), *ax.set_ylim())
        axis_max = max(*ax.set_xlim(), *ax.set_ylim())
        ax.set_xlim(axis_min, axis_max)
        ax.set_ylim(axis_min, axis_max)
        ax.plot([axis_min, axis_max], [axis_min, axis_max], "k--", alpha=0.5)
        if fold is not None:
            plt.legend(loc="upper left", bbox_to_anchor=(1, 1), title=fold.name)
        ax.set_aspect("equal", adjustable="box")

        return fig, ax

    fig, ax = yy_plot(train_y, pred_train)
    ax.set_title("Training")
    plt.show()

    fig, ax = yy_plot(test_y, pred_test)
    ax.set_title("Test")
    plt.show()

問題なく予測できていることがわかります。

最後に

これでサポートベクトル回帰の高速ハイパーパラメータチューニングもscikit-learnの便利な関数群に組み込むことができるようになりました。
過去に執筆した記事で実装したダブルクロスバリデーションにも組み込むことができます。
1点注意点としては、グラム行列がメモリを大量に使うので、データ数が10000を超えたあたりから、RAM 16GBの私のパソコンではオーバーフローを起こしてしまいました。

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up