Introduction
Whenever I work on a classification task, I end up looking up each classification model and the parameters to tune with grid search all over again, so I decided to summarize them here.
This is a code-focused summary, so detailed explanations of the individual models are omitted.
Importing packages
Import the libraries used in this article.
import numpy as np
import pandas as pd
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import VotingClassifier
Preparing the data
As an example, we use the breast cancer dataset bundled with scikit-learn.
# Load the breast cancer dataset
seed = 0
cancer_data = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(cancer_data.data, cancer_data.target, random_state=seed)
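As a quick sanity check before training, you can confirm the split sizes and the class balance (np.bincount assumes the 0/1 integer labels this dataset uses); passing stratify=cancer_data.target to train_test_split would additionally preserve the class ratio in both splits:
# Check split shapes and class balance
print(X_train.shape, X_test.shape)            # (426, 30) (143, 30)
print(np.bincount(y_train), np.bincount(y_test))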
Training the models
Logistic regression
params = {
    "C": [10**i for i in range(-4, 4)],
    "penalty": ["l1", "l2"],
    "random_state": [seed]
}
# the liblinear solver supports both the l1 and l2 penalties
gs_clf = GridSearchCV(LogisticRegression(solver="liblinear"), param_grid=params, cv=5)
gs_clf.fit(X_train, y_train)
print("tuned hpyerparameters :(best parameters) ", gs_clf.best_params_)
print("accuracy :", gs_clf.best_score_)
Output
tuned hyperparameters :(best parameters) {'C': 100, 'penalty': 'l2', 'random_state': 0}
accuracy : 0.9576744186046511
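Note that best_score_ is the mean cross-validated accuracy on the training data. For an unbiased estimate you can evaluate the refitted best model, exposed as best_estimator_, on the held-out test set; the same pattern works for every grid search below:
# Evaluate the refitted best model on the held-out test set
best_model = gs_clf.best_estimator_
print("test accuracy :", best_model.score(X_test, y_test))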
k-nearest neighbors
params = {
'n_neighbors': np.arange(1, 20),
'weights': ['uniform', 'distance'],
'p': [1, 2]
}
gs_clf = GridSearchCV(KNeighborsClassifier(), params, cv=5)
gs_clf.fit(X_train, y_train)
print("tuned hpyerparameters :(best parameters) ", gs_clf.best_params_)
print("accuracy :", gs_clf.best_score_)
Output
tuned hyperparameters :(best parameters) {'n_neighbors': 13, 'p': 1, 'weights': 'distance'}
accuracy : 0.94593707250342
Decision tree
params = {
"criterion": ["gini", "entropy"],
"splitter": ["best", "random"],
"max_depth": [i for i in range(1, 11)],
"min_samples_split": [i for i in range(2, 11)],
"min_samples_leaf": [i for i in range(1, 11)],
"random_state": [seed]
}
gs_clf = GridSearchCV(DecisionTreeClassifier(), params, cv=5)
gs_clf.fit(X_train, y_train)
print("tuned hpyerparameters :(best parameters) ", gs_clf.best_params_)
print("accuracy :", gs_clf.best_score_)
Output
tuned hyperparameters :(best parameters) {'criterion': 'entropy', 'max_depth': 8, 'min_samples_leaf': 2, 'min_samples_split': 9, 'random_state': 0, 'splitter': 'random'}
accuracy : 0.955403556771546
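If you want to inspect the tuned tree itself, scikit-learn's export_text prints its decision rules; a minimal sketch using the feature names bundled with the dataset:
from sklearn.tree import export_text
# Print the decision rules of the best tree found above
print(export_text(gs_clf.best_estimator_, feature_names=list(cancer_data.feature_names)))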
Support vector machine
params = {
'C': [0.1, 1, 10, 100, 1000],
'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
'kernel': ['linear', 'rbf'],
'random_state': [seed]
}
gs_clf = GridSearchCV(SVC(), params, cv=5, refit=True, verbose=3)
gs_clf.fit(X_train, y_train)
print("tuned hpyerparameters :(best parameters) ", gs_clf.best_params_)
print("accuracy :", gs_clf.best_score_)
Output
tuned hyperparameters :(best parameters) {'C': 0.1, 'gamma': 1, 'kernel': 'linear', 'random_state': 0}
accuracy : 0.9577017783857729
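SVMs, especially with the RBF kernel, are sensitive to feature scale, and this dataset's features span very different ranges. A common pattern is to grid-search over a Pipeline that standardizes first; grid keys are then prefixed with the step name (svc__C and so on). A sketch:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# Standardize features, then fit the SVM, inside a single pipeline
pipe = Pipeline([("scaler", StandardScaler()), ("svc", SVC(random_state=seed))])
pipe_params = {
    "svc__C": [0.1, 1, 10, 100, 1000],
    "svc__gamma": [1, 0.1, 0.01, 0.001, 0.0001],
    "svc__kernel": ["linear", "rbf"],
}
gs_pipe = GridSearchCV(pipe, pipe_params, cv=5)
gs_pipe.fit(X_train, y_train)
print(gs_pipe.best_params_, gs_pipe.best_score_)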
Random forest
params = {
'criterion': ['gini', 'entropy'],
'n_estimators': [10, 100, 300, 500, 1000, 1500, 2000],
'max_depth': [3, 5, 7, 9, 11],
"random_state": [seed]
}
gs_clf = GridSearchCV(RandomForestClassifier(), params, cv=5)
gs_clf.fit(X_train, y_train)
print("tuned hpyerparameters :(best parameters) ", gs_clf.best_params_)
print("accuracy :", gs_clf.best_score_)
Output
tuned hyperparameters :(best parameters) {'criterion': 'entropy', 'max_depth': 9, 'n_estimators': 300, 'random_state': 0}
accuracy : 0.9577838577291382
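A fitted random forest also exposes feature_importances_, which shows at a glance which measurements drive the predictions; for example:
# Top five most important features of the tuned forest
importances = pd.Series(gs_clf.best_estimator_.feature_importances_, index=cancer_data.feature_names)
print(importances.sort_values(ascending=False).head(5))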
Gradient boosting
params = {
    "loss": ["deviance"],  # renamed to "log_loss" in newer scikit-learn
    "learning_rate": [0.001, 0.005, 0.01, 0.05, 0.075, 0.1, 0.15, 0.2],
    "min_samples_split": [0.2],
    "min_samples_leaf": [0.2],
    "max_depth": [3, 5, 7, 9, 11],
    "max_features": ["log2", "sqrt"],
    "criterion": ["friedman_mse", "mae"],  # "mae" was removed in newer scikit-learn
    "subsample": [0.5, 0.618, 0.8, 0.85, 0.9, 0.95, 1.0],
    "n_estimators": [10, 100, 300, 500]
}
gs_clf = GridSearchCV(GradientBoostingClassifier(), params, cv=5, n_jobs=-1)
gs_clf.fit(X_train, y_train)
print("tuned hpyerparameters :(best parameters) ", gs_clf.best_params_)
print("accuracy :", gs_clf.best_score_)
Output
tuned hyperparameters :(best parameters) {'criterion': 'friedman_mse', 'learning_rate': 0.15, 'loss': 'deviance', 'max_depth': 7, 'max_features': 'sqrt', 'min_samples_leaf': 0.2, 'min_samples_split': 0.2, 'n_estimators': 500, 'subsample': 0.95}
accuracy : 0.97890560875513
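This grid is large (several thousand combinations), so the exhaustive search above is slow even with n_jobs=-1. RandomizedSearchCV samples a fixed number of candidates from the same grid and is often a practical substitute; a sketch trying 50 random candidates:
from sklearn.model_selection import RandomizedSearchCV
# Sample 50 parameter combinations instead of trying them all
rs_clf = RandomizedSearchCV(GradientBoostingClassifier(), param_distributions=params,
                            n_iter=50, cv=5, n_jobs=-1, random_state=seed)
rs_clf.fit(X_train, y_train)
print(rs_clf.best_params_, rs_clf.best_score_)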
XGBoost
params = {
'min_child_weight': [1, 5, 10],
'gamma': [0.5, 1, 1.5, 2, 5],
'subsample': [0.6, 0.8, 1.0],
'colsample_bytree': [0.6, 0.8, 1.0],
'max_depth': [3, 4, 5]
}
gs_clf = GridSearchCV(XGBClassifier(), params, cv=5)
gs_clf.fit(X_train, y_train)
print("tuned hpyerparameters :(best parameters) ", gs_clf.best_params_)
print("accuracy :", gs_clf.best_score_)
Output
tuned hyperparameters :(best parameters) {'colsample_bytree': 1.0, 'gamma': 0.5, 'max_depth': 3, 'min_child_weight': 1, 'subsample': 0.6}
accuracy : 0.9694938440492477
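Rather than fixing n_estimators, XGBoost can also stop adding trees once a validation metric stops improving. A sketch using the constructor-level early-stopping API of recent xgboost releases (2.x; older releases pass early_stopping_rounds to fit() instead):
# Hold out a validation set from the training data for early stopping
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, random_state=seed)
xgb_es = XGBClassifier(n_estimators=1000, early_stopping_rounds=20, eval_metric="logloss")
xgb_es.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=False)
print("best iteration :", xgb_es.best_iteration)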
Naive Bayes
params = {
'var_smoothing': np.logspace(0,-9, num=100)
}
gs_clf = GridSearchCV(GaussianNB(), params, cv=5)
gs_clf.fit(X_train, y_train)
print("tuned hpyerparameters :(best parameters) ", gs_clf.best_params_)
print("accuracy :", gs_clf.best_score_)
Output
tuned hyperparameters :(best parameters) {'var_smoothing': 2.848035868435805e-09}
accuracy : 0.9413406292749658
Stochastic Gradient Descent
params = {
    "loss": ["hinge", "log", "squared_hinge", "modified_huber"],  # "log" was renamed "log_loss" in newer scikit-learn
    "alpha": [0.0001, 0.001, 0.01, 0.1],
    "penalty": ["l2", "l1", "none"],  # "none" is spelled None in newer scikit-learn
    "max_iter": [500, 600, 700, 800, 900, 1000]
}
gs_clf = GridSearchCV(SGDClassifier(), params, cv=5)
gs_clf.fit(X_train, y_train)
print("tuned hpyerparameters :(best parameters) ", gs_clf.best_params_)
print("accuracy :", gs_clf.best_score_)
Output
tuned hyperparameters :(best parameters) {'alpha': 0.001, 'loss': 'modified_huber', 'max_iter': 800, 'penalty': 'none'}
accuracy : 0.9247606019151846
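SGDClassifier is also sensitive to feature scale, which partly explains the comparatively low score here. A minimal sketch that standardizes inside a pipeline before cross-validating:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
# Standardize features before SGD; scores are typically higher and more stable
sgd_pipe = make_pipeline(StandardScaler(), SGDClassifier(random_state=seed))
print(model_selection.cross_val_score(sgd_pipe, X_train, y_train, cv=5).mean())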
Comparing model accuracy
Using the parameters tuned above with grid search, we compare the accuracy of each model. In the table below, "train_score" is the mean 5-fold cross-validation accuracy (computed on the test split, as in the code), while "test_score" is the accuracy of the train-fitted model on X_test.
As a bonus, a Max Voting model, a simple ensemble method, is added as well.
kfold = model_selection.KFold(n_splits = 5)
scores = {}
# Logistic regression
lr_clf = LogisticRegression(C=100, penalty="l2", solver="liblinear")
lr_clf.fit(X_train,y_train)
results = model_selection.cross_val_score(lr_clf, X_test, y_test, cv = kfold)
scores[('1.LogisticRegression', 'train_score')] = results.mean()
scores[('1.LogisticRegression', 'test_score')] = lr_clf.score(X_test, y_test)
# k-nearest neighbors
kn_clf = KNeighborsClassifier(n_neighbors=13, weights='distance', p=1)
kn_clf.fit(X_train,y_train)
results = model_selection.cross_val_score(kn_clf, X_test, y_test, cv = kfold)
scores[('2.KNeighborsClassifier', 'train_score')] = results.mean()
scores[('2.KNeighborsClassifier', 'test_score')] = kn_clf.score(X_test, y_test)
# Decision tree
dtc_clf = DecisionTreeClassifier(
criterion="entropy",
splitter= "random",
max_depth= 8,
min_samples_split= 9,
min_samples_leaf= 2,
random_state= 0
)
dtc_clf.fit(X_train,y_train)
results = model_selection.cross_val_score(dtc_clf, X_test, y_test, cv = kfold)
scores[('3.DecisionTreeClassifier', 'train_score')] = results.mean()
scores[('3.DecisionTreeClassifier', 'test_score')] = dtc_clf.score(X_test, y_test)
# Support vector machine
svm_clf = SVC(
C=0.1,
gamma=1,
kernel='linear',
random_state=0
)
svm_clf.fit(X_train,y_train)
results = model_selection.cross_val_score(svm_clf, X_test, y_test, cv = kfold)
scores[('4.SVM', 'train_score')] = results.mean()
scores[('4.SVM', 'test_score')] = svm_clf.score(X_test, y_test)
# Random forest
rf_clf = RandomForestClassifier(
criterion="entropy",
max_depth= 9,
n_estimators= 300,
random_state= 0
)
rf_clf.fit(X_train,y_train)
results = model_selection.cross_val_score(rf_clf, X_test, y_test, cv = kfold)
scores[('5.RandomForestClassifier', 'train_score')] = results.mean()
scores[('5.RandomForestClassifier', 'test_score')] = rf_clf.score(X_test, y_test)
# Gradient boosting
gb_clf = GradientBoostingClassifier(
criterion= 'friedman_mse',
learning_rate=0.15,
loss= 'deviance',
max_depth=7,
max_features='sqrt',
min_samples_leaf= 0.2,
min_samples_split= 0.2,
n_estimators= 500,
subsample= 0.95
)
gb_clf.fit(X_train, y_train)
results = model_selection.cross_val_score(gb_clf, X_test, y_test, cv = kfold)
scores[('6.GradientBoosting', 'train_score')] = results.mean()
scores[('6.GradientBoosting', 'test_score')] = gb_clf.score(X_test, y_test)
# XGBoost
xgb_clf = XGBClassifier(
min_child_weight=1,
gamma=0.5,
subsample=0.6,
colsample_bytree=1.0,
max_depth=3
)
xgb_clf.fit(X_train, y_train)
results = model_selection.cross_val_score(xgb_clf, X_test, y_test, cv = kfold)
scores[('7.XGBClassifier', 'train_score')] = results.mean()
scores[('7.XGBClassifier', 'test_score')] = xgb_clf.score(X_test, y_test)
# Naive Bayes
gnb_clf = GaussianNB(var_smoothing=2.848035868435805e-09)
gnb_clf.fit(X_train, y_train)
results = model_selection.cross_val_score(gnb_clf, X_test, y_test, cv = kfold)
scores[('8.GaussianNB', 'train_score')] = results.mean()
scores[('8.GaussianNB', 'test_score')] = gnb_clf.score(X_test, y_test)
# Stochastic Gradient Descent
sgd_clf = SGDClassifier(
alpha=0.001,
loss='modified_huber',
max_iter=800,
penalty='none'
)
sgd_clf.fit(X_train, y_train)
results = model_selection.cross_val_score(sgd_clf, X_test, y_test, cv = kfold)
scores[('9.SGDClassifier', 'train_score')] = results.mean()
scores[('9.SGDClassifier', 'test_score')] = sgd_clf.score(X_test, y_test)
# Voting ensemble (Max Voting)
estimators = [
    ('lr', lr_clf), ('kn', kn_clf), ('dtc', dtc_clf), ('svc', svm_clf),
    ('random', rf_clf), ('gradient', gb_clf), ('xgb', xgb_clf), ('gnb', gnb_clf)
]
vote_clf=VotingClassifier(estimators=estimators, voting='hard')
vote_clf.fit(X_train, y_train)
results = model_selection.cross_val_score(vote_clf, X_test, y_test, cv = kfold)
scores[('10.MaxVoting', 'train_score')] = results.mean()
scores[('10.MaxVoting', 'test_score')] = vote_clf.score(X_test, y_test)
# Model evaluation
pd.Series(scores).unstack()
Output
test_score train_score
1.LogisticRegression 0.951049 0.944089
2.KNeighborsClassifier 0.958042 0.944089
3.DecisionTreeClassifier 0.937063 0.916502
4.SVM 0.930070 0.944089
5.RandomForestClassifier 0.986014 0.944335
6.GradientBoosting 0.986014 0.951232
7.XGBClassifier 0.979021 0.936946
8.GaussianNB 0.937063 0.950985
9.SGDClassifier 0.867133 0.852956
10.MaxVoting 0.972028 0.958374
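The same table can also be turned into a horizontal bar chart for an at-a-glance comparison (assumes matplotlib is installed):
import matplotlib.pyplot as plt
# Plot train and test scores of all models side by side
pd.Series(scores).unstack().plot(kind="barh")
plt.xlabel("accuracy")
plt.tight_layout()
plt.show()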
This should make it easy to compare the accuracy of the models.
Conclusion
Thank you for reading to the end.
As long as you can prepare the data, the parameter tuning here should work largely by copy-and-paste.
I hope you find it useful!
I would also like to put together a similar grid-search summary for regression models someday.