回帰
線形回帰
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(X, y)
y_pred = lin_reg.predict(X)
lin_reg.score(X, y) #R2スコア
lin_reg.get_params()
lin_reg.set_params(**params)
Ridge回帰(L2正則化)
from sklearn.linear_model import Ridge
ridge = Ridge(alpha=1)
ridge.fit(X, y)
y_pred = ridge.predict(X)
ridge.score(X, y)
グリッドサーチによるモデル選択
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
param_grid = {'alpha': [0.1, 0.25, 0.5, 0.75, 0.9]}
grid_search = GridSearchCV(Ridge(), param_grid=param_grid, cv=5)
grid_search.fit(X, y)
best_model = grid_search.best_estimator_
print(grid_search.best_params_)
print(grid_search.best_score_)
Lasso回帰(L1正則化)
from sklearn.linear_model import Lasso
lasso = Lasso(alpha=0.1)
lasso.fit(X, y)
y_pred = lasso.predict(X)
lasso.score(X, y)
グリッドサーチによるモデル選択
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
param_grid = {'alpha': [0.1, 0.25, 0.5, 0.75, 0.9]}
grid_search = GridSearchCV(Lasso(), param_grid=param_grid, cv=5)
grid_search.fit(X, y)
best_model = grid_search.best_estimator_
print(grid_search.best_params_)
print(grid_search.best_score_)
ElasticNet
グリッドサーチによるモデル選択
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV
param_grid = {'alpha': [0.1, 0.5, 0.9], 'l1_ratio': [0.01, 0.1, 0.5, 0.9, 0.99]}
grid_search = GridSearchCV(ElasticNet(), param_grid=param_grid, cv=5)
grid_search.fit(X, y)
best_model = grid_search.best_estimator_
print(grid_search.best_params_)
print(grid_search.best_score_)
決定木
インポート
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier, plot_tree
学習と予測
iris = load_iris()
X = iris.data
y = iris.target
tree_clf = DecisionTreeClassifier(max_depth=2)
tree_clf.fit(X, y)
tree_clf.predict_proba(new_data)
tree_clf.predict(new_data)
決定木の可視化
plot_tree(tree_clf, feature_names=iris.feature_names, filled=True)
plt.show()
ランダムフォレスト
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
rnd_clf = RandomForestClassifier()
rnd_reg = RandomForestRegressor()
ハイパーパラメータ | 選択肢 | default |
---|---|---|
n_estimators | int型 | 10 |
criterion | gini、entropy | gini |
max_depth | int型 or None | None |
min_samples_split | int、float型 | 2 |
min_samples_leaf | int、float型 | 1 |
min_weight_fraction_leaf | float型 | 0 |
max_features | int、float型、None、auto、sqrt、log2 | auto |
max_leaf_nodes | int型 or None | None |
min_impurity_decrease | float型 | 0 |
min_impurity_split | float型 | 1e-7 |
bootstrap | bool型 | True |
oob_score | bool型 | False |
n_jobs | int型 or None | None |
random_state | int型、RandomState instance or None | None |
verbose | int型 | 0 |
warm_start | bool型 | False |
class_weight | 辞書型、balanced、balanced_subsample or None | None |
詳しくは
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
データ変換
最大最小スケーリング(正規化)
パラメータ feature_range は省略することもできる。その場合は (0, 1)、つまり0から1の範囲になる。
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 1))
X_scaled = scaler.fit_transform(X)
標準化
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
モデル評価、選択
訓練データと検証データに分割
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
K分割交差検証
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, X, y, cv=5)
print("Scores:", scores)
print("Mean:", scores.mean())
print("Standard deviation:", scores.std())
cross_val_score(reg, X, y, scoring="neg_mean_squared_error", cv=10)
のように、引数 scoring で評価関数を指定することもできる。これはk個のモデルの比較に使うスコアである。
グリッドサーチ
from sklearn.model_selection import GridSearchCV
param_grid = { : [ ]}
model =
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)
grid_search.fit(X, y)
best_model = grid_search.best_estimator_
best_model.predict(X)
print(grid_search.best_params_)
print(grid_search.best_score_)