Python Machine Learning Templates
Looking these things up every time is a pain, so here is a rough collection. The pandas basics are here.
The GitHub repo is here.
First, import the basic packages used in most data analyses (everything except the models).
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn import datasets
from collections import defaultdict
import json
pd.set_option("display.max_columns", 100)
Tools
A collection of utilities you are likely to need.
Saving and loading variables
import pickle

def var_save(filename, var):
    with open(filename, "wb") as f:
        pickle.dump(var, f)

def var_load(filename):
    with open(filename, "rb") as f:
        var = pickle.load(f)
    return var
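A minimal usage sketch (the filename and object are arbitrary examples):
obj = {"fold": 0, "score": 0.9}  # any picklable object
var_save("obj.pkl", obj)
print(var_load("obj.pkl"))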
K-Fold CV
kf = KFold(n_splits=5, random_state=1, shuffle=True)
for train_index, valid_index in kf.split(X):
    X_train, X_valid = X[train_index], X[valid_index]
    y_train, y_valid = y[train_index], y[valid_index]
onehot
def onehot(target):
    return np.eye(np.unique(target).shape[0])[target]
target is assumed to hold integer category values from 0 to (n-1). With pandas, pd.get_dummies(target) also works. If the number of categories is known in advance but target may not contain all of them, use
onehot = lambda target, num: np.eye(num)[target]
instead.
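A quick sanity check with a made-up target (here class 3 never appears in target):
target = np.array([0, 2, 1, 2])
print(onehot(target))     # 4x3 one-hot matrix inferred from the values present
print(np.eye(4)[target])  # fixed number of categories (4), even though class 3 is absent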
AUC
from sklearn import metrics

def auc(train, predict):
    fpr, tpr, thresholds = metrics.roc_curve(train, predict)
    return metrics.auc(fpr, tpr)
For two-class (binary) problems only.
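A toy usage sketch (labels and scores are made up):
y_true = np.array([0, 1, 1, 0, 1])
y_score = np.array([0.1, 0.8, 0.6, 0.3, 0.9])
print(auc(y_true, y_score))  # 1.0 on this toy data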
Coefficient optimization
import numpy as np
from functools import partial
from scipy.optimize import minimize

class CoefficientOptimizer:
    def __init__(
        self,
        loss_func,
        initial_coef=None,
    ):
        self.loss_func = loss_func
        self.initial_coef = initial_coef
        self.status = dict()

    @property
    def coefficients(self):
        assert "x" in self.status
        return self.status["x"]

    def fit(self, X, y):
        if self.initial_coef is None:
            self.initial_coef = np.ones(X.shape[1]) / X.shape[1]
        loss_partial = partial(self._score, X=X, y=y)
        self.status = minimize(
            loss_partial,
            self.initial_coef,
            method="nelder-mead",
        )

    def _score(self, coef, X, y):
        blend = self.predict(X, coef)
        score = self.loss_func(y, blend)
        return score

    def score(self, X, y, coef=None):
        if coef is None:
            coef = self.coefficients
        return self._score(coef, X, y)

    def predict(self, X, coef=None):
        if coef is None:
            coef = self.coefficients
        blend = np.dot(X, coef)
        return blend
Optimizes the blending coefficients when ensembling model predictions.
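A minimal usage sketch, assuming the columns of X are two models' predictions and taking mean_squared_error as the loss (both are dummy choices for illustration):
from sklearn.metrics import mean_squared_error

rng = np.random.default_rng(0)
y = rng.normal(size=100)  # dummy target
preds = np.column_stack([
    y + rng.normal(size=100),      # dummy predictions of model 1
    y + 2 * rng.normal(size=100),  # dummy predictions of model 2
])
opt = CoefficientOptimizer(loss_func=mean_squared_error)
opt.fit(preds, y)
print(opt.coefficients)     # blending weights
print(opt.score(preds, y))  # loss of the blend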
Dimensionality reduction
PCA (principal component analysis)
from sklearn.decomposition import PCA

def pca(X, dim):
    pca = PCA(n_components=dim)
    pca.fit(X)
    print(sum(pca.explained_variance_ratio_))  # cumulative explained variance ratio
    return pca.transform(X)
Reduces X from its original dimensionality down to dim dimensions.
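A quick usage sketch on random data (shapes are arbitrary):
X_demo = np.random.randn(100, 10)
X_reduced = pca(X_demo, 2)
print(X_reduced.shape)  # (100, 2)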
AutoEncoder
from keras.layers import Input, Dense
from keras.models import Model

def autoenc(X, encoding_dim, epochs):
    input_dim = X.shape[1]  # input dimensionality
    # depth of the encoder/decoder
    depth = 3
    # encoder layers
    input_img = Input(shape=(input_dim,))
    encoded = Dense(128, activation="relu")(input_img)
    encoded = Dense(64, activation="relu")(encoded)
    encoded = Dense(encoding_dim, activation="sigmoid")(encoded)
    # decoder layers
    decoded = Dense(64, activation="relu")(encoded)
    decoded = Dense(128, activation="relu")(decoded)
    decoded = Dense(input_dim, activation="sigmoid")(decoded)
    # autoencoder
    autoencoder = Model(input_img, decoded)
    # encoder model
    encoder = Model(input_img, encoded)
    # decoder model: reuse the last `depth` layers of the autoencoder
    encoded_input = Input(shape=(encoding_dim,))
    decoder_layer = encoded_input
    for i in range(depth):
        decoder_layer = autoencoder.layers[i - depth](decoder_layer)
    decoder = Model(encoded_input, decoder_layer)
    # training
    autoencoder.compile(optimizer="adam", loss="mean_squared_error")
    autoencoder.fit(X, X, epochs=epochs, batch_size=128, shuffle=True)
    return (encoder, decoder)
Change the number of layers and units to match the data.
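A possible usage sketch; encoding_dim and epochs are arbitrary choices, and since the output layer is a sigmoid, X is assumed to be scaled to [0, 1]:
encoder, decoder = autoenc(X, encoding_dim=2, epochs=50)
X_enc = encoder.predict(X)      # compressed representation
X_rec = decoder.predict(X_enc)  # reconstruction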
Vectorizing natural-language text
TF-IDF vector
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

tfidf = TfidfVectorizer(
    stop_words=None,
    binary=False,
    norm="l2",
    analyzer="char_wb",
    ngram_range=(2, 3),
    min_df=2,
    dtype=np.float32,
)
texts = ["あの鳥はスズメです", "これはペンです", "あの鳥はカラスです"]
text_embeddings = tfidf.fit_transform(texts)
# dimensionality reduction
# svd = TruncatedSVD(n_components=512, random_state=1)
# vectors = svd.fit_transform(text_embeddings)
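As a quick check of the vectors, pairwise cosine similarities between the sentences can be computed:
from sklearn.metrics.pairwise import cosine_similarity
print(cosine_similarity(text_embeddings))  # the 1st and 3rd sentences should come out most similar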
Universal Sentence Encoder
import tensorflow_hub as hub
import tensorflow_text
# import ssl
# ssl._create_default_https_context = ssl._create_unverified_context
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual/3")
texts = ["あの鳥はスズメです", "これはペンです", "あの鳥はカラスです"]
vectors = embed(texts)
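Inner products between the embeddings can serve as sentence similarities, as in the TF Hub examples:
print(np.inner(vectors, vectors))  # pairwise similarity matrix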
Models
An introduction to models for regression and classification.
Meant for copy-paste use, with no explanations.
Preparing datasets
First, datasets for trying things out:
Regression
diabetes = datasets.load_diabetes()
X = diabetes.data
y = diabetes.target
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=1, test_size=0.1)
Classification
iris = datasets.load_iris()
X = iris.data
y = iris.target
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=1, test_size=0.1)
The models below are grouped into regression and classification, but some of the classification models also work for regression, so don't read too much into the split.
Regression
LASSO
from sklearn import linear_model

model = linear_model.Lasso(alpha=1, fit_intercept=True)  # alpha: L1 coefficient
model.fit(X_train, y_train)
print(model.predict(X_valid))
print(model.score(X_valid, y_valid))
print(model.coef_)
Ridge regression
from sklearn.linear_model import Ridge

model = Ridge(alpha=1.0)  # alpha: L2 coefficient
model.fit(X_train, y_train)
print(model.predict(X_valid))
print(model.score(X_valid, y_valid))
Random forest
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(max_depth=5, random_state=1, n_estimators=100)
model.fit(X_train, y_train)
print(model.predict(X_valid))
print(model.score(X_valid, y_valid))
Classification
Logistic regression
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(C=1, penalty="l1", solver="liblinear")  # C: coefficient on the loss (the inverse of the regularization strength)
model.fit(X_train, y_train)
print(model.score(X_valid, y_valid))
print(model.predict(X_valid))
print(model.predict_proba(X_valid))
SVM
from sklearn.svm import LinearSVC

model = LinearSVC(C=1, penalty="l1", loss="squared_hinge", dual=False, max_iter=10000)  # C: coefficient on the loss (the inverse of the regularization strength)
model.fit(X_train, y_train)
print(model.predict(X_valid))
print(model.score(X_valid, y_valid))
print(model._predict_proba_lr(X_valid))  # note: a private method
print(model.coef_)
kernel SVM
from sklearn.svm import SVC
model = SVC(C=1, kernel="rbf")
model.fit(X_train, y_train)
print(model.predict(X_valid))
print(model.score(X_valid, y_valid))
To use a kernel computed in advance, pass kernel="precomputed".
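A minimal sketch with a precomputed linear kernel (the Gram matrices here are built just for illustration):
gram_train = np.dot(X_train, X_train.T)  # train-vs-train Gram matrix
model = SVC(C=1, kernel="precomputed")
model.fit(gram_train, y_train)
gram_valid = np.dot(X_valid, X_train.T)  # valid-vs-train Gram matrix
print(model.predict(gram_valid))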
Random forest
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
model.fit(X_train, y_train)
print(model.predict(X_valid))
print(model.score(X_valid, y_valid))
LightGBM
import lightgbm as lgb
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)
lgbm_params = {
    "objective": "multiclass",  # "xentropy",
    "num_class": 3,
    "metric": "multi_logloss",  # "auc",
    "num_leaves": 15,
    # "min_data_in_leaf": 20,
    # "bagging_fraction": 0.9,
    # "bagging_freq": 2,
    # "feature_fraction": 0.9,
    "lambda_l1": 0.1,
    "lambda_l2": 0.01,
    # "min_gain_to_split": 0.1,
    # "max_depth": 10,
    "learning_rate": 0.05,
    "seed": 1,
}
model = lgb.train(
    lgbm_params, lgb_train,
    valid_sets=[lgb_train, lgb_eval],
    valid_names=["train", "valid"],
    num_boost_round=5000,
    # early_stopping_rounds=50, verbose_eval=50  # pre-4.0 keyword API; use callbacks instead
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(50),
    ],
    # feval=lgb_custom_metric
)
print(model.predict(X_valid, num_iteration=model.best_iteration))
print(model.feature_importance())
lgb.plot_importance(model, ignore_zero=False, height=0.5)
print(model.best_score)
Notes on writing a custom metric
def lgb_custom_metric(preds, data):
    y_true = data.get_label()
    # LightGBM >= 4 passes multiclass preds as (n_samples, n_classes); older versions pass a flat class-major array
    if preds.ndim == 2:
        y_pred = np.argmax(preds, axis=1)
    else:
        y_pred = np.argmax(preds.reshape(3, -1), axis=0)
    acc = np.mean(y_true == y_pred)
    return "custom_metric", acc, True  # True when higher is better
xgboost
import xgboost as xgb

dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_valid, label=y_valid)
evals = [(dtrain, "train"), (dvalid, "eval")]
xgb_params = {
    "objective": "multi:softmax",  # "rmse",
    "num_class": 3,
    "eval_metric": "mlogloss",  # "rmse",
    "max_depth": 5,
    "subsample": 0.9,
    "colsample_bytree": 0.9,
    "alpha": 0.1,
    "lambda": 0.01,
    "gamma": 0.1,
}
model = xgb.train(xgb_params, dtrain, num_boost_round=10000, early_stopping_rounds=100, evals=evals)
print(model.predict(
    dvalid,
    # ntree_limit=model.best_ntree_limit,  # old API, removed in XGBoost 2.0
    iteration_range=(0, model.best_iteration + 1),
))
print(model.get_score())
xgb.plot_importance(model)
catboost
import catboost

model = catboost.CatBoostClassifier(
    iterations=1000,
    use_best_model=True,
    eval_metric="HingeLoss",  # "AUC",
    random_seed=1,
    l2_leaf_reg=3,
    depth=6,
    loss_function="MultiClass",  # "CrossEntropy",
    classes_count=3,
)
model.fit(
    X_train, y_train,
    # cat_features=categorical_features_index,
    eval_set=(X_valid, y_valid),
    early_stopping_rounds=20,
)
print(model.predict(X_valid))
print(model.score(X_valid, y_valid))
Neural network
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
model = Sequential()
model.add(Dense(units=64, activation="relu", input_dim=X_train.shape[1]))
model.add(Dense(units=32, activation="relu"))
model.add(Dense(units=3, activation="softmax"))
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
model.fit(X_train, onehot(y_train), epochs=200, batch_size=256)
print(np.argmax(model.predict(X_valid, batch_size=256), axis=1))
k-nearest neighbors
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
print(knn.score(X_valid, y_valid))
print(knn.predict_proba(X_valid))
Decision tree (CART)
from sklearn import tree
clf = tree.DecisionTreeClassifier(max_depth=4)
clf = clf.fit(X_train, y_train)
print(clf.score(X_valid, y_valid))
Linear discriminant analysis (LDA)
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

lda = LDA(n_components=2)
lda.fit(X_train, y_train)
print(lda.score(X_valid, y_valid))
X_lda = lda.transform(X_train)
for c in set(y_train):
    plt.scatter(X_lda[y_train == c, 0], X_lda[y_train == c, 1], label=f"class{c}")
plt.legend()
Clustering
K-means clustering
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=3, n_init="auto")
kmeans.fit(X)
print(kmeans.labels_)
Hierarchical clustering
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster, set_link_color_palette

ward = linkage(X, method="ward", metric="euclidean")
print("ward:")
print(ward)  # show the result of Ward's method
threshold = 0.7 * np.max(ward[:, 2])  # pick a threshold, somewhat arbitrarily
dendrogram(ward, color_threshold=threshold)  # plot the hierarchical clustering result
plt.show()  # display
clustered = fcluster(ward, threshold, criterion="distance")  # the cluster each point belongs to when cut at the threshold
print("clustering:")
print(clustered)
# ward is an (n - 1) x 4 matrix
# each of the n - 1 rows corresponds to one merge step
# the first two columns hold the indices of the clusters that were merged
# the third column holds the distance between those two clusters
# the fourth column holds the number of elements in the merged cluster
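The tree can also be cut by a target number of clusters instead of a distance threshold:
print(fcluster(ward, 3, criterion="maxclust"))  # cut so that at most 3 clusters remain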