Python Machine Learning Template

Posted at 2018-11-21

Looking these things up every time is a hassle, so this is a rough summary. Pandas basics are covered here.
The GitHub repository is here.

First, import the basic packages likely to be used for data analysis (models excluded).

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn import datasets
from collections import defaultdict
import json

pd.set_option("display.max_columns", 100)

Tools

A summary of utilities that tend to come in handy.

Saving and loading variables

import pickle

def var_save(filename, var):
    with open(filename, "wb") as f:
        pickle.dump(var, f)

def var_load(filename):
    with open(filename, "rb") as f:
        var = pickle.load(f)
    return var
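
A minimal usage example of the helpers above (the filename and variable here are made up):

var_save("params.pkl", {"alpha": 1.0})  # save an arbitrary object
params = var_load("params.pkl")         # load it back
print(params)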

K-Fold CV

kf = KFold(n_splits=5, random_state=1, shuffle=True)
for train_index, valid_index in kf.split(X):
    X_train, X_valid = X[train_index], X[valid_index]
    y_train, y_valid = y[train_index], y[valid_index]
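
A minimal sketch of running a model inside the fold loop and averaging the validation scores; Ridge and the X, y arrays are just placeholders.

from sklearn.linear_model import Ridge

kf = KFold(n_splits=5, random_state=1, shuffle=True)
scores = []
for train_index, valid_index in kf.split(X):
    X_train, X_valid = X[train_index], X[valid_index]
    y_train, y_valid = y[train_index], y[valid_index]
    model = Ridge(alpha=1.0)                       # placeholder model
    model.fit(X_train, y_train)
    scores.append(model.score(X_valid, y_valid))   # R^2 on each fold
print(np.mean(scores))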

onehot

def onehot(target):
    return np.eye(np.unique(target).shape[0])[target]

target is assumed to hold integer category values from 0 to (n-1).
Using pandas, pd.get_dummies(target) also works.
If the number of categories is known in advance but target may not contain every category,
onehot = lambda target, num: np.eye(num)[target] will do.
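
A small example with made-up labels, contrasting the two variants:

target = np.array([0, 2, 1, 0])
print(onehot(target))                  # shape (4, 3): categories inferred from target
onehot_fixed = lambda target, num: np.eye(num)[target]
print(onehot_fixed(target, 4))         # shape (4, 4): 4 categories assumed even though only 3 appear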

AUC

from sklearn import metrics

def auc(y_true, y_score):
    fpr, tpr, thresholds = metrics.roc_curve(y_true, y_score)
    return metrics.auc(fpr, tpr)

For binary classification only.
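
A quick check with made-up labels and scores:

y_true = np.array([0, 1, 1, 0, 1])
y_score = np.array([0.2, 0.8, 0.6, 0.3, 0.9])
print(auc(y_true, y_score))  # 1.0 here, since every positive is scored above every negative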

Coefficient optimization

import numpy as np
from functools import partial
from scipy.optimize import minimize


class CoefficientOptimizer:
    def __init__(
        self,
        loss_func,
        initial_coef=None,
    ):
        self.loss_func = loss_func
        self.initial_coef = initial_coef
        self.status = dict()

    @property
    def coefficients(self):
        assert "x" in self.status
        return self.status["x"]

    def fit(self, X, y):
        if self.initial_coef is None:
            self.initial_coef = np.ones(X.shape[1]) / X.shape[1]
        loss_partial = partial(self._score, X=X, y=y)
        self.status = minimize(
            loss_partial,
            self.initial_coef,
            method="nelder-mead",
        )

    def _score(self, coef, X, y):
        blend = self.predict(X, coef)
        score = self.loss_func(y, blend)
        return score

    def score(self, X, y, coef=None):
        if coef is None:
            coef = self.coefficients
        return self._score(coef, X, y)

    def predict(self, X, coef=None):
        if coef is None:
            coef = self.coefficients
        blend = np.dot(X, coef)
        return blend

Coefficient optimization for ensembling (finding blending weights for multiple models' predictions).
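
A usage sketch with synthetic data; the two prediction arrays below stand in for the outputs of two models on a validation set.

from sklearn.metrics import mean_squared_error

rng = np.random.default_rng(1)
y_valid_demo = rng.normal(size=100)
pred_a = y_valid_demo + rng.normal(scale=0.5, size=100)  # made-up predictions of "model A"
pred_b = y_valid_demo + rng.normal(scale=1.0, size=100)  # made-up predictions of "model B"
preds = np.column_stack([pred_a, pred_b])

opt = CoefficientOptimizer(loss_func=mean_squared_error)
opt.fit(preds, y_valid_demo)
print(opt.coefficients)                # blending weights found by Nelder-Mead
print(opt.score(preds, y_valid_demo))  # MSE of the blended prediction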

Dimensionality reduction

PCA (principal component analysis)

from sklearn.decomposition import PCA

def pca(X, dim):
    pca = PCA(n_components=dim)
    pca.fit(X)
    print(sum(pca.explained_variance_ratio_)) # cumulative explained variance ratio
    return pca.transform(X)

Reduces the data from its original dimensionality down to dim dimensions.
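
A minimal usage sketch with random data (the shapes are arbitrary):

X_demo = np.random.rand(100, 10)
X_reduced = pca(X_demo, dim=3)
print(X_reduced.shape)  # (100, 3)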

AutoEncoder

from keras.layers import Input, Dense
from keras.models import Model

def autoenc(X, encoding_dim, epochs):
    input_dim = X.shape[1] # input dimension
    # depth of the encoder/decoder (number of Dense layers on each side)
    depth = 3
    # encoder layers
    input_img = Input(shape=(input_dim,))
    encoded = Dense(128, activation="relu")(input_img)
    encoded = Dense(64, activation="relu")(encoded)
    encoded = Dense(encoding_dim, activation="sigmoid")(encoded)
    # decoder layers
    decoded = Dense(64, activation="relu")(encoded)
    decoded = Dense(128, activation="relu")(decoded)
    decoded = Dense(input_dim, activation="sigmoid")(decoded)
    # AutoEncoder
    autoencoder = Model(input_img, decoded)
    # encoder model
    encoder = Model(input_img, encoded)
    # decoder model (reuses the trained decoder layers)
    encoded_input = Input(shape=(encoding_dim,))
    decoder_layer = encoded_input
    for i in range(depth):
        decoder_layer = autoencoder.layers[i-depth](decoder_layer)
    decoder = Model(encoded_input, decoder_layer)
    # compile and train
    autoencoder.compile(optimizer="adam", loss="mean_squared_error")
    autoencoder.fit(X, X, epochs=epochs, batch_size=128, shuffle=True)
    return (encoder, decoder)

Adjust the number of layers and nodes to match the data you are using.
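
A minimal usage sketch, assuming 64-dimensional data compressed to 8 dimensions (all values are arbitrary):

X_demo = np.random.rand(1000, 64).astype("float32")
encoder, decoder = autoenc(X_demo, encoding_dim=8, epochs=10)
X_encoded = encoder.predict(X_demo)            # (1000, 8) compressed representation
X_reconstructed = decoder.predict(X_encoded)   # (1000, 64) reconstruction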

Vectorizing natural language text

TF-IDF vector

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

tfidf = TfidfVectorizer(
    stop_words=None,
    binary=False,
    norm="l2",
    analyzer="char_wb",
    ngram_range=(2, 3),
    min_df=2,
    dtype=np.float32
)

texts = ["あの鳥はスズメです", "これはペンです", "あの鳥はカラスです"]
text_embeddings = tfidf.fit_transform(texts)

# dimensionality reduction (optional)
# svd = TruncatedSVD(n_components=512, random_state=1)
# vectors = svd.fit_transform(text_embeddings)
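
As a sanity check, cosine similarity between the texts can be computed; the first and third sentences should come out more similar to each other than to the second.

from sklearn.metrics.pairwise import cosine_similarity

print(cosine_similarity(text_embeddings))  # 3x3 pairwise similarity matrix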

Universal Sentence Encoder

import tensorflow_hub as hub
import tensorflow_text

# import ssl
# ssl._create_default_https_context = ssl._create_unverified_context

embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual/3")

texts = ["あの鳥はスズメです", "これはペンです", "あの鳥はカラスです"]
vectors = embed(texts)
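
Following the TF Hub usage examples, sentence similarity can be checked via the inner product of the embeddings (converted to a NumPy array for convenience):

vectors = np.array(vectors)
print(np.inner(vectors, vectors))  # pairwise similarity matrix for the three sentences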

Models

An introduction to regression and classification models.
No explanations; just for copy-and-paste.

Preparing datasets

First, datasets for trying things out:
Regression

diabetes = datasets.load_diabetes()
X = diabetes.data
y = diabetes.target
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=1, test_size=0.1)

Classification

iris = datasets.load_iris()
X = iris.data
y = iris.target
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=1,test_size=0.1)

Below, models are split into regression and classification, but some of the classification models can also be used for regression, so don't read too much into the split.

Regression

LASSO

from sklearn import linear_model

model = linear_model.Lasso(alpha=1, fit_intercept=True) # alpha: L1 regularization strength
model.fit(X_train, y_train)

print(model.predict(X_valid))
print(model.score(X_valid,y_valid))
print(model.coef_)

Ridge regression

from sklearn.linear_model import Ridge

model = Ridge(alpha=1.0) # alpha: L2 regularization strength
model.fit(X_train, y_train)

print(model.predict(X_valid))
print(model.score(X_valid,y_valid))

Random forest

from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(max_depth=5, random_state=1, n_estimators=100)
model.fit(X_train, y_train)

print(model.predict(X_valid))
print(model.score(X_valid,y_valid))

Classification

Logistic regression

from sklearn.linear_model import LogisticRegression

model = LogisticRegression(C=1, penalty="l1", solver="liblinear") # C: inverse of the regularization strength
model.fit(X_train, y_train)

print(model.score(X_valid, y_valid))
print(model.predict(X_valid))
print(model.predict_proba(X_valid))

SVM

from sklearn.svm import LinearSVC

model = LinearSVC(C=1, penalty="l1", loss="squared_hinge", dual=False, max_iter=10000) # C: inverse of the regularization strength
model.fit(X_train, y_train)

print(model.predict(X_valid))
print(model.score(X_valid,y_valid))
print(model._predict_proba_lr(X_valid)) # note: private helper; LinearSVC has no predict_proba
print(model.coef_)

kernel SVM

from sklearn.svm import SVC

model = SVC(C=1, kernel="rbf")
model.fit(X_train, y_train)
print(model.predict(X_valid))
print(model.score(X_valid,y_valid))

If you want to use a kernel matrix computed in advance, pass kernel="precomputed".
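
A minimal sketch of the precomputed route, building an RBF kernel matrix with scikit-learn's rbf_kernel (gamma left at its default):

from sklearn.metrics.pairwise import rbf_kernel

K_train = rbf_kernel(X_train, X_train)   # (n_train, n_train) kernel matrix
K_valid = rbf_kernel(X_valid, X_train)   # rows: validation samples, columns: training samples
model = SVC(C=1, kernel="precomputed")
model.fit(K_train, y_train)
print(model.score(K_valid, y_valid))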

Random forest

from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
model.fit(X_train, y_train)
print(model.predict(X_valid))
print(model.score(X_valid,y_valid))

LightGBM

import lightgbm as lgb

lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

lgbm_params = {
    "objective": "multiclass", #"xentropy",
    "num_class": 3,
    "metric": "multi_logloss", #"auc",
    "num_leaves": 15,
    # "min_data_in_leaf": 20,
    # "bagging_fraction": 0.9,
    # "bagging_freq" : 2,
    # "feature_fraction": 0.9
    "lambda_l1":0.1,
    "lambda_l2":0.01,
    # "min_gain_to_split":0.1
    # "max_depth":10
    "learning_rate": 0.05, 
    "seed":1,
}
model = lgb.train(
    lgbm_params, lgb_train, 
    valid_sets=[lgb_train, lgb_eval], 
    valid_names=["train", "valid"],
    num_boost_round=5000, 
#     early_stopping_rounds=50, verbose_eval=50
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(50)
    ],
#     feval=lgb_custom_metric
)
print(model.predict(X_valid, num_iteration=model.best_iteration))
print(model.feature_importance())
lgb.plot_importance(model, ignore_zero=False, height=0.5)
model.best_score
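
For the multiclass objective, predict returns per-class probabilities of shape (n_samples, num_class); take the argmax to get class labels:

pred_proba = model.predict(X_valid, num_iteration=model.best_iteration)
pred_label = np.argmax(pred_proba, axis=1)
print(pred_label)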

Notes on writing a custom metric

def lgb_custom_metric(preds, data):
    y_true = data.get_label()
    # assumes the flat, class-major preds layout of older LightGBM versions;
    # in LightGBM >= 4.0, preds for multiclass already has shape (n_samples, num_class)
    y_pred = np.argmax(preds.reshape(3, -1), axis=0)
    acc = np.mean(y_true == y_pred)
    return "custom_metric", acc, True  # True when higher is better

xgboost

import xgboost as xgb

dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_valid, label=y_valid)
evals = [(dtrain, "train"), (dvalid, "eval")]

xgb_params = {
    "objective": "multi:softmax",#"rmse", 
    "num_class": 3,
    "eval_metric": "mlogloss",#"rmse",
    "max_depth": 5, 
    "subsample": 0.9,
    "colsample_bytree": 0.9,
    "alpha": 0.1,
    "lambda": 0.01,
    "gamma": 0.1
}
model=xgb.train(xgb_params, dtrain, num_boost_round=10000, early_stopping_rounds=100, evals=evals)
print(model.predict(dvalid,
#                   ntree_limit=model.best_ntree_limit,  # older xgboost versions
                    iteration_range=(0, model.best_iteration + 1)
))
print(model.get_score())
xgb.plot_importance(model)

catboost

import catboost

model = catboost.CatBoostClassifier(iterations=1000, 
                                    use_best_model=True, 
                                    eval_metric = "HingeLoss", # "AUC",
                                    random_seed=1, 
                                    l2_leaf_reg=3,
                                    depth=6,
                                    loss_function="MultiClass",# "CrossEntropy",
                                    classes_count=3
                                  )
model.fit(X_train, y_train, 
        # cat_features=categorical_features_index, 
        eval_set=(X_valid, y_valid),
        early_stopping_rounds=20
        )
print(model.predict(X_valid))
print(model.score(X_valid,y_valid))

Neural network

from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout

model = Sequential()
model.add(Dense(units=64, activation="relu", input_dim=X_train.shape[1]))
model.add(Dense(units=32, activation="relu"))
model.add(Dense(units=3, activation="softmax"))
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

model.fit(X_train, onehot(y_train), epochs=200, batch_size=256)
np.argmax(model.predict(X_valid, batch_size=256), axis=1)

k-nearest neighbors

from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
print(knn.score(X_valid,y_valid))
print(knn.predict_proba(X_valid))

Decision tree (CART)

from sklearn import tree

clf = tree.DecisionTreeClassifier(max_depth=4)
clf = clf.fit(X_train, y_train)
print(clf.score(X_valid,y_valid))

Linear discriminant analysis (LDA)

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

lda = LDA(n_components=2)
lda.fit(X_train,y_train)

print(lda.score(X_valid,y_valid))
X_lda=lda.transform(X_train)
for c in set(y_train):
    plt.scatter(X_lda[y_train==c,0], X_lda[y_train==c,1], label=f"class{c}")
plt.legend()

Clustering

K-means clustering

from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=3, n_init="auto")
kmeans.fit(X)
print(kmeans.labels_)
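
The fitted model also exposes the cluster centers, and predict assigns new samples to the nearest center:

print(kmeans.cluster_centers_)   # coordinates of the 3 cluster centers
print(kmeans.predict(X[:5]))     # cluster assignment for new (here: the first 5) samples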

Hierarchical clustering

from scipy.cluster.hierarchy import dendrogram, linkage, fcluster, set_link_color_palette

ward = linkage(X, method="ward", metric="euclidean")
print("ward:")
print(ward) # linkage matrix produced by Ward's method
threshold = 0.7 * np.max(ward[:, 2]) # pick a threshold somewhat arbitrarily
dendrogram(ward, color_threshold=threshold) # plot the hierarchical clustering result
plt.show() # display the plot
clustered = fcluster(ward, threshold, criterion="distance") # cluster each sample belongs to when the tree is cut at the threshold
print("clustering:")
print(clustered)
# ward is an (n - 1) x 4 matrix
# n - 1 is the number of merge steps
# columns 1 and 2 hold the indices of the clusters that were merged
# column 3 holds the distance between those two clusters
# column 4 holds the number of samples in the merged cluster