More than 1 year has passed since last update.

Kaggle Masterに学ぶ機械学習実践アプローチ写経 08

Python

Posted at 2024-08-14

# %%
import numpy as np
import pandas as pd

from functools import partial

from sklearn import ensemble, metrics, model_selection

from skopt import gp_minimize, space

def optimize(params, param_names, x, y):
    """Objective for scikit-optimize: negative mean CV accuracy of a random forest.

    Args:
        params: Hyperparameter values, positionally aligned with ``param_names``.
        param_names: Keyword names for ``RandomForestClassifier``, one per
            entry of ``params``.
        x: Feature table; indexed with ``.iloc``, so a pandas object is expected.
        y: Target labels; indexed with ``.iloc`` as well.

    Returns:
        The mean accuracy over 5 stratified folds, multiplied by -1 so that a
        minimizer ends up maximizing accuracy.
    """
    hyperparams = {name: value for name, value in zip(param_names, params)}
    forest = ensemble.RandomForestClassifier(**hyperparams)
    splitter = model_selection.StratifiedKFold(n_splits=5)

    def score_fold(fit_rows, eval_rows):
        # Slice out the fold, refit the shared estimator, and score it on
        # the held-out rows.
        fit_features = x.iloc[fit_rows].values
        fit_labels = y.iloc[fit_rows].values
        eval_features = x.iloc[eval_rows].values
        eval_labels = y.iloc[eval_rows].values

        forest.fit(fit_features, fit_labels)
        predicted = forest.predict(eval_features)
        return metrics.accuracy_score(eval_labels, predicted)

    fold_scores = [
        score_fold(fit_rows, eval_rows)
        for fit_rows, eval_rows in splitter.split(X=x, y=y)
    ]
    return -1 * np.mean(fold_scores)

if __name__ == "__main__":
    # Bayesian optimization (Gaussian process) of random-forest
    # hyperparameters with scikit-optimize.
    df = pd.read_csv("../input/train.csv")
    # Features are every column except the target variable.
    X = df.drop("price_range", axis=1)
    y = df.price_range

    # Search space. The order of the dimensions must match param_names
    # below, because gp_minimize passes the values positionally.
    param_space = [
        # max_depth: integer in [3, 15]
        space.Integer(3, 15, name="max_depth"),
        # n_estimators: integer in [100, 1500]
        space.Integer(100, 1500, name="n_estimators"),
        # criterion: categorical choice
        space.Categorical(["gini", "entropy"], name="criterion"),
        # max_features: real value in [0.01, 1] with an explicit uniform prior
        space.Real(0.01, 1, prior="uniform", name="max_features"),
    ]

    param_names = ["max_depth", "n_estimators", "criterion", "max_features"]
    # Bind everything except the hyperparameter vector, so the minimizer
    # can call the objective with a single positional argument.
    optimization_function = partial(optimize, param_names=param_names, x=X, y=y)
    result = gp_minimize(
        optimization_function,
        dimensions=param_space,
        n_calls=15,
        # n_random_starts is deprecated in scikit-optimize; n_initial_points
        # is its replacement with the same meaning.
        n_initial_points=10,
        verbose=10,
    )
    # result.x holds the best values found, in param_space order.
    best_params = dict(zip(param_names, result.x))

    print(best_params)


# %%
import matplotlib.pyplot as plt

# %%
from skopt.plots import plot_convergence

# Plot the convergence trace of the gp_minimize run; `result` is the value
# assigned from gp_minimize in the script section above.
plot_convergence(result)

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn import model_selection


if __name__ == "__main__":
    # Exhaustive grid search over random-forest hyperparameters.
    df = pd.read_csv("../input/train.csv")

    # Target labels as a plain array; features are all remaining columns.
    y = df.price_range.values
    X = df.drop("price_range", axis=1)

    classifier = RandomForestClassifier(n_jobs=-1)

    # Every combination of these values is evaluated.
    param_grid = dict(
        n_estimators=[100, 200, 300, 400, 500],
        max_depth=[1, 2, 5, 7, 11, 15],
        criterion=["gini", "entropy"],
    )

    model = model_selection.GridSearchCV(
        classifier,
        param_grid,
        scoring="accuracy",
        n_jobs=-1,
        cv=5,
        verbose=10,
    )
    model.fit(X, y)

    print(f"Best score: {model.best_score_}")
    print("best parameters set")

    # Report the full parameter set of the best estimator, ordered by name.
    best_parameters = model.best_estimator_.get_params()
    for param_name, param_value in sorted(best_parameters.items()):
        print(f"{param_name}: {param_value}")

    print("")

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn import model_selection


if __name__ == "__main__":
    # Randomized search: evaluates n_iter sampled combinations instead of
    # the whole grid.
    df = pd.read_csv("../input/train.csv")

    # Split the frame into the feature table and the target label array.
    X = df.drop("price_range", axis=1)
    y = df.price_range.values

    classifier = RandomForestClassifier(n_jobs=-1)

    # Candidate values that the search samples from.
    param_grid = {
        "criterion": ["gini", "entropy"],
        "max_depth": [1, 2, 5, 7, 11, 15],
        "n_estimators": [100, 200, 300, 400, 500],
    }

    search_settings = {
        "estimator": classifier,
        "param_distributions": param_grid,
        "n_iter": 20,
        "scoring": "accuracy",
        "cv": 5,
        "n_jobs": -1,
        "verbose": 10,
    }
    model = model_selection.RandomizedSearchCV(**search_settings)
    model.fit(X, y)

    print(f"Best score: {model.best_score_}")
    print("best parameters set")

    # Print every parameter of the best estimator in alphabetical order.
    best_parameters = model.best_estimator_.get_params()
    for param_name in sorted(best_parameters):
        print(f"{param_name}: {best_parameters[param_name]}")

    print("")

# %%
import numpy as np
import pandas as pd

from functools import partial

from sklearn import ensemble, metrics, model_selection

from hyperopt import hp, fmin, tpe, Trials
from hyperopt.pyll.base import scope

def optimize(params, x, y):
    """Objective for hyperopt: negative mean CV accuracy of a random forest.

    Args:
        params: Mapping of ``RandomForestClassifier`` keyword arguments.
        x: Feature table; indexed with ``.iloc``, so a pandas object is expected.
        y: Target labels; indexed with ``.iloc`` as well.

    Returns:
        The mean accuracy over 5 stratified folds, multiplied by -1 so that a
        minimizer ends up maximizing accuracy.
    """
    forest = ensemble.RandomForestClassifier(**params)
    folds = model_selection.StratifiedKFold(n_splits=5).split(X=x, y=y)

    scores = []
    for fit_rows, eval_rows in folds:
        fit_features, fit_labels = x.iloc[fit_rows].values, y.iloc[fit_rows].values
        eval_features, eval_labels = x.iloc[eval_rows].values, y.iloc[eval_rows].values

        # The same estimator object is refit on each fold.
        forest.fit(fit_features, fit_labels)
        scores.append(
            metrics.accuracy_score(eval_labels, forest.predict(eval_features))
        )

    return -1 * np.mean(scores)

if __name__ == "__main__":
    # Imported locally so this script section stays self-contained.
    from hyperopt import space_eval

    # Tree-structured Parzen Estimator search over random-forest
    # hyperparameters with hyperopt.
    df = pd.read_csv("../input/train.csv")
    # Features are every column except the target variable.
    X = df.drop("price_range", axis=1)
    y = df.price_range

    param_space = {
        # quniform yields floats; scope.int casts them to the integers the
        # estimator expects.
        "max_depth": scope.int(hp.quniform('max_depth', 3, 15, 1)),
        "n_estimators": scope.int(hp.quniform('n_estimators', 100, 1500, 1)),
        "criterion": hp.choice("criterion", ["gini", "entropy"]),
        "max_features": hp.uniform("max_features", 0.01, 1)
    }

    # Bind the data so hyperopt can call the objective with only the
    # sampled parameter dict.
    optimization_function = partial(optimize, x=X, y=y)
    trials = Trials()
    hopt = fmin(
        fn=optimization_function,
        space=param_space,
        algo=tpe.suggest,
        max_evals=15,
        trials=trials,
    )

    # Raw fmin result: hp.choice parameters are reported as an index into
    # their option list, not as the chosen value.
    print(hopt)
    # Decode the raw result back into the actual hyperparameter values
    # (e.g. "gini"/"entropy" instead of 0/1, integer depths).
    print(space_eval(param_space, hopt))

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up

Kaggle Masterに学ぶ機械学習実践アプローチ 写経 08

Kaggle Masterに学ぶ機械学習実践アプローチ写経 08