LoginSignup
1
5

More than 1 year has passed since last update.

回帰 lightGBM 教師あり学習 ソースコード

Posted at

pickleモジュールをload

pickleモジュールについては以下の記事がわかりやすいです!

import pandas as pd
import itertools
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import codecs
import os
import pickle
from datetime import datetime
from datetime import timedelta
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn import datasets, preprocessing
from sklearn.cluster import KMeans

# 非pickle化 merge_dataを取り出す
with open('./output/pre_data/merge_data.pkl', 'rb') as f:
    merge_data = pickle.load(f)
merge_data

AI解析前の処理

X_train,X_test,y_train,y_test,X_val,y_val を作成します
手元にあるデータセットは訓練用、検証用、テスト(最終評価)用に分けます

# Pre-AI processing: build X_train, X_test, y_train, y_test, X_val, y_val
def Detection_before_AI_run(merge_data):
    """Split *merge_data* into train / validation / test sets.

    Two successive shuffled 80/20 splits (both with random_state=42):
    first the full data into train+val vs. test, then the remaining 80%%
    into train vs. validation.

    Parameters
    ----------
    merge_data : pandas.DataFrame
        Must contain a "target" column; every other column is a feature.

    Returns
    -------
    tuple
        (X_train, X_test, y_train, y_test, X_val, y_val) where the X_*
        are DataFrames carrying the original feature column names and
        the y_* are 1-D NumPy arrays.
    """
    def _split_80_20(X, y):
        # test_size is expressed as an absolute row count, exactly as the
        # original code did: len - int(len * 0.8). The split is shuffled
        # (train_test_split default), so the 80/20 boundary is NOT
        # chronological despite the N_train-style bookkeeping.
        n_test = len(X) - int(len(X) * 0.8)
        return train_test_split(X, y, test_size=n_test, random_state=42)

    # Separate the target variable from the features.
    feature_columns = merge_data.drop("target", axis=1).columns
    X = merge_data.drop("target", axis=1).values
    y = merge_data["target"].values

    # 1st split: 80% train+validation / 20% final test.
    X_train, X_test, y_train, y_test = _split_80_20(X, y)
    # 2nd split: of that 80%, keep 80% for training, 20% for validation.
    X_train, X_val, y_train, y_val = _split_80_20(X_train, y_train)

    # Restore the feature names on the NumPy outputs of the splits.
    X_train = pd.DataFrame(X_train, columns=feature_columns)
    X_test = pd.DataFrame(X_test, columns=feature_columns)
    X_val = pd.DataFrame(X_val, columns=feature_columns)
    y_test_df = pd.DataFrame(y_test)

    # Shape / sanity checks (same diagnostics as before).
    print("train shape", X_train.shape)
    print("test shape", X_test.shape)
    print("Xtest", X_test)
    print("validation shape", X_val.shape)
    print("y_train shape", y_train.shape)
    print("y_test shape", y_test.shape)
    print("y_validation shape", y_val.shape)
    print("y_test describe", y_test_df.describe())
    print("ytest", y_test)
    print("not_ y_test describe", (~y_test_df.duplicated()).sum())
    print("y_test_df.duplicated().sum()", y_test_df.duplicated().sum())

    return X_train, X_test, y_train, y_test, X_val, y_val

# Run the split; X_* are DataFrames, y_* are NumPy arrays.
X_train,X_test,y_train,y_test,X_val,y_val=Detection_before_AI_run(merge_data)

機械学習lightGBMでテストの実行

今回実施するのは 回帰の教師あり学習です
"回帰"とは簡単に言うと、"数値を予測すること"です

# Run the regression experiment
def reg_top10_lightGBM(outname, no, random_state_number):
    """Train a LightGBM regressor, evaluate it, and persist the results.

    Relies on the module-level X_train, y_train, X_val, y_val, X_test,
    y_test and merge_data produced by Detection_before_AI_run.

    Parameters
    ----------
    outname : str
        Prefix for every file written under ./output/.
    no : str
        Run suffix appended to column names and file names (e.g. "_1").
    random_state_number : int
        Seed passed to LightGBM for reproducibility.

    Returns
    -------
    tuple
        (importance, shap_values): a DataFrame of gain-based feature
        importances and the SHAP value array for X_test.

    Side effects: saves two SHAP plots (PNG) and two CSV files
    (predictions/metrics and importances) into ./output/.
    """
    import shap
    import lightgbm as lgb
    from sklearn.metrics import mean_squared_error  # model evaluation (MSE)
    from sklearn.metrics import r2_score            # model evaluation (R^2)
    from scipy.stats import spearmanr

    shap.initjs()

    # Build the LightGBM datasets.
    train = lgb.Dataset(X_train, label=y_train)
    valid = lgb.Dataset(X_val, label=y_val)

    # Model parameters.
    params = {'task': 'train',
              'boosting_type': 'gbdt',
              'objective': 'regression',      # objective: regression
              'metric': 'rmse',
              'learning_rate': 0.1,
              "seed": random_state_number}

    # Stop if the validation RMSE does not improve for 200 rounds.
    callbacks = [lgb.early_stopping(200)]
    model = lgb.train(params,
                      train,
                      valid_sets=valid,
                      num_boost_round=3000,
                      callbacks=callbacks)

    # Predict on the held-out test set at the best iteration.
    y_pred = model.predict(X_test, num_iteration=model.best_iteration)

    # SHAP values are computed ONCE and reused for both plots
    # (previously TreeExplainer(...).shap_values(X_test) ran twice).
    shap_values = shap.TreeExplainer(model).shap_values(X_test)

    shap.summary_plot(shap_values, X_test, plot_type="bar", show=False)
    plt.savefig("./output/" + outname + "shap_values_bar.png")
    # Clear the current figure so the dot plot is not drawn on top of
    # the bar plot (previously both plots shared one figure).
    plt.clf()

    shap.summary_plot(shap_values, X_test, plot_type="dot", show=False)
    plt.savefig("./output/" + outname + "shap_values_dot.png")

    # rmse: root of the mean squared error.
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    print('RMSE :', rmse)
    # r2: coefficient of determination.
    r2 = r2_score(y_test, y_pred)
    print('R2 :', r2)
    # Spearman's rank correlation: how monotonically related the
    # predictions are to the true values.
    correlation, pvalue = spearmanr(y_test, y_pred)
    print("correlation", correlation)

    # Persist predictions and metrics (scalar metrics broadcast to rows).
    df_Df = pd.DataFrame({'regression_y_test_' + no: y_test,
                          'regression_y_pred_' + no: y_pred,
                          'RMSE_' + no: rmse,
                          'R2_' + no: r2,
                          "corr" + no: correlation})
    df_Df.to_csv("./output/" + outname + no + '.csv', encoding='shift-jis')

    # Gain-based feature importances, labelled with the feature names.
    importance = pd.DataFrame(model.feature_importance(importance_type='gain'),
                              columns=['importance'])
    importance["columns"] = list(merge_data.drop(["target"], axis=1).columns)
    importance.to_csv("./output/importance" + outname + no + '.csv',
                      encoding='shift-jis')
    return importance, shap_values

# Run one experiment: seed 1, files prefixed "lightGBM_reg_check_data", suffix "_1".
importance,shap_values = reg_top10_lightGBM("lightGBM_reg_check_data","_1",1)

今回のソースコードは importance と shap_values の値を算出し、さらに output フォルダに結果を CSV で保存するものとなっています!

1
5
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
1
5