pickleモジュールでデータをloadします。
pickleモジュールについては、参考記事(後述のリンク参照)がわかりやすいです!
import pandas as pd
import itertools
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import codecs
import os
import pickle
from datetime import datetime
from datetime import timedelta
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn import datasets, preprocessing
from sklearn.cluster import KMeans
# Un-pickle: restore the preprocessed merge_data DataFrame saved by an
# earlier pipeline step.
# NOTE(review): pickle.load must only be used on trusted files; this one is
# produced locally by the preprocessing notebook, so it is assumed safe.
with open('./output/pre_data/merge_data.pkl', 'rb') as f:
    merge_data = pickle.load(f)
# Notebook-style bare expression: displays the DataFrame when run interactively.
merge_data
AI解析前の処理
X_train,X_test,y_train,y_test,X_val,y_val を作成します
手元にあるデータセットは訓練用、検証用、テスト(最終評価)用に分けます
# Pre-AI processing: create X_train, X_test, y_train, y_test, X_val, y_val.
def Detection_before_AI_run(merge_data):
    """Split *merge_data* into train / validation / test sets.

    The dataset is split twice with a fixed seed:
      1. 80% train+val / 20% test,
      2. the train+val part again 80% train / 20% validation.

    Parameters
    ----------
    merge_data : pandas.DataFrame
        Must contain a "target" column; all other columns are features.

    Returns
    -------
    tuple
        (X_train, X_test, y_train, y_test, X_val, y_val) where the X_* are
        DataFrames carrying the original feature column names and the y_*
        are numpy arrays.
    """
    def _split_80_20(df, X, Y):
        # Hold out 20% of the rows (as an absolute count, matching the
        # original logic); random_state keeps the split reproducible.
        n_test = len(df) - int(len(df) * 0.8)
        return train_test_split(X, Y, test_size=n_test, random_state=42)

    # Separate target from features.
    feature_columns = merge_data.drop("target", axis=1).columns
    X = merge_data.drop("target", axis=1).values
    y = merge_data["target"].values

    # First split: (train + validation) vs. test.
    X_train, X_test, y_train, y_test = _split_80_20(merge_data, X, y)
    X_train = pd.DataFrame(X_train, columns=feature_columns)
    X_test = pd.DataFrame(X_test, columns=feature_columns)

    # Rebuild a DataFrame from the train portion so it can be split again;
    # the y column comes in named 0, so rename it back to "target".
    train_part = pd.concat([pd.DataFrame(y_train), X_train], axis=1)
    train_part = train_part.rename(columns={0: "target"})
    X2 = train_part.drop("target", axis=1).values
    y2 = train_part["target"].values

    # Second split: train vs. validation.
    X_train, X_val, y_train, y_val = _split_80_20(train_part, X2, y2)
    X_train = pd.DataFrame(X_train, columns=feature_columns)
    X_val = pd.DataFrame(X_val, columns=feature_columns)

    # Shape / sanity checks (same diagnostic output as before).
    y_test_df = pd.DataFrame(y_test)
    print("train shape", X_train.shape)
    print("test shape", X_test.shape)
    print("Xtest", X_test)
    print("validation shape", X_val.shape)
    print("y_train shape", y_train.shape)
    print("y_test shape", y_test.shape)
    print("y_validation shape", y_val.shape)
    print("y_test describe", y_test_df.describe())
    print("ytest", y_test)
    print("not_ y_test describe", (~y_test_df.duplicated()).sum())
    print("y_test_df.duplicated().sum()", y_test_df.duplicated().sum())
    return X_train, X_test, y_train, y_test, X_val, y_val
X_train,X_test,y_train,y_test,X_val,y_val=Detection_before_AI_run(merge_data)
機械学習(lightGBM)でテストの実行
今回実施するのは、回帰の教師あり学習です。
「回帰」とは簡単に言うと、「数値を予測すること」です。
# Run the regression test.
def reg_top10_lightGBM(outname, no, random_state_number):
    """Train a LightGBM regressor, evaluate it, and save SHAP plots / CSVs.

    Uses the module-level X_train, y_train, X_val, y_val, X_test, y_test
    and merge_data created by Detection_before_AI_run.

    Parameters
    ----------
    outname : str
        Prefix for the files written under ./output/.
    no : str
        Suffix tag appended to column names and file names (e.g. "_1").
    random_state_number : int
        Seed passed to LightGBM.

    Returns
    -------
    tuple
        (importance, shap_values): a DataFrame of gain importances per
        feature, and the SHAP value matrix for X_test.
    """
    # SHAP for model explanation.
    import shap
    shap.initjs()
    import lightgbm as lgb

    # Build the LightGBM datasets.
    train = lgb.Dataset(X_train, label=y_train)
    valid = lgb.Dataset(X_val, label=y_val)

    # Model parameters.
    params = {'task': 'train',
              'boosting_type': 'gbdt',
              'objective': 'regression',  # objective: regression
              'metric': 'rmse',
              'learning_rate': 0.1,
              "seed": random_state_number}
    callbacks = [
        lgb.early_stopping(200), ]

    # Train the model.
    model = lgb.train(params,
                      train,
                      valid_sets=valid,
                      num_boost_round=3000,
                      callbacks=callbacks, )

    # Predict on the held-out test set using the best iteration.
    y_pred = model.predict(X_test, num_iteration=model.best_iteration)

    # SHAP values: compute once and reuse for both plots (the original
    # recomputed them, doubling the TreeExplainer cost).
    shap_values = shap.TreeExplainer(model).shap_values(X_test)
    shap.summary_plot(shap_values, X_test, plot_type="bar", show=False)
    plt.savefig("./output/" + outname + "shap_values_bar.png")
    # Close the figure so the dot plot is not drawn over the bar plot
    # and figures do not accumulate across calls.
    plt.close()
    shap.summary_plot(shap_values, X_test, plot_type="dot", show=False)
    plt.savefig("./output/" + outname + "shap_values_dot.png")
    plt.close()

    from sklearn.metrics import mean_squared_error  # model evaluation (MSE)
    from sklearn.metrics import r2_score  # model evaluation (R^2)

    # Model evaluation.
    # rmse: square root of the mean squared error.
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    print('RMSE :', rmse)
    # r2: coefficient of determination.
    r2 = r2_score(y_test, y_pred)
    print('R2 :', r2)

    # Spearman's rank correlation measures how well the relationship between
    # two variables can be described by a monotonic (rank-ordered) function.
    from scipy.stats import spearmanr
    correlation, pvalue = spearmanr(y_test, y_pred)
    print("correlation", correlation)

    # Persist predictions and metrics.
    df_Df = pd.DataFrame({'regression_y_test_'+no: y_test, 'regression_y_pred_'+no: y_pred, 'RMSE_'+no: rmse, 'R2_'+no: r2, "corr"+no: correlation})
    df_Df.to_csv(r""+"./output/"+outname+no+'.csv', encoding='shift-jis')

    # Gain-based feature importance, labelled with the feature column names.
    importance = pd.DataFrame(model.feature_importance(importance_type='gain'), columns=['importance'])
    C_you = merge_data.drop(["target"], axis=1)
    importance["columns"] = list(C_you.columns)
    importance.to_csv(r""+"./output/importance"+outname+no+'.csv', encoding='shift-jis')
    return importance, shap_values
importance,shap_values = reg_top10_lightGBM("lightGBM_reg_check_data","_1",1)
今回のソースコードは、
importance, shap_values の値を算出し、
さらに output フォルダに結果を csv で保存するものとなっています!