# Load the Boston housing price dataset into a DataFrame with a PRICE column.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

try:
    # FIX: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
    # so guard the import and fall back to the OpenML copy of the dataset.
    from sklearn.datasets import load_boston
    boston = load_boston()
    df = pd.DataFrame(boston["data"], columns=boston["feature_names"])
    df["PRICE"] = boston["target"]
except ImportError:
    from sklearn.datasets import fetch_openml
    # Same dataset hosted on OpenML; its target column is named MEDV.
    boston = fetch_openml(name="boston", version=1, as_frame=True)
    df = boston.frame.astype(float).rename(columns={"MEDV": "PRICE"})
df.head()
#サイキットラーンで実装(ハイパーパラメーターは適当)
#サイキットラーンを使うやり方
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn import svm
from sklearn.ensemble import GradientBoostingRegressor
# Input features (X) and target (Y)
X = df.drop("PRICE", axis=1)
Y = df["PRICE"]
# Split into training and test data (70% / 30%)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
# Standardize the values.
# BUG FIX: the original refit the SAME StandardScaler on Y_train after fitting
# it on X_train, so X_test/Y_test were then transformed with Y's mean/std.
# Use one scaler for the features and a separate one for the target.
sc = StandardScaler()    # feature scaler
sc_y = StandardScaler()  # target scaler
X_train_std = sc.fit_transform(X_train)  # also fixes the "X_train__std" typo
X_test_std = sc.transform(X_test)
Y_train_std = sc_y.fit_transform(Y_train.values.reshape(-1, 1))
Y_test_std = sc_y.transform(Y_test.values.reshape(-1, 1))
# Ordinary least-squares linear regression (fit on the unstandardized data)
print("***線形回帰***")
model_linear = LinearRegression()
model_linear.fit(X_train, Y_train)
# NOTE: .score() returns the coefficient of determination (R^2)
for label, features, target in (("訓練データの相関係数:", X_train, Y_train),
                                ("検証データの相関係数:", X_test, Y_test)):
    print(label, model_linear.score(features, target))
Y_train_pred = model_linear.predict(X_train)
Y_test_pred = model_linear.predict(X_test)
# Scatter of actual vs. predicted prices for both splits
for actual, predicted, tag in ((Y_train, Y_train_pred, "train_data"),
                               (Y_test, Y_test_pred, "test_data")):
    plt.scatter(actual, predicted, label=tag)
plt.legend()
plt.show()
# Support-vector regression with a linear kernel
print("***SVM回帰***")
# Regularization parameter C=1.0, linear kernel, epsilon-tube of 0.1
model_svm = svm.SVR(kernel='linear', C=1.0, epsilon=0.1)
model_svm.fit(X_train, Y_train)
# .score() is the coefficient of determination (R^2)
train_score = model_svm.score(X_train, Y_train)
test_score = model_svm.score(X_test, Y_test)
print("訓練データの相関係数:", train_score)
print("検証データの相関係数:", test_score)
Y_train_pred = model_svm.predict(X_train)
Y_test_pred = model_svm.predict(X_test)
# Actual vs. predicted scatter for both splits
plt.scatter(Y_train, Y_train_pred, label="train_data")
plt.scatter(Y_test, Y_test_pred, label="test_data")
plt.legend()
plt.show()
# Ridge regression (L2-regularized linear regression)
print("***リッジ回帰***")
# FIX: the `normalize` argument was deprecated in scikit-learn 1.0 and removed
# in 1.2. The original passed normalize=False, which was the default, so
# dropping it keeps behavior identical while working on current versions.
model_ridge = Ridge(alpha=1.0, fit_intercept=True, copy_X=True,
                    max_iter=None, tol=0.001, random_state=0)
model_ridge.fit(X_train, Y_train)
# .score() is the coefficient of determination (R^2)
print("訓練データの相関係数:", model_ridge.score(X_train, Y_train))
print("検証データの相関係数:", model_ridge.score(X_test, Y_test))
Y_train_pred = model_ridge.predict(X_train)
Y_test_pred = model_ridge.predict(X_test)
plt.scatter(Y_train, Y_train_pred, label="train_data")
plt.scatter(Y_test, Y_test_pred, label="test_data")
plt.legend()
plt.show()
# Lasso regression (L1-regularized linear regression)
print("***ラッソ回帰***")
# FIX: `normalize` was deprecated in scikit-learn 1.0 and removed in 1.2;
# the original's normalize=False was the default, so dropping it is safe.
model_lasso = Lasso(alpha=1.0, fit_intercept=True, copy_X=True,
                    max_iter=1000, tol=0.0001,
                    warm_start=False, positive=False,
                    random_state=None, selection="cyclic")
model_lasso.fit(X_train, Y_train)
# .score() is the coefficient of determination (R^2)
print("訓練データの相関係数:", model_lasso.score(X_train, Y_train))
print("検証データの相関係数:", model_lasso.score(X_test, Y_test))
Y_train_pred = model_lasso.predict(X_train)
Y_test_pred = model_lasso.predict(X_test)
plt.scatter(Y_train, Y_train_pred, label="train_data")
plt.scatter(Y_test, Y_test_pred, label="test_data")
plt.legend()
plt.show()
# Elastic-net regression (mixed L1/L2 regularization; l1_ratio=0.5 blends both)
print("***エラスティックネット回帰***")
# FIX: `normalize` was deprecated in scikit-learn 1.0 and removed in 1.2;
# the original's normalize=False was the default, so dropping it is safe.
model_lasso_elasticnet = ElasticNet(alpha=1.0, l1_ratio=0.5,
                                    fit_intercept=True,
                                    max_iter=1000, copy_X=True,
                                    tol=0.0001, warm_start=False,
                                    positive=False, random_state=None,
                                    selection='cyclic')
model_lasso_elasticnet.fit(X_train, Y_train)
# .score() is the coefficient of determination (R^2)
print("訓練データの相関係数:", model_lasso_elasticnet.score(X_train, Y_train))
print("検証データの相関係数:", model_lasso_elasticnet.score(X_test, Y_test))
Y_train_pred = model_lasso_elasticnet.predict(X_train)
Y_test_pred = model_lasso_elasticnet.predict(X_test)
plt.scatter(Y_train, Y_train_pred, label="train_data")
plt.scatter(Y_test, Y_test_pred, label="test_data")
plt.legend()
plt.show()
# Random-forest regression
print("***ランダムフォレスト回帰***")
# FIXES for arguments removed in current scikit-learn (all behavior-identical):
#  - criterion='mse' was renamed to 'squared_error' (old name removed in 1.2)
#  - max_features='auto' (removed in 1.3) meant all features for regressors,
#    i.e. max_features=1.0
#  - min_impurity_split was removed in 1.0 (the original passed None, a no-op)
model_randomforest = RandomForestRegressor(bootstrap=True, criterion='squared_error',
                                           max_depth=None,
                                           max_features=1.0, max_leaf_nodes=None,
                                           min_impurity_decrease=0.0,
                                           min_samples_leaf=1, min_samples_split=2,
                                           min_weight_fraction_leaf=0.0,
                                           n_estimators=10, n_jobs=-1,
                                           oob_score=False, random_state=2525,
                                           verbose=0, warm_start=False)
model_randomforest.fit(X_train, Y_train)
# .score() is the coefficient of determination (R^2)
print("訓練データの相関係数:", model_randomforest.score(X_train, Y_train))
print("検証データの相関係数:", model_randomforest.score(X_test, Y_test))
Y_train_pred = model_randomforest.predict(X_train)
Y_test_pred = model_randomforest.predict(X_test)
plt.scatter(Y_train, Y_train_pred, label="train_data")
plt.scatter(Y_test, Y_test_pred, label="test_data")
plt.legend()
plt.show()
# Gradient-boosting regression (all hyper-parameters left at their defaults)
print("勾配ブースティング回帰")
model_gbc = GradientBoostingRegressor(random_state=0)
model_gbc.fit(X_train, Y_train)
# .score() is the coefficient of determination (R^2)
evaluations = (("訓練データの相関係数:", X_train, Y_train),
               ("検証データの相関係数:", X_test, Y_test))
for label, features, target in evaluations:
    print(label, model_gbc.score(features, target))
Y_train_pred = model_gbc.predict(X_train)
Y_test_pred = model_gbc.predict(X_test)
# Actual vs. predicted scatter for both splits
for actual, predicted, tag in ((Y_train, Y_train_pred, "train_data"),
                               (Y_test, Y_test_pred, "test_data")):
    plt.scatter(actual, predicted, label=tag)
plt.legend()
plt.show()
# How to generate polynomial feature terms (squares, pairwise products, etc.)
from sklearn.preprocessing import PolynomialFeatures
df1 = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["col_a", "col_b", "col_c"])
print(df1)
pf = PolynomialFeatures(degree=2, include_bias=False)
# FIXES: the original referenced an undefined name `a` (the demo frame is df1),
# and get_feature_names() was removed in scikit-learn 1.2 — use
# get_feature_names_out() instead.
df2 = pd.DataFrame(pf.fit_transform(df1),
                   columns=pf.get_feature_names_out(df1.columns))
print(df2)
# Evaluation metrics
# Mean squared error (mean of the squared residuals)
from sklearn.metrics import mean_squared_error
# FIX: the original called the metrics on undefined names `y`/`y_pred`;
# evaluate the most recent test-set predictions instead.
print("MSE:", mean_squared_error(Y_test, Y_test_pred))
# Coefficient of determination (goodness of fit, usually within 0.0-1.0;
# it can also be negative, which indicates a very poor fit)
from sklearn.metrics import r2_score
print("R2:", r2_score(Y_test, Y_test_pred))
# Cross-validation
# NOTE: passing scoring="f1" evaluates with the F1 score instead of accuracy.
# FIXES: the original had a bare bullet line here (SyntaxError), used
# RandomForestClassifier before its import further down, and referenced an
# undefined lowercase `y`.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
# The regression target is continuous; derive a binary "expensive" label from
# the median price so the classification example actually runs on this data.
y = (Y > Y.median()).astype(int)
cv = KFold(5, shuffle=True)
model_rfc_1 = RandomForestClassifier()
print(cross_val_score(model_rfc_1, X, y, cv=cv, scoring='accuracy'))
# Grid search over random-forest hyper-parameters
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
# FIX: the original fit on an undefined lowercase `y` (the target here is the
# continuous `Y`); derive a binary label from the median price so the
# classification example actually runs.
y = (Y > Y.median()).astype(int)
cv = KFold(5, shuffle=True)
param_grid = {'max_depth': [5, 10, 15], 'n_estimators': [10, 20, 30]}
model_rfc_2 = RandomForestClassifier()
grid_search = GridSearchCV(model_rfc_2, param_grid, cv=cv, scoring='accuracy')
grid_search.fit(X, y)