0
3

Delete article

Deleted articles cannot be recovered.

Draft of this article would be also deleted.

Are you sure you want to delete this article?

More than 3 years have passed since last update.

回帰分析の方法

Last updated at Posted at 2020-05-24

#ボストンの住宅価格のデータのロード

from sklearn.datasets import load_boston
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

boston = load_boston()

df = pd.DataFrame(boston["data"], columns = boston["feature_names"])

df["PRICE"] = boston["target"]

df.head()

#サイキットラーンで実装(ハイパーパラメーターは適当)

#サイキットラーンを使うやり方
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn import svm
from sklearn.ensemble import GradientBoostingRegressor

#インプットデータ
X = df.drop("PRICE", axis=1)
Y = df["PRICE"]

#トレインデータとテストデータに分割
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

#値を標準化
sc = StandardScaler()
X_train__std = sc.fit_transform(X_train)
Y_train_std = sc.fit_transform(Y_train.values.reshape(-1,1))
X_test_std = sc.transform(X_test)
Y_test_std = sc.transform(Y_test.values.reshape(-1, 1))


#線形回帰
print("***線形回帰***")
model_linear = LinearRegression()
model_linear.fit(X_train, Y_train)
print("訓練データの相関係数:", model_linear.score(X_train, Y_train))
print("検証データの相関係数:", model_linear.score(X_test, Y_test))
Y_train_pred = model_linear.predict(X_train)
Y_test_pred = model_linear.predict(X_test)
plt.scatter(Y_train,Y_train_pred, label = "train_data")
plt.scatter(Y_test,Y_test_pred, label = "test_data")
plt.legend()
plt.show()

#線形カーネルのSVM回帰
print("***SVM回帰***")
#正則化パラメーター=1、線形カーネルを使用
model_svm = svm.SVR(C=1.0, kernel='linear', epsilon=0.1)
model_svm.fit(X_train, Y_train)
print("訓練データの相関係数:", model_svm.score(X_train, Y_train))
print("検証データの相関係数:", model_svm.score(X_test, Y_test))
Y_train_pred = model_svm.predict(X_train)
Y_test_pred = model_svm.predict(X_test)
plt.scatter(Y_train,Y_train_pred, label = "train_data")
plt.scatter(Y_test,Y_test_pred, label = "test_data")
plt.legend()
plt.show()

#リッジ回帰
print("***リッジ回帰***")
model_ridge = Ridge(alpha=1.0, fit_intercept=True, 
                           normalize=False, copy_X=True, 
                           max_iter=None, tol=0.001, random_state=0)
model_ridge.fit(X_train, Y_train)
print("訓練データの相関係数:", model_ridge.score(X_train, Y_train))
print("検証データの相関係数:", model_ridge.score(X_test, Y_test))
Y_train_pred = model_ridge.predict(X_train)
Y_test_pred = model_ridge.predict(X_test)
plt.scatter(Y_train,Y_train_pred, label = "train_data")
plt.scatter(Y_test,Y_test_pred, label = "test_data")
plt.legend()
plt.show()

#ラッソ回帰
print("***ラッソ回帰***")
model_lasso = Lasso(alpha=1.0, fit_intercept=True, 
                           normalize=False, copy_X=True, 
                           max_iter=1000, tol=0.0001, 
                           warm_start=False, positive=False, 
                           random_state=None, selection="cyclic")
model_lasso.fit(X_train, Y_train)
print("訓練データの相関係数:", model_lasso.score(X_train, Y_train))
print("検証データの相関係数:", model_lasso.score(X_test, Y_test))
Y_train_pred = model_lasso.predict(X_train)
Y_test_pred = model_lasso.predict(X_test)
plt.scatter(Y_train,Y_train_pred, label = "train_data")
plt.scatter(Y_test,Y_test_pred, label = "test_data")
plt.legend()
plt.show()

#エラスティックネット回帰
print("***エラスティックネット回帰***")
model_lasso_elasticnet = ElasticNet(alpha=1.0, l1_ratio=0.5, 
                                fit_intercept=True, normalize=False, 
                                max_iter=1000, copy_X=True, 
                                tol=0.0001, warm_start=False, 
                                positive=False, random_state=None, 
                                selection='cyclic')
model_lasso_elasticnet.fit(X_train, Y_train)
print("訓練データの相関係数:", model_lasso_elasticnet.score(X_train, Y_train))
print("検証データの相関係数:", model_lasso_elasticnet.score(X_test, Y_test))
Y_train_pred = model_lasso_elasticnet.predict(X_train)
Y_test_pred = model_lasso_elasticnet.predict(X_test)
plt.scatter(Y_train,Y_train_pred, label = "train_data")
plt.scatter(Y_test,Y_test_pred, label = "test_data")
plt.legend()
plt.show()

#ランダムフォレスト回帰
print("***ランダムフォレスト回帰***")
model_randomforest = RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
           oob_score=False, random_state=2525, verbose=0, warm_start=False)
model_randomforest.fit(X_train, Y_train)
print("訓練データの相関係数:", model_randomforest.score(X_train, Y_train))
print("検証データの相関係数:", model_randomforest.score(X_test, Y_test))
Y_train_pred = model_randomforest.predict(X_train)
Y_test_pred = model_randomforest.predict(X_test)
plt.scatter(Y_train,Y_train_pred, label = "train_data")
plt.scatter(Y_test,Y_test_pred, label = "test_data")
plt.legend()
plt.show()

#勾配ブースティング回帰
print("勾配ブースティング回帰")
model_gbc = GradientBoostingRegressor(random_state=0)
model_gbc.fit(X_train, Y_train)
print("訓練データの相関係数:", model_gbc.score(X_train, Y_train))
print("検証データの相関係数:", model_gbc.score(X_test, Y_test))
Y_train_pred = model_gbc.predict(X_train)
Y_test_pred = model_gbc.predict(X_test)
plt.scatter(Y_train,Y_train_pred, label = "train_data")
plt.scatter(Y_test,Y_test_pred, label = "test_data")
plt.legend()
plt.show()

#多項式(2次、3次など)の項を作成する方法


from sklearn.preprocessing import PolynomialFeatures
df1 = pd.DataFrame([[1,2,3], [4,5,6], [7,8,9]] ,columns=["col_a", "col_b", "col_c"])
print(df1)
pf = PolynomialFeatures(degree=2, include_bias=False)
df2 = pd.DataFrame(pf.fit_transform(a), columns = pf.get_feature_names(a.columns))
print(df2)

#評価方法


#平均二乗誤差(残差の二乗の合計)
from sklearn.metrics import mean_squared_error
mean_squared_error(y, y_pred)

#決定係数(0.0-1.0の間でモデルの当てはまりの良さを評価)
#ただし、負の決定係数が出るケースもある。当てはまりの悪さを表している
from sklearn.metrics import r2_score
r2_score(y, y_pred)

#クロスバリデーション
・scoringに"f1"を指定すると、F値で評価してくれる


from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
cv = KFold(5, shuffle=True)
model_rfc_1 = RandomForestClassifier()
cross_val_score(model_rfc_1, X, y, cv=cv, scoring='accuracy')

#グリッドサーチ


from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
cv = KFold(5, shuffle=True)
param_grid = {'max_depth': [5, 10, 15], 'n_estimators': [10, 20, 30]}
model_rfc_2 = RandomForestClassifier()
grid_search = GridSearchCV(model_rfc_2, param_grid, cv=cv, scoring='accuracy')
grid_search.fit(X, y)
0
3
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
0
3

Delete article

Deleted articles cannot be recovered.

Draft of this article would be also deleted.

Are you sure you want to delete this article?