The following are the steps up to making a submission for this competition:
House Prices: Advanced Regression Techniques
I used the following page as a reference:
住宅価格を予測する〜Kaggle House Priceチュートリアルに挑む (Predicting house prices: taking on the Kaggle House Price tutorial)
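train.csv and test.csv are expected to be in the same directory as the script. They can be downloaded from the competition's Data page or, if the Kaggle CLI is already configured, with roughly the following commands (the name of the downloaded zip file is my assumption):

kaggle competitions download -c house-prices-advanced-regression-techniques
unzip house-prices-advanced-regression-techniques.zip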
houseprice01.py
#! /usr/bin/python
#
# houseprice01.py
#
# Feb/23/2020
# --------------------------------------------------------------------------
import sys
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import (
LinearRegression,
Ridge,
Lasso
)
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
#
import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)
# --------------------------------------------------------------------------
# [4]:
def read_proc(file_train,file_test):
    train = pd.read_csv(file_train)
    test = pd.read_csv(file_test)
    # merge the training data and the test data
    train['WhatIsData'] = 'Train'
    test['WhatIsData'] = 'Test'
    test['SalePrice'] = 9999999999   # placeholder SalePrice for the test rows
    alldata = pd.concat([train,test],axis=0,sort=True).reset_index(drop=True)
    print('The size of train is : ' + str(train.shape))
    print('The size of test is : ' + str(test.shape))
    #
    return train,test,alldata
# --------------------------------------------------------------------------
# [6]:
def compensate_proc(alldata):
    na_col_list = alldata.isnull().sum()[alldata.isnull().sum()>0].index.tolist()
    # fill missing values according to the data type:
    #   0 for float columns
    #   'NA' for object columns
    na_float_cols = alldata[na_col_list].dtypes[alldata[na_col_list].dtypes=='float64'].index.tolist() # float64
    na_obj_cols = alldata[na_col_list].dtypes[alldata[na_col_list].dtypes=='object'].index.tolist() # object
    # substitute 0 where a float64 column is missing
    for na_float_col in na_float_cols:
        alldata.loc[alldata[na_float_col].isnull(),na_float_col] = 0.0
    # substitute 'NA' where an object column is missing
    for na_obj_col in na_obj_cols:
        alldata.loc[alldata[na_obj_col].isnull(),na_obj_col] = 'NA'
    #
    # confirm that no missing values remain
    p1 = alldata.isnull().sum()[alldata.isnull().sum()>0].sort_values(ascending=False)
    #
    print(p1)
    #
    return alldata
# --------------------------------------------------------------------------
# [8]:
def categorical_proc(alldata):
    # list the categorical features
    cat_cols = alldata.dtypes[alldata.dtypes=='object'].index.tolist()
    # list the numeric features
    num_cols = alldata.dtypes[alldata.dtypes!='object'].index.tolist()
    # columns needed only for splitting the data and for the submission file
    other_cols = ['Id','WhatIsData']
    # remove those columns from the feature lists
    cat_cols.remove('WhatIsData') # train/test flag
    num_cols.remove('Id')         # Id
    # dummy-encode the categorical variables
    alldata_cat = pd.get_dummies(alldata[cat_cols])
    # put the data back together
    all_data = pd.concat([alldata[other_cols],alldata[num_cols],alldata_cat],axis=1)
    #
    return all_data
# --------------------------------------------------------------------------
# [10]:
def divide_proc(all_data):
    # split the merged data back into training data and test data
    train_ = all_data[all_data['WhatIsData']=='Train'].drop(['WhatIsData','Id'], axis=1).reset_index(drop=True)
    test_ = all_data[all_data['WhatIsData']=='Test'].drop(['WhatIsData','SalePrice'], axis=1).reset_index(drop=True)
    # split the training data into features and (log-transformed) target
    train_x = train_.drop('SalePrice',axis=1)
    train_y = np.log(train_['SalePrice'])
    # separate the Id column from the test features
    test_id = test_['Id']
    test_data = test_.drop('Id',axis=1)
    #
    return train_x,train_y,test_id,test_data
# --------------------------------------------------------------------------
# [12]:
def scaling_proc(train_x,train_y):
    scaler = StandardScaler() # feature scaling
    param_grid = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0] # candidate alpha values
    # the split uses a fixed random_state, so it is identical for every alpha
    X_train, X_test, y_train, y_test = train_test_split(train_x, train_y, test_size=0.3, random_state=0)
    cnt = 0
    for alpha in param_grid:
        ls = Lasso(alpha=alpha) # Lasso regression model
        pipeline = make_pipeline(scaler, ls) # scaling + Lasso pipeline
        pipeline.fit(X_train,y_train)
        train_rmse = np.sqrt(mean_squared_error(y_train, pipeline.predict(X_train)))
        test_rmse = np.sqrt(mean_squared_error(y_test, pipeline.predict(X_test)))
        # keep the alpha with the lowest validation RMSE
        if cnt == 0 or best_score > test_rmse:
            best_score = test_rmse
            best_estimator = pipeline
            best_param = alpha
        cnt = cnt + 1
    print('alpha : ' + str(best_param))
    print('test score is : ' + str(best_score))
    #
    return scaler
# --------------------------------------------------------------------------
# [14]:
def submit_proc(file_submit,scaler,train_x,train_y,test_id,test_data):
    # refit the scaling + Lasso pipeline on the full training data
    ls = Lasso(alpha = 0.01)
    pipeline = make_pipeline(scaler, ls)
    pipeline.fit(train_x,train_y)
    # the model predicts log(SalePrice), so undo the log with np.exp
    test_SalePrice = pd.DataFrame(np.exp(pipeline.predict(test_data)),columns=['SalePrice'])
    test_Id = pd.DataFrame(test_id,columns=['Id'])
    pd.concat([test_Id, test_SalePrice],axis=1).to_csv(file_submit,index=False)
#
# --------------------------------------------------------------------------
sys.stderr.write("*** 開始 ***\n")
#
file_train = "train.csv"
file_test = "test.csv"
file_submit = "house-prices_submit.csv"
#
train,test,alldata = read_proc(file_train,file_test)
#
alldata = compensate_proc(alldata)
#
all_data = categorical_proc(alldata)
#
train_x,train_y,test_id,test_data = divide_proc(all_data)
#
scaler = scaling_proc(train_x,train_y)
#
submit_proc(file_submit,scaler,train_x,train_y,test_id,test_data)
#
sys.stderr.write("*** 終了 ***\n")
# --------------------------------------------------------------------------
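Running the script produces house-prices_submit.csv. The file can be uploaded on the competition's Submit Predictions page or, assuming the Kaggle CLI is configured, with something along these lines (the -m message text is arbitrary):

python houseprice01.py
kaggle competitions submit -c house-prices-advanced-regression-techniques -f house-prices_submit.csv -m "Lasso baseline"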