2
1

Delete article

Deleted articles cannot be recovered.

The draft of this article will also be deleted.

Are you sure you want to delete this article?

More than 3 years have passed since last update.

Kaggle House Prices

Posted at

次の課題に submit するまでの流れです。
House Prices: Advanced Regression Techniques

次のページを参考にしました。
住宅価格を予測する〜Kaggle House Priceチュートリアルに挑む

houseprice01.py
#! /usr/bin/python
#
#	houseprice01.py
#
#					Feb/23/2020
# --------------------------------------------------------------------------
import sys
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import (
    LinearRegression,
    Ridge,
    Lasso
)
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
#
import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)
# --------------------------------------------------------------------------
# [4]:
def read_proc(file_train,file_test):
    """Load the train/test CSV files and return them plus a merged frame.

    Each frame gets a 'WhatIsData' flag marking its origin, and the test
    frame gets a placeholder 'SalePrice' so both share the same columns
    before concatenation.  Prints the shape of each input frame.

    Returns (train, test, alldata).
    """
    frames = {
        'Train': pd.read_csv(file_train),
        'Test': pd.read_csv(file_test),
    }
    # Tag every row with its origin so the merge can be undone later.
    for label, frame in frames.items():
        frame['WhatIsData'] = label
    # Sentinel target value for the test rows (they have no real SalePrice).
    frames['Test']['SalePrice'] = 9999999999
    merged = pd.concat([frames['Train'], frames['Test']], axis=0, sort=True)
    alldata = merged.reset_index(drop=True)
    print('The size of train is : ' + str(frames['Train'].shape))
    print('The size of test is : ' + str(frames['Test'].shape))
    return frames['Train'], frames['Test'], alldata
# --------------------------------------------------------------------------
# [6]:
def compensate_proc(alldata):
    """Impute missing values in place, chosen by column dtype.

    float64 columns: NaN -> 0.0; object columns: NaN -> 'NA'.
    Prints the null counts of any columns that still contain missing
    values afterwards (descending), then returns the mutated frame.
    """
    null_counts = alldata.isnull().sum()
    missing_cols = null_counts[null_counts > 0].index.tolist()
    dtypes = alldata[missing_cols].dtypes
    # Build one column -> fill-value map instead of two separate loops.
    fill_values = {}
    for col in dtypes[dtypes == 'float64'].index:
        fill_values[col] = 0.0
    for col in dtypes[dtypes == 'object'].index:
        fill_values[col] = 'NA'
    for col, value in fill_values.items():
        alldata.loc[alldata[col].isnull(), col] = value
    # Report whatever (if anything) is still missing.
    remaining = alldata.isnull().sum()
    print(remaining[remaining > 0].sort_values(ascending=False))
    return alldata
# --------------------------------------------------------------------------
# [8]:
def categorical_proc(alldata):
    """One-hot encode the categorical columns of the merged frame.

    Partitions columns into categorical (object dtype) and numeric ones,
    keeps the bookkeeping columns 'Id' and 'WhatIsData' aside, dummy-encodes
    the categoricals with pd.get_dummies, and returns the re-assembled frame.
    """
    # Columns needed later for splitting and for the submission file.
    other_cols = ['Id', 'WhatIsData']
    is_object = alldata.dtypes == 'object'
    # Exclude the bookkeeping columns from their respective groups.
    cat_cols = [c for c in alldata.columns[is_object] if c != 'WhatIsData']
    num_cols = [c for c in alldata.columns[~is_object] if c != 'Id']
    dummies = pd.get_dummies(alldata[cat_cols])
    return pd.concat([alldata[other_cols], alldata[num_cols], dummies], axis=1)

# --------------------------------------------------------------------------
# [10]:
def divide_proc(all_data):
    """Split the merged frame back into model-ready pieces.

    Returns (train_x, train_y, test_id, test_data) where train_y is the
    log-transformed SalePrice of the training rows and test_id holds the
    'Id' column needed for the submission file.
    """
    flag = all_data['WhatIsData']
    train_part = (all_data[flag == 'Train']
                  .drop(['WhatIsData', 'Id'], axis=1)
                  .reset_index(drop=True))
    test_part = (all_data[flag == 'Test']
                 .drop(['WhatIsData', 'SalePrice'], axis=1)
                 .reset_index(drop=True))
    # Model the log of the price, as is conventional for this competition.
    train_y = np.log(train_part['SalePrice'])
    train_x = train_part.drop('SalePrice', axis=1)
    return train_x, train_y, test_part['Id'], test_part.drop('Id', axis=1)

# --------------------------------------------------------------------------
# [12]:
def scaling_proc(train_x,train_y):
    """Grid-search the Lasso regularization strength alpha.

    For each alpha in a fixed grid, fits a StandardScaler+Lasso pipeline
    on a 70/30 split of the training data and keeps the alpha with the
    lowest RMSE on the held-out 30%.  Prints the best alpha and its score.

    Returns the (shared) StandardScaler instance, matching the original API.
    """
    scaler = StandardScaler()  # shared by every pipeline below
    param_grid = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
    # The split is deterministic (random_state=0), so hoist it out of the
    # loop instead of recomputing the identical split for every alpha.
    X_train, X_test, y_train, y_test = train_test_split(
        train_x, train_y, test_size=0.3, random_state=0)
    best_score = float('inf')
    best_param = None
    for alpha in param_grid:
        pipeline = make_pipeline(scaler, Lasso(alpha=alpha))
        pipeline.fit(X_train, y_train)
        # Only the held-out RMSE drives model selection; the original also
        # computed a train RMSE but never used it, so it is dropped here.
        test_rmse = np.sqrt(mean_squared_error(y_test, pipeline.predict(X_test)))
        if test_rmse < best_score:
            best_score = test_rmse
            best_param = alpha
    print('alpha : ' + str(best_param))
    print('test score is : ' + str(best_score))
    return scaler
# --------------------------------------------------------------------------
# [14]:
def submit_proc(file_submit,scaler,train_x,train_y,test_data=None,test_id=None):
    """Fit the final Lasso model and write the submission CSV.

    Refits a StandardScaler+Lasso(alpha=0.01) pipeline on the full
    training data, predicts SalePrice for the test rows (undoing the
    log transform with np.exp), and writes 'Id,SalePrice' to file_submit.

    Bug fix: the original read the module globals `test_data` / `test_id`
    without declaring them as parameters.  They are now explicit optional
    keyword arguments; when omitted, the function falls back to the
    module-level globals so the existing call site keeps working.
    """
    if test_data is None:
        test_data = globals()['test_data']  # legacy implicit-global behavior
    if test_id is None:
        test_id = globals()['test_id']
    pipeline = make_pipeline(scaler, Lasso(alpha=0.01))
    pipeline.fit(train_x, train_y)
    preds = pd.DataFrame(np.exp(pipeline.predict(test_data)), columns=['SalePrice'])
    ids = pd.DataFrame(test_id, columns=['Id'])
    pd.concat([ids, preds], axis=1).to_csv(file_submit, index=False)
#

# --------------------------------------------------------------------------
# Script entry point: runs the whole pipeline top to bottom.
# NOTE(review): submit_proc reads the module globals test_id / test_data
# created by the divide_proc call below, so the statement order here matters.
sys.stderr.write("*** 開始 ***\n")
#
file_train = "train.csv"
file_test = "test.csv"
file_submit = "house-prices_submit.csv"
#
# Load the CSVs and merge train+test into one frame.
train,test,alldata = read_proc(file_train,file_test)
#
# Fill missing values (0.0 for float columns, 'NA' for object columns).
alldata = compensate_proc(alldata)
#
# One-hot encode the categorical columns.
all_data = categorical_proc(alldata)
#
# Split back into train features/target and test features/ids.
train_x,train_y,test_id,test_data = divide_proc(all_data)
#
# Pick the best Lasso alpha on a hold-out split (prints the result).
scaler = scaling_proc(train_x,train_y)
#
# Fit the final model and write the submission CSV.
submit_proc(file_submit,scaler,train_x,train_y)
#
sys.stderr.write("*** 終了 ***\n")
# --------------------------------------------------------------------------

スコアは、 0.13110 でした。
houseprices_mar2301.png

2
1
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do by signing up
2
1

Delete article

Deleted articles cannot be recovered.

The draft of this article will also be deleted.

Are you sure you want to delete this article?