These are the steps up to submitting to the following competition.
Predict Future Sales
I referred to the following page:
来月の商品の売上数を予測する〜Kaggle Predict Future Salesに挑む(その2)
futuresales01.py
#! /usr/bin/python
#
# futuresales01.py
#
# Feb/23/2020
# --------------------------------------------------------------------------
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# import seaborn as sns
import datetime
# --------------------------------------------------------------------------
def dump_frame_proc(df):
    # Print the number of rows and columns of a DataFrame.
    try:
        print(len(df.index),len(df.columns))
    except Exception as ee:
        sys.stderr.write("*** error *** dump_frame_proc ***\n")
        sys.stderr.write(str(ee) + "\n")
# --------------------------------------------------------------------------
def read_proc():
    items = pd.read_csv('items.csv')
    item_categories = pd.read_csv('item_categories.csv')
    shops = pd.read_csv('shops.csv')
    sales_train = pd.read_csv('sales_train.csv')
    test = pd.read_csv('test.csv')
    #
    print(len(items.index),len(items.columns))
    print(len(item_categories.index),len(item_categories.columns))
    print(len(shops.index),len(shops.columns))
    print(len(sales_train.index),len(sales_train.columns))
    print(len(test.index),len(test.columns))
    dump_frame_proc(test)
    #
    return items,item_categories,shops,sales_train,test
# --------------------------------------------------------------------------
def gen_lag_s4_proc(train,lag_col,lag,set_col_name):
    # Sort by shop_id * item_id * date_block_num and shift the whole frame
    # by `lag` rows.  Because train holds exactly one row per
    # (shop_id, item_id, month), the shifted column is the value of
    # lag_col from `lag` months earlier, renamed to set_col_name.
    df_lag = pd.DataFrame()
    try:
        df_lag = train[['shop_id', 'item_id','date_block_num',
            lag_col]].sort_values(
            ['shop_id', 'item_id','date_block_num'],
            ascending=[True, True,True]
        ).reset_index(drop=True).shift(lag).rename(columns={lag_col: set_col_name})
    except Exception as ee:
        sys.stderr.write("*** gen_lag_s4_proc ***\n")
        sys.stderr.write(str(ee) + "\n")
    #
    return df_lag
# --------------------------------------------------------------------------
def gen_lag_proc(train):
    sys.stderr.write("*** creating lag features *** before ***\n")
    #
    # Clip the monthly sales counts to [0, 20]
    train['mon_shop_item_cnt'] = train['mon_shop_item_cnt'].clip(0,20)
    #
    # Columns to generate lags for
    lag_col_list = ['mon_shop_item_cnt','mon_shop_item_sales']
    # Lag list (1, 3, 6, 9 and 12 months back)
    lag_num_list = [1,3,6,9,12]
    sys.stderr.write("*** sort by shop_id * item_id * date_block_num ***\n")
    # Sort by shop_id * item_id * date_block_num
    train = train.sort_values(
        ['shop_id', 'item_id','date_block_num'],
        ascending=[True, True,True]
    ).reset_index(drop=True)
    sys.stderr.write("*** generating lag features *** middle ***\n")
    # Generate the lag features
    for lag_col in lag_col_list:
        sys.stderr.write("lag_col = " + lag_col + "\n")
        for lag in lag_num_list:
            sys.stderr.write("lag = %d\n" % lag)
            #
            set_col_name = lag_col + '_' + str(lag)
            df_lag = gen_lag_s4_proc(train,lag_col,lag,set_col_name)
            train = pd.concat([train, df_lag[set_col_name]], axis=1)
    #
    sys.stderr.write("*** fill missing values with 0 ***\n")
    # Fill missing values with 0
    train = train.fillna(0)
    #
    sys.stderr.write("*** creating lag features *** after ***\n")
    #
    return train
# --------------------------------------------------------------------------
sys.stderr.write("*** 開始 ***\n")
#
items,item_categories,shops,sales_train,test = read_proc()
#
#
# Split item_category_name on ' - ' to get the big category
item_categories['big_category_name'] = item_categories['item_category_name'].map(lambda x: x.split(' - ')[0])
#
# Merge the two 'Чистые носители' variants into a single big category
item_categories.loc[
    item_categories['big_category_name']=='Чистые носители (штучные)',
    'big_category_name'
] = 'Чистые носители (шпиль)'
# Check how the categories were aggregated
#print(item_categories['big_category_name'].value_counts())
#
shops['city_name'] = shops['shop_name'].map(lambda x: x.split(' ')[0])
shops.loc[shops['city_name']=='!Якутск','city_name'] = 'Якутск'
# Check how the cities were aggregated
#print(shops['city_name'].value_counts())
#
sales_train['date_sales'] = sales_train['item_cnt_day'] * sales_train['item_price']
#
#
sys.stderr.write("*** 月次shop_id*item_id別売上点数 ***\n")
#
# 月次shop_id*item_id別売上点数
mon_shop_item_cnt = sales_train[
['date_block_num','shop_id','item_id','item_cnt_day']
].groupby(
['date_block_num','shop_id','item_id'],
as_index=False
).sum().rename(columns={'item_cnt_day':'mon_shop_item_cnt'})
# Monthly sales amount per shop_id * item_id
mon_shop_item_sales = sales_train[
['date_block_num','shop_id','item_id','date_sales']
].groupby(
['date_block_num','shop_id','item_id'],
as_index=False
).sum().rename(columns={'date_sales':'mon_shop_item_sales'})
#
sys.stderr.write("*** 学習データセットをフルに拡張 ***\n")
#
# 学習データセットをフルに拡張
# 34月*shop_id*item_id
train_full_comb = pd.DataFrame()
for i in range(35):
mid = test[['shop_id','item_id']]
mid['date_block_num'] = i
train_full_comb = pd.concat([train_full_comb,mid],axis=0)
#
sys.stderr.write("*** 月次売上商品数 ***\n")
dump_frame_proc(train_full_comb)
dump_frame_proc(mon_shop_item_cnt)
# print(train_full_comb)
# print(mon_shop_item_cnt)
#
# Join the monthly sales counts onto the full combination
train = pd.DataFrame()
try:
    train = pd.merge(
        train_full_comb,
        mon_shop_item_cnt,
        on=['date_block_num','shop_id','item_id'],
        how='left'
    )
except Exception as ee:
    sys.stderr.write(str(ee) + "\n")
#
dump_frame_proc(train)
dump_frame_proc(mon_shop_item_sales)
sys.stderr.write("*** 月次売上金額 *** before ***\n")
# 月次売上金額
try:
train = pd.merge(
train,
mon_shop_item_sales,
on=['date_block_num','shop_id','item_id'],
how='left'
)
except Exception as ee:
sys.stderr.write(str(ee) + "\n")
#
sys.stderr.write("*** 月次売上金額 *** after ***\n")
dump_frame_proc(train)
sys.stderr.write("*** 学習データにマスタをマージ ***\n")
#
# 学習データにマスタをマージ
# item_idのjoin
train = pd.merge(
train,
items[['item_id','item_category_id']],
on='item_id',
how='left'
)
#
sys.stderr.write("*** item_categry_idのjoin ***\n")
# item_categry_idのjoin
train = pd.merge(
train,
item_categories[['item_category_id','big_category_name']],
on='item_category_id',
how='left'
)
sys.stderr.write("*** shop_idのjoin *** before ***\n")
# shop_idのjoin
train = pd.merge(
train,
shops[['shop_id','city_name']],
on='shop_id',
how='left'
)
#
sys.stderr.write("*** shop_idのjoin *** after ***\n")
try:
    train = gen_lag_proc(train)
except Exception as ee:
    sys.stderr.write("*** gen_lag_proc ***\n")
    sys.stderr.write(str(ee) + "\n")
#
sys.stderr.write("*** 予測モデルの構築 ***\n")
#
# ラグで最大12ヶ月前の売上数を使用するため
train_ = train[(train['date_block_num']<=33) & (train['date_block_num']>=12)].reset_index(drop=True)
test_ = train[train['date_block_num']==34].reset_index(drop=True)
sys.stderr.write("*** モデルに入力する特徴量とターゲット変数に分割 ***\n")
# モデルに入力する特徴量とターゲット変数に分割
train_y = train_['mon_shop_item_cnt']
train_X = train_.drop(columns=['date_block_num','mon_shop_item_cnt', 'mon_shop_item_sales'])
test_X = test_.drop(columns=['date_block_num','mon_shop_item_cnt', 'mon_shop_item_sales'])
#
sys.stderr.write("*** string型の特徴量をLabel Encoding ***\n")
#
from sklearn.preprocessing import LabelEncoder
obj_col_list = ['big_category_name','city_name']
for obj_col in obj_col_list:
le = LabelEncoder()
train_X[obj_col] = pd.DataFrame({obj_col:le.fit_transform(train_X[obj_col])})
test_X[obj_col] = pd.DataFrame({obj_col:le.fit_transform(test_X[obj_col])})
#
sys.stderr.write("*** 予測モデルを当てはめ ***\n")
#
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor()
rfr.fit(train_X,train_y)
#
sys.stderr.write("*** 予測結果を提出 ***\n")
test_y = rfr.predict(test_X)
test_X['item_cnt_month'] = test_y
submission = pd.merge(
    test,
    test_X[['shop_id','item_id','item_cnt_month']],
    on=['shop_id','item_id'],
    how='left'
)
sys.stderr.write("*** 提出ファイル作成 ***\n")
# 提出ファイル作成
submission[['ID','item_cnt_month']].to_csv('submission.csv', index=False)
#
sys.stderr.write("*** 終了 ***\n")
# --------------------------------------------------------------------------
The score was 1.06064.
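For reference, the competition metric is RMSE of the monthly counts clipped to [0, 20]. One way to estimate the score locally before submitting is to hold out month 33 from the frame built above. The following is a minimal sketch under that assumption; validate_on_month_33 is an illustrative helper, not part of futuresales01.py.

import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

def validate_on_month_33(train):
    # Months 12-32 for fitting, month 33 as a pseudo test month.
    fit_ = train[(train['date_block_num'] >= 12) & (train['date_block_num'] <= 32)].reset_index(drop=True)
    val_ = train[train['date_block_num'] == 33].reset_index(drop=True)
    drop_cols = ['date_block_num', 'mon_shop_item_cnt', 'mon_shop_item_sales']
    X_fit = fit_.drop(columns=drop_cols)
    X_val = val_.drop(columns=drop_cols)
    # Label-encode the string features with a shared mapping.
    for col in ['big_category_name', 'city_name']:
        le = LabelEncoder()
        X_fit[col] = le.fit_transform(X_fit[col])
        X_val[col] = le.transform(X_val[col])
    model = RandomForestRegressor()
    model.fit(X_fit, fit_['mon_shop_item_cnt'])
    pred = np.clip(model.predict(X_val), 0, 20)
    return np.sqrt(mean_squared_error(val_['mon_shop_item_cnt'].clip(0, 20), pred))

# Example: print(validate_on_month_33(train)) right before building the submission.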
The run took about an hour and a half on a laptop, with swap enabled.
Without swap, the process gets Killed partway through.
real 86m15.723s
user 32m22.617s
sys 1m14.147s
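If adding swap is not convenient, one way to reduce memory pressure is to downcast the numeric columns of the merged frame before generating the lag features. The following is a minimal sketch; downcast_numeric is an illustrative helper, not part of futuresales01.py.

import pandas as pd

def downcast_numeric(df: pd.DataFrame) -> pd.DataFrame:
    # Shrink 64-bit integer/float columns to the smallest type that fits the values.
    for col in df.select_dtypes(include=['int64']).columns:
        df[col] = pd.to_numeric(df[col], downcast='integer')
    for col in df.select_dtypes(include=['float64']).columns:
        df[col] = pd.to_numeric(df[col], downcast='float')
    return df

# Example: train = downcast_numeric(train) right before calling gen_lag_proc(train).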