def _mape(actual, predicted):
    """Return the mean absolute percentage error in percent.

    Terms whose ratio is not finite (zero or NaN actual values) are skipped.
    Returns NaN when no term is usable.
    """
    actual_arr = np.asarray(actual, dtype=float)
    predicted_arr = np.asarray(predicted, dtype=float)
    # Division by zero is expected here; the resulting inf/NaN terms are
    # filtered out right below, so silence the NumPy warnings.
    with np.errstate(divide="ignore", invalid="ignore"):
        ratios = np.abs((actual_arr - predicted_arr) / actual_arr)
    ratios = ratios[np.isfinite(ratios)]
    if ratios.size == 0:
        return float("nan")
    return float((ratios * 100).mean())


def sliding_window_catboost(
    df_training: pd.DataFrame,
    df_validation: pd.DataFrame,
    target_col: str,
    feature_cols: list,
    date_col: str = "datetime",
    train_months: int = 12,
    test_months: int = 1,
    model_save_path: str = "models_list.pkl"
):
    """Train CatBoost models on sliding windows and predict by model average.

    Only weekday rows (Monday to Friday) are used for training and for
    scoring. One model is trained per window; the window start advances by
    one month at a time. The validation prediction is the mean of all
    models' predictions. Rows of ``df_validation`` that fall on a weekend get
    a prediction of 0. Public holidays are NOT detected; only the weekday
    number is checked.

    Args:
        df_training: Data used to build the sliding windows.
        df_validation: Data the averaged models are evaluated on.
        target_col: Name of the target column.
        feature_cols: Names of the feature columns.
        date_col: Name of the date column (anything ``pd.to_datetime``
            accepts).
        train_months: Length of each training window in months.
        test_months: Length of each test window in months.
        model_save_path: File the list of fitted models is pickled to.

    Returns:
        A ``(result_df, metrics_df)`` tuple. ``result_df`` has one row per
        row of ``df_validation`` with the columns ``date_col``, ``"actual"``
        and ``"pred"``. ``metrics_df`` is a single-row frame with
        ``Train_RMSE``, ``Train_MAPE``, ``Validation_RMSE`` and
        ``Validation_MAPE``.

    Raises:
        ValueError: If the training or validation data has no weekday rows,
            or if no window with both training and test rows fits into the
            training data.
    """
    # Convert the dates on copies so the caller's frames are left untouched.
    train_df = df_training.assign(
        **{date_col: pd.to_datetime(df_training[date_col])}
    )
    val_df = df_validation.assign(
        **{date_col: pd.to_datetime(df_validation[date_col])}
    )

    # Weekday rows only for training.
    train_weekday = train_df[train_df[date_col].dt.weekday < 5].sort_values(date_col)
    if train_weekday.empty:
        # Without this guard the window bounds are NaT and the loop below
        # would never terminate.
        raise ValueError("df_training contains no weekday rows to train on")

    val_is_weekday = (val_df[date_col].dt.weekday < 5).to_numpy()
    if not val_is_weekday.any():
        raise ValueError("df_validation contains no weekday rows to evaluate on")

    dates = train_weekday[date_col]
    first_date = dates.min()
    last_date = dates.max()
    step = pd.DateOffset(months=1)

    predictions_list = []
    actual_list = []
    models_list = []

    window_start = first_date
    while True:
        train_end = window_start + pd.DateOffset(months=train_months)
        if train_end > last_date:
            break
        test_end = train_end + pd.DateOffset(months=test_months)

        train = train_weekday[(dates >= window_start) & (dates < train_end)]
        test = train_weekday[(dates >= train_end) & (dates < test_end)]
        window_start += step
        if train.empty or test.empty:
            continue

        model = CatBoostRegressor(iterations=500, learning_rate=0.1, depth=6, verbose=0)
        model.fit(train[feature_cols], train[target_col])

        predictions_list.append(model.predict(test[feature_cols]))
        actual_list.append(test[target_col].values)
        models_list.append(model)

    if not models_list:
        raise ValueError(
            "No sliding window with both training and test rows fits into "
            f"df_training (train_months={train_months}, test_months={test_months})"
        )

    # Averaged prediction over the training windows.
    # NOTE: every window is truncated to the shortest test window and the
    # arrays are averaged position by position, so position i mixes rows
    # from different test periods.
    min_len = min(len(a) for a in predictions_list)
    predictions_mean = np.mean([a[:min_len] for a in predictions_list], axis=0)
    actual_mean = np.mean([a[:min_len] for a in actual_list], axis=0)
    train_rmse = np.sqrt(mean_squared_error(actual_mean, predictions_mean))
    train_mape = _mape(actual_mean, predictions_mean)

    # Save the models.
    with open(model_save_path, "wb") as f:
        pickle.dump(models_list, f)

    # Validation prediction (weekday rows only).
    val_weekday = val_df[val_is_weekday]
    X_val, y_val = val_weekday[feature_cols], val_weekday[target_col]
    val_preds = np.column_stack([m.predict(X_val) for m in models_list])
    val_pred_mean = val_preds.mean(axis=1)
    val_rmse = np.sqrt(mean_squared_error(y_val, val_pred_mean))
    val_mape = _mape(y_val, val_pred_mean)

    # Full-date table: weekday rows get the averaged prediction, all other
    # rows get 0. Predictions are written back by row position rather than
    # merged on the date, so duplicate dates cannot multiply rows.
    result_df = (
        val_df[[date_col, target_col]]
        .rename(columns={target_col: "actual"})
        .reset_index(drop=True)
    )
    pred = np.zeros(len(result_df), dtype=float)
    pred[val_is_weekday] = val_pred_mean
    result_df["pred"] = pred

    metrics_df = pd.DataFrame({
        "Train_RMSE": [train_rmse],
        "Train_MAPE": [train_mape],
        "Validation_RMSE": [val_rmse],
        "Validation_MAPE": [val_mape]
    })
    return result_df, metrics_df
Register as a new user and use Qiita more conveniently
- You get articles that match your needs
- You can efficiently read back useful information
- You can use dark theme