import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
# Load the data: the pre-split training and test sets are read from CSV files
# located relative to the current working directory.
train_data = pd.read_csv('hour_train.csv')
test_data = pd.read_csv('hour_test.csv')
# Compute moving averages (window size 3) of the weather-related columns.
# min_periods=1 makes the first rows average over the values available so far
# instead of producing NaN while the window is still filling up.
# NOTE(review): rolling() follows row order, so this presumably expects the
# rows of each CSV to be in chronological order -- confirm.
for frame in (train_data, test_data):
    for source_col in ('temp', 'hum', 'windspeed', 'weathersit'):
        frame[f'{source_col}_move_avg'] = (
            frame[source_col].rolling(window=3, min_periods=1).mean()
        )
# Create new features: hour-of-day indicator columns (1 when the condition
# holds, 0 otherwise), computed with vectorized Series operations.
for frame in (train_data, test_data):
    # 1 for the hours 8, 17 and 18.
    frame['hr_8_17_18'] = frame['hr'].isin([8, 17, 18]).astype(int)
    # 1 for hours from 7 through 20, both ends included.
    frame['hr_7_20'] = frame['hr'].between(7, 20).astype(int)
# Align the (dummy-variable) columns of the training and test frames: any
# column present in only one frame is added to the other, filled with 0.
# NOTE(review): no dummy encoding is visible before this point, so this is a
# no-op when both CSV files share the same schema. If a target column such as
# 'cnt' were missing from one file it would be silently zero-filled here --
# confirm that is acceptable.
train_columns = set(train_data.columns)
test_columns = set(test_data.columns)
missing_in_test = train_columns - test_columns
missing_in_train = test_columns - train_columns
for col in missing_in_test:
    test_data[col] = 0
for col in missing_in_train:
    train_data[col] = 0

# Sort the columns by name so both frames have the same column order.
train_data = train_data.sort_index(axis=1)
test_data = test_data.sort_index(axis=1)
# Define the feature columns and the target variables. The two user types
# ('casual' and 'registered') are kept as separate targets.
features = [
    'yr', 'mnth', 'hr', 'holiday', 'workingday', 'hr_8_17_18', 'hr_7_20',
    'temp_move_avg', 'hum_move_avg', 'windspeed_move_avg', 'weathersit_move_avg',
]
target_casual = 'casual'
target_registered = 'registered'

X_train = train_data[features]
y_train_casual = train_data[target_casual]
y_train_registered = train_data[target_registered]
# Standardize the features. The scaler is fitted on the training features
# only; the fitted instance is kept so it can transform other data later.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
# Neural network model definition.
def build_nn_model(input_dim=None):
    """Build and compile a small fully connected regression network.

    The network has two hidden ReLU layers (64 and 32 units) followed by a
    single linear output unit. It is compiled with the Adam optimizer, mean
    squared error loss and mean absolute error as an additional metric.

    Args:
        input_dim: Number of input features. When omitted, the column count
            of the module-level ``X_train_scaled`` array is used, which is
            what the original zero-argument call did.

    Returns:
        The compiled ``Sequential`` model (untrained).
    """
    if input_dim is None:
        input_dim = X_train_scaled.shape[1]

    model = Sequential()
    # Declare the input shape with an explicit Input object instead of the
    # legacy ``input_dim`` keyword on the first Dense layer. Sequential.layers
    # omits the input placeholder, so ``model.layers[0]`` is still the first
    # Dense layer.
    model.add(tf.keras.Input(shape=(input_dim,)))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1))
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return model


# Train the model for the 'casual' target. validation_split=0.2 holds out
# 20% of the training rows for validation; verbose=0 silences the per-epoch
# log. The returned History object is kept for later inspection.
model_casual = build_nn_model()
history_casual = model_casual.fit(
    X_train_scaled, y_train_casual, epochs=100, validation_split=0.2, verbose=0
)

# Train the model for the 'registered' target with the same settings.
model_registered = build_nn_model()
history_registered = model_registered.fit(
    X_train_scaled, y_train_registered, epochs=100, validation_split=0.2, verbose=0
)
# Prepare the test data: select the same feature columns and scale them with
# the already-fitted scaler (transform only, no refit). The total count 'cnt'
# is the ground truth used for evaluation.
X_test = test_data[features]
X_test_scaled = scaler.transform(X_test)
y_test_actual = test_data['cnt']
# Predict on the test data. With a single-unit output layer predict() yields
# an (n_samples, 1) array; flatten it to 1-D so the predictions line up
# directly with Series targets and can be stored as DataFrame columns.
y_test_pred_casual = model_casual.predict(X_test_scaled).ravel()
y_test_pred_registered = model_registered.predict(X_test_scaled).ravel()
# The total-count prediction is the sum of the two per-user-type predictions.
y_test_pred_total = y_test_pred_casual + y_test_pred_registered
# Evaluate on the test data: compare the summed (casual + registered)
# predictions with the actual total count and report MSE, R^2 and MAE.
test_mse_casual_registered = mean_squared_error(y_test_actual, y_test_pred_total)
test_r2_casual_registered = r2_score(y_test_actual, y_test_pred_total)
test_mae_casual_registered = mean_absolute_error(y_test_actual, y_test_pred_total)

print(f'Test Mean Squared Error (casual + registered): {test_mse_casual_registered:.2f}')
print(f'Test R^2 Score (casual + registered): {test_r2_casual_registered:.2f}')
print(f'Test Mean Absolute Error (casual + registered): {test_mae_casual_registered:.2f}')
# Feature importance is hard to obtain and visualize for a neural network, so
# the mean absolute value of the first-layer weights is used as a stand-in.
def _plot_feature_importance(importance_df, title):
    """Draw a horizontal bar chart of a feature-importance table.

    Args:
        importance_df: DataFrame with 'feature' and 'importance' columns, in
            the order the bars should appear from top to bottom.
        title: Title shown above the chart.
    """
    plt.figure(figsize=(12, 6))
    plt.barh(importance_df['feature'], importance_df['importance'], color='royalblue')
    plt.xlabel('Importance')
    plt.ylabel('Feature')
    plt.title(title)
    # barh places the first row at the bottom; invert the axis so the first
    # (largest) entry ends up at the top.
    plt.gca().invert_yaxis()
    plt.show()


# get_weights()[0] is the weight matrix of the first layer; averaging the
# absolute weights along axis 1 gives one score per entry of `features`.
importance_casual = np.mean(np.abs(model_casual.layers[0].get_weights()[0]), axis=1)
importance_df_casual = pd.DataFrame({'feature': features, 'importance': importance_casual})
importance_df_casual = importance_df_casual.sort_values(by='importance', ascending=False)
_plot_feature_importance(importance_df_casual, 'Feature Importance (casual)')

importance_registered = np.mean(np.abs(model_registered.layers[0].get_weights()[0]), axis=1)
importance_df_registered = pd.DataFrame({'feature': features, 'importance': importance_registered})
importance_df_registered = importance_df_registered.sort_values(by='importance', ascending=False)
_plot_feature_importance(importance_df_registered, 'Feature Importance (registered)')
# Preparation for the actual-vs-predicted line charts: store the predictions
# as columns of the test frame so they can be aggregated with groupby.
# np.ravel() makes each value 1-D whether the prediction array is shaped
# (n,) or (n, 1).
test_data['y_test_pred_casual'] = np.ravel(y_test_pred_casual)
test_data['y_test_pred_registered'] = np.ravel(y_test_pred_registered)
test_data['y_test_pred_total'] = np.ravel(y_test_pred_total)
# Mean actual and predicted values for each hour of the day (casual).
hourly_avg_actual_casual = test_data.groupby('hr')['casual'].mean()
hourly_avg_pred_casual = test_data.groupby('hr')['y_test_pred_casual'].mean()

# Mean actual and predicted values for each hour of the day (registered).
hourly_avg_actual_registered = test_data.groupby('hr')['registered'].mean()
hourly_avg_pred_registered = test_data.groupby('hr')['y_test_pred_registered'].mean()

# Mean actual and predicted values for each hour of the day (total).
hourly_avg_actual_total = test_data.groupby('hr')['cnt'].mean()
hourly_avg_pred_total = test_data.groupby('hr')['y_test_pred_total'].mean()
# Line charts of predicted vs. actual hourly means for casual, registered and
# total counts.
def _plot_hourly_comparison(actual, predicted, segment):
    """Plot actual and predicted hourly means on one line chart.

    Args:
        actual: Series of mean actual counts indexed by hour.
        predicted: Series of mean predicted counts indexed by hour.
        segment: Label inserted into the legend entries and the title
            (e.g. 'casual').
    """
    plt.figure(figsize=(12, 6))
    plt.plot(actual.index, actual, label=f'Actual ({segment})', marker='o', color='royalblue')
    plt.plot(predicted.index, predicted, label=f'Predicted ({segment})', marker='o', color='skyblue')
    plt.xlabel('Hour of the Day')
    plt.ylabel('Average Count')
    plt.title(f'Average Actual and Predicted Count by Hour of the Day ({segment})')
    plt.legend()
    plt.grid(True)
    plt.show()


_plot_hourly_comparison(hourly_avg_actual_casual, hourly_avg_pred_casual, 'casual')
_plot_hourly_comparison(hourly_avg_actual_registered, hourly_avg_pred_registered, 'registered')
_plot_hourly_comparison(hourly_avg_actual_total, hourly_avg_pred_total, 'total')
# Line charts of actual vs. predicted daily means for casual, registered and
# total counts.
def _plot_daily_comparison(actual, predicted, segment):
    """Plot actual and predicted per-date means on one line chart.

    Args:
        actual: Series of mean actual counts indexed by date.
        predicted: Series of mean predicted counts indexed by date.
        segment: Label inserted into the legend entries and the title
            (e.g. 'casual').
    """
    plt.figure(figsize=(12, 6))
    plt.plot(actual.index, actual, label=f'Actual ({segment})', marker='o', color='royalblue')
    plt.plot(predicted.index, predicted, label=f'Predicted ({segment})', marker='o', color='skyblue')
    plt.xlabel('Date')
    plt.ylabel('Average Count')
    plt.title(f'Average Actual and Predicted Count by Date ({segment})')
    plt.legend()
    plt.grid(True)
    plt.show()


# Convert dteday to datetime so the grouped index is a date axis.
test_data['dteday'] = pd.to_datetime(test_data['dteday'])

# Per-date means (casual).
daily_avg_actual_casual = test_data.groupby('dteday')['casual'].mean()
daily_avg_pred_casual = test_data.groupby('dteday')['y_test_pred_casual'].mean()
_plot_daily_comparison(daily_avg_actual_casual, daily_avg_pred_casual, 'casual')

# Per-date means (registered).
daily_avg_actual_registered = test_data.groupby('dteday')['registered'].mean()
daily_avg_pred_registered = test_data.groupby('dteday')['y_test_pred_registered'].mean()
_plot_daily_comparison(daily_avg_actual_registered, daily_avg_pred_registered, 'registered')

# Per-date means (total).
daily_avg_actual_total = test_data.groupby('dteday')['cnt'].mean()
daily_avg_pred_total = test_data.groupby('dteday')['y_test_pred_total'].mean()
_plot_daily_comparison(daily_avg_actual_total, daily_avg_pred_total, 'total')
# (1) Histogram showing the difference between the casual and registered
# distributions in a single figure. alpha=0.5 keeps both overlaid histograms
# visible where they overlap.
plt.figure(figsize=(12, 6))
plt.hist(train_data['casual'], bins=30, alpha=0.5, label='casual', color='skyblue')
plt.hist(train_data['registered'], bins=30, alpha=0.5, label='registered', color='royalblue')
plt.xlabel('Number of Users')
plt.ylabel('Frequency')
plt.title('Distribution of casual and registered users')
plt.legend()
plt.show()
# (2) Line charts of the mean casual and registered counts for each value of
# the listed variables (one figure per variable).
variables = ['yr', 'mnth', 'hr', 'holiday', 'workingday', 'weekday', 'temp', 'hum', 'windspeed', 'weathersit']
for var in variables:
    plt.figure(figsize=(12, 6))
    # Aggregate only the two plotted columns. A frame-wide mean() would also
    # try to average any non-numeric column (for example a date-string column
    # such as 'dteday', if present), which raises TypeError on pandas >= 2.0.
    grouped_data = train_data.groupby(var)[['casual', 'registered']].mean()
    plt.plot(grouped_data.index, grouped_data['casual'], label='casual', marker='o', color='skyblue')
    plt.plot(grouped_data.index, grouped_data['registered'], label='registered', marker='o', color='royalblue')
    plt.xlabel(var)
    plt.ylabel('Average Count')
    plt.title(f'Average casual and registered users by {var}')
    plt.legend()
    plt.grid(True)
    plt.show()