More than 1 year has passed since last update.

★ニューラルネットーワーク　cnt　

機械

Posted at 2024-07-31

104/104 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step
Validation Mean Squared Error: 2460.65
Validation R^2 Score: 0.92
Validation Mean Absolute Error: 33.47
24/24 ━━━━━━━━━━━━━━━━━━━━ 0s 4ms/step
Test Mean Squared Error: 9220.11
Test R^2 Score: 0.66
Test Mean Absolute Error: 63.23

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import matplotlib.pyplot as plt

データの読み込み

train_data = pd.read_csv('hour_train.csv')
test_data = pd.read_csv('hour_test.csv')

欠損の処理

def fill_missing_hours(df):
mis_indexs = df[df.isnull().any(axis=1)].index # 欠損のある行番号を抜き出す
for mis_index in mis_indexs:
if mis_index > 0:
hour = df.iloc[mis_index - 1]['hr'] + 1
else:
hour = df.iloc[mis_index + 1]['hr'] - 1
if hour == 24:
hour = 0
elif hour == -1:
hour = 23
df.at[mis_index, 'hr'] = hour

fill_missing_hours(train_data)
fill_missing_hours(test_data)

① casualとregisteredの分布の違いを一つの図で表すヒストグラム

plt.figure(figsize=(12, 6))
plt.hist(train_data['casual'], bins=30, alpha=0.5, label='casual', color='skyblue')
plt.hist(train_data['registered'], bins=30, alpha=0.5, label='registered', color='royalblue')
plt.xlabel('Number of Users')
plt.ylabel('Frequency')
plt.title('Distribution of casual and registered users')
plt.legend()
plt.show()

② 指定された変数ごとにcasualとregisteredの平均値を示す折れ線グラフ

variables = ['yr', 'mnth', 'hr', 'holiday', 'workingday', 'weekday', 'temp', 'hum', 'windspeed', 'weathersit']
for var in variables:
plt.figure(figsize=(12, 6))
grouped_data = train_data.groupby(var).mean()
plt.plot(grouped_data.index, grouped_data['casual'], label='casual', marker='o', color='skyblue')
plt.plot(grouped_data.index, grouped_data['registered'], label='registered', marker='o', color='royalblue')
plt.xlabel(var)
plt.ylabel('Average Count')
plt.title(f'Average casual and registered users by {var}')
plt.legend()
plt.grid(True)
plt.show()

移動平均の計算（windowサイズは3に設定していますが、適宜調整してください）

train_data['temp_move_avg'] = train_data['temp'].rolling(window=3, min_periods=1).mean()
train_data['hum_move_avg'] = train_data['hum'].rolling(window=3, min_periods=1).mean()
train_data['windspeed_move_avg'] = train_data['windspeed'].rolling(window=3, min_periods=1).mean()
train_data['weathersit_move_avg'] = train_data['weathersit'].rolling(window=3, min_periods=1).mean()

test_data['temp_move_avg'] = test_data['temp'].rolling(window=3, min_periods=1).mean()
test_data['hum_move_avg'] = test_data['hum'].rolling(window=3, min_periods=1).mean()
test_data['windspeed_move_avg'] = test_data['windspeed'].rolling(window=3, min_periods=1).mean()
test_data['weathersit_move_avg'] = test_data['weathersit'].rolling(window=3, min_periods=1).mean()

新しい特徴量の作成

train_data['hr_8_17_18'] = train_data['hr'].apply(lambda x: 1 if x in [8, 17, 18] else 0)
test_data['hr_8_17_18'] = test_data['hr'].apply(lambda x: 1 if x in [8, 17, 18] else 0)

train_data['hr_7_20'] = train_data['hr'].apply(lambda x: 1 if 7 <= x <= 20 else 0)
test_data['hr_7_20'] = test_data['hr'].apply(lambda x: 1 if 7 <= x <= 20 else 0)

トレーニングデータとテストデータでのダミー変数の列を一致させる

train_columns = set(train_data.columns)
test_columns = set(test_data.columns)
missing_in_test = train_columns - test_columns
missing_in_train = test_columns - train_columns

for col in missing_in_test:
test_data[col] = 0
for col in missing_in_train:
train_data[col] = 0

列を揃えるために並べ替え

train_data = train_data.sort_index(axis=1)
test_data = test_data.sort_index(axis=1)

特徴量とターゲット変数の設定

features = ['yr', 'mnth', 'hr', 'holiday', 'workingday', 'hr_8_17_18', 'hr_7_20',
'temp_move_avg', 'hum_move_avg', 'windspeed_move_avg', 'weathersit_move_avg']
target = 'cnt'

X_train = train_data[features]
y_train = train_data[target]
X_test = test_data[features]
y_test = test_data[target]

データの標準化

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

ニューラルネットワークモデルの作成

model = Sequential([
Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),
Dense(32, activation='relu'),
Dense(1)
])

model.compile(optimizer='adam', loss='mse', metrics=['mae'])

EarlyStoppingの設定

early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

データの分割（トレーニングとバリデーション用）

X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train_scaled, y_train, test_size=0.2, random_state=42)

モデルのトレーニング

history = model.fit(X_train_split, y_train_split,
validation_data=(X_val_split, y_val_split),
epochs=100,
batch_size=32,
callbacks=[early_stopping],
verbose=0)

バリデーションデータでの予測

y_val_pred = model.predict(X_val_split).flatten()

バリデーションデータでの評価

print(f'Validation Mean Squared Error: {mean_squared_error(y_val_split, y_val_pred):.2f}')
print(f'Validation R^2 Score: {r2_score(y_val_split, y_val_pred):.2f}')
print(f'Validation Mean Absolute Error: {mean_absolute_error(y_val_split, y_val_pred):.2f}')

テストデータでの予測

y_test_pred = model.predict(X_test_scaled).flatten()

テストデータの評価

test_mse = mean_squared_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)

print(f'Test Mean Squared Error: {test_mse:.2f}')
print(f'Test R^2 Score: {test_r2:.2f}')
print(f'Test Mean Absolute Error: {test_mae:.2f}')

実測値と予測値の差を日付と時間ごとに可視化

test_data['dteday'] = pd.to_datetime(test_data['dteday']) # dteday列を日付に変換
test_data['datetime'] = test_data.apply(lambda row: pd.Timestamp(year=row['dteday'].year, month=row['dteday'].month, day=row['dteday'].day, hour=row['hr']), axis=1)
test_data['actual'] = y_test
test_data['predicted'] = y_test_pred
test_data['difference'] = test_data['actual'] - test_data['predicted']

日付ごとの平均差分をプロット

daily_diff = test_data.groupby(test_data['datetime'].dt.date)['difference'].mean()
plt.figure(figsize=(12, 6))
daily_diff.plot(kind='bar', color='skyblue')
plt.xlabel('Date')
plt.ylabel('Average Difference')
plt.title('Average Difference between Actual and Predicted Counts by Date')
plt.xticks(rotation=45)
plt.grid(True)
plt.show()

時間ごとの平均差分をプロット

hourly_diff = test_data.groupby(test_data['datetime'].dt.hour)['difference'].mean()
plt.figure(figsize=(12, 6))
hourly_diff.plot(kind='bar', color='royalblue')
plt.xlabel('Hour')
plt.ylabel('Average Difference')
plt.title('Average Difference between Actual and Predicted Counts by Hour')
plt.xticks(rotation=45)
plt.grid(True)
plt.show()

実測値と予測値をプロットする折れ線グラフ

日付ごとの実測値と予測値のプロット

daily_actual = test_data.groupby(test_data['datetime'].dt.date)['actual'].sum()
daily_predicted = test_data.groupby(test_data['datetime'].dt.date)['predicted'].sum()

plt.figure(figsize=(12, 6))
plt.plot(daily_actual.index, daily_actual.values, label='Actual', marker='o', color='skyblue')
plt.plot(daily_predicted.index, daily_predicted.values, label='Predicted', marker='o', color='royalblue')
plt.xlabel('Date')
plt.ylabel('Count')
plt.title('Actual vs Predicted Counts by Date')
plt.xticks(rotation=45)
plt.legend()
plt.grid(True)
plt.show()

時間ごとの実測値と予測値のプロット

hourly_actual = test_data.groupby(test_data['datetime'].dt.hour)['actual'].sum()
hourly_predicted = test_data.groupby(test_data['datetime'].dt.hour)['predicted'].sum()

plt.figure(figsize=(12, 6))
plt.plot(hourly_actual.index, hourly_actual.values, label='Actual', marker='o', color='skyblue')
plt.plot(hourly_predicted.index, hourly_predicted.values, label='Predicted', marker='o', color='royalblue')
plt.xlabel('Hour')
plt.ylabel('Count')
plt.title('Actual vs Predicted Counts by Hour')
plt.xticks(rotation=45)
plt.legend()
plt.grid(True)
plt.show()

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up

★ニューラルネットーワーク cnt