import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb
データの読み込み
train_data = pd.read_csv('hour_train.csv')
test_data = pd.read_csv('hour_test.csv')
① casualとregisteredの分布の違いを一つの図で表すヒストグラム
plt.figure(figsize=(12, 6))
plt.hist(train_data['casual'], bins=30, alpha=0.5, label='casual', color='skyblue')
plt.hist(train_data['registered'], bins=30, alpha=0.5, label='registered', color='royalblue')
plt.xlabel('Number of Users')
plt.ylabel('Frequency')
plt.title('Distribution of casual and registered users')
plt.legend()
plt.show()
② 指定された変数ごとにcasualとregisteredの平均値を示す折れ線グラフ
variables = ['yr', 'mnth', 'hr', 'holiday', 'workingday', 'weekday', 'temp', 'hum', 'windspeed', 'weathersit']
for var in variables:
plt.figure(figsize=(12, 6))
grouped_data = train_data.groupby(var).mean()
plt.plot(grouped_data.index, grouped_data['casual'], label='casual', marker='o', color='skyblue')
plt.plot(grouped_data.index, grouped_data['registered'], label='registered', marker='o', color='royalblue')
plt.xlabel(var)
plt.ylabel('Average Count')
plt.title(f'Average casual and registered users by {var}')
plt.legend()
plt.grid(True)
plt.show()
移動平均の計算(windowサイズは3に設定していますが、適宜調整してください)
train_data['temp_move_avg'] = train_data['temp'].rolling(window=3, min_periods=1).mean()
train_data['hum_move_avg'] = train_data['hum'].rolling(window=3, min_periods=1).mean()
train_data['windspeed_move_avg'] = train_data['windspeed'].rolling(window=3, min_periods=1).mean()
train_data['weathersit_move_avg'] = train_data['weathersit'].rolling(window=3, min_periods=1).mean()
test_data['temp_move_avg'] = test_data['temp'].rolling(window=3, min_periods=1).mean()
test_data['hum_move_avg'] = test_data['hum'].rolling(window=3, min_periods=1).mean()
test_data['windspeed_move_avg'] = test_data['windspeed'].rolling(window=3, min_periods=1).mean()
test_data['weathersit_move_avg'] = test_data['weathersit'].rolling(window=3, min_periods=1).mean()
新しい特徴量の作成
train_data['hr_8_17_18'] = train_data['hr'].apply(lambda x: 1 if x in [8, 17, 18] else 0)
test_data['hr_8_17_18'] = test_data['hr'].apply(lambda x: 1 if x in [8, 17, 18] else 0)
train_data['hr_7_20'] = train_data['hr'].apply(lambda x: 1 if 7 <= x <= 20 else 0)
test_data['hr_7_20'] = test_data['hr'].apply(lambda x: 1 if 7 <= x <= 20 else 0)
トレーニングデータとテストデータでのダミー変数の列を一致させる
train_columns = set(train_data.columns)
test_columns = set(test_data.columns)
missing_in_test = train_columns - test_columns
missing_in_train = test_columns - train_columns
for col in missing_in_test:
test_data[col] = 0
for col in missing_in_train:
train_data[col] = 0
列を揃えるために並べ替え
train_data = train_data.sort_index(axis=1)
test_data = test_data.sort_index(axis=1)
特徴量とターゲット変数の設定
features = ['yr', 'mnth', 'hr', 'holiday', 'workingday', 'hr_8_17_18', 'hr_7_20',
'temp_move_avg', 'hum_move_avg', 'windspeed_move_avg', 'weathersit_move_avg']
target = 'cnt'
X_train = train_data[features]
y_train = train_data[target]
データの標準化
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
LightGBMモデルの作成(verboseを0に設定してログを抑制)
model = lgb.LGBMRegressor(random_state=42, verbose=0)
クロスバリデーションによるモデルの評価
cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='neg_mean_squared_error')
mean_cv_mse = -cv_scores.mean()
mean_cv_r2 = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='r2').mean()
print(f'Cross-Validation Mean Squared Error: {mean_cv_mse:.2f}')
print(f'Cross-Validation R^2 Score: {mean_cv_r2:.2f}')
データの分割(トレーニングとバリデーション用)
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train_scaled, y_train, test_size=0.2, random_state=42)
モデルのトレーニング
model.fit(X_train_split, y_train_split)
バリデーションデータでの予測
y_val_pred = model.predict(X_val_split)
バリデーションデータでの評価
print(f'Validation Mean Squared Error: {mean_squared_error(y_val_split, y_val_pred):.2f}')
print(f'Validation R^2 Score: {r2_score(y_val_split, y_val_pred):.2f}')
print(f'Validation Mean Absolute Error: {mean_absolute_error(y_val_split, y_val_pred):.2f}')
テストデータでの予測
X_test = test_data[features]
X_test_scaled = scaler.transform(X_test)
y_test_pred = model.predict(X_test_scaled)
テストデータの実際の値
y_test_actual = test_data[target]
テストデータの評価
test_mse = mean_squared_error(y_test_actual, y_test_pred)
test_r2 = r2_score(y_test_actual, y_test_pred)
test_mae = mean_absolute_error(y_test_actual, y_test_pred)
print(f'Test Mean Squared Error: {test_mse:.2f}')
print(f'Test R^2 Score: {test_r2:.2f}')
print(f'Test Mean Absolute Error: {test_mae:.2f}')
変数重要度の取得
importance = model.feature_importances_
importance_df = pd.DataFrame({'feature': features, 'importance': importance})
importance_df = importance_df.sort_values(by='importance', ascending=False)
変数重要度の棒グラフを作成
plt.figure(figsize=(12, 6))
plt.barh(importance_df['feature'], importance_df['importance'], color='royalblue')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.title('Feature Importance')
plt.gca().invert_yaxis()
plt.show()
casualとregisteredを別々にモデリング
y_train_casual = train_data['casual']
y_train_registered = train_data['registered']
casualモデルの作成
model_casual = lgb.LGBMRegressor(random_state=42, verbose=0)
cv_scores_casual = cross_val_score(model_casual, X_train_scaled, y_train_casual, cv=5, scoring='neg_mean_squared_error')
mean_cv_mse_casual = -cv_scores_casual.mean()
mean_cv_r2_casual = cross_val_score(model_casual, X_train_scaled, y_train_casual, cv=5, scoring='r2').mean()
print(f'Cross-Validation Mean Squared Error (casual): {mean_cv_mse_casual:.2f}')
print(f'Cross-Validation R^2 Score (casual): {mean_cv_r2_casual:.2f}')
データの分割(トレーニングとバリデーション用)
X_train_split, X_val_split, y_train_split_casual, y_val_split_casual = train_test_split(X_train_scaled, y_train_casual, test_size=0.2, random_state=42)
モデルのトレーニング
model_casual.fit(X_train_split, y_train_split_casual)
バリデーションデータでの予測
y_val_pred_casual = model_casual.predict(X_val_split)
バリデーションデータでの評価
print(f'Validation Mean Squared Error (casual): {mean_squared_error(y_val_split_casual, y_val_pred_casual):.2f}')
print(f'Validation R^2 Score (casual): {r2_score(y_val_split_casual, y_val_pred_casual):.2f}')
print(f'Validation Mean Absolute Error (casual): {mean_absolute_error(y_val_split_casual, y_val_pred_casual):.2f}')
registeredモデルの作成
model_registered = lgb.LGBMRegressor(random_state=42, verbose=0)
cv_scores_registered = cross_val_score(model_registered, X_train_scaled, y_train_registered, cv=5, scoring='neg_mean_squared_error')
mean_cv_mse_registered = -cv_scores_registered.mean()
mean_cv_r2_registered = cross_val_score(model_registered, X_train_scaled, y_train_registered, cv=5, scoring='r2').mean()
print(f'Cross-Validation Mean Squared Error (registered): {mean_cv_mse_registered:.2f}')
print(f'Cross-Validation R^2 Score (registered): {mean_cv_r2_registered:.2f}')
データの分割(トレーニングとバリデーション用)
X_train_split, X_val_split, y_train_split_registered, y_val_split_registered = train_test_split(X_train_scaled, y_train_registered, test_size=0.2, random_state=42)
モデルのトレーニング
model_registered.fit(X_train_split, y_train_split_registered)
バリデーションデータでの予測
y_val_pred_registered = model_registered.predict(X_val_split)
バリデーションデータでの評価
print(f'Validation Mean Squared Error (registered): {mean_squared_error(y_val_split_registered, y_val_pred_registered):.2f}')
print(f'Validation R^2 Score (registered): {r2_score(y_val_split_registered, y_val_pred_registered):.2f}')
print(f'Validation Mean Absolute Error (registered): {mean_absolute_error(y_val_split_registered, y_val_pred_registered):.2f}')
テストデータでの予測
y_test_pred_casual = model_casual.predict(X_test_scaled)
y_test_pred_registered = model_registered.predict(X_test_scaled)
予測の合算
y_test_pred_combined = y_test_pred_casual + y_test_pred_registered
テストデータでの評価
test_mse_combined = mean_squared_error(y_test_actual, y_test_pred_combined)
test_r2_combined = r2_score(y_test_actual, y_test_pred_combined)
test_mae_combined = mean_absolute_error(y_test_actual, y_test_pred_combined)
print(f'Test Mean Squared Error (combined): {test_mse_combined:.2f}')
print(f'Test R^2 Score (combined): {test_r2_combined:.2f}')
print(f'Test Mean Absolute Error (combined): {test_mae_combined:.2f}')
実測値と予測値の差を日付と時間ごとに可視化
test_data['dteday'] = pd.to_datetime(test_data['dteday']) # dteday列を日付に変換
test_data['datetime'] = test_data.apply(lambda row: pd.Timestamp(year=row['dteday'].year, month=row['dteday'].month, day=row['dteday'].day, hour=row['hr']), axis=1)
test_data['actual'] = y_test_actual
test_data['predicted'] = y_test_pred_combined
test_data['difference'] = test_data['actual'] - test_data['predicted']
日付ごとの平均差分をプロット
daily_diff = test_data.groupby(test_data['datetime'].dt.date)['difference'].mean()
plt.figure(figsize=(12, 6))
daily_diff.plot(kind='bar', color='skyblue')
plt.xlabel('Date')
plt.ylabel('Average Difference')
plt.title('Average Difference between Actual and Predicted Counts by Date')
plt.xticks(rotation=45)
plt.grid(True)
plt.show()
時間ごとの平均差分をプロット
hourly_diff = test_data.groupby(test_data['datetime'].dt.hour)['difference'].mean()
plt.figure(figsize=(12, 6))
hourly_diff.plot(kind='bar', color='royalblue')
plt.xlabel('Hour')
plt.ylabel('Average Difference')
plt.title('Average Difference between Actual and Predicted Counts by Hour')
plt.xticks(rotation=45)
plt.grid(True)
plt.show()
実測値と予測値をプロットする折れ線グラフ
日付ごとの実測値と予測値のプロット
daily_actual = test_data.groupby(test_data['datetime'].dt.date)['actual'].sum()
daily_predicted = test_data.groupby(test_data['datetime'].dt.date)['predicted'].sum()
plt.figure(figsize=(12, 6))
plt.plot(daily_actual.index, daily_actual.values, label='Actual', marker='o', color='skyblue')
plt.plot(daily_predicted.index, daily_predicted.values, label='Predicted', marker='o', color='royalblue')
plt.xlabel('Date')
plt.ylabel('Count')
plt.title('Actual vs Predicted Counts by Date')
plt.xticks(rotation=45)
plt.legend()
plt.grid(True)
plt.show()
時間ごとの実測値と予測値のプロット
hourly_actual = test_data.groupby(test_data['datetime'].dt.hour)['actual'].sum()
hourly_predicted = test_data.groupby(test_data['datetime'].dt.hour)['predicted'].sum()
plt.figure(figsize=(12, 6))
plt.plot(hourly_actual.index, hourly_actual.values, label='Actual', marker='o', color='skyblue')
plt.plot(hourly_predicted.index, hourly_predicted.values, label='Predicted', marker='o', color='royalblue')
plt.xlabel('Hour')
plt.ylabel('Count')
plt.title('Actual vs Predicted Counts by Hour')
plt.xticks(rotation=45)
plt.legend()
plt.grid(True)
plt.show()