More than 1 year has passed since last update.

★こうばいぶーすてぃんぐ　cnt

機械学習

Posted at 2024-07-31

Cross-Validation Mean Squared Error: 3824.35
Cross-Validation R^2 Score: 0.81
Validation Mean Squared Error: 1953.78
Validation R^2 Score: 0.94
Validation Mean Absolute Error: 28.24
Test Mean Squared Error: 6861.00
Test R^2 Score: 0.75
Test Mean Absolute Error: 51.86

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import lightgbm as lgb
import logging

LightGBMのログ出力をエラーのみ表示

logging.getLogger('lightgbm').setLevel(logging.ERROR)

データの読み込み

train_data = pd.read_csv('hour_train.csv')
test_data = pd.read_csv('hour_test.csv')

欠損の処理

def fill_missing_hours(df):
mis_indexs = df[df.isnull().any(axis=1)].index # 欠損のある行番号を抜き出す
for mis_index in mis_indexs:
if mis_index > 0:
hour = df.iloc[mis_index - 1]['hr'] + 1
else:
hour = df.iloc[mis_index + 1]['hr'] - 1
if hour == 24:
hour = 0
elif hour == -1:
hour = 23
df.at[mis_index, 'hr'] = hour

fill_missing_hours(train_data)
fill_missing_hours(test_data)

① casualとregisteredの分布の違いを一つの図で表すヒストグラム

plt.figure(figsize=(12, 6))
plt.hist(train_data['casual'], bins=30, alpha=0.5, label='casual', color='skyblue')
plt.hist(train_data['registered'], bins=30, alpha=0.5, label='registered', color='royalblue')
plt.xlabel('Number of Users')
plt.ylabel('Frequency')
plt.title('Distribution of casual and registered users')
plt.legend()
plt.show()

② 指定された変数ごとにcasualとregisteredの平均値を示す折れ線グラフ

variables = ['yr', 'mnth', 'hr', 'holiday', 'workingday', 'weekday', 'temp', 'hum', 'windspeed', 'weathersit']
for var in variables:
plt.figure(figsize=(12, 6))
grouped_data = train_data.groupby(var).mean()
plt.plot(grouped_data.index, grouped_data['casual'], label='casual', marker='o', color='skyblue')
plt.plot(grouped_data.index, grouped_data['registered'], label='registered', marker='o', color='royalblue')
plt.xlabel(var)
plt.ylabel('Average Count')
plt.title(f'Average casual and registered users by {var}')
plt.legend()
plt.grid(True)
plt.show()

移動平均の計算（windowサイズは3に設定していますが、適宜調整してください）

train_data['temp_move_avg'] = train_data['temp'].rolling(window=3, min_periods=1).mean()
train_data['hum_move_avg'] = train_data['hum'].rolling(window=3, min_periods=1).mean()
train_data['windspeed_move_avg'] = train_data['windspeed'].rolling(window=3, min_periods=1).mean()
train_data['weathersit_move_avg'] = train_data['weathersit'].rolling(window=3, min_periods=1).mean()

test_data['temp_move_avg'] = test_data['temp'].rolling(window=3, min_periods=1).mean()
test_data['hum_move_avg'] = test_data['hum'].rolling(window=3, min_periods=1).mean()
test_data['windspeed_move_avg'] = test_data['windspeed'].rolling(window=3, min_periods=1).mean()
test_data['weathersit_move_avg'] = test_data['weathersit'].rolling(window=3, min_periods=1).mean()

新しい特徴量の作成

train_data['hr_8_17_18'] = train_data['hr'].apply(lambda x: 1 if x in [8, 17, 18] else 0)
test_data['hr_8_17_18'] = test_data['hr'].apply(lambda x: 1 if x in [8, 17, 18] else 0)

train_data['hr_7_20'] = train_data['hr'].apply(lambda x: 1 if 7 <= x <= 20 else 0)
test_data['hr_7_20'] = test_data['hr'].apply(lambda x: 1 if 7 <= x <= 20 else 0)

トレーニングデータとテストデータでのダミー変数の列を一致させる

train_columns = set(train_data.columns)
test_columns = set(test_data.columns)
missing_in_test = train_columns - test_columns
missing_in_train = test_columns - train_columns

for col in missing_in_test:
test_data[col] = 0
for col in missing_in_train:
train_data[col] = 0

列を揃えるために並べ替え

train_data = train_data.sort_index(axis=1)
test_data = test_data.sort_index(axis=1)

特徴量とターゲット変数の設定

features = ['yr', 'mnth', 'hr', 'holiday', 'workingday', 'hr_8_17_18', 'hr_7_20',
'temp_move_avg', 'hum_move_avg', 'windspeed_move_avg', 'weathersit_move_avg']
target = 'cnt'

X_train = train_data[features]
y_train = train_data[target]

データの標準化

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

LightGBMモデルの作成（verboseを0に設定してログを抑制）

model = lgb.LGBMRegressor(random_state=42, verbose=0)

クロスバリデーションによるモデルの評価

cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='neg_mean_squared_error')
mean_cv_mse = -cv_scores.mean()
mean_cv_r2 = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='r2').mean()

print(f'Cross-Validation Mean Squared Error: {mean_cv_mse:.2f}')
print(f'Cross-Validation R^2 Score: {mean_cv_r2:.2f}')

データの分割（トレーニングとバリデーション用）

X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train_scaled, y_train, test_size=0.2, random_state=42)

モデルのトレーニング

model.fit(X_train_split, y_train_split)

バリデーションデータでの予測

y_val_pred = model.predict(X_val_split)

バリデーションデータでの評価

print(f'Validation Mean Squared Error: {mean_squared_error(y_val_split, y_val_pred):.2f}')
print(f'Validation R^2 Score: {r2_score(y_val_split, y_val_pred):.2f}')
print(f'Validation Mean Absolute Error: {mean_absolute_error(y_val_split, y_val_pred):.2f}')

テストデータでの予測

X_test = test_data[features]
X_test_scaled = scaler.transform(X_test)
y_test_pred = model.predict(X_test_scaled)

テストデータの実際の値

y_test_actual = test_data[target]

テストデータの評価

test_mse = mean_squared_error(y_test_actual, y_test_pred)
test_r2 = r2_score(y_test_actual, y_test_pred)
test_mae = mean_absolute_error(y_test_actual, y_test_pred)

print(f'Test Mean Squared Error: {test_mse:.2f}')
print(f'Test R^2 Score: {test_r2:.2f}')
print(f'Test Mean Absolute Error: {test_mae:.2f}')

変数重要度の取得

importance = model.feature_importances_
importance_df = pd.DataFrame({'feature': features, 'importance': importance})
importance_df = importance_df.sort_values(by='importance', ascending=False)

変数重要度の棒グラフを作成

plt.figure(figsize=(12, 6))
plt.barh(importance_df['feature'], importance_df['importance'], color='royalblue')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.title('Feature Importance')
plt.gca().invert_yaxis()
plt.show()

実測値と予測値の差を日付と時間ごとに可視化

test_data['dteday'] = pd.to_datetime(test_data['dteday']) # dteday列を日付に変換
test_data['datetime'] = test_data.apply(lambda row: pd.Timestamp(year=row['dteday'].year, month=row['dteday'].month, day=row['dteday'].day, hour=row['hr']), axis=1)
test_data['actual'] = y_test_actual
test_data['predicted'] = y_test_pred
test_data['difference'] = test_data['actual'] - test_data['predicted']

日付ごとの平均差分をプロット

daily_diff = test_data.groupby('dteday')['difference'].mean()

plt.figure(figsize=(12, 6))
plt.plot(daily_diff.index, daily_diff.values, marker='o', linestyle='-', color='royalblue')
plt.xlabel('Date')
plt.ylabel('Average Difference')
plt.title('Daily Average Difference between Actual and Predicted Count')
plt.grid(True)
plt.show()

時間帯ごとの平均差分をプロット

hourly_diff = test_data.groupby('hr')['difference'].mean()

plt.figure(figsize=(12, 6))
plt.plot(hourly_diff.index, hourly_diff.values, marker='o', linestyle='-', color='royalblue')
plt.xlabel('Hour of the Day')
plt.ylabel('Average Difference')
plt.title('Hourly Average Difference between Actual and Predicted Count')
plt.grid(True)
plt.show()

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up

★こうばいぶーすてぃんぐ cnt