
Completed 1


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb

Loading the data

train_data = pd.read_csv('hour_train.csv')
test_data = pd.read_csv('hour_test.csv')

① A histogram showing the distributions of casual and registered users in a single figure

plt.figure(figsize=(12, 6))
plt.hist(train_data['casual'], bins=30, alpha=0.5, label='casual', color='skyblue')
plt.hist(train_data['registered'], bins=30, alpha=0.5, label='registered', color='royalblue')
plt.xlabel('Number of Users')
plt.ylabel('Frequency')
plt.title('Distribution of casual and registered users')
plt.legend()
plt.show()

② Line plots showing the mean casual and registered counts for each specified variable

variables = ['yr', 'mnth', 'hr', 'holiday', 'workingday', 'weekday', 'temp', 'hum', 'windspeed', 'weathersit']
for var in variables:
    plt.figure(figsize=(12, 6))
    # Select only the two target columns so non-numeric columns (e.g. dteday) don't break the mean
    grouped_data = train_data.groupby(var)[['casual', 'registered']].mean()
    plt.plot(grouped_data.index, grouped_data['casual'], label='casual', marker='o', color='skyblue')
    plt.plot(grouped_data.index, grouped_data['registered'], label='registered', marker='o', color='royalblue')
    plt.xlabel(var)
    plt.ylabel('Average Count')
    plt.title(f'Average casual and registered users by {var}')
    plt.legend()
    plt.grid(True)
    plt.show()

Calculating moving averages (the window size is set to 3 here; adjust it as needed)

train_data['temp_move_avg'] = train_data['temp'].rolling(window=3, min_periods=1).mean()
train_data['hum_move_avg'] = train_data['hum'].rolling(window=3, min_periods=1).mean()
train_data['windspeed_move_avg'] = train_data['windspeed'].rolling(window=3, min_periods=1).mean()
train_data['weathersit_move_avg'] = train_data['weathersit'].rolling(window=3, min_periods=1).mean()

test_data['temp_move_avg'] = test_data['temp'].rolling(window=3, min_periods=1).mean()
test_data['hum_move_avg'] = test_data['hum'].rolling(window=3, min_periods=1).mean()
test_data['windspeed_move_avg'] = test_data['windspeed'].rolling(window=3, min_periods=1).mean()
test_data['weathersit_move_avg'] = test_data['weathersit'].rolling(window=3, min_periods=1).mean()
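
For reference, this is how rolling(window=3, min_periods=1).mean() behaves on a small toy Series (illustrative values only, not taken from the dataset):

# With min_periods=1 the first rows average over however many values exist so far.
s = pd.Series([1.0, 2.0, 3.0, 4.0])
print(s.rolling(window=3, min_periods=1).mean())
# 0    1.0   (mean of 1)
# 1    1.5   (mean of 1, 2)
# 2    2.0   (mean of 1, 2, 3)
# 3    3.0   (mean of 2, 3, 4)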

Creating new features

train_data['hr_8_17_18'] = train_data['hr'].apply(lambda x: 1 if x in [8, 17, 18] else 0)
test_data['hr_8_17_18'] = test_data['hr'].apply(lambda x: 1 if x in [8, 17, 18] else 0)

train_data['hr_7_20'] = train_data['hr'].apply(lambda x: 1 if 7 <= x <= 20 else 0)
test_data['hr_7_20'] = test_data['hr'].apply(lambda x: 1 if 7 <= x <= 20 else 0)

Making the dummy-variable columns consistent between the training and test data

train_columns = set(train_data.columns)
test_columns = set(test_data.columns)
missing_in_test = train_columns - test_columns
missing_in_train = test_columns - train_columns

for col in missing_in_test:
    test_data[col] = 0
for col in missing_in_train:
    train_data[col] = 0

Reordering the columns so the two DataFrames line up

train_data = train_data.sort_index(axis=1)
test_data = test_data.sort_index(axis=1)
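
As an aside, pandas can do the same fill-and-sort in a single call; a minimal sketch, assuming missing columns should be filled with 0:

# DataFrame.align outer-joins the columns of both frames, fills gaps with 0,
# and returns both DataFrames with identical, sorted columns.
train_data, test_data = train_data.align(test_data, join='outer', axis=1, fill_value=0)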

Setting up the features and the target variable

features = ['yr', 'mnth', 'hr', 'holiday', 'workingday', 'hr_8_17_18', 'hr_7_20',
            'temp_move_avg', 'hum_move_avg', 'windspeed_move_avg', 'weathersit_move_avg']
target = 'cnt'

X_train = train_data[features]
y_train = train_data[target]

Standardizing the data

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

Creating the LightGBM model (verbose set to 0 to suppress log output)

model = lgb.LGBMRegressor(random_state=42, verbose=0)

Evaluating the model with cross-validation

cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='neg_mean_squared_error')
mean_cv_mse = -cv_scores.mean()
mean_cv_r2 = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='r2').mean()

print(f'Cross-Validation Mean Squared Error: {mean_cv_mse:.2f}')
print(f'Cross-Validation R^2 Score: {mean_cv_r2:.2f}')
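
The two cross_val_score calls above fit the model once per fold for each metric; cross_validate can collect both scores from a single set of fits. A sketch of the alternative:

from sklearn.model_selection import cross_validate

# One pass over the 5 folds, scoring each fold with both MSE and R^2.
cv_results = cross_validate(model, X_train_scaled, y_train, cv=5,
                            scoring=['neg_mean_squared_error', 'r2'])
print(f"CV MSE: {-cv_results['test_neg_mean_squared_error'].mean():.2f}")
print(f"CV R^2: {cv_results['test_r2'].mean():.2f}")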

Splitting the data (for training and validation)

X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train_scaled, y_train, test_size=0.2, random_state=42)
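
Because the hourly data is ordered in time, a random split validates on hours interleaved with the training hours. If a more pessimistic, chronological hold-out is preferred, train_test_split can simply skip shuffling (a sketch, assuming the rows of hour_train.csv are sorted by datetime):

# Alternative to the random split above: keep the last 20% of rows as validation.
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train_scaled, y_train, test_size=0.2, shuffle=False)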

Training the model

model.fit(X_train_split, y_train_split)

Predicting on the validation data

y_val_pred = model.predict(X_val_split)

Evaluating on the validation data

print(f'Validation Mean Squared Error: {mean_squared_error(y_val_split, y_val_pred):.2f}')
print(f'Validation R^2 Score: {r2_score(y_val_split, y_val_pred):.2f}')
print(f'Validation Mean Absolute Error: {mean_absolute_error(y_val_split, y_val_pred):.2f}')

Predicting on the test data

X_test = test_data[features]
X_test_scaled = scaler.transform(X_test)
y_test_pred = model.predict(X_test_scaled)

Actual values from the test data

y_test_actual = test_data[target]

Evaluating on the test data

test_mse = mean_squared_error(y_test_actual, y_test_pred)
test_r2 = r2_score(y_test_actual, y_test_pred)
test_mae = mean_absolute_error(y_test_actual, y_test_pred)

print(f'Test Mean Squared Error: {test_mse:.2f}')
print(f'Test R^2 Score: {test_r2:.2f}')
print(f'Test Mean Absolute Error: {test_mae:.2f}')
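
Because MSE is in squared units, RMSE is often easier to read alongside MAE; an optional one-line addition:

# RMSE is the square root of MSE and is in the same units as cnt.
print(f'Test Root Mean Squared Error: {np.sqrt(test_mse):.2f}')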

Getting the feature importances

importance = model.feature_importances_
importance_df = pd.DataFrame({'feature': features, 'importance': importance})
importance_df = importance_df.sort_values(by='importance', ascending=False)

Creating a bar chart of the feature importances

plt.figure(figsize=(12, 6))
plt.barh(importance_df['feature'], importance_df['importance'], color='royalblue')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.title('Feature Importance')
plt.gca().invert_yaxis()
plt.show()

Modeling casual and registered separately

y_train_casual = train_data['casual']
y_train_registered = train_data['registered']

Building the casual model

model_casual = lgb.LGBMRegressor(random_state=42, verbose=0)
cv_scores_casual = cross_val_score(model_casual, X_train_scaled, y_train_casual, cv=5, scoring='neg_mean_squared_error')
mean_cv_mse_casual = -cv_scores_casual.mean()
mean_cv_r2_casual = cross_val_score(model_casual, X_train_scaled, y_train_casual, cv=5, scoring='r2').mean()

print(f'Cross-Validation Mean Squared Error (casual): {mean_cv_mse_casual:.2f}')
print(f'Cross-Validation R^2 Score (casual): {mean_cv_r2_casual:.2f}')

Splitting the data (for training and validation)

X_train_split, X_val_split, y_train_split_casual, y_val_split_casual = train_test_split(X_train_scaled, y_train_casual, test_size=0.2, random_state=42)

Training the model

model_casual.fit(X_train_split, y_train_split_casual)

Predicting on the validation data

y_val_pred_casual = model_casual.predict(X_val_split)

Evaluating on the validation data

print(f'Validation Mean Squared Error (casual): {mean_squared_error(y_val_split_casual, y_val_pred_casual):.2f}')
print(f'Validation R^2 Score (casual): {r2_score(y_val_split_casual, y_val_pred_casual):.2f}')
print(f'Validation Mean Absolute Error (casual): {mean_absolute_error(y_val_split_casual, y_val_pred_casual):.2f}')

Building the registered model

model_registered = lgb.LGBMRegressor(random_state=42, verbose=0)
cv_scores_registered = cross_val_score(model_registered, X_train_scaled, y_train_registered, cv=5, scoring='neg_mean_squared_error')
mean_cv_mse_registered = -cv_scores_registered.mean()
mean_cv_r2_registered = cross_val_score(model_registered, X_train_scaled, y_train_registered, cv=5, scoring='r2').mean()

print(f'Cross-Validation Mean Squared Error (registered): {mean_cv_mse_registered:.2f}')
print(f'Cross-Validation R^2 Score (registered): {mean_cv_r2_registered:.2f}')

Splitting the data (for training and validation)

X_train_split, X_val_split, y_train_split_registered, y_val_split_registered = train_test_split(X_train_scaled, y_train_registered, test_size=0.2, random_state=42)

Training the model

model_registered.fit(X_train_split, y_train_split_registered)

Predicting on the validation data

y_val_pred_registered = model_registered.predict(X_val_split)

Evaluating on the validation data

print(f'Validation Mean Squared Error (registered): {mean_squared_error(y_val_split_registered, y_val_pred_registered):.2f}')
print(f'Validation R^2 Score (registered): {r2_score(y_val_split_registered, y_val_pred_registered):.2f}')
print(f'Validation Mean Absolute Error (registered): {mean_absolute_error(y_val_split_registered, y_val_pred_registered):.2f}')

Predicting on the test data

y_test_pred_casual = model_casual.predict(X_test_scaled)
y_test_pred_registered = model_registered.predict(X_test_scaled)

Summing the two sets of predictions

y_test_pred_combined = y_test_pred_casual + y_test_pred_registered
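
A quick optional check: ride counts cannot be negative, so the summed predictions are sometimes floored at zero before scoring. A minimal sketch (y_test_pred_clipped is a new, hypothetical name; the evaluation below still uses the unclipped values):

# Optional post-processing: counts cannot go below zero, so clip the combined predictions.
y_test_pred_clipped = np.clip(y_test_pred_combined, 0, None)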

Evaluating on the test data

test_mse_combined = mean_squared_error(y_test_actual, y_test_pred_combined)
test_r2_combined = r2_score(y_test_actual, y_test_pred_combined)
test_mae_combined = mean_absolute_error(y_test_actual, y_test_pred_combined)

print(f'Test Mean Squared Error (combined): {test_mse_combined:.2f}')
print(f'Test R^2 Score (combined): {test_r2_combined:.2f}')
print(f'Test Mean Absolute Error (combined): {test_mae_combined:.2f}')

Visualizing the difference between actual and predicted values by date and by hour

test_data['dteday'] = pd.to_datetime(test_data['dteday'])  # convert the dteday column to datetime
test_data['datetime'] = test_data.apply(lambda row: pd.Timestamp(year=row['dteday'].year, month=row['dteday'].month, day=row['dteday'].day, hour=row['hr']), axis=1)
test_data['actual'] = y_test_actual
test_data['predicted'] = y_test_pred_combined
test_data['difference'] = test_data['actual'] - test_data['predicted']
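
The row-wise apply above works but is slow on larger frames; the same datetime column can also be built in a vectorized way. A sketch, assuming hr holds integer hours:

# Vectorized equivalent of the apply() above: add hr hours to the (already parsed) date.
test_data['datetime'] = test_data['dteday'] + pd.to_timedelta(test_data['hr'], unit='h')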

Plotting the average difference by date

daily_diff = test_data.groupby(test_data['datetime'].dt.date)['difference'].mean()
plt.figure(figsize=(12, 6))
daily_diff.plot(kind='bar', color='skyblue')
plt.xlabel('Date')
plt.ylabel('Average Difference')
plt.title('Average Difference between Actual and Predicted Counts by Date')
plt.xticks(rotation=45)
plt.grid(True)
plt.show()

Plotting the average difference by hour

hourly_diff = test_data.groupby(test_data['datetime'].dt.hour)['difference'].mean()
plt.figure(figsize=(12, 6))
hourly_diff.plot(kind='bar', color='royalblue')
plt.xlabel('Hour')
plt.ylabel('Average Difference')
plt.title('Average Difference between Actual and Predicted Counts by Hour')
plt.xticks(rotation=45)
plt.grid(True)
plt.show()

Line plots of actual vs. predicted values

Plotting actual and predicted values by date

daily_actual = test_data.groupby(test_data['datetime'].dt.date)['actual'].sum()
daily_predicted = test_data.groupby(test_data['datetime'].dt.date)['predicted'].sum()

plt.figure(figsize=(12, 6))
plt.plot(daily_actual.index, daily_actual.values, label='Actual', marker='o', color='skyblue')
plt.plot(daily_predicted.index, daily_predicted.values, label='Predicted', marker='o', color='royalblue')
plt.xlabel('Date')
plt.ylabel('Count')
plt.title('Actual vs Predicted Counts by Date')
plt.xticks(rotation=45)
plt.legend()
plt.grid(True)
plt.show()

Plotting actual and predicted values by hour

hourly_actual = test_data.groupby(test_data['datetime'].dt.hour)['actual'].sum()
hourly_predicted = test_data.groupby(test_data['datetime'].dt.hour)['predicted'].sum()

plt.figure(figsize=(12, 6))
plt.plot(hourly_actual.index, hourly_actual.values, label='Actual', marker='o', color='skyblue')
plt.plot(hourly_predicted.index, hourly_predicted.values, label='Predicted', marker='o', color='royalblue')
plt.xlabel('Hour')
plt.ylabel('Count')
plt.title('Actual vs Predicted Counts by Hour')
plt.xticks(rotation=45)
plt.legend()
plt.grid(True)
plt.show()
