import matplotlib.pyplot as plt
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split
from sklearn.preprocessing import StandardScaler
# Columns that are passed to the model as-is.
BASE_FEATURES = ['temp', 'atemp', 'hum', 'yr', 'mnth', 'hr', 'holiday', 'workingday']
# Categorical columns that are one-hot encoded before modelling.
CATEGORICAL_COLUMNS = ('season', 'weathersit')
# Name of the column the model predicts.
TARGET = 'cnt'


def encode_categoricals(train_data, test_data, columns=CATEGORICAL_COLUMNS):
    """One-hot encode ``columns`` identically in both frames.

    Both frames are given the same category levels (the sorted union of the
    values seen in either frame) before ``pd.get_dummies(drop_first=True)``
    is applied. Encoding each frame independently would let ``drop_first``
    drop a *different* level whenever one frame lacks the lowest category,
    silently mislabelling those rows as the baseline.

    Args:
        train_data: Training DataFrame containing every column in ``columns``.
        test_data: Test DataFrame containing every column in ``columns``.
        columns: Names of the categorical columns to encode.

    Returns:
        A ``(train, test)`` tuple of new DataFrames; the inputs are not
        modified.
    """
    columns = list(columns)
    train_data = train_data.copy()
    test_data = test_data.copy()
    for col in columns:
        levels = sorted(pd.concat([train_data[col], test_data[col]]).dropna().unique())
        shared_dtype = pd.CategoricalDtype(categories=levels)
        train_data[col] = train_data[col].astype(shared_dtype)
        test_data[col] = test_data[col].astype(shared_dtype)
    train_data = pd.get_dummies(train_data, columns=columns, drop_first=True)
    test_data = pd.get_dummies(test_data, columns=columns, drop_first=True)
    return train_data, test_data


def align_columns(train_data, test_data):
    """Give both frames the same columns, in the same sorted order.

    A column present in only one frame is added to the other filled with 0.

    Args:
        train_data: Training DataFrame.
        test_data: Test DataFrame.

    Returns:
        A ``(train, test)`` tuple of new DataFrames with identical columns.
    """
    all_columns = sorted(set(train_data.columns) | set(test_data.columns))
    return (
        train_data.reindex(columns=all_columns, fill_value=0),
        test_data.reindex(columns=all_columns, fill_value=0),
    )


def feature_columns(columns):
    """Return the model's feature names: base features plus dummy columns.

    Args:
        columns: Iterable of column names available after encoding.

    Returns:
        ``BASE_FEATURES`` followed by every name in ``columns`` that starts
        with one of the categorical prefixes (``season_``, ``weathersit_``),
        in the order they appear in ``columns``.
    """
    dummy_prefixes = tuple(f'{name}_' for name in CATEGORICAL_COLUMNS)
    return BASE_FEATURES + [col for col in columns if col.startswith(dummy_prefixes)]


def main(train_path='hour_train.csv', test_path='hour_test.csv'):
    """Train a gradient-boosting regressor on ``cnt`` and report its quality.

    Prints 5-fold cross-validation, hold-out validation and test-set MSE and
    R^2, then shows a scatter plot of actual versus predicted test counts.

    Args:
        train_path: CSV file with the training rows.
        test_path: CSV file with the test rows.

    Raises:
        KeyError: If the target column is missing from either file.
    """
    # Load the data.
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Fail loudly instead of letting the column alignment below fabricate an
    # all-zero target, which would make every reported metric meaningless.
    for label, frame in (('training', train_data), ('test', test_data)):
        if TARGET not in frame.columns:
            raise KeyError(f"target column '{TARGET}' is missing from the {label} data")

    # Convert 'season' and 'weathersit' to dummy variables.
    train_data, test_data = encode_categoricals(train_data, test_data)

    # Make the columns of the training and test data match, sorted by name.
    train_data, test_data = align_columns(train_data, test_data)

    # Set up the features and the target variable.
    features = feature_columns(train_data.columns)
    X_train = train_data[features]
    y_train = train_data[TARGET]

    # Standardize the data.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)

    # Create the GBDT model.
    model = GradientBoostingRegressor(random_state=42)

    # Evaluate the model with cross-validation. Both metrics come from a
    # single run so each fold is fitted only once.
    cv_results = cross_validate(
        model, X_train_scaled, y_train, cv=5,
        scoring=('neg_mean_squared_error', 'r2'),
    )
    mean_cv_mse = -cv_results['test_neg_mean_squared_error'].mean()
    mean_cv_r2 = cv_results['test_r2'].mean()
    print(f'Cross-Validation Mean Squared Error: {mean_cv_mse:.2f}')
    print(f'Cross-Validation R^2 Score: {mean_cv_r2:.2f}')

    # Split the data (training and validation parts).
    X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
        X_train_scaled, y_train, test_size=0.2, random_state=42
    )

    # Train the model on the training part of the split.
    model.fit(X_train_split, y_train_split)

    # Predict on the validation data.
    y_val_pred = model.predict(X_val_split)

    # Evaluate on the validation data.
    print(f'Validation Mean Squared Error: {mean_squared_error(y_val_split, y_val_pred):.2f}')
    print(f'Validation R^2 Score: {r2_score(y_val_split, y_val_pred):.2f}')

    # Predict on the test data, reusing the scaler fitted on the training data.
    X_test = test_data[features]
    X_test_scaled = scaler.transform(X_test)
    y_test_pred = model.predict(X_test_scaled)

    # Actual values of the test data.
    y_test_actual = test_data[TARGET]

    # Evaluate on the test data.
    test_mse = mean_squared_error(y_test_actual, y_test_pred)
    test_r2 = r2_score(y_test_actual, y_test_pred)
    print(f'Test Mean Squared Error: {test_mse:.2f}')
    print(f'Test R^2 Score: {test_r2:.2f}')

    # Scatter plot of predicted versus actual values; the dashed diagonal
    # marks where a perfect prediction would fall.
    plt.figure(figsize=(10, 6))
    plt.scatter(y_test_actual, y_test_pred, alpha=0.5)
    plt.xlabel('Actual Count')
    plt.ylabel('Predicted Count')
    plt.title('Actual vs Predicted Count')
    plt.plot([y_test_actual.min(), y_test_actual.max()], [y_test_actual.min(), y_test_actual.max()], 'k--', lw=2)
    plt.grid(True)
    plt.show()


if __name__ == '__main__':
    main()