NYCタクシーデータを使って、DQNの練習をしてみた

Posted at 2025-02-21

はじめに

今回は、「ChatGPTにハンズオンを作らせてみた」の第6弾で、DQNを勉強しました。

第5弾はこちら↓

DQN

Q学習では、状態と行動の組み合わせごとの「Q値（行動価値）」をテーブル（Qテーブル）で管理します。DQN（Deep Q-Network）は、このQテーブルをニューラルネットワーク（ディープラーニング）で近似することで、大規模な状態空間にも対応可能にした手法です（Q学習の応用）。

使用データ

今回は、NYC Taxi Trip Dataという、ニューヨーク市タクシー＆リムジン委員会（TLC: Taxi and Limousine Commission）によって提供されている、2024年11月のタクシーの乗車に関するオープンデータセットを使用しました。各変数の説明はこちらです。

変数	説明
VendorID	データを提供したベンダーのID
tpep_pickup_datetime	乗車時間
tpep_dropoff_datetime	降車時間
passenger_count	乗客の数
trip_distance	走行距離（マイル単位）
RatecodeID	料金コード（運賃ルール）
store_and_fwd_flag	サーバーに接続できず、乗車記録を車載メモリに一時保存してから送信したかどうか（Y＝保存後に送信、N＝即時送信）
PULocationID	乗車地点（TLCが定めるゾーンID）
DOLocationID	降車地点（TLCが定めるゾーンID）
payment_type	支払い方法
fare_amount	運賃（メーター料金）
extra	追加料金（時間帯による追加料金など）
mta_tax	MTA（メトロポリタン交通局）税
tip_amount	チップの金額（ドル単位）
tolls_amount	通行料（トール料金）
improvement_surcharge	インフラ改善追加料金
total_amount	乗客への請求総額（ドル単位。現金で支払われたチップは含まない）
congestion_surcharge	渋滞追加料金
Airport_fee	空港利用料金

やること

エージェント：DQNAgent
環境：ニューヨーク市内
状態：タクシーの現在地（ゾーンID）と時刻（0〜23時）
行動：疑似的な東西南北＋待機の５パターン
報酬：移動先のゾーン・時間帯における total_amount（請求総額）の合計を100で割った値（該当する乗車がない場合は -1 のペナルティ）
方策：ε-greedy方策
価値関数：Q値

使用コード

2024年11月における、黄色タクシーの乗車データを取得します。

import pandas as pd

# Load the yellow-taxi trip records (November 2024 file) from a local Parquet file.
data_path = "yellow_tripdata_2024-11.parquet"
df = pd.read_parquet(data_path)

乱数を使うことになるので、シード値を固定します。

import torch
import random
import numpy as np

def set_seed(seed=42):
    """Seed every random number generator this script relies on.

    Args:
        seed: Integer seed applied to Python's ``random`` module, NumPy,
            PyTorch (CPU) and PyTorch (all CUDA devices).

    Besides seeding, the cuDNN backend flags are set so that
    reproducibility is preferred over speed.
    """
    # Seed order: Python `random`, NumPy, PyTorch CPU, PyTorch GPUs.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Ask cuDNN for reproducible convolution results ...
    torch.backends.cudnn.deterministic = True
    # ... and switch off its benchmark mode, giving up some speed for reproducibility.
    torch.backends.cudnn.benchmark = False


# Fix the random seed
set_seed(42)

タクシー配車の強化学習環境を定義します。プライバシーの観点から緯度・経度のデータがなく、ゾーンIDのデータしかないため、ゾーンIDを±1、±10として、疑似的に東西南北を決めています。

import gym
import numpy as np
import pandas as pd
from gym import spaces

class TaxiDispatchEnv(gym.Env):
    """Toy taxi-dispatch environment driven by historical trip records.

    The state is ``[zone_id, hour]``. The data set only has zone IDs (no
    coordinates), so compass moves are faked by shifting the zone ID:
    north = -1, south = +1, east = +10, west = -10, clamped to 1..265.
    Action 4 means "wait" and leaves the zone unchanged. Every step advances
    the hour by one (wrapping at 24).

    The reward for a step is the sum of ``total_amount`` over all trips
    picked up in the new zone during the new hour, divided by 100, or ``-1``
    when there is no such trip.

    Args:
        data: pandas DataFrame with the columns ``PULocationID``,
            ``pickup_hour`` and ``total_amount``. The columns are read on the
            first call to :meth:`step`; the per-(zone, hour) totals are
            cached at that point, so later changes to ``data`` are not seen.
    """

    # Zone-ID shift applied by each movement action; any other action waits.
    _ZONE_OFFSETS = {0: -1, 1: 1, 2: 10, 3: -10}

    def __init__(self, data):
        super().__init__()
        self.data = data
        self.time_step = 0

        # State: [zone ID, hour of day]
        self.state_space = spaces.MultiDiscrete([265, 24])  # NYC has 265 taxi zones
        self.action_space = spaces.Discrete(5)  # [north, south, east, west, wait]

        self.current_state = None
        # {(zone, hour): summed total_amount}; built lazily on the first step.
        self._revenue_by_zone_hour = None

    def _revenue_lookup(self):
        """Return the cached ``{(zone, hour): summed total_amount}`` mapping."""
        if self._revenue_by_zone_hour is None:
            # One pass over the data replaces a full-table boolean filter on
            # every single step.
            totals = self.data.groupby(["PULocationID", "pickup_hour"])["total_amount"].sum()
            self._revenue_by_zone_hour = totals.to_dict()
        return self._revenue_by_zone_hour

    def reset(self):
        """Start a new episode at a random zone and hour and return the state."""
        self.current_state = [np.random.randint(1, 266), np.random.randint(0, 24)]
        self.time_step = 0
        return np.array(self.current_state)

    def step(self, action):
        """Apply ``action`` and return ``(state, reward, done, info)``.

        ``done`` becomes True on the 100th step of an episode.
        """
        pu_location, hour = self.current_state

        # Move by shifting the zone ID, keeping it inside 1..265.
        offset = self._ZONE_OFFSETS.get(action, 0)
        pu_location = min(265, max(1, pu_location + offset))

        hour = (hour + 1) % 24  # advance the clock by one hour

        # Revenue if any trip started in this zone at this hour, else a penalty.
        revenue = self._revenue_lookup().get((pu_location, hour))
        reward = revenue / 100 if revenue is not None else -1

        self.current_state = [pu_location, hour]
        # Count the step first so the episode ends on step 100, not step 101.
        self.time_step += 1
        done = self.time_step >= 100

        return np.array(self.current_state), reward, done, {}

# Preprocessing: derive the pickup hour from the pickup timestamp.
# TaxiDispatchEnv reads this "pickup_hour" column when computing rewards.
df["pickup_hour"] = pd.to_datetime(df["tpep_pickup_datetime"]).dt.hour

DQNAgentを実装して、学習させていきます。

import torch.nn as nn
import torch.optim as optim
from collections import deque

class DQN(nn.Module):
    """Fully connected Q-network: a state vector in, one value per action out.

    Two hidden layers of 128 units, each followed by ReLU, feed a linear
    output layer with ``action_size`` units.

    Args:
        state_size: Number of input features.
        action_size: Number of output values (one per action).
    """

    def __init__(self, state_size, action_size):
        super().__init__()
        hidden_units = 128
        self.fc1 = nn.Linear(state_size, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, action_size)

    def forward(self, x):
        """Return the output of the final linear layer (no activation) for ``x``."""
        for hidden_layer in (self.fc1, self.fc2):
            x = torch.relu(hidden_layer(x))
        return self.fc3(x)

class DQNAgent:
    """DQN agent with an epsilon-greedy policy and a replay memory.

    Transitions are stored by the caller in ``memory`` as
    ``(state, action, reward, next_state, done)`` tuples. :meth:`train`
    samples a minibatch from that memory, performs one gradient step and
    then decays the exploration rate.

    Args:
        state_size: Number of features in a state vector.
        action_size: Number of discrete actions.
    """

    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95  # discount factor
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001

        self.model = DQN(state_size, action_size)
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)
        self.criterion = nn.MSELoss()

    def act(self, state):
        """Pick an action for ``state`` with the epsilon-greedy rule.

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise the action with the largest predicted Q-value.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        state_tensor = torch.as_tensor(np.asarray(state), dtype=torch.float32).unsqueeze(0)
        # Action selection needs no gradients.
        with torch.no_grad():
            q_values = self.model(state_tensor)
        return torch.argmax(q_values).item()

    def train(self, batch_size):
        """Run one minibatch Q-learning update and decay ``epsilon``.

        Does nothing when the memory holds fewer than ``batch_size``
        transitions.
        """
        if len(self.memory) < batch_size:
            return

        minibatch = random.sample(self.memory, batch_size)
        states, actions, rewards, next_states, dones = zip(*minibatch)

        states = torch.as_tensor(np.asarray(states), dtype=torch.float32)
        next_states = torch.as_tensor(np.asarray(next_states), dtype=torch.float32)
        actions = torch.as_tensor(np.asarray(actions), dtype=torch.int64).unsqueeze(1)
        rewards = torch.as_tensor(np.asarray(rewards, dtype=np.float32))
        dones = torch.as_tensor(np.asarray(dones, dtype=np.float32))

        # TD target: r + gamma * max_a' Q(s', a'), without the bootstrap term
        # for terminal transitions. Computed without gradients so the target
        # is treated as a constant.
        with torch.no_grad():
            best_next_q = self.model(next_states).max(dim=1).values
            targets = rewards + self.gamma * best_next_q * (1.0 - dones)

        # Q-values of the actions that were actually taken.
        predicted = self.model(states).gather(1, actions).squeeze(1)

        self.optimizer.zero_grad()
        loss = self.criterion(predicted, targets)
        loss.backward()
        self.optimizer.step()

        # Shift gradually from exploration to exploitation; without this the
        # agent keeps epsilon at 1.0 and acts randomly forever.
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

import matplotlib.pyplot as plt

# Set up the environment and the DQN agent
env = TaxiDispatchEnv(df)
set_seed(42)
agent = DQNAgent(state_size=2, action_size=5)

# Total reward of each episode, in order
total_rewards = []

# Training settings
episodes = 1000  # number of episodes
batch_size = 32  # minibatch size used for training

for e in range(episodes):
    state = env.reset()
    step_rewards = []  # rewards collected during this episode
    for step in range(100):
        action = agent.act(state)  # choose an action
        next_state, reward, done, _ = env.step(action)  # advance the environment
        # Store the transition in the replay memory
        agent.memory.append((state, action, reward, next_state, done))
        step_rewards.append(reward)
        state = next_state
        if done:
            break

    total_reward = sum(step_rewards)  # total reward of this episode
    total_rewards.append(total_reward)

    # Train once per episode, as soon as the memory holds more than one minibatch
    if len(agent.memory) > batch_size:
        agent.train(batch_size)

    if e % 100 == 0:
        print(f"Episode {e}, Total Reward: {total_reward}, Epsilon: {agent.epsilon}")

# Plot the learning curve: raw per-episode reward plus a 100-episode moving average
moving_average = pd.Series(total_rewards).rolling(100).mean()

plt.figure(figsize=(10, 5))
plt.plot(total_rewards, label="Total Reward per Episode", alpha=0.5)
plt.plot(moving_average, label="Moving Average (100 episodes)", linewidth=3, color="red")
plt.xlabel("Episode")
plt.ylabel("Total Reward")
plt.title("Training Progress of DQN for Taxi Dispatch")
plt.legend()
plt.show()

分析結果

100エピソードごとに合計報酬、探索率を表示しています。合計報酬を見ていると、うまく学習しているように見えますが、探索率が常に1.0で、ずっとランダムに行動を選択しているように見えます。本来、学習が進むにつれて、探索率は減っていってほしいので、ここは改善ポイントです。

学習の進捗を可視化してみました（青：各エピソードごとの報酬、赤：100エピソードごとの移動平均）。常にランダムに行動を選択していることが影響して、ほとんど学習が進んでいないように見えます。

おわりに

今回は、DQNの手法に対する理解に時間を費やしてしまったので、今度はしっかり学習を進められるようにしていこうと思います。

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up