More than 3 years have passed since last update.

強化学習の勉強: 迷路探索問題（価値反復法）

Last updated at 2022-05-06Posted at 2022-05-04

強化学習の勉強: 迷路探索問題（価値反復法）

近年、注目されている深層強化学習をロボットにも応用したいと考えており、そのためにまずは強化学習について理解を深め、深層強化学習を学ぶうえで必要なことを身に着けることが目的。

強化学習アルゴリズム：価値反復法

⭐ゴール以外の位置（状態）にも価値（優先度）をつける作戦

報酬

（例）

迷路
- ゴールした時に与える
ロボット
- 歩行であれば，転ばずに歩けている間，毎ステップ与える
囲碁・将棋
- 勝てば与える

即時報酬：

ある時刻でもらえる報酬 $R_t$

報酬和：

今後未来にわたって得られるであろう報酬の合計 $G_t$

未来の報酬を割り引いて考える（時間割引）

時間割引考慮なし

$$
G_t = R_{t+1} + R_{t+2} + R_{t+3} + ...
$$
時間割引考慮あり

$$
G_t = R_{t+1} + \gamma R_{t+2} + \gamma^2 R_{t+3} + ...
$$
※$\gamma$ は時間割引率で、0〜1の値をとる。遠い未来に得られる報酬ほど、現在から見た価値は小さくなるという考え方。

価値

行動価値（action value）：

ゴールにたどり着くために２ステップ多くかかる場合，
その多くなった分，割り引いてあげたものがその行動の価値となる

状態価値（state value）：

その状態から方策に従って行動し続けたときに得られる割引報酬和 $G_t$ の期待値

アルゴリズム1：Sarsa

概要

更新に $s_t$, $a_t$, $R_{t+1}$, $s_{t+1}$, $a_{t+1}$ の5つの変数値を扱うことからSarsaと呼ばれる

行動価値関数の更新式

$$
Q(s_t, a_t) = Q(s_t, a_t) + \eta(R_{t+1} + \gamma Q(s_{t+1}, a_{t+1}) - Q(s_t, a_t))
$$

$R_{t+1} + \gamma Q(s_{t+1}, a_{t+1}) - Q(s_t, a_t)$ はTD誤差（Temporal Difference）とよばれ，
$TD \approx 0$ となれば，きちんと学習できたことになる

方策オン型：

$Q$ の更新が $a_{t+1}$ を求める方策に依存する

実装

迷路の作成

maze_rl_map.py

"""
迷路探索問題で強化学習を学ぶ
"""
import matplotlib.pyplot as plt


class MAZE():
    """3x3 grid maze drawn with matplotlib.

    Cells are labelled S0..S8 row by row from the top-left corner; S0 is the
    start and S8 is the goal. Red line segments mark walls that the agent is
    defined not to cross.
    """

    def __init__(self) -> None:
        # 5x5-inch figure; the maze itself spans 3x3 axis units (one per cell).
        self.fig = plt.figure(figsize=(5, 5))
        # Current axes, kept so that the agent marker can be drawn on it later.
        self.ax = plt.gca()

        # Walls as (x end points, y end points) of straight red segments.
        walls = (
            ([1, 1], [0, 1]),
            ([1, 2], [2, 2]),
            ([2, 2], [2, 1]),
            ([2, 3], [1, 1]),
        )
        for xs, ys in walls:
            plt.plot(xs, ys, color='red', linewidth=2)

        # State labels S0..S8, centred in their cells.
        for state in range(9):
            row, col = divmod(state, 3)
            plt.text(x=col + 0.5, y=2.5 - row, s=f'S{state}', size=14, ha='center')
        plt.text(x=0.5, y=2.3, s='START', size=10, ha='center')
        plt.text(x=2.5, y=0.3, s='GOAL', size=10, ha='center')

        # Restrict the view to the 3x3 maze and hide all ticks and tick labels.
        self.ax.set_xlim(0, 3)
        self.ax.set_ylim(0, 3)
        plt.tick_params(axis='both', which='both', bottom=False, top=False, labelbottom=False, right=False, left=False, labelleft=False)

    def set_start(self):
        """Draw the agent as a light-green circle on S0 and return its line.

        The returned line object is what callers update (via ``set_data``) to
        move the marker.
        """
        marker = self.ax.plot([0.5], [2.5], marker='o', color='lightgreen', markersize=60)[0]
        return marker

    def show(self):
        """Display the figure."""
        plt.show()

if __name__ == '__main__':
    # Draw the maze, put the agent marker on S0, and show the figure.
    maze = MAZE()
    line = maze.set_start()
    maze.show()

GIFによる動作の記録

maze_rl_gif.py

"""
迷路探索問題で強化学習を学ぶ
"""
from maze_rl_agent_random import Agent      # 作成したエージェントをモジュール化しているためインポート
from maze_rl_map import MAZE         # 作成した迷路をモジュール化しているためインポート
from matplotlib import animation as ani
from os.path import join, dirname, abspath

### 動いている様子を可視化 #############
class GIF():
    """Animate an agent's path through the maze and save it as a GIF.

    Args:
        fig: Figure the maze is drawn on.
        line: Line object of the agent marker; it must provide
            ``set_data(xs, ys)``.
        state_history: Sequence of visited state indices (0..8), one per frame.
    """

    def __init__(self, fig, line, state_history) -> None:
        self.fig = fig
        self.state_history = state_history
        self.line = line

    def init_func(self):
        """Clear the marker before the first frame and return the artist."""
        # set_data() returns None, so the artist itself must be returned.
        self.line.set_data([], [])
        return (self.line,)

    def animate(self, i):
        """Move the marker to the cell of the i-th recorded state."""
        state = self.state_history[i]
        # States are numbered row by row from the top-left of the 3x3 grid.
        row, col = divmod(state, 3)
        x = col + 0.5       # cell centre: column index + 0.5
        y = 2.5 - row       # row 0 is the top row, whose centre is y = 2.5

        # Line2D.set_data expects sequences, so wrap the single point in lists.
        self.line.set_data([x], [y])
        return (self.line,)

    def create(self, file_name="maze_random.gif"):
        """Build the animation and save it next to this source file."""
        anim = ani.FuncAnimation(self.fig, self.animate, init_func=self.init_func, frames=len(self.state_history), interval=200, repeat=False)

        save_path = dirname(abspath(__file__))
        anim.save(join(save_path, file_name))

if __name__ == '__main__':
    # Build the maze
    maze = MAZE()
    line = maze.set_start()     # line object used to move the agent marker

    # Agent
    agent = Agent()
    pi_0 = agent.simple_convert_into_pi_from_theta(theta=agent.theta_0)     # initial policy
    # NOTE(review): Agent comes from maze_rl_agent_random (not shown here);
    # goal_maze(pi_0) presumably returns the list of visited states - confirm.
    state_history = agent.goal_maze(pi_0)                                   # wander randomly under this single policy until the goal is reached

    # Record
    gif = GIF(maze.fig, line, state_history)
    gif.create(file_name="maze_random.gif")

学習を考慮したエージェントの作成

maze_rl_agent_Sarsa.py

"""
迷路探索問題で強化学習を学ぶ
"""
import numpy as np

### エージェントの実装 ####################
class Agent():
    """Maze agent that learns an action-value table Q with Sarsa.

    States S0..S7 are the rows of the tables (S8 is the goal and needs no
    entry); the columns are the actions up, right, down, left (0..3).
    """

    def __init__(self) -> None:
        # Movement rules: 1 = the move is allowed, NaN = blocked by a wall or
        # by the edge of the maze.
        ok, no = 1, np.nan
        self.theta_0 = np.array([
            [no, ok, ok, no],   # S0: right, down
            [no, ok, no, ok],   # S1: right, left
            [no, no, ok, ok],   # S2: down, left
            [ok, ok, ok, no],   # S3: up, right, down
            [no, no, ok, ok],   # S4: down, left
            [ok, no, no, no],   # S5: up
            [ok, no, no, no],   # S6: up
            [ok, ok, no, no],   # S7: up, right
        ])

        # Random initial action values; blocked moves stay NaN.
        self.Q = np.random.rand(*self.theta_0.shape) * self.theta_0

    def simple_convert_into_pi_from_theta(self, theta):
        """Turn the rule table into a policy by normalising each row.

        Every allowed action of a state gets the same share of probability;
        blocked actions (NaN) become probability 0.
        """
        row_totals = np.nansum(theta, axis=1, keepdims=True)
        return np.nan_to_num(theta / row_totals)

    def get_action(self, s, Q, epsilon, pi_0):
        """Choose the action index (0..3) for state ``s`` epsilon-greedily.

        With probability ``epsilon`` an action is sampled from ``pi_0[s]``;
        otherwise the action with the largest non-NaN Q value is taken.
        """
        names = ('up', 'right', 'down', 'left')

        explore = np.random.rand() < epsilon
        if explore:
            chosen = np.random.choice(names, p=pi_0[s, :])
        else:
            chosen = names[np.nanargmax(Q[s, :])]

        return names.index(chosen)

    def get_next_s(self, s, a, Q, epsilon, pi_0):
        """Return the state reached from ``s`` by taking action ``a``.

        ``Q``, ``epsilon`` and ``pi_0`` are accepted but not used.
        """
        # State-index offsets for up, right, down, left on the 3-wide grid.
        step = (-3, 1, 3, -1)
        return s + step[a]

    def Sarsa(self, s, a, r, s_next, a_next, Q, eta, gamma):
        """Apply one Sarsa update to ``Q[s, a]`` in place and return ``Q``."""
        if s_next == 8:
            # The goal has no row in Q, so the target is the reward alone.
            target = r
        else:
            target = r + gamma * Q[s_next, a_next]

        Q[s, a] = Q[s, a] + eta * (target - Q[s, a])
        return Q

    def goal_maze(self, Q, epsilon, eta, gamma, pi):
        """Run one episode from S0 to the goal, updating ``Q`` at every step.

        Returns:
            ``[s_a_history, Q]`` where ``s_a_history`` is a list of
            ``[state, action]`` pairs; the final entry is ``[8, nan]``.
        """
        s = 0
        a_next = self.get_action(s, Q, epsilon, pi)
        s_a_history = [[0, np.nan]]

        while True:
            action = a_next

            # Fill in the action taken from the current (last recorded) state.
            s_a_history[-1][1] = action
            s_next = self.get_next_s(s, action, Q, epsilon, pi)
            # The action for the new state is not known yet, hence NaN.
            s_a_history.append([s_next, np.nan])

            reached_goal = s_next == 8
            if reached_goal:
                r, a_next = 1, np.nan
            else:
                r = 0
                a_next = self.get_action(s_next, Q, epsilon, pi)

            Q = self.Sarsa(s, action, r, s_next, a_next, Q, eta, gamma)

            if reached_goal:
                break
            s = s_next

        return [s_a_history, Q]

if __name__ == '__main__':
    agent = Agent()
    pi_0 = agent.simple_convert_into_pi_from_theta(agent.theta_0)     # initial policy
    s_a_history, Q = agent.goal_maze(agent.Q, epsilon=0.5, eta=0.1, gamma=0.9, pi=pi_0)                                   # run one episode to the goal, updating Q along the way
    print(s_a_history)
    print(Q)

出力

1度だけ更新されたQ

[[       nan 0.3906937  0.10505809        nan]
 [       nan 0.15036867        nan 0.32046707]
 [       nan        nan 0.29114881 0.2434914 ]
 [0.15215576 0.54439504 0.38450278        nan]
 [       nan        nan 0.62616076 0.6973528 ]
 [0.66610023        nan        nan        nan]
 [0.79491608        nan        nan        nan]
 [0.957271   0.97924621        nan        nan]]

ここで注意したいのは、各要素は確率ではなく価値を表していること。

学習とその結果の記録

maze_rl_learning_test.py

"""
迷路探索問題で強化学習を学ぶ
"""
from maze_rl_map import MAZE      # 作成した迷路をモジュール化しているためインポート
from maze_rl_agent_Sarsa import Agent      # 作成した迷路をモジュール化しているためインポート
import numpy as np


### Solve the maze with Sarsa ###########

agent = Agent()

# Initial action-value table and the random policy used for exploration.
Q = agent.Q
pi_0 = agent.simple_convert_into_pi_from_theta(theta=agent.theta_0)

# Hyper-parameters
eta = 0.1           # learning rate
gamma = 0.9         # time-discount rate
epsilon = 0.5       # exploration rate of the epsilon-greedy policy (halved every episode)
n_episodes = 100    # number of training episodes
v = np.nanmax(Q, axis=1)    # per-state maximum action value

for episode in range(1, n_episodes + 1):
    print(f"エピソード：{episode}")

    # Shrink epsilon so that the agent gradually stops exploring.
    epsilon /= 2

    # Run one episode; Sarsa updates Q at every step.
    s_a_history, Q = agent.goal_maze(Q, epsilon, eta, gamma, pi_0)

    # Report how much the per-state maximum value moved (monitoring only).
    new_v = np.nanmax(Q, axis=1)
    print(f"状態価値の変化：{np.sum(np.abs(new_v - v)):.5f}")
    v = new_v

    print(f"迷路を解くのにかかったステップ数：{len(s_a_history) - 1}")

print(Q)

# Record the path of the last episode as a GIF.
from maze_rl_gif import GIF
maze = MAZE()
state_history = [s_a[0] for s_a in s_a_history]
# print(state_history)
print(f"学習回数（エピソード）：{episode}")
gif = GIF(maze.fig, maze.set_start(), state_history)

gif.create("maze_learning_Sarsa.gif")

出力

更新された最終の価値関数Q

[[       nan 0.23111675 0.72408668        nan]
 [       nan 0.23085754        nan 0.23042495]
 [       nan        nan 0.23043687 0.23035335]
 [0.24286869 0.80861437 0.24296961        nan]
 [       nan        nan 0.8997397  0.2491935 ]
 [0.23025602        nan        nan        nan]
 [0.24299482        nan        nan        nan]
 [0.25508957 0.9999756         nan        nan]]

かなり価値関数Qが更新されている。
結果的には、4ステップでゴールできる。

結果

ここで結果を示し、考察していく。
まず、上記のgif画像から、無駄なく、迷いなくゴールへ向かっていることがわかる。
実際にこの移動を決めている価値関数Qについて見ていくこととする。

価値関数Q

[[       nan 0.23111675 0.72408668        nan]
 [       nan 0.23085754        nan 0.23042495]
 [       nan        nan 0.23043687 0.23035335]
 [0.24286869 0.80861437 0.24296961        nan]
 [       nan        nan 0.8997397  0.2491935 ]
 [0.23025602        nan        nan        nan]
 [0.24299482        nan        nan        nan]
 [0.25508957 0.9999756         nan        nan]]

上の行から順にS0, S1, S2, ..., S7を表し、各列は行動「上(↑), 右(→), 下(↓), 左(←) 」の価値を表している。

考察

状態S0から順に価値の高いものを追いかけると、わかりやすい。
まず、S0。S0においては、下（↓）が0.724であり、次の遷移はS3となる。S3においては、右（→）が0.808であり、次の遷移はS4となる。S4においては、下（↓）が0.899であり、次の遷移はS7となる。S7においては、右（→）が0.999であり、ゴールする。

0.7以上の価値を含む状態は

S0 → S3 → S4 → S7

アルゴリズム2：Q学習

概要

Sarsaとの違いは１つ

行動価値関数の更新式

$$
Q(s_t, a_t) = Q(s_t, a_t) + \eta(R_{t+1} + \gamma \max_aQ(s_{t+1}, a) - Q(s_t, a_t))
$$

方策オフ型：

$Q$ の更新が $a_{t+1}$ を求める方策に依存しない

Sarsaでは更新式にε-greedy法で選んだ $a_{t+1}$（ランダム性を含む）が入るが，Q学習の更新式にはそれが入らない。したがって，行動価値関数の収束はSarsaよりもQ学習のほうが速い

実装

迷路の作成

maze_rl_map.py

"""
迷路探索問題で強化学習を学ぶ
"""
import matplotlib.pyplot as plt


class MAZE():
    """3x3 grid maze drawn with matplotlib.

    Cells are labelled S0..S8 row by row from the top-left corner; S0 is the
    start and S8 is the goal. Red line segments mark walls that the agent is
    defined not to cross.
    """

    def __init__(self) -> None:
        # 5x5-inch figure; the maze itself spans 3x3 axis units (one per cell).
        self.fig = plt.figure(figsize=(5, 5))
        # Current axes, kept so that the agent marker can be drawn on it later.
        self.ax = plt.gca()

        # Walls as (x end points, y end points) of straight red segments.
        walls = (
            ([1, 1], [0, 1]),
            ([1, 2], [2, 2]),
            ([2, 2], [2, 1]),
            ([2, 3], [1, 1]),
        )
        for xs, ys in walls:
            plt.plot(xs, ys, color='red', linewidth=2)

        # State labels S0..S8, centred in their cells.
        for state in range(9):
            row, col = divmod(state, 3)
            plt.text(x=col + 0.5, y=2.5 - row, s=f'S{state}', size=14, ha='center')
        plt.text(x=0.5, y=2.3, s='START', size=10, ha='center')
        plt.text(x=2.5, y=0.3, s='GOAL', size=10, ha='center')

        # Restrict the view to the 3x3 maze and hide all ticks and tick labels.
        self.ax.set_xlim(0, 3)
        self.ax.set_ylim(0, 3)
        plt.tick_params(axis='both', which='both', bottom=False, top=False, labelbottom=False, right=False, left=False, labelleft=False)

    def set_start(self):
        """Draw the agent as a light-green circle on S0 and return its line.

        The returned line object is what callers update (via ``set_data``) to
        move the marker.
        """
        marker = self.ax.plot([0.5], [2.5], marker='o', color='lightgreen', markersize=60)[0]
        return marker

    def show(self):
        """Display the figure."""
        plt.show()

if __name__ == '__main__':
    # Draw the maze, put the agent marker on S0, and show the figure.
    maze = MAZE()
    line = maze.set_start()
    maze.show()

GIFによる動作の記録

maze_rl_gif.py

"""
迷路探索問題で強化学習を学ぶ
"""
from maze_rl_agent_random import Agent      # 作成したエージェントをモジュール化しているためインポート
from maze_rl_map import MAZE         # 作成した迷路をモジュール化しているためインポート
from matplotlib import animation as ani
from os.path import join, dirname, abspath

### 動いている様子を可視化 #############
class GIF():
    """Animate an agent's path through the maze and save it as a GIF.

    Args:
        fig: Figure the maze is drawn on.
        line: Line object of the agent marker; it must provide
            ``set_data(xs, ys)``.
        state_history: Sequence of visited state indices (0..8), one per frame.
    """

    def __init__(self, fig, line, state_history) -> None:
        self.fig = fig
        self.state_history = state_history
        self.line = line

    def init_func(self):
        """Clear the marker before the first frame and return the artist."""
        # set_data() returns None, so the artist itself must be returned.
        self.line.set_data([], [])
        return (self.line,)

    def animate(self, i):
        """Move the marker to the cell of the i-th recorded state."""
        state = self.state_history[i]
        # States are numbered row by row from the top-left of the 3x3 grid.
        row, col = divmod(state, 3)
        x = col + 0.5       # cell centre: column index + 0.5
        y = 2.5 - row       # row 0 is the top row, whose centre is y = 2.5

        # Line2D.set_data expects sequences, so wrap the single point in lists.
        self.line.set_data([x], [y])
        return (self.line,)

    def create(self, file_name="maze_random.gif"):
        """Build the animation and save it next to this source file."""
        anim = ani.FuncAnimation(self.fig, self.animate, init_func=self.init_func, frames=len(self.state_history), interval=200, repeat=False)

        save_path = dirname(abspath(__file__))
        anim.save(join(save_path, file_name))

if __name__ == '__main__':
    # Build the maze
    maze = MAZE()
    line = maze.set_start()     # line object used to move the agent marker

    # Agent
    agent = Agent()
    pi_0 = agent.simple_convert_into_pi_from_theta(theta=agent.theta_0)     # initial policy
    # NOTE(review): Agent comes from maze_rl_agent_random (not shown here);
    # goal_maze(pi_0) presumably returns the list of visited states - confirm.
    state_history = agent.goal_maze(pi_0)                                   # wander randomly under this single policy until the goal is reached

    # Record
    gif = GIF(maze.fig, line, state_history)
    gif.create(file_name="maze_random.gif")

学習を考慮したエージェントの作成

maze_rl_agent_Q.py

"""
迷路探索問題で強化学習を学ぶ
"""
import numpy as np

### エージェントの実装 ####################

class Agent():
    """Maze agent that learns an action-value table Q with Q-learning.

    States S0..S7 are the rows of the tables (S8 is the goal and needs no
    entry); the columns are the actions up, right, down, left (0..3).
    """

    def __init__(self) -> None:
        # Movement rules: 1 = the move is allowed, NaN = blocked by a wall or
        # by the edge of the maze.
        self.theta_0 = np.array([
            [np.nan,    1,      1,      np.nan],    # S0: right, down
            [np.nan,    1,      np.nan, 1],         # S1: right, left
            [np.nan,    np.nan, 1,      1],         # S2: down, left
            [1,         1,      1,      np.nan],    # S3: up, right, down
            [np.nan,    np.nan, 1,      1],         # S4: down, left
            [1,         np.nan, np.nan, np.nan],    # S5: up
            [1,         np.nan, np.nan, np.nan],    # S6: up
            [1,         1,      np.nan, np.nan],    # S7: up, right
        ])

        # Random initial action values; blocked moves stay NaN.
        a, b = self.theta_0.shape
        self.Q = np.random.rand(a, b) * self.theta_0

    def simple_convert_into_pi_from_theta(self, theta):
        """Turn the rule table into a policy by normalising each row.

        Every allowed action of a state gets the same share of probability;
        blocked actions (NaN) become probability 0.
        """
        m, n = theta.shape
        pi = np.zeros((m, n))

        for i in range(m):
            pi[i, :] = theta[i, :] / np.nansum(theta[i, :])

        return np.nan_to_num(pi)

    def get_action(self, s, Q, epsilon, pi_0):
        """Choose the action index (0..3) for state ``s`` epsilon-greedily.

        With probability ``epsilon`` an action is sampled from ``pi_0[s]``;
        otherwise the action with the largest non-NaN Q value is taken.
        """
        direction = ['up', 'right', 'down', 'left']

        if np.random.rand() < epsilon:
            next_direction = np.random.choice(direction, p=pi_0[s, :])
        else:
            next_direction = direction[np.nanargmax(Q[s, :])]

        return direction.index(next_direction)

    def get_next_s(self, s, a, Q, epsilon, pi_0):
        """Return the state reached from ``s`` by taking action ``a``.

        ``Q``, ``epsilon`` and ``pi_0`` are accepted but not used.
        """
        # State-index offsets for up, right, down, left on the 3-wide grid.
        offsets = (-3, 1, 3, -1)
        return s + offsets[a]

    def Q_learning(self, s, a, r, s_next, a_next, Q, eta, gamma):
        """Apply one Q-learning update to ``Q[s, a]`` in place and return ``Q``.

        ``a_next`` is accepted but not used: the target takes the maximum
        action value of ``s_next`` instead of the value of the chosen action.
        """
        if s_next == 8:
            # The goal has no row in Q, so the target is the reward alone.
            Q[s, a] = Q[s, a] + eta * (r - Q[s, a])
        else:
            Q[s, a] = Q[s, a] + eta * (r + gamma * np.nanmax(Q[s_next, :]) - Q[s, a])

        return Q

    def goal_maze(self, Q, epsilon, eta, gamma, pi):
        """Run one episode from S0 to the goal, updating ``Q`` at every step.

        Returns:
            ``[s_a_history, Q]`` where ``s_a_history`` is a list of
            ``[state, action]`` pairs; the final entry is ``[8, nan]``.
        """
        s = 0       # start state S0
        a_next = self.get_action(s, Q, epsilon, pi)
        s_a_history = [[0, np.nan]]

        while True:
            action = a_next

            # Fill in the action taken from the current (last recorded) state.
            s_a_history[-1][1] = action
            s_next = self.get_next_s(s, action, Q, epsilon, pi)
            # The action for the new state is not known yet, hence NaN.
            s_a_history.append([s_next, np.nan])

            if s_next == 8:
                r = 1   # reward for reaching the goal
                a_next = np.nan
            else:
                r = 0
                a_next = self.get_action(s_next, Q, epsilon, pi)

            Q = self.Q_learning(s, action, r, s_next, a_next, Q, eta, gamma)

            if s_next == 8:
                break
            s = s_next

        return [s_a_history, Q]

if __name__ == '__main__':
    agent = Agent()
    pi_0 = agent.simple_convert_into_pi_from_theta(agent.theta_0)     # initial policy
    # Run one episode to the goal, updating Q along the way.
    s_a_history, Q = agent.goal_maze(agent.Q, epsilon=0.5, eta=0.1, gamma=0.9, pi=pi_0)
    print(s_a_history)
    print(Q)


出力

エージェントの移動した道と行動の記録
[[0, 1], [1, 1], [2, 2], [5, 0], [2, 2], [5, 0], [2, 3], [1, 1], [2, 2], [5, 0], [2, 2], [5, 0], [2, 3], [1, 1], [2, 2], [5, 0], [2, 2], [5, 0], [2, 2], [5, 0], [2, 2], [5, 0], [2, 2], [5, 0], [2, 3], [1, 3], [0, 1], [1, 3], [0, 2], [3, 2], [6, 0], [3, 2], [6, 0], [3, 0], [0, 2], [3, 2], [6, 0], [3, 0], [0, 2], [3, 2], [6, 0], [3, 2], [6, 0], [3, 0], [0, 2], [3, 2], [6, 0], [3, 2], [6, 0], [3, 2], [6, 0], [3, 2], [6, 0], [3, 2], [6, 0], [3, 1], [4, 2], [7, 0], [4, 2], [7, 1], [8, nan]]

[[       nan 0.22865948 0.36509093        nan]
 [       nan 0.6474626         nan 0.51989819]
 [       nan        nan 0.57864166 0.27887054]
 [0.7705546  0.76207574 0.83870212        nan]
 [       nan        nan 0.16012168 0.10271835]
 [0.6193851         nan        nan        nan]
 [0.81659541        nan        nan        nan]
 [0.22493677 0.66579869        nan        nan]]

ここで注意したいのは、各要素は確率ではなく価値を表していること。

学習とその結果の記録

maze_rl_learning_test.py

"""
迷路探索問題で強化学習を学ぶ
"""
from maze_rl_map import MAZE      # 作成した迷路をモジュール化しているためインポート
from maze_rl_agent_Q import Agent      # 作成した迷路をモジュール化しているためインポート
import numpy as np


### Solve the maze with Q-learning ###########

agent = Agent()

# Initial action-value table and the random policy used for exploration.
Q = agent.Q
pi_0 = agent.simple_convert_into_pi_from_theta(theta=agent.theta_0)

# Hyper-parameters
eta = 0.1           # learning rate
gamma = 0.9         # time-discount rate
epsilon = 0.5       # exploration rate of the epsilon-greedy policy (halved every episode)
n_episodes = 100    # number of training episodes
v = np.nanmax(Q, axis=1)    # per-state maximum action value

for episode in range(1, n_episodes + 1):
    print(f"エピソード：{episode}")

    # Shrink epsilon so that the agent gradually stops exploring.
    epsilon /= 2

    # Run one episode; Q-learning updates Q at every step.
    s_a_history, Q = agent.goal_maze(Q, epsilon, eta, gamma, pi_0)

    # Report how much the per-state maximum value moved (monitoring only,
    # it is not used for learning).
    new_v = np.nanmax(Q, axis=1)
    print(f"状態価値の変化：{np.sum(np.abs(new_v - v)):.5f}")
    v = new_v
    print(f"迷路を解くのにかかったステップ数：{len(s_a_history) - 1}")

# print(Q)

# Record the path of the last episode as a GIF.
from maze_rl_gif import GIF
maze = MAZE()
state_history = [s_a[0] for s_a in s_a_history]
# print(state_history)
print(f"学習回数（エピソード）：{episode}")
gif = GIF(maze.fig, maze.set_start(), state_history)

gif.create("maze_learning_Q.gif")

出力

更新された最終の価値関数Q

[[       nan 0.20476518 0.72625014        nan]
 [       nan 0.20489772        nan 0.20393756]
 [       nan        nan 0.20477496 0.20315207]
 [0.2108445  0.8093074  0.18321797        nan]
 [       nan        nan 0.89988575 0.21384864]
 [0.20411111        nan        nan        nan]
 [0.35434513        nan        nan        nan]
 [0.64060359 0.9999908         nan        nan]]

かなり価値関数Qが更新されている。
結果的には、4ステップでゴールできる。

結果

価値関数Q

[[       nan 0.20476518 0.72625014        nan]
 [       nan 0.20489772        nan 0.20393756]
 [       nan        nan 0.20477496 0.20315207]
 [0.2108445  0.8093074  0.18321797        nan]
 [       nan        nan 0.89988575 0.21384864]
 [0.20411111        nan        nan        nan]
 [0.35434513        nan        nan        nan]
 [0.64060359 0.9999908         nan        nan]]

上の行から順にS0, S1, S2, ..., S7を表し、各列は行動「上(↑), 右(→), 下(↓), 左(←) 」の価値を表している。

考察

状態S0から順に価値の高いものを追いかけると、わかりやすい。
まず、S0。S0においては、下（↓）が0.726であり、次の遷移はS3となる。S3においては、右（→）が0.809であり、次の遷移はS4となる。S4においては、下（↓）が0.899であり、次の遷移はS7となる。S7においては、右（→）が0.999であり、ゴールする。

0.7以上の価値を含む状態は

S0 → S3 → S4 → S7

感想

価値反復法について整理した。前回学んだ方策反復法とは異なるアルゴリズムだが、方策を学習させるか、価値を学習させるのかの違いであって、プログラムからもアルゴリズムからもわかるように非常に異なるものというわけでもなく、考え方は似ているような印象を受けた。そうとらえると、何となく何がしたいのか、何をさせようとしているのかが見えてきたように思える。

次は、倒立振り子の問題でもう一度Q学習の理解を深めていきたい。
そのあとにはDQNに触れながら、いよいよ深層強化学習に入っていく。そのあたりで、ロボットに深層強化学習を適用することを考えている。

参考文献

「作りながら学ぶ深層強化学習 PyTorchによる実践プログラミング」
小川雄太郎　著　　マイナビ出版

「現場で使える！Python 深層強化学習入門　強化学習と深層強化学習による探索と制御」
伊藤多一、今津善充、須藤広大、仁ノ平将人、川崎悠介、酒井　裕企、魏崇哲　著　　翔泳社出版

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up