Finding the Minimum of a Quadratic Function with Q-Learning
import numpy as np
import matplotlib.pyplot as plt
import random
# ============================================
# Define the coefficients of the quadratic function
# f(x) = ax^2 + bx + c
# ============================================
a = 1.0
b = -6.0
c = 8.0
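# For these coefficients, f(x) = x^2 - 6x + 8 has its minimum at x = -b / (2a) = 3.0
# with f(3) = -1, which serves as the ground truth for this example.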
# Define the quadratic function
def quadratic_function(x):
    return a * x**2 + b * x + c
# ============================================
# Define the range and resolution of the state space
# ============================================
x_min = -10         # minimum x value
x_max = 10          # maximum x value
x_resolution = 0.1  # step size of x
# Discretize x into states
states = np.arange(x_min, x_max + x_resolution, x_resolution)
n_states = len(states)  # number of states
# ============================================
# Define the action space
# Move left (-1), stay (0), move right (+1)
# ============================================
actions = [-1, 0, 1]
n_actions = len(actions)
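# Each action shifts the discretized state index by -1, 0, or +1, i.e. moves x
# one step of x_resolution to the left, keeps it in place, or moves it one step to the right.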
# ============================================
# Initialize the Q-table (states × actions)
# ============================================
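# The table has one row per discretized x value and one column per action;
# all entries start at zero.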
Q = np.zeros((n_states, n_actions))
# ============================================
# Hyperparameters for Q-learning
# ============================================
alpha = 0.1        # learning rate
gamma = 0.9        # discount factor
epsilon = 0.1      # exploration rate
n_episodes = 3000  # number of episodes
# ============================================
# Lists for tracking learning progress
# ============================================
history = []        # per-episode best result: (episode, best_x, f(best_x), total_reward)
total_rewards = []  # total reward per episode
# ============================================
# Q-learning training loop
# ============================================
for episode in range(n_episodes):
    # Randomly choose the initial state
    state_idx = random.randint(0, n_states - 1)
    total_reward = 0  # initialize cumulative reward
    for step in range(100):
        # ε-greedy action selection (exploration vs. exploitation)
        if random.random() < epsilon:
            action_idx = random.randint(0, n_actions - 1)  # explore
        else:
            action_idx = np.argmax(Q[state_idx])  # exploit
        # Compute the next state index
        next_state_idx = state_idx + actions[action_idx]
        if next_state_idx < 0 or next_state_idx >= n_states:
            next_state_idx = state_idx  # stay in place at the boundaries
        # Get the current and next x values
        x = states[state_idx]
        next_x = states[next_state_idx]
        # The reward is -f(next_x), so minimizing f(x) maximizes reward
        reward = -quadratic_function(next_x)
        total_reward += reward  # accumulate reward
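        # Tabular Q-learning update rule applied below:
        #   Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))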
        # Q-value update (Bellman equation)
        Q[state_idx, action_idx] += alpha * (
            reward + gamma * np.max(Q[next_state_idx]) - Q[state_idx, action_idx]
        )
        # Move to the next state
        state_idx = next_state_idx
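    # The state with the largest learned value is where the greedy policy settles;
    # since the reward is -f(x), that state corresponds to the smallest f(x).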
    # Record the best x so far and its function value
    best_idx = np.argmax(np.max(Q, axis=1))
    best_x = states[best_idx]
    best_y = quadratic_function(best_x)
    history.append((episode, best_x, best_y, total_reward))
    total_rewards.append(total_reward)
    # Show progress every 100 episodes
    if (episode + 1) % 100 == 0 or episode == 0:
        print(f"Episode {episode + 1}/{n_episodes}")
        print(f" → Best x so far: {best_x:.4f}")
        print(f" → f(x): {best_y:.4f}")
        print(f" → Total reward in this episode: {total_reward:.2f}")
        print("-" * 40)
# ============================================
# Show the final result
# ============================================
best_state_idx = np.argmax(np.max(Q, axis=1))
best_x = states[best_state_idx]
best_y = quadratic_function(best_x)
print("\nFinal Result")
print(f"x = {best_x:.4f} gives minimum f(x) = {best_y:.4f}")
# ============================================
# Plot the function and the minimum that was found
# ============================================
x_vals = np.linspace(x_min, x_max, 500)
y_vals = quadratic_function(x_vals)
plt.figure(figsize=(10, 5))
plt.plot(x_vals, y_vals, label="f(x) = ax² + bx + c")
plt.plot(best_x, best_y, 'ro', label="Minimum (via RL)")
plt.xlabel("x")
plt.ylabel("f(x)")
plt.title("Quadratic Function and Minimum Found by Reinforcement Learning")
plt.grid(True)
plt.legend()
plt.show()
# ============================================
# Plot the learning curve of total reward
# ============================================
plt.figure(figsize=(10, 4))
plt.plot(total_rewards)
plt.xlabel("Episode")
plt.ylabel("Total Reward")
plt.title("Learning Curve: Total Reward per Episode")
plt.grid(True)
plt.show()
Results