Finding the Minimum of a Quadratic Function with Q-Learning
import numpy as np
import matplotlib.pyplot as plt
import random
# ============================================
# Define the coefficients of the quadratic function
# f(x) = ax^2 + bx + c
# ============================================
a = 1.0
b = -6.0
c = 8.0
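# For these coefficients, f(x) = x^2 - 6x + 8 has its minimum at x = -b / (2a) = 3.0
# with f(3) = -1, which serves as the ground truth for this example.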
# Define the quadratic function
def quadratic_function(x):
    return a * x**2 + b * x + c
# ============================================
# Define the range and resolution of the state space
# ============================================
x_min = -10         # minimum x value
x_max = 10          # maximum x value
x_resolution = 0.1  # step size of x
# Discretize x into states
states = np.arange(x_min, x_max + x_resolution, x_resolution)
n_states = len(states)  # number of states
# ============================================
# Define the action space
# Move left (-1), stay (0), move right (+1)
# ============================================
actions = [-1, 0, 1]
n_actions = len(actions)
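# Each action shifts the discretized state index by -1, 0, or +1, i.e. moves x
# one step of x_resolution to the left, keeps it in place, or moves it one step to the right.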
# ============================================
# Initialize the Q-table (states × actions)
# ============================================
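# The table has one row per discretized x value and one column per action;
# all entries start at zero.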
Q = np.zeros((n_states, n_actions))
# ============================================
# Hyperparameters for Q-learning
# ============================================
alpha = 0.1        # learning rate
gamma = 0.9        # discount factor
epsilon = 0.1      # exploration rate
n_episodes = 3000  # number of episodes
# ============================================
# Lists for tracking learning progress
# ============================================
history = []        # per-episode best result: (episode, best_x, f(best_x), total_reward)
total_rewards = []  # total reward per episode
# ============================================
# Q-learning training loop
# ============================================
for episode in range(n_episodes):
    # Randomly choose the initial state
    state_idx = random.randint(0, n_states - 1)
    total_reward = 0  # initialize cumulative reward
    for step in range(100):
        # ε-greedy action selection (exploration vs. exploitation)
        if random.random() < epsilon:
            action_idx = random.randint(0, n_actions - 1)  # explore
        else:
            action_idx = np.argmax(Q[state_idx])  # exploit
        # Compute the next state index
        next_state_idx = state_idx + actions[action_idx]
        if next_state_idx < 0 or next_state_idx >= n_states:
            next_state_idx = state_idx  # stay in place at the boundaries
        # Get the current and next x values
        x = states[state_idx]
        next_x = states[next_state_idx]
        # The reward is -f(next_x), so minimizing f(x) maximizes reward
        reward = -quadratic_function(next_x)
        total_reward += reward  # accumulate reward
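        # Tabular Q-learning update rule applied below:
        #   Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))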
        # Q-value update (Bellman equation)
        Q[state_idx, action_idx] += alpha * (
            reward + gamma * np.max(Q[next_state_idx]) - Q[state_idx, action_idx]
        )
        # Move to the next state
        state_idx = next_state_idx
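    # The state with the largest learned value is where the greedy policy settles;
    # since the reward is -f(x), that state corresponds to the smallest f(x).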
    # Record the best x so far and its function value
    best_idx = np.argmax(np.max(Q, axis=1))
    best_x = states[best_idx]
    best_y = quadratic_function(best_x)
    history.append((episode, best_x, best_y, total_reward))
    total_rewards.append(total_reward)
    # Show progress every 100 episodes
    if (episode + 1) % 100 == 0 or episode == 0:
        print(f"Episode {episode + 1}/{n_episodes}")
        print(f" → Best x so far: {best_x:.4f}")
        print(f" → f(x): {best_y:.4f}")
        print(f" → Total reward in this episode: {total_reward:.2f}")
        print("-" * 40)
# ============================================
# Show the final result
# ============================================
best_state_idx = np.argmax(np.max(Q, axis=1))
best_x = states[best_state_idx]
best_y = quadratic_function(best_x)
print("\nFinal Result")
print(f"x = {best_x:.4f} gives minimum f(x) = {best_y:.4f}")
# ============================================
# Plot the function and the minimum that was found
# ============================================
x_vals = np.linspace(x_min, x_max, 500)
y_vals = quadratic_function(x_vals)
plt.figure(figsize=(10, 5))
plt.plot(x_vals, y_vals, label="f(x) = ax² + bx + c")
plt.plot(best_x, best_y, 'ro', label="Minimum (via RL)")
plt.xlabel("x")
plt.ylabel("f(x)")
plt.title("Quadratic Function and Minimum Found by Reinforcement Learning")
plt.grid(True)
plt.legend()
plt.show()
# ============================================
# Plot the learning curve of total reward
# ============================================
plt.figure(figsize=(10, 4))
plt.plot(total_rewards)
plt.xlabel("Episode")
plt.ylabel("Total Reward")
plt.title("Learning Curve: Total Reward per Episode")
plt.grid(True)
plt.show()
Results