        for step in range(env.max_steps):
            # Epsilon-greedy exploration: with probability epsilon take a
            # random joint action, otherwise act greedily from the policy net.
            if random.random() < epsilon:
                action = np.random.uniform(-1, 1, env.n_joints)
            else:
                with torch.no_grad():
                    state_tensor = torch.FloatTensor(state).unsqueeze(0)
                    # squeeze(0) rather than squeeze(): a bare squeeze() would
                    # also drop the action dimension when n_joints == 1.
                    action = policy_net(state_tensor).squeeze(0).numpy()

            # Step the environment and store the transition for replay.
            next_state, reward, done, info = env.step(action)
            replay_buffer.append((state, action, reward, next_state, done))
            episode_reward += reward
            state = next_state
            # Learn from a random minibatch once the buffer has enough samples.
            if len(replay_buffer) >= batch_size:
                batch = random.sample(replay_buffer, batch_size)
                # Stack into a single ndarray first; building a tensor from a
                # list of arrays is slow and warns in recent PyTorch versions.
                states = torch.FloatTensor(np.array([b[0] for b in batch]))
                actions = torch.FloatTensor(np.array([b[1] for b in batch]))
                rewards = torch.FloatTensor(np.array([b[2] for b in batch]))
                next_states = torch.FloatTensor(np.array([b[3] for b in batch]))
                dones = torch.BoolTensor([b[4] for b in batch])

                current_q = policy_net(states)
                next_q = target_net(next_states).detach()
                # Bootstrapped target; (~dones) zeroes the bootstrap term on
                # terminal transitions so no value leaks past episode ends.
                target_q = rewards.unsqueeze(1) + gamma * next_q * (~dones).float().unsqueeze(1)

                loss = nn.MSELoss()(current_q, target_q)
                optimizer.zero_grad()
                loss.backward()
                # Clip the gradient norm to stabilize updates.
                nn.utils.clip_grad_norm_(policy_net.parameters(), 1.0)
                optimizer.step()

                # Soft (Polyak) target update: theta_tgt <- tau*theta + (1 - tau)*theta_tgt
                for tp, pp in zip(target_net.parameters(), policy_net.parameters()):
                    tp.data.copy_(tau * pp.data + (1 - tau) * tp.data)
            if done:
                # Count the episode as a success when the arm ends within
                # the 0.01 distance threshold of the target.
                if info['distance_to_target'] < 0.01:
                    successes += 1
                break
        # Per-episode bookkeeping: decay exploration toward a 0.01 floor,
        # then log progress every 50 episodes.
        epsilon = max(0.01, epsilon * 0.995)
        history['episode_rewards'].append(episode_reward)
        if (episode + 1) % 50 == 0:
            success_rate = successes / (episode + 1)
            history['success_rate'].append(success_rate)
            print(f"Episode {episode + 1}: Reward={episode_reward:.2f}, "
                  f"Success Rate={success_rate:.2%}")

    return history
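
For reference, here is a minimal sketch of the context this loop assumes. Everything in it is an assumption for illustration: the environment attributes (`env.observation_dim`, `env.n_joints`, `env.reset()`), the network shapes, and the hyperparameter values are placeholders, not taken from the original setup.

```python
import random
from collections import deque

import numpy as np
import torch
import torch.nn as nn

# Hypothetical context for the loop above; sizes and values are illustrative.
state_dim, n_joints = env.observation_dim, env.n_joints  # assumed env attributes
policy_net = nn.Sequential(nn.Linear(state_dim, 128), nn.ReLU(),
                           nn.Linear(128, n_joints), nn.Tanh())  # actions in [-1, 1]
target_net = nn.Sequential(nn.Linear(state_dim, 128), nn.ReLU(),
                           nn.Linear(128, n_joints), nn.Tanh())
target_net.load_state_dict(policy_net.state_dict())  # start the nets in sync

optimizer = torch.optim.Adam(policy_net.parameters(), lr=1e-3)
replay_buffer = deque(maxlen=100_000)  # oldest transitions are evicted first
batch_size, gamma, tau, epsilon = 64, 0.99, 0.005, 1.0
successes = 0
history = {'episode_rewards': [], 'success_rate': []}

for episode in range(1000):
    state = env.reset()
    episode_reward = 0.0
    # ... the step loop shown above runs here ...
```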
---