
Advantage Actor Critic : A2C-CartPole, an improved version of a GitHub tutorial

Posted at 2019-11-10

Advantage Actor Critic

The code in the notebook below doesn't work at all.
3_A2C-nstep-TUTORIAL.ipynb

So I improved it myself.
Google Colaboratory

a2c.py
import gym
import os
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.autograd as autograd
from torch.autograd import Variable

a2c.py
torch.__version__

'1.3.0+cu100'

a2c.py
def calc_actual_state_values(rewards, dones):
    # Note: `states`, `model`, and GAMMA are read from the enclosing globals.
    R = []
    rewards.reverse()

    # If we happen to end the set on a terminal state, set next return to zero
    if dones[-1] == True: next_return = 0

    # If not terminal state, bootstrap v(s) using our critic
    # TODO: don't need to estimate again, just take from last value of v(s) estimates
    else: 
        s = torch.from_numpy(states[-1]).float().unsqueeze(0)
        next_return = model.get_state_value(Variable(s)).data[0][0] 

    # Backup from last state to calculate "true" returns for each state in the set
    R.append(next_return)
    dones.reverse()
    for r in range(1, len(rewards)):
        if not dones[r]: this_return = rewards[r] + next_return * GAMMA
        else: this_return = 0
        R.append(this_return)
        next_return = this_return

    R.reverse()
    state_values_true = Variable(torch.FloatTensor(R)).unsqueeze(1)

    return state_values_true
a2c.py
def reflect(states, actions, rewards, dones):

    # Calculating the ground truth "labels" as described above
    state_values_true = calc_actual_state_values(rewards, dones)

    s = Variable(torch.FloatTensor(states))
    action_probs, state_values_est = model.evaluate_actions(s)
    action_log_probs = action_probs.log() 

    a = Variable(torch.LongTensor(actions).view(-1,1))
    chosen_action_log_probs = action_log_probs.gather(1, a)

    # This is also the TD error
    advantages = state_values_true - state_values_est

    entropy = (action_probs * action_log_probs).sum(1).mean()
    action_gain = (chosen_action_log_probs * advantages).mean()
    value_loss = advantages.pow(2).mean()
    total_loss = value_loss - action_gain - 0.0001*entropy

    optimizer.zero_grad()
    total_loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), 0.5)
    optimizer.step()
a2c.py
class ActorCritic(nn.Module):
    def __init__(self):
        super(ActorCritic, self).__init__()
        self.linear1 = nn.Linear(N_INPUTS, 64)
        self.linear2 = nn.Linear(64, 128)
        self.linear3 = nn.Linear(128, 64)

        self.actor = nn.Linear(64, N_ACTIONS)
        self.critic = nn.Linear(64, 1)

    # In a PyTorch model, you only have to define the forward pass. PyTorch computes the backwards pass for you!
    def forward(self, x):
        x = self.linear1(x)
        x = F.relu(x)
        x = self.linear2(x)
        x = F.relu(x)
        x = self.linear3(x)
        x = F.relu(x) 
        return x

    # Only the Actor head
    def get_action_probs(self, x):
        x = self(x)
        action_probs = F.softmax(self.actor(x), dim=1)
        return action_probs

    # Only the Critic head
    def get_state_value(self, x):
        x = self(x)
        state_value = self.critic(x)
        return state_value

    # Both heads
    def evaluate_actions(self, x):
        x = self(x)
        action_probs = F.softmax(self.actor(x), dim=1)
        state_values = self.critic(x)
        return action_probs, state_values  
a2c.py
def test_model(model):
    score = 0
    done = False
    env = gym.make('CartPole-v0')
    state = env.reset()
    global action_probs
    while not done:
        score += 1
        print("score", score)
        s = torch.from_numpy(state).float().unsqueeze(0)
        print("s", s)

        action_probs = model.get_action_probs(Variable(s))
        print("action_probs", action_probs)

        _, action_index = action_probs.max(1)
        action = action_index.data[0].tolist()
        print("action", action)

        next_state, reward, done, _ = env.step(action)
        print("next_state", next_state)
        print("reward", reward)
        state = next_state
    return score
a2c.py
# Discount factor. Model is not very sensitive to this value.
GAMMA = .95

# LR of 3e-2 explodes the gradients, LR of 3e-4 trains slower
LR = 3e-3
N_GAMES = 2000

# OpenAI baselines uses nstep of 5.
N_STEPS = 20

env = gym.make('CartPole-v0')
N_ACTIONS = env.action_space.n             # 2 for CartPole
N_INPUTS = env.observation_space.shape[0]  # 4 for CartPole

model = ActorCritic()
optimizer = optim.Adam(model.parameters(), lr=LR)    
a2c.py
state = env.reset()
finished_games = 0

while finished_games < N_GAMES:
    states, actions, rewards, dones = [], [], [], []

    # Gather training data
    for i in range(N_STEPS):
        s = Variable(torch.from_numpy(state).float().unsqueeze(0))
        # print(s)
        action_probs = model.get_action_probs(s)
        # action = action_probs.multinomial().data[0][0]
        action = action_probs.multinomial(1).data[0][0].tolist()
        # print("action",action)
        next_state, reward, done, _ = env.step(action)

        states.append(state); actions.append(action); rewards.append(reward); dones.append(done)

        if done: state = env.reset(); finished_games += 1
        else: state = next_state

    # Reflect on training data
    reflect(states, actions, rewards, dones)
a2c.py
test_model(model)

Result
score 1
s tensor([[-0.0172, -0.0327, 0.0452, -0.0357]])
action_probs tensor([[0.0140, 0.9860]], grad_fn=)
action 1
next_state [-0.01788058 0.16175133 0.0444981 -0.31380265]
reward 1.0
score 2
s tensor([[-0.0179, 0.1618, 0.0445, -0.3138]])
action_probs tensor([[0.9913, 0.0087]], grad_fn=)
action 0
next_state [-0.01464556 -0.03397533 0.03822205 -0.00742515]
reward 1.0
score 3
s tensor([[-0.0146, -0.0340, 0.0382, -0.0074]])
action_probs tensor([[0.0140, 0.9860]], grad_fn=)
action 1
next_state [-0.01532506 0.16057821 0.03807354 -0.28780777]
reward 1.0
score 4
s tensor([[-0.0153, 0.1606, 0.0381, -0.2878]])
action_probs tensor([[0.9918, 0.0082]], grad_fn=)
action 0
next_state [-0.0121135 -0.03506543 0.03231739 0.01663593]
reward 1.0
score 5
s tensor([[-0.0121, -0.0351, 0.0323, 0.0166]])
action_probs tensor([[0.0140, 0.9860]], grad_fn=)
action 1
next_state [-0.01281481 0.15957851 0.03265011 -0.26567786]
reward 1.0
score 6
s tensor([[-0.0128, 0.1596, 0.0327, -0.2657]])
action_probs tensor([[0.9921, 0.0079]], grad_fn=)
action 0
next_state [-0.00962324 -0.03599386 0.02733655 0.03712177]
reward 1.0
score 7
s tensor([[-0.0096, -0.0360, 0.0273, 0.0371]])
action_probs tensor([[0.0140, 0.9860]], grad_fn=)
action 1
next_state [-0.01034312 0.15872563 0.02807899 -0.2468124 ]
reward 1.0
score 8
s tensor([[-0.0103, 0.1587, 0.0281, -0.2468]])
action_probs tensor([[0.9924, 0.0076]], grad_fn=)
action 0
next_state [-0.0071686 -0.03678586 0.02314274 0.05459337]
reward 1.0
score 9
s tensor([[-0.0072, -0.0368, 0.0231, 0.0546]])
action_probs tensor([[0.0140, 0.9860]], grad_fn=)
action 1
next_state [-0.00790432 0.15799674 0.0242346 -0.23069894]
reward 1.0
score 10
s tensor([[-0.0079, 0.1580, 0.0242, -0.2307]])
action_probs tensor([[0.9926, 0.0074]], grad_fn=)
action 0
next_state [-0.00474439 -0.03746298 0.01962063 0.06952882]
reward 1.0
score 11
s tensor([[-0.0047, -0.0375, 0.0196, 0.0695]])
action_probs tensor([[0.0140, 0.9860]], grad_fn=)
action 1
next_state [-0.00549364 0.15737226 0.0210112 -0.21689969]
reward 1.0
score 12
s tensor([[-0.0055, 0.1574, 0.0210, -0.2169]])
action_probs tensor([[0.9927, 0.0073]], grad_fn=)
action 0
next_state [-0.0023462 -0.03804365 0.01667321 0.08233632]
reward 1.0
score 13
s tensor([[-0.0023, -0.0380, 0.0167, 0.0823]])
action_probs tensor([[0.0140, 0.9860]], grad_fn=)
action 1
next_state [-0.00310707 0.15683537 0.01831994 -0.20503989]
reward 1.0
score 14
s tensor([[-0.0031, 0.1568, 0.0183, -0.2050]])
action_probs tensor([[0.9929, 0.0071]], grad_fn=)
action 0
next_state [ 2.96351055e-05 -3.85437078e-02 1.42191373e-02 9.33653117e-02]
reward 1.0
score 15
s tensor([[ 2.9635e-05, -3.8544e-02, 1.4219e-02, 9.3365e-02]])
action_probs tensor([[0.0140, 0.9860]], grad_fn=)
action 1
next_state [-0.00074124 0.15637159 0.01608644 -0.19479779]
reward 1.0
score 16
s tensor([[-0.0007, 0.1564, 0.0161, -0.1948]])
action_probs tensor([[0.9929, 0.0071]], grad_fn=)
action 0
next_state [ 0.00238619 -0.03897674 0.01219049 0.10291599]
reward 1.0
score 17
s tensor([[ 0.0024, -0.0390, 0.0122, 0.1029]])
action_probs tensor([[0.0140, 0.9860]], grad_fn=)
action 1
next_state [ 0.00160666 0.15596841 0.01424881 -0.18589608]
reward 1.0
score 18
s tensor([[ 0.0016, 0.1560, 0.0142, -0.1859]])
action_probs tensor([[0.9930, 0.0070]], grad_fn=)
action 0
next_state [ 0.00472603 -0.03935448 0.01053089 0.11124752]
reward 1.0
score 19
s tensor([[ 0.0047, -0.0394, 0.0105, 0.1112]])
action_probs tensor([[0.0140, 0.9860]], grad_fn=)
action 1
next_state [ 0.00393894 0.155615 0.01275584 -0.17809446]
reward 1.0
score 20
s tensor([[ 0.0039, 0.1556, 0.0128, -0.1781]])
action_probs tensor([[0.9930, 0.0070]], grad_fn=)
action 0
next_state [ 0.00705124 -0.03968715 0.00919395 0.11858504]
reward 1.0
score 21
s tensor([[ 0.0071, -0.0397, 0.0092, 0.1186]])
action_probs tensor([[0.0140, 0.9860]], grad_fn=)
action 1
next_state [ 0.00625749 0.15530187 0.01156565 -0.17118315]
reward 1.0
score 22
s tensor([[ 0.0063, 0.1553, 0.0116, -0.1712]])
action_probs tensor([[0.9930, 0.0070]], grad_fn=)
action 0
next_state [ 0.00936353 -0.03998369 0.00814199 0.12512583]
reward 1.0
score 23
s tensor([[ 0.0094, -0.0400, 0.0081, 0.1251]])
action_probs tensor([[0.0140, 0.9860]], grad_fn=)
action 1
next_state [ 0.00856386 0.15502068 0.0106445 -0.16497731]
reward 1.0
score 24
s tensor([[ 0.0086, 0.1550, 0.0106, -0.1650]])
action_probs tensor([[0.9930, 0.0070]], grad_fn=)
action 0
next_state [ 0.01166427 -0.04025202 0.00734496 0.13104457]
reward 1.0
score 25
s tensor([[ 0.0117, -0.0403, 0.0073, 0.1310]])
action_probs tensor([[0.0140, 0.9860]], grad_fn=)
action 1
next_state [ 0.01085923 0.15476395 0.00996585 -0.1593121 ]
reward 1.0
score 26
s tensor([[ 0.0109, 0.1548, 0.0100, -0.1593]])
action_probs tensor([[0.9929, 0.0071]], grad_fn=)
action 0
next_state [ 0.01395451 -0.04049925 0.0067796 0.13649806]
reward 1.0
score 27
s tensor([[ 0.0140, -0.0405, 0.0068, 0.1365]])
action_probs tensor([[0.0140, 0.9860]], grad_fn=)
action 1
next_state [ 0.01314452 0.15452494 0.00950957 -0.15403831]
reward 1.0
score 28
s tensor([[ 0.0131, 0.1545, 0.0095, -0.1540]])
action_probs tensor([[0.9928, 0.0072]], grad_fn=)
action 0
next_state [ 0.01623502 -0.04073187 0.0064288 0.14162943]
reward 1.0
score 29
s tensor([[ 0.0162, -0.0407, 0.0064, 0.1416]])
action_probs tensor([[0.0140, 0.9860]], grad_fn=)
action 1
next_state [ 0.01542039 0.15429742 0.00926139 -0.1490184 ]
reward 1.0
score 30
s tensor([[ 0.0154, 0.1543, 0.0093, -0.1490]])
action_probs tensor([[0.9926, 0.0074]], grad_fn=)
action 0
next_state [ 0.01850633 -0.04095592 0.00628102 0.14657185]
reward 1.0
score 31
s tensor([[ 0.0185, -0.0410, 0.0063, 0.1466]])
action_probs tensor([[0.0140, 0.9860]], grad_fn=)
action 1
next_state [ 0.01768722 0.15407553 0.00921246 -0.14412293]
reward 1.0
score 32
s tensor([[ 0.0177, 0.1541, 0.0092, -0.1441]])
action_probs tensor([[0.9924, 0.0076]], grad_fn=)
action 0
next_state [ 0.02076873 -0.04117714 0.00633 0.15145206]
reward 1.0
score 33
s tensor([[ 0.0208, -0.0412, 0.0063, 0.1515]])
action_probs tensor([[0.0140, 0.9860]], grad_fn=)
action 1
next_state [ 0.01994518 0.1538536 0.00935904 -0.13922719]
reward 1.0
score 34
s tensor([[ 0.0199, 0.1539, 0.0094, -0.1392]])
action_probs tensor([[0.9921, 0.0079]], grad_fn=)
action 0
next_state [ 0.02302226 -0.04140113 0.0065745 0.15639361]
reward 1.0
score 35
s tensor([[ 0.0230, -0.0414, 0.0066, 0.1564]])
action_probs tensor([[0.0140, 0.9860]], grad_fn=)
action 1
next_state [ 0.02219423 0.15362607 0.00970237 -0.13420799]
reward 1.0
score 36
s tensor([[ 0.0222, 0.1536, 0.0097, -0.1342]])
action_probs tensor([[0.9918, 0.0082]], grad_fn=)
action 0
next_state [ 0.02526675 -0.0416335 0.00701821 0.16152004]
reward 1.0
score 37
s tensor([[ 0.0253, -0.0416, 0.0070, 0.1615]])
action_probs tensor([[0.0140, 0.9860]], grad_fn=)
action 1
next_state [ 0.02443408 0.15338728 0.01024861 -0.12894059]
reward 1.0
score 38
s tensor([[ 0.0244, 0.1534, 0.0102, -0.1289]])
action_probs tensor([[0.9913, 0.0087]], grad_fn=)
action 0
next_state [ 0.02750183 -0.04187997 0.0076698 0.16695795]
reward 1.0
score 39
s tensor([[ 0.0275, -0.0419, 0.0077, 0.1670]])
action_probs tensor([[0.0140, 0.9860]], grad_fn=)
action 1
next_state [ 0.02666423 0.15313135 0.01100896 -0.12329554]
reward 1.0
score 40
s tensor([[ 0.0267, 0.1531, 0.0110, -0.1233]])
action_probs tensor([[0.9907, 0.0093]], grad_fn=)
action 0
next_state [ 0.02972686 -0.04214657 0.00854305 0.17284017]
reward 1.0
score 41
s tensor([[ 0.0297, -0.0421, 0.0085, 0.1728]])
action_probs tensor([[0.0140, 0.9860]], grad_fn=)
action 1
next_state [ 0.02888393 0.15285207 0.01199985 -0.1171355 ]
reward 1.0
score 42
s tensor([[ 0.0289, 0.1529, 0.0120, -0.1171]])
action_probs tensor([[0.9900, 0.0100]], grad_fn=)
action 0
next_state [ 0.03194097 -0.04243974 0.00965714 0.179309 ]
reward 1.0
score 43
s tensor([[ 0.0319, -0.0424, 0.0097, 0.1793]])
action_probs tensor([[0.0140, 0.9860]], grad_fn=)
action 1
next_state [ 0.03109217 0.15254269 0.01324332 -0.11031186]
reward 1.0
score 44
s tensor([[ 0.0311, 0.1525, 0.0132, -0.1103]])
action_probs tensor([[0.9891, 0.0109]], grad_fn=)
action 0
next_state [ 0.03414303 -0.0427665 0.01103708 0.18651966]
reward 1.0
score 45
s tensor([[ 0.0341, -0.0428, 0.0110, 0.1865]])
action_probs tensor([[0.0140, 0.9860]], grad_fn=)
action 1
next_state [ 0.0332877 0.15219581 0.01476747 -0.10266116]
reward 1.0
score 46
s tensor([[ 0.0333, 0.1522, 0.0148, -0.1027]])
action_probs tensor([[0.9879, 0.0121]], grad_fn=)
action 0
next_state [ 0.03633161 -0.04313463 0.01271425 0.19464403]
reward 1.0
score 47
s tensor([[ 0.0363, -0.0431, 0.0127, 0.1946]])
action_probs tensor([[0.0140, 0.9860]], grad_fn=)
action 1
next_state [ 0.03546892 0.15180317 0.01660713 -0.09400114]
reward 1.0
score 48
s tensor([[ 0.0355, 0.1518, 0.0166, -0.0940]])
action_probs tensor([[0.9863, 0.0137]], grad_fn=)
action 0
next_state [ 0.03850498 -0.04355283 0.01472711 0.20387472]
reward 1.0
score 49
s tensor([[ 0.0385, -0.0436, 0.0147, 0.2039]])
action_probs tensor([[0.0140, 0.9860]], grad_fn=)
action 1
next_state [ 0.03763393 0.15135545 0.0188046 -0.08412638]
reward 1.0
score 50
s tensor([[ 0.0376, 0.1514, 0.0188, -0.0841]])
action_probs tensor([[0.9845, 0.0155]], grad_fn=)
action 0
next_state [ 0.04066104 -0.04403094 0.01712208 0.21442961]
reward 1.0
score 51
s tensor([[ 0.0407, -0.0440, 0.0171, 0.2144]])
action_probs tensor([[0.0140, 0.9860]], grad_fn=)
action 1
next_state [ 0.03978042 0.1508421 0.02141067 -0.07280345]
reward 1.0
score 52
s tensor([[ 0.0398, 0.1508, 0.0214, -0.0728]])
action_probs tensor([[0.9827, 0.0173]], grad_fn=)
action 0
next_state [ 0.04279726 -0.04458016 0.0199546 0.226557 ]
reward 1.0
score 53
s tensor([[ 0.0428, -0.0446, 0.0200, 0.2266]])
action_probs tensor([[0.0140, 0.9860]], grad_fn=)
action 1
next_state [ 0.04190566 0.15025102 0.02448574 -0.05976531]
reward 1.0
score 54
s tensor([[ 0.0419, 0.1503, 0.0245, -0.0598]])
action_probs tensor([[0.2231, 0.7769]], grad_fn=)
action 1
next_state [ 0.04491068 0.34501349 0.02329043 -0.34462336]
reward 1.0
score 55
s tensor([[ 0.0449, 0.3450, 0.0233, -0.3446]])
action_probs tensor([[0.9964, 0.0036]], grad_fn=)
action 0
next_state [ 0.05181095 0.14956809 0.01639797 -0.04468801]
reward 1.0
score 56
s tensor([[ 0.0518, 0.1496, 0.0164, -0.0447]])
action_probs tensor([[0.9817, 0.0183]], grad_fn=)
action 0
next_state [ 0.05480231 -0.04578511 0.01550421 0.25312319]
reward 1.0
score 57
s tensor([[ 0.0548, -0.0458, 0.0155, 0.2531]])
action_probs tensor([[0.0140, 0.9860]], grad_fn=)
action 1
next_state [ 0.05388661 0.14911206 0.02056667 -0.03462938]
reward 1.0
score 58
s tensor([[ 0.0539, 0.1491, 0.0206, -0.0346]])
action_probs tensor([[0.0140, 0.9860]], grad_fn=)
action 1
next_state [ 0.05686885 0.34393314 0.01987408 -0.32075302]
reward 1.0
score 59
s tensor([[ 0.0569, 0.3439, 0.0199, -0.3208]])
action_probs tensor([[0.9963, 0.0037]], grad_fn=)
action 0
next_state [ 0.06374751 0.14853388 0.01345902 -0.02186941]
reward 1.0
score 60
s tensor([[ 0.0637, 0.1485, 0.0135, -0.0219]])
action_probs tensor([[0.1429, 0.8571]], grad_fn=)
action 1
next_state [ 0.06671819 0.34346025 0.01302163 -0.31027566]
reward 1.0
score 61
s tensor([[ 0.0667, 0.3435, 0.0130, -0.3103]])
action_probs tensor([[0.9967, 0.0033]], grad_fn=)
action 0
next_state [ 0.07358739 0.14815522 0.00681612 -0.0135147 ]
reward 1.0
score 62
s tensor([[ 0.0736, 0.1482, 0.0068, -0.0135]])
action_probs tensor([[0.9819, 0.0181]], grad_fn=)
action 0
next_state [ 0.0765505 -0.04706382 0.00654583 0.28131098]
reward 1.0
score 63
s tensor([[ 0.0766, -0.0471, 0.0065, 0.2813]])
action_probs tensor([[0.0140, 0.9860]], grad_fn=)
action 1
next_state [ 0.07560922 0.14796416 0.01217205 -0.00930025]
reward 1.0
score 64
s tensor([[ 0.0756, 0.1480, 0.0122, -0.0093]])
action_probs tensor([[0.0140, 0.9860]], grad_fn=)
action 1
next_state [ 0.0785685 0.34290945 0.01198604 -0.29811803]
reward 1.0
score 65
s tensor([[ 0.0786, 0.3429, 0.0120, -0.2981]])
action_probs tensor([[0.9966, 0.0034]], grad_fn=)
action 0
next_state [ 0.08542669 0.14761871 0.00602368 -0.00167915]
reward 1.0
score 66
s tensor([[ 0.0854, 0.1476, 0.0060, -0.0017]])
action_probs tensor([[0.4934, 0.5066]], grad_fn=)
action 1
next_state [ 0.08837907 0.34265375 0.0059901 -0.29245547]
reward 1.0
score 67
s tensor([[ 0.0884, 0.3427, 0.0060, -0.2925]])
action_probs tensor([[0.9970, 0.0030]], grad_fn=)
action 0
next_state [9.52321425e-02 1.47446910e-01 1.40987779e-04 2.11062457e-03]
reward 1.0
score 68
s tensor([[9.5232e-02, 1.4745e-01, 1.4099e-04, 2.1106e-03]])
action_probs tensor([[0.9832, 0.0168]], grad_fn=)
action 0
next_state [ 9.81810807e-02 -4.76770629e-02 1.83200270e-04 2.94838031e-01]
reward 1.0
score 69
s tensor([[ 9.8181e-02, -4.7677e-02, 1.8320e-04, 2.9484e-01]])
action_probs tensor([[0.0140, 0.9860]], grad_fn=)
action 1
next_state [0.09722754 0.14744228 0.00607996 0.00221289]
reward 1.0
score 70
s tensor([[0.0972, 0.1474, 0.0061, 0.0022]])
action_probs tensor([[0.0794, 0.9206]], grad_fn=)
action 1
next_state [ 0.10017638 0.34247651 0.00612422 -0.28854555]
reward 1.0
score 71
s tensor([[ 0.1002, 0.3425, 0.0061, -0.2885]])
action_probs tensor([[0.9970, 0.0030]], grad_fn=)
action 0
next_state [0.10702592 0.14726776 0.00035331 0.00606259]
reward 1.0
score 72
s tensor([[0.1070, 0.1473, 0.0004, 0.0061]])
action_probs tensor([[0.9780, 0.0220]], grad_fn=)
action 0
next_state [ 0.10997127 -0.04785926 0.00047456 0.29885696]
reward 1.0
score 73
s tensor([[ 0.1100, -0.0479, 0.0005, 0.2989]])
action_probs tensor([[0.0140, 0.9860]], grad_fn=)
action 1
next_state [0.10901409 0.14725593 0.0064517 0.00632374]
reward 1.0
score 74
s tensor([[0.1090, 0.1473, 0.0065, 0.0063]])
action_probs tensor([[0.0140, 0.9860]], grad_fn=)
action 1
next_state [ 0.1119592 0.34228476 0.00657817 -0.28431663]
reward 1.0
score 75
s tensor([[ 0.1120, 0.3423, 0.0066, -0.2843]])
action_probs tensor([[0.9969, 0.0031]], grad_fn=)
action 0
next_state [0.1188049 0.14706961 0.00089184 0.01043373]
reward 1.0
score 76
s tensor([[0.1188, 0.1471, 0.0009, 0.0104]])
action_probs tensor([[0.5821, 0.4179]], grad_fn=)
action 0
next_state [ 0.12174629 -0.04806512 0.00110052 0.30339791]
reward 1.0
score 77
s tensor([[ 0.1217, -0.0481, 0.0011, 0.3034]])
action_probs tensor([[0.0140, 0.9860]], grad_fn=)
action 1
next_state [0.12078499 0.14704113 0.00716847 0.01106226]
reward 1.0
score 78
s tensor([[0.1208, 0.1470, 0.0072, 0.0111]])
action_probs tensor([[0.0140, 0.9860]], grad_fn=)
action 1
next_state [ 0.12372581 0.34205954 0.00738972 -0.27935033]
reward 1.0
score 79
s tensor([[ 0.1237, 0.3421, 0.0074, -0.2794]])
action_probs tensor([[0.9968, 0.0032]], grad_fn=)
action 0
next_state [0.130567 0.14683296 0.00180271 0.01565411]
reward 1.0
score 80
s tensor([[0.1306, 0.1468, 0.0018, 0.0157]])
action_probs tensor([[0.0140, 0.9860]], grad_fn=)
action 1
next_state [ 0.13350366 0.34192901 0.00211579 -0.2764595 ]
reward 1.0
score 81
s tensor([[ 0.1335, 0.3419, 0.0021, -0.2765]])
action_probs tensor([[0.9971, 0.0029]], grad_fn=)
action 0
next_state [ 0.14034224 0.14677694 -0.0034134 0.01689 ]
reward 1.0
score 82
s tensor([[ 0.1403, 0.1468, -0.0034, 0.0169]])
action_probs tensor([[0.9058, 0.0942]], grad_fn=)
action 0
next_state [ 0.14327778 -0.04829589 -0.0030756 0.30849401]
reward 1.0
score 83
s tensor([[ 0.1433, -0.0483, -0.0031, 0.3085]])
action_probs tensor([[0.0140, 0.9860]], grad_fn=)
action 1
next_state [0.14231186 0.14686974 0.00309428 0.01484271]
reward 1.0
score 84
s tensor([[0.1423, 0.1469, 0.0031, 0.0148]])
action_probs tensor([[0.0140, 0.9860]], grad_fn=)
action 1
next_state [ 0.14524926 0.34194718 0.00339114 -0.27686233]
reward 1.0
score 85
s tensor([[ 0.1452, 0.3419, 0.0034, -0.2769]])
action_probs tensor([[0.9970, 0.0030]], grad_fn=)
action 0
next_state [ 0.1520882 0.14677702 -0.00214611 0.01688823]
reward 1.0
score 86
s tensor([[ 0.1521, 0.1468, -0.0021, 0.0169]])
action_probs tensor([[0.4412, 0.5588]], grad_fn=)
action 1
next_state [ 0.15502374 0.34192968 -0.00180834 -0.27647105]
reward 1.0
score 87
s tensor([[ 0.1550, 0.3419, -0.0018, -0.2765]])
action_probs tensor([[0.9974, 0.0026]], grad_fn=)
action 0
next_state [ 0.16186233 0.14683357 -0.00733776 0.01564098]
reward 1.0
score 88
s tensor([[ 0.1619, 0.1468, -0.0073, 0.0156]])
action_probs tensor([[0.9837, 0.0163]], grad_fn=)
action 0
next_state [ 0.16479901 -0.04818238 -0.00702494 0.30599975]
reward 1.0
score 89
s tensor([[ 0.1648, -0.0482, -0.0070, 0.3060]])
action_probs tensor([[0.0140, 0.9860]], grad_fn=)
action 1
next_state [ 0.16383536 0.14703897 -0.00090495 0.01110964]
reward 1.0
score 90
s tensor([[ 0.1638, 0.1470, -0.0009, 0.0111]])
action_probs tensor([[0.4868, 0.5132]], grad_fn=)
action 1
next_state [ 0.16677614 0.34217389 -0.00068276 -0.28185867]
reward 1.0
score 91
s tensor([[ 0.1668, 0.3422, -0.0007, -0.2819]])
action_probs tensor([[0.9974, 0.0026]], grad_fn=)
action 0
next_state [ 0.17361962 0.14706168 -0.00631993 0.01060884]
reward 1.0
score 92
s tensor([[ 0.1736, 0.1471, -0.0063, 0.0106]])
action_probs tensor([[0.9838, 0.0162]], grad_fn=)
action 0
next_state [ 0.17656085 -0.04796907 -0.00610775 0.30129107]
reward 1.0
score 93
s tensor([[ 0.1766, -0.0480, -0.0061, 0.3013]])
action_probs tensor([[0.0140, 0.9860]], grad_fn=)
action 1
next_state [ 1.75601468e-01 1.47239403e-01 -8.19322507e-05 6.68815714e-03]
reward 1.0
score 94
s tensor([[ 1.7560e-01, 1.4724e-01, -8.1932e-05, 6.6882e-03]])
action_probs tensor([[0.5264, 0.4736]], grad_fn=)
action 0
next_state [ 1.78546256e-01 -4.78813727e-02 5.18308921e-05 2.99345232e-01]
reward 1.0
score 95
s tensor([[ 1.7855e-01, -4.7881e-02, 5.1831e-05, 2.9935e-01]])
action_probs tensor([[0.0087, 0.9913]], grad_fn=)
action 1
next_state [0.17758863 0.14723984 0.00603874 0.00667865]
reward 1.0
score 96
s tensor([[0.1776, 0.1472, 0.0060, 0.0067]])
action_probs tensor([[0.0140, 0.9860]], grad_fn=)
action 1
next_state [ 0.18053343 0.34227467 0.00617231 -0.28409288]
reward 1.0
score 97
s tensor([[ 0.1805, 0.3423, 0.0062, -0.2841]])
action_probs tensor([[0.9970, 0.0030]], grad_fn=)
action 0
next_state [0.18737892 0.14706523 0.00049045 0.01053035]
reward 1.0
score 98
s tensor([[0.1874, 0.1471, 0.0005, 0.0105]])
action_probs tensor([[0.0140, 0.9860]], grad_fn=)
action 1
next_state [ 0.19032022 0.34218014 0.00070106 -0.2819978 ]
reward 1.0
score 99
s tensor([[ 0.1903, 0.3422, 0.0007, -0.2820]])
action_probs tensor([[0.9973, 0.0027]], grad_fn=)
action 0
next_state [ 0.19716383 0.1470482 -0.0049389 0.01090616]
reward 1.0
score 100
s tensor([[ 0.1972, 0.1470, -0.0049, 0.0109]])
action_probs tensor([[0.9799, 0.0201]], grad_fn=)
action 0
next_state [ 0.20010479 -0.04800257 -0.00472077 0.30202672]
reward 1.0
score 101
s tensor([[ 0.2001, -0.0480, -0.0047, 0.3020]])
action_probs tensor([[0.0105, 0.9895]], grad_fn=)
action 1
next_state [0.19914474 0.14718634 0.00131976 0.00785871]
reward 1.0
score 102
s tensor([[0.1991, 0.1472, 0.0013, 0.0079]])
action_probs tensor([[0.0140, 0.9860]], grad_fn=)
action 1
next_state [ 0.20208847 0.34228934 0.00147693 -0.28440752]
reward 1.0
score 103
s tensor([[ 0.2021, 0.3423, 0.0015, -0.2844]])
action_probs tensor([[0.9973, 0.0027]], grad_fn=)
action 0
next_state [ 0.20893425 0.14714636 -0.00421122 0.00874085]
reward 1.0
score 104
s tensor([[ 0.2089, 0.1471, -0.0042, 0.0087]])
action_probs tensor([[0.9573, 0.0427]], grad_fn=)
action 0
next_state [ 0.21187718 -0.04791495 -0.0040364 0.30009212]
reward 1.0
score 105
s tensor([[ 0.2119, -0.0479, -0.0040, 0.3001]])
action_probs tensor([[0.0075, 0.9925]], grad_fn=)
action 1
next_state [0.21091888 0.1472643 0.00196544 0.00613893]
reward 1.0
score 106
s tensor([[0.2109, 0.1473, 0.0020, 0.0061]])
action_probs tensor([[0.0140, 0.9860]], grad_fn=)
action 1
next_state [ 0.21386417 0.34235801 0.00208822 -0.28592323]
reward 1.0
score 107
s tensor([[ 0.2139, 0.3424, 0.0021, -0.2859]])
action_probs tensor([[0.9973, 0.0027]], grad_fn=)
action 0
next_state [ 0.22071133 0.14720634 -0.00363024 0.00741758]
reward 1.0
score 108
s tensor([[ 0.2207, 0.1472, -0.0036, 0.0074]])
action_probs tensor([[0.8998, 0.1002]], grad_fn=)
action 0
next_state [ 0.22365545 -0.04786336 -0.00348189 0.29895291]
reward 1.0
score 109
s tensor([[ 0.2237, -0.0479, -0.0035, 0.2990]])
action_probs tensor([[0.0053, 0.9947]], grad_fn=)
action 1
next_state [0.22269819 0.14730805 0.00249717 0.0051739 ]
reward 1.0
score 110
s tensor([[0.2227, 0.1473, 0.0025, 0.0052]])
action_probs tensor([[0.0140, 0.9860]], grad_fn=)
action 1
next_state [ 0.22564435 0.3423941 0.00260064 -0.2867201 ]
reward 1.0
score 111
s tensor([[ 0.2256, 0.3424, 0.0026, -0.2867]])
action_probs tensor([[0.9973, 0.0027]], grad_fn=)
action 0
next_state [ 0.23249223 0.14723516 -0.00313376 0.00678191]
reward 1.0
score 112
s tensor([[ 0.2325, 0.1472, -0.0031, 0.0068]])
action_probs tensor([[0.7475, 0.2525]], grad_fn=)
action 0
next_state [ 0.23543693 -0.04784171 -0.00299812 0.29847446]
reward 1.0
score 113
s tensor([[ 0.2354, -0.0478, -0.0030, 0.2985]])
action_probs tensor([[0.0037, 0.9963]], grad_fn=)
action 1
next_state [0.2344801 0.14732285 0.00297137 0.00484749]
reward 1.0
score 114
s tensor([[0.2345, 0.1473, 0.0030, 0.0048]])
action_probs tensor([[0.0140, 0.9860]], grad_fn=)
action 1
next_state [ 0.23742656 0.34240206 0.00306832 -0.28689646]
reward 1.0
score 115
s tensor([[ 0.2374, 0.3424, 0.0031, -0.2869]])
action_probs tensor([[0.9972, 0.0028]], grad_fn=)
action 0
next_state [ 0.2442746 0.14723648 -0.00266961 0.00675261]
reward 1.0
score 116
s tensor([[ 0.2443, 0.1472, -0.0027, 0.0068]])
action_probs tensor([[0.3966, 0.6034]], grad_fn=)
action 1
next_state [ 0.24721933 0.34239662 -0.00253456 -0.28677141]
reward 1.0
score 117
s tensor([[ 0.2472, 0.3424, -0.0025, -0.2868]])
action_probs tensor([[0.9976, 0.0024]], grad_fn=)
action 0
next_state [ 0.25406726 0.14731091 -0.00826999 0.00511106]
reward 1.0
score 118
s tensor([[ 0.2541, 0.1473, -0.0083, 0.0051]])
action_probs tensor([[0.9844, 0.0156]], grad_fn=)
action 0
next_state [ 0.25701348 -0.04769147 -0.00816776 0.29517327]
reward 1.0
score 119
s tensor([[ 0.2570, -0.0477, -0.0082, 0.2952]])
action_probs tensor([[0.0076, 0.9924]], grad_fn=)
action 1
next_state [ 2.56059647e-01 1.47545964e-01 -2.26429890e-03 -7.44081029e-05]
reward 1.0
score 120
s tensor([[ 2.5606e-01, 1.4755e-01, -2.2643e-03, -7.4408e-05]])
action_probs tensor([[0.7405, 0.2595]], grad_fn=)
action 0
next_state [ 0.25901057 -0.04754344 -0.00226579 0.29189325]
reward 1.0
score 121
s tensor([[ 0.2590, -0.0475, -0.0023, 0.2919]])
action_probs tensor([[0.0024, 0.9976]], grad_fn=)
action 1
next_state [ 0.2580597 0.14761074 0.00357208 -0.00150342]
reward 1.0
score 122
s tensor([[ 0.2581, 0.1476, 0.0036, -0.0015]])
action_probs tensor([[0.0140, 0.9860]], grad_fn=)
action 1
next_state [ 0.26101191 0.34268128 0.00354201 -0.29305718]
reward 1.0
score 123
s tensor([[ 0.2610, 0.3427, 0.0035, -0.2931]])
action_probs tensor([[0.9973, 0.0027]], grad_fn=)
action 0
next_state [ 0.26786554 0.14750901 -0.00231913 0.00074075]
reward 1.0
score 124
s tensor([[ 0.2679, 0.1475, -0.0023, 0.0007]])
action_probs tensor([[0.4485, 0.5515]], grad_fn=)
action 1
next_state [ 0.27081572 0.34266415 -0.00230432 -0.29267299]
reward 1.0
score 125
s tensor([[ 0.2708, 0.3427, -0.0023, -0.2927]])
action_probs tensor([[0.9976, 0.0024]], grad_fn=)
action 0
next_state [ 0.277669 0.14757513 -0.00815778 -0.00071771]
reward 1.0
score 126
s tensor([[ 0.2777, 0.1476, -0.0082, -0.0007]])
action_probs tensor([[0.9848, 0.0152]], grad_fn=)
action 0
next_state [ 0.2806205 -0.04742889 -0.00817213 0.28938022]
reward 1.0
score 127
s tensor([[ 0.2806, -0.0474, -0.0082, 0.2894]])
action_probs tensor([[0.0056, 0.9944]], grad_fn=)
action 1
next_state [ 0.27967193 0.14780864 -0.00238453 -0.00586887]
reward 1.0
score 128
s tensor([[ 0.2797, 0.1478, -0.0024, -0.0059]])
action_probs tensor([[0.8846, 0.1154]], grad_fn=)
action 0
next_state [ 0.2826281 -0.04727904 -0.00250191 0.28606075]
reward 1.0
score 129
s tensor([[ 0.2826, -0.0473, -0.0025, 0.2861]])
action_probs tensor([[0.0019, 0.9981]], grad_fn=)
action 1
next_state [ 0.28168252 0.14787851 0.00321931 -0.0074102 ]
reward 1.0
score 130
s tensor([[ 0.2817, 0.1479, 0.0032, -0.0074]])
action_probs tensor([[0.0140, 0.9860]], grad_fn=)
action 1
next_state [ 0.28464009 0.34295414 0.0030711 -0.29907566]
reward 1.0
score 131
s tensor([[ 0.2846, 0.3430, 0.0031, -0.2991]])
action_probs tensor([[0.9974, 0.0026]], grad_fn=)
action 0
next_state [ 0.29149917 0.14778855 -0.00291041 -0.00542575]
reward 1.0
score 132
s tensor([[ 0.2915, 0.1478, -0.0029, -0.0054]])
action_probs tensor([[0.8473, 0.1527]], grad_fn=)
action 0
next_state [ 0.29445494 -0.04729154 -0.00301892 0.28633749]
reward 1.0
score 133
s tensor([[ 0.2945, -0.0473, -0.0030, 0.2863]])
action_probs tensor([[0.0015, 0.9985]], grad_fn=)
action 1
next_state [ 0.29350911 0.14787333 0.00270783 -0.00729605]
reward 1.0
score 134
s tensor([[ 0.2935, 0.1479, 0.0027, -0.0073]])
action_probs tensor([[0.0140, 0.9860]], grad_fn=)
action 1
next_state [ 0.29646658 0.34295635 0.00256191 -0.2991234 ]
reward 1.0
score 135
s tensor([[ 0.2965, 0.3430, 0.0026, -0.2991]])
action_probs tensor([[0.9974, 0.0026]], grad_fn=)
action 0
next_state [ 0.3033257 0.14779797 -0.00342056 -0.0056336 ]
reward 1.0
score 136
s tensor([[ 0.3033, 0.1478, -0.0034, -0.0056]])
action_probs tensor([[0.8423, 0.1577]], grad_fn=)
action 0
next_state [ 0.30628166 -0.04727476 -0.00353323 0.28596814]
reward 1.0
score 137
s tensor([[ 0.3063, -0.0473, -0.0035, 0.2860]])
action_probs tensor([[0.0013, 0.9987]], grad_fn=)
action 1
next_state [ 0.30533617 0.1478974 0.00218613 -0.00782705]
reward 1.0
score 138
s tensor([[ 0.3053, 0.1479, 0.0022, -0.0078]])
action_probs tensor([[0.0140, 0.9860]], grad_fn=)
action 1
next_state [ 0.30829412 0.34298794 0.00202959 -0.29981942]
reward 1.0
score 139
s tensor([[ 0.3083, 0.3430, 0.0020, -0.2998]])
action_probs tensor([[0.9975, 0.0025]], grad_fn=)
action 0
next_state [ 0.31515388 0.14783711 -0.0039668 -0.0064971 ]
reward 1.0
score 140
s tensor([[ 0.3152, 0.1478, -0.0040, -0.0065]])
action_probs tensor([[0.8819, 0.1181]], grad_fn=)
action 0
next_state [ 0.31811062 -0.04722772 -0.00409674 0.28493162]
reward 1.0
score 141
s tensor([[ 0.3181, -0.0472, -0.0041, 0.2849]])
action_probs tensor([[0.0012, 0.9988]], grad_fn=)
action 1
next_state [ 0.31716606 0.14795242 0.00160189 -0.00904057]
reward 1.0
score 142
s tensor([[ 0.3172, 0.1480, 0.0016, -0.0090]])
action_probs tensor([[0.0140, 0.9860]], grad_fn=)
action 1
next_state [ 0.32012511 0.34305136 0.00142108 -0.30121765]
reward 1.0
score 143
s tensor([[ 0.3201, 0.3431, 0.0014, -0.3012]])
action_probs tensor([[0.9976, 0.0024]], grad_fn=)
action 0
next_state [ 0.32698614 0.14790918 -0.00460328 -0.00808688]
reward 1.0
score 144
s tensor([[ 0.3270, 0.1479, -0.0046, -0.0081]])
action_probs tensor([[0.9449, 0.0551]], grad_fn=)
action 0
next_state [ 0.32994432 -0.04714645 -0.00476501 0.28314011]
reward 1.0
score 145
s tensor([[ 0.3299, -0.0471, -0.0048, 0.2831]])
action_probs tensor([[0.0011, 0.9989]], grad_fn=)
action 1
next_state [ 0.32900139 0.14804314 0.00089779 -0.01104186]
reward 1.0
score 146
s tensor([[ 0.3290, 0.1480, 0.0009, -0.0110]])
action_probs tensor([[0.0140, 0.9860]], grad_fn=)
action 1
next_state [ 0.33196226 0.3431522 0.00067695 -0.30344139]
reward 1.0
score 147
s tensor([[ 0.3320, 0.3432, 0.0007, -0.3034]])
action_probs tensor([[0.9976, 0.0024]], grad_fn=)
action 0
next_state [ 0.3388253 0.14802061 -0.00539188 -0.01054505]
reward 1.0
score 148
s tensor([[ 0.3388, 0.1480, -0.0054, -0.0105]])
action_probs tensor([[0.9817, 0.0183]], grad_fn=)
action 0
next_state [ 0.34178571 -0.0470236 -0.00560278 0.28043181]
reward 1.0
score 149
s tensor([[ 0.3418, -0.0470, -0.0056, 0.2804]])
action_probs tensor([[0.0011, 0.9989]], grad_fn=)
action 1
next_state [ 3.40845241e-01 1.48177823e-01 5.85915699e-06 -1.40129348e-02]
reward 1.0
score 150
s tensor([[ 3.4085e-01, 1.4818e-01, 5.8592e-06, -1.4013e-02]])
action_probs tensor([[0.0157, 0.9843]], grad_fn=)
action 1
next_state [ 3.43808797e-01 3.43299690e-01 -2.74399540e-04 -3.06694013e-01]
reward 1.0
score 151
s tensor([[ 3.4381e-01, 3.4330e-01, -2.7440e-04, -3.0669e-01]])
action_probs tensor([[0.9977, 0.0023]], grad_fn=)
action 0
next_state [ 0.35067479 0.14818165 -0.00640828 -0.01409764]
reward 1.0
score 152
s tensor([[ 0.3507, 0.1482, -0.0064, -0.0141]])
action_probs tensor([[0.9833, 0.0167]], grad_fn=)
action 0
next_state [ 0.35363842 -0.04684781 -0.00669023 0.27655653]
reward 1.0
score 153
s tensor([[ 0.3536, -0.0468, -0.0067, 0.2766]])
action_probs tensor([[0.0013, 0.9987]], grad_fn=)
action 1
next_state [ 0.35270147 0.14836894 -0.0011591 -0.01822897]
reward 1.0
score 154
s tensor([[ 0.3527, 0.1484, -0.0012, -0.0182]])
action_probs tensor([[0.4715, 0.5285]], grad_fn=)
action 1
next_state [ 0.35566885 0.3435075 -0.00152368 -0.31127738]
reward 1.0
score 155
s tensor([[ 0.3557, 0.3435, -0.0015, -0.3113]])
action_probs tensor([[0.9978, 0.0022]], grad_fn=)
action 0
next_state [ 0.362539 0.14840729 -0.00774923 -0.01907536]
reward 1.0
score 156
s tensor([[ 0.3625, 0.1484, -0.0077, -0.0191]])
action_probs tensor([[0.9853, 0.0147]], grad_fn=)
action 0
next_state [ 0.36550714 -0.04660268 -0.00813074 0.27115255]
reward 1.0
score 157
s tensor([[ 0.3655, -0.0466, -0.0081, 0.2712]])
action_probs tensor([[0.0016, 0.9984]], grad_fn=)
action 1
next_state [ 0.36457509 0.14863435 -0.00270768 -0.02408371]
reward 1.0
score 158
s tensor([[ 0.3646, 0.1486, -0.0027, -0.0241]])
action_probs tensor([[0.9817, 0.0183]], grad_fn=)
action 0
next_state [ 0.36754778 -0.04644867 -0.00318936 0.26774368]
reward 1.0
score 159
s tensor([[ 0.3675, -0.0464, -0.0032, 0.2677]])
action_probs tensor([[6.3704e-04, 9.9936e-01]], grad_fn=)
action 1
next_state [ 0.3666188 0.14871865 0.00216551 -0.02594349]
reward 1.0
score 160
s tensor([[ 0.3666, 0.1487, 0.0022, -0.0259]])
action_probs tensor([[0.0140, 0.9860]], grad_fn=)
action 1
next_state [ 0.36959318 0.34380948 0.00164664 -0.31794238]
reward 1.0
score 161
s tensor([[ 0.3696, 0.3438, 0.0016, -0.3179]])
action_probs tensor([[0.9977, 0.0023]], grad_fn=)
action 0
next_state [ 0.37646937 0.14866412 -0.0047122 -0.02474062]
reward 1.0
score 162
s tensor([[ 0.3765, 0.1487, -0.0047, -0.0247]])
action_probs tensor([[0.9834, 0.0166]], grad_fn=)
action 0
next_state [ 0.37944265 -0.04638994 -0.00520702 0.26645184]
reward 1.0
score 163
s tensor([[ 0.3794, -0.0464, -0.0052, 0.2665]])
action_probs tensor([[7.7874e-04, 9.9922e-01]], grad_fn=)
action 1
next_state [ 3.78514849e-01 1.48805936e-01 1.22021325e-04 -2.78688660e-02]
reward 1.0
score 164
s tensor([[ 3.7851e-01, 1.4881e-01, 1.2202e-04, -2.7869e-02]])
action_probs tensor([[0.5618, 0.4382]], grad_fn=)
action 0
next_state [ 0.38149097 -0.04631776 -0.00043536 0.26485256]
reward 1.0
score 165
s tensor([[ 0.3815, -0.0463, -0.0004, 0.2649]])
action_probs tensor([[2.8913e-04, 9.9971e-01]], grad_fn=)
action 1
next_state [ 0.38056461 0.1488104 0.0048617 -0.02796765]
reward 1.0
score 166
s tensor([[ 0.3806, 0.1488, 0.0049, -0.0280]])
action_probs tensor([[0.0140, 0.9860]], grad_fn=)
action 1
next_state [ 0.38354082 0.34386229 0.00430234 -0.3191127 ]
reward 1.0
score 167
s tensor([[ 0.3835, 0.3439, 0.0043, -0.3191]])
action_probs tensor([[0.9976, 0.0024]], grad_fn=)
action 0
next_state [ 0.39041807 0.14867933 -0.00207991 -0.02507609]
reward 1.0
score 168
s tensor([[ 0.3904, 0.1487, -0.0021, -0.0251]])
action_probs tensor([[0.8262, 0.1738]], grad_fn=)
action 0
next_state [ 0.39339165 -0.04641273 -0.00258143 0.26694988]
reward 1.0
score 169
s tensor([[ 0.3934, -0.0464, -0.0026, 0.2669]])
action_probs tensor([[3.1021e-04, 9.9969e-01]], grad_fn=)
action 1
next_state [ 0.3924634 0.14874597 0.00275756 -0.02654614]
reward 1.0
score 170
s tensor([[ 0.3925, 0.1487, 0.0028, -0.0265]])
action_probs tensor([[0.0140, 0.9860]], grad_fn=)
action 1
next_state [ 0.39543832 0.34382827 0.00222664 -0.31835775]
reward 1.0
score 171
s tensor([[ 0.3954, 0.3438, 0.0022, -0.3184]])
action_probs tensor([[0.9977, 0.0023]], grad_fn=)
action 0
next_state [ 0.40231488 0.14867467 -0.00414051 -0.02497345]
reward 1.0
score 172
s tensor([[ 0.4023, 0.1487, -0.0041, -0.0250]])
action_probs tensor([[0.9806, 0.0194]], grad_fn=)
action 0
next_state [ 0.40528838 -0.04638766 -0.00463998 0.26640023]
reward 1.0
score 173
s tensor([[ 0.4053, -0.0464, -0.0046, 0.2664]])
action_probs tensor([[3.4814e-04, 9.9965e-01]], grad_fn=)
action 1
next_state [ 0.40436062 0.14880021 0.00068802 -0.02774257]
reward 1.0
score 174
s tensor([[ 0.4044, 0.1488, 0.0007, -0.0277]])
action_probs tensor([[0.0174, 0.9826]], grad_fn=)
action 1
next_state [ 4.07336627e-01 3.43912287e-01 1.33170325e-04 -3.20208338e-01]
reward 1.0
score 175
s tensor([[ 4.0734e-01, 3.4391e-01, 1.3317e-04, -3.2021e-01]])
action_probs tensor([[0.9979, 0.0021]], grad_fn=)
action 0
next_state [ 0.41421487 0.14878844 -0.006271 -0.02748342]
reward 1.0
score 176
s tensor([[ 0.4142, 0.1488, -0.0063, -0.0275]])
action_probs tensor([[0.9838, 0.0162]], grad_fn=)
action 0
next_state [ 0.41719064 -0.04624302 -0.00682066 0.26321436]
reward 1.0
score 177
s tensor([[ 0.4172, -0.0462, -0.0068, 0.2632]])
action_probs tensor([[4.5204e-04, 9.9955e-01]], grad_fn=)
action 1
next_state [ 0.41626578 0.14897562 -0.00155638 -0.03161205]
reward 1.0
score 178
s tensor([[ 0.4163, 0.1490, -0.0016, -0.0316]])
action_probs tensor([[0.8434, 0.1566]], grad_fn=)
action 0
next_state [ 0.41924529 -0.04612398 -0.00218862 0.26057942]
reward 1.0
score 179
s tensor([[ 0.4192, -0.0461, -0.0022, 0.2606]])
action_probs tensor([[2.0539e-04, 9.9979e-01]], grad_fn=)
action 1
next_state [ 0.41832281 0.14902915 0.00302297 -0.03279302]
reward 1.0
score 180
s tensor([[ 0.4183, 0.1490, 0.0030, -0.0328]])
action_probs tensor([[0.0140, 0.9860]], grad_fn=)
action 1
next_state [ 0.4213034 0.34410762 0.00236711 -0.32452064]
reward 1.0
score 181
s tensor([[ 0.4213, 0.3441, 0.0024, -0.3245]])
action_probs tensor([[0.9978, 0.0022]], grad_fn=)
action 0
next_state [ 0.42818555 0.14895204 -0.0041233 -0.03109217]
reward 1.0
score 182
s tensor([[ 0.4282, 0.1490, -0.0041, -0.0311]])
action_probs tensor([[0.9820, 0.0180]], grad_fn=)
action 0
next_state [ 0.43116459 -0.04611054 -0.00474515 0.26028697]
reward 1.0
score 183
s tensor([[ 0.4312, -0.0461, -0.0047, 0.2603]])
action_probs tensor([[2.6307e-04, 9.9974e-01]], grad_fn=)
action 1
next_state [ 0.43024238 0.14907883 0.00046059 -0.03388885]
reward 1.0
score 184
s tensor([[ 0.4302, 0.1491, 0.0005, -0.0339]])
action_probs tensor([[0.1359, 0.8641]], grad_fn=)
action 1
next_state [ 4.33223956e-01 3.44194173e-01 -2.17184329e-04 -3.26426424e-01]
reward 1.0
score 185
s tensor([[ 4.3322e-01, 3.4419e-01, -2.1718e-04, -3.2643e-01]])
action_probs tensor([[0.9979, 0.0021]], grad_fn=)
action 0
next_state [ 0.44010784 0.14907531 -0.00674571 -0.033812 ]
reward 1.0
score 186
s tensor([[ 0.4401, 0.1491, -0.0067, -0.0338]])
action_probs tensor([[0.9848, 0.0152]], grad_fn=)
action 0
next_state [ 0.44308935 -0.04594925 -0.00742195 0.25673498]
reward 1.0
score 187
s tensor([[ 0.4431, -0.0459, -0.0074, 0.2567]])
action_probs tensor([[4.1067e-04, 9.9959e-01]], grad_fn=)
action 1
next_state [ 0.44217036 0.14927787 -0.00228725 -0.03827968]
reward 1.0
score 188
s tensor([[ 0.4422, 0.1493, -0.0023, -0.0383]])
action_probs tensor([[0.9808, 0.0192]], grad_fn=)
action 0
next_state [ 0.44515592 -0.04581121 -0.00305285 0.25368072]
reward 1.0
score 189
s tensor([[ 0.4452, -0.0458, -0.0031, 0.2537]])
action_probs tensor([[2.0283e-04, 9.9980e-01]], grad_fn=)
action 1
next_state [ 0.44423969 0.1493542 0.00202077 -0.03996356]
reward 1.0
score 190
s tensor([[ 0.4442, 0.1494, 0.0020, -0.0400]])
action_probs tensor([[0.0265, 0.9735]], grad_fn=)
action 1
next_state [ 0.44722678 0.34444712 0.0012215 -0.33200823]
reward 1.0
score 191
s tensor([[ 0.4472, 0.3444, 0.0012, -0.3320]])
action_probs tensor([[0.9979, 0.0021]], grad_fn=)
action 0
next_state [ 0.45411572 0.1493078 -0.00541867 -0.03894036]
reward 1.0
score 192
s tensor([[ 0.4541, 0.1493, -0.0054, -0.0389]])
action_probs tensor([[0.9843, 0.0157]], grad_fn=)
action 0
next_state [ 0.45710188 -0.04573603 -0.00619748 0.25202801]
reward 1.0
score 193
s tensor([[ 0.4571, -0.0457, -0.0062, 0.2520]])
action_probs tensor([[3.1042e-04, 9.9969e-01]], grad_fn=)
action 1
next_state [ 0.45618716 0.14947387 -0.00115692 -0.04260326]
reward 1.0
score 194
s tensor([[ 0.4562, 0.1495, -0.0012, -0.0426]])
action_probs tensor([[0.9516, 0.0484]], grad_fn=)
action 0
next_state [ 0.45917663 -0.04563147 -0.00200898 0.24971443]
reward 1.0
score 195
s tensor([[ 0.4592, -0.0456, -0.0020, 0.2497]])
action_probs tensor([[1.5721e-04, 9.9984e-01]], grad_fn=)
action 1
next_state [ 0.458264 0.14951911 0.00298531 -0.0436015 ]
reward 1.0
score 196
s tensor([[ 0.4583, 0.1495, 0.0030, -0.0436]])
action_probs tensor([[0.0140, 0.9860]], grad_fn=)
action 1
next_state [ 0.46125439 0.34459812 0.00211328 -0.33534104]
reward 1.0
score 197
s tensor([[ 0.4613, 0.3446, 0.0021, -0.3353]])
action_probs tensor([[0.9979, 0.0021]], grad_fn=)
action 0
next_state [ 0.46814635 0.14944616 -0.00459354 -0.04199244]
reward 1.0
score 198
s tensor([[ 0.4681, 0.1494, -0.0046, -0.0420]])
action_probs tensor([[0.9837, 0.0163]], grad_fn=)
action 0
next_state [ 0.47113527 -0.04560962 -0.00543339 0.24923765]
reward 1.0
score 199
s tensor([[ 0.4711, -0.0456, -0.0054, 0.2492]])
action_probs tensor([[2.3249e-04, 9.9977e-01]], grad_fn=)
action 1
next_state [ 4.70223079e-01 1.49589502e-01 -4.48638357e-04 -4.51541243e-02]
reward 1.0
score 200
s tensor([[ 4.7022e-01, 1.4959e-01, -4.4864e-04, -4.5154e-02]])
action_probs tensor([[0.8600, 0.1400]], grad_fn=)
action 0
next_state [ 0.47321487 -0.04552601 -0.00135172 0.24738722]
reward 1.0
200

a2c.py
#Not trained
untrained_model = ActorCritic()
test_model(untrained_model)

Result
score 1
s tensor([[ 0.0294, -0.0329, -0.0324, 0.0198]])
action_probs tensor([[0.5035, 0.4965]], grad_fn=)
action 0
next_state [ 0.02878646 -0.22759263 -0.03198369 0.30206728]
reward 1.0
score 2
s tensor([[ 0.0288, -0.2276, -0.0320, 0.3021]])
action_probs tensor([[0.5025, 0.4975]], grad_fn=)
action 0
next_state [ 0.02423461 -0.42224449 -0.02594234 0.58449414]
reward 1.0
score 3
s tensor([[ 0.0242, -0.4222, -0.0259, 0.5845]])
action_probs tensor([[0.5021, 0.4979]], grad_fn=)
action 0
next_state [ 0.01578972 -0.61699361 -0.01425246 0.86889333]
reward 1.0
score 4
s tensor([[ 0.0158, -0.6170, -0.0143, 0.8689]])
action_probs tensor([[0.5012, 0.4988]], grad_fn=)
action 0
next_state [ 0.00344985 -0.81191879 0.00312541 1.15706133]
reward 1.0
score 5
s tensor([[ 0.0034, -0.8119, 0.0031, 1.1571]])
action_probs tensor([[0.5009, 0.4991]], grad_fn=)
action 0
next_state [-0.01278853 -1.00708134 0.02626664 1.45072259]
reward 1.0
score 6
s tensor([[-0.0128, -1.0071, 0.0263, 1.4507]])
action_probs tensor([[0.5015, 0.4985]], grad_fn=)
action 0
next_state [-0.03293016 -1.20251603 0.05528109 1.75149501]
reward 1.0
score 7
s tensor([[-0.0329, -1.2025, 0.0553, 1.7515]])
action_probs tensor([[0.5020, 0.4980]], grad_fn=)
action 0
next_state [-0.05698048 -1.3982201 0.09031099 2.06084703]
reward 1.0
score 8
s tensor([[-0.0570, -1.3982, 0.0903, 2.0608]])
action_probs tensor([[0.5023, 0.4977]], grad_fn=)
action 0
next_state [-0.08494488 -1.59413991 0.13152793 2.38004446]
reward 1.0
score 9
s tensor([[-0.0849, -1.5941, 0.1315, 2.3800]])
action_probs tensor([[0.5024, 0.4976]], grad_fn=)
action 0
next_state [-0.11682768 -1.790155 0.17912882 2.71008534]
reward 1.0
score 10
s tensor([[-0.1168, -1.7902, 0.1791, 2.7101]])
action_probs tensor([[0.5025, 0.4975]], grad_fn=)
action 0
next_state [-0.15263078 -1.98605958 0.23333053 3.05162298]
reward 1.0
10

Explanation
A2C receives state inputs - sensory inputs in the case of Klyukovka, the fox in this story - and produces two outputs:


1) An estimate of how much reward will be received from the current state onward, not counting the reward already in hand: the state value V(S).



2) A recommendation on what action to take (policy).
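
In the ActorCritic class defined above, output 1) comes from the critic head and output 2) from the actor head. A minimal sketch of reading both from one forward pass (the observation values are made up, and it assumes the `model` from the code above is in scope):

import torch

s = torch.tensor([[0.02, -0.03, 0.04, -0.01]])   # made-up CartPole observation
action_probs, state_value = model.evaluate_actions(s)
print(state_value)    # 1) estimated future reward from this state
print(action_probs)   # 2) recommended action distribution (the policy)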



Critic: Wow, what a wonderful valley! It will be a fruitful day for foraging! I bet I'll collect 20 points before sunset.

Actor: These flowers look beautiful; I feel a craving for action “A”.





Deep RL models are input-output mapping machines, just like any other classification or regression model. Instead of categorizing images or text, deep RL models map states to actions and/or states to state values. A2C does both.



This state-action-reward triple makes up one observation. She writes this row of data in her journal, but she is not going to think about it yet. She will reflect on it when she stops to think.



Some authors associate reward 1 with time step 1, others with time step 2, but they all mean the same thing: the reward is tied to the state, and the action immediately precedes it.
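
In the training loop above this convention is just four parallel lists that are appended to in lockstep; row i of each list describes the same time step. A made-up example row:

# One observation, as stored by the training loop above (values are made up):
# states[i], actions[i], rewards[i], dones[i] all describe the same time step i.
states  = [[-0.02, 0.03, 0.01, -0.04]]   # CartPole sensor reading at step i
actions = [1]                            # action taken in that state
rewards = [1.0]                          # reward received for that step
dones   = [False]                        # whether the episode ended there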





Klyukovka repeats the process. First she perceives her surroundings and produces a value estimate V(S) and an action recommendation.



Critic: This valley looks pretty ordinary. V(S) = 19.
Actor: The action options look very similar. I think I'll just take path “C”.



Then she acts.





She receives a reward of +20! And records the observation.



She repeats the process again.







After collecting three observations, Klyukovka stops to think.



Other families of models wait until the very end of the day to reflect (Monte Carlo methods), while others reflect after every single step (one-step methods). The code above sits in between, reflecting every N_STEPS = 20 steps.

Before she can tune her internal critic, Klyukovka needs to calculate how many points she will actually receive from each given state.





But first!

Let's look at how Klyukovka's cousin, the Monte Carlo fox, calculates the true value of each state.



Monte Carlo models do not reflect on their experience until the end of the episode. Since the value of the final state is zero, the true value of each earlier state is easy to find: it is simply the sum of the rewards received after that point.
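
A minimal sketch of that rule (gamma matches the GAMMA used in the code above; setting gamma=1.0 gives the plain undiscounted sum from the story):

def monte_carlo_returns(rewards, gamma=0.95):
    # Walk backwards from the terminal state, whose value is zero,
    # and accumulate the (discounted) sum of later rewards.
    returns, running = [], 0.0
    for r in reversed(rewards):
        running = r + gamma * running
        returns.append(running)
    return list(reversed(returns))

print(monte_carlo_returns([1.0, 1.0, 1.0]))   # [2.8525, 1.95, 1.0]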







In fact, this is just a high-variance sample of V(S). The agent could easily have followed a different trajectory from the same state and received a different total reward.





But Klyukovka walks, stops, and reflects many times before the day comes to an end. She wants to know how many points she will really get from each state until the end of the game, and there are still several hours of game left.



This is where she does something really clever: Klyukovka estimates how many points she will receive from the last state in the set. Fortunately, she has exactly the right tool for valuing a state - her critic.
With this estimate, Klyukovka can calculate the “true” values of the preceding states exactly as the Monte Carlo fox does.



The Monte Carlo fox estimates the target labels by rolling out the whole trajectory and summing the rewards forward from each state. A2C truncates that trajectory and replaces the tail with its critic's estimate. This bootstrapping reduces the variance of the estimate and lets A2C run continuously, at the cost of introducing a small bias.
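
This is the idea behind calc_actual_state_values above. A simplified, textbook-style sketch of the same backup (not line-for-line identical to the code above; the helper name is mine):

def nstep_targets(rewards, dones, bootstrap_value, gamma=0.95):
    # bootstrap_value: the critic's V(s) for the first state *after* this rollout;
    # it stands in for all the future rewards that truncation cut off.
    next_return = bootstrap_value
    targets = []
    for r, done in zip(reversed(rewards), reversed(dones)):
        next_return = r if done else r + gamma * next_return
        targets.append(next_return)
    return list(reversed(targets))

# e.g. nstep_targets([1.0, 1.0, 1.0], [False, False, False], bootstrap_value=5.0)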







Rewards are often discounted to reflect the fact that a reward now is worth more than the same reward later. For simplicity, Klyukovka does not discount her rewards (the code above does, with GAMMA = .95).







Klyukovka has collected valuable experience that is worth reflecting on.



“I valued states 1 and 2 poorly. What did I get wrong? Aha! The next time I see feathers like those, I will increase V(S).”



It may seem crazy that Klyukovka can use her own V(S) estimate as the baseline against which she compares her other predictions. But animals (including us) do this all the time! If you feel that things are going well, you do not need to relearn the actions that brought you into this state.





By truncating our rollouts and replacing the tail with a bootstrapped estimate, we trade the large Monte Carlo variance for a small bias. RL models typically suffer from high variance (there are so many possible trajectories), so this trade is usually worth it.





Klyukovka repeats this process all day, collecting sets of three state-action-reward observations and reflecting on them.





Each set of three observations is a small, highly autocorrelated series of labeled training data. To reduce this autocorrelation, many A2C implementations train many agents in parallel, batching their experience together before sending it to a shared neural network.
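
The code in this article uses a single environment, but a parallel version could look roughly like the sketch below (the variable names are mine, not from the original):

import gym

# Step several CartPole environments side by side and batch their transitions,
# which de-correlates the data fed to the shared network.
envs = [gym.make('CartPole-v0') for _ in range(8)]
states = [env.reset() for env in envs]

batch_states, batch_actions, batch_rewards, batch_dones = [], [], [], []
for i, env in enumerate(envs):
    action = env.action_space.sample()            # the real agent would query the policy here
    next_state, reward, done, _ = env.step(action)
    batch_states.append(states[i]); batch_actions.append(action)
    batch_rewards.append(reward);   batch_dones.append(done)
    states[i] = env.reset() if done else next_state

# reflect(batch_states, batch_actions, batch_rewards, batch_dones) would then
# update one shared network with this less-correlated batch.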





The day is finally coming to an end. Only two steps left.
As we said earlier, Klyukovka's action recommendations are expressed as percentage confidences over her options. Instead of simply picking the most confident option, Klyukovka samples from this distribution of actions. This ensures that she does not always settle for safe but potentially mediocre actions.
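
This is exactly the difference between the training loop above, which samples with multinomial, and test_model, which greedily takes the most probable action with max. A tiny sketch with a made-up distribution:

import torch

probs = torch.tensor([[0.7, 0.3]])             # made-up action distribution

greedy_action  = probs.max(1)[1].item()        # always 0: safe, but never explores
sampled_action = probs.multinomial(1).item()   # 0 about 70% of the time, 1 about 30%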





I might regret this, but... sometimes exploring the unknown leads to exciting new discoveries...





To further encourage exploration, a value called entropy is subtracted from the loss function. Entropy measures the “spread” of the action distribution.
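
For a discrete policy, entropy is H = -Σ p·log p: a wide distribution has high entropy, a confident one has low entropy. (In reflect() above the entropy variable is sum(p·log p), i.e. the negative of H.) A small numeric check:

import torch

def entropy(p):
    return -(p * p.log()).sum(1)    # H = -sum(p * log p)

print(entropy(torch.tensor([[0.50, 0.50]])))   # ~0.693: widest possible 2-action spread
print(entropy(torch.tensor([[0.99, 0.01]])))   # ~0.056: very confident, little exploration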

- It looks like the gamble paid off!



Or not?



Sometimes the agent is in a state where all actions lead to negative outcomes. A2C, however, copes well with bad situations.





When the sun went down, Klyukovka reflected on the last set of decisions.





We have talked about how Klyukovka tunes her inner critic. But how does she fine-tune her inner “actor”? How does she learn to make such exquisite choices?





A simple-minded policy-gradient fox would look at the actual return after the action and adjust its policy to make good returns more likely: - It seems my policy in this state led to a loss of 20 points, so in the future I should make “C” less likely.



- But wait! It is unfair to blame action “C”. This state had an estimated value of -100, so choosing “C” and ending up at -20 was actually a relative improvement of 80! I should make “C” more likely in the future.



Instead of adjusting her policy in response to the total return she received after selecting action C, she adjusts it in response to the relative return of action C. This is called the “advantage”.





What we call the advantage is simply the error. As an advantage, Klyukovka uses it to make actions that turned out surprisingly good more likely. As an error, she uses the very same quantity to push her internal critic to improve its estimate of the state value.
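
In reflect() above this is the single line advantages = state_values_true - state_values_est, and the same number drives both heads. With the made-up numbers from the story:

predicted_value = -100.0    # the critic's estimate of this state
actual_return   = -20.0     # what the rollout target turned out to be

advantage = actual_return - predicted_value    # +80: action "C" was surprisingly good
# Actor:  scale log pi("C"|s) by +80, making "C" more likely in this state.
# Critic: the same +80 is the error that pushes V(s) up towards -20.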



The actor uses the advantage:

“Wow, that worked better than I thought; action C must be a good idea.” The critic uses the error:

“But why was I surprised? I probably shouldn't have valued this state so negatively.”



Now we can show how the total loss is calculated - we minimize this function to improve our model:

“Total loss = action loss + value loss - entropy”
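
Written out with the variable names used in reflect() above (note that entropy there is sum(p·log p), the negative of the usual H), a self-contained toy version of that loss with made-up numbers:

import torch

action_probs  = torch.tensor([[0.7, 0.3]], requires_grad=True)   # made-up policy output
state_values  = torch.tensor([[19.0]], requires_grad=True)       # made-up critic output
target_values = torch.tensor([[20.0]])                           # made-up n-step target
chosen_logprob = action_probs.log()[:, 1:2]                      # log-prob of the action taken

advantages  = target_values - state_values
action_gain = (chosen_logprob * advantages).mean()               # "action loss" is -action_gain
value_loss  = advantages.pow(2).mean()                           # "value loss"
entropy     = (action_probs * action_probs.log()).sum(1).mean()
total_loss  = value_loss - action_gain - 0.0001 * entropy
total_loss.backward()     # one backward pass produces gradients for both heads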



Note that we backpropagate three qualitatively different kinds of gradients through one network. This is efficient, but it can make convergence more difficult.





Like all animals, as Klyukovka grows older she will hone her ability to predict state values, grow more confident in her actions, and be surprised by rewards less often.





RL agents like Klyukovka not only generate all the data they need simply by interacting with the environment, they also estimate the target labels themselves. That's right: RL models update their old estimates to better match new, improved estimates.



As Dr. David Silver, head of the RL group at Google DeepMind, says: AI = RL + DL. When an agent like Klyukovka can sharpen her own intelligence, the possibilities are endless...



End.
