RLlib parameters, usage, etc.


Reference: https://dmv.nico/ja/articles/how_to_use_rllib/

QuickStart

The basic approach is to describe the settings in a tune.Experiment and run them with tune.run_experiments().

import random

import gym
from gym.spaces import Discrete, Box
import numpy as np

import ray
from ray import tune
from ray.tune import Experiment
from ray.rllib.env.env_context import EnvContext
from ray.rllib.models import ModelCatalog
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
from ray.rllib.models.torch.fcnet import FullyConnectedNetwork as TorchFC
from ray.rllib.utils.framework import try_import_torch

# torch and nn are needed by the custom model further below.
torch, nn = try_import_torch()

if __name__ == "__main__":
    ray.init()

    # https://docs.ray.io/en/latest/rllib-training.html#specifying-resources
    experiment_spec = Experiment(
        "my_experiment_name",
        run="DQN",
        stop={
            "training_iteration": 100,
            #"timesteps_total": 100000,
            #"episode_reward_mean": 10,
        },
        config={
            "env": "PongDeterministic-v4",
            "env_config": {},
            "framework": "torch",
        },
        local_dir="~/ray_results",
        checkpoint_freq=10,
        max_failures=2
    )

    # automated run with Tune; results can be inspected with TensorBoard
    print("Training automatically with Ray Tune")
    tune.run_experiments(experiment_spec)
    ray.shutdown()
    print("Finished")

Customizing the Env and the Model

class SimpleCorridor(gym.Env):
    """Example of a custom env in which you have to walk down a corridor.
    You can configure the length of the corridor via the env config."""

    def __init__(self, config: EnvContext):
        self.end_pos = config["corridor_length"]
        self.cur_pos = 0
        self.action_space = Discrete(2)
        self.observation_space = Box(
            0.0, self.end_pos, shape=(1, ), dtype=np.float32)
        # Set the seed. This is only used for the final (reach goal) reward.
        self.seed(config.worker_index * config.num_workers)

    def reset(self):
        self.cur_pos = 0
        return np.array([self.cur_pos], dtype=np.float32)

    def step(self, action):
        assert action in [0, 1], action
        if action == 0 and self.cur_pos > 0:
            self.cur_pos -= 1
        elif action == 1:
            self.cur_pos += 1
        done = self.cur_pos >= self.end_pos
        # Produce a random reward when we reach the goal.
        return np.array([self.cur_pos], dtype=np.float32), \
            random.random() * 2 if done else -0.1, done, {}

    def seed(self, seed=None):
        random.seed(seed)


class TorchCustomModel(TorchModelV2, nn.Module):
    """Example of a PyTorch custom model that just delegates to a fc-net."""

    def __init__(self, obs_space, action_space, num_outputs, model_config,
                 name):
        TorchModelV2.__init__(self, obs_space, action_space, num_outputs,
                              model_config, name)
        nn.Module.__init__(self)
        print("obsspace: ", obs_space, action_space, model_config)
        self.torch_sub_model = TorchFC(obs_space, action_space, num_outputs,
                                       model_config, name)

    def forward(self, input_dict, state, seq_lens):
        input_dict["obs"] = input_dict["obs"].float()
        fc_out, _ = self.torch_sub_model(input_dict, state, seq_lens)
        return fc_out, []

    def value_function(self):
        return torch.reshape(self.torch_sub_model.value_function(), [-1])

if __name__ == "__main__":
    ray.init()
    ModelCatalog.register_custom_model(
        "my_model", TorchCustomModel
    )
    # https://docs.ray.io/en/latest/rllib-training.html#specifying-resources
    experiment_spec = Experiment(
        "my_experiment_name",
        run="DQN",
        stop={
            "training_iteration": 100,
            #"timesteps_total": 100000,
            #"episode_reward_mean": 10,
        },
        config={
            "env": SimpleCorridor,
            "env_config": {
                "corridor_length": 5,
            },
            "model": {
                "custom_model": "my_model",
            },
            "framework": "torch",
        },
        local_dir="~/ray_results",
        checkpoint_freq=10,
        max_failures=2
    )

    # automated run with Tune; results can be inspected with TensorBoard
    print("Training automatically with Ray Tune")
    tune.run_experiments(experiment_spec)
    ray.shutdown()
    print("Finished")

DQN parameters and how to set up Rainbow

Detailed descriptions of each parameter are at https://docs.ray.io/en/latest/rllib-algorithms.html#dqn

DEFAULT_CONFIG = Trainer.merge_trainer_configs(
    SIMPLEQ_DEFAULT_CONFIG,
    {
        # === Model ===
        # Number of atoms for representing the distribution of return. When
        # this is greater than 1, distributional Q-learning is used.
        # the discrete supports are bounded by v_min and v_max
        "num_atoms": 1,
        "v_min": -10.0,
        "v_max": 10.0,
        # Whether to use noisy network
        "noisy": False,
        # control the initial value of noisy nets
        "sigma0": 0.5,
        # Whether to use dueling dqn
        "dueling": True,
        # Dense-layer setup for each the advantage branch and the value branch
        # in a dueling architecture.
        "hiddens": [256],
        # Whether to use double dqn
        "double_q": True,
        # N-step Q learning
        "n_step": 1,

        # === Prioritized replay buffer ===
        # If True prioritized replay buffer will be used.
        "prioritized_replay": True,
        # Alpha parameter for prioritized replay buffer.
        "prioritized_replay_alpha": 0.6,
        # Beta parameter for sampling from prioritized replay buffer.
        "prioritized_replay_beta": 0.4,
        # Final value of beta (by default, we use constant beta=0.4).
        "final_prioritized_replay_beta": 0.4,
        # Time steps over which the beta parameter is annealed.
        "prioritized_replay_beta_annealing_timesteps": 20000,
        # Epsilon to add to the TD errors when updating priorities.
        "prioritized_replay_eps": 1e-6,

        # Callback to run before learning on a multi-agent batch of
        # experiences.
        "before_learn_on_batch": None,

        # The intensity with which to update the model (vs collecting samples
        # from the env). If None, uses the "natural" value of:
        # `train_batch_size` / (`rollout_fragment_length` x `num_workers` x
        # `num_envs_per_worker`).
        # If provided, will make sure that the ratio between ts inserted into
        # and sampled from the buffer matches the given value.
        # Example:
        #   training_intensity=1000.0
        #   train_batch_size=250 rollout_fragment_length=1
        #   num_workers=1 (or 0) num_envs_per_worker=1
        #   -> natural value = 250 / 1 = 250.0
        #   -> will make sure that replay+train op will be executed 4x as
        #      often as rollout+insert op (4 * 250 = 1000).
        # See: rllib/agents/dqn/dqn.py::calculate_rr_weights for further
        # details.
        "training_intensity": None,

        # === Parallelism ===
        # Whether to compute priorities on workers.
        "worker_side_prioritization": False,
    },
    _allow_unknown_configs=True,
)

Note that the dueling network and double Q-network are enabled by default.
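
For reference, switching both off gives a vanilla DQN. A minimal sketch, assuming the Ray 1.x tune.run API and the CartPole-v0 environment (neither is used in the article above):

import ray
from ray import tune

if __name__ == "__main__":
    ray.init()
    tune.run(
        "DQN",
        stop={"training_iteration": 10},
        config={
            "env": "CartPole-v0",
            "framework": "torch",
            # Both default to True; set them to False for a plain DQN.
            "dueling": False,
            "double_q": False,
        },
    )
    ray.shutdown()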

Rainbow

To get a Rainbow setup, the defaults need to be changed as follows (from the RLlib docs):

For a complete rainbow setup, make the following changes to the default DQN config: "n_step": [between 1 and 10], "noisy": True, "num_atoms": [more than 1], "v_min": -10.0, "v_max": 10.0 (set v_min and v_max according to your expected range of returns).

A complete config then looks like this:

atari-basic-dqn:
    env:
        grid_search:
            - BreakoutNoFrameskip-v4
            - BeamRiderNoFrameskip-v4
            - QbertNoFrameskip-v4
            - SpaceInvadersNoFrameskip-v4
    run: DQN
    config:
        framework: torch
        double_q: true
        dueling: true
        num_atoms: 51
        noisy: true
        gamma: 0.99
        lr: .0001
        hiddens: [512]
        learning_starts: 10000
        buffer_size: 50000
        rollout_fragment_length: 4
        train_batch_size: 32
        exploration_config:
          epsilon_timesteps: 2
          final_epsilon: 0.0
        target_network_update_freq: 500
        prioritized_replay: True
        prioritized_replay_alpha: 0.5
        final_prioritized_replay_beta: 1.0
        prioritized_replay_beta_annealing_timesteps: 400000
        n_step: 3
        num_gpus: 1  # ("gpu: True" is not a valid key in recent RLlib versions)
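
One way to launch this YAML spec from Python is to load it and hand the resulting dict to tune.run_experiments(), the same entry point used in the scripts above. A minimal sketch; the file name rainbow_atari.yaml is only an assumed example.

import yaml

import ray
from ray import tune

if __name__ == "__main__":
    ray.init()
    # The YAML above, saved e.g. as rainbow_atari.yaml (the file name is arbitrary).
    with open("rainbow_atari.yaml") as f:
        experiments = yaml.safe_load(f)  # {"atari-basic-dqn": {...}}
    # RLlib expects the env key inside "config"; the tuned-example YAML format
    # keeps it at the top level of the experiment, so move it over here.
    for spec in experiments.values():
        if "env" in spec:
            spec.setdefault("config", {})["env"] = spec.pop("env")
    tune.run_experiments(experiments)
    ray.shutdown()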