Multiprocessing: Vectorized Environments
import gym
from stable_baselines.common.vec_env import DummyVecEnv, SubprocVecEnv
from stable_baselines import A2C
def make_env(env_id, rank, seed=0):
    """
    Utility function for multiprocessed env.
    :param env_id: (str) the environment ID
    :param rank: (int) index of the subprocess
    :param seed: (int) the initial seed for RNG
    """
    def _init():
        env = gym.make(env_id)
        env.seed(seed + rank)
        return env
    return _init
n_cpu = 8
# SubprocVecEnv runs each environment in its own process
# (wrap this in an `if __name__ == '__main__':` guard on platforms that spawn processes)
env = SubprocVecEnv([make_env('CartPole-v1', i) for i in range(n_cpu)])
model = A2C('MlpPolicy', env, verbose=1).learn(int(5e5))
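A quick rollout sketch (illustrative only, not part of the original example): the trained model can be run directly on the vectorized env, where predict() returns one action per parallel environment.
obs = env.reset()
for _ in range(1000):
    # One action per parallel environment
    action, _states = model.predict(obs)
    obs, rewards, dones, infos = env.step(action)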
Custom Policy Network
import tensorflow as tf
from stable_baselines import PPO2, SAC, TD3
# Common policies (A2C family: A2C, PPO, TRPO, ACKTR, ACER)
# Custom MLP policy of two layers of size 32 each with tanh
# activation function
policy_kwargs = dict(act_fun=tf.nn.tanh, net_arch=[32, 32])
# Different architecture for actor/critic
# net_arch=[128, dict(vf=[256], pi=[16])]
model = PPO2('MlpPolicy', 'Pendulum-v0', policy_kwargs=policy_kwargs)
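The actor/critic architecture commented above can also be wired through a custom policy class. The sketch below assumes the FeedForwardPolicy subclassing route of Stable Baselines 2; the class name CustomPolicy and the layer sizes are arbitrary.
from stable_baselines.common.policies import FeedForwardPolicy

class CustomPolicy(FeedForwardPolicy):
    def __init__(self, *args, **kwargs):
        # Shared 128-unit layer, then separate value (256) and policy (16) heads
        super(CustomPolicy, self).__init__(*args, **kwargs,
                                           net_arch=[128, dict(vf=[256], pi=[16])],
                                           feature_extraction="mlp")

model = PPO2(CustomPolicy, 'Pendulum-v0')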
# Custom Architecture (DDPG, SAC, TD3)
model = TD3('MlpPolicy', 'MountainCarContinuous-v0',
            policy_kwargs=dict(layers=[400, 300]))
Monitoring Training: Monitor Wrapper (1/3)
import gym
from stable_baselines import PPO2
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines.bench import Monitor
env = Monitor(gym.make('CartPole-v1'), filename=None, allow_early_resets=True)
env = DummyVecEnv([lambda: env])
model = PPO2('MlpPolicy', env, verbose=1).learn(int(1e5))
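With filename=None the episode statistics stay in memory only. Below is a sketch of logging to disk instead and reading the results back; it assumes the load_results/ts2xy helpers from stable_baselines.results_plotter and uses an arbitrary log path.
import os
from stable_baselines.results_plotter import load_results, ts2xy

log_dir = '/tmp/cartpole/'  # arbitrary path for this sketch
os.makedirs(log_dir, exist_ok=True)
env = Monitor(gym.make('CartPole-v1'), log_dir, allow_early_resets=True)
env = DummyVecEnv([lambda: env])
model = PPO2('MlpPolicy', env, verbose=1).learn(int(1e5))
# Episode rewards vs timesteps, read back from the monitor.csv written in log_dir
x, y = ts2xy(load_results(log_dir), 'timesteps')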
Monitoring Training: Tensorboard (2/3)
from stable_baselines import SAC
model = SAC('MlpPolicy', 'LunarLanderContinuous-v2', verbose=1,
tensorboard_log='/tmp/sac/')
model.learn(int(1e5))
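The logs can then be viewed with TensorBoard pointed at that folder. As a further sketch, learn() also accepts tb_log_name and reset_num_timesteps to name runs and keep the step counter across successive calls; the run name below is arbitrary.
# In a shell: tensorboard --logdir /tmp/sac/
# Continue training under a named run, keeping the timestep counter
model.learn(int(1e5), tb_log_name='sac_run_1', reset_num_timesteps=False)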
Monitoring Training: Callback (3/3)
import numpy as np
from stable_baselines import SAC
def callback(locals_, globals_):
    self_ = locals_['self']
    # Check every 1000 calls
    if self_.n_callback_calls % 1000 == 0:
        # Save best model (according to training reward)
        if locals_.get('mean_reward', -np.inf) > self_.best_mean_reward:
            print("Saving best model")
            self_.save('sac_best')
            self_.best_mean_reward = locals_['mean_reward']
        # Stop training when target performance attained
        if self_.best_mean_reward > -800:
            print("Stopping training")
            return False
    self_.n_callback_calls += 1
    return True
model = SAC("MlpPolicy", "Pendulum-v0", verbose=1)
# Define attributes on the model to avoid global variables
model.best_mean_reward = -np.inf
model.n_callback_calls = 0
model.learn(100000, callback=callback)
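A small follow-up sketch (not in the original): reload the snapshot saved by the callback and run it deterministically on a fresh environment.
import gym

best_model = SAC.load('sac_best')
eval_env = gym.make('Pendulum-v0')
obs = eval_env.reset()
for _ in range(200):
    action, _ = best_model.predict(obs, deterministic=True)
    obs, reward, done, info = eval_env.step(action)
    if done:
        obs = eval_env.reset()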