David (aka HASy)
ENSTA Robotique
Racing Robot
Baxter
“Don't learn what you already know.”
"S-RL Toolbox: Environments, Datasets and Evaluation Metrics for SRL" (Raffin et al. 2018)
"Decoupling feature extraction from policy learning" (Raffin et al. 2018)
Ashley
Antonin
Adam
Maximilian
class LinearSchedule(Schedule):
    """
    Linear interpolation between initial_p and final_p over
    schedule_timesteps. After this many timesteps pass,
    final_p is returned.

    :param schedule_timesteps: (int) Number of timesteps for
        which to linearly anneal initial_p to final_p
    :param initial_p: (float) initial output value
    :param final_p: (float) final output value
    """
    def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
        self.schedule_timesteps = schedule_timesteps
        self.final_p = final_p
        self.initial_p = initial_p

    def value(self, step):
        # Fraction of the schedule elapsed, clipped to 1.0 after the end
        fraction = min(float(step) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)
import gym

from stable_baselines.common.vec_env import DummyVecEnv, SubprocVecEnv
from stable_baselines import A2C

def make_env(env_id, rank, seed=0):
    """
    Utility function for multiprocessed env.

    :param env_id: (str) the environment ID
    :param rank: (int) index of the subprocess
    :param seed: (int) the initial seed for RNG
    """
    def _init():
        env = gym.make(env_id)
        env.seed(seed + rank)
        return env
    return _init

if __name__ == '__main__':
    n_cpu = 8
    # SubprocVecEnv runs each env in its own process;
    # DummyVecEnv (sequential, same process) is a drop-in alternative
    # that is often faster for cheap environments like CartPole
    env = SubprocVecEnv([make_env('CartPole-v1', i) for i in range(n_cpu)])
    model = A2C('MlpPolicy', env, verbose=1).learn(int(5e5))
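Once training finishes, the policy can be run step by step in the same vectorized env (a short sketch; this continues inside the if __name__ == '__main__': block above):

    # Run the trained agent (a VecEnv auto-resets finished episodes)
    obs = env.reset()
    for _ in range(1000):
        action, _states = model.predict(obs, deterministic=True)
        obs, rewards, dones, infos = env.step(action)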
import tensorflow as tf

from stable_baselines import PPO2, SAC, TD3

# Common policies (A2C family: A2C, PPO, TRPO, ACKTR, ACER)
# Custom MLP policy: two layers of size 32 each with tanh
# activation function
policy_kwargs = dict(act_fun=tf.nn.tanh, net_arch=[32, 32])
# Different architecture for actor/critic:
# net_arch=[128, dict(vf=[256], pi=[16])]
model = PPO2('MlpPolicy', 'Pendulum-v0', policy_kwargs=policy_kwargs)

# Custom architecture (DDPG, SAC, TD3)
model = TD3('MlpPolicy', 'MountainCarContinuous-v0',
            policy_kwargs=dict(layers=[400, 300]))
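For full control over the architecture, a policy class can also be subclassed instead of passing policy_kwargs; a minimal sketch following the FeedForwardPolicy pattern (the CustomPolicy name is illustrative):

from stable_baselines import PPO2
from stable_baselines.common.policies import FeedForwardPolicy

# Shared 128-unit layer, then separate value (256) and policy (16) heads
class CustomPolicy(FeedForwardPolicy):
    def __init__(self, *args, **kwargs):
        super(CustomPolicy, self).__init__(*args, **kwargs,
                                           net_arch=[128, dict(vf=[256], pi=[16])],
                                           feature_extraction="mlp")

model = PPO2(CustomPolicy, 'Pendulum-v0')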
import gym
from stable_baselines import PPO2
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines.bench import Monitor
env = Monitor(gym.make('CartPole-v1'), filename=None, allow_early_resets=True)
env = DummyVecEnv([lambda: env])
model = PPO2('MlpPolicy', env, verbose=1).learn(int(1e5))
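Because filename=None, the episode statistics stay in memory only; they can be read back from the wrapper after training (a sketch, assuming the Monitor getters from stable_baselines.bench):

# env.envs[0] is the underlying Monitor-wrapped environment
episode_rewards = env.envs[0].get_episode_rewards()
episode_lengths = env.envs[0].get_episode_lengths()
print(len(episode_rewards), "episodes recorded during training")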
from stable_baselines import SAC
model = SAC('MlpPolicy', 'LunarLanderContinuous-v2', verbose=1,
tensorboard_log='/tmp/sac/')
model.learn(int(1e5))
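The logged run can then be inspected with the standard TensorBoard command:

tensorboard --logdir /tmp/sac/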
import numpy as np

from stable_baselines import SAC

def callback(locals_, globals_):
    self_ = locals_['self']
    # Check every 1000 calls
    if self_.n_callback_calls % 1000 == 0:
        # Save best model (according to training reward)
        if locals_.get('mean_reward', -np.inf) > self_.best_mean_reward:
            print("Saving best model")
            self_.save('sac_best')
            self_.best_mean_reward = locals_['mean_reward']
        # Stop training when target performance is attained
        if self_.best_mean_reward > -800:
            print("Stopping training")
            return False
    self_.n_callback_calls += 1
    return True

model = SAC("MlpPolicy", "Pendulum-v0", verbose=1)
# Define attributes on the model to avoid global variables
model.best_mean_reward = -np.inf
model.n_callback_calls = 0
model.learn(100000, callback=callback)
HalfCheetahBulletEnv-v0:
  env_wrapper: utils.wrappers.TimeFeatureWrapper
  n_timesteps: !!float 2e6
  policy: 'MlpPolicy'
  gamma: 0.99
  buffer_size: 1000000
  noise_type: 'normal'
  noise_std: 0.1
  learning_starts: 10000
  batch_size: 100
  learning_rate: !!float 1e-3
  train_freq: 1000
  gradient_steps: 1000
  policy_kwargs: 'dict(layers=[400, 300])'
python train.py --algo td3 --env HalfCheetahBulletEnv-v0
python enjoy.py --algo td3 --env HalfCheetahBulletEnv-v0
python -m utils.record_video --algo td3 --env HalfCheetahBulletEnv-v0 -n 1000
python train.py --algo ppo2 --env MountainCar-v0 \
--optimize --n-trials 1000 --n-jobs 2 \
--sampler tpe --pruner median