from stable_baselines3 import DQN
# SAC, TD3, TQC are all successors of DQN
from stable_baselines3 import SAC, TD3
from sb3_contrib import TQC
# Instantiate the algorithm on the Lunar Lander env
model = DQN("MlpPolicy", "LunarLander-v2", verbose=1)
# Train for 100 000 steps
model.learn(100_000, progress_bar=True)
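Not part of the original snippet: the trained agent can then be evaluated, saved and reloaded with SB3's standard helpers; `eval_env` below is an assumed separate evaluation environment, shown here as a sketch.

import gymnasium as gym
from stable_baselines3.common.evaluation import evaluate_policy

# Separate environment for evaluation (no rendering needed)
eval_env = gym.make("LunarLander-v2")
# Average episodic return over 10 evaluation episodes
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10)
print(f"mean_reward={mean_reward:.2f} +/- {std_reward:.2f}")
# Save the agent and reload it later
model.save("dqn_lunar")
model = DQN.load("dqn_lunar")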
Raffin et al. "Learning to Exploit Elastic Actuators for Quadruped Locomotion.": https://github.com/araffin/sbx
How good is it to be in this state?
Win: 1.0 | Draw: 0.5 | Lose: 0.0
Depends on the state
Depends on the policy
Source: Freek Stulp - Master AIC
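In symbols (the standard textbook definition, added here for reference), the value of a state under a policy $\pi$ is the expected discounted return obtained when starting in $s$ and then following $\pi$:
\[\begin{aligned} V_\pi(s) = \mathbb{E}_\pi \Big[ \sum_{t \geq 0} \gamma^t r_t \mid s_0 = s \Big] \end{aligned} \]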
What if we have no model?
Solution: $Q_\pi(s, a)$ instead of $V_\pi(s)$
\[\begin{aligned} \pi(s) = \argmax_{a \in A} Q_\pi(s, a) \end{aligned} \]
Bellman equation for optimal value function:
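For reference, the standard form for the optimal action-value function is:
\[\begin{aligned} Q^*(s, a) = \mathbb{E} \Big[ r(s, a) + \gamma \max_{a' \in A} Q^*(s', a') \Big] \end{aligned} \]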
Q-learning update rule
$\alpha=1$ (learning rate)
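For reference, the standard tabular update is
\[\begin{aligned} Q(s_t, a_t) \leftarrow Q(s_t, a_t) + \alpha \Big( r_t + \gamma \max_{a' \in A} Q(s_{t+1}, a') - Q(s_t, a_t) \Big) \end{aligned} \]
which, with $\alpha = 1$, reduces to directly assigning the target: $Q(s_t, a_t) \leftarrow r_t + \gamma \max_{a' \in A} Q(s_{t+1}, a')$.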
$\textcolor{#1864ab}{Q_\theta(s_t, a_t)}$ depends on $\textcolor{#a61e4d}{Q_\theta(s_{t+1}, a')}$...
What can we do about it?
Iterate! Use $Q^{\textcolor{green}{n}}_\theta(s_t, a_t)$ to compute the targets for the next estimate $Q^{\textcolor{green}{n+1}}_\theta(s_t, a_t)$, as in the code below
# Initial Q-value estimate: start from the immediate rewards (one-step return)
initial_targets = rewards
# Q-function input: concatenate states and actions along the feature axis
qf_input = np.concatenate((states, actions), axis=1)
qf_model.fit(qf_input, initial_targets)

targets = np.zeros_like(rewards)
for _ in range(N_ITERATIONS):
    # Re-use the Q-value model from the previous iteration
    # to create the next targets
    next_q_values = get_max_q_values(qf_model, next_states)
    # Non-terminal states: reward plus discounted value of the next state
    targets[non_terminal_states] = (
        rewards[non_terminal_states]
        + gamma * next_q_values[non_terminal_states]
    )
    # Special case for terminal states: no bootstrapping
    targets[terminal_states] = rewards[terminal_states]
    # Update the Q-value estimate
    qf_model.fit(qf_input, targets)
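The helper `get_max_q_values` is assumed above; a minimal sketch, assuming a discrete action space with `N_ACTIONS` actions (a name introduced here) and a scikit-learn-style regressor trained on concatenated (state, action) features:

def get_max_q_values(qf_model, next_states):
    # Evaluate Q(s', a') for every possible action, then keep the best one
    all_q_values = []
    for action in range(N_ACTIONS):
        # Repeat the same action for every next state
        actions = np.full((len(next_states), 1), action)
        qf_input = np.concatenate((next_states, actions), axis=1)
        all_q_values.append(qf_model.predict(qf_input))
    # Shape (n_states, n_actions): take the max over the action dimension
    return np.max(np.stack(all_q_values, axis=1), axis=1)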
https://gymnasium.farama.org/environments/classic_control/cart_pole/
import gymnasium as gym

# Create the environment
env = gym.make("CartPole-v1", render_mode="human")
# Reset env and get first observation
obs, _ = env.reset()
# Step in the env with random actions and display the env
for _ in range(100):
    env.render()  # Display the env
    action = env.action_space.sample()
    # Retrieve new observation, reward,
    # termination signal, truncation signal
    # and additional infos
    next_obs, reward, terminated, truncated, info = env.step(action)
    # Update current observation
    obs = next_obs
    # End of an episode
    if terminated or truncated:
        obs, _ = env.reset()
# Retrieve Q-values for the current observation
q_values = q_model(obs)
# Follow the greedy policy:
# take the action with the highest Q-value
action = np.argmax(q_values)
# Do one step in the env
next_obs, reward, terminated, _, _ = env.step(action)
# Store the transition in the replay buffer
replay_buffer.store(obs, action, reward, terminated, next_obs)
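The `replay_buffer` object is assumed here; a minimal sketch of such a buffer (a simplified stand-in, not SB3's implementation), backed by a fixed-size deque:

from collections import deque
import random

class ReplayBuffer:
    def __init__(self, max_size=10_000):
        # Oldest transitions are dropped once the buffer is full
        self.buffer = deque(maxlen=max_size)

    def store(self, obs, action, reward, terminated, next_obs):
        self.buffer.append((obs, action, reward, terminated, next_obs))

    def sample(self, batch_size):
        # Uniformly sample a mini-batch of past transitions
        return random.sample(self.buffer, batch_size)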
# Flip a biased coin
take_random_action = np.random.rand() < exploration_rate
if take_random_action:
    # Random action
    action = action_space.sample()
else:
    # Greedy action
    action = np.argmax(q_values)
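The `exploration_rate` is usually annealed during training so the agent explores a lot at first and mostly exploits later; one common choice (an illustrative sketch with made-up constants, not SB3's internal schedule) is a linear decay:

def linear_schedule(step, total_steps, start_eps=1.0, end_eps=0.05, fraction=0.1):
    # Decay epsilon linearly over the first `fraction` of training,
    # then keep it constant at `end_eps`
    progress = min(step / (fraction * total_steps), 1.0)
    return start_eps + progress * (end_eps - start_eps)

# Example: epsilon after 5 000 of 100 000 training steps
exploration_rate = linear_schedule(step=5_000, total_steps=100_000)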
NumPy | PyTorch
---|---
np.array([[1, 2], [3, 4]]) | th.tensor([[1, 2], [3, 4]])
np.ones((2, 3)) | th.ones(2, 3)
np.concatenate | th.cat
x.shape | x.shape
x.argmax(axis=...) | x.argmax(dim=...)
x.item() | x.item()
NumPy to PyTorch: | th.as_tensor
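A quick round-trip example (assuming the `import torch as th` alias used in the table):

import numpy as np
import torch as th

x_np = np.array([[1, 2], [3, 4]])
# NumPy to PyTorch (no copy when dtype and device allow it)
x_th = th.as_tensor(x_np)
# PyTorch back to NumPy
x_back = x_th.numpy()
# Index of the max along a dimension, as a plain Python int
best_index = x_th.argmax(dim=1)[0].item()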
import numpy as np
from sklearn.linear_model import LinearRegression
# Generate some data (noisy linear function)
x = np.linspace(0, 5, num=50).reshape(50, 1)
y = 2 * x + 10 + 0.1 * np.random.rand(50, 1)
# Fit a linear model using least squares
model = LinearRegression().fit(x, y)
y_predict = model.predict(x)
# Retrieve the optimized parameters
slope, bias = model.coef_, model.intercept_
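As a quick sanity check (added here for illustration), the recovered parameters should be close to the true slope of 2 and intercept of 10 used to generate the data:

# slope ~ 2, bias ~ 10 (up to the small noise term)
print(f"slope={slope.item():.2f}, bias={bias.item():.2f}")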