```python
from stable_baselines3 import DQN
# SAC, TD3, TQC are all successors of DQN
from stable_baselines3 import SAC, TD3
from sb3_contrib import TQC

# Instantiate the algorithm on the Lunar Lander env
model = DQN("MlpPolicy", "LunarLander-v2", verbose=1)
# Train for 100 000 steps
model.learn(100_000, progress_bar=True)
```
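A possible follow-up (not shown on the slide): save the trained agent and evaluate it with SB3's `evaluate_policy` helper. The file name `dqn_lunar` is arbitrary.

```python
from stable_baselines3.common.evaluation import evaluate_policy

# Save the trained agent to disk
model.save("dqn_lunar")

# Evaluate the greedy policy over a few episodes
mean_reward, std_reward = evaluate_policy(
    model, model.get_env(), n_eval_episodes=10, deterministic=True
)
print(f"mean_reward = {mean_reward:.2f} +/- {std_reward:.2f}")
```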
Raffin et al. "Learning to Exploit Elastic Actuators for Quadruped Locomotion.": https://github.com/araffin/sbx
How good is it to be in this state?
Win: 1.0 | Draw: 0.5 | Lose: 0.0
Depends on the state
Depends on the policy
Source: Freek Stulp - Master AIC
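Formally (standard definition, with discount factor $\gamma$), the value of a state is the expected discounted return obtained when following policy $\pi$ from that state:

$$V^\pi(s) = \mathbb{E}_\pi\!\left[\sum_{t=0}^{\infty} \gamma^t r_t \,\middle|\, s_0 = s\right]$$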
What if we have no model?
Solution: $Q^\pi(s, a)$ instead of $V^\pi(s)$
$$\pi(s) = \arg\max_{a \in A} Q^\pi(s, a)$$
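For reference, $Q^\pi$ is defined like $V^\pi$ but also conditions on the first action (standard definition). Acting greedily with respect to $Q^\pi$ requires no transition model, which is why it answers the question above:

$$Q^\pi(s, a) = \mathbb{E}_\pi\!\left[\sum_{t=0}^{\infty} \gamma^t r_t \,\middle|\, s_0 = s,\ a_0 = a\right]$$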
Bellman equation for optimal value function:
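In its standard textbook form, the Bellman optimality equation for the action-value function reads:

$$Q^*(s, a) = \mathbb{E}_{s'}\!\left[r(s, a) + \gamma \max_{a'} Q^*(s', a')\right]$$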
Q-learning update rule
$\alpha = 1$ (learning rate)
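The standard tabular Q-learning update, with learning rate $\alpha$, is:

$$Q(s_t, a_t) \leftarrow Q(s_t, a_t) + \alpha \left[r_t + \gamma \max_{a'} Q(s_{t+1}, a') - Q(s_t, a_t)\right]$$

With $\alpha = 1$, the update simply replaces the old estimate with the target $r_t + \gamma \max_{a'} Q(s_{t+1}, a')$, which is the form used in the fitted Q-iteration code below.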
$Q_\theta(s_t, a_t)$ depends on $Q_\theta(s_{t+1}, a')$...
What can we do about it?
Iterate! Use $Q_{\theta_n}(s_t, a_t)$ to compute the targets for the next estimate $Q_{\theta_{n+1}}$
```python
initial_targets = rewards
# Initial Q-value estimate
qf_input = np.concatenate((states, actions), axis=1)
qf_model.fit(qf_input, initial_targets)

for _ in range(N_ITERATIONS):
    # Re-use Q-value model from previous iteration
    # to create the next targets
    next_q_values = get_max_q_values(qf_model, next_states)
    targets = np.zeros_like(rewards)
    # Non-terminal states target
    targets[non_terminal_states] = (
        rewards[non_terminal_states]
        + gamma * next_q_values[non_terminal_states]
    )
    # Special case for terminal states
    targets[terminal_states] = rewards[terminal_states]
    # Update Q-value estimate
    qf_model.fit(qf_input, targets)
```
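The snippet above assumes a helper `get_max_q_values`. A minimal sketch, under the assumption of a small discrete action set and a scikit-learn-style `qf_model.predict` (names and action encoding are illustrative, not a fixed API):

```python
import numpy as np

# Hypothetical discrete action set (e.g. two actions encoded as 0 and 1)
all_actions = np.array([[0.0], [1.0]])

def get_max_q_values(qf_model, next_states):
    # Evaluate Q(s', a') for every candidate action a'
    # and keep the highest value for each state
    q_values_per_action = []
    for action in all_actions:
        # Repeat the same action for every state in the batch
        actions = np.tile(action, (len(next_states), 1))
        qf_input = np.concatenate((next_states, actions), axis=1)
        q_values_per_action.append(qf_model.predict(qf_input))
    # Stack to shape (n_actions, n_states), then max over actions
    return np.max(q_values_per_action, axis=0)
```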
https://gymnasium.farama.org/environments/classic_control/cart_pole/
```python
import gymnasium as gym

# Create the environment
env = gym.make("CartPole-v1", render_mode="human")
# Reset env and get first observation
obs, _ = env.reset()

# Step in the env with random actions and display the env
for _ in range(100):
    env.render()  # Display the env
    action = env.action_space.sample()
    # Retrieve new observation, reward,
    # termination signal, truncation signal
    # and additional infos
    next_obs, reward, terminated, truncated, info = env.step(action)
    # Update current observation
    obs = next_obs
    # End of an episode
    if terminated or truncated:
        obs, _ = env.reset()
```
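It also helps to inspect the observation and action spaces before writing an agent. For CartPole-v1 the observation is a 4-dimensional Box (cart position/velocity, pole angle/angular velocity) and the action space is Discrete(2):

```python
import gymnasium as gym

env = gym.make("CartPole-v1")
print(env.observation_space)        # Box with shape (4,)
print(env.action_space)             # Discrete(2): push left or right
print(env.observation_space.shape)  # (4,)
print(env.action_space.n)           # 2
```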
```python
# Retrieve q values for the current observation
q_values = q_model(obs)
# Follow greedy-policy:
# take the action with the highest q_value
action = np.argmax(q_values)
# Do one step in the env
next_obs, reward, terminated, _, _ = env.step(action)
# Store transition in the replay buffer
replay_buffer.store(obs, action, reward, terminated, next_obs)
```
```python
# Flip a biased coin
take_random_action = np.random.rand() < exploration_rate
if take_random_action:
    # Random action
    action = action_space.sample()
else:
    # Greedy action
    action = np.argmax(q_values)
```
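The exploration rate is typically decayed over training, e.g. linearly from 1.0 down to a small final value. A minimal sketch of such a schedule (the bounds and the fraction of training used for the decay are arbitrary choices, not values from the slides):

```python
def linear_schedule(step, total_steps, start=1.0, end=0.05, decay_fraction=0.1):
    # Linearly interpolate from `start` to `end` over the first
    # `decay_fraction` of training, then keep the final value
    progress = min(step / (decay_fraction * total_steps), 1.0)
    return start + progress * (end - start)

# Example: epsilon at the beginning, middle and end of the decay
print(linear_schedule(0, 100_000))       # 1.0
print(linear_schedule(5_000, 100_000))   # 0.525
print(linear_schedule(50_000, 100_000))  # 0.05
```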
| NumPy | PyTorch |
|---|---|
| np.array([[1, 2], [3, 4]]) | th.tensor([[1, 2], [3, 4]]) |
| np.ones((2, 3)) | th.ones(2, 3) |
| np.concatenate | th.cat |
| x.shape | x.shape |
| x.argmax(axis=...) | x.argmax(dim=...) |
| x.item() | x.item() |
| NumPy to PyTorch: | th.as_tensor |
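A short sketch exercising the equivalences from the table (using the usual `import torch as th` alias):

```python
import numpy as np
import torch as th

x_np = np.array([[1, 2], [3, 4]])
x_th = th.tensor([[1, 2], [3, 4]])

# Same shape attribute in both libraries
assert x_np.shape == tuple(x_th.shape) == (2, 2)

# argmax: `axis` in NumPy, `dim` in PyTorch
assert x_np.argmax(axis=1).tolist() == x_th.argmax(dim=1).tolist()

# Concatenation: np.concatenate vs th.cat
np.concatenate((x_np, np.ones((2, 3))), axis=1)
th.cat((x_th, th.ones(2, 3, dtype=th.int64)), dim=1)

# NumPy -> PyTorch (shares memory when possible)
x_converted = th.as_tensor(x_np)
# PyTorch -> NumPy
back_to_np = x_th.numpy()
```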
```python
import numpy as np
from sklearn.linear_model import LinearRegression

# Generate some data (noisy linear function)
x = np.linspace(0, 5, num=50).reshape(50, 1)
y = 2 * x + 10 + 0.1 * np.random.rand(50, 1)

# Fit a linear model using least squares
model = LinearRegression().fit(x, y)
y_predict = model.predict(x)

# Retrieve the optimized parameters
slope, bias = model.coef_, model.intercept_
```
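For comparison (a sketch, not from the slides), the same fit can be written as a small gradient-descent loop in PyTorch; the layer, loss and optimizer are the usual choices and the hyperparameters are arbitrary:

```python
import numpy as np
import torch as th

# Same data as above, converted to tensors
x_np = np.linspace(0, 5, num=50).reshape(50, 1)
y_np = 2 * x_np + 10 + 0.1 * np.random.rand(50, 1)
x = th.as_tensor(x_np, dtype=th.float32)
y = th.as_tensor(y_np, dtype=th.float32)

# One linear layer: y = slope * x + bias
model = th.nn.Linear(in_features=1, out_features=1)
optimizer = th.optim.SGD(model.parameters(), lr=0.05)

for _ in range(2_000):
    loss = th.nn.functional.mse_loss(model(x), y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

# Should be close to 2 and 10
slope, bias = model.weight.item(), model.bias.item()
```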