Hey, I had a problem with my RL environment, so I stripped it down to the smallest thing I could, but PPO is still unable to learn the optimal action. My environment looks like this:
from typing import Any
import gymnasium as gym
import numpy as np
from gymnasium.core import ObsType, ActType
from stable_baselines3.common.callbacks import BaseCallback
class LoggingCallback(BaseCallback):
    """Collects the per-step info dicts so the actions can be inspected after evaluation."""

    def __init__(self, verbose=0):
        super().__init__(verbose)
        self.infos = []

    def _on_step(self) -> bool:
        # Store the info dict of the first (and only) environment.
        self.infos.append(self.locals["infos"][0])
        return True

    def __call__(self, locals_, globals_):
        # Lets the same object be passed as the `callback` argument of evaluate_policy,
        # which calls it with locals() and globals() instead of the BaseCallback API.
        self.locals = locals_
        self.globals = globals_
        self._on_step()
class Dummy(gym.Env):
    """Minimal continuous env: the action shifts the state, and the reward is the state itself."""

    def __init__(self):
        self.observation_space = gym.spaces.Box(low=0, high=1, shape=(1,), dtype=np.float32)
        self.action_space = gym.spaces.Box(low=-1, high=1, shape=(1,), dtype=np.float32)
        self.state = 0.5
        self.counter = 0

    def reset(self, seed: int | None = None, options: dict[str, Any] | None = None) -> tuple[ObsType, dict[str, Any]]:
        super().reset(seed=seed)
        self.counter = 0
        self.state = 0.5
        observation = np.array([self.state], dtype=np.float32)
        return observation, {}

    def step(self, action: ActType) -> tuple[ObsType, float, bool, bool, dict]:
        # The action moves the state, and the new state is the reward,
        # so a larger action always means a larger reward.
        self.state += action[0] * 0.1
        observation = np.array([self.state], dtype=np.float32)
        reward = self.state
        # End the episode after roughly 100 steps.
        terminated = self.counter > 100
        truncated = False
        self.counter += 1
        info = {"observation": observation, "action": action, "reward": reward}
        return observation, reward, terminated, truncated, info
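For reference, this is roughly how I convince myself that always playing action 1 is optimal. It's just a throwaway sketch (not part of my actual code) that rolls out a hard-coded "always act 1" policy on the Dummy env above:

# Hypothetical sanity check: roll out the constant "always act 1" policy by hand.
env = Dummy()
obs, _ = env.reset()
done = False
total_reward = 0.0
while not done:
    obs, reward, terminated, truncated, info = env.step(np.array([1.0], dtype=np.float32))
    total_reward += reward
    done = terminated or truncated
print(total_reward)  # the return the trained agent should be able to reach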
And my training code is:
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from src.environment.dummy import Dummy, LoggingCallback
env = Dummy()
model = PPO("MlpPolicy", env)
model.learn(total_timesteps=1000)
trained_callback = LoggingCallback()
trained_mean_reward, trained_std_reward = evaluate_policy(model, env, n_eval_episodes=1, callback=trained_callback)
print([el["action"] for el in trained_callback.infos])
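On top of logging the actions during evaluation, another way I look at the learned behaviour (again just an illustrative sketch, not my exact code) is to query the trained policy deterministically at a few observations, where I would expect an action close to 1 everywhere:

# Illustrative check: ask the trained policy for its deterministic action at a few states.
for s in [0.0, 0.25, 0.5, 0.75, 1.0]:
    action, _ = model.predict(np.array([s], dtype=np.float32), deterministic=True)
    print(s, action)  # I expected actions close to 1 here, which is not what I get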
And the actions taken are basically always around 0.15. The relationship between observation, action, and reward is literally linear, so why does PPO not learn to take action 1 all the time?