I used the Proximal Policy Optimization (PPO) deep reinforcement learning algorithm from Stable-Baselines3 to train an agent on a custom environment called TemperatureEnv, but the performance is not satisfactory even after 1 million (and then 10 million) timesteps.
Source code:
import gymnasium as gym
from gymnasium import Env
from gymnasium.spaces import Discrete, Box, Dict, Tuple, MultiBinary, MultiDiscrete
import numpy as np
import random
import os
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_checker import check_env
class TemperatureEnv(Env):
    def __init__(self):
        # actions: 0, 1, 2
        self.action_space = Discrete(3)
        # temperature range: 0-100
        self.observation_space = Box(low=np.array([0]), high=np.array([100]))
        # initial temperature: 47-53
        self.state = 50 + random.randint(-3, 3)
        # episode length: 60
        self.episode_length = 60

    def step(self, action):
        # 0: temperature -1, 1: temperature does not change, 2: temperature +1
        self.state += action - 1
        # decrement remaining episode length
        self.episode_length -= 1
        # reward mechanism
        if self.state >= 49 and self.state <= 51:
            reward = 1
        else:
            reward = -1
        # end condition
        if self.episode_length <= 0:
            terminated = True
        else:
            terminated = False
        if self.state < 0 or self.state > 100:
            truncated = True
        else:
            truncated = False
        # info
        info = {}
        return self.state, reward, terminated, truncated, info

    def render(self):
        # Implement viz
        pass

    def reset(self, seed=None):
        # reset temperature
        self.state = np.array([50 + random.randint(-3, 3)], dtype=np.float32)
        info = {}
        # reset episode length
        self.episode_length = 60
        return self.state, info
# training
env = TemperatureEnv()
log_dir = './log/'
os.makedirs(log_dir, exist_ok=True)
model = PPO("MlpPolicy", env, verbose=1, tensorboard_log=log_dir)
model.learn(total_timesteps=int(10e6), progress_bar=True, tb_log_name='ppo_custom')
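As a side note, check_env is imported above but not called anywhere in this script; a minimal sketch of how it could be run to validate the environment (standard Stable-Baselines3 usage, with warn=True so any warnings are printed):

# optional sanity check of the custom environment with the already-imported env checker
test_env = TemperatureEnv()
check_env(test_env, warn=True)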
Library versions:
python==3.8.10
stable-baselines3==2.1.0
gymnasium==0.29.0
Output:
I trained for 1 million and 10 million timesteps, respectively.
I believe that for a simple environment like TemperatureEnv, this amount of training should be more than enough to achieve satisfactory results (with scores approaching 60, the maximum possible episode return, since each episode lasts 60 steps and the per-step reward is at most 1). What is the issue, and where is it occurring?
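For clarity, by "score" I mean the mean episode reward. A minimal sketch of how it can be measured with the evaluate_policy helper that is already imported above (the n_eval_episodes value is just an arbitrary choice):

# evaluate the trained model; the best achievable mean episode reward is 60
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=10)
print(f"mean episode reward: {mean_reward:.2f} +/- {std_reward:.2f}")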