Problems training a simple custom environment with the Proximal Policy Optimization (PPO) algorithm


I used the Proximal Policy Optimization (PPO) deep reinforcement learning algorithm to train an agent on a custom environment called TemperatureEnv for one million steps (and again for ten million), but the performance is not satisfactory.

Source code:

import gymnasium as gym
from gymnasium import Env
from gymnasium.spaces import Discrete, Box, Dict, Tuple, MultiBinary, MultiDiscrete 
import numpy as np
import random
import os
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_checker import check_env
class TemperatureEnv(Env):
    def __init__(self):
        # 0,1,2
        self.action_space = Discrete(3)
        # temperature range: 0-100
        self.observation_space = Box(low=np.array([0]), high=np.array([100]))
        # initial temperature: 47-53
        self.state = 50 + random.randint(-3,3)
        # episode length: 60
        self.episode_length = 60
        
    def step(self, action):
        # 0: temperature-1, 1: temperature do not change, 2: temperature+1
        self.state += action - 1
        # episode length--
        self.episode_length -= 1 
        
        # reward mechanism
        if self.state >= 49 and self.state <= 51:
            reward = 1 
        else: 
            reward = -1
        
        # end condition
        if self.episode_length <= 0: 
            terminated = True
        else:
            terminated = False
        if self.state < 0 or self.state > 100:
            truncated = True
        else:
            truncated = False

        # info
        info = {}
        
        return self.state, reward, terminated, truncated, info

    def render(self):
        # Implement viz
        pass
    
    def reset(self, seed=None):
        # reset temperature
        self.state = np.array([50 + random.randint(-3,3)],dtype=np.float32)
        info = {}
        # reset episode length
        self.episode_length = 60
        return self.state, info

# training
env = TemperatureEnv()
log_dir = './log/'
os.makedirs(log_dir, exist_ok=True)
model = PPO("MlpPolicy", env, verbose=1, tensorboard_log=log_dir)
model.learn(total_timesteps=int(10e6), progress_bar=True, tb_log_name='ppo_custom')
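
As an aside, check_env is imported above but never called; a minimal sketch of a pre-training sanity check, reusing the env instance created above, would be:

from stable_baselines3.common.env_checker import check_env

# Sanity-check the custom environment against the Gymnasium API before training.
# This is only a sketch; warnings/errors point at mismatches between
# reset/step and the declared action/observation spaces.
check_env(env, warn=True)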

Library versions:

python==3.8.10
stable-baselines3==2.1.0
gymnasium==0.29.0

Output:

[Results after training for 1 million and 10 million time steps, respectively]
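
For reference, by "score" I mean the total reward accumulated over one episode. A minimal sketch of measuring it with evaluate_policy (imported above but otherwise unused), reusing the model and env from the training code:

from stable_baselines3.common.evaluation import evaluate_policy

# Average the undiscounted episode reward over several evaluation episodes.
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=20)
print(f"mean episode reward: {mean_reward:.2f} +/- {std_reward:.2f}")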

I believe that for a simple environment like TemperatureEnv, this amount of training should be sufficient to achieve satisfactory results (episode rewards approaching 60, since each episode lasts 60 steps and the per-step reward is at most 1). What is the issue, and where does it occur?
