My state space has a dimension of 10 (PV generation and price forecasts). My training set has 8000 hours of data and my test set has 760 hours. I use a low learning rate, a large replay memory, a target network soft-updated with tau = 0.005, batch_size = 64, and an epsilon that decays slowly from 1 to 0.01, but training is still not stable. Which hyperparameters should I tune? Should I also consider data augmentation to enlarge the training set? Thank you!
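By data augmentation I mean something like jittering the PV and price forecast features with small noise to generate extra training hours. A rough sketch of the idea (the array shape and noise scale here are just placeholders, not my actual pipeline):

import numpy as np

def augment_forecasts(X: np.ndarray, n_copies: int = 2, noise_std: float = 0.02, seed: int = 0) -> np.ndarray:
    """Jitter hourly forecast features with Gaussian noise.

    X is assumed to be shaped (n_hours, n_features); each extra copy gets
    independent noise scaled by the per-feature standard deviation.
    """
    rng = np.random.default_rng(seed)
    feat_std = X.std(axis=0, keepdims=True)
    copies = [X]
    for _ in range(n_copies):
        copies.append(X + rng.normal(0.0, noise_std, size=X.shape) * feat_std)
    return np.concatenate(copies, axis=0)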
import math, random
from collections import Counter
from itertools import count
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter

class DQN(nn.Module):
    def __init__(self, n_observations: int, n_actions: int):
        super(DQN, self).__init__()
        self.layer1 = nn.Linear(n_observations, 64)  # use the constructor argument, not the global observation_space_n
        self.layer2 = nn.Linear(64, 32)
        self.layer3 = nn.Linear(32, n_actions)

    def forward(self, x):
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        return self.layer3(x)
BATCH_SIZE = 64
GAMMA = 0.99
EPS_START = 1
EPS_END = 0.01
EPS_DECAY = 10000
TAU = 0.005
LR = 0.00001
MEMORY_SIZE = 30000
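ReplayMemory and Transition are not shown above; they are the standard helpers from the PyTorch DQN tutorial, copied here so the snippet is self-contained:

from collections import namedtuple, deque

Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))

class ReplayMemory(object):
    def __init__(self, capacity):
        self.memory = deque([], maxlen=capacity)

    def push(self, *args):
        """Save a transition."""
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)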
# Get the number of state observations and actions from the environment
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
state, info = microgrid_train_env.reset()
n_observations = len(state)
n_actions = microgrid_train_env.action_space.n
policy_net = DQN(n_observations, n_actions).to(device)
target_net = DQN(n_observations, n_actions).to(device)
target_net.load_state_dict(policy_net.state_dict())
optimizer = optim.RMSprop(policy_net.parameters(), lr=LR)
memory = ReplayMemory(MEMORY_SIZE)
steps_done = 0
writer = SummaryWriter('microgrid_models/first_microgrid_models/rl_algo/runs/1')
def select_action(state):
    global steps_done
    sample = random.random()
    eps_threshold = EPS_END + (EPS_START - EPS_END) * \
        math.exp(-1. * steps_done / EPS_DECAY)
    steps_done += 1
    if sample > eps_threshold:
        with torch.no_grad():
            # Exploit: pick the action with the largest predicted Q-value;
            # max(1).indices is the argmax over the action dimension.
            return policy_net(state).max(1).indices.view(1, 1)
    else:
        # Explore: pick a random action.
        return torch.tensor([[random.randrange(n_actions)]], device=device,
                            dtype=torch.long)
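To quantify how slow this decay actually is, here is the schedule from select_action evaluated at a few step counts (just the formula above, assuming one environment step per hour of training data):

# eps(steps) = EPS_END + (EPS_START - EPS_END) * exp(-steps / EPS_DECAY)
for steps in (0, 8000, 30000, 50000):
    eps = EPS_END + (EPS_START - EPS_END) * math.exp(-1. * steps / EPS_DECAY)
    print(f"steps={steps:>6}  epsilon={eps:.3f}")
# steps=     0  epsilon=1.000
# steps=  8000  epsilon=0.455   (one pass over the 8000-hour training set)
# steps= 30000  epsilon=0.059
# steps= 50000  epsilon=0.017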
episode_durations = []
def optimize_model():
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)
    batch = Transition(*zip(*transitions))
    # Mask of transitions whose next state is non-terminal
    non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
                                            batch.next_state)), device=device, dtype=torch.bool)
    non_final_next_states = torch.cat([s for s in batch.next_state
                                       if s is not None])
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)
    # Q(s, a) for the actions that were actually taken
    state_action_values = policy_net(state_batch).gather(1, action_batch)
    # max_a' Q_target(s', a'), with 0 for terminal states
    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    with torch.no_grad():
        next_state_values[non_final_mask] = target_net(non_final_next_states).max(1).values
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch
    criterion = nn.SmoothL1Loss()
    loss = criterion(state_action_values, expected_state_action_values.unsqueeze(1))
    optimizer.zero_grad()
    loss.backward()
    # In-place gradient clipping
    torch.nn.utils.clip_grad_value_(policy_net.parameters(), 100)
    optimizer.step()
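evaluate_model is not shown above; it is essentially a greedy (epsilon = 0) rollout that sums the reward over one episode of the given environment, roughly like this:

def evaluate_model(net, env, device):
    """Greedy rollout; returns the total reward accumulated over one episode."""
    state, info = env.reset()
    state = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0)
    total_reward = 0.0
    done = False
    while not done:
        with torch.no_grad():
            action = net(state).max(1).indices.view(1, 1)
        observation, reward, terminated, truncated, _ = env.step(action.item())
        total_reward += reward
        done = terminated or truncated
        if not done:
            state = torch.tensor(observation, dtype=torch.float32, device=device).unsqueeze(0)
    return total_reward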
def train_agent():
    if torch.cuda.is_available():
        num_episodes = 1000
    else:
        num_episodes = 200
    val_lost = list()
    for i_episode in range(num_episodes):
        # Evaluate the current target network on the held-out test environment
        test_reward = evaluate_model(target_net, microgrid_test_env,
                                     device=device)
        val_lost.append(test_reward)
        if i_episode % 2 == 0:
            writer.add_scalar('test loss',
                              -test_reward,
                              i_episode)
        print("episode", i_episode, "test reward", test_reward)
        state, info = microgrid_train_env.reset()
        state = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0)
        # if early_stopping.early_stop(test_reward):
        #     break
        actions = []
        for t in count():
            action = select_action(state)
            actions.append(action)
            observation, reward, terminated, truncated, _ = microgrid_train_env.step(action.item())
            reward = torch.tensor([reward], device=device)
            done = terminated or truncated
            if terminated:
                next_state = None
            else:
                next_state = torch.tensor(observation, dtype=torch.float32, device=device).unsqueeze(0)
            # Store the transition in memory
            memory.push(state, action, next_state, reward)
            # Move to the next state
            state = next_state
            # Perform one step of the optimization (on the policy network)
            optimize_model()
            # Soft update of the target network's weights
            # θ′ ← τθ + (1 − τ)θ′
            target_net_state_dict = target_net.state_dict()
            policy_net_state_dict = policy_net.state_dict()
            for key in policy_net_state_dict:
                target_net_state_dict[key] = policy_net_state_dict[key] * TAU + target_net_state_dict[key] * (1 - TAU)
            target_net.load_state_dict(target_net_state_dict)
            if done:
                episode_durations.append(t + 1)
                counter = Counter(actions)
                print(counter)
                break
    writer.close()
    train_reward = evaluate_model(target_net, microgrid_train_env,
                                  device=device)
    return target_net, val_lost, train_reward
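I then run it like this:

trained_net, val_lost, train_reward = train_agent()
print("final reward on the training environment:", train_reward)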
I have tried many hyperparameter settings, but training is still unstable and does not converge.