My state space has a dimension of 10 (PV generation and price forecasts). My training set has 8000 hours of data and my test set has 760 hours. I use a low learning rate, a large replay memory, a target network soft-updated with tau = 0.005, batch_size = 64, and epsilon that decays slowly from 1 to 0.01, but training is still not stable. Which hyperparameters should I tune? Should I also consider data augmentation to enlarge the training set? Thank you!
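By data augmentation I mean something along these lines: generating extra training hours by adding small Gaussian noise to the PV and price forecast features. This is only a rough sketch; the `forecasts` array, `augment_forecasts`, the noise scale, and the number of copies are placeholders I made up for illustration, not something I have actually tried or tuned:

import numpy as np

def augment_forecasts(forecasts: np.ndarray, n_copies: int = 3, noise_std: float = 0.05) -> np.ndarray:
    # forecasts: (n_hours, 10) matrix of PV generation and price forecasts.
    # Each copy gets zero-mean noise scaled by the per-feature standard deviation.
    feature_std = forecasts.std(axis=0, keepdims=True)
    copies = [forecasts]
    for _ in range(n_copies):
        noise = np.random.normal(0.0, noise_std * feature_std, size=forecasts.shape)
        copies.append(forecasts + noise)
    return np.concatenate(copies, axis=0)

My current code is below.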

import math
import random
from collections import Counter
from itertools import count

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter

# (ReplayMemory, Transition, evaluate_model, device and the microgrid
# train/test environments are defined elsewhere and not shown here.)


class DQN(nn.Module):

    def __init__(self, n_observations: int, n_actions: int):
        super(DQN, self).__init__()
        self.layer1 = nn.Linear(n_observations, 64)
        self.layer2 = nn.Linear(64, 32)
        self.layer3 = nn.Linear(32, n_actions)


    def forward(self, x):
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        return self.layer3(x)



BATCH_SIZE = 64
GAMMA = 0.99
EPS_START = 1
EPS_END = 0.01
EPS_DECAY = 10000
TAU = 0.005
LR = 0.00001
MEMORY_SIZE = 30000


# Get the number of actions from the environment's action space
n_actions = microgrid_train_env.action_space.n
# Get the number of state observations
state, info = microgrid_train_env.reset()
n_observations = len(state)

policy_net = DQN(n_observations, n_actions).to(device)
target_net = DQN(n_observations, n_actions).to(device)
target_net.load_state_dict(policy_net.state_dict())

optimizer = optim.RMSprop(policy_net.parameters(), lr=LR)
memory = ReplayMemory(MEMORY_SIZE)

steps_done = 0

def select_action(state):
    global steps_done
    sample = random.random()
    eps_threshold = EPS_END + (EPS_START - EPS_END) * \
                    math.exp(-1. * steps_done / EPS_DECAY)
    steps_done += 1
    if sample > eps_threshold:
        with torch.no_grad():
            # max(1) returns (values, indices) along the action dimension;
            # we take the index of the action with the largest expected
            # Q-value, i.e. the greedy action.
            return policy_net(state).max(1).indices.view(1, 1)
    else:
        return torch.tensor([[random.randrange(n_actions)]], device=device,
                            dtype=torch.long)


episode_durations = []



def optimize_model():
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)

    batch = Transition(*zip(*transitions))

    # Mask of transitions whose next_state is non-terminal
    non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
                                            batch.next_state)), device=device, dtype=torch.bool)
    non_final_next_states = torch.cat([s for s in batch.next_state
                                       if s is not None])
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    # Q(s_t, a_t) of the policy network for the actions actually taken
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # max_a Q_target(s_{t+1}, a), with 0 for terminal next states
    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    with torch.no_grad():
        next_state_values[non_final_mask] = target_net(non_final_next_states).max(1).values
    # Bellman target: r_t + gamma * max_a Q_target(s_{t+1}, a)
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    # Huber loss between the current Q estimates and the Bellman targets
    criterion = nn.SmoothL1Loss()
    loss = criterion(state_action_values, expected_state_action_values.unsqueeze(1))

    optimizer.zero_grad()
    loss.backward()
    # In-place gradient clipping
    torch.nn.utils.clip_grad_value_(policy_net.parameters(), 100)
    optimizer.step()


def train_agent():
    if torch.cuda.is_available():
        num_episodes = 1000
    else:
        num_episodes = 200
    # Test-environment evaluation reward, recorded once per episode
    val_rewards = list()

    writer = SummaryWriter('microgrid_models/first_microgrid_models/rl_algo/runs/1')

    for i_episode in range(num_episodes):
        test_reward = evaluate_model(target_net, microgrid_test_env,
                                     device=device)
        val_rewards.append(test_reward)
        if i_episode % 2 == 0:
            writer.add_scalar('test loss',
                              -test_reward,
                              i_episode)
            print("episode", i_episode, "test reward", test_reward)


        state, info = microgrid_train_env.reset()
        state = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0)
        #
        # if early_stopping.early_stop(test_reward):
        #     break
        actions = []
        for t in count():
            action = select_action(state)
            actions.append(action)
            observation, reward, terminated, truncated, _ = microgrid_train_env.step(action.item())
            reward = torch.tensor([reward], device=device)
            done = terminated or truncated

            if terminated:
                next_state = None
            else:
                next_state = torch.tensor(observation, dtype=torch.float32, device=device).unsqueeze(0)

            # Store the transition in memory
            memory.push(state, action, next_state, reward)

            # Move to the next state
            state = next_state

            # Perform one step of the optimization (on the policy network)
            optimize_model()

            # Soft update of the target network's weights:
            # θ′ ← τ·θ + (1 − τ)·θ′
            target_net_state_dict = target_net.state_dict()
            policy_net_state_dict = policy_net.state_dict()
            for key in policy_net_state_dict:
                target_net_state_dict[key] = policy_net_state_dict[key] * TAU + target_net_state_dict[key] * (1 - TAU)
            target_net.load_state_dict(target_net_state_dict)



            if done:
                episode_durations.append(t + 1)
                counter = Counter(actions)
                print(counter)
                break

    writer.close()
    train_reward = evaluate_model(target_net, microgrid_train_env,
                                  device=device)
    return target_net, val_rewards, train_reward
    

I have tried many hyperparameter settings, but training is still unstable and does not converge.
