
I made a DQN to learn tic-tac-toe. So far, I let the agent play all moves in order to see if it learns to always make legal moves that lead to a draw or to a win for one or the other player. After training the network for about 10,000 games, it reaches a draw or a win in about 30 to 40 % of the games.

Afterwards, I wanted to test the network in evaluation mode. Unfortunately, it performs significantly worse there, with only about 1 % of the games ending in a draw or a win. The code for training and testing looks as follows:

import numpy as np

def train(n_games, lr):
    env = TicTacToe()
    brain = Agent(gamma=0.99, epsilon=1.0, batch_size=512, n_actions=9,
                  input_dims=[10], lr=lr)
    scores = []
    test_scores = []
    eps_history = []

    for i in range(n_games):
        if i % 100 == 0 and i > 0:
            avg_score = np.mean(scores[max(0, i-100):(i+1)])
            print('episode:', i, 'average score %.3f:' % avg_score, 'epsilon:', brain.epsilon)
        score = 0
        eps_history.append(brain.epsilon)
        observation = env.reset()
        done = False
        while not done:
            action = brain.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            score += reward
            brain.store_transition(observation, action, reward, observation_, done)
            brain.learn()
            observation = observation_

        scores.append(score)

    # testing the performance
    for i in range(100):
        brain.epsilon = 0.3
        score = 0
        observation = env.reset()
        done = False
        while not done:
            action = brain.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            score += reward
            observation = observation_
        test_scores.append(score)

    print(np.mean(test_scores))

The only thing that changes between the training loop and the testing loop is that these two lines are left out:

brain.store_transition(observation, action, reward, observation_, done)
brain.learn()
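
For reference, a purely greedy evaluation (exploration switched off entirely, instead of epsilon pinned at 0.3) would look roughly like the following sketch, reusing the env and brain objects from the train function above:

# Sketch of a fully greedy evaluation: with epsilon = 0, choose_action always
# picks the argmax of the predicted Q-values and never explores.
brain.epsilon = 0.0
greedy_scores = []
for _ in range(100):
    observation = env.reset()
    done = False
    score = 0
    while not done:
        action = brain.choose_action(observation)
        observation, reward, done, info = env.step(action)
        score += reward
    greedy_scores.append(score)
print(np.mean(greedy_scores))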

The agent class looks like this:

import numpy as np
import torch
import torch.nn.functional as F


class Agent(object):
    def __init__(self, gamma, epsilon, lr, input_dims, batch_size, n_actions,
                 max_mem_size=1_000_000, eps_end=0.05, eps_dec=0.99995):
        self.gamma = gamma
        self.epsilon = epsilon
        self.eps_end = eps_end
        self.eps_dec = eps_dec
        self.lr = lr
        self.batch_size = batch_size
        self.n_actions = n_actions
        self.action_space = [i for i in range(n_actions)]
        self.mem_size = max_mem_size        # memory to store experiences and randomly sample from them
        self.mem_counter = 0
        self.Q_eval = DQN(lr=self.lr, n_actions=self.n_actions, input_dims=input_dims,
                          fc1_dims=32, fc2_dims=32)
        self.Q_target = DQN(lr=self.lr, n_actions=self.n_actions, input_dims=input_dims,
                          fc1_dims=64, fc2_dims=64)
        self.state_memory = np.zeros((self.mem_size, *input_dims))
        self.new_state_memory = np.zeros((self.mem_size, *input_dims))
        self.action_memory = np.zeros((self.mem_size, self.n_actions),
                                      dtype=np.uint8)
        self.reward_memory = np.zeros(self.mem_size)
        self.terminal_memory = np.zeros(self.mem_size, dtype=np.uint8)      # stores 1 - done, used as a mask on the bootstrap term

    def store_transition(self, state, action, reward, state_, terminal):
        index = self.mem_counter % self.mem_size
        self.state_memory[index] = state
        actions = np.zeros(self.n_actions)
        actions[action] = 1.0       # one hot encoding of actions
        self.action_memory[index] = actions
        self.reward_memory[index] = reward
        self.terminal_memory[index] = not terminal
        self.new_state_memory[index] = state_
        self.mem_counter += 1

    def choose_action(self, observation):
        rand = np.random.random()
        if rand < self.epsilon:
            action = np.random.choice(self.action_space)
        else:
            actions = self.Q_eval.forward(observation)
            action = torch.argmax(actions).item()
        return action

    def learn(self):
        if self.mem_counter > self.batch_size:
            self.Q_eval.optimizer.zero_grad()

            max_mem = self.mem_counter if self.mem_counter < self.mem_size \
                else self.mem_size
            batch = np.random.choice(max_mem, self.batch_size)

            state_batch = self.state_memory[batch]
            action_batch = self.action_memory[batch]
            action_values = np.array(self.action_space, dtype=np.int32)
            action_indices = np.dot(action_batch, action_values)
            reward_batch = self.reward_memory[batch]
            terminal_batch = self.terminal_memory[batch]
            new_state_batch = self.new_state_memory[batch]

            reward_batch = torch.Tensor(reward_batch).to(self.Q_eval.device)
            terminal_batch = torch.Tensor(terminal_batch).to(self.Q_eval.device)

            q_eval = self.Q_eval.forward(state_batch).to(self.Q_eval.device)
            #q_target = self.Q_target.forward(state_batch).to(self.Q_target.device)  # alternative to q_eval.clone()
            q_target = q_eval.clone()
            q_next = self.Q_eval.forward(new_state_batch).to(self.Q_eval.device)

            batch_index = np.arange(self.batch_size, dtype=np.int32)
            q_target[batch_index, action_indices] = reward_batch + \
                self.gamma * torch.max(q_next, dim=1)[0] * terminal_batch

            self.epsilon = self.epsilon * self.eps_dec if self.epsilon > \
                self.eps_end else self.eps_end
            loss = F.smooth_l1_loss(q_target, q_eval).to(self.Q_eval.device)
            loss.backward()
            self.Q_eval.optimizer.step()
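
For reference, the assignment inside learn() implements the standard one-step Q-learning target; because terminal_memory stores 1 - done, the bootstrap term is masked out on terminal transitions:

q_target[s, a] = reward + gamma * max_a' Q_eval(s', a') * (1 - done)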

Can anyone explain to me why this is happening? Thank you for your help!

Some additional information:

It turned out that the network always calculates the same Q-values no matter which state it gets, so basically the line:

actions = self.Q_eval.forward(observation)

always outputs the same Q-values, even for different state observations. For example:

[1, 0, 0, 0, 0, 0, 0, 0, 0, -1]
tensor([0.3155, 0.1449, 0.2217, 0.2078, 0.1867, 0.1810, 0.2689, 0.1995, 0.3029],
       grad_fn=<AddBackward0>)
[0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
tensor([0.3155, 0.1449, 0.2217, 0.2078, 0.1867, 0.1810, 0.2689, 0.1995, 0.3029],
       grad_fn=<AddBackward0>)
[1, 0, 0, 0, 0, 0, 0, 0, 0, -1]
tensor([0.3155, 0.1449, 0.2217, 0.2078, 0.1867, 0.1810, 0.2689, 0.1995, 0.3029],
       grad_fn=<AddBackward0>)

The first line of each pair is the input state passed to the forward method, and the second line shows the corresponding Q-values for the nine possible actions.
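
A quick way to reproduce this is to push a few different states through the network and compare the outputs. The sketch below assumes access to the trained brain object and, like choose_action above, passes the raw state list straight into Q_eval.forward:

# If the printed Q-values are (nearly) identical for different inputs, the
# network has effectively collapsed to a constant function of the state.
import torch

test_states = [
    [1, 0, 0, 0, 0, 0, 0, 0, 0, -1],
    [0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
    [0, 1, -1, 0, 1, 0, 0, 0, 0, -1],
]
with torch.no_grad():
    for s in test_states:
        print(s, brain.Q_eval.forward(s))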


1 Answer


It turned out that my learning rate was too large, by a factor of about 1000.
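
To make the scale concrete (the actual values are not in the post, so these numbers are purely illustrative): if the original learning rate was on the order of 0.1, a factor-of-1000 reduction lands around 1e-4, which is in the range commonly used for small networks like this one:

# Hypothetical values for illustration only; the real numbers were not posted.
# lr = 0.1    # too large: the network collapses to constant Q-values
lr = 1e-4     # roughly 1000x smaller
train(n_games=10_000, lr=lr)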
