I've been working on solving the Gym Taxi-v3 problem with reinforcement learning algorithms. Initially, I applied tabular Q-learning; after 10,000 training iterations it achieved a mean reward of 8.x with a 100% success rate, which was satisfactory.
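For context, my tabular Q-learning setup was roughly along these lines (a minimal sketch, assuming the gymnasium API; the alpha/gamma/epsilon values shown are illustrative placeholders, not my exact settings):

import gymnasium as gym
import numpy as np

env = gym.make("Taxi-v3")
q_table = np.zeros((env.observation_space.n, env.action_space.n))
alpha, gamma, epsilon = 0.1, 0.99, 0.1  # illustrative values, not my exact settings

for episode in range(10000):
    state, _ = env.reset()
    done = False
    while not done:
        # epsilon-greedy action selection
        if np.random.rand() < epsilon:
            action = env.action_space.sample()
        else:
            action = int(np.argmax(q_table[state]))
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        # tabular Q-learning update
        q_table[state, action] += alpha * (
            reward + gamma * np.max(q_table[next_state]) - q_table[state, action]
        )
        state = next_state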
However, when I tried to solve the same problem with a DQN (Deep Q-Network), the results were much worse. After approximately 100 training iterations, the evaluation episode_reward_mean converges at around -210 and the episode_len_mean at around 200.
From what I have learned (including from ChatGPT), DQN should be able to solve Taxi-v3, so I'm unsure why my model isn't performing well.
I would appreciate it if someone could provide insight into what could be going wrong and how to use DQN to solve the Taxi-v3 problem effectively. I'm particularly interested in DQN because I believe it is more suitable than tabular Q-learning for complex practical problems.
My DQN training and evaluation code:
from ray.rllib.algorithms.dqn.dqn import DQN, DQNConfig
import ray
import csv
import datetime
import os
ray.init(local_mode=True)
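# Note: local_mode=True executes all Ray tasks serially in a single process (mainly for debugging)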
# ray.init(address='auto') # connect to Ray cluster
num_rollout_workers = 62
max_train_iter_times = 20000
config = DQNConfig()
config = config.environment("Taxi-v3")
config = config.rollouts(num_rollout_workers=num_rollout_workers)
config = config.framework("torch")
# Update exploration_config
exploration_config = {
    "type": "EpsilonGreedy",
    "initial_epsilon": 1.0,
    "final_epsilon": 0.02,
    "epsilon_timesteps": max_train_iter_times,
}
config = config.exploration(exploration_config=exploration_config)
# Evaluation settings: run 10 evaluation episodes every 10 training iterations
config = config.evaluation(
    evaluation_interval=10,
    evaluation_duration=10,  # called evaluation_num_episodes in older RLlib versions
    evaluation_duration_unit="episodes",
)
# Update replay_buffer_config
replay_buffer_config = {
    "_enable_replay_buffer_api": True,
    "type": "MultiAgentPrioritizedReplayBuffer",
    "capacity": 1000,
    "prioritized_replay_alpha": 0.5,
    "prioritized_replay_beta": 0.5,
    "prioritized_replay_eps": 3e-6,
}
config = config.training(
    model={"fcnet_hiddens": [50, 50, 50]},
    lr=0.001,
    gamma=0.99,
    replay_buffer_config=replay_buffer_config,
    target_network_update_freq=500,
    double_q=True,
    dueling=True,
    num_atoms=1,
    noisy=False,
    n_step=3,
)
algo = DQN(config=config)
# algo = config.build() # 2. build the algorithm,
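# Early-stopping bookkeeping: stop once the evaluation reward stops improving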
no_improvement_counter = 0
prev_reward = None
# Get the current date
current_date = datetime.datetime.now().strftime('%Y%m%d')
# Open the csv file in write mode
with open(f'train_{current_date}.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    # Write the header row
    writer.writerow(["Iteration", "Reward_Mean", "Episode_Length_Mean"])
    for i in range(max_train_iter_times):
        print(f'#{i}: {algo.train()}\n')  # 3. train it,
        # Save a model checkpoint every 10 iterations
        if (i + 1) % 10 == 0:
            checkpoint = algo.save()
            print("Model checkpoint saved at", checkpoint)
        eval_result = algo.evaluate()
        print(f'to evaluate model: {eval_result}')  # 4. and evaluate it.
        cur_reward = eval_result['evaluation']['sampler_results']['episode_reward_mean']
        cur_episode_len_mean = eval_result['evaluation']['sampler_results']['episode_len_mean']
        # Write the iteration, reward and episode length to csv
        writer.writerow([i + 1, cur_reward, cur_episode_len_mean])
        # Force the file to be written to disk immediately
        file.flush()
        os.fsync(file.fileno())
        if prev_reward is not None and cur_reward <= prev_reward:
            no_improvement_counter += 1
        else:
            no_improvement_counter = 0
        print(f'evaluated episode_reward_mean: {cur_reward}, no improvement counter: {no_improvement_counter}\n')
        # Stop early if the evaluation reward has not improved for 20 consecutive evaluations
        if no_improvement_counter >= 20:
            print(f"Training stopped as the episode_reward_mean did not improve for 20 consecutive evaluations. totalIterNum: {i + 1}")
            break
        prev_reward = cur_reward
I've also tried increasing the replay_buffer_config capacity to 10000 and n_step to 20, but that didn't help; the results are the same.
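For completeness, that variant looked roughly like this (only these two values changed, everything else as above):

replay_buffer_config = {
    "_enable_replay_buffer_api": True,
    "type": "MultiAgentPrioritizedReplayBuffer",
    "capacity": 10000,  # increased from 1000
    "prioritized_replay_alpha": 0.5,
    "prioritized_replay_beta": 0.5,
    "prioritized_replay_eps": 3e-6,
}
config = config.training(
    replay_buffer_config=replay_buffer_config,
    n_step=20,  # increased from 3
)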