I tried implementing my own DQN with an LSTM model, and the problem is that it takes too long to run. For instance, running 2 episodes takes about 10 minutes, and each episode only has around 1.2k time steps (financial time series). I'm not sure whether there are better ways to implement the model to speed up training, or whether I'm simply limited by processing power.
I run my Python script on Google Colab, and switching the hardware accelerator to GPU/TPU didn't help either.
I know there are RL libraries out there, but some of them don't support RNNs, and I find the documentation for creating a custom policy model very confusing. That's why I decided to code the DQN agent myself instead of relying on existing libraries; since I'm new to machine learning, I also get a more in-depth understanding this way.
Here's the code for the LSTM model:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, LeakyReLU, Dense
from tensorflow.keras.optimizers import Adam

learning_rate = 0.0001
no_of_features = 9
lookback_window = 60

def create_DL_model():
    model = Sequential()
    # return_sequences=True because the layer is followed by another recurrent layer
    model.add(LSTM(64, input_shape=(no_of_features, lookback_window), return_sequences=True))
    model.add(LeakyReLU())
    model.add(LSTM(32))
    model.add(LeakyReLU())
    model.add(Dense(no_of_actions))  # no_of_actions is defined elsewhere from the trading environment: one Q-value per action
    model.compile(loss='mse', optimizer=Adam(learning_rate=learning_rate))
    return model

# initialise main network and target network with the same weights
main_net = create_DL_model()
target_net = create_DL_model()
target_net.set_weights(main_net.get_weights())
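As a quick sanity check on the shapes (not part of training, just to confirm the network accepts the (1, 9, 60) states the environment produces and outputs one Q-value per action; the random dummy state below is only a placeholder):

import numpy as np

# dummy state matching the environment's (1, 9, 60) output
dummy_state = np.random.rand(1, no_of_features, lookback_window).astype('float32')
print(main_net.predict(dummy_state).shape)  # expected: (1, no_of_actions)
main_net.summary()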
And here's the code for the DQN agent:
import random
from collections import deque

import numpy as np
import tensorflow

discount = 0.3
replay_size = 5000
min_replay_size = 100
network_update_steps = 1000
no_of_episodes = 10
batch_size = 64

epsilon = 1.0  # epsilon-greedy parameter
epsilon_min = 0.01  # minimum epsilon
epsilon_max = 1.0  # maximum epsilon
epsilon_interval = epsilon_max - epsilon_min
epsilon_greedy_frames = 150000

replay_memory = deque(maxlen=replay_size)
timesteps = 0
episode_reward_list = []
episode_counter = 0
# for loop for episodes
for i in range(2):
    episode_counter += 1
    episode_reward = 0
    state = np.array(train_env.reset())  # state has shape (1, 9, 60)
    # flag for end of episode
    done = False
    # while loop to run through the dataframe per episode
    while not done:
        timesteps += 1
        # exploration
        if epsilon > np.random.random() or len(replay_memory) < min_replay_size:
            # take a random action
            action = np.random.choice(no_of_actions)
        else:
            # select the action with the max expected future reward
            action = np.argmax(main_net.predict(state)[0])
        # epsilon decay
        epsilon -= epsilon_interval / epsilon_greedy_frames
        epsilon = max(epsilon, epsilon_min)
        next_state, reward, done, info = train_env.step([action])  # 'action' must be iterable
        next_state = np.array(next_state)
        # update reward
        episode_reward += reward
        # add transition into replay memory
        replay_memory.append((state, action, reward, next_state, done))
        # update state
        state = next_state
        # train the model only if there are sufficient transitions in replay memory
        if len(replay_memory) < min_replay_size:
            continue
        batch = random.sample(replay_memory, batch_size)
        # current states
        train_states = np.array([transition[0][0] for transition in batch])
        Q_list = main_net.predict(train_states)
        # future states
        train_future_states = np.array([transition[3][0] for transition in batch])
        future_Q_list = target_net.predict(train_future_states)
        # build the training data
        X = []
        y = []
        # '_' suffixes so the batch variables don't overwrite the live environment state above
        for index, (state_, action_, reward_, next_state_, done_) in enumerate(batch):
            if not done_:
                max_future_q = np.max(future_Q_list[index])
                new_q = reward_ + discount * max_future_q
            else:
                # last step: no more future reward
                new_q = reward_
            # update the 'correct' Q value using the target network's estimate
            curr_Q = Q_list[index]
            curr_Q[action_] = new_q
            # append to training data
            X.append(state_[0])
            y.append(curr_Q)
        with tensorflow.device('/gpu:0'):
            # fit on the batch to update the weights based on the loss
            main_net.fit(np.array(X), np.array(y), batch_size=batch_size, verbose=0)
        # update the target network every 1000 timesteps
        if timesteps % network_update_steps == 0:
            print(timesteps)
            print(epsilon)
            target_net.set_weights(main_net.get_weights())
    # record the episode reward
    episode_reward_list.append(episode_reward)
    print('Episode #{}: Reward {}'.format(episode_counter, episode_reward))
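In case it helps narrow things down, this is the rough timing check I was thinking of adding, to see whether the per-step predict() calls or the per-step fit() call dominate the runtime (a sketch only; the dummy arrays are placeholders that match the shapes used above):

import time
import numpy as np

# placeholder data with the same shapes as the real state and training batch
dummy_state = np.random.rand(1, no_of_features, lookback_window).astype('float32')
dummy_X = np.random.rand(batch_size, no_of_features, lookback_window).astype('float32')
dummy_y = np.random.rand(batch_size, no_of_actions).astype('float32')

t0 = time.perf_counter()
_ = main_net.predict(dummy_state)  # cost of one per-step action selection
t1 = time.perf_counter()
main_net.fit(dummy_X, dummy_y, batch_size=batch_size, verbose=0)  # cost of one per-step training update
t2 = time.perf_counter()
print('predict: {:.3f}s  fit: {:.3f}s'.format(t1 - t0, t2 - t1))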