I want to pre-train my reinforcement learning model in a gym environment first, and then deploy it in the real environment to continue reinforcement learning there.

I am using TF, Keras-RL and gym for the initial training; the code is below. What is the right way to manage this? I got lost googling for it.

My guess is that in the real environment I should have two agents: one for prediction and another for further training. The training agent would learn from state-action samples collected at runtime, and the newly trained model would then be merged into the predicting model. If that assumption is correct, how can it be implemented?
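
A minimal sketch of the weight sync I imagine, in plain Keras (names hypothetical; build_model is the function defined in my code below):

from tensorflow.keras.models import clone_model

# One copy trains on samples collected at runtime, the other only serves
# predictions; "merging" is just a weight copy from one to the other.
train_model = build_model(states, actions)
predict_model = clone_model(train_model)

def sync_models(train_model, predict_model):
    # Merge step: copy the freshly trained weights into the serving model.
    predict_model.set_weights(train_model.get_weights())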

import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers import Adam

from rl.agents import DQNAgent
from rl.callbacks import ModelIntervalCheckpoint, FileLogger
from rl.policy import LinearAnnealedPolicy, EpsGreedyQPolicy
from rl.memory import SequentialMemory

# custom gym environment     
env = FooEnv()
env.seed(0)

states = env.observation_space.shape
actions = env.action_space.n

def build_model(states, actions):
    model = Sequential()
    model.add(Flatten(input_shape=(1,) + states))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(actions, activation='linear'))
    return model

model = build_model(states, actions)
model.summary()

def build_agent(model, actions):
    policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1, value_min=0.1, value_test=0.05,
                                  nb_steps=500)
    memory = SequentialMemory(limit=10000, window_length=1)

    dqn = DQNAgent(model=model, memory=memory, policy=policy, enable_double_dqn=True,
                   nb_actions=actions, gamma=.98, nb_steps_warmup=100, target_model_update=1e-2)
    return dqn

def build_callbacks(env_name):
    checkpoint_weights_filename = 'weights/dqn_' + env_name + '_weights_{step}.h5f'
    log_filename = 'weights/dqn_{}_log.json'.format(env_name)
    callbacks = [ModelIntervalCheckpoint(checkpoint_weights_filename, interval=1000)]
    callbacks += [FileLogger(log_filename, interval=100)]
    return callbacks

callbacks = build_callbacks('FooEnv')

dqn = build_agent(model, actions)
dqn.compile(Adam(learning_rate=1e-3), metrics=['mae'])
dqn.fit(env, nb_steps=30000, log_interval=1000, nb_max_episode_steps=50, visualize=False, verbose=1, callbacks=callbacks)

scores = dqn.test(env, nb_episodes=1, visualize=True)

dqn.save_weights('weights/saved_weights')

1 Answer

The solution is probably to call the fit method again in production, after the model is compiled and the saved weights are loaded:

...
dqn.load_weights('weights/saved_weights')
dqn.fit(env, nb_steps=30000, log_interval=1000, nb_max_episode_steps=100,
        visualize=False, verbose=1, callbacks=callbacks)
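
Note that dqn.fit() needs something that implements the gym Env interface, so in production the real system has to be wrapped behind reset()/step(). A minimal sketch, assuming the real observations and actions match FooEnv's spaces (the class name and the placeholder I/O methods are hypothetical):

import numpy as np
import gym
from gym import spaces

class RealFooEnv(gym.Env):
    def __init__(self):
        # Must match FooEnv's spaces, otherwise the loaded weights won't fit the network.
        self.observation_space = spaces.Box(low=-1.0, high=1.0, shape=(4,), dtype=np.float32)
        self.action_space = spaces.Discrete(2)

    def reset(self):
        return self._read_state()

    def step(self, action):
        self._apply_action(action)           # drive the real actuator / API
        obs = self._read_state()             # observe the resulting state
        reward = self._compute_reward(obs)   # reward from real measurements
        done = self._is_episode_over(obs)
        return obs, reward, done, {}

    # Placeholders: replace with real sensor/actuator I/O.
    def _read_state(self):
        return np.zeros(4, dtype=np.float32)

    def _apply_action(self, action):
        pass

    def _compute_reward(self, obs):
        return 0.0

    def _is_episode_over(self, obs):
        return False

env = RealFooEnv()

One caveat: LinearAnnealedPolicy anneals epsilon by the agent's own step counter, so a freshly built agent in production starts exploring at value_max again. For online learning on a real system you may prefer a plain EpsGreedyQPolicy with a small eps.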