Now, we will see how to train the network.
First, we define the DQN class and initialize all variables in the __init__ method:
class DQN(object):
    """Agent that trains a dueling Q-network against a separate target network."""

    def __init__(self, state_size,
                 action_size,
                 session,
                 summary_writer=None,
                 exploration_period=1000,
                 minibatch_size=32,
                 discount_factor=0.99,
                 experience_replay_buffer=10000,
                 target_qnet_update_frequency=10000,
                 initial_exploration_epsilon=1.0,
                 final_exploration_epsilon=0.05,
                 reward_clipping=-1,
                 ):
        """Store the hyper-parameters, create both networks and build the graph.

        Args:
            state_size: tuple appended to ``(None,)`` to shape the state
                placeholders in ``create_graph``.
            action_size: number of discrete actions (width of the Q-value
                output and of the one-hot action mask).
            session: TensorFlow session used for every ``run`` call.
            summary_writer: object whose ``add_summary`` is called from
                ``train``; defaults to ``None``.
            exploration_period: number of training steps over which epsilon
                is annealed linearly in ``action``.
            minibatch_size: number of transitions per training batch.
            discount_factor: multiplier applied to the next-state value when
                forming the target Q-value.
            experience_replay_buffer: first argument handed to
                ``ReplayMemoryFast`` (presumably its capacity -- confirm
                against that class).
            target_qnet_update_frequency: the target network is synced every
                this many training steps.
            initial_exploration_epsilon: epsilon at training step 0.
            final_exploration_epsilon: epsilon once the exploration period
                has elapsed.
            reward_clipping: when positive, ``store`` clips rewards to
                ``[-reward_clipping, reward_clipping]``; otherwise rewards
                are stored unchanged.
        """
        # Initialize all variables.
        # -- environment dimensions and TensorFlow plumbing
        self.session = session
        self.summary_writer = summary_writer
        self.state_size = state_size
        self.action_size = action_size

        # -- replay / batching settings
        self.experience_replay_buffer = experience_replay_buffer
        self.minibatch_size = minibatch_size

        # -- learning settings
        self.discount_factor = tf.constant(discount_factor)
        self.reward_clipping = reward_clipping
        self.target_qnet_update_frequency = target_qnet_update_frequency

        # -- epsilon-greedy schedule (the period is kept as a float so the
        #    annealing division in action() is never an integer division)
        self.initial_exploration_epsilon = initial_exploration_epsilon
        self.final_exploration_epsilon = final_exploration_epsilon
        self.exploration_period = float(exploration_period)

        # -- counter advanced once per train() call
        self.num_training_steps = 0

        # Initialize the primary dueling DQN by creating an instance of our
        # QNetworkDueling class.
        self.qnet = QNetworkDueling(self.state_size, self.action_size, "qnet")

        # Similarly, initialize the target dueling DQN.
        self.target_qnet = QNetworkDueling(self.state_size, self.action_size, "target_qnet")

        # Next, initialize the optimizer as an RMSPropOptimizer.
        self.qnet_optimizer = tf.train.RMSPropOptimizer(
            learning_rate=0.00025,
            decay=0.99,
            epsilon=0.01,
        )

        # Now, initialize the experience replay memory by creating an
        # instance of our ReplayMemoryFast class.
        self.experience_replay = ReplayMemoryFast(
            self.experience_replay_buffer,
            self.minibatch_size,
        )

        # Setup the computational graph (must come last: it uses the
        # networks and the optimizer created above).
        self.create_graph()
Next, we define the copy_to_target_network function for copying weights from the primary network to our target network:
@staticmethod
def copy_to_target_network(source_network, target_network):
    """Build a single op that copies every source variable into the target.

    Args:
        source_network: network whose ``variables()`` supply the values.
        target_network: network whose ``variables()`` are overwritten.

    Returns:
        A grouped TensorFlow op; running it performs all assignments.
    """
    # Declared static because the function takes no ``self`` and is invoked
    # on the class as DQN.copy_to_target_network(qnet, target_qnet); without
    # the decorator that call is an unbound-method TypeError on Python 2.
    #
    # NOTE(review): zip() pairs variables positionally and stops at the
    # shorter list -- this assumes both networks return their variables in
    # the same order and number.
    target_network_update = [
        v_target.assign(v_source)
        for v_source, v_target in zip(source_network.variables(),
                                      target_network.variables())
    ]
    return tf.group(*target_network_update)
Now, we define the create_graph function and build our computational graph:
def create_graph(self):
    """Build the TensorFlow graph used for acting and for training.

    Creates, as attributes on ``self``: the input placeholders (``state``,
    ``next_state``, ``next_state_mask``, ``rewards``, ``action_mask``), the
    Q-value and greedy-action tensors, the clipped-error loss, the training
    op ``qnet_optimize``, the target-sync op ``hard_copy_to_target`` and the
    merged summary op ``summarize``.
    """
    # We calculate the q_values and select the action that has the maximum
    # q value.
    with tf.name_scope("pick_action"):
        # placeholder for a batch of states
        self.state = tf.placeholder(tf.float32, (None,) + self.state_size, name="state")
        # Q-values of the primary network for those states
        self.q_values = tf.identity(self.qnet(self.state), name="q_values")
        # index of the highest-valued action for each state
        self.predicted_actions = tf.argmax(self.q_values, dimension=1, name="predicted_actions")
        # summary of the batch mean of the max Q-values, to track learning
        tf.histogram_summary("Q values", tf.reduce_mean(tf.reduce_max(self.q_values, 1)))

    # Next, we calculate the target future reward.
    with tf.name_scope("estimating_future_rewards"):
        self.next_state = tf.placeholder(tf.float32, (None,) + self.state_size, name="next_state")
        self.next_state_mask = tf.placeholder(tf.float32, (None,), name="next_state_mask")
        self.rewards = tf.placeholder(tf.float32, (None,), name="rewards")

        # Both networks evaluate the next state; neither receives gradients
        # from the target computation.
        self.next_q_values_targetqnet = tf.stop_gradient(
            self.target_qnet(self.next_state), name="next_q_values_targetqnet")
        self.next_q_values_qnet = tf.stop_gradient(
            self.qnet(self.next_state), name="next_q_values_qnet")

        # The primary network picks the next action; the target network
        # supplies the value of that action.
        self.next_selected_actions = tf.argmax(self.next_q_values_qnet, dimension=1)
        self.next_selected_actions_onehot = tf.one_hot(
            indices=self.next_selected_actions, depth=self.action_size)

        # Multiplying by next_state_mask zeroes the future value wherever the
        # mask is 0 (train() feeds ``not is_terminal``).
        self.next_max_q_values = tf.stop_gradient(
            tf.reduce_sum(
                tf.mul(self.next_q_values_targetqnet, self.next_selected_actions_onehot),
                reduction_indices=[1, ]
            ) * self.next_state_mask
        )
        self.target_q_values = self.rewards + self.discount_factor * self.next_max_q_values

    # Next, we perform the optimization using RMS prop optimizer.
    with tf.name_scope("optimization_step"):
        # one-hot mask selecting the Q-value of the action actually taken
        self.action_mask = tf.placeholder(tf.float32, (None, self.action_size), name="action_mask")
        self.y = tf.reduce_sum(self.q_values * self.action_mask, reduction_indices=[1, ])

        ## ERROR CLIPPING
        # Quadratic for absolute errors up to 1, linear beyond that.
        self.error = tf.abs(self.y - self.target_q_values)
        quadratic_part = tf.clip_by_value(self.error, 0.0, 1.0)
        linear_part = self.error - quadratic_part
        self.loss = tf.reduce_mean(0.5 * tf.square(quadratic_part) + linear_part)

        # Compute the gradients for the primary network only and clip each
        # one to a norm of 10 before applying it.
        qnet_gradients = self.qnet_optimizer.compute_gradients(self.loss, self.qnet.variables())
        for i, (grad, var) in enumerate(qnet_gradients):
            if grad is not None:
                qnet_gradients[i] = (tf.clip_by_norm(grad, 10), var)
        self.qnet_optimize = self.qnet_optimizer.apply_gradients(qnet_gradients)

    # Copy the primary network weights to the target network.
    with tf.name_scope("target_network_update"):
        self.hard_copy_to_target = DQN.copy_to_target_network(self.qnet, self.target_qnet)

    # Merge the summaries registered above into the single op that train()
    # runs as ``self.summarize``; without it train() fails on a missing
    # attribute the first time it writes a summary.
    self.summarize = tf.merge_all_summaries()
We define the store function for storing each experience (state, action, reward, next state, and terminal flag) in the experience replay memory:
def store(self, state, action, reward, next_state, is_terminal):
    """Record one transition in the replay memory, clipping the reward first.

    The reward is clipped to ``[-reward_clipping, reward_clipping]`` only
    when ``reward_clipping`` is positive; otherwise it is stored unchanged.
    """
    limit = self.reward_clipping
    if limit > 0.0:
        # rewards clipping
        reward = np.clip(reward, -limit, limit)

    self.experience_replay.store(state, action, reward, next_state, is_terminal)
We define an action function for selecting actions using a decaying epsilon-greedy policy:
def action(self, state, training = False):
    """Choose an action for ``state`` with an epsilon-greedy policy.

    Args:
        state: a single state; it is wrapped in a one-element batch before
            being fed to the network.
        training: when true, epsilon is annealed linearly from
            ``initial_exploration_epsilon`` to ``final_exploration_epsilon``
            over ``exploration_period`` training steps; when false a fixed
            epsilon of 0.05 is used.

    Returns:
        A random action index in ``[0, action_size - 1]`` with probability
        epsilon, otherwise the network's highest-valued action.
    """
    if not training:
        # Evaluation always uses a small fixed exploration rate.
        epsilon = 0.05
    elif self.num_training_steps >= self.exploration_period:
        # ``>=`` rather than ``>``: at the boundary the annealed value
        # already equals the final epsilon, and an exploration period of 0
        # no longer reaches the division below.
        epsilon = self.final_exploration_epsilon
    else:
        epsilon_span = self.initial_exploration_epsilon - self.final_exploration_epsilon
        epsilon = (self.initial_exploration_epsilon
                   - float(self.num_training_steps) * epsilon_span / self.exploration_period)

    if random.random() <= epsilon:
        return random.randint(0, self.action_size - 1)

    # Greedy choice: run the network on a batch holding just this state.
    return self.session.run(self.predicted_actions, {self.state: [state]})[0]
Now, we define a train function for training our network:
def train(self):
    """Run one training step on a minibatch sampled from the replay memory.

    On the very first step the primary network is copied to the target
    network. Every ``target_qnet_update_frequency`` steps the target network
    is synced again, the mean max Q-value of the batch is printed and, when
    a summary writer was supplied, a summary is written. The step counter
    ``num_training_steps`` is incremented at the end.
    """
    # Copy the primary network weights to the target network.
    if self.num_training_steps == 0:
        print("Training starts...")
        self.qnet.copy_to(self.target_qnet)

    # Sample experiences from the replay memory.
    minibatch = self.experience_replay.sample()

    # Get the states, actions, rewards, and next states from the minibatch.
    # Each sample is indexed as (state, action, reward, next_state, is_terminal).
    batch_states = np.asarray([d[0] for d in minibatch])
    actions = [d[1] for d in minibatch]
    # One-hot encode the actions so the graph can pick out the Q-value of
    # the action that was actually taken.
    batch_actions = np.zeros((self.minibatch_size, self.action_size))
    for i in range(self.minibatch_size):
        batch_actions[i, actions[i]] = 1
    batch_rewards = np.asarray([d[2] for d in minibatch])
    batch_newstates = np.asarray([d[3] for d in minibatch])
    # The mask is false for terminal transitions.
    batch_newstates_mask = np.asarray([not d[4] for d in minibatch])

    # The same inputs feed both the optimisation step and the summary op.
    feed = {
        self.state: batch_states,
        self.next_state: batch_newstates,
        self.next_state_mask: batch_newstates_mask,
        self.rewards: batch_rewards,
        self.action_mask: batch_actions,
    }

    # Perform the training operation.
    scores, _, = self.session.run([self.q_values, self.qnet_optimize], feed)

    # Update the target network weights.
    if self.num_training_steps % self.target_qnet_update_frequency == 0:
        self.session.run(self.hard_copy_to_target)
        # Single-argument print keeps the output identical on Python 2 and 3.
        print('mean maxQ in minibatch:  %s' % np.mean(np.max(scores, 1)))

        # summary_writer defaults to None in __init__, so only compute and
        # write the summary when a writer was actually provided.
        if self.summary_writer is not None:
            str_ = self.session.run(self.summarize, feed)
            self.summary_writer.add_summary(str_, self.num_training_steps)

    self.num_training_steps += 1