Training the network

Now, we will see how to train the network.

First, we define the DQN class and initialize all variables in the __init__ method:

class DQN(object):
    def __init__(self, state_size,
                       action_size,
                       session,
                       summary_writer = None,
                       exploration_period = 1000,
                       minibatch_size = 32,
                       discount_factor = 0.99,
                       experience_replay_buffer = 10000,
                       target_qnet_update_frequency = 10000,
                       initial_exploration_epsilon = 1.0,
                       final_exploration_epsilon = 0.05,
                       reward_clipping = -1,
                        ):

Initialize all variables:

   
        self.state_size = state_size
        self.action_size = action_size


        self.session = session
        self.exploration_period = float(exploration_period)
        self.minibatch_size = minibatch_size
        self.discount_factor = tf.constant(discount_factor)
        self.experience_replay_buffer = experience_replay_buffer
        self.summary_writer = summary_writer
        self.reward_clipping = reward_clipping


        self.target_qnet_update_frequency = target_qnet_update_frequency
        self.initial_exploration_epsilon = initial_exploration_epsilon
        self.final_exploration_epsilon = final_exploration_epsilon
        self.num_training_steps = 0

Initialize the primary dueling DQN by creating an instance to our QNetworkDueling class:

        self.qnet = QNetworkDueling(self.state_size, self.action_size, "qnet")

Similarly, initialize the target dueling DQN:


        self.target_qnet = QNetworkDueling(self.state_size, self.action_size, "target_qnet")

Next, initialize the optimizer as an RMSPropOptimizer:

        self.qnet_optimizer = tf.train.RMSPropOptimizer(learning_rate=0.00025, decay=0.99, epsilon=0.01)

Now, initialize experience_replay_buffer by creating the instance to our ReplayMemoryFast class:

        self.experience_replay = ReplayMemoryFast(self.experience_replay_buffer, self.minibatch_size)
        # Setup the computational graph
        self.create_graph()

Next, we define the copy_to_target_network function for copying weights from the primary network to our target network:

    def copy_to_target_network(source_network, target_network):
        target_network_update = []

        for v_source, v_target in zip(source_network.variables(), target_network.variables()):

            # update target network
            update_op = v_target.assign(v_source)
            target_network_update.append(update_op)

        return tf.group(*target_network_update)

Now, we define the create_graph function and build our computational graph:

    def create_graph(self):

We calculate the q_values and select the action that has the maximum q value:


        with tf.name_scope("pick_action"):

            # placeholder for state
            self.state = tf.placeholder(tf.float32, (None,)+self.state_size , name="state")

            # placeholder for q values
            self.q_values = tf.identity(self.qnet(self.state) , name="q_values")

            # placeholder for predicted actions
            self.predicted_actions = tf.argmax(self.q_values, dimension=1 , name="predicted_actions")

            # plot histogram to track max q values
            tf.histogram_summary("Q values", tf.reduce_mean(tf.reduce_max(self.q_values, 1))) # save max q-values to track learning

Next, we calculate the target future reward:

       with tf.name_scope("estimating_future_rewards"):
          
            self.next_state = tf.placeholder(tf.float32, (None,)+self.state_size , name="next_state")

            self.next_state_mask = tf.placeholder(tf.float32, (None,) , name="next_state_mask")

            self.rewards = tf.placeholder(tf.float32, (None,) , name="rewards")

            self.next_q_values_targetqnet = tf.stop_gradient(self.target_qnet(self.next_state), name="next_q_values_targetqnet")
    
            self.next_q_values_qnet = tf.stop_gradient(self.qnet(self.next_state), name="next_q_values_qnet")

            self.next_selected_actions = tf.argmax(self.next_q_values_qnet, dimension=1)

            self.next_selected_actions_onehot = tf.one_hot(indices=self.next_selected_actions, depth=self.action_size)

            self.next_max_q_values = tf.stop_gradient( tf.reduce_sum( tf.mul( self.next_q_values_targetqnet, self.next_selected_actions_onehot ) , reduction_indices=[1,] ) * self.next_state_mask )

            self.target_q_values = self.rewards + self.discount_factor*self.next_max_q_values

Next, we perform the optimization using RMS prop optimizer:

        with tf.name_scope("optimization_step"):
            self.action_mask = tf.placeholder(tf.float32, (None, self.action_size) , name="action_mask") 

            self.y = tf.reduce_sum( self.q_values * self.action_mask , reduction_indices=[1,])

            ## ERROR CLIPPING 
            self.error = tf.abs(self.y - self.target_q_values)

            quadratic_part = tf.clip_by_value(self.error, 0.0, 1.0)
            linear_part = self.error - quadratic_part

            self.loss = tf.reduce_mean( 0.5*tf.square(quadratic_part) + linear_part )

            # optimize the gradients

            qnet_gradients = self.qnet_optimizer.compute_gradients(self.loss, self.qnet.variables())


            for i, (grad, var) in enumerate(qnet_gradients):
                if grad is not None:
                    qnet_gradients[i] = (tf.clip_by_norm(grad, 10), var)

            self.qnet_optimize = self.qnet_optimizer.apply_gradients(qnet_gradients)

Copy the primary network weights to the target network:

        with tf.name_scope("target_network_update"):
            self.hard_copy_to_target = DQN.copy_to_target_network(self.qnet, self.target_qnet)

We define the store function for storing all the experience in the experience_replay_buffer:

    def store(self, state, action, reward, next_state, is_terminal):
        # rewards clipping
        if self.reward_clipping > 0.0:
            reward = np.clip(reward, -self.reward_clipping, self.reward_clipping)

        self.experience_replay.store(state, action, reward, next_state, is_terminal)

We define an action function for selecting actions using a decaying epsilon-greedy policy:

    def action(self, state, training = False):
     
        if self.num_training_steps > self.exploration_period:
            epsilon = self.final_exploration_epsilon
        else:
            epsilon = self.initial_exploration_epsilon - float(self.num_training_steps) * (self.initial_exploration_epsilon - self.final_exploration_epsilon) / self.exploration_period

        if not training:
            epsilon = 0.05

        if random.random() <= epsilon:
            action = random.randint(0, self.action_size-1)
        else:
            action = self.session.run(self.predicted_actions, {self.state:[state] } )[0]

        return action

Now, we define a train function for training our network:

def train(self):

Copy the primary network weights to the target network:

        if self.num_training_steps == 0:
            print "Training starts..."
            self.qnet.copy_to(self.target_qnet)

Sample experiences from the replay memory:

        minibatch = self.experience_replay.sample()

Get the states, actions, rewards, and next states from the minibatch:

        batch_states = np.asarray( [d[0] for d in minibatch] )
        actions = [d[1] for d in minibatch]
        batch_actions = np.zeros( (self.minibatch_size, self.action_size) )
        for i in xrange(self.minibatch_size):
            batch_actions[i, actions[i]] = 1

        batch_rewards = np.asarray( [d[2] for d in minibatch] )
        batch_newstates = np.asarray( [d[3] for d in minibatch] )

        batch_newstates_mask = np.asarray( [not d[4] for d in minibatch] )

Perform the training operation:


        scores, _, = self.session.run([self.q_values, self.qnet_optimize],
                                      { self.state: batch_states,
                                        self.next_state: batch_newstates,
                                        self.next_state_mask: batch_newstates_mask,
                                        self.rewards: batch_rewards,
                                        self.action_mask: batch_actions} )

Update the target network weights:

        if self.num_training_steps % self.target_qnet_update_frequency == 0:
            self.session.run( self.hard_copy_to_target )

            print 'mean maxQ in minibatch: ',np.mean(np.max(scores,1))

            str_ = self.session.run(self.summarize, { self.state: batch_states,
                                        self.next_state: batch_newstates,
                                        self.next_state_mask: batch_newstates_mask,
                                        self.rewards: batch_rewards,
                                        self.action_mask: batch_actions})

            self.summary_writer.add_summary(str_, self.num_training_steps)

        self.num_training_steps += 1

Table of Contents for Training the network

Create new playlist

Sign In

Sign Up

Table of Contents for
Training the network