Training the network

Now, we will see how to train the network.

First, we import the required libraries, define the DQN class, and initialize all the variables in the __init__ method:

import random

import numpy as np
import tensorflow as tf


class DQN(object):
    def __init__(self, state_size,
                       action_size,
                       session,
                       summary_writer = None,
                       exploration_period = 1000,
                       minibatch_size = 32,
                       discount_factor = 0.99,
                       experience_replay_buffer = 10000,
                       target_qnet_update_frequency = 10000,
                       initial_exploration_epsilon = 1.0,
                       final_exploration_epsilon = 0.05,
                       reward_clipping = -1,   # values <= 0 disable reward clipping
                       ):

Initialize all variables:

   
        self.state_size = state_size
        self.action_size = action_size

        self.session = session
        self.exploration_period = float(exploration_period)
        self.minibatch_size = minibatch_size
        self.discount_factor = tf.constant(discount_factor)
        self.experience_replay_buffer = experience_replay_buffer
        self.summary_writer = summary_writer
        self.reward_clipping = reward_clipping

        self.target_qnet_update_frequency = target_qnet_update_frequency
        self.initial_exploration_epsilon = initial_exploration_epsilon
        self.final_exploration_epsilon = final_exploration_epsilon
        self.num_training_steps = 0

Initialize the primary dueling DQN by creating an instance of our QNetworkDueling class:

        self.qnet = QNetworkDueling(self.state_size, self.action_size, "qnet")

Similarly, initialize the target dueling DQN:


        self.target_qnet = QNetworkDueling(self.state_size, self.action_size, "target_qnet")

Next, initialize the optimizer as an RMSPropOptimizer:

        self.qnet_optimizer = tf.train.RMSPropOptimizer(learning_rate=0.00025, decay=0.99, epsilon=0.01) 

Now, initialize the experience replay memory by creating an instance of our ReplayMemoryFast class:

        self.experience_replay = ReplayMemoryFast(self.experience_replay_buffer, self.minibatch_size)

        # set up the computational graph
        self.create_graph()

Next, we define the copy_to_target_network function for copying weights from the primary network to our target network:

    @staticmethod
    def copy_to_target_network(source_network, target_network):
        target_network_update = []

        for v_source, v_target in zip(source_network.variables(), target_network.variables()):
            # overwrite the target network variable with the source network variable
            update_op = v_target.assign(v_source)
            target_network_update.append(update_op)

        # group all the assign ops into a single op
        return tf.group(*target_network_update)
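
Conceptually, this hard copy just overwrites every target network parameter with the corresponding primary network parameter. The following standalone NumPy sketch, using two hypothetical lists of weight arrays, illustrates the same idea outside the TensorFlow graph:

import numpy as np

# two hypothetical networks, each represented as a list of weight arrays
qnet_weights = [np.random.randn(4, 2), np.random.randn(2)]
target_weights = [np.zeros((4, 2)), np.zeros(2)]

# hard copy: overwrite every target parameter with its primary counterpart,
# which is what the grouped assign ops do inside the TensorFlow graph
target_weights = [w.copy() for w in qnet_weights]

print(np.allclose(qnet_weights[0], target_weights[0]))   # True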

Now, we define the create_graph function and build our computational graph:

    def create_graph(self):

We compute the Q values for the given state and select the action that has the maximum Q value:


        with tf.name_scope("pick_action"):

            # placeholder for the state
            self.state = tf.placeholder(tf.float32, (None,)+self.state_size, name="state")

            # q values predicted by the primary network for the given state
            self.q_values = tf.identity(self.qnet(self.state), name="q_values")

            # greedy action, i.e. the action with the maximum q value
            self.predicted_actions = tf.argmax(self.q_values, dimension=1, name="predicted_actions")

            # histogram summary to track the mean of the max q values during learning
            tf.histogram_summary("Q values", tf.reduce_mean(tf.reduce_max(self.q_values, 1)))

Next, we calculate the target Q values, that is, the discounted future reward. Note that this follows double DQN: the best action for the next state is selected by the primary network, while its Q value is taken from the target network:

        with tf.name_scope("estimating_future_rewards"):

            # placeholder for the next state
            self.next_state = tf.placeholder(tf.float32, (None,)+self.state_size, name="next_state")

            # mask that is 0 for terminal next states and 1 otherwise
            self.next_state_mask = tf.placeholder(tf.float32, (None,), name="next_state_mask")

            # placeholder for the rewards
            self.rewards = tf.placeholder(tf.float32, (None,), name="rewards")

            # q values of the next state from the target network
            self.next_q_values_targetqnet = tf.stop_gradient(self.target_qnet(self.next_state), name="next_q_values_targetqnet")

            # q values of the next state from the primary network
            self.next_q_values_qnet = tf.stop_gradient(self.qnet(self.next_state), name="next_q_values_qnet")

            # double DQN: select the next action using the primary network...
            self.next_selected_actions = tf.argmax(self.next_q_values_qnet, dimension=1)
            self.next_selected_actions_onehot = tf.one_hot(indices=self.next_selected_actions, depth=self.action_size)

            # ...and evaluate it using the target network, masking out terminal states
            self.next_max_q_values = tf.stop_gradient( tf.reduce_sum( tf.mul( self.next_q_values_targetqnet, self.next_selected_actions_onehot ), reduction_indices=[1,] ) * self.next_state_mask )

            # target q values
            self.target_q_values = self.rewards + self.discount_factor*self.next_max_q_values
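
The following standalone NumPy sketch mirrors this target computation with made-up Q values for a batch of two transitions, the second of which is terminal:

import numpy as np

discount_factor = 0.99

# hypothetical Q values for the next states of two transitions
next_q_values_qnet = np.array([[1.0, 3.0, 2.0],        # primary network
                               [0.5, 0.2, 0.9]])
next_q_values_targetqnet = np.array([[1.1, 2.5, 2.0],  # target network
                                     [0.4, 0.3, 1.0]])

rewards = np.array([1.0, 0.0])
next_state_mask = np.array([1.0, 0.0])   # the second transition is terminal

# double DQN: pick the next action with the primary network...
next_selected_actions = np.argmax(next_q_values_qnet, axis=1)

# ...and evaluate it with the target network, masking out terminal states
next_max_q_values = next_q_values_targetqnet[np.arange(2), next_selected_actions] * next_state_mask

target_q_values = rewards + discount_factor * next_max_q_values
print(target_q_values)   # approximately [3.475, 0.0]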

Next, we define the loss with error clipping and minimize it using the RMSProp optimizer:

        with tf.name_scope("optimization_step"):

            # one-hot mask for the actions that were actually taken
            self.action_mask = tf.placeholder(tf.float32, (None, self.action_size), name="action_mask")

            # q values of the selected actions
            self.y = tf.reduce_sum( self.q_values * self.action_mask, reduction_indices=[1,])

            # error clipping (Huber-style loss)
            self.error = tf.abs(self.y - self.target_q_values)
            quadratic_part = tf.clip_by_value(self.error, 0.0, 1.0)
            linear_part = self.error - quadratic_part
            self.loss = tf.reduce_mean( 0.5*tf.square(quadratic_part) + linear_part )

            # compute the gradients, clip them by norm, and apply them
            qnet_gradients = self.qnet_optimizer.compute_gradients(self.loss, self.qnet.variables())
            for i, (grad, var) in enumerate(qnet_gradients):
                if grad is not None:
                    qnet_gradients[i] = (tf.clip_by_norm(grad, 10), var)

            self.qnet_optimize = self.qnet_optimizer.apply_gradients(qnet_gradients)
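
This is a Huber-style clipped error: errors up to 1 contribute quadratically to the loss, and anything beyond that only linearly, so a single wildly wrong transition cannot blow up the gradients. A quick standalone NumPy sketch with made-up TD errors shows the effect:

import numpy as np

# hypothetical absolute TD errors for a minibatch of four transitions
error = np.array([0.2, 0.8, 3.0, 10.0])

quadratic_part = np.clip(error, 0.0, 1.0)
linear_part = error - quadratic_part

loss_per_sample = 0.5 * np.square(quadratic_part) + linear_part
print(loss_per_sample)          # [0.02  0.32  2.5   9.5 ]
print(loss_per_sample.mean())   # the scalar that is actually minimized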

Finally, define the op that copies the primary network weights to the target network, and merge all the summaries:

        with tf.name_scope("target_network_update"):
            self.hard_copy_to_target = DQN.copy_to_target_network(self.qnet, self.target_qnet)

        # merge all the summaries; this op is run later in train() as self.summarize
        self.summarize = tf.merge_all_summaries()

We define the store function for storing all the experiences in the experience replay buffer:

    def store(self, state, action, reward, next_state, is_terminal):
        # reward clipping
        if self.reward_clipping > 0.0:
            reward = np.clip(reward, -self.reward_clipping, self.reward_clipping)

        # store the transition in the replay memory
        self.experience_replay.store(state, action, reward, next_state, is_terminal)
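
Reward clipping keeps rewards of very different magnitudes on a comparable scale. As a small illustration, with a hypothetical reward_clipping value of 1.0, every reward is squeezed into [-1, 1]:

import numpy as np

reward_clipping = 1.0                     # hypothetical setting
raw_rewards = [0.5, 10.0, -3.0, 0.0]

clipped = [float(np.clip(r, -reward_clipping, reward_clipping))
           for r in raw_rewards]
print(clipped)                            # [0.5, 1.0, -1.0, 0.0]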

We define the action function for selecting actions using a decaying epsilon-greedy policy:

    def action(self, state, training = False):

        # anneal epsilon linearly over the exploration period
        if self.num_training_steps > self.exploration_period:
            epsilon = self.final_exploration_epsilon
        else:
            epsilon = self.initial_exploration_epsilon - float(self.num_training_steps) * (self.initial_exploration_epsilon - self.final_exploration_epsilon) / self.exploration_period

        # during evaluation, use a small fixed epsilon
        if not training:
            epsilon = 0.05

        # with probability epsilon pick a random action, otherwise act greedily
        if random.random() <= epsilon:
            action = random.randint(0, self.action_size-1)
        else:
            action = self.session.run(self.predicted_actions, {self.state: [state]})[0]

        return action
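
The schedule anneals epsilon linearly from initial_exploration_epsilon down to final_exploration_epsilon over exploration_period training steps and keeps it constant afterwards. The standalone sketch below traces the same schedule with hypothetical settings:

def epsilon_at(step, exploration_period=1000,
               initial_epsilon=1.0, final_epsilon=0.05):
    # same linear annealing as in action()
    if step > exploration_period:
        return final_epsilon
    return initial_epsilon - step * (initial_epsilon - final_epsilon) / exploration_period

for step in (0, 200, 500, 1000, 2000):
    print("step %d -> epsilon %.3f" % (step, epsilon_at(step)))
# epsilon falls linearly from 1.000 to 0.050 and then stays there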

Now, we define a train function for training our network:

    def train(self):

At the very first training step, copy the primary network weights to the target network:

        if self.num_training_steps == 0:
            print "Training starts..."
            self.qnet.copy_to(self.target_qnet)

Sample experiences from the replay memory:

        minibatch = self.experience_replay.sample()

        # nothing to train on until the replay memory holds enough experiences
        if len(minibatch) == 0:
            return

Get the states, actions, rewards, and next states from the minibatch:

        batch_states = np.asarray( [d[0] for d in minibatch] )

        actions = [d[1] for d in minibatch]
        batch_actions = np.zeros( (self.minibatch_size, self.action_size) )
        for i in xrange(self.minibatch_size):
            batch_actions[i, actions[i]] = 1

        batch_rewards = np.asarray( [d[2] for d in minibatch] )
        batch_newstates = np.asarray( [d[3] for d in minibatch] )

        # mask is 0 for terminal next states and 1 otherwise
        batch_newstates_mask = np.asarray( [not d[4] for d in minibatch] )
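
The one-hot batch_actions matrix is what later feeds the action_mask placeholder: multiplying it with the predicted Q values and summing over the action axis keeps only the Q value of the action that was actually taken. A small NumPy illustration with made-up numbers:

import numpy as np

# hypothetical Q values for a minibatch of two states and three actions
q_values = np.array([[0.1, 0.7, 0.2],
                     [0.9, 0.3, 0.4]])

# one-hot masks for the actions that were actually taken (actions 1 and 0)
action_mask = np.array([[0., 1., 0.],
                        [1., 0., 0.]])

# the same reduction as tf.reduce_sum(q_values * action_mask, reduction_indices=[1,])
y = np.sum(q_values * action_mask, axis=1)
print(y)   # the Q values of the chosen actions: 0.7 and 0.9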

Perform the training operation:


        scores, _ = self.session.run([self.q_values, self.qnet_optimize],
                                     { self.state: batch_states,
                                       self.next_state: batch_newstates,
                                       self.next_state_mask: batch_newstates_mask,
                                       self.rewards: batch_rewards,
                                       self.action_mask: batch_actions} )

Every target_qnet_update_frequency steps, update the target network weights and write out the training summaries:

        if self.num_training_steps % self.target_qnet_update_frequency == 0:

            # hard copy the primary network weights to the target network
            self.session.run( self.hard_copy_to_target )

            print 'mean maxQ in minibatch: ', np.mean(np.max(scores, 1))

            # write the merged summaries so training can be monitored in TensorBoard
            str_ = self.session.run(self.summarize, { self.state: batch_states,
                                        self.next_state: batch_newstates,
                                        self.next_state_mask: batch_newstates_mask,
                                        self.rewards: batch_rewards,
                                        self.action_mask: batch_actions})

            self.summary_writer.add_summary(str_, self.num_training_steps)

        self.num_training_steps += 1
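
To see how these pieces fit together, the following sketch shows a typical interaction loop that alternates between acting, storing the transition, and training. It is illustrative only: it assumes the QNetworkDueling and ReplayMemoryFast classes from the earlier sections, the pre-1.0 TensorFlow API used throughout this code, and a hypothetical gym-style environment env with reset() and step() methods:

# illustrative sketch only: env, num_episodes, and the log directory are
# hypothetical, and the DQN class defined above must be available
session = tf.Session()
agent = DQN(state_size=(84, 84, 4),            # hypothetical stacked-frame shape
            action_size=env.action_space.n,
            session=session,
            summary_writer=tf.train.SummaryWriter("logs"))

session.run(tf.initialize_all_variables())

for episode in range(num_episodes):
    state = env.reset()
    done = False
    while not done:
        action = agent.action(state, training=True)
        next_state, reward, done, _ = env.step(action)
        agent.store(state, action, reward, next_state, done)
        agent.train()
        state = next_state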