Solving the taxi problem using SARSA

Now we will solve the same taxi problem using SARSA:

import gym
import random
env = gym.make('Taxi-v1')

Also, we will initialize the learning rate alpha, the discount factor gamma, and the exploration rate epsilon. The Q table is initialized as a dictionary:


alpha = 0.85
gamma = 0.90
epsilon = 0.8

Q = {}
for s in range(env.observation_space.n):
    for a in range(env.action_space.n):
        Q[(s,a)] = 0.0

As usual, we define an epsilon_greedy policy for exploration:

def epsilon_greedy(state, epsilon):
    if random.uniform(0,1) < epsilon:
        return env.action_space.sample()
    else:
        return max(list(range(env.action_space.n)), key = lambda x: Q[(state,x)])

Now, the actual SARSA algorithm comes in:
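The update applied inside the loop below is the standard SARSA rule; it is exactly what the line updating Q[(state,action)] computes:

Q(s, a) \leftarrow Q(s, a) + \alpha \left[ r + \gamma\, Q(s', a') - Q(s, a) \right]

Here a' is the next action selected by the same epsilon-greedy policy; using that behavior action in the target is what makes SARSA an on-policy algorithm.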

for i in range(4000):

    #We store the cumulative reward of each episode in r
    r = 0

    #For every episode, we initialize the state,
    state = env.reset()

    #then we pick an action using the epsilon-greedy policy
    action = epsilon_greedy(state,epsilon)

    while True:

        #We perform the action in the current state and move to the next state
        nextstate, reward, done, _ = env.step(action)

        #Then we pick the next action using the epsilon-greedy policy
        nextaction = epsilon_greedy(nextstate,epsilon)

        #We update the Q value of the current state-action pair using the SARSA update rule
        Q[(state,action)] += alpha * (reward + gamma * Q[(nextstate,nextaction)] - Q[(state,action)])

        #Finally, we update our state and action with the next state
        #and next action
        action = nextaction
        state = nextstate
        r += reward

        #We break the loop if we reach the terminal
        #state of the episode
        if done:
            break



env.close()

You can run the program and see how SARSA finds the optimal path.
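If you want to watch the learned policy in action, the following minimal sketch (not part of the original program, and assuming the same Gym API used above) replays a single episode greedily from the learned Q table; run it before calling env.close():

#Sketch: replay one episode greedily (epsilon = 0) using the learned Q table
state = env.reset()
done = False
total_reward = 0
while not done:
    #pick the action with the highest Q value in the current state
    action = max(range(env.action_space.n), key = lambda x: Q[(state, x)])
    state, reward, done, _ = env.step(action)
    total_reward += reward
    #render prints the taxi grid to the console so you can follow the path
    env.render()
print("Greedy episode reward:", total_reward)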

The full program is given here:

#As we did in Q learning, we import the necessary libraries and initialize the environment

import gym
import random
env = gym.make('Taxi-v1')

alpha = 0.85
gamma = 0.90
epsilon = 0.8

#Then we initialize the Q table as a dictionary for storing the state-action values
Q = {}
for s in range(env.observation_space.n):
    for a in range(env.action_space.n):
        Q[(s,a)] = 0.0


#Now, we define a function called epsilon_greedy for picking an action
#according to the epsilon-greedy policy
def epsilon_greedy(state, epsilon):
    if random.uniform(0,1) < epsilon:
        return env.action_space.sample()
    else:
        return max(list(range(env.action_space.n)), key = lambda x: Q[(state,x)])


for i in range(4000):

    #We store the cumulative reward of each episode in r
    r = 0

    #For every episode, we initialize the state,
    state = env.reset()

    #then we pick an action using the epsilon-greedy policy
    action = epsilon_greedy(state,epsilon)

    while True:

        #We perform the action in the current state and move to the next state
        nextstate, reward, done, _ = env.step(action)

        #Then we pick the next action using the epsilon-greedy policy
        nextaction = epsilon_greedy(nextstate,epsilon)

        #We update the Q value of the current state-action pair using the SARSA update rule
        Q[(state,action)] += alpha * (reward + gamma * Q[(nextstate,nextaction)] - Q[(state,action)])

        #Finally, we update our state and action with the next state
        #and next action
        action = nextaction
        state = nextstate
        r += reward

        #We break the loop if we reach the terminal
        #state of the episode
        if done:
            break



env.close()