Now we will solve the same taxi problem using SARSA:
import gym
import random

env = gym.make('Taxi-v3')
We also initialize the learning rate alpha, the discount factor gamma, and the exploration rate epsilon. The Q table is a dictionary that maps each (state, action) pair to its value:
alpha = 0.85
gamma = 0.90
epsilon = 0.8

Q = {}
for s in range(env.observation_space.n):
    for a in range(env.action_space.n):
        Q[(s,a)] = 0.0
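For the Taxi environment, this builds a table of 500 states x 6 actions = 3,000 entries, all initialized to zero.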
As usual, we define an epsilon_greedy policy for exploration:
def epsilon_greedy(state, epsilon):
    if random.uniform(0,1) < epsilon:
        return env.action_space.sample()
    else:
        return max(list(range(env.action_space.n)), key=lambda x: Q[(state,x)])
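Note that epsilon stays fixed at 0.8 here, so the agent explores 80% of the time throughout training. A common variation, not used in this program, is to shrink epsilon after each episode so the policy gradually becomes greedy; a minimal sketch:

#Hypothetical decay schedule (not part of the program above):
#reduce epsilon after every episode, but never below 0.01
epsilon = max(0.01, epsilon * 0.995)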
Now comes the actual SARSA algorithm:
for i in range(4000):
    #We store the cumulative reward of each episode in r
    r = 0
    #For every episode, we initialize the state,
    state = env.reset()
    #then we pick the first action using the epsilon-greedy policy
    action = epsilon_greedy(state,epsilon)
    while True:
        #Then we perform the action and move to the next state
        nextstate, reward, done, _ = env.step(action)
        #Then we pick the next action using the epsilon-greedy policy
        nextaction = epsilon_greedy(nextstate,epsilon)
        #We update the Q value of the previous state-action pair
        #using the SARSA update rule
        Q[(state,action)] += alpha * (reward + gamma * Q[(nextstate,nextaction)] - Q[(state,action)])
        #Finally, we replace our state and action with the next state
        #and next action
        action = nextaction
        state = nextstate
        r += reward
        #We break the loop if we reach the terminal state of the episode
        if done:
            break
    #Print the return of the episode so we can watch learning progress
    print("total reward:", r)

env.close()
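The heart of the loop is the SARSA update rule:

Q(s,a) = Q(s,a) + alpha * (reward + gamma * Q(s',a') - Q(s,a))

Unlike Q-learning, which bootstraps from the maximum Q(s',a') over all actions, SARSA bootstraps from the Q value of the action a' that the epsilon-greedy policy actually picks next. This is why SARSA is called an on-policy algorithm: it evaluates and improves the same policy it uses to act.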
You can run the program and watch how SARSA finds the optimal path.
The full program is given here:
#Like we did in Q learning, we import the necessary libraries and initialize the environment
import gym
import random

env = gym.make('Taxi-v3')

alpha = 0.85
gamma = 0.90
epsilon = 0.8

#Then we initialize the Q table as a dictionary for storing the state-action values
Q = {}
for s in range(env.observation_space.n):
    for a in range(env.action_space.n):
        Q[(s,a)] = 0.0

#Now, we define a function called epsilon_greedy for picking an action
#according to the epsilon-greedy policy
def epsilon_greedy(state, epsilon):
    if random.uniform(0,1) < epsilon:
        return env.action_space.sample()
    else:
        return max(list(range(env.action_space.n)), key=lambda x: Q[(state,x)])

for i in range(4000):
    #We store the cumulative reward of each episode in r
    r = 0
    #For every episode, we initialize the state,
    state = env.reset()
    #then we pick the first action using the epsilon-greedy policy
    action = epsilon_greedy(state,epsilon)
    while True:
        #Then we perform the action and move to the next state
        nextstate, reward, done, _ = env.step(action)
        #Then we pick the next action using the epsilon-greedy policy
        nextaction = epsilon_greedy(nextstate,epsilon)
        #We update the Q value of the previous state-action pair
        #using the SARSA update rule
        Q[(state,action)] += alpha * (reward + gamma * Q[(nextstate,nextaction)] - Q[(state,action)])
        #Finally, we replace our state and action with the next state
        #and next action
        action = nextaction
        state = nextstate
        r += reward
        #We break the loop if we reach the terminal state of the episode
        if done:
            break
    #Print the return of the episode so we can watch learning progress
    print("total reward:", r)

env.close()
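Once training finishes, you can check what the agent has learned by following the purely greedy policy (epsilon = 0) for one episode. The following is a minimal sketch, not part of the original program; eval_env is a fresh copy of the environment created for this check, since env has already been closed:

#Roll out one episode greedily from the learned Q table
eval_env = gym.make('Taxi-v3')
state = eval_env.reset()
total = 0
while True:
    #Always pick the action with the highest Q value (no exploration)
    action = max(range(eval_env.action_space.n), key=lambda x: Q[(state,x)])
    state, reward, done, _ = eval_env.step(action)
    total += reward
    if done:
        break
print("greedy episode reward:", total)
eval_env.close()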