-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathq_learning-mountaincar.py
109 lines (83 loc) · 3.49 KB
/
q_learning-mountaincar.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
# -*- coding: utf-8 -*-
"""Untitled5.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1tBZyGdFuHcuHACR6pi4scIDoFS75xi9-
"""
import gym
import numpy as np
import matplotlib.pyplot as plt
# Create the MountainCar-v0 environment and reset it to an initial state
env = gym.make('MountainCar-v0')
env.reset()
# Bins per unit of each observation component: 10 per unit of the first
# component and 100 per unit of the second.
_GRID_SCALE = np.array([10, 100])


def _discretize(env, state):
    """Map a continuous observation to integer Q-table indices."""
    scaled = (state - env.observation_space.low) * _GRID_SCALE
    return np.round(scaled, 0).astype(int)


def _reset_env(env):
    """Reset *env* and return only the initial observation.

    Older gym releases return the observation directly, while gym >= 0.26
    and gymnasium return an ``(observation, info)`` tuple.
    """
    result = env.reset()
    if isinstance(result, tuple):
        return result[0]
    return result


def _step_env(env, action):
    """Advance *env* by one step and return ``(observation, reward, done)``.

    Accepts both the 4-tuple ``(obs, reward, done, info)`` and the 5-tuple
    ``(obs, reward, terminated, truncated, info)`` step results.
    """
    result = env.step(action)
    if len(result) == 5:
        observation, reward, terminated, truncated, _ = result
        return observation, reward, bool(terminated or truncated)
    observation, reward, done, _ = result
    return observation, reward, bool(done)


def QLearning(env, learning, discount, epsilon, min_eps, episodes,
              report_every=200):
    """Train a tabular Q-learning agent on a discretized 2-D state space.

    Args:
        env: Environment exposing ``observation_space.low/high``,
            ``action_space.n``, ``reset()``, ``step(action)`` and ``close()``.
        learning: Learning rate applied to each temporal-difference update.
        discount: Discount factor for the bootstrapped next-state value.
        epsilon: Initial probability of taking a random action.
        min_eps: Lower bound epsilon is linearly decayed towards.
        episodes: Number of episodes to run.
        report_every: Number of episodes averaged into each reported value.

    Returns:
        A list with one average total reward per ``report_every`` episodes.
        Episodes left over after the last full window are not reported.

    Raises:
        ValueError: If ``report_every`` is smaller than 1.

    Side effects:
        Prints one progress line per reporting window and closes ``env``
        before returning.
    """
    if report_every < 1:
        raise ValueError('report_every must be a positive integer')

    # Determine size of discretized state space
    num_states = ((env.observation_space.high - env.observation_space.low)
                  * _GRID_SCALE)
    num_states = np.round(num_states, 0).astype(int) + 1

    # Initialize Q table with small random values
    Q = np.random.uniform(low=-1, high=1,
                          size=(num_states[0], num_states[1],
                                env.action_space.n))

    # Initialize variables to track rewards
    reward_list = []
    ave_reward_list = []

    # Per-episode linear reduction in epsilon; guard against episodes == 0
    reduction = (epsilon - min_eps) / episodes if episodes > 0 else 0.0

    for i in range(episodes):
        done = False
        tot_reward = 0
        state_adj = _discretize(env, _reset_env(env))

        while not done:
            # Epsilon-greedy action selection
            if np.random.random() < 1 - epsilon:
                action = np.argmax(Q[state_adj[0], state_adj[1]])
            else:
                action = np.random.randint(0, env.action_space.n)

            state2, reward, done = _step_env(env, action)
            state2_adj = _discretize(env, state2)

            if done and state2[0] >= 0.5:
                # Episode ended with the first observation component at or
                # past 0.5: no future value, so store the reward directly.
                Q[state_adj[0], state_adj[1], action] = reward
            else:
                # Standard Q-learning temporal-difference update
                best_next = np.max(Q[state2_adj[0], state2_adj[1]])
                delta = learning * (reward + discount * best_next
                                    - Q[state_adj[0], state_adj[1], action])
                Q[state_adj[0], state_adj[1], action] += delta

            tot_reward += reward
            state_adj = state2_adj

        # Decay epsilon once per episode, never dropping below min_eps
        if epsilon > min_eps:
            epsilon = max(min_eps, epsilon - reduction)

        # Track rewards and report once per full window
        reward_list.append(tot_reward)
        if (i + 1) % report_every == 0:
            ave_reward = np.mean(reward_list)
            ave_reward_list.append(ave_reward)
            reward_list = []
            print('Episode {} Average Reward: {}'.format(i+1, ave_reward))

    env.close()
    return ave_reward_list
# Run Q-learning: learning rate 0.1, discount 0.9, epsilon decaying linearly
# from 0.8 towards 0 over 10000 episodes
rewards = QLearning(env, 0.1, 0.9, 0.8, 0, 10000)

# Plot rewards. QLearning records one average per 200 episodes, so the k-th
# point (0-based) belongs at episode 200 * (k + 1).
plt.plot(200*(np.arange(len(rewards)) + 1), rewards)
plt.xlabel('Episodes')
plt.ylabel('Average Reward')
plt.title('Average Reward vs Episodes')
plt.show()