run_circular_gridworld_reward-discounting.py (forked from LeaMarion/quantum-enhanceable-deep-rl)
import os
import sys
import matplotlib
# matplotlib.use('agg')  # uncomment to use a non-interactive backend (e.g. on a cluster)
import matplotlib.pyplot as plt
import numpy as np
import math
import torch
import pickle
from environments.circular_gridworld import Circular_GridWorld
'''
Usage: run a command of the form `run_circular_gridworld_reward-discounting.py {sarsa-ps} {5-9-13-17-21-25} {0-0.9} True 20/$i {1->$i}`
e.g. `python run_circular_gridworld_reward-discounting.py sarsa 5 0.9 True 20 1`
'''
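# Positional arguments (descriptions inferred from how sys.argv is used below,
# not from any original documentation):
#   args[1]  agent/update rule, "sarsa" or "ps" (selects which agent modules are imported)
#   args[2]  gridworld size, i.e. number of percepts (e.g. 5, 9, 13, 17, 21, 25)
#   args[3]  discount factor GAMMA in [0, 1)
#   args[4]  "True" -> only the shortest action is rewarded (one good action per state)
#   args[5]  number of independent agents to train
#   args[6]  run index, only used to label the output files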
args = sys.argv
if str(args[1]) == "sarsa":
    from agents.debn_sarsa import *
    from agents.dqn_sarsa import *
elif str(args[1]) == "ps":
    from agents.debn_ps import *
    from agents.dqn_ps import *

# Allow only one thread/CPU for pytorch (no gain from multiprocessing)
torch.set_num_threads(1)
### INTERACTION PARAMETERS
EPISODES = 500 # number of episodes for each agent
NUM_AGENTS = int(args[5])
MAX_STEPS_PER_TRIAL = 100
### AGENT PARAMETERS
CAPACITY = 2000 # size of memory
BATCH_SIZE = 10 # size of training batch
GAMMA = float(args[3])
LR = 0.01 # learning rate for optimizer
REPLAY = 10 # every X interactions, the network is trained
UPDATE = 10 # every X experience replays the target net is updated
DROPOUT = [0.,0.] # dropout rate
### ENVIRONMENT PARAMETERS
p = 0.99 # probability with which the behavior policy picks a rewarded action
reward = 1
good = reward*(1-GAMMA+GAMMA*p)/(1-GAMMA) # target value of a rewarded action
bad = reward*GAMMA*p/(1-GAMMA)            # target value of an unrewarded action
#print(good, bad)
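# Sanity check: the two closed forms above are the expected discounted returns of
# a rewarded and an unrewarded action under a behavior policy that picks some
# rewarded action with probability p at every step (see the `policy` expression in
# the training loop below), assuming binary 0/`reward` rewards as the rest of the
# script does. The value of any successor state is then p*reward/(1-GAMMA),
# discounted once by GAMMA:
assert math.isclose(good, reward + GAMMA*p*reward/(1-GAMMA))
assert math.isclose(bad, 0 + GAMMA*p*reward/(1-GAMMA))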
one_all = (str(args[4]) == "True") # when True, only shortest action rewarded
### AGENT-ENV PARAMETERS
num_percepts = int(args[2])
max_num_percepts = max(num_percepts,25) # change when considering larger env
dim_percept = num_percepts
n_action_units = num_percepts+1
num_actions = int((n_action_units/2)**2)
n_hidden_units_layer = 10 # at largest env size
nb_hidden_layers = 2
max_num_actions = int(((max_num_percepts+1)/2)**2)
# Compute nb weights debn and dqn
n_weights = (max_num_percepts + nb_hidden_layers + max_num_actions)*n_hidden_units_layer+(n_hidden_units_layer**2)*(nb_hidden_layers-1)
print("Nb weights: ", n_weights)
b = dim_percept + nb_hidden_layers + n_action_units
delta = b**2 + 4*(nb_hidden_layers-1)*(n_weights - dim_percept - n_action_units)
n_hidden_units_layer_debn = (-b+np.sqrt(delta))/(2*(nb_hidden_layers-1))
b = dim_percept + nb_hidden_layers + num_actions
delta = b**2 + 4*(nb_hidden_layers-1)*n_weights
n_hidden_units_layer_dqn = (-b+np.sqrt(delta))/(2*(nb_hidden_layers-1))
NUM_HIDDEN_DEBN = [int(round(n_hidden_units_layer_debn))]*nb_hidden_layers
NUM_HIDDEN_DQN = [int(round(n_hidden_units_layer_dqn))]*nb_hidden_layers
print("Hidden units DEBN/DQN:", NUM_HIDDEN_DEBN,NUM_HIDDEN_DQN)
### EXECUTION PARAMETERS
plot = True
dump = True
save_agent = True
filename = "results/"+str(args[1])+"_circular_gridworld"+"_onegoodaction"*one_all+"_target"+str(UPDATE)+"_agents"+str(args[5])+"_gamma"+str(args[3])+"_percept"+str(dim_percept)+"_layers"+str(nb_hidden_layers)+"_units"+str(n_hidden_units_layer)+"_actionunits"+str(n_action_units)+"-"+str(args[6])+".pckl"
print("Filename:", filename)
def one_hot(percept):
    # One-hot encoding of a percept index over the grid
    encoding = np.zeros(env.gridlength)
    encoding[int(percept)] = 1.
    return encoding

def evaluate_m_values_debn(percept, all_actions, agent):
    # Query the DEBN policy network once per action and collect its m-values
    m_values = []
    for action in all_actions:
        m_values += [agent.policy_net(percept, action)[0].detach().numpy()[0]]
    return np.array(m_values)

def evaluate_m_values_dqn(percept, agent):
    # The DQN outputs the m-values of all actions in a single forward pass
    m_values = agent.policy_net(percept)[0].detach().numpy()
    return np.array(m_values)

def mse(vector_1, vector_2):
    return np.mean((vector_1 - vector_2)**2)
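# Note: the evaluate_* helpers above are only used for monitoring; during training
# their outputs are compared (via mse) against the analytic targets good/bad to
# produce the test losses that are logged and plotted in the main block below.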
if __name__ == "__main__":
    env = Circular_GridWorld(grid_size=np.array([dim_percept]), one_action=one_all)
    percept_size = env.gridlength
    all_actions = env.generate_all_actions()
    action_size = int(env.num_actions)
    action_length = int(percept_size+1) # Composite movement (i to left, j to right), 2-hot encoding
    print('Nb states: ', percept_size)
    print('Nb actions: ', action_size)

    # Precompute the environment's immediate reward for every (percept, action) pair
    rewards = np.zeros((percept_size, action_size))
    for percept in range(env.gridlength):
        state = np.array([float(percept)])
        for i in range(all_actions.size(0)):
            action = all_actions[i]
            rewards[percept][i] += env.step(action, state)[1]
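    # rewards[s][a] now holds the immediate reward of action a in percept s; the code
    # below treats each row as a 0/1 mask of rewarded actions (e.g. via 1-rewarded),
    # which assumes Circular_GridWorld returns binary rewards.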
    # Train DEBN and DQN off-policy
    all_full_losses_debn = []
    all_full_losses_dqn = []
    for ag in range(NUM_AGENTS):
        print("Agent #{}".format(ag))
        sys.stdout.flush()
        agent = DEBNAgent(percept_size, action_length, all_actions,
                          capacity=CAPACITY, batch_size=BATCH_SIZE, gamma=GAMMA, learning_rate=LR,
                          dim_hidden=NUM_HIDDEN_DEBN, replay_time=REPLAY,
                          dropout_rate=DROPOUT, target_update=UPDATE)
        dqn_agent = DQNAgent(percept_size, action_size,
                             capacity=CAPACITY, batch_size=BATCH_SIZE, gamma=GAMMA, learning_rate=LR,
                             dim_hidden=NUM_HIDDEN_DQN, replay_time=REPLAY,
                             dropout_rate=DROPOUT, target_update=UPDATE)
        full_losses_debn = []
        full_losses_dqn = []
        for e in range(EPISODES):
            reward = 0
            # Draw a random starting percept and encode it for the networks
            percept = float(np.random.choice(env.gridlength))
            state = np.array([percept])
            percept = one_hot(percept)
            percept = np.reshape(percept, [1, percept_size])
            percept = torch.Tensor(percept)
            rewarded = rewards[int(state[0])]
            losses_debn = []
            losses_dqn = []
            for t in range(1, MAX_STEPS_PER_TRIAL+1):
                # Off-policy behavior: distribute probability p uniformly over the
                # rewarded actions and 1-p uniformly over the unrewarded ones
                policy = rewarded*p/np.sum(rewarded) + (1-rewarded)*(1-p)/np.sum(1-rewarded)
                action_id = np.random.choice(range(action_size), p=policy)
                action = all_actions[action_id]
                agent.deliberate_and_learn(percept, action, reward, GAMMA, done=(t==MAX_STEPS_PER_TRIAL))
                dqn_agent.deliberate_and_learn(percept, action_id, reward, GAMMA, done=(t==MAX_STEPS_PER_TRIAL))
                reward = rewarded[action_id]
                # Percepts are drawn i.i.d.: sample and encode the next percept
                percept = float(np.random.choice(env.gridlength))
                state = np.array([percept])
                percept = one_hot(percept)
                percept = np.reshape(percept, [1, percept_size])
                percept = torch.Tensor(percept)
                rewarded = rewards[int(state[0])]
                # Monitor test performance during the last 10% of the steps of each episode
                if t >= 9*MAX_STEPS_PER_TRIAL/10:
                    target_m_values = rewarded*good + (1-rewarded)*bad
                    loss_debn = mse(target_m_values, evaluate_m_values_debn(percept, all_actions, agent))
                    loss_dqn = mse(target_m_values, evaluate_m_values_dqn(percept, dqn_agent))
                    losses_debn += [loss_debn]
                    losses_dqn += [loss_dqn]
            full_losses_debn += [np.mean(losses_debn)]
            full_losses_dqn += [np.mean(losses_dqn)]
            if e % 10 == 0:
                print(e, full_losses_debn[-1])
                print("dqn", full_losses_dqn[-1])
                print("")
        # Save the first agent's networks and (incrementally) the loss curves
        if save_agent and (ag == 0):
            model_filename = ('saved_models/' + str(args[1]) + "_circular_gridworld" + "_onegoodaction"*one_all
                              + "_target" + str(UPDATE) + "_agents" + str(args[5]) + "_gamma" + str(args[3])
                              + "_percept" + str(dim_percept) + "_layers" + str(nb_hidden_layers)
                              + "_units" + str(n_hidden_units_layer) + "_actionunits" + str(n_action_units)
                              + "-" + str(args[6]) + ".pckl")
            with open(model_filename, "wb") as file:
                pickle.dump([agent.policy_net, dqn_agent.policy_net], file)
        all_full_losses_debn += [full_losses_debn]
        all_full_losses_dqn += [full_losses_dqn]
        if dump:
            pickle.dump([all_full_losses_debn, all_full_losses_dqn], open(filename, "wb"))
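    # Each entry of all_full_losses_* is one agent's curve of per-episode test losses,
    # so the arrays below have shape (NUM_AGENTS, EPISODES); the plot shows their mean
    # and a one-standard-deviation band across agents.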
    if plot:
        w = 20   # figure width (inches)
        h = 15   # figure height (inches)
        d = 130  # dpi
        plt.figure(figsize=(w, h), dpi=d)
        all_full_losses_debn = np.array(all_full_losses_debn)
        all_full_losses_dqn = np.array(all_full_losses_dqn)
        mean_debn = np.mean(all_full_losses_debn, axis=0)
        std_debn = np.std(all_full_losses_debn, axis=0)
        mean_dqn = np.mean(all_full_losses_dqn, axis=0)
        std_dqn = np.std(all_full_losses_dqn, axis=0)
        x = range(len(mean_dqn))
        # Top panel: log scale; DQN in red, DEBN in green, shaded bands are +/- one std
        plt.subplot(211)
        plt.title('Testing loss during off-policy training')
        plt.grid(True)
        plt.yscale('log')
        plt.fill_between(x, mean_dqn+std_dqn, mean_dqn-std_dqn, alpha=0.3, color='r')
        plt.plot(x, mean_dqn, c='r')
        plt.fill_between(x, mean_debn+std_debn, mean_debn-std_debn, alpha=0.3, color='g')
        plt.plot(x, mean_debn, c='g')
        # Bottom panel: the same curves on a linear scale
        plt.subplot(212)
        plt.grid(True)
        plt.fill_between(x, mean_dqn+std_dqn, mean_dqn-std_dqn, alpha=0.3, color='r')
        plt.plot(x, mean_dqn, c='r')
        plt.fill_between(x, mean_debn+std_debn, mean_debn-std_debn, alpha=0.3, color='g')
        plt.plot(x, mean_debn, c='g')
        #plt.savefig(filename[:-5]+'.png')
        plt.show()