# dqn_bot.py
import marlo
import numpy as np
import random
from keras.models import Sequential, load_model
from keras.layers import Dense
from keras.layers import Conv2D
from keras.layers import MaxPooling2D,Flatten, AveragePooling2D
from collections import deque
from keras.models import model_from_yaml
from matplotlib import pyplot as plt
from past.utils import old_div # tutorial 5
import MalmoPython
import sys
import utils
import csv
from time import sleep
import pdb
from keras.backend import manual_variable_initialization
# Notes:
# env.action_space.sample() gives a random action from those available; this applies to any env.
# TODO The loaded model seems to always give action 2 as the best one. I suspect this is an issue with the loading, as the model performs better after a period of training.
#      I've checked that the model correctly pulls in weights, so it can't be that.
# TODO To make best use of the DQN I need to use an LSTM or stack previous game frames before sending them to the NN. See the DeepMind Atari paper or Hausknecht and Stone 2015 (a sketch of frame stacking follows below).
# TODO The info data contains the orientation and position of the agent; we could use this as a feature to train the NN. That might work best as a separate NN that takes in the history of actions taken. How does it deal with the start position being non-zero? How does it deal with the maps changing?
# L: Even just having the NN output the position it thinks it is in would be a useful thing to train. What other features are valuable to extract from the images?
# TODO Consider transfer learning from a pretrained CNN.
# TODO Ben: Create a method to rotate the floor grid by the yaw as a function.
# TODO Johnny: Create a method to do 5 turns in a given direction rather than one.
# TODO Matt: Adapt the NN to output the floor grid as well as the q-values.
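# A minimal sketch of the frame-stacking idea from the TODO above, in the
# spirit of the DeepMind Atari setup. This helper and its names are our own
# illustration, not part of the original code: it keeps the last k frames in a
# deque and concatenates them along the channel axis, so the CNN input shape
# would become (600, 800, 3 * k) rather than (600, 800, 3).
def stackFrames(frame_buffer, new_frame, k=4):
    """Append new_frame to frame_buffer (a deque with maxlen=k) and return the
    stacked observation. Pads with copies of the newest frame until full."""
    frame_buffer.append(new_frame)
    while len(frame_buffer) < k:
        frame_buffer.append(new_frame)
    return np.concatenate(list(frame_buffer), axis=-1)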
def trainAgent(env, agent):
    # Number of steps before an episode is stopped
    goal_steps = 100
    # How many games to train over
    initial_games = 10000
    # Batch size for back-propagation
    batch_size = 16
    scores = deque(maxlen=50)
    results = []
    # Loop over the games initialised
    for i in range(initial_games):
        reward = 0
        game_score = 0
        # Short wait required to prevent loss of connection to marlo
        sleep(2)
        env.reset()
        state = env.last_image
        # For each step of taken action
        for j in range(goal_steps):
            print("Starting goal step: ", j + 1, " of game: ", i + 1, " avg score: ", np.mean(scores))
            action = agent.act(state)
            # Receive the outcome of the action
            new_state, reward, done, info = env.step(action)
            # Add this transition to memory
            agent.memory.append((state, action, reward, new_state, done))
            # Count the reward and record info before checking for termination,
            # so the final reward is included and oldInfo is always defined
            game_score += reward
            oldInfo = info
            if done:
                # scores holds the scores for finished games
                print("Game: ", i, " complete, score: ", game_score, " last 50 scores avg: ", np.mean(scores), " epsilon ", agent.epsilon)
                scores.append(game_score)
                break
            state = new_state
        # This memory holds the last seen game transitions
        if len(agent.memory) > batch_size:
            # Sample a random batch from the memory
            randomBatch = random.sample(agent.memory, batch_size)
            # Perform backpropagation
            agent.replay(randomBatch)
        results.append([game_score, j, oldInfo['observation']['TotalTime'], agent.epsilon])
        # Save results so far
        with open(agent.CSVName, "w") as f:
            wr = csv.writer(f)
            wr.writerows(results)
        # Decay epsilon until it reaches the minimum
        if agent.epsilon > agent.epsilon_min:
            agent.epsilon *= agent.epsilon_decay
        else:
            agent.epsilon = agent.epsilon_min
        # Update the stored model architecture
        model_yaml = agent.model.to_yaml()
        with open("model.yaml", "w") as yaml_file:
            yaml_file.write(model_yaml)
        # Save the weights of the model
        agent.model.save_weights('model_weights.h5')
    return scores
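# A back-of-envelope check (ours, not from the original code): with
# epsilon_decay = 0.999 applied once per game, epsilon falls from 1.0 to the
# epsilon_min of 0.01 after about ln(0.01) / ln(0.999) ~= 4600 games, i.e.
# exploration has mostly annealed away roughly halfway through the 10000
# training games configured above.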
def testAgent(env, agent):
    goal_steps = 500
    initial_games = 50
    scores = deque(maxlen=50)
    for i in range(initial_games):
        reward = 0
        game_score = 0
        env.reset()
        state = env.last_image
        for j in range(goal_steps):
            action = agent.act(state)
            print("Starting goal step: ", j, " of game: ", i, " avg score: ", np.mean(scores), " action: ", action)
            new_state, reward, done, info = env.step(action)
            # Count the reward before checking for termination, so the final
            # reward of the game is included in the score
            game_score += reward
            if done:
                print("Game: ", i, " complete, score: ", game_score, " last 50 scores avg: ", np.mean(scores), " epsilon ", agent.epsilon)
                scores.append(game_score)
                break
            state = new_state
    return scores
class agent:
    def __init__(self, observation_shape, action_size, block_map_shape, load_model_file=False, epsilon=1.0):
        # Initialise parameters for the agent
        self.observation_shape = observation_shape
        self.action_size = action_size
        self.block_list = ['air', 'cobblestone', 'stone', 'gold_block']
        self.block_vision_size = len(self.block_list) * block_map_shape[0] * block_map_shape[1]
        self.memory = deque(maxlen=2000)
        self.gamma = 1.0  # discount rate
        self.epsilon_min = 0.01
        self.epsilon = epsilon
        self.epsilon_decay = 0.999
        self.learning_rate = 0.5
        self.CSVName = 'dqn_bot_results.csv'
        if load_model_file:
            # Load a previously saved model.
            # This is required to stop tensorflow reinitialising weights on model load:
            #manual_variable_initialization(True)
            #self.model = load_model('model.h5')
            #self.model.load_weights('model.h5')
            with open('model.yaml', 'r') as yaml_file:
                loaded_model_yaml = yaml_file.read()
            self.model = model_from_yaml(loaded_model_yaml)
            self.model.load_weights('model_weights.h5')
            self.model.compile(loss='mse', optimizer='rmsprop')
        else:
            # Start from scratch
            self.model = self.create_model()
    def create_model(self):
        model = Sequential()
        # Checked that this processes the colour bands correctly:
        # the default is channels-last, which is what we have.
        # This pooling layer is quite extreme because of memory limits on the machine
        model.add(AveragePooling2D(pool_size=(8, 8), input_shape=self.observation_shape))
        model.add(Conv2D(32, 8, 4))  # Convolutions set to the same as in Lample and Chaplot
        model.add(Conv2D(64, 4, 2))  # Convolutions set to the same as in Lample and Chaplot
        # Flatten is needed to get a single vector as output, otherwise we get a matrix
        model.add(Flatten())
        model.add(Dense(128, activation='relu'))
        model.add(Dense(64, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse', optimizer='rmsprop')
        return model
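    # For reference, a shape walk-through (our arithmetic, assuming the
    # 600x800x3 marlo observations used elsewhere in this file, Keras 2's
    # Conv2D(filters, kernel_size, strides) signature, and the default 'valid'
    # padding): AveragePooling2D(8x8) -> 75x100x3; Conv2D(32, 8, stride 4)
    # -> 17x24x32; Conv2D(64, 4, stride 2) -> 7x11x64; Flatten -> 4928 units,
    # feeding the 128/64 dense layers and the linear q-value output.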
    def act(self, state):
        # With probability epsilon, take a randomly chosen action to allow exploration.
        # When epsilon is high there is a higher chance of this, so we decrease it over time.
        # This results in exploration early on with greater exploitation later.
        if np.random.rand() <= self.epsilon:
            print("Random Action")
            return random.randrange(self.action_size)
        # The reshape assumes the 600x800x3 marlo observation shape
        act_values = self.model.predict(state.reshape([-1, 600, 800, 3]))
        return np.argmax(act_values[0])
    def replay(self, batch):
        # This is how the agent is trained
        x_train = []
        y_train = []
        for state, action, reward, newState, done in batch:
            if done:
                # The game finished on this transition, so the target is just
                # the reward for finishing the game
                target_q = reward
            else:
                # The game did not finish, so apply the Bellman equation: the
                # target is the observed reward plus the discounted estimate of
                # the best value achievable from the next state. We are
                # recalling what happened, not what could happen.
                # target_q is the ground truth Y
                target_q = reward + self.gamma * np.amax(self.model.predict(newState.reshape([-1, 600, 800, 3])))
            # prediction holds the predicted q-values for each available action
            prediction = self.model.predict(state.reshape([-1, 600, 800, 3]))
            # Update only the action that we actually took with the better target from above
            prediction[0][action] = target_q
            # Build the training data for X and Y that we use to fit the CNN on
            x_train.append(state)
            y_train.append(prediction[0])
        # Use the training data to fit the model, via the batch
        self.model.fit(np.asarray(x_train), np.asarray(y_train), epochs=1, verbose=0)
        return
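    # A worked example of the target above (our numbers, purely illustrative):
    # with gamma = 1.0, if a transition earned reward r = -1 and the network's
    # best q-value for the next state is max_a' Q(s', a') = 3.5, then
    # target_q = -1 + 1.0 * 3.5 = 2.5, and only the q-value of the action
    # actually taken is moved towards 2.5.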
    def blockEncoder(self, floorList):
        # We need to convert the block names from strings to vectors as they are categorical data.
        # Takes in an i-length list of blocks with j different block types and
        # returns an i*j length list holding the one-hot encoded version.
        blockList = self.block_list
        # TODO need to simplify the classes to classify these under a type of: air, goal, solid, danger (lava)
        blockDict = {}
        for i, block in enumerate(blockList):
            blockDict[block] = np.zeros(len(blockList))
            blockDict[block][i] = 1
        vectorisedList = []
        for block in floorList:
            # Adds the contents of the one-hot vector to the flat list. N.B. we might
            # want to use append here depending on how we handle the data
            vectorisedList.extend(blockDict[block])
        return vectorisedList
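# Example usage of blockEncoder (ours, for illustration): with the agent's
# block_list of ['air', 'cobblestone', 'stone', 'gold_block'],
#   myagent.blockEncoder(['air', 'stone'])
# returns [1, 0, 0, 0, 0, 0, 1, 0] -- one four-entry one-hot vector per
# floor block, concatenated in floor order.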
def loadMissionFile(filename):
    with open(filename, 'r') as file:
        missionXML = file.read()
    return missionXML
def main():
    if len(sys.argv) > 1:
        env = utils.setupEnv(sys.argv[1])
    else:
        env = utils.setupEnv()
    # Get the shapes of the observation and action spaces - these determine
    # the input and output sizes of the CNN
    observation_shape = env.observation_space.shape
    action_size = env.action_space.n
    # Can start from a pre-built model
    #load = input("Load model? y/n or an epsilon value to continue: ")
    block_map_shape = (4, 4, 3)
    myagent = agent(observation_shape, action_size, block_map_shape)
    scores = trainAgent(env, myagent)
    '''
    if load == 'y':
        myagent = agent(observation_shape, action_size, block_map_shape, True, 0.1)
        scores = testAgent(env, myagent)
    elif load == 'n':
        myagent = agent(observation_shape, action_size, block_map_shape)
        scores = trainAgent(env, myagent)
    else:
        # TODO - how come the 'epsilon value' runs still load a model??
        myagent = agent(observation_shape, action_size, block_map_shape, True, float(load))
        scores = trainAgent(env, myagent)
    '''
    np.savetxt('dqn_botscores', np.array(scores))
    #plt.plot(scores)
    #plt.show()
    return


if __name__ == "__main__":
    main()