TensorFlow - DDPG in Python and Unity
I'm making an autonomous driving vehicle simulation with Unity and Python, communicating over TCP, using TensorFlow, Keras, and the DDPG algorithm.
I found an existing project and modified it to fit my system. The system uses image input to learn the correct actions.
However, when learning starts, the feedforward pass of the actor network outputs incorrect values.
I think the procedure itself is not wrong, but I can't find what is wrong.
This is main.py. It runs on Windows 10, Python 3.6, tensorflow-1.2.0-rc2, and Keras 2.0.6:
import socket
from PIL import Image
import io
import struct
import numpy as np
import tensorflow as tf
from ReplayBuffer import ReplayBuffer
from ActorNetwork import ActorNetwork
from CriticNetwork import CriticNetwork
import keras.backend as K

BUFFER_SIZE = 100000
BATCH_SIZE = 32
GAMMA = 0.99
TAU = 0.001       # target network hyperparameter
LRA = 0.0001      # learning rate for actor
LRC = 0.001       # learning rate for critic
action_dim = 3

# input image dimensions
img_rows, img_cols = 100, 100
if K.image_data_format() == 'channels_first':
    input_shape = (3, img_rows, img_cols)
else:
    input_shape = (img_rows, img_cols, 3)

EXPLORE = 100000
episode_count = 2000
max_steps = 100000
reward = 0
done = False
step = 0
epsilon = 1

sess = tf.Session()
K.set_session(sess)

actor = ActorNetwork(sess, input_shape, action_dim, BATCH_SIZE, TAU, LRA)
critic = CriticNetwork(sess, input_shape, action_dim, BATCH_SIZE, TAU, LRC)
buff = ReplayBuffer(BUFFER_SIZE)

HOST = '127.0.0.1'
PORT = 1024
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)

# connecting to Unity
s.connect((HOST, PORT))
print("connected")

data = s.recv(4)                                      # recv image size
image_buffer_size = int.from_bytes(data, byteorder="little")
data = s.recv(image_buffer_size)                      # recv image
stream = io.BytesIO(data)
img = Image.open(stream)
img_rgb = img.convert('RGB')
imgarr = np.array(img_rgb)

for i in range(episode_count):
    total_reward = 0
    for j in range(max_steps):
        loss = 0
        epsilon -= 1.0 / EXPLORE

        # actor feedforward: predict [steering, acceleration, brake] from the current image
        a_t = actor.model.predict(imgarr.reshape(1, imgarr.shape[0], imgarr.shape[1], imgarr.shape[2]))

        a1_byte = bytearray(struct.pack("f", a_t[0][0]))
        a2_byte = bytearray(struct.pack("f", a_t[0][1]))
        a3_byte = bytearray(struct.pack("f", a_t[0][2]))
        a_byte = a1_byte + a2_byte + a3_byte
        s.send(a_byte)                                # send actions

        reward = s.recv(8)                            # recv reward
        reward = int.from_bytes(reward, byteorder="little")

        data = s.recv(4)                              # recv next image size
        image_buffer_size = int.from_bytes(data, byteorder="little")
        data = s.recv(image_buffer_size)              # recv next image
        stream = io.BytesIO(data)
        img = Image.open(stream)
        img_rgb = img.convert('RGB')
        next_imgarr = np.array(img_rgb)

        buff.add(imgarr, a_t, reward, next_imgarr, done=done)

        batch = buff.getBatch(BATCH_SIZE)
        states = np.asarray([e[0] for e in batch])
        actions = np.asarray([e[1] for e in batch])
        rewards = np.asarray([e[2] for e in batch])
        new_states = np.asarray([e[3] for e in batch])
        dones = np.asarray([e[4] for e in batch])
        y_t = np.asarray([e[1] for e in batch])

        actions = actions.reshape(actions.shape[0], actions.shape[2])
        y_t = y_t.reshape(y_t.shape[0], y_t.shape[2])

        a_t_1 = actor.target_model.predict(new_states)
        target_q_values = critic.target_model.predict([new_states, a_t_1])

        for k in range(len(batch)):
            if dones[k]:
                y_t[k] = rewards[k]
            else:
                y_t[k] = rewards[k] + GAMMA * target_q_values[k]

        if (j % 100) == 0:
            loss += critic.model.train_on_batch([states, actions], y_t)
            a_for_grad = actor.model.predict(states)
            grads = critic.gradients(states, a_for_grad)
            actor.train(states, grads)
            actor.target_train()

            # soft update of the critic target network
            critic_weights = critic.model.get_weights()
            critic_target_weights = critic.target_model.get_weights()
            for w in range(len(critic_weights)):
                critic_target_weights[w] = critic.tau * critic_weights[w] + (1 - critic.tau) * critic_target_weights[w]
            critic.target_model.set_weights(critic_target_weights)

        total_reward += reward
        imgarr = next_imgarr
        step += 1
        if done:
            break
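One thing I am not fully sure about is the TCP framing: as far as I know, socket.recv(n) may return fewer than n bytes, so the single s.recv(image_buffer_size) call could give a partial image. A small read-exact helper along these lines is what I have in mind (recv_exact is just an illustrative name, not part of the project):

def recv_exact(sock, nbytes):
    # loop until exactly nbytes have been read, since recv() may return a partial chunk
    data = b''
    while len(data) < nbytes:
        chunk = sock.recv(nbytes - len(data))
        if not chunk:
            raise ConnectionError("socket closed before the full message arrived")
        data += chunk
    return data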
And ActorNetwork.py:
import numpy as np
import math
from keras.initializers import normal, identity, random_uniform
from keras.models import model_from_json
from keras.models import Sequential, Model
from keras.layers import Dense, Flatten, Input, merge, Lambda, Conv2D, MaxPooling2D
from keras.optimizers import Adam
import tensorflow as tf
import keras.backend as K

HIDDEN1_UNITS = 300
HIDDEN2_UNITS = 600

# input image dimensions
img_rows, img_cols = 100, 100
if K.image_data_format() == 'channels_first':
    input_shape = (3, img_rows, img_cols)
else:
    input_shape = (img_rows, img_cols, 3)


class ActorNetwork(object):
    def __init__(self, sess, state_size, action_size, batch_size, tau, learning_rate):
        self.sess = sess
        self.batch_size = batch_size
        self.tau = tau
        self.learning_rate = learning_rate

        K.set_session(sess)

        self.model, self.weights, self.state = self.create_actor_network(state_size, action_size)
        self.target_model, self.target_weights, self.target_state = self.create_actor_network(state_size, action_size)
        # placeholder for dQ/da coming from the critic
        self.action_gradient = tf.placeholder(tf.float32, [None, action_size])
        self.params_grad = tf.gradients(self.model.output, self.weights, -self.action_gradient)
        grads = zip(self.params_grad, self.weights)
        self.optimize = tf.train.AdamOptimizer(learning_rate).apply_gradients(grads)
        self.sess.run(tf.initialize_all_variables())

    def train(self, states, action_grads):
        self.sess.run(self.optimize, feed_dict={
            self.state: states,
            self.action_gradient: action_grads
        })

    def target_train(self):
        # soft update of the actor target network
        actor_weights = self.model.get_weights()
        actor_target_weights = self.target_model.get_weights()
        for i in range(len(actor_weights)):
            actor_target_weights[i] = self.tau * actor_weights[i] + (1 - self.tau) * actor_target_weights[i]
        self.target_model.set_weights(actor_target_weights)

    def create_actor_network(self, state_size, action_dim):
        S = Input(shape=state_size)
        h0 = Conv2D(32, (3, 3), input_shape=state_size, activation='relu')(S)
        p0 = MaxPooling2D(pool_size=(2, 2))(h0)
        h1 = Conv2D(64, (3, 3), activation='relu')(p0)
        p1 = MaxPooling2D(pool_size=(2, 2))(h1)
        f0 = Flatten()(p1)
        Steering = Dense(1, activation='tanh', init=random_uniform(minval=-(1e-4), maxval=1e-4))(f0)
        Acceleration = Dense(1, activation='sigmoid', init=random_uniform(minval=-(1e-4), maxval=1e-4))(f0)
        Brake = Dense(1, activation='sigmoid', init=random_uniform(minval=-(1e-4), maxval=1e-4))(f0)
        V = merge([Steering, Acceleration, Brake], mode='concat')
        model = Model(input=S, output=V)
        return model, model.trainable_weights, S
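For context, my understanding of what the actor update is supposed to do (this follows the DDPG paper, so I believe the idea itself is right): the policy gradient is approximated as ∇θ J ≈ (1/N) Σ_i ∇a Q(s_i, a)|a=μ(s_i) · ∇θ μ(s_i). In the code above, critic.gradients supplies ∇a Q, and tf.gradients(self.model.output, self.weights, -self.action_gradient) chains it through the policy; the gradient is negated because AdamOptimizer minimizes, so the step effectively performs gradient ascent on Q.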
And CriticNetwork.py:
import numpy as np
import math
from keras.initializers import normal, identity, random_uniform
from keras.models import model_from_json
from keras.models import Sequential, Model
from keras.layers import Dense, Flatten, Input, merge, Lambda, Activation, MaxPooling2D, Conv2D
from keras.optimizers import Adam
import keras.backend as K
import tensorflow as tf

HIDDEN1_UNITS = 300
HIDDEN2_UNITS = 600

# input image dimensions
img_rows, img_cols = 100, 100
if K.image_data_format() == 'channels_first':
    input_shape = (3, img_rows, img_cols)
else:
    input_shape = (img_rows, img_cols, 3)


class CriticNetwork(object):
    def __init__(self, sess, state_size, action_size, batch_size, tau, learning_rate):
        self.sess = sess
        self.batch_size = batch_size
        self.tau = tau
        self.learning_rate = learning_rate
        self.action_size = action_size

        K.set_session(sess)

        self.model, self.action, self.state = self.create_critic_network(state_size, action_size)
        self.target_model, self.target_action, self.target_state = self.create_critic_network(state_size, action_size)
        self.action_grads = tf.gradients(self.model.output, self.action)   # dQ/da
        self.sess.run(tf.initialize_all_variables())

    def gradients(self, states, actions):
        return self.sess.run(self.action_grads, feed_dict={
            self.state: states,
            self.action: actions
        })[0]

    def target_train(self):
        # soft update of the critic target network
        critic_weights = self.model.get_weights()
        critic_target_weights = self.target_model.get_weights()
        for i in range(len(critic_weights)):
            critic_target_weights[i] = self.tau * critic_weights[i] + (1 - self.tau) * critic_target_weights[i]
        self.target_model.set_weights(critic_target_weights)

    def create_critic_network(self, state_size, action_dim):
        S = Input(shape=state_size)
        A = Input(shape=[action_dim], name='action')
        a1 = Dense(HIDDEN2_UNITS, activation='linear')(A)
        w1 = Conv2D(32, (3, 3), input_shape=input_shape, activation='relu')(S)
        m1 = MaxPooling2D(pool_size=(2, 2))(w1)
        h1 = Conv2D(64, (3, 3), activation='relu')(m1)
        m2 = MaxPooling2D(pool_size=(2, 2))(h1)
        f1 = Flatten()(m2)
        h2 = Dense(HIDDEN2_UNITS, activation='linear')(f1)
        h3 = merge([h2, a1], mode='sum')
        h4 = Dense(HIDDEN2_UNITS, activation='relu')(h3)
        V = Dense(action_dim)(h4)
        model = Model(input=[S, A], output=V)
        adam = Adam(lr=self.learning_rate)
        model.compile(loss='mse', optimizer=adam)
        return model, A, S
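As I understand the procedure, the critic is fit by MSE toward the one-step Bellman target computed in main.py, y_k = r_k + γ · Q′(s′_k, μ′(s′_k)), where Q′ and μ′ are the target critic and target actor, and y_k = r_k for terminal transitions. The action gradient ∇a Q(s, a) exposed by gradients() is what main.py then passes into actor.train.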
And this is ReplayBuffer.py:
from collections import deque
import random


class ReplayBuffer(object):
    def __init__(self, buffer_size):
        self.buffer_size = buffer_size
        self.num_experiences = 0
        self.buffer = deque()

    def getBatch(self, batch_size):
        if self.num_experiences < batch_size:
            return random.sample(self.buffer, self.num_experiences)
        else:
            return random.sample(self.buffer, batch_size)

    def size(self):
        return self.buffer_size

    def add(self, state, action, reward, new_state, done):
        experience = (state, action, reward, new_state, done)
        if self.num_experiences < self.buffer_size:
            self.buffer.append(experience)
            self.num_experiences += 1
        else:
            self.buffer.popleft()
            self.buffer.append(experience)

    def count(self):
        return self.num_experiences

    def erase(self):
        self.buffer = deque()
        self.num_experiences = 0
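To double-check my own understanding of the buffer, here is a minimal usage sketch with dummy data in the same shapes main.py stores (a 100x100 RGB image and a (1, 3) action array); getBatch simply returns every stored experience when fewer than batch_size are available:

import numpy as np
from ReplayBuffer import ReplayBuffer

buff = ReplayBuffer(100000)
state = np.zeros((100, 100, 3))      # dummy image observation
action = np.zeros((1, 3))            # dummy action, same shape as actor.model.predict output
buff.add(state, action, 0, state, False)

batch = buff.getBatch(32)            # only one experience stored, so the batch has length 1
states = np.asarray([e[0] for e in batch])
print(buff.count())                  # 1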
I would appreciate it if you could point out the errors in the code. When the program runs, the output value of actor.model.predict goes to 0 after passing through the actor.train(states, grads) part of main.py.
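For reference, this is the kind of minimal check I can drop into the training block of main.py to see where the collapse happens (it reuses the actor, critic, and states variables from the loop, so it is only a debugging sketch, not part of the training code):

# compare the actor's weights and output around a single training step
before = [np.linalg.norm(w) for w in actor.model.get_weights()]
print("action before train:", actor.model.predict(states[:1]))

a_for_grad = actor.model.predict(states)
grads = critic.gradients(states, a_for_grad)      # dQ/da from the critic
actor.train(states, grads)

after = [np.linalg.norm(w) for w in actor.model.get_weights()]
print("action after train:", actor.model.predict(states[:1]))
print("per-layer weight norm change:", [a - b for a, b in zip(after, before)])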
Thank you for reading.