tensorflow - DDPG in python and Unity -


I'm building an autonomous driving vehicle simulation in Unity and Python, using TCP, TensorFlow, Keras, and the DDPG algorithm.

I found a project and modified it to fit my system. The system uses an image as input to learn the correct actions.

However, when I start learning, the feedforward pass of the actor network outputs incorrect values.

I think the procedure is not wrong,

but I can't find what is wrong.

This is main.py; it runs on Windows 10 with Python 3.6, tensorflow-1.2.0-rc2, and Keras 2.0.6.

"""DDPG training client.

Connects to a Unity simulation over TCP, receives camera frames and rewards,
sends back three action values per step, and trains the actor/critic
networks from a replay buffer.
"""
import io
import socket
import struct

import keras.backend as K
import numpy as np
import tensorflow as tf
from PIL import Image

from actornetwork import actornetwork
from criticnetwork import criticnetwork
from replaybuffer import replaybuffer

buffer_size = 100000
batch_size = 32
gamma = 0.99
tau = 0.001  # target network hyperparameters
lra = 0.0001  # learning rate actor
lrc = 0.001  # learning rate critic

action_dim = 3
img_rows, img_cols = 100, 100

if K.image_data_format() == 'channels_first':
    input_shape = (3, img_rows, img_cols)
else:
    input_shape = (img_rows, img_cols, 3)

explore = 100000
episode_count = 2000
max_steps = 100000

host = '127.0.0.1'
port = 1024


def recv_exact(sock, size):
    """Read exactly ``size`` bytes from ``sock``.

    A single ``recv(n)`` call may return fewer than ``n`` bytes on a stream
    socket, which would hand a truncated image to the decoder, so keep
    reading until the full payload has arrived.

    Raises:
        ConnectionError: if the peer closes the connection early.
    """
    chunks = []
    remaining = size
    while remaining > 0:
        chunk = sock.recv(remaining)
        if not chunk:
            raise ConnectionError(
                "socket closed with %d bytes still expected" % remaining)
        chunks.append(chunk)
        remaining -= len(chunk)
    return b"".join(chunks)


def recv_image(sock):
    """Receive one length-prefixed image and return it as an RGB array.

    The frame is preceded by a 4-byte little-endian length.
    """
    size = int.from_bytes(recv_exact(sock, 4), byteorder="little")
    img = Image.open(io.BytesIO(recv_exact(sock, size)))
    return np.array(img.convert('RGB'))


def to_network_input(frames):
    """Scale raw 0-255 pixel values to float32 in [0, 1].

    Frames are kept as decoded in the replay buffer and scaled only when fed
    to a network. Feeding raw 0-255 values produces very large
    pre-activations, which likely saturates the tanh/sigmoid action heads
    after the first actor update.
    """
    return np.asarray(frames, dtype=np.float32) / 255.0


def main():
    """Run the DDPG training loop against the Unity simulation."""
    reward = 0
    # NOTE(review): `done` is never updated from the simulator, so an
    # episode only ends after `max_steps` steps.
    done = False
    step = 0
    # NOTE(review): epsilon is decayed but no exploration noise is applied.
    epsilon = 1

    sess = tf.Session()
    K.set_session(sess)

    actor = actornetwork(sess, input_shape, action_dim, batch_size, tau, lra)
    critic = criticnetwork(sess, input_shape, action_dim, batch_size, tau, lrc)
    buff = replaybuffer(buffer_size)

    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        # connecting unity
        s.connect((host, port))
        print("connected")

        imgarr = recv_image(s)  # recv image

        for i in range(episode_count):
            total_reward = 0

            for j in range(max_steps):
                loss = 0
                epsilon -= 1.0 / explore

                a_t = actor.model.predict(to_network_input(imgarr[np.newaxis]))

                # send actions: three native-order floats, back to back
                s.sendall(struct.pack("3f", *a_t[0]))

                # recv reward
                # NOTE(review): decoded as an unsigned 8-byte little-endian
                # integer; confirm this matches what the Unity side sends.
                reward = int.from_bytes(recv_exact(s, 8), byteorder="little")

                next_imgarr = recv_image(s)  # recv next image

                buff.add(imgarr, a_t, reward, next_imgarr, done=done)

                batch = buff.getbatch(batch_size)
                states = to_network_input([e[0] for e in batch])
                actions = np.asarray([e[1] for e in batch])
                rewards = np.asarray([e[2] for e in batch])
                new_states = to_network_input([e[3] for e in batch])
                dones = np.asarray([e[4] for e in batch])

                # Each stored action has shape (1, action_dim); drop the
                # middle axis. The targets start as a copy of the actions so
                # they share their shape and dtype.
                actions = actions.reshape(actions.shape[0], actions.shape[2])
                y_t = actions.copy()

                a_t_1 = actor.target_model.predict(new_states)
                target_q_values = critic.target_model.predict(
                    [new_states, a_t_1])

                for idx, terminal in enumerate(dones):
                    if terminal:
                        y_t[idx] = rewards[idx]
                    else:
                        y_t[idx] = rewards[idx] + gamma * target_q_values[idx]

                if j % 100 == 0:
                    loss += critic.model.train_on_batch([states, actions], y_t)
                    a_for_grad = actor.model.predict(states)
                    grads = critic.gradients(states, a_for_grad)
                    actor.train(states, grads)
                    actor.target_train()

                    # Soft-update the critic target network. The loop
                    # variables must not be named `s`, which is the socket.
                    critic_weights = critic.model.get_weights()
                    critic_target_weights = critic.target_model.get_weights()
                    critic.target_model.set_weights([
                        critic.tau * w + (1 - critic.tau) * target_w
                        for w, target_w in zip(critic_weights,
                                               critic_target_weights)
                    ])

                total_reward += reward
                imgarr = next_imgarr

                step += 1
                if done:
                    break


if __name__ == "__main__":
    main()

And this is actornetwork.py:

import numpy as np
import math
from keras.initializers import normal, identity, random_uniform
from keras.models import model_from_json
from keras.models import Sequential, Model
from keras.layers import Dense, Flatten, Input, merge, Lambda, Conv2D, MaxPooling2D
from keras.layers import concatenate
from keras.optimizers import Adam
import tensorflow as tf
import keras.backend as K

hidden1_units = 300
hidden2_units = 600

# input image dimensions
img_rows, img_cols = 100, 100

if K.image_data_format() == 'channels_first':
    input_shape = (3, img_rows, img_cols)
else:
    input_shape = (img_rows, img_cols, 3)


class actornetwork(object):
    """Actor (policy) network for DDPG with a convolutional image encoder.

    Holds an online model and a target model of identical architecture. The
    online model is trained by applying the critic's action gradients; the
    target model tracks it through soft updates.
    """

    def __init__(self, sess, state_size, action_size, batch_size, tau, learning_rate):
        """Build both networks and the policy-gradient training op.

        Args:
            sess: TensorFlow session shared with Keras.
            state_size: shape of one input image (without the batch axis).
            action_size: number of action outputs.
            batch_size: stored on the instance; not used by this class.
            tau: soft-update rate for the target network.
            learning_rate: Adam learning rate for the actor update.
        """
        self.sess = sess
        self.batch_size = batch_size
        self.tau = tau
        self.learning_rate = learning_rate

        K.set_session(sess)

        self.model, self.weights, self.state = self.create_actor_network(state_size, action_size)
        self.target_model, self.target_weights, self.target_state = self.create_actor_network(state_size, action_size)
        self.action_gradient = tf.placeholder(tf.float32, [None, action_size])
        # The gradient is negated because the optimizer minimises while the
        # actor should ascend the critic's value.
        self.params_grad = tf.gradients(self.model.output, self.weights, -self.action_gradient)
        grads = zip(self.params_grad, self.weights)
        self.optimize = tf.train.AdamOptimizer(learning_rate).apply_gradients(grads)
        # tf.initialize_all_variables() is deprecated in TF 1.x.
        self.sess.run(tf.global_variables_initializer())

    def train(self, states, action_grads):
        """Apply one actor update using gradients of Q w.r.t. the actions."""
        self.sess.run(self.optimize, feed_dict={
            self.state: states,
            self.action_gradient: action_grads,
        })

    def target_train(self):
        """Soft-update the target network: target = tau*online + (1-tau)*target."""
        actor_weights = self.model.get_weights()
        actor_target_weights = self.target_model.get_weights()
        self.target_model.set_weights([
            self.tau * w + (1 - self.tau) * target_w
            for w, target_w in zip(actor_weights, actor_target_weights)
        ])

    def create_actor_network(self, state_size, action_dim):
        """Create the actor model.

        Two conv/max-pool stages feed three single-unit heads: steering
        (tanh), acceleration (sigmoid) and brake (sigmoid), concatenated into
        one output. ``action_dim`` is not used to size the output, which
        always has three units.

        Returns:
            (model, trainable weights, input tensor)
        """
        s = Input(shape=state_size)
        h0 = Conv2D(32, (3, 3), activation='relu')(s)
        p0 = MaxPooling2D(pool_size=(2, 2))(h0)
        h1 = Conv2D(64, (3, 3), activation='relu')(p0)
        p1 = MaxPooling2D(pool_size=(2, 2))(h1)
        f0 = Flatten()(p1)
        # Very small initial weights keep the initial outputs near the
        # centre of each activation's range.
        steering = Dense(1, activation='tanh',
                         kernel_initializer=random_uniform(minval=-1e-4, maxval=1e-4))(f0)
        acceleration = Dense(1, activation='sigmoid',
                             kernel_initializer=random_uniform(minval=-1e-4, maxval=1e-4))(f0)
        brake = Dense(1, activation='sigmoid',
                      kernel_initializer=random_uniform(minval=-1e-4, maxval=1e-4))(f0)
        v = concatenate([steering, acceleration, brake])
        model = Model(inputs=s, outputs=v)
        return model, model.trainable_weights, s

And this is criticnetwork.py:

import numpy as np
import math
from keras.initializers import normal, identity, random_uniform
from keras.models import model_from_json
from keras.models import Sequential
from keras.layers import Dense, Flatten, Input, merge, Lambda, Activation, MaxPooling2D, Conv2D
from keras.layers import add
from keras.models import Sequential, Model
from keras.optimizers import Adam
import keras.backend as K
import tensorflow as tf

hidden1_units = 300
hidden2_units = 600

# input image dimensions
img_rows, img_cols = 100, 100

if K.image_data_format() == 'channels_first':
    input_shape = (3, img_rows, img_cols)
else:
    input_shape = (img_rows, img_cols, 3)


class criticnetwork(object):
    """Critic (Q-value) network for DDPG with a convolutional image encoder.

    Holds an online model, trained with an MSE loss, and a target model of
    identical architecture that tracks it through soft updates.
    """

    def __init__(self, sess, state_size, action_size, batch_size, tau, learning_rate):
        """Build both networks and the op for dQ/da.

        Args:
            sess: TensorFlow session shared with Keras.
            state_size: shape of one input image (without the batch axis).
            action_size: number of action inputs.
            batch_size: stored on the instance; not used by this class.
            tau: soft-update rate for the target network.
            learning_rate: Adam learning rate for the critic.
        """
        self.sess = sess
        self.batch_size = batch_size
        self.tau = tau
        self.learning_rate = learning_rate
        self.action_size = action_size

        K.set_session(sess)

        self.model, self.action, self.state = self.create_critic_network(state_size, action_size)
        self.target_model, self.target_action, self.target_state = self.create_critic_network(state_size, action_size)
        # Gradient of the critic output with respect to the action input.
        self.action_grads = tf.gradients(self.model.output, self.action)
        # tf.initialize_all_variables() is deprecated in TF 1.x.
        self.sess.run(tf.global_variables_initializer())

    def gradients(self, states, actions):
        """Return dQ/da evaluated at the given states and actions."""
        return self.sess.run(self.action_grads, feed_dict={
            self.state: states,
            self.action: actions,
        })[0]

    def target_train(self):
        """Soft-update the target network: target = tau*online + (1-tau)*target."""
        critic_weights = self.model.get_weights()
        critic_target_weights = self.target_model.get_weights()
        self.target_model.set_weights([
            self.tau * w + (1 - self.tau) * target_w
            for w, target_w in zip(critic_weights, critic_target_weights)
        ])

    def create_critic_network(self, state_size, action_dim):
        """Create the critic model.

        The image passes through two conv/max-pool stages and a linear dense
        layer; the action passes through its own linear dense layer. The two
        are summed, passed through a ReLU dense layer, and projected to
        ``action_dim`` linear outputs.

        Returns:
            (compiled model, action input tensor, state input tensor)
        """
        s = Input(shape=state_size)
        a = Input(shape=[action_dim], name='action')
        a1 = Dense(hidden2_units, activation='linear')(a)
        w1 = Conv2D(32, (3, 3), activation='relu')(s)
        m1 = MaxPooling2D(pool_size=(2, 2))(w1)
        h1 = Conv2D(64, (3, 3), activation='relu')(m1)
        m2 = MaxPooling2D(pool_size=(2, 2))(h1)
        f1 = Flatten()(m2)
        h2 = Dense(hidden2_units, activation='linear')(f1)
        h3 = add([h2, a1])
        h4 = Dense(hidden2_units, activation='relu')(h3)
        v = Dense(action_dim)(h4)
        model = Model(inputs=[s, a], outputs=v)
        optimizer = Adam(lr=self.learning_rate)
        model.compile(loss='mse', optimizer=optimizer)
        return model, a, s

This is replaybuffer.py:

from collections import deque
import random


class replaybuffer(object):
    """Fixed-capacity FIFO store of experience tuples with random sampling."""

    def __init__(self, buffer_size):
        """Create an empty buffer that holds at most ``buffer_size`` items."""
        self.buffer_size = buffer_size
        self.num_experiences = 0
        # A bounded deque discards the oldest entry automatically once full.
        self.buffer = deque(maxlen=buffer_size)

    def getbatch(self, batch_size):
        """Return up to ``batch_size`` experiences sampled without replacement.

        When fewer experiences are stored than requested, all of them are
        returned (in random order).
        """
        return random.sample(self.buffer, min(batch_size, self.num_experiences))

    def size(self):
        """Return the capacity of the buffer (not the number stored)."""
        return self.buffer_size

    def add(self, state, action, reward, new_state, done):
        """Append one experience, evicting the oldest when at capacity."""
        self.buffer.append((state, action, reward, new_state, done))
        self.num_experiences = len(self.buffer)

    def count(self):
        """Return the number of experiences currently stored."""
        return self.num_experiences

    def erase(self):
        """Discard all stored experiences."""
        self.buffer = deque(maxlen=self.buffer_size)
        self.num_experiences = 0

I would appreciate it if you could point out the errors in my code. When the program runs, the output value of "actor.predict" goes to 0 after passing the "actor.train(states, grads)" part of main.py.

Thank you for reading.


Comments

Popular posts from this blog

Is there a better way to structure post methods in Class Based Views -

performance - Why is XCHG reg, reg a 3 micro-op instruction on modern Intel architectures? -

c# - Asp.net web api : redirect unauthorized requst to forbidden page -