diff --git a/feudalconfig.cfg b/feudalconfig.cfg
index 3b310f37132adf5716e39f4f3cae7878c5b4738b..cb24e8f4c4fc8173412f33b7db0eacc779d297f9 100644
--- a/feudalconfig.cfg
+++ b/feudalconfig.cfg
@@ -72,7 +72,7 @@ sample_argmax = False
 features=learned
 si_policy_type=acer
 only_master = True
-jsd_reward = True
+jsd_reward = False
 #jsd_function = tanh
 js_threshold = 0.2
 js_threshold_master = 1
@@ -89,7 +89,7 @@ env_model_path = env_model/env1_acer_200.pkl
 
 [dqnpolicy]
 q_update = double
-architecture = duel
+architecture = noisy_duel
 #architecture = duel
 h1_size = 300
 h2_size = 100
diff --git a/policy/feudalgainRL/FeudalNoisyDQNPolicy.py b/policy/feudalgainRL/FeudalNoisyDQNPolicy.py
index 5ada20841ad370836f976327ba8e6c2c4422a4f2..3fffadbb49839298c03157c5d4e11cdab6add668 100644
--- a/policy/feudalgainRL/FeudalNoisyDQNPolicy.py
+++ b/policy/feudalgainRL/FeudalNoisyDQNPolicy.py
@@ -106,7 +106,7 @@ class FeudalDQNPolicy(policy.DQNPolicy.DQNPolicy):
         if cfg.has_option('feudalpolicy', 'actfreq_ds'):
             self.actfreq_ds = cfg.getboolean('feudalpolicy', 'actfreq_ds')
 
-        self.use_pass = True
+        self.use_pass = False
         if cfg.has_option('feudalpolicy', 'use_pass'):
             self.use_pass = cfg.getboolean('feudalpolicy', 'use_pass')
 
@@ -320,7 +320,6 @@ class FeudalDQNPolicy(policy.DQNPolicy.DQNPolicy):
             logger.info('start training...')
 
             a_batch_one_hot_new = None
 
-            #updating only states where the action is not "pass()" complicates things :/
             #since in a batch we can take only non-pass() actions, we have to loop a bit until we get enough samples
             if self.js_threshold < 1.0 or not self.use_pass:
@@ -363,7 +362,6 @@ class FeudalDQNPolicy(policy.DQNPolicy.DQNPolicy):
                 t_batch_new = t_batch
 
             if self.js_threshold < 1.0 or self.jsd_reward:
-                #TODO: This is highly inefficient
                 js_divergence_batch = []
                 for belief, belief2, slot in zip(s_batch_beliefstate, s2_batch_beliefstate, s_batch_chosen_slot):
                     if slot != "None":
diff --git a/policy/feudalgainRL/noisydqn.py b/policy/feudalgainRL/noisydqn.py
index 03bdb48dd6116ddbe3992390dcf0165dc4732da9..d59aa3b7b12ccd9d23b0dc1e60b1abf048076ca3 100644
--- a/policy/feudalgainRL/noisydqn.py
+++ b/policy/feudalgainRL/noisydqn.py
@@ -29,236 +29,6 @@ Author: Pei-Hao Su
 """
 import tensorflow as tf
 
-# ===========================
-#   Deep Q Network
-# ===========================
-class DeepQNetwork(object):
-    """
-    Input to the network is the state and action, output is Q(s,a).
-    """
-    def __init__(self, sess, state_dim, action_dim, learning_rate, tau, num_actor_vars, minibatch_size=64,
-                 architecture='duel', h1_size=130, h2_size=50, dropout_rate=0.):
-        self.sess = sess
-        self.s_dim = state_dim
-        self.a_dim = action_dim
-        self.learning_rate = learning_rate
-        self.tau = tau
-        self.architecture = architecture
-        self.h1_size = h1_size
-        self.h2_size = h2_size
-        self.minibatch_size = minibatch_size
-
-        # Create the deep Q network
-        self.inputs, self.action, self.Qout = \
-            self.create_ddq_network(self.architecture, self.h1_size, self.h2_size, dropout_rate=dropout_rate)
-        self.network_params = tf.trainable_variables()
-
-        # Target Network
-        self.target_inputs, self.target_action, self.target_Qout = \
-            self.create_ddq_network(self.architecture, self.h1_size, self.h2_size, dropout_rate=dropout_rate)
-        self.target_network_params = tf.trainable_variables()[len(self.network_params):]
-
-        # Op for periodically updating target network
-        self.update_target_network_params = \
-            [self.target_network_params[i].assign(\
-                tf.multiply(self.network_params[i], self.tau) + tf.multiply(self.target_network_params[i], 1. - self.tau))
-             for i in range(len(self.target_network_params))]
-
-        # Network target (y_i)
-        self.sampled_q = tf.placeholder(tf.float32, [None, 1])
-        #self.temperature = tf.placeholder(shape=None,dtype=tf.float32)
-
-        # for Boltzman exploration
-        #self.softmax_Q = tf.nn.softmax(self.self.Qout/self.temperature)
-
-        # Predicted Q given state and chosed action
-        #actions_one_hot = tf.one_hot(self.action, self.a_dim, 1.0, 0.0, name='action_one_hot')
-        actions_one_hot = self.action
-
-        if architecture!= 'dip':
-            self.pred_q = tf.reshape(tf.reduce_sum(self.Qout * actions_one_hot, axis=1, name='q_acted'),
-                                     [self.minibatch_size, 1])
-        else:
-            self.pred_q = self.Qout #DIP case, not sure if will work
-
-        #self.pred_q = tf.reduce_sum(self.Qout * actions_one_hot, reduction_indices=1, name='q_acted_target')
-
-        #self.a_maxQ = tf.argmax(self.Qout, 1)
-        #action_maxQ_one_hot = tf.one_hot(self.a_maxQ, self.a_dim, 1.0, 0.0, name='action_maxQ_one_hot')
-        #self.action_maxQ_target = tf.reduce_sum(self.target_Qout * action_maxQ_one_hot, reduction_indices=1, name='a_maxQ_target')
-
-        # Define loss and optimization Op
-        self.diff = self.sampled_q - self.pred_q
-        self.loss = tf.reduce_mean(self.clipped_error(self.diff), name='loss')
-
-        self.optimizer = tf.train.AdamOptimizer(self.learning_rate)
-        self.optimize = self.optimizer.minimize(self.loss)
-
-        # gs = tf.gradients(self.loss, self.network_params)
-        # capped_gvs = [(tf.clip_by_value(grad, -3., 3.), var) for grad, var in zip(gs, self.network_params)]
-        #
-        # self.optimize = self.optimizer.apply_gradients(capped_gvs)
-
-    def create_ddq_network(self, architecture='duel', h1_size=130, h2_size=50, dropout_rate=0.):
-        keep_prob = 1 - dropout_rate
-        inputs = tf.placeholder(tf.float32, [None, self.s_dim])
-        action = tf.placeholder(tf.float32, [None, self.a_dim])
-
-        if architecture == 'duel':
-            W_fc1 = tf.Variable(tf.truncated_normal([self.s_dim, h1_size], stddev=0.01))
-            b_fc1 = tf.Variable(tf.zeros([h1_size]))
-            h_fc1 = tf.nn.relu(tf.matmul(inputs, W_fc1) + b_fc1)
-
-            # value function
-            W_value = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01))
-            b_value = tf.Variable(tf.zeros([h2_size]))
-            h_value = tf.nn.relu(tf.matmul(h_fc1, W_value) + b_value)
-
-            W_value = tf.Variable(tf.truncated_normal([h2_size, 1], stddev=0.01))
-            b_value = tf.Variable(tf.zeros([1]))
-            value_out = tf.matmul(h_value, W_value) + b_value
-
-            # advantage function
-            W_advantage = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01))
-            b_advantage = tf.Variable(tf.zeros([h2_size]))
-            h_advantage = tf.nn.relu(tf.matmul(h_fc1, W_advantage) + b_advantage)
-
-            W_advantage = tf.Variable(tf.truncated_normal([h2_size, self.a_dim], stddev=0.01))
-            b_advantage = tf.Variable(tf.zeros([self.a_dim]))
-            Advantage_out = tf.matmul(h_advantage, W_advantage) + b_advantage
-
-            Qout = value_out + (Advantage_out - tf.reduce_mean(Advantage_out, axis=1, keep_dims=True))
-
-        elif architecture == 'noisy_duel':
-            print("WE USE DUEL NOISY ARCHITECTURE")
-            h_fc1 = self.noisy_dense_layer(inputs, self.s_dim, h1_size, activation=tf.nn.relu)
-            # value function
-            h_value = self.noisy_dense_layer(h_fc1, h1_size, h2_size, activation=tf.nn.relu)
-            value_out = self.noisy_dense_layer(h_value, h2_size, 1)
-
-            # advantage function
-            h_advantage = self.noisy_dense_layer(h_fc1, h1_size, h2_size, activation=tf.nn.relu)
-            Advantage_out = self.noisy_dense_layer(h_advantage, h2_size, self.a_dim)
-
-            Qout = value_out + (Advantage_out - tf.reduce_mean(Advantage_out, axis=1, keep_dims=True))
-
-        elif architecture == 'dip':
-
-            # state network
-            W_fc1_s = tf.Variable(tf.truncated_normal([self.s_dim, h1_size], stddev=0.01))
-            b_fc1_s = tf.Variable(tf.zeros([h1_size]))
-            h_fc1_s = tf.nn.relu(tf.matmul(inputs, W_fc1_s) + b_fc1_s)
-
-            # action network
-            W_fc1_a = tf.Variable(tf.truncated_normal([self.a_dim, h1_size], stddev=0.01))
-            b_fc1_a = tf.Variable(tf.zeros([h1_size]))
-            h_fc1_a = tf.nn.relu(tf.matmul(action, W_fc1_a) + b_fc1_a)
-
-            W_fc2_s = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01))
-            b_fc2_s = tf.Variable(tf.zeros([h2_size]))
-            h_fc2_s = tf.nn.relu(tf.matmul(h_fc1_s, W_fc2_s) + b_fc2_s)
-
-            W_fc2_a = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01))
-            b_fc2_a = tf.Variable(tf.zeros([h2_size]))
-            h_fc2_a = tf.nn.relu(tf.matmul(h_fc1_a, W_fc2_a) + b_fc2_a)
-
-            Qout = tf.reduce_sum(tf.multiply(h_fc2_s, h_fc2_a), 1)
-
-        else:
-            W_fc1 = tf.Variable(tf.truncated_normal([self.s_dim, h1_size], stddev=0.01))
-            b_fc1 = tf.Variable(tf.zeros([h1_size]))
-            h_fc1 = tf.nn.relu(tf.matmul(inputs, W_fc1) + b_fc1)
-            if keep_prob < 1:
-                h_fc1 = tf.nn.dropout(h_fc1, keep_prob)
-
-            W_fc2 = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01))
-            b_fc2 = tf.Variable(tf.zeros([h2_size]))
-            h_fc2 = tf.nn.relu(tf.matmul(h_fc1, W_fc2) + b_fc2)
-            if keep_prob < 1:
-                h_fc2 = tf.nn.dropout(h_fc2, keep_prob)
-
-            W_out = tf.Variable(tf.truncated_normal([h2_size, self.a_dim], stddev=0.01))
-            b_out = tf.Variable(tf.zeros([self.a_dim]))
-            Qout = tf.matmul(h_fc2, W_out) + b_out
-
-        return inputs, action, Qout
-
-    def noisy_dense_layer(self, input, input_neurons, output_neurons, activation=tf.identity):
-
-        W_mu = tf.Variable(tf.truncated_normal([input_neurons, output_neurons], stddev=0.01))
-        W_sigma = tf.Variable(tf.truncated_normal([input_neurons, output_neurons], stddev=0.01))
-        W_eps = tf.random_normal(shape=[input_neurons, output_neurons])
-        W = W_mu + tf.multiply(W_sigma, W_eps)
-
-        b_mu = tf.Variable(tf.zeros([output_neurons]))
-        b_sigma = tf.Variable(tf.zeros([output_neurons]))
-        b_eps = tf.random_normal(shape=[output_neurons])
-        b = b_mu + tf.multiply(b_sigma, b_eps)
-
-        return activation(tf.matmul(input, W) + b)
-
-    def train(self, inputs, action, sampled_q):
-        return self.sess.run([self.pred_q, self.optimize, self.loss], feed_dict={ #yes, needs to be changed too
-            self.inputs: inputs,
-            self.action: action,
-            self.sampled_q: sampled_q
-        })
-
-
-    def predict(self, inputs):
-        return self.sess.run(self.Qout, feed_dict={
-            self.inputs: inputs
-        })
-
-    def predict_dip(self, inputs, action):
-        return self.sess.run(self.Qout, feed_dict={
-            self.inputs: inputs,
-            self.action: action
-        })
-
-    def predict_action(self, inputs):
-        return self.sess.run(self.pred_q, feed_dict={
-            self.inputs: inputs
-        })
-
-    def predict_target(self, inputs):
-        return self.sess.run(self.target_Qout, feed_dict={
-            self.target_inputs: inputs
-        })
-
-    def predict_target_dip(self, inputs, action):
-        return self.sess.run(self.target_Qout, feed_dict={
-            self.target_inputs: inputs,
-            self.target_action: action
-        })
-
-    def predict_target_with_action_maxQ(self, inputs):
-        return self.sess.run(self.action_maxQ_target, feed_dict={
-            self.target_inputs: inputs,
-            self.inputs: inputs
-        })
-
-    def update_target_network(self):
-        self.sess.run(self.update_target_network_params) #yes, but no need to change
-
-    def load_network(self, load_filename):
-        self.saver = tf.train.Saver()
-        if load_filename.split('.')[-3] != '0':
-            try:
-                self.saver.restore(self.sess, './' + load_filename)
-                print("Successfully loaded:", load_filename)
-            except:
-                print("Could not find old network weights")
-        else:
-            print('nothing loaded in first iteration')
-
-    def save_network(self, save_filename):
-        print('Saving deepq-network...')
-        self.saver.save(self.sess, './' +save_filename) # yes but no need to change
-
-    def clipped_error(self, x):
-        return tf.where(tf.abs(x) < 1.0, 0.5 * tf.square(x), tf.abs(x) - 0.5) # condition, true, false
-
 
 class NNFDeepQNetwork(object):
     """
@@ -474,159 +244,3 @@ class NNFDeepQNetwork(object):
         self.mean_noisy_b.append(tf.reduce_mean(tf.abs(b_sigma)))
 
         return activation(tf.matmul(input, W) + b)
-
-class RNNFDeepQNetwork(object):
-    """
-    Input to the network is the state and action, output is Q(s,a).
-    """
-    def __init__(self, sess, si_state_dim, sd_state_dim, action_dim, learning_rate, tau, num_actor_vars, minibatch_size=64,
-                 architecture='duel', h1_size=130, h2_size=50, sd_enc_size=40, si_enc_size=80, dropout_rate=0., slot='si'):
-        #super(NNFDeepQNetwork, self).__init__(sess, si_state_dim + sd_state_dim, action_dim, learning_rate, tau, num_actor_vars,
-        #                                      minibatch_size=64, architecture='duel', h1_size=130, h2_size=50)
-        self.sess = sess
-        self.si_dim = si_state_dim
-        self.sd_dim = sd_state_dim
-        self.a_dim = action_dim
-        self.learning_rate = learning_rate
-        self.tau = tau
-        self.architecture = architecture
-        self.h1_size = h1_size
-        self.h2_size = h2_size
-        self.minibatch_size = minibatch_size
-        self.sd_enc_size = sd_enc_size
-        self.si_enc_size = si_enc_size
-        self.dropout_rate = dropout_rate
-
-        # Create the deep Q network
-        self.inputs, self.action, self.Qout = \
-            self.create_rnnfdq_network(self.h1_size, self.h2_size, self.sd_enc_size, self.si_enc_size, self.dropout_rate, slot=slot)
-        self.network_params = tf.trainable_variables()
-
-        # Target Network
-        self.target_inputs, self.target_action, self.target_Qout = \
-            self.create_rnnfdq_network(self.h1_size, self.h2_size, self.sd_enc_size, self.si_enc_size, self.dropout_rate, tn='target', slot=slot)
-        self.target_network_params = tf.trainable_variables()[len(self.network_params):]
-
-        # Op for periodically updating target network
-        self.update_target_network_params = \
-            [self.target_network_params[i].assign(tf.multiply(self.network_params[i], self.tau)
-                                                  + tf.multiply(self.target_network_params[i], 1. - self.tau))
-             for i in range(len(self.target_network_params))]
-
-        # Network target (y_i)
-        self.sampled_q = tf.placeholder(tf.float32, [None, 1])
-
-        # Predicted Q given state and chosed action
-        actions_one_hot = self.action
-
-        if architecture!= 'dip':
-            self.pred_q = tf.reshape(tf.reduce_sum(self.Qout * actions_one_hot, axis=1, name='q_acted'),
-                                     [self.minibatch_size, 1])
-        else:
-            self.pred_q = self.Qout
-
-        # Define loss and optimization Op
-        self.diff = self.sampled_q - self.pred_q
-        self.loss = tf.reduce_mean(self.clipped_error(self.diff), name='loss')
-
-        self.optimizer = tf.train.AdamOptimizer(self.learning_rate)
-        self.optimize = self.optimizer.minimize(self.loss)
-
-    #def create_slot_encoder(self):
-
-
-    def create_rnnfdq_network(self, h1_size=130, h2_size=50, sd_enc_size=40, si_enc_size=80, dropout_rate=0.,
-                              tn='normal', slot='si'):
-        inputs = tf.placeholder(tf.float32, [None, self.sd_dim + self.si_dim])
-        keep_prob = 1 - dropout_rate
-        sd_inputs, si_inputs = tf.split(inputs, [self.sd_dim, self.si_dim], 1)
-        action = tf.placeholder(tf.float32, [None, self.a_dim])
-        if slot == 'sd':
-            sd_inputs = tf.reshape(sd_inputs, (tf.shape(sd_inputs)[0], 1, self.sd_dim))
-
-            #slots encoder
-            with tf.variable_scope(tn):
-                #try:
-                lstm_cell = tf.nn.rnn_cell.GRUCell(self.sd_enc_size)
-                hidden_state = lstm_cell.zero_state(tf.shape(sd_inputs)[0], tf.float32)
-                _, h_sdfe = tf.nn.dynamic_rnn(lstm_cell, sd_inputs, initial_state=hidden_state)
-                #except:
-                #    lstm_cell = tf.contrib.rnn.GRUCell(self.sd_enc_size)
-                #    hidden_state = lstm_cell.zero_state(tf.shape(sd_inputs)[0], tf.float32)
-                #    _, h_sdfe = tf.contrib.rnn.dynamic_rnn(lstm_cell, sd_inputs, initial_state=hidden_state)
-        else:
-            W_sdfe = tf.Variable(tf.truncated_normal([self.sd_dim, sd_enc_size], stddev=0.01))
-            b_sdfe = tf.Variable(tf.zeros([sd_enc_size]))
-            h_sdfe = tf.nn.relu(tf.matmul(sd_inputs, W_sdfe) + b_sdfe)
-            if keep_prob < 1:
-                h_sdfe = tf.nn.dropout(h_sdfe, keep_prob)
-
-        W_sife = tf.Variable(tf.truncated_normal([self.si_dim, si_enc_size], stddev=0.01))
-        b_sife = tf.Variable(tf.zeros([si_enc_size]))
-        h_sife = tf.nn.relu(tf.matmul(si_inputs, W_sife) + b_sife)
-        if keep_prob < 1:
-            h_sife = tf.nn.dropout(h_sife, keep_prob)
-
-        W_fc1 = tf.Variable(tf.truncated_normal([sd_enc_size+si_enc_size, h1_size], stddev=0.01))
-        b_fc1 = tf.Variable(tf.zeros([h1_size]))
-        h_fc1 = tf.nn.relu(tf.matmul(tf.concat((h_sdfe, h_sife), 1), W_fc1) + b_fc1)
-
-        W_fc2 = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01))
-        b_fc2 = tf.Variable(tf.zeros([h2_size]))
-        h_fc2 = tf.nn.relu(tf.matmul(h_fc1, W_fc2) + b_fc2)
-
-        W_out = tf.Variable(tf.truncated_normal([h2_size, self.a_dim], stddev=0.01))
-        b_out = tf.Variable(tf.zeros([self.a_dim]))
-        Qout = tf.matmul(h_fc2, W_out) + b_out
-
-        return inputs, action, Qout
-
-    def predict(self, inputs):
-        return self.sess.run(self.Qout, feed_dict={ #inputs where a single flat_bstate
-            self.inputs: inputs
-        })
-
-    def predict_dip(self, inputs, action):
-        return self.sess.run(self.Qout, feed_dict={ #inputs and action where array of 64 (batch size)
-            self.inputs: inputs,
-            self.action: action
-        })
-
-    def predict_target(self, inputs):
-        return self.sess.run(self.target_Qout, feed_dict={ #inputs where a single flat_bstate
-            self.target_inputs: inputs
-        })
-
-    def predict_target_dip(self, inputs, action):
-        return self.sess.run(self.target_Qout, feed_dict={ #inputs and action where array of 64 (batch size)
-            self.target_inputs: inputs,
-            self.target_action: action
-        })
-
-    def train(self, inputs, action, sampled_q):
-        return self.sess.run([self.pred_q, self.optimize, self.loss], feed_dict={ #all the inputs are arrays of 64
-            self.inputs: inputs,
-            self.action: action,
-            self.sampled_q: sampled_q
-        })
-
-    def clipped_error(self, x):
-        return tf.where(tf.abs(x) < 1.0, 0.5 * tf.square(x), tf.abs(x) - 0.5) # condition, true, false
-
-    def save_network(self, save_filename):
-        print('Saving deepq-network...')
-        self.saver.save(self.sess, save_filename)
-
-    def update_target_network(self):
-        self.sess.run(self.update_target_network_params)
-
-    def load_network(self, load_filename):
-        self.saver = tf.train.Saver()
-        if load_filename.split('.')[-3] != '0':
-            try:
-                self.saver.restore(self.sess, load_filename)
-                print("Successfully loaded:", load_filename)
-            except:
-                print("Could not find old network weights")
-        else:
-            print('nothing loaded in first iteration')
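
Note on the `architecture = noisy_duel` switch in feudalconfig.cfg above: the noisy dueling head builds its value and advantage streams from noisy dense layers, where each weight is a learned mean plus a learned scale multiplied by fresh Gaussian noise, so exploration comes from parameter noise rather than epsilon-greedy. The following is a minimal TF1-style sketch of that layer, mirroring noisy_dense_layer in noisydqn.py; the standalone name noisy_dense and its arguments are illustrative, not part of the patch.

import tensorflow as tf

def noisy_dense(x, n_in, n_out, activation=tf.identity):
    # learnable mean and scale for the weights; noise is resampled on every forward pass
    W_mu = tf.Variable(tf.truncated_normal([n_in, n_out], stddev=0.01))
    W_sigma = tf.Variable(tf.truncated_normal([n_in, n_out], stddev=0.01))
    W = W_mu + W_sigma * tf.random_normal([n_in, n_out])

    # same parameterisation for the bias
    b_mu = tf.Variable(tf.zeros([n_out]))
    b_sigma = tf.Variable(tf.zeros([n_out]))
    b = b_mu + b_sigma * tf.random_normal([n_out])

    return activation(tf.matmul(x, W) + b)

In the dueling case this layer is stacked for both streams and the Q-values are combined as value_out + (Advantage_out - mean(Advantage_out)), as in the noisy_duel branch of create_ddq_network shown in the removed code above.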