Commit 8838b1d1 authored by Christian

refactored noisydqn

parent 5fa29847
@@ -72,7 +72,7 @@ sample_argmax = False
 features=learned
 si_policy_type=acer
 only_master = True
-jsd_reward = True
+jsd_reward = False
 #jsd_function = tanh
 js_threshold = 0.2
 js_threshold_master = 1
@@ -89,7 +89,7 @@ env_model_path = env_model/env1_acer_200.pkl
 [dqnpolicy]
 q_update = double
-architecture = duel
+architecture = noisy_duel
 #architecture = duel
 h1_size = 300
 h2_size = 100
...
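The [dqnpolicy] hunk is the core of this commit: architecture flips from duel to noisy_duel, which DeepQNetwork uses below to decide which network variant to build, while the first hunk flips jsd_reward from True to False. As a minimal, hypothetical sketch of how such options are typically read with Python's ConfigParser (the config path is made up, and the section holding jsd_reward is assumed to be [feudalpolicy], in line with the policy code that follows):

import configparser

cfg = configparser.ConfigParser()
cfg.read('config/env1_feudal_noisydqn.cfg')             # hypothetical path

architecture = 'duel'                                   # default in DeepQNetwork
if cfg.has_option('dqnpolicy', 'architecture'):
    architecture = cfg.get('dqnpolicy', 'architecture') # 'noisy_duel' after this commit

jsd_reward = False
if cfg.has_option('feudalpolicy', 'jsd_reward'):        # section name assumed
    jsd_reward = cfg.getboolean('feudalpolicy', 'jsd_reward')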
@@ -106,7 +106,7 @@ class FeudalDQNPolicy(policy.DQNPolicy.DQNPolicy):
         if cfg.has_option('feudalpolicy', 'actfreq_ds'):
             self.actfreq_ds = cfg.getboolean('feudalpolicy', 'actfreq_ds')
-        self.use_pass = True
+        self.use_pass = False
         if cfg.has_option('feudalpolicy', 'use_pass'):
             self.use_pass = cfg.getboolean('feudalpolicy', 'use_pass')
@@ -320,7 +320,6 @@ class FeudalDQNPolicy(policy.DQNPolicy.DQNPolicy):
         logger.info('start training...')
         a_batch_one_hot_new = None
-        #updating only states where the action is not "pass()" complicates things :/
         #since in a batch we can take only non-pass() actions, we have to loop a bit until we get enough samples
         if self.js_threshold < 1.0 or not self.use_pass:
@@ -363,7 +362,6 @@ class FeudalDQNPolicy(policy.DQNPolicy.DQNPolicy):
         t_batch_new = t_batch
         if self.js_threshold < 1.0 or self.jsd_reward:
-            #TODO: This is highly inefficient
            js_divergence_batch = []
            for belief, belief2, slot in zip(s_batch_beliefstate, s2_batch_beliefstate, s_batch_chosen_slot):
                if slot != "None":
...
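The hunk above gates the extra work on js_threshold and jsd_reward, and the surrounding training code fills js_divergence_batch from pairs of belief states. For reference, a minimal sketch of a Jensen-Shannon divergence between two discrete belief distributions; the helper name and the renormalisation step are illustrative, not the project's exact implementation:

import numpy as np
from scipy.stats import entropy            # KL divergence when given two distributions

def js_divergence(p, q, base=2.0):
    # Jensen-Shannon divergence between two discrete distributions (sketch).
    p = np.asarray(p, dtype=np.float64)
    q = np.asarray(q, dtype=np.float64)
    p, q = p / p.sum(), q / q.sum()        # renormalise in case the beliefs do not sum to 1
    m = 0.5 * (p + q)
    return 0.5 * entropy(p, m, base=base) + 0.5 * entropy(q, m, base=base)

# With base 2 the divergence lies in [0, 1], so it can be compared directly against
# a threshold such as js_threshold = 0.2 from the config above.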
@@ -29,236 +29,6 @@ Author: Pei-Hao Su
"""
import tensorflow as tf
# ===========================
# Deep Q Network
# ===========================
class DeepQNetwork(object):
"""
Input to the network is the state and action, output is Q(s,a).
"""
def __init__(self, sess, state_dim, action_dim, learning_rate, tau, num_actor_vars, minibatch_size=64,
architecture='duel', h1_size=130, h2_size=50, dropout_rate=0.):
self.sess = sess
self.s_dim = state_dim
self.a_dim = action_dim
self.learning_rate = learning_rate
self.tau = tau
self.architecture = architecture
self.h1_size = h1_size
self.h2_size = h2_size
self.minibatch_size = minibatch_size
# Create the deep Q network
self.inputs, self.action, self.Qout = \
self.create_ddq_network(self.architecture, self.h1_size, self.h2_size, dropout_rate=dropout_rate)
self.network_params = tf.trainable_variables()
# Target Network
self.target_inputs, self.target_action, self.target_Qout = \
self.create_ddq_network(self.architecture, self.h1_size, self.h2_size, dropout_rate=dropout_rate)
self.target_network_params = tf.trainable_variables()[len(self.network_params):]
# Op for periodically updating target network
self.update_target_network_params = \
[self.target_network_params[i].assign(\
tf.multiply(self.network_params[i], self.tau) + tf.multiply(self.target_network_params[i], 1. - self.tau))
for i in range(len(self.target_network_params))]
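# Note: the assign ops above implement the usual soft ("Polyak") target update,
#     theta_target <- tau * theta_online + (1 - tau) * theta_target,
# which update_target_network() runs periodically; tau = 1 would reduce it to a hard
# copy of the online network.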
# Network target (y_i)
self.sampled_q = tf.placeholder(tf.float32, [None, 1])
#self.temperature = tf.placeholder(shape=None,dtype=tf.float32)
# for Boltzmann exploration
#self.softmax_Q = tf.nn.softmax(self.Qout/self.temperature)
# Predicted Q given state and chosen action
#actions_one_hot = tf.one_hot(self.action, self.a_dim, 1.0, 0.0, name='action_one_hot')
actions_one_hot = self.action
if architecture != 'dip':
self.pred_q = tf.reshape(tf.reduce_sum(self.Qout * actions_one_hot, axis=1, name='q_acted'),
[self.minibatch_size, 1])
else:
self.pred_q = self.Qout  # DIP case; not sure if this will work
#self.pred_q = tf.reduce_sum(self.Qout * actions_one_hot, reduction_indices=1, name='q_acted_target')
#self.a_maxQ = tf.argmax(self.Qout, 1)
#action_maxQ_one_hot = tf.one_hot(self.a_maxQ, self.a_dim, 1.0, 0.0, name='action_maxQ_one_hot')
#self.action_maxQ_target = tf.reduce_sum(self.target_Qout * action_maxQ_one_hot, reduction_indices=1, name='a_maxQ_target')
# Define loss and optimization Op
self.diff = self.sampled_q - self.pred_q
self.loss = tf.reduce_mean(self.clipped_error(self.diff), name='loss')
self.optimizer = tf.train.AdamOptimizer(self.learning_rate)
self.optimize = self.optimizer.minimize(self.loss)
# gs = tf.gradients(self.loss, self.network_params)
# capped_gvs = [(tf.clip_by_value(grad, -3., 3.), var) for grad, var in zip(gs, self.network_params)]
#
# self.optimize = self.optimizer.apply_gradients(capped_gvs)
def create_ddq_network(self, architecture='duel', h1_size=130, h2_size=50, dropout_rate=0.):
keep_prob = 1 - dropout_rate
inputs = tf.placeholder(tf.float32, [None, self.s_dim])
action = tf.placeholder(tf.float32, [None, self.a_dim])
if architecture == 'duel':
W_fc1 = tf.Variable(tf.truncated_normal([self.s_dim, h1_size], stddev=0.01))
b_fc1 = tf.Variable(tf.zeros([h1_size]))
h_fc1 = tf.nn.relu(tf.matmul(inputs, W_fc1) + b_fc1)
# value function
W_value = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01))
b_value = tf.Variable(tf.zeros([h2_size]))
h_value = tf.nn.relu(tf.matmul(h_fc1, W_value) + b_value)
W_value = tf.Variable(tf.truncated_normal([h2_size, 1], stddev=0.01))
b_value = tf.Variable(tf.zeros([1]))
value_out = tf.matmul(h_value, W_value) + b_value
# advantage function
W_advantage = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01))
b_advantage = tf.Variable(tf.zeros([h2_size]))
h_advantage = tf.nn.relu(tf.matmul(h_fc1, W_advantage) + b_advantage)
W_advantage = tf.Variable(tf.truncated_normal([h2_size, self.a_dim], stddev=0.01))
b_advantage = tf.Variable(tf.zeros([self.a_dim]))
Advantage_out = tf.matmul(h_advantage, W_advantage) + b_advantage
Qout = value_out + (Advantage_out - tf.reduce_mean(Advantage_out, axis=1, keep_dims=True))
elif architecture == 'noisy_duel':
print("WE USE DUEL NOISY ARCHITECTURE")
h_fc1 = self.noisy_dense_layer(inputs, self.s_dim, h1_size, activation=tf.nn.relu)
# value function
h_value = self.noisy_dense_layer(h_fc1, h1_size, h2_size, activation=tf.nn.relu)
value_out = self.noisy_dense_layer(h_value, h2_size, 1)
# advantage function
h_advantage = self.noisy_dense_layer(h_fc1, h1_size, h2_size, activation=tf.nn.relu)
Advantage_out = self.noisy_dense_layer(h_advantage, h2_size, self.a_dim)
Qout = value_out + (Advantage_out - tf.reduce_mean(Advantage_out, axis=1, keep_dims=True))
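# Both the 'duel' and 'noisy_duel' branches aggregate their two streams with the
# standard dueling combination (Wang et al., 2016):
#     Q(s, a) = V(s) + A(s, a) - mean_a' A(s, a'),
# subtracting the mean advantage keeps the decomposition identifiable without
# changing argmax_a Q(s, a).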
elif architecture == 'dip':
# state network
W_fc1_s = tf.Variable(tf.truncated_normal([self.s_dim, h1_size], stddev=0.01))
b_fc1_s = tf.Variable(tf.zeros([h1_size]))
h_fc1_s = tf.nn.relu(tf.matmul(inputs, W_fc1_s) + b_fc1_s)
# action network
W_fc1_a = tf.Variable(tf.truncated_normal([self.a_dim, h1_size], stddev=0.01))
b_fc1_a = tf.Variable(tf.zeros([h1_size]))
h_fc1_a = tf.nn.relu(tf.matmul(action, W_fc1_a) + b_fc1_a)
W_fc2_s = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01))
b_fc2_s = tf.Variable(tf.zeros([h2_size]))
h_fc2_s = tf.nn.relu(tf.matmul(h_fc1_s, W_fc2_s) + b_fc2_s)
W_fc2_a = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01))
b_fc2_a = tf.Variable(tf.zeros([h2_size]))
h_fc2_a = tf.nn.relu(tf.matmul(h_fc1_a, W_fc2_a) + b_fc2_a)
Qout = tf.reduce_sum(tf.multiply(h_fc2_s, h_fc2_a), 1)
else:
W_fc1 = tf.Variable(tf.truncated_normal([self.s_dim, h1_size], stddev=0.01))
b_fc1 = tf.Variable(tf.zeros([h1_size]))
h_fc1 = tf.nn.relu(tf.matmul(inputs, W_fc1) + b_fc1)
if keep_prob < 1:
h_fc1 = tf.nn.dropout(h_fc1, keep_prob)
W_fc2 = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01))
b_fc2 = tf.Variable(tf.zeros([h2_size]))
h_fc2 = tf.nn.relu(tf.matmul(h_fc1, W_fc2) + b_fc2)
if keep_prob < 1:
h_fc2 = tf.nn.dropout(h_fc2, keep_prob)
W_out = tf.Variable(tf.truncated_normal([h2_size, self.a_dim], stddev=0.01))
b_out = tf.Variable(tf.zeros([self.a_dim]))
Qout = tf.matmul(h_fc2, W_out) + b_out
return inputs, action, Qout
def noisy_dense_layer(self, input, input_neurons, output_neurons, activation=tf.identity):
W_mu = tf.Variable(tf.truncated_normal([input_neurons, output_neurons], stddev=0.01))
W_sigma = tf.Variable(tf.truncated_normal([input_neurons, output_neurons], stddev=0.01))
W_eps = tf.random_normal(shape=[input_neurons, output_neurons])
W = W_mu + tf.multiply(W_sigma, W_eps)
b_mu = tf.Variable(tf.zeros([output_neurons]))
b_sigma = tf.Variable(tf.zeros([output_neurons]))
b_eps = tf.random_normal(shape=[output_neurons])
b = b_mu + tf.multiply(b_sigma, b_eps)
return activation(tf.matmul(input, W) + b)
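# noisy_dense_layer above follows the NoisyNet idea (Fortunato et al., 2017): the
# effective weights are W = W_mu + W_sigma * eps with eps ~ N(0, 1) redrawn on every
# forward pass, so exploration comes from learned parameter noise rather than from
# epsilon-greedy action selection. This variant samples independent Gaussian noise
# per weight, not the factorised noise from the paper.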
def train(self, inputs, action, sampled_q):
return self.sess.run([self.pred_q, self.optimize, self.loss], feed_dict={ #yes, needs to be changed too
self.inputs: inputs,
self.action: action,
self.sampled_q: sampled_q
})
def predict(self, inputs):
return self.sess.run(self.Qout, feed_dict={
self.inputs: inputs
})
def predict_dip(self, inputs, action):
return self.sess.run(self.Qout, feed_dict={
self.inputs: inputs,
self.action: action
})
def predict_action(self, inputs):
return self.sess.run(self.pred_q, feed_dict={
self.inputs: inputs
})
def predict_target(self, inputs):
return self.sess.run(self.target_Qout, feed_dict={
self.target_inputs: inputs
})
def predict_target_dip(self, inputs, action):
return self.sess.run(self.target_Qout, feed_dict={
self.target_inputs: inputs,
self.target_action: action
})
def predict_target_with_action_maxQ(self, inputs):
return self.sess.run(self.action_maxQ_target, feed_dict={
self.target_inputs: inputs,
self.inputs: inputs
})
def update_target_network(self):
self.sess.run(self.update_target_network_params) #yes, but no need to change
def load_network(self, load_filename):
self.saver = tf.train.Saver()
if load_filename.split('.')[-3] != '0':
try:
self.saver.restore(self.sess, './' + load_filename)
print("Successfully loaded:", load_filename)
except:
print("Could not find old network weights")
else:
print('nothing loaded in first iteration')
def save_network(self, save_filename):
print('Saving deepq-network...')
self.saver.save(self.sess, './' +save_filename) # yes but no need to change
def clipped_error(self, x):
return tf.where(tf.abs(x) < 1.0, 0.5 * tf.square(x), tf.abs(x) - 0.5) # condition, true, false
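For context, a minimal sketch of how this (now removed) DeepQNetwork was driven from a TensorFlow 1.x session; the state and action dimensions and the batch contents are placeholders for illustration, while h1_size and h2_size mirror the config above:

import numpy as np
import tensorflow as tf            # TensorFlow 1.x API, as used throughout this file

sess = tf.Session()
dqn = DeepQNetwork(sess, state_dim=268, action_dim=16, learning_rate=0.001,
                   tau=0.02, num_actor_vars=0, minibatch_size=64,
                   architecture='noisy_duel', h1_size=300, h2_size=100)
sess.run(tf.global_variables_initializer())

states  = np.random.rand(64, 268).astype(np.float32)                    # dummy belief states
actions = np.eye(16, dtype=np.float32)[np.random.randint(16, size=64)]  # one-hot actions
targets = np.random.rand(64, 1).astype(np.float32)                      # sampled Q targets (y_i)

pred_q, _, loss = dqn.train(states, actions, targets)
dqn.update_target_network()                            # soft update with rate tau
q_values = dqn.predict(states)                         # Q(s, .) for the whole batch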
class NNFDeepQNetwork(object):
"""
@@ -474,159 +244,3 @@ class NNFDeepQNetwork(object):
self.mean_noisy_b.append(tf.reduce_mean(tf.abs(b_sigma)))
return activation(tf.matmul(input, W) + b)
class RNNFDeepQNetwork(object):
"""
Input to the network is the state and action, output is Q(s,a).
"""
def __init__(self, sess, si_state_dim, sd_state_dim, action_dim, learning_rate, tau, num_actor_vars, minibatch_size=64,
architecture='duel', h1_size=130, h2_size=50, sd_enc_size=40, si_enc_size=80, dropout_rate=0., slot='si'):
#super(NNFDeepQNetwork, self).__init__(sess, si_state_dim + sd_state_dim, action_dim, learning_rate, tau, num_actor_vars,
# minibatch_size=64, architecture='duel', h1_size=130, h2_size=50)
self.sess = sess
self.si_dim = si_state_dim
self.sd_dim = sd_state_dim
self.a_dim = action_dim
self.learning_rate = learning_rate
self.tau = tau
self.architecture = architecture
self.h1_size = h1_size
self.h2_size = h2_size
self.minibatch_size = minibatch_size
self.sd_enc_size = sd_enc_size
self.si_enc_size = si_enc_size
self.dropout_rate = dropout_rate
# Create the deep Q network
self.inputs, self.action, self.Qout = \
self.create_rnnfdq_network(self.h1_size, self.h2_size, self.sd_enc_size, self.si_enc_size, self.dropout_rate, slot=slot)
self.network_params = tf.trainable_variables()
# Target Network
self.target_inputs, self.target_action, self.target_Qout = \
self.create_rnnfdq_network(self.h1_size, self.h2_size, self.sd_enc_size, self.si_enc_size, self.dropout_rate, tn='target', slot=slot)
self.target_network_params = tf.trainable_variables()[len(self.network_params):]
# Op for periodically updating target network
self.update_target_network_params = \
[self.target_network_params[i].assign(tf.multiply(self.network_params[i], self.tau)
+ tf.multiply(self.target_network_params[i], 1. - self.tau))
for i in range(len(self.target_network_params))]
# Network target (y_i)
self.sampled_q = tf.placeholder(tf.float32, [None, 1])
# Predicted Q given state and chosen action
actions_one_hot = self.action
if architecture != 'dip':
self.pred_q = tf.reshape(tf.reduce_sum(self.Qout * actions_one_hot, axis=1, name='q_acted'),
[self.minibatch_size, 1])
else:
self.pred_q = self.Qout
# Define loss and optimization Op
self.diff = self.sampled_q - self.pred_q
self.loss = tf.reduce_mean(self.clipped_error(self.diff), name='loss')
self.optimizer = tf.train.AdamOptimizer(self.learning_rate)
self.optimize = self.optimizer.minimize(self.loss)
#def create_slot_encoder(self):
def create_rnnfdq_network(self, h1_size=130, h2_size=50, sd_enc_size=40, si_enc_size=80, dropout_rate=0.,
tn='normal', slot='si'):
inputs = tf.placeholder(tf.float32, [None, self.sd_dim + self.si_dim])
keep_prob = 1 - dropout_rate
sd_inputs, si_inputs = tf.split(inputs, [self.sd_dim, self.si_dim], 1)
action = tf.placeholder(tf.float32, [None, self.a_dim])
if slot == 'sd':
sd_inputs = tf.reshape(sd_inputs, (tf.shape(sd_inputs)[0], 1, self.sd_dim))
#slots encoder
with tf.variable_scope(tn):
#try:
lstm_cell = tf.nn.rnn_cell.GRUCell(self.sd_enc_size)
hidden_state = lstm_cell.zero_state(tf.shape(sd_inputs)[0], tf.float32)
_, h_sdfe = tf.nn.dynamic_rnn(lstm_cell, sd_inputs, initial_state=hidden_state)
#except:
# lstm_cell = tf.contrib.rnn.GRUCell(self.sd_enc_size)
# hidden_state = lstm_cell.zero_state(tf.shape(sd_inputs)[0], tf.float32)
# _, h_sdfe = tf.contrib.rnn.dynamic_rnn(lstm_cell, sd_inputs, initial_state=hidden_state)
else:
W_sdfe = tf.Variable(tf.truncated_normal([self.sd_dim, sd_enc_size], stddev=0.01))
b_sdfe = tf.Variable(tf.zeros([sd_enc_size]))
h_sdfe = tf.nn.relu(tf.matmul(sd_inputs, W_sdfe) + b_sdfe)
if keep_prob < 1:
h_sdfe = tf.nn.dropout(h_sdfe, keep_prob)
W_sife = tf.Variable(tf.truncated_normal([self.si_dim, si_enc_size], stddev=0.01))
b_sife = tf.Variable(tf.zeros([si_enc_size]))
h_sife = tf.nn.relu(tf.matmul(si_inputs, W_sife) + b_sife)
if keep_prob < 1:
h_sife = tf.nn.dropout(h_sife, keep_prob)
W_fc1 = tf.Variable(tf.truncated_normal([sd_enc_size+si_enc_size, h1_size], stddev=0.01))
b_fc1 = tf.Variable(tf.zeros([h1_size]))
h_fc1 = tf.nn.relu(tf.matmul(tf.concat((h_sdfe, h_sife), 1), W_fc1) + b_fc1)
W_fc2 = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01))
b_fc2 = tf.Variable(tf.zeros([h2_size]))
h_fc2 = tf.nn.relu(tf.matmul(h_fc1, W_fc2) + b_fc2)
W_out = tf.Variable(tf.truncated_normal([h2_size, self.a_dim], stddev=0.01))
b_out = tf.Variable(tf.zeros([self.a_dim]))
Qout = tf.matmul(h_fc2, W_out) + b_out
return inputs, action, Qout
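# create_rnnfdq_network splits the flat input into a slot-dependent part (sd_dim) and
# a slot-independent part (si_dim). For slot == 'sd' the slot-dependent part is encoded
# with a GRU, otherwise with a plain dense layer; the slot-independent part always goes
# through a dense encoder, and the two encodings are concatenated before the shared
# fully connected layers that produce Qout.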
def predict(self, inputs):
return self.sess.run(self.Qout, feed_dict={  # inputs is a single flat_bstate
self.inputs: inputs
})
def predict_dip(self, inputs, action):
return self.sess.run(self.Qout, feed_dict={  # inputs and action are arrays of 64 (batch size)
self.inputs: inputs,
self.action: action
})
def predict_target(self, inputs):
return self.sess.run(self.target_Qout, feed_dict={  # inputs is a single flat_bstate
self.target_inputs: inputs
})
def predict_target_dip(self, inputs, action):
return self.sess.run(self.target_Qout, feed_dict={  # inputs and action are arrays of 64 (batch size)
self.target_inputs: inputs,
self.target_action: action
})
def train(self, inputs, action, sampled_q):
return self.sess.run([self.pred_q, self.optimize, self.loss], feed_dict={ #all the inputs are arrays of 64
self.inputs: inputs,
self.action: action,
self.sampled_q: sampled_q
})
def clipped_error(self, x):
return tf.where(tf.abs(x) < 1.0, 0.5 * tf.square(x), tf.abs(x) - 0.5) # condition, true, false
def save_network(self, save_filename):
print('Saving deepq-network...')
self.saver.save(self.sess, save_filename)
def update_target_network(self):
self.sess.run(self.update_target_network_params)
def load_network(self, load_filename):
self.saver = tf.train.Saver()
if load_filename.split('.')[-3] != '0':
try:
self.saver.restore(self.sess, load_filename)
print("Successfully loaded:", load_filename)
except:
print("Could not find old network weights")
else:
print('nothing loaded in first iteration')
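Both classes define the same clipped_error, which is the Huber loss with delta = 1. A quick NumPy stand-in (names illustrative) to sanity-check what it computes:

import numpy as np

def clipped_error_np(x):
    # NumPy mirror of the TensorFlow clipped_error above (Huber loss, delta = 1)
    x = np.asarray(x, dtype=np.float64)
    return np.where(np.abs(x) < 1.0, 0.5 * np.square(x), np.abs(x) - 0.5)

diff = np.array([-3.0, -0.5, 0.0, 0.2, 1.0, 4.0])
print(clipped_error_np(diff))      # [2.5   0.125 0.    0.02  0.5   3.5 ]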