Commit 5fa29847 authored by Christian

First running version of FeudalGain

parent 8f6a7ef6
Showing 8178 additions and 0 deletions.
### Error model parameters ###
### set1 ###
correctNBLenDist = [0.264, 0.215, 0.138, 0.099, 0.08]
correctMean = [0.871, 0.103, 0.045, 0.032, 0.027, 0.049]
correctVar = [0.029, 0.01, 0.002, 0.001, 0.0, 0.001]
incorrectNBLenDist = [0.3, 0.068, 0.085, 0.072, 0.079]
incorrectNBPosDist = [0.0, 0.179, 0.063, 0.027, 0.017]
incorrectMean = [0.757, 0.179, 0.077, 0.048, 0.037, 0.065]
incorrectVar = [0.052, 0.014, 0.003, 0.001, 0.0, 0.002]
### Error model parameters ###
### set2 ###
correctNBLenDist = [0.322, 0.455, 0.183, 0.033, 0.006]
correctMean = [0.922, 0.105, 0.024, 0.021, 0.018, 0.022]
correctVar = [0.014, 0.013, 0.002, 0.001, 0.001, 0.001]
incorrectNBLenDist = [0.248, 0.462, 0.189, 0.073, 0.02]
incorrectNBPosDist = [0.0, 0.297, 0.045, 0.008, 0.002]
incorrectMean = [0.819, 0.201, 0.084, 0.052, 0.036, 0.042]
incorrectVar = [0.034, 0.02, 0.005, 0.002, 0.001, 0.004]
### Error model parameters ###
### set3 ###
correctNBLenDist = [0.147, 0.43, 0.212, 0.147, 0.048]
correctMean = [0.905, 0.101, 0.017, 0.008, 0.005, 0.01]
correctVar = [0.016, 0.013, 0.002, 0.001, 0.0, 0.001]
incorrectNBLenDist = [0.351, 0.388, 0.158, 0.071, 0.02]
incorrectNBPosDist = [0.0, 0.116, 0.027, 0.007, 0.002]
incorrectMean = [0.871, 0.164, 0.065, 0.043, 0.03, 0.054]
incorrectVar = [0.029, 0.02, 0.005, 0.002, 0.001, 0.003]
### Error model parameters ###
### set4 ###
correctNBLenDist = [0.143, 0.405, 0.24, 0.155, 0.043]
correctMean = [0.9, 0.104, 0.02, 0.01, 0.004, 0.004]
correctVar = [0.016, 0.012, 0.002, 0.001, 0.0, 0.0]
incorrectNBLenDist = [0.315, 0.387, 0.175, 0.089, 0.024]
incorrectNBPosDist = [0.0, 0.128, 0.042, 0.01, 0.0]
incorrectMean = [0.868, 0.155, 0.064, 0.043, 0.029, 0.038]
incorrectVar = [0.03, 0.02, 0.004, 0.002, 0.001, 0.001]
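Taken together, each set above specifies, for one simulated error condition, the distribution over N-best list lengths (correctNBLenDist / incorrectNBLenDist), the position distribution of incorrect hypotheses (incorrectNBPosDist), and per-position Gaussian confidence-score parameters (mean/var). As a rough, hypothetical sketch of how such parameters could drive a confidence-score sampler (this is not PyDial's actual error-model code; the function and variable names below are assumptions):

import numpy as np

# Hypothetical sampler sketch using the set1 parameters above.
correctNBLenDist = [0.264, 0.215, 0.138, 0.099, 0.08]     # relative weights for N-best length 1..5
correctMean = [0.871, 0.103, 0.045, 0.032, 0.027, 0.049]  # mean confidence per N-best position
correctVar = [0.029, 0.01, 0.002, 0.001, 0.0, 0.001]      # variance of confidence per position

rng = np.random.default_rng(1)

def sample_confidences(len_dist, means, variances):
    p = np.array(len_dist) / np.sum(len_dist)              # renormalise (weights need not sum to 1)
    n = int(rng.choice(len(p), p=p)) + 1                   # sampled N-best length
    conf = rng.normal(means[:n], np.sqrt(variances[:n]))   # Gaussian confidence per position
    conf = np.clip(conf, 1e-6, 1.0)
    return conf / conf.sum()                               # renormalise to a distribution

print(sample_confidences(correctNBLenDist, correctMean, correctVar))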
# Error model: 15% error rate, DSTC2 confscorer, DSTC2 nbestgenerator
# User model: standard sampled params, sampled patience
# Masks: off
###### General parameters ######
[GENERAL]
domains = CamRestaurants
singledomain = True
tracedialog = 0
seed = 1
[exec_config]
configdir = _benchmarkpolicies/env3-feudal
logfiledir = _benchmarklogs/env3-feudal
numtrainbatches = 20
traindialogsperbatch = 200
numbatchtestdialogs = 500
trainsourceiteration = 0
numtestdialogs = 500
trainerrorrate = 15
testerrorrate = 15
testeverybatch = True
deleteprevpolicy = True
[logging]
usecolor = False
screen_level = results
file_level = results
file = auto
###### Environment parameters ######
[agent]
maxturns = 25
[usermodel]
usenewgoalscenarios = True
oldstylepatience = False
patience = 4,6
configfile = config/sampledUM.cfg
[errormodel]
nbestsize = 5
confusionmodel = LevenshteinConfusions
nbestgeneratormodel = DSTC2NBestGenerator
confscorer = DSTC2
configfile = config/set1-ErrorModel.cfg
[summaryacts]
maxinformslots = 5
informmask = True
requestmask = True
informcountaccepted = 4
byemask = True
###### Dialogue Manager parameters ######
[policy]
policydir = _benchmarkpolicies/env3-feudal
belieftype = focus
useconfreq = False
learning = True
policytype = feudalgain
startwithhello = False
inpolicyfile = auto
outpolicyfile = auto
temperature = 0.0
noisy_acer = True
sample_argmax = False
[feudalpolicy]
features = learned
si_policy_type = acer
only_master = True
jsd_reward = True
#jsd_function = tanh
js_threshold = 0.2
js_threshold_master = 1
[i2a]
is_imaging = False
deepmind = False
load_pretrain_data = False
improve_env = False
share_layer = 2
new_q_loss = False
device = cpu
env_model_path = env_model/env1_acer_200.pkl
[dqnpolicy]
q_update = double
architecture = duel
#architecture = duel
h1_size = 300
h2_size = 100
capacity = 2000
beta = 0.95
epsilon_start = 0.3
maxiter = 4000
minibatch_size = 64
is_threshold = 5.0
episodeNum = 0.0
epsilon_end = 0.0
n_in = 268
features = ["discourseAct", "method", "requested", "full", "lastActionInformNone", "offerHappened", "inform_info"]
###### Evaluation parameters ######
[eval]
rewardvenuerecommended = 0
penaliseallturns = True
wrongvenuepenalty = 0
notmentionedvaluepenalty = 0
successmeasure = objective
successreward = 20
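For orientation, the [policy] and [feudalpolicy] blocks above are what activate FeudalGain: policytype = feudalgain routes to FeudalGainPolicy (see the PolicyManager diff below), features = learned selects the learned state encoders, and jsd_reward / js_threshold control the information-gain signal. A minimal, illustrative sketch of reading these options with Python's standard configparser (PyDial itself goes through utils.Settings; the filename used here is an assumption):

import configparser

cfg = configparser.ConfigParser()
cfg.read('env3-feudal-CamRestaurants.cfg')  # hypothetical name for the config file above

policy_type = cfg.get('policy', 'policytype')               # 'feudalgain'
features = cfg.get('feudalpolicy', 'features')              # 'learned'
jsd_reward = cfg.getboolean('feudalpolicy', 'jsd_reward')   # True
js_threshold = cfg.getfloat('feudalpolicy', 'js_threshold') # 0.2
train_err = cfg.getint('exec_config', 'trainerrorrate')     # 15

print(policy_type, features, jsd_reward, js_threshold, train_err)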
@@ -303,6 +303,9 @@ class PolicyManager(object):
        elif policy_type == 'feudalAC':
            from policy import FeudalACPolicy
            self.domainPolicies[domainString] = FeudalACPolicy.FeudalACPolicy(in_policy_file, out_policy_file, domainString, learning)
+       elif policy_type == 'feudalgain':
+           from policy import FeudalGainPolicy
+           self.domainPolicies[domainString] = FeudalGainPolicy.FeudalGainPolicy(in_policy_file, out_policy_file, domainString, learning)
        else:
            try:
                # try to view the config string as a complete module path to the class to be instantiated
...
###############################################################################
# PyDial: Multi-domain Statistical Spoken Dialogue System Software
###############################################################################
#
# Copyright 2015 - 2019
# Cambridge University Engineering Department Dialogue Systems Group
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
###############################################################################
'''
FeudalBBQNPolicy.py - feudal Bayes-by-Backprop deep Q network policy
==================================================
Author: Chris Tegho and Pei-Hao (Eddy) Su (Copyright CUED Dialogue Systems Group 2016)
.. seealso:: CUED Imports/Dependencies:
import :class:`Policy`
import :class:`utils.ContextLogger`
.. warning::
Documentation not done.
************************
'''
import copy
import os
import json
import numpy as np
import pickle as pickle
import random
import sys
import utils
from utils.Settings import config as cfg
from utils import ContextLogger, DiaAct, DialogueState
import ontology.FlatOntologyManager as FlatOnt
# from theano_dialogue.util.tool import *
import tensorflow as tf
from policy.DRL.replay_bufferVanilla import ReplayBuffer
from policy.DRL.replay_prioritisedVanilla import ReplayPrioritised
import policy.DRL.utils as drlutils
from policy.DRL import bdqn as bbqn
import policy.Policy
import policy.SummaryAction
import policy.BBQNPolicy
from policy.Policy import TerminalAction, TerminalState
from policy.feudalRL.DIP_parametrisation import DIP_state, padded_state
logger = utils.ContextLogger.getLogger('')
# --- for flattening the belief --- #
domainUtil = FlatOnt.FlatDomainOntology('CamRestaurants')
class FeudalBBQNPolicy(policy.BBQNPolicy.BBQNPolicy):
'''Derived from :class:`BBQNPolicy`
'''
def __init__(self, in_policy_file, out_policy_file, domainString='CamRestaurants', is_training=False,
action_names=None, slot=None):
super(FeudalBBQNPolicy, self).__init__(in_policy_file, out_policy_file, domainString, is_training)
tf.reset_default_graph()
self.domainString = domainString
self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString)
self.in_policy_file = in_policy_file
self.out_policy_file = out_policy_file
self.is_training = is_training
self.accum_belief = []
self.prev_state_check = None
self.episode_ave_max_q = []
self.capacity *= 4 #set the capacity for episode methods, multiply it to adjust to turn based methods
self.slot = slot
# init session
self.sess = tf.Session()
with tf.device("/cpu:0"):
np.random.seed(self.randomseed)
tf.set_random_seed(self.randomseed)
# initialise a replay buffer
if self.replay_type == 'vanilla':
self.episodes[self.domainString] = ReplayBuffer(self.capacity, self.minibatch_size, self.randomseed)
elif self.replay_type == 'prioritized':
self.episodes[self.domainString] = ReplayPrioritised(self.capacity, self.minibatch_size,
self.randomseed)
# replay_buffer = ReplayBuffer(self.capacity, self.randomseed)
# self.episodes = []
self.samplecount = 0
self.episodecount = 0
# construct the models
self.state_dim = 89 # current DIP state dim
self.summaryaction = policy.SummaryAction.SummaryAction(domainString)
self.action_names = action_names
self.action_dim = len(self.action_names)
action_bound = len(self.action_names)
self.stats = [0 for _ in range(self.action_dim)]
self.stdVar = []
self.meanVar = []
self.stdMean = []
self.meanMean = []
self.td_error = []
self.td_errorVar = []
self.target_update_freq = 1
if cfg.has_option('bbqnpolicy', 'target_update_freq'):
self.target_update_freq = cfg.getint('bbqnpolicy', 'target_update_freq')
#feudal params
self.features = 'dip'
self.sd_enc_size = 25
self.si_enc_size = 50
self.dropout_rate = 0.
if cfg.has_option('feudalpolicy', 'features'):
self.features = cfg.get('feudalpolicy', 'features')
if cfg.has_option('feudalpolicy', 'sd_enc_size'):
self.sd_enc_size = cfg.getint('feudalpolicy', 'sd_enc_size')
if cfg.has_option('feudalpolicy', 'si_enc_size'):
self.si_enc_size = cfg.getint('feudalpolicy', 'si_enc_size')
if cfg.has_option('feudalpolicy', 'dropout_rate') and self.is_training:
self.dropout_rate = cfg.getfloat('feudalpolicy', 'dropout_rate')
self.actfreq_ds = False
if cfg.has_option('feudalpolicy', 'actfreq_ds'):
self.actfreq_ds = cfg.getboolean('feudalpolicy', 'actfreq_ds')
if self.features == 'dip':
if self.actfreq_ds:
if self.domainString == 'CamRestaurants':
self.state_dim += 16
elif self.domainString == 'SFRestaurants':
self.state_dim += 25
elif self.domainString == 'Laptops11':
self.state_dim += 40
self.bbqn = bbqn.DeepQNetwork(self.sess, self.state_dim, self.action_dim, self.learning_rate, self.tau,
action_bound, self.architecture, self.h1_size, self.h2_size,
self.n_samples,
self.minibatch_size, self.sigma_prior, self.n_batches, self.stddev_var_mu,
self.stddev_var_logsigma, self.mean_log_sigma, self.importance_sampling,
self.alpha_divergence, self.alpha, self.sigma_eps)
elif self.features == 'learned' or self.features == 'rnn':
si_state_dim = 72
if self.actfreq_ds:
if self.domainString == 'CamRestaurants':
si_state_dim += 16
elif self.domainString == 'SFRestaurants':
si_state_dim += 25
elif self.domainString == 'Laptops11':
si_state_dim += 40
if self.domainString == 'CamRestaurants':
sd_state_dim = 94
elif self.domainString == 'SFRestaurants':
sd_state_dim = 158
elif self.domainString == 'Laptops11':
sd_state_dim = 13
else:
logger.error('Domain {} not implemented in feudal-DQN yet'.format(self.domainString)) # just find out the size of sd_state_dim for the new domain
if self.features == 'rnn':
arch = 'rnn'
self.state_dim = si_state_dim + sd_state_dim
self.bbqn = bbqn.RNNBBQNetwork(self.sess, si_state_dim, sd_state_dim, self.action_dim, self.learning_rate,
self.tau, action_bound, arch, self.h1_size, self.h2_size, self.n_samples,
self.minibatch_size, self.sigma_prior, self.n_batches, self.stddev_var_mu,
self.stddev_var_logsigma, self.mean_log_sigma, self.importance_sampling,
self.alpha_divergence, self.alpha, self.sigma_eps, sd_enc_size=self.sd_enc_size,
si_enc_size=self.sd_enc_size, dropout_rate=self.dropout_rate, slot=slot)
else:
arch = 'vanilla'
self.state_dim = si_state_dim + sd_state_dim
self.bbqn = bbqn.NNBBQNetwork(self.sess, si_state_dim, sd_state_dim, self.action_dim, self.learning_rate,
self.tau, action_bound, arch, self.h1_size, self.h2_size, self.n_samples,
self.minibatch_size, self.sigma_prior, self.n_batches, self.stddev_var_mu,
self.stddev_var_logsigma, self.mean_log_sigma, self.importance_sampling,
self.alpha_divergence, self.alpha, self.sigma_eps, sd_enc_size=self.sd_enc_size,
si_enc_size=self.sd_enc_size, dropout_rate=self.dropout_rate, slot=slot)
else:
logger.error('features "{}" not implemented'.format(self.features))
# when all models are defined, init all variables
init_op = tf.global_variables_initializer()
self.sess.run(init_op)
self.loadPolicy(self.in_policy_file)
print('loaded replay size: ', self.episodes[self.domainString].size())
self.bbqn.update_target_network()
def record(self, reward, domainInControl=None, weight=None, state=None, action=None, exec_mask=None):
if domainInControl is None:
domainInControl = self.domainString
if self.actToBeRecorded is None:
# self.actToBeRecorded = self.lastSystemAction
self.actToBeRecorded = self.summaryAct
if state is None:
state = self.prevbelief
if action is None:
action = self.actToBeRecorded
cState, cAction = state, action
reward /= 20.0
cur_cState = np.vstack([np.expand_dims(x, 0) for x in [cState]])
cur_action_q = self.bbqn.predict(cur_cState)
cur_target_q = self.bbqn.predict_target(cur_cState)
if exec_mask is not None:
admissible = np.add(cur_target_q, np.array(exec_mask))
else:
admissible = cur_target_q
Q_s_t_a_t_ = cur_action_q[0][cAction]
gamma_Q_s_tplu1_maxa_ = self.gamma * np.max(admissible)
if weight is None:
if self.replay_type == 'vanilla':
self.episodes[domainInControl].record(state=cState, \
state_ori=state, action=cAction, reward=reward)
elif self.replay_type == 'prioritized':
# heuristically assign 0.0 to Q_s_t_a_t_ and Q_s_tplu1_maxa_, doesn't matter as it is not used
if True:
# if self.samplecount >= self.capacity:
self.episodes[domainInControl].record(state=cState, \
state_ori=state, action=cAction, reward=reward, \
Q_s_t_a_t_=Q_s_t_a_t_,
gamma_Q_s_tplu1_maxa_=gamma_Q_s_tplu1_maxa_, uniform=False)
else:
self.episodes[domainInControl].record(state=cState, \
state_ori=state, action=cAction, reward=reward, \
Q_s_t_a_t_=Q_s_t_a_t_,
gamma_Q_s_tplu1_maxa_=gamma_Q_s_tplu1_maxa_, uniform=True)
else:
self.episodes[domainInControl].record(state=cState, state_ori=state, action=cAction, reward=reward,
ma_weight=weight)
self.actToBeRecorded = None
self.samplecount += 1
return
def finalizeRecord(self, reward, domainInControl=None):
if domainInControl is None:
domainInControl = self.domainString
if self.episodes[domainInControl] is None:
logger.warning("record attempted to be finalized for domain where nothing has been recorded before")
return
# normalising total return to -1~1
# if reward == 0:
# reward = -20.0
reward /= 20.0
"""
if reward == 20.0:
reward = 1.0
else:
reward = -0.5
"""
# reward = float(reward+10.0)/40.0
terminal_state, terminal_action = self.convertStateAction(TerminalState(), TerminalAction())
if self.replay_type == 'vanilla':
self.episodes[domainInControl].record(state=terminal_state, \
state_ori=TerminalState(), action=terminal_action, reward=reward,
terminal=True)
elif self.replay_type == 'prioritized':
# heuristically assign 0.0 to Q_s_t_a_t_ and Q_s_tplu1_maxa_, doesn't matter as it is not used
if True:
# if self.samplecount >= self.capacity:
self.episodes[domainInControl].record(state=terminal_state, \
state_ori=TerminalState(), action=terminal_action, reward=reward, \
Q_s_t_a_t_=0.0, gamma_Q_s_tplu1_maxa_=0.0, uniform=False,
terminal=True)
else:
self.episodes[domainInControl].record(state=terminal_state, \
state_ori=TerminalState(), action=terminal_action, reward=reward, \
Q_s_t_a_t_=0.0, gamma_Q_s_tplu1_maxa_=0.0, uniform=True,
terminal=True)
def convertStateAction(self, state, action):
'''
'''
if isinstance(state, TerminalState):
return [0] * 89, action
else:
if self.features == 'learned' or self.features == 'rnn':
dip_state = padded_state(state.domainStates[state.currentdomain], self.domainString)
else:
dip_state = DIP_state(state.domainStates[state.currentdomain], self.domainString)
action_name = self.actions.action_names[action]
act_slot = 'general'
for slot in dip_state.slots:
if slot in action_name:
act_slot = slot
flat_belief = dip_state.get_beliefStateVec(act_slot)
self.prev_state_check = flat_belief
return flat_belief, action
def nextAction(self, beliefstate):
'''
select next action
:param beliefstate:
:param hyps:
:returns: (int) next summary action
'''
if self.exploration_type == 'e-greedy':
# epsilon greedy
if self.is_training and utils.Settings.random.rand() < self.epsilon:
action_Q = np.random.rand(len(self.action_names))
else:
action_Q = self.bbqn.predict(np.reshape(beliefstate, (1, len(beliefstate)))) # + (1. / (1. + i + j))
self.episode_ave_max_q.append(np.max(action_Q))
# return the Q vector; the action will be converted in the feudal policy
return action_Q
def train(self):
'''
call this function when the episode ends
'''
if not self.is_training:
logger.info("Not in training mode")
return
else:
logger.info("Update dqn policy parameters.")
self.episodecount += 1
logger.info("Sample Num so far: %s" % (self.samplecount))
logger.info("Episode Num so far: %s" % (self.episodecount))
if self.samplecount >= self.minibatch_size * 10 and self.episodecount % self.training_frequency == 0:
logger.info('start training...')
s_batch, s_ori_batch, a_batch, r_batch, s2_batch, s2_ori_batch, t_batch, idx_batch, _ = \
self.episodes[self.domainString].sample_batch()
s_batch = np.vstack([np.expand_dims(x, 0) for x in s_batch])
s2_batch = np.vstack([np.expand_dims(x, 0) for x in s2_batch])
a_batch_one_hot = np.eye(self.action_dim, self.action_dim)[a_batch]
action_q = self.bbqn.predict_dip(s2_batch, a_batch_one_hot)
target_q = self.bbqn.predict_target_dip(s2_batch, a_batch_one_hot)
# print 'action Q and target Q:', action_q, target_q
y_i = []
for k in range(min(self.minibatch_size, self.episodes[self.domainString].size())):
Q_bootstrap_label = 0
if t_batch[k]:
Q_bootstrap_label = r_batch[k]
else:
if self.q_update == 'single':
belief = s2_ori_batch[k]
execMask = [0.0] * len(self.action_names) # TODO: find out how to compute the mask here, or save it when recording the state
execMask[-1] = -sys.maxsize
action_Q = target_q[k]
admissible = np.add(action_Q, np.array(execMask))
Q_bootstrap_label = r_batch[k] + self.gamma * np.max(admissible)
y_i.append(Q_bootstrap_label)
# Update the critic given the targets
reshaped_yi = np.vstack([np.expand_dims(x, 0) for x in y_i])
predicted_q_value, _, currentLoss, logLikelihood, varFC2, meanFC2, td_error, KL_div = self.bbqn.train(s_batch, a_batch_one_hot, reshaped_yi, self.episodecount)
if self.episodecount % self.target_update_freq == 0:
self.bbqn.update_target_network()
if self.episodecount % self.save_step == 0:
self.savePolicyInc() # self.out_policy_file)
# END OF FILE
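As the comment in nextAction() notes, this policy returns the raw Q vector and leaves action selection to the feudal policy that called it. Purely as a hedged illustration (not the actual FeudalGain code), a caller could combine that Q vector with an execution mask like this:

import numpy as np

def select_masked_action(action_q, exec_mask):
    # exec_mask holds 0.0 for admissible summary actions and a large negative value otherwise
    admissible = np.add(np.asarray(action_q).flatten(), np.asarray(exec_mask))
    return int(np.argmax(admissible))

q_vector = np.random.rand(1, 5)    # e.g. the array returned by nextAction()
mask = [0.0, 0.0, -1e9, 0.0, 0.0]  # third action masked out (illustrative)
print(select_masked_action(q_vector, mask))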
###############################################################################
# PyDial: Multi-domain Statistical Spoken Dialogue System Software
###############################################################################
#
# Copyright 2015 - 2019
# Cambridge University Engineering Department Dialogue Systems Group
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
###############################################################################
'''
FeudalDQNPolicy.py - feudal deep Q network policy
==================================================
Copyright CUED Dialogue Systems Group 2015 - 2017
.. seealso:: CUED Imports/Dependencies:
import :class:`Policy`
import :class:`utils.ContextLogger`
.. warning::
Documentation not done.
************************
'''
import copy
import os
import sys
import json
import numpy as np
import pickle as pickle
from itertools import product
from scipy.stats import entropy
import utils
from utils.Settings import config as cfg
from utils import ContextLogger, DiaAct, DialogueState
import ontology.FlatOntologyManager as FlatOnt
import tensorflow as tf
from policy.DRL.replay_buffer import ReplayBuffer
from policy.DRL.replay_prioritised import ReplayPrioritised
import policy.DRL.utils as drlutils
import policy.DRL.dqn as dqn
import policy.Policy
import policy.DQNPolicy
import policy.SummaryAction
from policy.Policy import TerminalAction, TerminalState
from policy.feudalRL.DIP_parametrisation import DIP_state, padded_state
from policy.feudalRL.feudalUtils import get_feudal_masks
from policy.DRL import bdqn as bbqn
logger = utils.ContextLogger.getLogger('')
class FeudalDQNPolicy(policy.DQNPolicy.DQNPolicy):
'''Derived from :class:`DQNPolicy`
'''
def __init__(self, in_policy_file, out_policy_file, domainString='CamRestaurants', is_training=False,
action_names=None, slot=None, sd_state_dim=50, js_threshold=0, info_reward=0.0, jsd_reward=False,
jsd_function=None):
super(FeudalDQNPolicy, self).__init__(in_policy_file, out_policy_file, domainString, is_training)
tf.reset_default_graph()
self.domainString = domainString
self.sd_state_dim = sd_state_dim
self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString)
self.in_policy_file = in_policy_file
self.out_policy_file = out_policy_file
self.is_training = is_training
self.accum_belief = []
self.info_reward = info_reward
self.js_threshold = js_threshold
self.jsd_reward = jsd_reward
self.jsd_function = jsd_function
if self.jsd_function is not None:
print("We use the JSD-function", self.jsd_function)
if self.js_threshold != 1.0 and not self.jsd_reward:
print("We use JS-divergence, threshold =", self.js_threshold)
if self.jsd_reward:
print("We train with raw JSD reward.")
self.slots = slot
self.features = 'dip'
if cfg.has_option('feudalpolicy', 'features'):
self.features = cfg.get('feudalpolicy', 'features')
self.actfreq_ds = False
if cfg.has_option('feudalpolicy', 'actfreq_ds'):
self.actfreq_ds = cfg.getboolean('feudalpolicy', 'actfreq_ds')
self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString)
self.prev_state_check = None
self.max_k = 5
if cfg.has_option('dqnpolicy', 'max_k'):
self.max_k = cfg.getint('dqnpolicy', 'max_k')
self.capacity *= 5 # capacity for episode methods, multiply it to adjust to turn based methods
# init session
self.sess = tf.Session()
with tf.device("/cpu:0"):
np.random.seed(self.randomseed)
tf.set_random_seed(self.randomseed)
# initialise a replay buffer
if self.replay_type == 'vanilla':
self.episodes[self.domainString] = ReplayBuffer(self.capacity, self.minibatch_size*4, self.randomseed)
elif self.replay_type == 'prioritized':
self.episodes[self.domainString] = ReplayPrioritised(self.capacity, self.minibatch_size,
self.randomseed)
self.samplecount = 0
self.episodecount = 0
# construct the models
self.summaryaction = policy.SummaryAction.SummaryAction(domainString)
self.action_names = action_names
self.action_dim = len(self.action_names)
action_bound = len(self.action_names)
self.stats = [0 for _ in range(self.action_dim)]
if self.features == 'learned' or self.features == 'rnn':
si_state_dim = 73
if self.actfreq_ds:
if self.domainString == 'CamRestaurants':
si_state_dim += 9#16
elif self.domainString == 'SFRestaurants':
si_state_dim += 9#25
elif self.domainString == 'Laptops11':
si_state_dim += 9#40
self.sd_enc_size = 50
self.si_enc_size = 25
self.dropout_rate = 0.
if cfg.has_option('feudalpolicy', 'sd_enc_size'):
self.sd_enc_size = cfg.getint('feudalpolicy', 'sd_enc_size')
if cfg.has_option('feudalpolicy', 'si_enc_size'):
self.si_enc_size = cfg.getint('feudalpolicy', 'si_enc_size')
if cfg.has_option('feudalpolicy', 'dropout_rate') and self.is_training:
self.dropout_rate = cfg.getfloat('feudalpolicy', 'dropout_rate')
self.state_dim = si_state_dim + sd_state_dim
if self.features == 'learned':
self.dqn = bbqn.DeepQNetwork(self.sess, self.state_dim, self.action_dim, self.learning_rate,
self.tau,
action_bound, self.architecture, self.h1_size, self.h2_size,
self.n_samples,
self.minibatch_size)
elif self.features == 'rnn':
self.dqn = dqn.RNNFDeepQNetwork(self.sess, si_state_dim, sd_state_dim, self.action_dim,
self.learning_rate, self.tau, action_bound, self.minibatch_size,
self.architecture, self.h1_size, self.h2_size,
sd_enc_size=self.sd_enc_size, si_enc_size=self.si_enc_size,
dropout_rate=self.dropout_rate, slot=self.slots)
else: # self.features = 'dip'
if self.actfreq_ds:
if self.domainString == 'CamRestaurants':
self.state_dim += 9#16
elif self.domainString == 'SFRestaurants':
self.state_dim += 9#25
elif self.domainString == 'Laptops11':
self.state_dim += 9#40
self.dqn = dqn.DeepQNetwork(self.sess, self.state_dim, self.action_dim,
self.learning_rate, self.tau, action_bound, self.minibatch_size,
self.architecture, self.h1_size,
self.h2_size, dropout_rate=self.dropout_rate)
# when all models are defined, init all variables (this might need to be moved to the main policy too)
init_op = tf.global_variables_initializer()
self.sess.run(init_op)
self.loadPolicy(self.in_policy_file)
print('loaded replay size: ', self.episodes[self.domainString].size())
self.dqn.update_target_network()
def record(self, reward, domainInControl=None, weight=None, state=None, action=None, exec_mask=None):
if domainInControl is None:
domainInControl = self.domainString
if self.actToBeRecorded is None:
self.actToBeRecorded = self.summaryAct
if state is None:
state = self.prevbelief
if action is None:
action = self.actToBeRecorded
cState, cAction = state, action
# normalising total return to -1~1
reward /= 20.0
if self.replay_type == 'vanilla':
self.episodes[domainInControl].record(state=cState, \
state_ori=state, action=cAction, reward=reward)
self.actToBeRecorded = None
self.samplecount += 1
def finalizeRecord(self, reward, domainInControl=None):
if domainInControl is None:
domainInControl = self.domainString
if self.episodes[domainInControl] is None:
logger.warning("record attempted to be finalized for domain where nothing has been recorded before")
return
reward /= 20.0
terminal_state, terminal_action = self.convertStateAction(TerminalState(), TerminalAction())
if self.replay_type == 'vanilla':
self.episodes[domainInControl].record(state=terminal_state, \
state_ori=TerminalState(), action=terminal_action, reward=reward,
terminal=True)
elif self.replay_type == 'prioritized':
self.episodes[domainInControl].record(state=terminal_state, \
state_ori=TerminalState(), action=terminal_action, reward=reward, \
Q_s_t_a_t_=0.0, gamma_Q_s_tplu1_maxa_=0.0, uniform=False,
terminal=True)
print('total TD', self.episodes[self.domainString].tree.total())
def convertStateAction(self, state, action):
'''
'''
if isinstance(state, TerminalState):
return [0] * 89, action
else:
if self.features == 'learned' or self.features == 'rnn':
dip_state = padded_state(state.domainStates[state.currentdomain], self.domainString)
else:
dip_state = DIP_state(state.domainStates[state.currentdomain], self.domainString)
action_name = self.actions.action_names[action]
act_slot = 'general'
for slot in dip_state.slots:
if slot in action_name:
act_slot = slot
flat_belief = dip_state.get_beliefStateVec(act_slot)
self.prev_state_check = flat_belief
return flat_belief, action
def nextAction(self, beliefstate):
'''
select next action
:param beliefstate: already converted to dipstatevec of the specific slot (or general)
:returns: (int) next summary action
'''
if self.exploration_type == 'e-greedy':
# epsilon greedy
if self.is_training and utils.Settings.random.rand() < self.epsilon:
action_Q = np.random.rand(len(self.action_names))
else:
if len(beliefstate.shape) == 1:
action_Q = self.dqn.predict(np.reshape(beliefstate, (1, -1)))
else:
action_Q = self.dqn.predict(beliefstate)
# add current max Q to self.episode_ave_max_q
self.episode_ave_max_q.append(np.max(action_Q))
# return the Q vector; the action will be converted in the feudal policy
return action_Q
def train(self):
'''
call this function when the episode ends
'''
if not self.is_training:
logger.info("Not in training mode")
return
else:
logger.info("Update dqn policy parameters.")
self.episodecount += 1
logger.info("Sample Num so far: %s" % (self.samplecount))
logger.info("Episode Num so far: %s" % (self.episodecount))
s_batch_new, s_batch_beliefstate, s_batch_chosen_slot, s2_batch_dipstate, s2_batch_beliefstate, t_batch_new, r_batch_new = \
[], [], [], [], [], [], []
if self.samplecount >= self.minibatch_size * 8 and self.episodecount % self.training_frequency == 0:
logger.info('start training...')
a_batch_one_hot_new = None
#updating only states where the action is not "pass()" complicates things :/
#since in a batch we can take only non-pass() actions, we have to loop a bit until we get enough samples
while len(s_batch_new) < self.minibatch_size:
s_batch, s_ori_batch, a_batch, r_batch, s2_batch, s2_ori_batch, t_batch, idx_batch, _ = \
self.episodes[self.domainString].sample_batch()
a_batch_one_hot = np.eye(self.action_dim, self.action_dim)[a_batch]
#we only want to update state-action pairs where action != pass()
valid_steps = [action[-1] != 1 for action in a_batch_one_hot]
a_batch_one_hot = a_batch_one_hot[valid_steps]
s_batch_new += [s[0] for i, s in enumerate(s_batch) if valid_steps[i]]
s_batch_beliefstate += [s[1] for i, s in enumerate(s_batch) if valid_steps[i]]
s_batch_chosen_slot += [s[2] for i, s in enumerate(s_batch) if valid_steps[i]]
s2_batch_dipstate += [s[3] for s, valid in zip(s2_batch, valid_steps) if valid]
s2_batch_beliefstate += [s[1] for s, valid in zip(s2_batch, valid_steps) if valid]
r_batch_new += [r for r, valid in zip(r_batch, valid_steps) if valid]
t_batch_new += [t for t, valid in zip(t_batch, valid_steps) if valid]
if a_batch_one_hot_new is None:
a_batch_one_hot_new = a_batch_one_hot
else:
a_batch_one_hot_new = np.vstack((a_batch_one_hot_new, a_batch_one_hot))
s_batch_new = np.vstack(s_batch_new)
s2_batch_dipstate = np.vstack(s2_batch_dipstate)
if self.js_threshold < 1.0 or self.jsd_reward:
#TODO: This is highly inefficient
js_divergence_batch = []
for belief, belief2, slot in zip(s_batch_beliefstate, s2_batch_beliefstate, s_batch_chosen_slot):
if slot != "None":
keys = belief['beliefs'][slot].keys()
b = [belief['beliefs'][slot]['**NONE**']] + \
[belief['beliefs'][slot][value] for value in list(keys) if value != '**NONE**']
b_2 = [belief2['beliefs'][slot]['**NONE**']] + \
[belief2['beliefs'][slot][value] for value in list(keys) if value != '**NONE**']
js_divergence = self.compute_js_divergence(b, b_2)
js_divergence_batch.append(js_divergence)
else:
js_divergence_batch.append(0.0)
else:
js_divergence_batch = [0] * len(r_batch_new)
tanh_n = np.tanh(1)
if self.jsd_reward:
if self.jsd_function == 'tanh':
js_divergence_batch = np.tanh(np.array(js_divergence_batch)) / tanh_n
#normalize jsd between -1 and 1
js_divergence_batch = (-1 + 2 * np.array(js_divergence_batch)).tolist()
elif self.js_threshold < 1.0:
# normalizing bound to [0, 2] and then /20
js_divergence_batch = [2/20 * int(x > self.js_threshold) for x in js_divergence_batch]
action_q = self.dqn.predict_dip(s2_batch_dipstate, a_batch_one_hot_new)
target_q = self.dqn.predict_target_dip(s2_batch_dipstate, a_batch_one_hot_new)
action_q = np.reshape(action_q, (s_batch_new.shape[0], -1, self.action_dim))
target_q = np.reshape(target_q, (s_batch_new.shape[0], -1, self.action_dim))
y_i = []
for k in range(min(s_batch_new.shape[0], self.episodes[self.domainString].size())):
Q_bootstrap_label = 0
if t_batch_new[k]:
Q_bootstrap_label = r_batch_new[k]
else:
if self.q_update == 'single':
action_Q = target_q[k]
if self.jsd_reward:
Q_bootstrap_label = js_divergence_batch[k] + self.gamma * np.max(action_Q)
else:
Q_bootstrap_label = r_batch_new[k] + js_divergence_batch[k] + self.gamma * np.max(action_Q)
elif self.q_update == 'double':
action_Q = action_q[k]
argmax_tuple = np.unravel_index(np.argmax(action_Q, axis=None), action_Q.shape)
value_Q = target_q[k][argmax_tuple]
Q_bootstrap_label = r_batch_new[k] + js_divergence_batch[k] + self.gamma * value_Q
y_i.append(Q_bootstrap_label)
if self.replay_type == 'prioritized':
# update the sum-tree
# update the TD error of the samples in the minibatch
currentQ_s_a_ = action_q[k][a_batch[k]]
error = abs(currentQ_s_a_ - Q_bootstrap_label)
self.episodes[self.domainString].update(idx_batch[k], error)
reshaped_yi = np.vstack([np.expand_dims(x, 0) for x in y_i])
predicted_q_value, _, currentLoss = self.dqn.train(s_batch_new, a_batch_one_hot_new, reshaped_yi)
if self.episodecount % 1 == 0:
# Update target networks
self.dqn.update_target_network()
self.savePolicyInc()
def compute_js_divergence(self, P, Q):
# scipy's entropy() renormalises its arguments, so the unscaled sum P+Q acts as the mixture (P+Q)/2
M = [p + q for p, q in zip(P, Q)]
return 0.5 * (entropy(P, M, base=2) + entropy(Q, M, base=2))
# END OF FILE
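The information-gain signal used in train() above comes from compute_js_divergence(): the Jensen-Shannon divergence between the belief over the chosen slot before and after the turn. Since scipy.stats.entropy renormalises its arguments, passing the unscaled sum P+Q as M still yields the mixture (P+Q)/2. A small standalone check with made-up belief values:

from scipy.stats import entropy

def compute_js_divergence(P, Q):
    # same computation as in FeudalDQNPolicy; entropy() renormalises M internally
    M = [p + q for p, q in zip(P, Q)]
    return 0.5 * (entropy(P, M, base=2) + entropy(Q, M, base=2))

before = [0.70, 0.20, 0.10]  # belief over {'**NONE**', 'italian', 'chinese'} before the turn (illustrative)
after = [0.05, 0.90, 0.05]   # belief after the user gives the slot value

jsd = compute_js_divergence(before, after)
print(round(jsd, 3))         # lies in [0, 1]; with js_threshold = 0.2 this turn would trigger the bonus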
###############################################################################
# PyDial: Multi-domain Statistical Spoken Dialogue System Software
###############################################################################
#
# Copyright 2015 - 2019
# Cambridge University Engineering Department Dialogue Systems Group
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
###############################################################################
"""
Implementation of DQAN - Deep Q Action Network
The algorithm is developed with tflearn + Tensorflow
Author: Pei-Hao Su
"""
import tensorflow as tf
import numpy as np
import tflearn
from policy.DRL.replay_buffer import ReplayBuffer
# ===========================
# Deep Q Action Network
# ===========================
class DeepQNetwork(object):
"""
Input to the network is the state and action, output is Q(s,a).
"""
def __init__(self, sess, state_dim, action_dim, learning_rate, tau, \
num_actor_vars, architecture = 'duel', h1_size = 130, h2_size = 50):
self.sess = sess
self.s_dim = state_dim
self.a_dim = action_dim
self.learning_rate = learning_rate
self.tau = tau
self.architecture = architecture
self.h1_size = h1_size
self.h2_size = h2_size
# Create the deep Q network
self.inputs, self.action, self.Qout = \
self.create_ddq_network(self.architecture, self.h1_size, self.h2_size)
self.network_params = tf.trainable_variables()
# Target Network
self.target_inputs, self.target_action, self.target_Qout = \
self.create_ddq_network(self.architecture, self.h1_size, self.h2_size)
self.target_network_params = tf.trainable_variables()[len(self.network_params):]
# Op for periodically updating target network
self.update_target_network_params = \
[self.target_network_params[i].assign(\
tf.multiply(self.network_params[i], self.tau) + tf.multiply(self.target_network_params[i], 1. - self.tau))
for i in range(len(self.target_network_params))]
# Network target (y_i)
self.sampled_q = tf.placeholder(tf.float32, [None, 1])
# Predicted Q given state and chosen action
#actions_one_hot = self.action
#self.pred_q = tf.reduce_sum(self.Qout * actions_one_hot, reduction_indices=1, name='q_acted')
self.pred_q = self.Qout
self.diff = self.sampled_q - self.pred_q
self.loss = tf.reduce_mean(self.clipped_error(self.diff), name='loss')
self.optimizer = tf.train.AdamOptimizer(self.learning_rate)
self.optimize = self.optimizer.minimize(self.loss)
def create_ddq_network(self, architecture = 'duel', h1_size = 130, h2_size = 50):
inputs = tf.placeholder(tf.float32, [None, self.s_dim])
action = tf.placeholder(tf.float32, [None, self.a_dim])
# state network
W_fc1_s = tf.Variable(tf.truncated_normal([self.s_dim, h1_size], stddev=0.01))
b_fc1_s = tf.Variable(tf.zeros([h1_size]))
h_fc1_s = tf.nn.relu(tf.matmul(inputs, W_fc1_s) + b_fc1_s)
# action network
W_fc1_a = tf.Variable(tf.truncated_normal([self.a_dim, h1_size], stddev=0.01))
b_fc1_a = tf.Variable(tf.zeros([h1_size]))
h_fc1_a = tf.nn.relu(tf.matmul(action, W_fc1_a) + b_fc1_a)
#h_fc1 = tf.nn.tanh(tf.matmul(inputs, W_fc1) + b_fc1)
#if architecture == 'duel':
if False:
"""
W_fc2_s = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01))
b_fc2_s = tf.Variable(tf.zeros([h2_size]))
h_fc2_s = tf.nn.relu(tf.matmul(h_fc1_s, W_fc2_s) + b_fc2_s)
W_value = tf.Variable(tf.truncated_normal([h2_size, 1], stddev=0.01))
b_value = tf.Variable(tf.zeros([1]))
value_out = tf.matmul(h_fc2_s, W_value) + b_value
W_fc2_a = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01))
b_fc2_a = tf.Variable(tf.zeros([h2_size]))
h_fc2_a = tf.nn.relu(tf.matmul(h_fc1_a, W_fc2_a) + b_fc2_a)
Qout = tf.reduce_sum( tf.mul( h_fc2_s,h_fc2_a ), 1)
"""
# value function
W_value = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01))
b_value = tf.Variable(tf.zeros([h2_size]))
h_value = tf.nn.relu(tf.matmul(h_fc1, W_value) + b_value)
W_value = tf.Variable(tf.truncated_normal([h2_size, 1], stddev=0.01))
b_value = tf.Variable(tf.zeros([1]))
value_out = tf.matmul(h_value, W_value) + b_value
# advantage function
W_advantage = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01))
b_advantage = tf.Variable(tf.zeros([h2_size]))
h_advantage = tf.nn.relu(tf.matmul(h_fc1, W_advantage) + b_advantage)
W_advantage = tf.Variable(tf.truncated_normal([h2_size, self.a_dim], stddev=0.01))
b_advantage = tf.Variable(tf.zeros([self.a_dim]))
Advantage_out = tf.matmul(h_advantage, W_advantage) + b_advantage
Qout = value_out + (Advantage_out - tf.reduce_mean(Advantage_out, reduction_indices=1, keep_dims=True))
else:
W_fc2_s = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01))
b_fc2_s = tf.Variable(tf.zeros([h2_size]))
h_fc2_s = tf.nn.relu(tf.matmul(h_fc1_s, W_fc2_s) + b_fc2_s)
W_fc2_a = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01))
b_fc2_a = tf.Variable(tf.zeros([h2_size]))
h_fc2_a = tf.nn.relu(tf.matmul(h_fc1_a, W_fc2_a) + b_fc2_a)
# inner product of state s and action a
#Qout = tf.mul(h_fc2_s,h_fc2_a)
Qout = tf.reduce_sum(tf.multiply(h_fc2_s, h_fc2_a), 1)
#Qout = tf.reduce_sum( tf.mul( h_fc2_s,h_fc2_a ), 1, keep_dims=True )
#Qout = tf.reduce_sum(tf.mul(h_fc2_s,h_fc2_a))
return inputs, action, Qout
def train(self, inputs, action, sampled_q):
return self.sess.run([self.pred_q, self.optimize, self.loss], feed_dict={
self.inputs: inputs,
self.action: action,
self.sampled_q: sampled_q
})
def predict(self, inputs, action):
#return self.sess.run(self.pred_q, feed_dict={
return self.sess.run(self.Qout, feed_dict={
self.inputs: inputs,
self.action: action
})
def predict_target(self, inputs, action):
#return self.sess.run(self.pred_q, feed_dict={
return self.sess.run(self.target_Qout, feed_dict={
self.target_inputs: inputs,
self.target_action: action
})
def update_target_network(self):
self.sess.run(self.update_target_network_params)
def load_network(self, load_filename):
self.saver = tf.train.Saver()
try:
self.saver.restore(self.sess, load_filename)
print("Successfully loaded:", load_filename)
except:
print("Could not find old network weights")
def save_network(self, save_filename):
print('Saving deepq-network...')
self.saver.save(self.sess, save_filename)
def clipped_error(self, x):
return tf.where(tf.abs(x) < 1.0, 0.5 * tf.square(x), tf.abs(x) - 0.5) # condition, true, false
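For reference, a minimal sketch of exercising this DQAN network with random data under TensorFlow 1.x (the shapes, hyperparameters, and import path are assumptions; this snippet is not part of the commit):

import numpy as np
import tensorflow as tf
from policy.DRL.dqan import DeepQNetwork  # assumed module path for the class defined above

sess = tf.Session()
net = DeepQNetwork(sess, state_dim=89, action_dim=16, learning_rate=1e-3, tau=0.01,
                   num_actor_vars=0, architecture='vanilla', h1_size=130, h2_size=50)
sess.run(tf.global_variables_initializer())

states = np.random.rand(32, 89).astype(np.float32)    # batch of belief-state vectors
actions = np.eye(16)[np.random.randint(16, size=32)]  # one-hot summary actions
targets = np.random.rand(32, 1).astype(np.float32)    # bootstrapped Q targets (y_i)

pred_q, _, loss = net.train(states, actions, targets)
net.update_target_network()
print(loss)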