diff --git a/config/set1-ErrorModel.cfg b/config/set1-ErrorModel.cfg
new file mode 100644
index 0000000000000000000000000000000000000000..faf5d1a67b39d44e52326b21673bf532d78b7d4d
--- /dev/null
+++ b/config/set1-ErrorModel.cfg
@@ -0,0 +1,10 @@
+### Error model parameters ###
+### set1 ###
+correctNBLenDist = [0.264, 0.215, 0.138, 0.099, 0.08]
+correctMean = [0.871, 0.103, 0.045, 0.032, 0.027, 0.049]
+correctVar = [0.029, 0.01, 0.002, 0.001, 0.0, 0.001]
+incorrectNBLenDist = [0.3, 0.068, 0.085, 0.072, 0.079]
+incorrectNBPosDist = [0.0, 0.179, 0.063, 0.027, 0.017]
+incorrectMean = [0.757, 0.179, 0.077, 0.048, 0.037, 0.065]
+incorrectVar = [0.052, 0.014, 0.003, 0.001, 0.0, 0.002]
+
diff --git a/config/set2-ErrorModel.cfg b/config/set2-ErrorModel.cfg
new file mode 100644
index 0000000000000000000000000000000000000000..d508eda07ef7a079432da231ea5a5b3aafd7b7e0
--- /dev/null
+++ b/config/set2-ErrorModel.cfg
@@ -0,0 +1,9 @@
+### Error model parameters ###
+### set2 ###
+correctNBLenDist = [0.322, 0.455, 0.183, 0.033, 0.006]
+correctMean = [0.922, 0.105, 0.024, 0.021, 0.018, 0.022]
+correctVar = [0.014, 0.013, 0.002, 0.001, 0.001, 0.001]
+incorrectNBLenDist = [0.248, 0.462, 0.189, 0.073, 0.02]
+incorrectNBPosDist = [0.0, 0.297, 0.045, 0.008, 0.002]
+incorrectMean = [0.819, 0.201, 0.084, 0.052, 0.036, 0.042]
+incorrectVar = [0.034, 0.02, 0.005, 0.002, 0.001, 0.004]
diff --git a/config/set3-ErrorModel.cfg b/config/set3-ErrorModel.cfg
new file mode 100644
index 0000000000000000000000000000000000000000..8707fbcc7ad81eff85e9c209b9e6724b58743def
--- /dev/null
+++ b/config/set3-ErrorModel.cfg
@@ -0,0 +1,10 @@
+### Error model parameters ###
+### set3 ###
+correctNBLenDist = [0.147, 0.43, 0.212, 0.147, 0.048]
+correctMean = [0.905, 0.101, 0.017, 0.008, 0.005, 0.01]
+correctVar = [0.016, 0.013, 0.002, 0.001, 0.0, 0.001]
+incorrectNBLenDist = [0.351, 0.388, 0.158, 0.071, 0.02]
+incorrectNBPosDist = [0.0, 0.116, 0.027, 0.007, 0.002]
+incorrectMean = [0.871, 0.164, 0.065, 0.043, 0.03, 0.054]
+incorrectVar = [0.029, 0.02, 0.005, 0.002, 0.001, 0.003]
+
diff --git a/config/set4-ErrorModel.cfg b/config/set4-ErrorModel.cfg
new file mode 100644
index 0000000000000000000000000000000000000000..a492c587d602e6dc84820a66c930780311540f84
--- /dev/null
+++ b/config/set4-ErrorModel.cfg
@@ -0,0 +1,9 @@
+### Error model parameters ###
+### set4 ###
+correctNBLenDist = [0.143, 0.405, 0.24, 0.155, 0.043]
+correctMean = [0.9, 0.104, 0.02, 0.01, 0.004, 0.004]
+correctVar = [0.016, 0.012, 0.002, 0.001, 0.0, 0.0]
+incorrectNBLenDist = [0.315, 0.387, 0.175, 0.089, 0.024]
+incorrectNBPosDist = [0.0, 0.128, 0.042, 0.01, 0.0]
+incorrectMean = [0.868, 0.155, 0.064, 0.043, 0.029, 0.038]
+incorrectVar = [0.03, 0.02, 0.004, 0.002, 0.001, 0.001]
diff --git a/feudalconfig.cfg b/feudalconfig.cfg
new file mode 100644
index 0000000000000000000000000000000000000000..3b310f37132adf5716e39f4f3cae7878c5b4738b
--- /dev/null
+++ b/feudalconfig.cfg
@@ -0,0 +1,116 @@
+# Error model: 15% error rate, DSTC2 confscorer, DSTC2 nbestgenerator
+# User model: standard sampled params, sampled patience
+# Masks: off
+
+###### General parameters ######
+[GENERAL]
+domains = CamRestaurants
+singledomain = True
+tracedialog = 0
+seed = 1
+
+[exec_config]
+configdir = _benchmarkpolicies/env3-feudal
+logfiledir = _benchmarklogs/env3-feudal
+numtrainbatches = 20
+traindialogsperbatch = 200
+numbatchtestdialogs = 500
+trainsourceiteration = 0
+numtestdialogs = 500
+trainerrorrate = 15
+testerrorrate = 15
+testeverybatch = True
+deleteprevpolicy = True
+
+[logging]
+usecolor = False
+screen_level = results
+file_level = results
+file = auto
+
+###### Environment parameters ######
+
+[agent]
+maxturns = 25
+
+[usermodel]
+usenewgoalscenarios = True
+oldstylepatience = False
+patience = 4,6
+configfile = config/sampledUM.cfg
+
+[errormodel]
+nbestsize = 5
+confusionmodel = LevenshteinConfusions
+nbestgeneratormodel = DSTC2NBestGenerator
+confscorer = DSTC2
+configfile = config/set1-ErrorModel.cfg
+
+
+[summaryacts]
+maxinformslots = 5
+informmask = True
+requestmask = True
+informcountaccepted = 4
+byemask = True
+
+###### Dialogue Manager parameters ######
+[policy]
+policydir = _benchmarkpolicies/env3-feudal
+belieftype = focus
+useconfreq = False
+learning = True
+policytype = feudalgain
+startwithhello = False
+inpolicyfile = auto
+outpolicyfile = auto
+temperature = 0.0
+noisy_acer = True
+sample_argmax = False
+
+[feudalpolicy]
+features=learned
+si_policy_type=acer
+only_master = True
+jsd_reward = True
+#jsd_function = tanh
+js_threshold = 0.2
+js_threshold_master = 1
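+# The feudal options above are read by FeudalGainPolicy: js_threshold and jsd_reward are passed to the
+# slot-dependent sub-policy, js_threshold_master to the master policy.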
+
+[i2a]
+is_imaging = False
+deepmind = False
+load_pretrain_data = False
+improve_env = False
+share_layer = 2
+new_q_loss = False
+device = cpu
+env_model_path = env_model/env1_acer_200.pkl
+
+[dqnpolicy]
+q_update = double
+architecture = duel
+h1_size = 300
+h2_size = 100
+capacity = 2000
+beta = 0.95
+epsilon_start = 0.3
+maxiter = 4000
+minibatch_size = 64
+is_threshold = 5.0
+episodeNum = 0.0
+epsilon_end = 0.0
+n_in = 268
+features = ["discourseAct", "method", "requested", "full", "lastActionInformNone", "offerHappened", "inform_info"]
+
+###### Evaluation parameters ######
+
+[eval]
+rewardvenuerecommended=0
+penaliseallturns = True
+wrongvenuepenalty = 0
+notmentionedvaluepenalty = 0
+successmeasure = objective
+successreward = 20
+
diff --git a/policy/FeudalGainPolicy.py b/policy/FeudalGainPolicy.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a767ab2be8993f9ab07668968d9ba81535978f0
--- /dev/null
+++ b/policy/FeudalGainPolicy.py
@@ -0,0 +1,425 @@
+###############################################################################
+# PyDial: Multi-domain Statistical Spoken Dialogue System Software
+###############################################################################
+#
+# Copyright 2015 - 2019
+# Cambridge University Engineering Department Dialogue Systems Group
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+###############################################################################
+
+
+import numpy as np
+import random
+import utils
+from utils.Settings import config as cfg
+from utils import ContextLogger, DiaAct
+
+import ontology.FlatOntologyManager as FlatOnt
+from ontology import Ontology
+from policy import Policy
+from policy import SummaryAction
+from policy.feudalgainRL.DIP_parametrisation import DIP_state, padded_state
+from policy.feudalgainRL.FeudalNoisyDQNPolicy import FeudalDQNPolicy
+from policy.feudalgainRL.FeudalNoisyACERPolicy import FeudalNoisyACERPolicy
+from policy.feudalgainRL.feudalUtils import get_feudalAC_masks
+
+logger = utils.ContextLogger.getLogger('')
+
+
+class FeudalGainPolicy(Policy.Policy):
+    '''FeudalGain policy: a feudal dialogue policy with an information-gain based reward.
+    Derived from :class:`Policy`.
+    '''
+
+    def __init__(self, in_policy_file, out_policy_file, domainString='CamRestaurants', is_training=False):
+        super(FeudalGainPolicy, self).__init__(domainString, is_training)
+
+        self.domainString = domainString
+        self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString)
+        self.in_policy_file = in_policy_file
+        self.out_policy_file = out_policy_file
+        self.is_training = is_training
+        self.accum_belief = []
+
+        self.prev_state_check = None
+        #feudalRL variables
+        self.prev_sub_policy = None
+        self.prev_master_act = None
+        self.prev_master_belief = None
+        self.prev_child_act = None
+        self.prev_child_belief = None
+
+        self.slots = list(Ontology.global_ontology.get_informable_slots(domainString))
+
+        if 'price' in self.slots:
+            self.slots.remove('price')  # remove price from the SFR ontology, it's not used
+        if 'name' in self.slots:
+            self.slots.remove('name')
+
+        self.features = 'dip'
+        if cfg.has_option('feudalpolicy', 'features'):
+            self.features = cfg.get('feudalpolicy', 'features')
+        self.si_policy_type = 'dqn'
+        if cfg.has_option('feudalpolicy', 'si_policy_type'):
+            self.si_policy_type = cfg.get('feudalpolicy', 'si_policy_type')
+        self.sd_policy_type = 'dqn'
+        if cfg.has_option('feudalpolicy', 'sd_policy_type'):
+            self.sd_policy_type = cfg.get('feudalpolicy', 'sd_policy_type')
+        self.probability_max = 50
+        if cfg.has_option('feudalpolicy', 'probability_max'):
+            self.probability_max = cfg.getint('feudalpolicy', 'probability_max')
+        self.info_reward = 0.0
+        if cfg.has_option('feudalpolicy', 'info_reward'):
+            self.info_reward = cfg.getfloat('feudalpolicy', 'info_reward')
+        self.js_threshold = 1.0
+        if cfg.has_option('feudalpolicy', 'js_threshold'):
+            self.js_threshold = cfg.getfloat('feudalpolicy', 'js_threshold')
+        self.jsd_reward = False
+        if cfg.has_option('feudalpolicy', 'jsd_reward'):
+            self.jsd_reward = cfg.getboolean('feudalpolicy', 'jsd_reward')
+        self.jsd_function = None
+        if cfg.has_option('feudalpolicy', 'jsd_function'):
+            self.jsd_function = cfg.get('feudalpolicy', 'jsd_function')
+        self.info_reward_master = 0.0
+        if cfg.has_option('feudalpolicy', 'info_reward_master'):
+            self.info_reward_master = cfg.getfloat('feudalpolicy', 'info_reward_master')
+            print("Master policy trains with info_gain reward")
+        self.js_threshold_master = 1.0
+        if cfg.has_option('feudalpolicy', 'js_threshold_master'):
+            self.js_threshold_master = cfg.getfloat('feudalpolicy', 'js_threshold_master')
+        self.only_master = False
+        if cfg.has_option('feudalpolicy', 'only_master'):
+            self.only_master = cfg.getboolean('feudalpolicy', 'only_master')
+        if self.only_master:
+            print("We train with merged master!")
+
+        self.bye_mask = False
+        if cfg.has_option('summaryacts', 'byemask'):
+            self.bye_mask = cfg.getboolean('summaryacts', 'byemask')
+            print("WE USE BYEMASK: ", self.bye_mask)
+
+        self.critic_regularizer_path = None
+        if cfg.has_option('policy', 'critic_regularizer'):
+            self.critic_regularizer_path = cfg.get('policy', 'critic_regularizer')
+            print(f"We use {self.critic_regularizer_path} as a critic regularizer.")
+
+        self.critic_regularizer_weight = 0
+        if cfg.has_option('policy', 'critic_regularizer_weight'):
+            self.critic_regularizer_weight = cfg.getfloat('policy', 'critic_regularizer_weight')
+
+        self.randomseed = 1234
+        if cfg.has_option('GENERAL', 'seed'):
+            self.randomseed = cfg.getint('GENERAL', 'seed')
+
+        self.load_master_policy = True
+        if cfg.has_option('policy', 'bootstrap_master_policy'):
+            self.load_master_policy = cfg.getboolean('policy', 'bootstrap_master_policy')
+            print("FeudalAC: BOOTSTRAP MASTER Policy: ", self.load_master_policy)
+
+        # Create the feudal structure (including feudal masks)
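+        # Feudal decomposition: a master policy decides between the slot-independent and slot-dependent
+        # branches, a give_info policy handles the slot-independent actions and a request_info policy the
+        # slot-specific ones. With only_master, the master directly picks slot-independent actions (or
+        # delegates to the slot-dependent policy), so no separate give_info policy is created.
+        # 'pass' gives each child a no-op action, so both children can be recorded on every turn.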
+
+        self.summaryaction = SummaryAction.SummaryAction(domainString)
+        self.full_action_list = self.summaryaction.action_names
+        self.slot_independent_actions = ["inform", "inform_byname", "inform_alternatives",
+                                         "reqmore", "bye", "pass"]
+
+        self.slot_specific_actions = ["request", "confirm", "select", "pass"]
+
+        self.master_actions = ['slot_ind', 'slot_dep']
+
+        self.chosen = False
+
+        if self.only_master:
+            print("Using ACER with merged policy.")
+            self.master_actions = self.slot_independent_actions[:-1] + ['slot_dep']
+            self.master_policy = FeudalNoisyACERPolicy(self._modify_policyfile('master', in_policy_file),
+                                                   self._modify_policyfile('master', out_policy_file),
+                                                   domainString=self.domainString, is_training=self.is_training,
+                                                   action_names=self.master_actions, sd_state_dim=self.probability_max,
+                                                   slot='si', js_threshold=self.js_threshold_master,
+                                                   info_reward=self.info_reward_master, load_policy=self.load_master_policy,
+                                                   critic_regularizer_weight=self.critic_regularizer_weight)
+
+        elif self.si_policy_type == 'acer':
+            print("Using ACER with give_info and master_policy.")
+            self.master_policy = FeudalNoisyACERPolicy(self._modify_policyfile('master', in_policy_file),
+                                                  self._modify_policyfile('master', out_policy_file),
+                                                  domainString=self.domainString, is_training=self.is_training,
+                                                  action_names=self.master_actions, sd_state_dim=self.probability_max,
+                                                  slot='si', js_threshold=self.js_threshold_master,
+                                                   info_reward=self.info_reward_master)
+            self.give_info_policy = FeudalNoisyACERPolicy(self._modify_policyfile('gi', in_policy_file),
+                                                     self._modify_policyfile('gi', out_policy_file),
+                                                     domainString=self.domainString, is_training=self.is_training,
+                                                     action_names=self.slot_independent_actions, slot='si',
+                                                     sd_state_dim=self.probability_max)
+        elif self.si_policy_type == 'dqn':
+            self.master_policy = FeudalNoisyACERPolicy(self._modify_policyfile('master', in_policy_file),
+                                                  self._modify_policyfile('master', out_policy_file),
+                                                  domainString=self.domainString, is_training=self.is_training,
+                                                  action_names=self.master_actions, sd_state_dim=self.probability_max,
+                                                  slot='si')
+            self.give_info_policy = FeudalDQNPolicy(self._modify_policyfile('gi', in_policy_file),
+                                                       self._modify_policyfile('gi', out_policy_file),
+                                                       domainString=self.domainString, is_training=self.is_training,
+                                                       action_names=self.slot_independent_actions, slot='si',
+                                                       sd_state_dim=0)
+
+        else:
+            self.master_policy = FeudalDQNPolicy(self._modify_policyfile('master', in_policy_file),
+                                                 self._modify_policyfile('master', out_policy_file),
+                                                     domainString=self.domainString, is_training=self.is_training,
+                                                     action_names=self.master_actions,
+                                                 slot='si')  # 'pass' is always masked, but it's needed for the implementation
+            self.give_info_policy = FeudalDQNPolicy(self._modify_policyfile('gi', in_policy_file),
+                                                    self._modify_policyfile('gi', out_policy_file),
+                                                     domainString=self.domainString, is_training=self.is_training,
+                                                     action_names=self.slot_independent_actions, slot='si')
+
+        self.request_info_policy = FeudalDQNPolicy(self._modify_policyfile('ri', in_policy_file),
+                                                   self._modify_policyfile('ri', out_policy_file),
+                                                   domainString=self.domainString, is_training=self.is_training,
+                                                   action_names=self.slot_specific_actions, slot='sd',
+                                                   sd_state_dim=self.probability_max,
+                                                   js_threshold=self.js_threshold, info_reward=self.info_reward,
+                                                   jsd_reward=self.jsd_reward, jsd_function=self.jsd_function)
+        self.critic_regularizer = None
+
+    def _modify_policyfile(self, mod, policyfile):
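+        # prefix the policy file name with the sub-policy tag,
+        # e.g. (illustrative path) 'policydir/feudal.prm' -> 'policydir/master_feudal.prm'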
+        pf_split = policyfile.split('/')
+        pf_split[-1] = mod + '_' + pf_split[-1]
+        return '/'.join(pf_split)
+
+    def act_on(self, state, hyps=None):
+        if self.lastSystemAction is None and self.startwithhello:
+            systemAct, nextaIdex = 'hello()', -1
+            self.chosen_slot_ = None
+        else:
+            systemAct, nextaIdex = self.nextAction(state)
+        self.lastSystemAction = systemAct
+        self.summaryAct = nextaIdex
+        self.prevbelief = state
+
+        systemAct = DiaAct.DiaAct(systemAct)
+        return systemAct
+
+    def record(self, reward, domainInControl=None, weight=None, state=None, action=None):
+        self.record_master(reward)
+        self.record_childs(reward)
+
+    def finalizeRecord(self, reward, domainInControl=None):
+        if domainInControl is None:
+            domainInControl = self.domainString
+        self.master_policy.finalizeRecord(reward)
+        if not self.only_master:
+            self.give_info_policy.finalizeRecord(reward)
+        self.request_info_policy.finalizeRecord(reward)
+
+        #print("DIALOGUE FINISHED")
+        #print("REWARD:", reward)
+        #print("\n")
+
+    def record_master(self, reward):
+        if self.only_master or self.si_policy_type == 'acer':
+            self.master_policy.record(reward, domainInControl=self.domainString,
+                                      state=[self.prev_master_belief, self.beliefstate, self.chosen_slot],
+                                      action=self.prev_master_act)
+        else:
+            self.master_policy.record(reward, domainInControl=self.domainString,
+                                      state=self.prev_master_belief, action=self.prev_master_act)
+
+    def record_childs(self, reward):
+        if self.prev_sub_policy == 'si':
+            if not self.only_master:
+                self.give_info_policy.record(reward, domainInControl=self.domainString,
+                                             state=[self.prev_master_belief, 0, 0],
+                                             action=self.prev_child_act)
+
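+            # the slot-dependent policy also records this turn, with a dummy padded slot belief and the
+            # 'pass' action, since no slot-specific action was taken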
+            state_for_pi_d = np.concatenate([np.zeros(self.probability_max), self.prev_master_belief])
+            state_for_pi_d[0] = 1.0
+
+            self.request_info_policy.record(reward, domainInControl=self.domainString,
+                                            state=[state_for_pi_d,
+                                                   self.beliefstate, self.chosen_slot, self.dipstatevec_slots],
+                                            action=len(self.slot_specific_actions) - 1)
+        elif self.prev_sub_policy == 'sd':
+            self.request_info_policy.record(reward, domainInControl=self.domainString,
+                                            state=[self.prev_child_belief, self.beliefstate, self.chosen_slot, self.dipstatevec_slots],
+                                            action=self.prev_child_act)
+            if not self.only_master:
+                self.give_info_policy.record(reward, domainInControl=self.domainString,
+                                             state=[self.prev_master_belief, 0, 0],
+                                             action=len(self.slot_independent_actions) - 1)
+
+    def convertStateAction(self, state, action):
+        pass
+
+    def nextAction(self, beliefstate):
+        '''
+        select next action
+
+        :param beliefstate:
+        :returns: (int) next summary action
+        '''
+
+        # compute main belief
+
+        if self.features == 'learned' or self.features == 'rnn':
+            dipstate = padded_state(beliefstate, domainString=self.domainString, probability_max=self.probability_max)
+        else:
+            dipstate = DIP_state(beliefstate,domainString=self.domainString)
+        dipstatevec = dipstate.get_beliefStateVec('general')
+
+        non_exec = self.summaryaction.getNonExecutable(beliefstate.domainStates[beliefstate.currentdomain], self.lastSystemAction)
+        masks = get_feudalAC_masks(non_exec, self.slots, self.slot_independent_actions, self.slot_specific_actions,
+                                   only_master=self.only_master)
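+        # the masks from get_feudalAC_masks (built from SummaryAction.getNonExecutable) are added to the
+        # child Q-values further down, so that non-executable actions are not picked by the argmax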
+
+        master_Q_values = self.master_policy.nextAction(dipstatevec, masks["master"])
+        #TODO: MASTER ACTIONS ARE NOT MASKED, ONLY COMPLETELY VALID FOR ENV4 ATM
+        master_decision = np.argmax(master_Q_values)
+        self.prev_master_act = master_decision
+        self.prev_master_belief = dipstatevec
+        self.beliefstate = beliefstate.domainStates[beliefstate.currentdomain]
+
+        self.dipstatevec_slots, self.maskvec_slots = self.get_dipstate_vec_slots_and_masks(dipstate, masks)
+        self.slot_beliefs = self.get_slot_beliefs(dipstate)
+
+        if self.master_actions[master_decision] != 'slot_dep':
+            # drop to give_info policy
+            self.prev_sub_policy = 'si'
+            if not self.only_master:
+                child_Q_values = self.give_info_policy.nextAction(dipstatevec, masks['give_info'])
+                child_Q_values = np.add(child_Q_values, masks['give_info'])
+                #TODO: sample from the distribution instead of argmax..
+                child_decision = np.argmax(child_Q_values)
+                summaryAct = self.slot_independent_actions[child_decision]
+                self.prev_child_act = child_decision
+                self.prev_child_belief = dipstatevec
+            else:
+                summaryAct = self.master_actions[master_decision]
+            self.chosen_slot = "None"
+        else:
+            self.prev_sub_policy = 'sd'
+
+            child_Q_values = self.request_info_policy.nextAction(self.dipstatevec_slots)
+            # if we chose randomly, child_Q_values has shape (len(actions),), otherwise (number_slots, len(actions))
+            if len(child_Q_values.shape) == 1:
+                # we chose a random action, so we also need a random slot to go with it
+                random_slot = random.choice(self.slots)
+                child_Q_values = np.add(child_Q_values, masks['req_info'][random_slot])
+                child_decision = np.argmax(child_Q_values)
+                self.prev_child_act = child_decision
+                self.prev_child_belief = dipstate.get_beliefStateVec(random_slot)
+                self.chosen_slot = random_slot
+                summaryAct = self.slot_specific_actions[child_decision] + "_" + random_slot
+            else:
+                child_Q_values = np.add(child_Q_values, self.maskvec_slots)
+                child_decision = np.unravel_index(np.argmax(child_Q_values, axis=None), child_Q_values.shape)
+                # child_decision is a tuple (slot index, action index)
+                chosen_slot = child_decision[0]
+                chosen_action = child_decision[1]
+                self.chosen_slot = self.slots[chosen_slot]
+                self.chosen_slot_ = self.slots[chosen_slot]
+                self.prev_child_act = chosen_action
+                self.prev_child_belief = dipstate.get_beliefStateVec(self.slots[chosen_slot])
+                summaryAct = self.slot_specific_actions[chosen_action] + "_" + self.slots[chosen_slot]
+                self.chosen = True
+
+        #if self.chosen_slot_:
+        #    print(self.chosen_slot_)
+        #    keys = self.beliefstate['beliefs'][self.chosen_slot_].keys()
+        #    b = [self.beliefstate['beliefs'][self.chosen_slot_]['**NONE**']] + \
+        #        [self.beliefstate['beliefs'][self.chosen_slot_][value] for value in list(keys) if value != '**NONE**']
+        #    print(f"DISTRIBUTION FOR SLOT {self.chosen_slot_}:", b)
+
+        beliefstate = beliefstate.getDomainState(self.domainUtil.domainString)
+        masterAct = self.summaryaction.Convert(beliefstate, summaryAct, self.lastSystemAction)
+        nextaIdex = self.full_action_list.index(summaryAct)
+
+        return masterAct, nextaIdex
+
+    def train(self):
+        '''
+        call this function when the episode ends
+        '''
+        self.master_policy.train(self.critic_regularizer)
+        if not self.only_master:
+            self.give_info_policy.train()
+        self.request_info_policy.train()
+
+    def get_slot_beliefs(self, dipstate):
+
+        slot_beliefs = []
+        for slot in self.slots:
+            slot_dependent_vec = dipstate.get_beliefStateVec(slot)
+            slot_beliefs.append(slot_dependent_vec)
+        return np.concatenate(slot_beliefs, axis=0)
+
+    def get_dipstate_vec_slots_and_masks(self, dipstate, masks):
+
+        dipstatevec_slots = []
+        maskvec_slots = []
+        for slot in self.slots:
+            slot_dependent_vec = dipstate.get_beliefStateVec(slot)
+            dipstatevec_slots.append(slot_dependent_vec)
+            maskvec_slots.append(masks['req_info'][slot])
+        dipstatevec_slots = np.vstack(dipstatevec_slots)
+        maskvec_slots = np.asarray(maskvec_slots)
+
+        return dipstatevec_slots, maskvec_slots
+
+    def savePolicy(self, FORCE_SAVE=False):
+        """
+        Not used here: the agent calls this after every episode,
+        but we only want to save the policy periodically.
+        """
+        pass
+
+    def savePolicyInc(self, FORCE_SAVE=False):
+        """
+        save model and replay buffer
+        """
+        # just save each sub-policy
+        self.master_policy.savePolicyInc()
+        if not self.only_master:
+            self.give_info_policy.savePolicyInc()
+        self.request_info_policy.savePolicyInc()
+
+    def loadPolicy(self, filename):
+        """
+        load model and replay buffer
+        """
+        # load policy models one by one
+        pass
+
+    def restart(self):
+        self.summaryAct = None
+        self.lastSystemAction = None
+        self.prevbelief = None
+        self.actToBeRecorded = None
+        self.master_policy.restart()
+        if not self.only_master:
+            self.give_info_policy.restart()
+        self.request_info_policy.restart()
+
+# END OF FILE
diff --git a/policy/PolicyManager.py b/policy/PolicyManager.py
index 6e9d754d8be7b4fb988ee1df6355dcdd8da24e7e..3ca85b89159b3f27f93341d03aaba96f2b28583b 100644
--- a/policy/PolicyManager.py
+++ b/policy/PolicyManager.py
@@ -303,6 +303,9 @@ class PolicyManager(object):
             elif policy_type == 'feudalAC':
                 from policy import FeudalACPolicy
                 self.domainPolicies[domainString] = FeudalACPolicy.FeudalACPolicy(in_policy_file, out_policy_file, domainString, learning)
+            elif policy_type == 'feudalgain':
+                from policy import FeudalGainPolicy
+                self.domainPolicies[domainString] = FeudalGainPolicy.FeudalGainPolicy(in_policy_file, out_policy_file, domainString, learning)
             else:
                 try:
                     # try to view the config string as a complete module path to the class to be instantiated
diff --git a/policy/feudalgainRL/DIP_parametrisation.py b/policy/feudalgainRL/DIP_parametrisation.py
new file mode 100644
index 0000000000000000000000000000000000000000..82db843c5e2e4d7336497c64368e3fbe321d3a6f
--- /dev/null
+++ b/policy/feudalgainRL/DIP_parametrisation.py
@@ -0,0 +1,2022 @@
+###############################################################################
+# PyDial: Multi-domain Statistical Spoken Dialogue System Software
+###############################################################################
+#
+# Copyright 2015 - 2019
+# Cambridge University Engineering Department Dialogue Systems Group
+#
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+###############################################################################
+
+'''
+Class to convert belief states into DIP parametrisations
+'''
+
+import numpy as np
+import copy
+from itertools import product
+from scipy.stats import entropy
+
+from policy.Policy import Policy, Action, State, TerminalAction, TerminalState
+from ontology import Ontology
+from utils import Settings, ContextLogger, DialogueState
+logger = ContextLogger.getLogger('')
+
+class DIP_state(State):
+    def __init__(self, belief, domainString=None, action_freq=None):
+        #params
+        self.domainString = domainString
+        self.N_bins = 10
+        self.slots = list(Ontology.global_ontology.get_informable_slots(domainString))
+        if 'price' in self.slots:
+            self.slots.remove('price')  # remove price from the SFR ontology, it's not used
+
+        if 'name' in self.slots:
+            self.slots.remove('name')
+        self.DIP_state = {'general':None, 'joint':None}
+        for slot in self.slots:
+            self.DIP_state[slot]=None
+
+        # convert belief state into DIP params
+        if action_freq is not None:
+            self.DIP_state['general'] = np.concatenate((action_freq,self.convert_general_b(belief)))
+        else:
+            self.DIP_state['general'] = self.convert_general_b(belief)
+        self.DIP_state['joint'] = self.convert_joint_slot_b(belief)
+        for slot in self.slots:
+            self.DIP_state[slot] = self.convert_slot_b(belief, slot)
+
+        # create DIP vector and masks
+        self.get_DIP_vector()
+        self.beliefStateVec = None #for compatibility with GP sarsa implementation
+
+    def get_DIP_vector(self):
+        """
+        convert the DIP state into a numpy vector and a set of masks per slot
+        :return:
+        """
+        pad_v = np.zeros(len(self.DIP_state[self.slots[0]]))
+        slot_len = len(pad_v)
+        general_len = len(self.DIP_state['general']) + len(self.DIP_state['joint'])
+        pad_v[0] = 1.
+        self.DIP_vector = [pad_v]
+        self.DIP_masks = {}
+        mask_template = [False] * (slot_len * (len(self.slots) + 1)) + [True] * general_len
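+        # DIP_vector layout: [pad block | one block per slot | general | joint]; each slot mask selects its
+        # own block plus the general and joint features, while the 'general' mask selects the pad block instead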
+        i = 1
+        for slot in self.slots:
+            self.DIP_vector.append(self.DIP_state[slot])
+            self.DIP_masks[slot] = np.array(mask_template)
+            self.DIP_masks[slot][slot_len*i:slot_len*(i+1)] = True
+            i += 1
+        self.DIP_vector.append(self.DIP_state['general'])
+        self.DIP_vector.append(self.DIP_state['joint'])
+        self.DIP_masks['general'] = np.array(mask_template)
+        self.DIP_masks['general'][:slot_len] = True
+
+        self.DIP_vector = np.concatenate(self.DIP_vector)
+
+    def get_beliefStateVec(self, slot):
+        return self.DIP_vector[self.DIP_masks[slot]]
+
+    def get_DIP_state(self, slot):
+        return np.array([self.DIP_state['general'] + self.DIP_state['joint'] + self.DIP_state[slot]])
+
+    def get_full_DIP_state(self):
+        full_slot_bstate = []
+        for slot in self.slots:
+            full_slot_bstate += self.DIP_state[slot]
+        full_DIP_state = np.array([full_slot_bstate + self.DIP_state['general'] + self.DIP_state['joint']])
+        DIP_mask = [True]*(len(self.DIP_state['general']) + len(self.DIP_state['joint'])) + [False] * len(full_slot_bstate)
+        return full_DIP_state, DIP_mask
+
+    def convert_general_b(self, belief):
+        """
+        Extracts from the belief state the DIP vector corresponding to the general features (e.g. method, user act...)
+        :param belief: The full belief state
+        :return: The DIP general vector
+        """
+        if type(belief) == DialogueState.DialogueState:
+            belief = belief.domainStates[belief.currentdomain]
+
+        dial_act = list(belief['beliefs']['discourseAct'].values())
+
+        requested = self._get_DIP_requested_vector(belief)
+        method = list(belief['beliefs']['method'].values())
+        features = [int(belief['features']['offerHappened']), int(belief['features']['lastActionInformNone']), int(bool(belief['features']['lastInformedVenue']))]
+        discriminable = [int(x) for x in belief['features']['inform_info']]
+        slot_n = 1/len(self.slots)
+        val_n = []
+        for slot in self.slots:
+            val_n.append(len(Ontology.global_ontology.get_informable_slot_values(self.domainString, slot)))
+        avg_value_n = 1/np.mean(val_n)
+
+
+        return dial_act + requested + method + features + discriminable + [slot_n, avg_value_n]
+
+
+    def _get_DIP_requested_vector(self, belief):
+        n_requested = sum([x>0.5 for x in list(belief['beliefs']['requested'].values())])
+        ret_vec = [0] * 5
+        if n_requested > 4:
+            n_requested = 4
+        ret_vec[n_requested] = 1.
+        return ret_vec
+
+    def convert_joint_slot_b(self, belief):
+        """
+        Extracts the features for the joint DIP vector for all the slots
+        :param belief: The full belief state
+        :return: The DIP joint slot vector
+        """
+        if type(belief) == DialogueState.DialogueState:
+            belief = belief.domainStates[belief.currentdomain]
+
+        joint_beliefs = []
+        joint_none = 1.
+        informable_beliefs = [copy.deepcopy(belief['beliefs'][x]) for x in list(belief['beliefs'].keys()) if x in self.slots] # this might be inefficient
+        for i, b in enumerate(informable_beliefs):
+            joint_none *= b['**NONE**']
+            del b['**NONE**'] # should I put **NONE** prob mass to dontcare?
+            informable_beliefs[i] = sorted([x for x in list(b.values()) if x != 0], reverse=True)[:2]
+            while len(informable_beliefs[i]) < 2:
+                informable_beliefs[i].append(0.)
+        for probs in product(*informable_beliefs):
+            joint_beliefs.append(np.prod(probs))
+        j_top = joint_beliefs[0]
+        j_2nd = joint_beliefs[1]
+        j_3rd = joint_beliefs[2]
+        first_joint_beliefs = joint_beliefs[:8]
+        if sum(first_joint_beliefs) == 0:
+            first_joint_beliefs = np.ones(len(first_joint_beliefs)) / len(first_joint_beliefs)
+        else:
+            first_joint_beliefs = np.array(first_joint_beliefs) / sum(first_joint_beliefs) # why normalise?
+
+        # difference between 1st and 2nd values
+        j_ent = entropy(first_joint_beliefs)
+        j_dif = joint_beliefs[0] - joint_beliefs[1]
+        j_dif_bin = [0.] * 5
+        idx = int((j_dif) * 5)
+        if idx == 5:
+            idx = 4
+        j_dif_bin[idx] = 1
+
+        # number of slots which are not **NONE**
+        n = 0
+        for key in belief['beliefs']:
+            if key in self.slots:
+                none_val = belief['beliefs'][key]['**NONE**']
+                top_val = np.max([belief['beliefs'][key][value] for value in list(belief['beliefs'][key].keys()) if value != '**NONE**'])
+                if top_val > none_val:
+                    n += 1
+        not_none = [0.] * 5
+        if n > 4:
+            n = 4
+        not_none[n] = 1.
+
+        return [j_top, j_2nd, j_3rd, joint_none, j_ent, j_dif] + j_dif_bin + not_none
+
+    def convert_slot_b(self, belief, slot):
+        """
+        Extracts the slot DIP features.
+        :param belief: The full belief state
+        :return: The slot DIP vector
+        """
+        if type(belief) == DialogueState.DialogueState:
+            belief = belief.domainStates[belief.currentdomain]
+        b = [belief['beliefs'][slot]['**NONE**']] + sorted([belief['beliefs'][slot][value] for value in list(belief['beliefs'][slot].keys()) if value != '**NONE**'], reverse=True)
+        b_top = b[1]
+        b_2nd = b[2]
+        b_3rd = b[3]
+        b_ent = entropy(b)
+        b_none = b[0]
+        b_dif = b[1] - b[2]
+        b_dif_bin = [0.] * 5
+        idx = int((b_dif) * 5)
+        if idx == 5:
+            idx = 4
+        b_dif_bin[idx] = 1
+        non_zero_rate = [x != 0 for x in b[1:]]
+        non_zero_rate = sum(non_zero_rate) / len(non_zero_rate)
+        requested_prob = belief['beliefs']['requested'][slot]
+
+        # Ontology and DB based features
+        V_len = len(Ontology.global_ontology.get_informable_slot_values(self.domainString, slot))
+        norm_N_values = 1 / V_len
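+        # one-hot encode the number of slot values on a log2 scale (N_bins = 10 handles value sets smaller than 2**10)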
+        v_len_bin_vector = [0.] * self.N_bins
+        v_len_bin_vector[int(np.log2(V_len))] = 1.
+        #ocurr_prob, not_occur_prob, first_prob, second_prob, later_prob = self._get_importance_and_priority(slot) # this was manually set in the original DIP paper, I think it can be learned from the other features
+        val_dist_in_DB = self._get_val_dist_in_DB(slot)
+        # potential_contr_to_DB_search = self._get_potential_contr_to_DB_search(slot, belief)
+        #potential_contr_to_DB_search = [0, 0, 0, 0] # the implementation of this method is too slow right now, don't know how useful these features are (but they seem quite useful)
+        return [0, b_top, b_2nd, b_3rd, b_ent, b_none, non_zero_rate, requested_prob, norm_N_values, val_dist_in_DB] + b_dif_bin + v_len_bin_vector
+
+    def _get_val_dist_in_DB(self, slot):
+        # The entropy of the normalised histogram (|DB(s=v)|/|DB|) \forall v \in V_s
+        values = Ontology.global_ontology.get_informable_slot_values(self.domainString, slot)
+        entities = Ontology.global_ontology.entity_by_features(self.domainString, {})
+        val_dist = np.zeros(len(values))
+        n = 0
+        for ent in entities:
+            if ent[slot] != 'not available':
+                val_dist[values.index(ent[slot])] += 1
+                n += 1
+        return entropy(val_dist/n)
+
+
+class padded_state(State):
+    def __init__(self, belief, domainString=None, action_freq=None, probability_max=50):
+        #params
+        self.domainString = domainString
+        self.sortbelief = True
+        self.probability_max = probability_max
+        #self.action_freq = False
+        if Settings.config.has_option('feudalpolicy', 'sortbelief'):
+            self.sortbelief = Settings.config.getboolean('feudalpolicy', 'sortbelief')
+        #if Settings.config.has_option('feudalpolicy', 'action_freq'):
+        #    self.action_freq = Settings.config.getboolean('feudalpolicy', 'action_freq')
+        self.slots = list(Ontology.global_ontology.get_informable_slots(domainString))
+        if 'price' in self.slots:
+            self.slots.remove('price')  # remove price from the SFR ontology, it's not used
+
+        if 'name' in self.slots:
+            self.slots.remove('name')
+
+        slot_values = Ontology.global_ontology.get_informable_slots_and_values(domainString)
+        self.max_v = np.max([len(slot_values[s]) for s in self.slots]) + 3 # (+**NONE**+dontcare+pad)
+        self.max_v = 158  # hard-coded; overrides the value computed above
+        self.si_size = 72 # size of general plus joint vectors
+        self.sd_size = self.max_v
+
+        self.DIP_state = {'general':None, 'joint':None}
+        for slot in self.slots:
+            self.DIP_state[slot]=None
+
+        # convert belief state into DIP params
+        if action_freq is not None:
+            self.DIP_state['general'] = np.concatenate((action_freq,self.convert_general_b(belief)))
+        else:
+            self.DIP_state['general'] = self.convert_general_b(belief)
+        self.DIP_state['joint'] = self.convert_joint_slot_b(belief)
+        for slot in self.slots:
+            self.DIP_state[slot] = self.convert_slot_b(belief, slot)
+
+        # create vector and masks
+        self.get_DIP_vector()
+        self.beliefStateVec = None #for compatibility with GP sarsa implementation
+
+    def get_DIP_vector(self):
+        """
+        convert the state into a numpy vector and a set of masks per slot
+        :return:
+        """
+        pad_v = np.zeros(len(self.DIP_state[self.slots[0]]))
+        slot_len = len(pad_v)
+        general_len = len(self.DIP_state['general']) + len(self.DIP_state['joint'])
+
+        self.DIP_vector = []
+        self.DIP_masks = {}
+        mask_template = [False] * (slot_len * (len(self.slots))) + [True] * general_len
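+        # same layout as DIP_state but without the pad block: [one block per slot | general | joint];
+        # the 'general' mask keeps only the general and joint features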
+        i = 0
+        for slot in self.slots:
+            self.DIP_vector.append(self.DIP_state[slot])
+            self.DIP_masks[slot] = np.array(mask_template)
+            self.DIP_masks[slot][slot_len*i:slot_len*(i+1)] = True
+            i += 1
+        self.DIP_vector.append(self.DIP_state['general'])
+        self.DIP_vector.append(self.DIP_state['joint'])
+        self.DIP_masks['general'] = np.array(mask_template)
+
+        self.DIP_vector = np.concatenate(self.DIP_vector)
+
+    def get_beliefStateVec(self, slot):
+        return self.DIP_vector[self.DIP_masks[slot]]
+
+    def get_DIP_state(self, slot):
+        return np.array([self.DIP_state['general'] + self.DIP_state['joint'] + self.DIP_state[slot]])
+
+    def get_full_DIP_state(self):
+        full_slot_bstate = []
+        for slot in self.slots:
+            full_slot_bstate += self.DIP_state[slot]
+        full_DIP_state = np.array([full_slot_bstate + self.DIP_state['general'] + self.DIP_state['joint']])
+        DIP_mask = [True]*(len(self.DIP_state['general']) + len(self.DIP_state['joint'])) + [False] * len(full_slot_bstate)
+        return full_DIP_state, DIP_mask
+
+    def convert_general_b(self, belief):
+        """
+        Extracts from the belief state the vector corresponding to the general features (e.g. method, user act...)
+        :param belief: The full belief state
+        :return: The general vector
+        """
+        if type(belief) == DialogueState.DialogueState:
+            belief = belief.domainStates[belief.currentdomain]
+
+        #print("BELIEF: ", belief['features'])
+
+        dial_act = list(belief['beliefs']['discourseAct'].values())
+
+        requested = self._get_requested_vector(belief)
+        method = list(belief['beliefs']['method'].values())
+        features = [int(belief['features']['offerHappened']), int(belief['features']['lastActionInformNone']),
+                    int(bool(belief['features']['lastInformedVenue']))]
+        discriminable = [int(x) for x in belief['features']['inform_info']]
+
+        return dial_act + requested + method + features + discriminable + [1.0/len(self.slots)]
+
+    def _get_requested_vector(self, belief):
+        n_requested = sum([x>0.5 for x in list(belief['beliefs']['requested'].values())])
+        ret_vec = [0] * 5
+        if n_requested > 4:
+            n_requested = 4
+        ret_vec[n_requested] = 1.
+        return ret_vec
+
+    def convert_joint_slot_b(self, belief):
+        """
+            Extracts the features for the joint vector of all the slots
+            :param belief: The full belief state
+            :return: The joint slot vector
+            """
+        #ic340 note: this should probably be done with an rnn encoder
+        if type(belief) == DialogueState.DialogueState:
+            belief = belief.domainStates[belief.currentdomain]
+
+        joint_beliefs = []
+        joint_none = 1.
+        informable_beliefs = [copy.deepcopy(belief['beliefs'][x]) for x in list(belief['beliefs'].keys()) if
+                              x in self.slots]  # this might be inefficient
+        for i, b in enumerate(informable_beliefs):
+            joint_none *= b['**NONE**']
+            del b['**NONE**']  # should I put **NONE** prob mass to dontcare?
+            informable_beliefs[i] = sorted([x for x in list(b.values()) if x != 0], reverse=True)[:2]
+            while len(informable_beliefs[i]) < 2:
+                informable_beliefs[i].append(0.)
+        for probs in product(*informable_beliefs):
+            joint_beliefs.append(np.prod(probs))
+        first_joint_beliefs = np.zeros(20)
+        joint_beliefs = joint_beliefs[:20]
+        len_joint_beliefs = len(joint_beliefs)
+        first_joint_beliefs[:len_joint_beliefs] = joint_beliefs
+
+        if sum(first_joint_beliefs) == 0:
+            first_joint_beliefs = list(np.ones(len(first_joint_beliefs)) / len(first_joint_beliefs))
+        else:
+            first_joint_beliefs = list(np.array(first_joint_beliefs) / sum(first_joint_beliefs))  # why normalise?
+
+        # number of slots which are not **NONE**
+        n = 0
+        for key in belief['beliefs']:
+            if key in self.slots:
+                none_val = belief['beliefs'][key]['**NONE**']
+                top_val = np.max(
+                    [belief['beliefs'][key][value] for value in list(belief['beliefs'][key].keys()) if value != '**NONE**'])
+                if top_val > none_val:
+                    n += 1
+        not_none = [0.] * 5
+        if n > 4:
+            n = 4
+        not_none[n] = 1.
+
+        return [joint_none] + first_joint_beliefs + not_none
+
+    def convert_slot_b(self, belief, slot):
+        """
+        Extracts the slot features by padding the distribution vector with 0s.
+        :param belief: The full belief state
+        :return: The slot DIP vector
+        """
+        if type(belief) == DialogueState.DialogueState:
+            belief = belief.domainStates[belief.currentdomain]
+        if self.sortbelief is True:
+            b = [belief['beliefs'][slot]['**NONE**']] + sorted(
+                [belief['beliefs'][slot][value] for value in list(belief['beliefs'][slot].keys()) if value != '**NONE**'],
+                reverse=True) # sorted values
+            b = b[:self.probability_max]
+        else:
+            b = [belief['beliefs'][slot]['**NONE**']] + \
+                [belief['beliefs'][slot][value] for value in list(belief['beliefs'][slot].keys()) if value != '**NONE**'] # unsorted values
+
+        padded_b = np.zeros(self.probability_max)
+        padded_b[0:len(b)] = b
+        return np.array(padded_b)
+
+    def _get_val_dist_in_DB(self, slot):
+        # The entropy of the normalised histogram (|DB(s=v)|/|DB|) \forall v \in V_s
+        values = Ontology.global_ontology.get_informable_slot_values(self.domainString, slot)
+        entities = Ontology.global_ontology.entity_by_features(self.domainString, {})
+        val_dist = np.zeros(len(values))
+        n = 0
+        for ent in entities:
+            if ent[slot] != 'not available':
+                val_dist[values.index(ent[slot])] += 1
+                n += 1
+        return entropy(val_dist/n)
+
+
+def get_test_beliefs():
+    b1 = {'beliefs': {'allowedforkids': {'**NONE**': 0.0,
+   '0': 0.0,
+   '1': 0.0,
+   'dontcare': 1.0},
+  'area': {'**NONE**': 1.0,
+   'alamo square': 0.0,
+   'amanico ergina village': 0.0,
+   'anza vista': 0.0,
+   'ashbury heights': 0.0,
+   'balboa terrace': 0.0,
+   'bayview district': 0.0,
+   'bayview heights': 0.0,
+   'bernal heights': 0.0,
+   'bernal heights north': 0.0,
+   'bernal heights south': 0.0,
+   'buena vista park': 0.0,
+   'castro': 0.0,
+   'cathedral hill': 0.0,
+   'cayuga terrace': 0.0,
+   'central richmond': 0.0,
+   'central sunset': 0.0,
+   'central waterfront': 0.0,
+   'chinatown': 0.0,
+   'civic center': 0.0,
+   'clarendon heights': 0.0,
+   'cole valley': 0.0,
+   'corona heights': 0.0,
+   'cow hollow': 0.0,
+   'crocker amazon': 0.0,
+   'diamond heights': 0.0,
+   'doelger city': 0.0,
+   'dogpatch': 0.0,
+   'dolores heights': 0.0,
+   'dontcare': 0.0,
+   'downtown': 0.0,
+   'duboce triangle': 0.0,
+   'embarcadero': 0.0,
+   'eureka valley': 0.0,
+   'eureka valley dolores heights': 0.0,
+   'excelsior': 0.0,
+   'financial district': 0.0,
+   'financial district south': 0.0,
+   'fishermans wharf': 0.0,
+   'forest hill': 0.0,
+   'forest hill extension': 0.0,
+   'forest knolls': 0.0,
+   'fort mason': 0.0,
+   'fort winfield scott': 0.0,
+   'frederick douglass haynes gardens': 0.0,
+   'friendship village': 0.0,
+   'glen park': 0.0,
+   'glenridge': 0.0,
+   'golden gate heights': 0.0,
+   'golden gate park': 0.0,
+   'haight ashbury': 0.0,
+   'hayes valley': 0.0,
+   'hunters point': 0.0,
+   'india basin': 0.0,
+   'ingleside': 0.0,
+   'ingleside heights': 0.0,
+   'ingleside terrace': 0.0,
+   'inner mission': 0.0,
+   'inner parkside': 0.0,
+   'inner richmond': 0.0,
+   'inner sunset': 0.0,
+   'inset': 0.0,
+   'jordan park': 0.0,
+   'laguna honda': 0.0,
+   'lake': 0.0,
+   'lake shore': 0.0,
+   'lakeside': 0.0,
+   'laurel heights': 0.0,
+   'lincoln park': 0.0,
+   'lincoln park lobos': 0.0,
+   'little hollywood': 0.0,
+   'little italy': 0.0,
+   'little osaka': 0.0,
+   'little russia': 0.0,
+   'lone mountain': 0.0,
+   'lower haight': 0.0,
+   'lower nob hill': 0.0,
+   'lower pacific heights': 0.0,
+   'malcolm x square': 0.0,
+   'marcus garvey square': 0.0,
+   'marina district': 0.0,
+   'martin luther king square': 0.0,
+   'mastro': 0.0,
+   'merced heights': 0.0,
+   'merced manor': 0.0,
+   'midtown terrace': 0.0,
+   'miraloma park': 0.0,
+   'mission bay': 0.0,
+   'mission district': 0.0,
+   'mission dolores': 0.0,
+   'mission terrace': 0.0,
+   'monterey heights': 0.0,
+   'mount davidson manor': 0.0,
+   'nob hill': 0.0,
+   'noe valley': 0.0,
+   'noma': 0.0,
+   'north beach': 0.0,
+   'north panhandle': 0.0,
+   'north park': 0.0,
+   'north waterfront': 0.0,
+   'oceanview': 0.0,
+   'opera plaza': 0.0,
+   'outer mission': 0.0,
+   'outer parkside': 0.0,
+   'outer richmond': 0.0,
+   'outer sunset': 0.0,
+   'outset': 0.0,
+   'pacific heights': 0.0,
+   'panhandle': 0.0,
+   'park merced': 0.0,
+   'parkmerced': 0.0,
+   'parkside': 0.0,
+   'pine lake park': 0.0,
+   'portola': 0.0,
+   'potrero flats': 0.0,
+   'potrero hill': 0.0,
+   'presidio': 0.0,
+   'presidio heights': 0.0,
+   'richmond district': 0.0,
+   'russian hill': 0.0,
+   'saint francis wood': 0.0,
+   'san francisco airport': 0.0,
+   'san francisco state university': 0.0,
+   'sea cliff': 0.0,
+   'sherwood forest': 0.0,
+   'showplace square': 0.0,
+   'silver terrace': 0.0,
+   'somisspo': 0.0,
+   'south basin': 0.0,
+   'south beach': 0.0,
+   'south of market': 0.0,
+   'st francis square': 0.0,
+   'st francis wood': 0.0,
+   'stonestown': 0.0,
+   'sunnydale': 0.0,
+   'sunnyside': 0.0,
+   'sunset district': 0.0,
+   'telegraph hill': 0.0,
+   'tenderloin': 0.0,
+   'thomas paine square': 0.0,
+   'transmission': 0.0,
+   'treasure island': 0.0,
+   'twin peaks': 0.0,
+   'twin peaks west': 0.0,
+   'upper market': 0.0,
+   'van ness': 0.0,
+   'victoria mews': 0.0,
+   'visitacion valley': 0.0,
+   'vista del monte': 0.0,
+   'west of twin peaks': 0.0,
+   'west portal': 0.0,
+   'western addition': 0.0,
+   'westlake and olympic': 0.0,
+   'westwood highlands': 0.0,
+   'westwood park': 0.0,
+   'yerba buena island': 0.0,
+   'zion district': 0.0},
+  'discourseAct': {'ack': 0.0,
+   'bye': 0.0,
+   'hello': 0.0,
+   'none': 1.0,
+   'repeat': 0.0,
+   'silence': 0.0,
+   'thankyou': 0.0},
+  'food': {'**NONE**': 0.0,
+   'afghan': 0.0,
+   'arabian': 0.0,
+   'asian': 0.0,
+   'basque': 0.0,
+   'brasseries': 0.0,
+   'brazilian': 0.0,
+   'buffets': 0.0,
+   'burgers': 0.0,
+   'burmese': 0.0,
+   'cafes': 0.0,
+   'cambodian': 0.0,
+   'cantonese': 1.0,
+   'chinese': 0.0,
+   'comfort food': 0.0,
+   'creperies': 0.0,
+   'dim sum': 0.0,
+   'dontcare': 0.0,
+   'ethiopian': 0.0,
+   'ethnic food': 0.0,
+   'french': 0.0,
+   'gluten free': 0.0,
+   'himalayan': 0.0,
+   'indian': 0.0,
+   'indonesian': 0.0,
+   'indpak': 0.0,
+   'italian': 0.0,
+   'japanese': 0.0,
+   'korean': 0.0,
+   'kosher': 0.0,
+   'latin': 0.0,
+   'lebanese': 0.0,
+   'lounges': 0.0,
+   'malaysian': 0.0,
+   'mediterranean': 0.0,
+   'mexican': 0.0,
+   'middle eastern': 0.0,
+   'modern european': 0.0,
+   'moroccan': 0.0,
+   'new american': 0.0,
+   'pakistani': 0.0,
+   'persian': 0.0,
+   'peruvian': 0.0,
+   'pizza': 0.0,
+   'raw food': 0.0,
+   'russian': 0.0,
+   'sandwiches': 0.0,
+   'sea food': 0.0,
+   'shanghainese': 0.0,
+   'singaporean': 0.0,
+   'soul food': 0.0,
+   'spanish': 0.0,
+   'steak': 0.0,
+   'sushi': 0.0,
+   'taiwanese': 0.0,
+   'tapas': 0.0,
+   'thai': 0.0,
+   'traditionnal american': 0.0,
+   'turkish': 0.0,
+   'vegetarian': 0.0,
+   'vietnamese': 0.0},
+  'goodformeal': {'**NONE**': 0.0,
+   'breakfast': 0.0,
+   'brunch': 0.0,
+   'dinner': 0.0,
+   'dontcare': 1.0,
+   'lunch': 0.0},
+  'method': {'byalternatives': 0.0,
+   'byconstraints': 0.0,
+   'byname': 0.9285714285714286,
+   'finished': 0.0,
+   'none': 0.0714285714285714,
+   'restart': 0.0},
+  'name': {'**NONE**': 0.0,
+   'a 16': 0.0,
+   'a la turca restaurant': 0.0,
+   'abacus': 0.0,
+   'alamo square seafood grill': 0.0,
+   'albona ristorante istriano': 0.0,
+   'alborz persian cuisine': 0.0,
+   'allegro romano': 0.0,
+   'amarena': 0.0,
+   'amber india': 0.0,
+   'ame': 0.0,
+   'ananda fuara': 0.0,
+   'anchor oyster bar': 0.0,
+   'angkor borei restaurant': 0.0,
+   'aperto restaurant': 0.0,
+   'ar roi restaurant': 0.0,
+   'arabian nights restaurant': 0.0,
+   'assab eritrean restaurant': 0.0,
+   'atelier crenn': 0.0,
+   'aux delices restaurant': 0.0,
+   'aziza': 0.0,
+   'b star bar': 0.0,
+   'bar crudo': 0.0,
+   'beijing restaurant': 0.0,
+   'bella trattoria': 0.0,
+   'benu': 0.0,
+   'betelnut': 0.0,
+   'bistro central parc': 0.0,
+   'bix': 0.0,
+   'borgo': 0.0,
+   'borobudur restaurant': 0.0,
+   'bouche': 0.0,
+   'boulevard': 0.0,
+   'brothers restaurant': 0.0,
+   'bund shanghai restaurant': 0.0,
+   'burma superstar': 0.0,
+   'butterfly': 0.0,
+   'cafe claude': 0.0,
+   'cafe jacqueline': 0.0,
+   'campton place restaurant': 0.0,
+   'canteen': 0.0,
+   'canto do brasil restaurant': 0.0,
+   'capannina': 0.0,
+   'capital restaurant': 0.0,
+   'chai yo thai restaurant': 0.0,
+   'chaya brasserie': 0.0,
+   'chenery park': 0.0,
+   'chez maman': 0.0,
+   'chez papa bistrot': 0.0,
+   'chez spencer': 0.0,
+   'chiaroscuro': 0.0,
+   'chouchou': 0.0,
+   'chow': 0.0,
+   'city view restaurant': 0.0,
+   'claudine': 0.0,
+   'coi': 0.0,
+   'colibri mexican bistro': 0.0,
+   'coqueta': 0.0,
+   'crustacean restaurant': 0.0,
+   'da flora a venetian osteria': 0.0,
+   'darbar restaurant': 0.0,
+   'delancey street restaurant': 0.0,
+   'delfina': 0.0,
+   'dong baek restaurant': 0.0,
+   'dontcare': 0.0,
+   'dosa on fillmore': 0.0,
+   'dosa on valencia': 0.0,
+   'eiji': 0.0,
+   'enjoy vegetarian restaurant': 0.0,
+   'espetus churrascaria': 0.0,
+   'fang': 0.0,
+   'farallon': 0.0,
+   'fattoush restaurant': 0.0,
+   'fifth floor': 0.0,
+   'fino restaurant': 0.0,
+   'firefly': 0.0,
+   'firenze by night ristorante': 0.0,
+   'fleur de lys': 0.0,
+   'fog harbor fish house': 0.0,
+   'forbes island': 0.0,
+   'foreign cinema': 0.0,
+   'frances': 0.0,
+   'franchino': 0.0,
+   'franciscan crab restaurant': 0.0,
+   'frascati': 0.0,
+   'fresca': 0.0,
+   'fringale': 0.0,
+   'fujiyama ya japanese restaurant': 0.0,
+   'gajalee': 0.0,
+   'gamine': 0.0,
+   'garcon restaurant': 0.0,
+   'gary danko': 0.0,
+   'gitane': 0.0,
+   'golden era restaurant': 0.0,
+   'gracias madre': 0.0,
+   'great eastern restaurant': 1.0,
+   'hakka restaurant': 0.0,
+   'hakkasan': 0.0,
+   'han second kwan': 0.0,
+   'heirloom cafe': 0.0,
+   'helmand palace': 0.0,
+   'hi dive': 0.0,
+   'hillside supper club': 0.0,
+   'hillstone': 0.0,
+   'hong kong clay pot restaurant': 0.0,
+   'house of nanking': 0.0,
+   'house of prime rib': 0.0,
+   'hunan homes restaurant': 0.0,
+   'incanto': 0.0,
+   'isa': 0.0,
+   'jannah': 0.0,
+   'jasmine garden': 0.0,
+   'jitlada thai cuisine': 0.0,
+   'kappa japanese restaurant': 0.0,
+   'kim thanh restaurant': 0.0,
+   'kirin chinese restaurant': 0.0,
+   'kiss seafood': 0.0,
+   'kokkari estiatorio': 0.0,
+   'la briciola': 0.0,
+   'la ciccia': 0.0,
+   'la folie': 0.0,
+   'la mediterranee': 0.0,
+   'la traviata': 0.0,
+   'lahore karahi': 0.0,
+   'lavash': 0.0,
+   'le charm': 0.0,
+   'le colonial': 0.0,
+   'le soleil': 0.0,
+   'lime tree southeast asian kitchen': 0.0,
+   'little delhi': 0.0,
+   'little nepal': 0.0,
+   'luce': 0.0,
+   'lucky creation restaurant': 0.0,
+   'luella': 0.0,
+   'lupa': 0.0,
+   'm y china': 0.0,
+   'maki restaurant': 0.0,
+   'mangia tutti ristorante': 0.0,
+   'manna': 0.0,
+   'marlowe': 0.0,
+   'marnee thai': 0.0,
+   'maverick': 0.0,
+   'mela tandoori kitchen': 0.0,
+   'mescolanza': 0.0,
+   'mezes': 0.0,
+   'michael mina restaurant': 0.0,
+   'millennium': 0.0,
+   'minako organic japanese restaurant': 0.0,
+   'minami restaurant': 0.0,
+   'mission chinese food': 0.0,
+   'mochica': 0.0,
+   'modern thai': 0.0,
+   'mona lisa restaurant': 0.0,
+   'mozzeria': 0.0,
+   'muguboka restaurant': 0.0,
+   'my tofu house': 0.0,
+   'nicaragua restaurant': 0.0,
+   'nob hill cafe': 0.0,
+   'nopa': 0.0,
+   'old jerusalem restaurant': 0.0,
+   'old skool cafe': 0.0,
+   'one market restaurant': 0.0,
+   'orexi': 0.0,
+   'original us restaurant': 0.0,
+   'osha thai': 0.0,
+   'oyaji restaurant': 0.0,
+   'ozumo': 0.0,
+   'pad thai restaurant': 0.0,
+   'panta rei restaurant': 0.0,
+   'park tavern': 0.0,
+   'pera': 0.0,
+   'piperade': 0.0,
+   'ploy 2': 0.0,
+   'poc chuc': 0.0,
+   'poesia': 0.0,
+   'prospect': 0.0,
+   'quince': 0.0,
+   'radius san francisco': 0.0,
+   'range': 0.0,
+   'red door cafe': 0.0,
+   'restaurant ducroix': 0.0,
+   'ristorante bacco': 0.0,
+   'ristorante ideale': 0.0,
+   'ristorante milano': 0.0,
+   'ristorante parma': 0.0,
+   'rn74': 0.0,
+   'rue lepic': 0.0,
+   'saha': 0.0,
+   'sai jai thai restaurant': 0.0,
+   'salt house': 0.0,
+   'san tung chinese restaurant': 0.0,
+   'san wang restaurant': 0.0,
+   'sanjalisco': 0.0,
+   'sanraku': 0.0,
+   'seasons': 0.0,
+   'seoul garden': 0.0,
+   'seven hills': 0.0,
+   'shangri la vegetarian restaurant': 0.0,
+   'singapore malaysian restaurant': 0.0,
+   'skool': 0.0,
+   'so': 0.0,
+   'sotto mare': 0.0,
+   'source': 0.0,
+   'specchio ristorante': 0.0,
+   'spruce': 0.0,
+   'straits restaurant': 0.0,
+   'stroganoff restaurant': 0.0,
+   'sunflower potrero hill': 0.0,
+   'sushi bistro': 0.0,
+   'taiwan restaurant': 0.0,
+   'tanuki restaurant': 0.0,
+   'tataki': 0.0,
+   'tekka japanese restaurant': 0.0,
+   'thai cottage restaurant': 0.0,
+   'thai house express': 0.0,
+   'thai idea vegetarian': 0.0,
+   'thai time restaurant': 0.0,
+   'thanh long': 0.0,
+   'the big 4 restaurant': 0.0,
+   'the blue plate': 0.0,
+   'the house': 0.0,
+   'the richmond': 0.0,
+   'the slanted door': 0.0,
+   'the stinking rose': 0.0,
+   'thep phanom thai restaurant': 0.0,
+   'tommys joynt': 0.0,
+   'toraya japanese restaurant': 0.0,
+   'town hall': 0.0,
+   'trattoria contadina': 0.0,
+   'tu lan': 0.0,
+   'tuba restaurant': 0.0,
+   'u lee restaurant': 0.0,
+   'udupi palace': 0.0,
+   'venticello ristorante': 0.0,
+   'vicoletto': 0.0,
+   'yank sing': 0.0,
+   'yummy yummy': 0.0,
+   'z and y restaurant': 0.0,
+   'zadin': 0.0,
+   'zare at fly trap': 0.0,
+   'zarzuela': 0.0,
+   'zen yai thai restaurant': 0.0,
+   'zuni cafe': 0.0,
+   'zushi puzzle': 0.0},
+  'near': {'**NONE**': 0.0,
+   'bayview hunters point': 0.0,
+   'dontcare': 1.0,
+   'haight': 0.0,
+   'japantown': 0.0,
+   'marina cow hollow': 0.0,
+   'mission': 0.0,
+   'nopa': 0.0,
+   'north beach telegraph hill': 0.0,
+   'soma': 0.0,
+   'union square': 0.0},
+  'price': {'**NONE**': 1.0,
+   '10 dollar': 0.0,
+   '10 euro': 0.0,
+   '11 euro': 0.0,
+   '15 euro': 0.0,
+   '18 euro': 0.0,
+   '20 euro': 0.0,
+   '22 euro': 0.0,
+   '25 euro': 0.0,
+   '26 euro': 0.0,
+   '29 euro': 0.0,
+   '37 euro': 0.0,
+   '6': 0.0,
+   '7': 0.0,
+   '9': 0.0,
+   'between 0 and 15 euro': 0.0,
+   'between 10 and 13 euro': 0.0,
+   'between 10 and 15 euro': 0.0,
+   'between 10 and 18 euro': 0.0,
+   'between 10 and 20 euro': 0.0,
+   'between 10 and 23 euro': 0.0,
+   'between 10 and 30 euro': 0.0,
+   'between 11 and 15 euro': 0.0,
+   'between 11 and 18 euro': 0.0,
+   'between 11 and 22 euro': 0.0,
+   'between 11 and 25 euro': 0.0,
+   'between 11 and 29 euro': 0.0,
+   'between 11 and 35 euro': 0.0,
+   'between 13 and 15 euro': 0.0,
+   'between 13 and 18 euro': 0.0,
+   'between 13 and 24 euro': 0.0,
+   'between 15 and 18 euro': 0.0,
+   'between 15 and 22 euro': 0.0,
+   'between 15 and 26 euro': 0.0,
+   'between 15 and 29 euro': 0.0,
+   'between 15 and 33 euro': 0.0,
+   'between 15 and 44 euro': 0.0,
+   'between 15 and 58 euro': 0.0,
+   'between 18 and 26 euro': 0.0,
+   'between 18 and 29 euro': 0.0,
+   'between 18 and 44 euro': 0.0,
+   'between 18 and 55 euro': 0.0,
+   'between 18 and 58 euro': 0.0,
+   'between 18 and 73 euro': 0.0,
+   'between 18 and 78 euro': 0.0,
+   'between 2 and 15 euro': 0.0,
+   'between 20 and 30 euro': 0.0,
+   'between 21 and 23 euro': 0.0,
+   'between 22 and 29 euro': 0.0,
+   'between 22 and 30 dollar': 0.0,
+   'between 22 and 37 euro': 0.0,
+   'between 22 and 58 euro': 0.0,
+   'between 22 and 73 euro': 0.0,
+   'between 23 and 29': 0.0,
+   'between 23 and 29 euro': 0.0,
+   'between 23 and 37 euro': 0.0,
+   'between 23 and 58': 0.0,
+   'between 23 and 58 euro': 0.0,
+   'between 26 and 33 euro': 0.0,
+   'between 26 and 34 euro': 0.0,
+   'between 26 and 37 euro': 0.0,
+   'between 29 and 37 euro': 0.0,
+   'between 29 and 44 euro': 0.0,
+   'between 29 and 58 euro': 0.0,
+   'between 29 and 73 euro': 0.0,
+   'between 30 and 58': 0.0,
+   'between 30 and 58 euro': 0.0,
+   'between 31 and 50 euro': 0.0,
+   'between 37 and 110 euro': 0.0,
+   'between 37 and 44 euro': 0.0,
+   'between 37 and 58 euro': 0.0,
+   'between 4 and 22 euro': 0.0,
+   'between 4 and 58 euro': 0.0,
+   'between 5 an 30 euro': 0.0,
+   'between 5 and 10 euro': 0.0,
+   'between 5 and 11 euro': 0.0,
+   'between 5 and 15 dollar': 0.0,
+   'between 5 and 20 euro': 0.0,
+   'between 5 and 25 euro': 0.0,
+   'between 6 and 10 euro': 0.0,
+   'between 6 and 11 euro': 0.0,
+   'between 6 and 15 euro': 0.0,
+   'between 6 and 29 euro': 0.0,
+   'between 7 and 11 euro': 0.0,
+   'between 7 and 13 euro': 0.0,
+   'between 7 and 15 euro': 0.0,
+   'between 7 and 37 euro': 0.0,
+   'between 8 and 22 euro': 0.0,
+   'between 9 and 13 dolllar': 0.0,
+   'between 9 and 15 euro': 0.0,
+   'between 9 and 58 euro': 0.0,
+   'bteween 11 and 15 euro': 0.0,
+   'bteween 15 and 22 euro': 0.0,
+   'bteween 22 and 37': 0.0,
+   'bteween 30 and 58 euro': 0.0,
+   'bteween 51 and 73 euro': 0.0,
+   'netween 20 and 30 euro': 0.0},
+  'pricerange': {'**NONE**': 1.0,
+   'cheap': 0.0,
+   'dontcare': 0.0,
+   'expensive': 0.0,
+   'moderate': 0.0},
+  'requested': {'addr': 1.0,
+   'allowedforkids': 0.0,
+   'area': 0.0,
+   'food': 0.0,
+   'goodformeal': 0.0,
+   'name': 0.0,
+   'near': 0.0,
+   'phone': 1.0,
+   'postcode': 0.0,
+   'price': 0.0,
+   'pricerange': 0.0}},
+ 'features': {'inform_info': [False,
+   False,
+   True,
+   False,
+   True,
+   False,
+   False,
+   True,
+   False,
+   True,
+   False,
+   False,
+   True,
+   False,
+   True,
+   False,
+   False,
+   True,
+   False,
+   True,
+   False,
+   False,
+   True,
+   False,
+   True],
+  'informedVenueSinceNone': ['great eastern restaurant',
+   'great eastern restaurant'],
+  'lastActionInformNone': False,
+  'lastInformedVenue': 'great eastern restaurant',
+  'offerHappened': False},
+ 'userActs': [('request(name="great eastern restaurant",phone)', 1.0)]}
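+    # b2: SFRestaurants belief mid-dialogue -- the user has informed allowedforkids=1 (p~0.96),
+    # 'near' mass sits mostly on 'nopa', and pricerange leans towards 'dontcare'.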
+    b2 = {'beliefs': {'allowedforkids': {'**NONE**': 0.014367834316388661,
+   '0': 0.009175995595522114,
+   '1': 0.9579333306577846,
+   'dontcare': 0.01852283943030468},
+  'area': {'**NONE**': 0.9753165718480455,
+   'alamo square': 0.0,
+   'amanico ergina village': 0.0,
+   'anza vista': 0.0,
+   'ashbury heights': 0.0,
+   'balboa terrace': 0.0,
+   'bayview district': 0.0,
+   'bayview heights': 0.0,
+   'bernal heights': 0.0,
+   'bernal heights north': 0.0,
+   'bernal heights south': 0.0,
+   'buena vista park': 0.0,
+   'castro': 0.0,
+   'cathedral hill': 0.0,
+   'cayuga terrace': 0.0,
+   'central richmond': 0.0,
+   'central sunset': 0.0,
+   'central waterfront': 0.0,
+   'chinatown': 0.0,
+   'civic center': 0.0,
+   'clarendon heights': 0.0,
+   'cole valley': 0.0,
+   'corona heights': 0.0,
+   'cow hollow': 0.0,
+   'crocker amazon': 0.0,
+   'diamond heights': 0.0,
+   'doelger city': 0.0,
+   'dogpatch': 0.0,
+   'dolores heights': 0.0,
+   'dontcare': 0.0,
+   'downtown': 0.0,
+   'duboce triangle': 0.0,
+   'embarcadero': 0.0,
+   'eureka valley': 0.0,
+   'eureka valley dolores heights': 0.0,
+   'excelsior': 0.0,
+   'financial district': 0.0,
+   'financial district south': 0.0,
+   'fishermans wharf': 0.0,
+   'forest hill': 0.0,
+   'forest hill extension': 0.0,
+   'forest knolls': 0.0,
+   'fort mason': 0.0,
+   'fort winfield scott': 0.0,
+   'frederick douglass haynes gardens': 0.0,
+   'friendship village': 0.0,
+   'glen park': 0.0,
+   'glenridge': 0.0,
+   'golden gate heights': 0.0,
+   'golden gate park': 0.0,
+   'haight ashbury': 0.0,
+   'hayes valley': 0.0,
+   'hunters point': 0.0,
+   'india basin': 0.0,
+   'ingleside': 0.0,
+   'ingleside heights': 0.0,
+   'ingleside terrace': 0.0,
+   'inner mission': 0.0,
+   'inner parkside': 0.0,
+   'inner richmond': 0.0,
+   'inner sunset': 0.0,
+   'inset': 0.0,
+   'jordan park': 0.0,
+   'laguna honda': 0.0,
+   'lake': 0.0,
+   'lake shore': 0.0,
+   'lakeside': 0.0,
+   'laurel heights': 0.0,
+   'lincoln park': 0.0,
+   'lincoln park lobos': 0.0,
+   'little hollywood': 0.0,
+   'little italy': 0.0,
+   'little osaka': 0.0,
+   'little russia': 0.0,
+   'lone mountain': 0.0,
+   'lower haight': 0.0,
+   'lower nob hill': 0.0,
+   'lower pacific heights': 0.0,
+   'malcolm x square': 0.0,
+   'marcus garvey square': 0.0,
+   'marina district': 0.0,
+   'martin luther king square': 0.0,
+   'mastro': 0.0,
+   'merced heights': 0.0,
+   'merced manor': 0.0,
+   'midtown terrace': 0.0,
+   'miraloma park': 0.0,
+   'mission bay': 0.0,
+   'mission district': 0.0,
+   'mission dolores': 0.0,
+   'mission terrace': 0.0,
+   'monterey heights': 0.0,
+   'mount davidson manor': 0.0,
+   'nob hill': 0.0,
+   'noe valley': 0.0,
+   'noma': 0.0,
+   'north beach': 0.0,
+   'north panhandle': 0.0,
+   'north park': 0.0,
+   'north waterfront': 0.0,
+   'oceanview': 0.0,
+   'opera plaza': 0.0,
+   'outer mission': 0.0,
+   'outer parkside': 0.0,
+   'outer richmond': 0.0,
+   'outer sunset': 0.0,
+   'outset': 0.0,
+   'pacific heights': 0.0,
+   'panhandle': 0.0,
+   'park merced': 0.0,
+   'parkmerced': 0.0,
+   'parkside': 0.0,
+   'pine lake park': 0.0,
+   'portola': 0.0,
+   'potrero flats': 0.0,
+   'potrero hill': 0.0,
+   'presidio': 0.0,
+   'presidio heights': 0.0,
+   'richmond district': 0.0,
+   'russian hill': 0.0,
+   'saint francis wood': 0.0,
+   'san francisco airport': 0.0,
+   'san francisco state university': 0.0,
+   'sea cliff': 0.0,
+   'sherwood forest': 0.0,
+   'showplace square': 0.0,
+   'silver terrace': 0.0,
+   'somisspo': 0.0,
+   'south basin': 0.0,
+   'south beach': 0.0,
+   'south of market': 0.0,
+   'st francis square': 0.0,
+   'st francis wood': 0.0,
+   'stonestown': 0.024683428151954484,
+   'sunnydale': 0.0,
+   'sunnyside': 0.0,
+   'sunset district': 0.0,
+   'telegraph hill': 0.0,
+   'tenderloin': 0.0,
+   'thomas paine square': 0.0,
+   'transmission': 0.0,
+   'treasure island': 0.0,
+   'twin peaks': 0.0,
+   'twin peaks west': 0.0,
+   'upper market': 0.0,
+   'van ness': 0.0,
+   'victoria mews': 0.0,
+   'visitacion valley': 0.0,
+   'vista del monte': 0.0,
+   'west of twin peaks': 0.0,
+   'west portal': 0.0,
+   'western addition': 0.0,
+   'westlake and olympic': 0.0,
+   'westwood highlands': 0.0,
+   'westwood park': 0.0,
+   'yerba buena island': 0.0,
+   'zion district': 0.0},
+  'discourseAct': {'ack': 0.0,
+   'bye': 0.0,
+   'hello': 0.0,
+   'none': 0.9999999999999998,
+   'repeat': 0.0,
+   'silence': 0.0,
+   'thankyou': 0.0},
+  'food': {'**NONE**': 1.0,
+   'afghan': 0.0,
+   'arabian': 0.0,
+   'asian': 0.0,
+   'basque': 0.0,
+   'brasseries': 0.0,
+   'brazilian': 0.0,
+   'buffets': 0.0,
+   'burgers': 0.0,
+   'burmese': 0.0,
+   'cafes': 0.0,
+   'cambodian': 0.0,
+   'cantonese': 0.0,
+   'chinese': 0.0,
+   'comfort food': 0.0,
+   'creperies': 0.0,
+   'dim sum': 0.0,
+   'dontcare': 0.0,
+   'ethiopian': 0.0,
+   'ethnic food': 0.0,
+   'french': 0.0,
+   'gluten free': 0.0,
+   'himalayan': 0.0,
+   'indian': 0.0,
+   'indonesian': 0.0,
+   'indpak': 0.0,
+   'italian': 0.0,
+   'japanese': 0.0,
+   'korean': 0.0,
+   'kosher': 0.0,
+   'latin': 0.0,
+   'lebanese': 0.0,
+   'lounges': 0.0,
+   'malaysian': 0.0,
+   'mediterranean': 0.0,
+   'mexican': 0.0,
+   'middle eastern': 0.0,
+   'modern european': 0.0,
+   'moroccan': 0.0,
+   'new american': 0.0,
+   'pakistani': 0.0,
+   'persian': 0.0,
+   'peruvian': 0.0,
+   'pizza': 0.0,
+   'raw food': 0.0,
+   'russian': 0.0,
+   'sandwiches': 0.0,
+   'sea food': 0.0,
+   'shanghainese': 0.0,
+   'singaporean': 0.0,
+   'soul food': 0.0,
+   'spanish': 0.0,
+   'steak': 0.0,
+   'sushi': 0.0,
+   'taiwanese': 0.0,
+   'tapas': 0.0,
+   'thai': 0.0,
+   'traditionnal american': 0.0,
+   'turkish': 0.0,
+   'vegetarian': 0.0,
+   'vietnamese': 0.0},
+  'goodformeal': {'**NONE**': 1.0,
+   'breakfast': 0.0,
+   'brunch': 0.0,
+   'dinner': 0.0,
+   'dontcare': 0.0,
+   'lunch': 0.0},
+  'method': {'byalternatives': 0.0,
+   'byconstraints': 0.7725475751076113,
+   'byname': 0.0,
+   'finished': 0.0,
+   'none': 0.0,
+   'restart': 0.0},
+  'name': {'**NONE**': 1.0,
+   'a 16': 0.0,
+   'a la turca restaurant': 0.0,
+   'abacus': 0.0,
+   'alamo square seafood grill': 0.0,
+   'albona ristorante istriano': 0.0,
+   'alborz persian cuisine': 0.0,
+   'allegro romano': 0.0,
+   'amarena': 0.0,
+   'amber india': 0.0,
+   'ame': 0.0,
+   'ananda fuara': 0.0,
+   'anchor oyster bar': 0.0,
+   'angkor borei restaurant': 0.0,
+   'aperto restaurant': 0.0,
+   'ar roi restaurant': 0.0,
+   'arabian nights restaurant': 0.0,
+   'assab eritrean restaurant': 0.0,
+   'atelier crenn': 0.0,
+   'aux delices restaurant': 0.0,
+   'aziza': 0.0,
+   'b star bar': 0.0,
+   'bar crudo': 0.0,
+   'beijing restaurant': 0.0,
+   'bella trattoria': 0.0,
+   'benu': 0.0,
+   'betelnut': 0.0,
+   'bistro central parc': 0.0,
+   'bix': 0.0,
+   'borgo': 0.0,
+   'borobudur restaurant': 0.0,
+   'bouche': 0.0,
+   'boulevard': 0.0,
+   'brothers restaurant': 0.0,
+   'bund shanghai restaurant': 0.0,
+   'burma superstar': 0.0,
+   'butterfly': 0.0,
+   'cafe claude': 0.0,
+   'cafe jacqueline': 0.0,
+   'campton place restaurant': 0.0,
+   'canteen': 0.0,
+   'canto do brasil restaurant': 0.0,
+   'capannina': 0.0,
+   'capital restaurant': 0.0,
+   'chai yo thai restaurant': 0.0,
+   'chaya brasserie': 0.0,
+   'chenery park': 0.0,
+   'chez maman': 0.0,
+   'chez papa bistrot': 0.0,
+   'chez spencer': 0.0,
+   'chiaroscuro': 0.0,
+   'chouchou': 0.0,
+   'chow': 0.0,
+   'city view restaurant': 0.0,
+   'claudine': 0.0,
+   'coi': 0.0,
+   'colibri mexican bistro': 0.0,
+   'coqueta': 0.0,
+   'crustacean restaurant': 0.0,
+   'da flora a venetian osteria': 0.0,
+   'darbar restaurant': 0.0,
+   'delancey street restaurant': 0.0,
+   'delfina': 0.0,
+   'dong baek restaurant': 0.0,
+   'dosa on fillmore': 0.0,
+   'dosa on valencia': 0.0,
+   'eiji': 0.0,
+   'enjoy vegetarian restaurant': 0.0,
+   'espetus churrascaria': 0.0,
+   'fang': 0.0,
+   'farallon': 0.0,
+   'fattoush restaurant': 0.0,
+   'fifth floor': 0.0,
+   'fino restaurant': 0.0,
+   'firefly': 0.0,
+   'firenze by night ristorante': 0.0,
+   'fleur de lys': 0.0,
+   'fog harbor fish house': 0.0,
+   'forbes island': 0.0,
+   'foreign cinema': 0.0,
+   'frances': 0.0,
+   'franchino': 0.0,
+   'franciscan crab restaurant': 0.0,
+   'frascati': 0.0,
+   'fresca': 0.0,
+   'fringale': 0.0,
+   'fujiyama ya japanese restaurant': 0.0,
+   'gajalee': 0.0,
+   'gamine': 0.0,
+   'garcon restaurant': 0.0,
+   'gary danko': 0.0,
+   'gitane': 0.0,
+   'golden era restaurant': 0.0,
+   'gracias madre': 0.0,
+   'great eastern restaurant': 0.0,
+   'hakka restaurant': 0.0,
+   'hakkasan': 0.0,
+   'han second kwan': 0.0,
+   'heirloom cafe': 0.0,
+   'helmand palace': 0.0,
+   'hi dive': 0.0,
+   'hillside supper club': 0.0,
+   'hillstone': 0.0,
+   'hong kong clay pot restaurant': 0.0,
+   'house of nanking': 0.0,
+   'house of prime rib': 0.0,
+   'hunan homes restaurant': 0.0,
+   'incanto': 0.0,
+   'isa': 0.0,
+   'jannah': 0.0,
+   'jasmine garden': 0.0,
+   'jitlada thai cuisine': 0.0,
+   'kappa japanese restaurant': 0.0,
+   'kim thanh restaurant': 0.0,
+   'kirin chinese restaurant': 0.0,
+   'kiss seafood': 0.0,
+   'kokkari estiatorio': 0.0,
+   'la briciola': 0.0,
+   'la ciccia': 0.0,
+   'la folie': 0.0,
+   'la mediterranee': 0.0,
+   'la traviata': 0.0,
+   'lahore karahi': 0.0,
+   'lavash': 0.0,
+   'le charm': 0.0,
+   'le colonial': 0.0,
+   'le soleil': 0.0,
+   'lime tree southeast asian kitchen': 0.0,
+   'little delhi': 0.0,
+   'little nepal': 0.0,
+   'luce': 0.0,
+   'lucky creation restaurant': 0.0,
+   'luella': 0.0,
+   'lupa': 0.0,
+   'm y china': 0.0,
+   'maki restaurant': 0.0,
+   'mangia tutti ristorante': 0.0,
+   'manna': 0.0,
+   'marlowe': 0.0,
+   'marnee thai': 0.0,
+   'maverick': 0.0,
+   'mela tandoori kitchen': 0.0,
+   'mescolanza': 0.0,
+   'mezes': 0.0,
+   'michael mina restaurant': 0.0,
+   'millennium': 0.0,
+   'minako organic japanese restaurant': 0.0,
+   'minami restaurant': 0.0,
+   'mission chinese food': 0.0,
+   'mochica': 0.0,
+   'modern thai': 0.0,
+   'mona lisa restaurant': 0.0,
+   'mozzeria': 0.0,
+   'muguboka restaurant': 0.0,
+   'my tofu house': 0.0,
+   'nicaragua restaurant': 0.0,
+   'nob hill cafe': 0.0,
+   'nopa': 0.0,
+   'old jerusalem restaurant': 0.0,
+   'old skool cafe': 0.0,
+   'one market restaurant': 0.0,
+   'orexi': 0.0,
+   'original us restaurant': 0.0,
+   'osha thai': 0.0,
+   'oyaji restaurant': 0.0,
+   'ozumo': 0.0,
+   'pad thai restaurant': 0.0,
+   'panta rei restaurant': 0.0,
+   'park tavern': 0.0,
+   'pera': 0.0,
+   'piperade': 0.0,
+   'ploy 2': 0.0,
+   'poc chuc': 0.0,
+   'poesia': 0.0,
+   'prospect': 0.0,
+   'quince': 0.0,
+   'radius san francisco': 0.0,
+   'range': 0.0,
+   'red door cafe': 0.0,
+   'restaurant ducroix': 0.0,
+   'ristorante bacco': 0.0,
+   'ristorante ideale': 0.0,
+   'ristorante milano': 0.0,
+   'ristorante parma': 0.0,
+   'rn74': 0.0,
+   'rue lepic': 0.0,
+   'saha': 0.0,
+   'sai jai thai restaurant': 0.0,
+   'salt house': 0.0,
+   'san tung chinese restaurant': 0.0,
+   'san wang restaurant': 0.0,
+   'sanjalisco': 0.0,
+   'sanraku': 0.0,
+   'seasons': 0.0,
+   'seoul garden': 0.0,
+   'seven hills': 0.0,
+   'shangri la vegetarian restaurant': 0.0,
+   'singapore malaysian restaurant': 0.0,
+   'skool': 0.0,
+   'so': 0.0,
+   'sotto mare': 0.0,
+   'source': 0.0,
+   'specchio ristorante': 0.0,
+   'spruce': 0.0,
+   'straits restaurant': 0.0,
+   'stroganoff restaurant': 0.0,
+   'sunflower potrero hill': 0.0,
+   'sushi bistro': 0.0,
+   'taiwan restaurant': 0.0,
+   'tanuki restaurant': 0.0,
+   'tataki': 0.0,
+   'tekka japanese restaurant': 0.0,
+   'thai cottage restaurant': 0.0,
+   'thai house express': 0.0,
+   'thai idea vegetarian': 0.0,
+   'thai time restaurant': 0.0,
+   'thanh long': 0.0,
+   'the big 4 restaurant': 0.0,
+   'the blue plate': 0.0,
+   'the house': 0.0,
+   'the richmond': 0.0,
+   'the slanted door': 0.0,
+   'the stinking rose': 0.0,
+   'thep phanom thai restaurant': 0.0,
+   'tommys joynt': 0.0,
+   'toraya japanese restaurant': 0.0,
+   'town hall': 0.0,
+   'trattoria contadina': 0.0,
+   'tu lan': 0.0,
+   'tuba restaurant': 0.0,
+   'u lee restaurant': 0.0,
+   'udupi palace': 0.0,
+   'venticello ristorante': 0.0,
+   'vicoletto': 0.0,
+   'yank sing': 0.0,
+   'yummy yummy': 0.0,
+   'z and y restaurant': 0.0,
+   'zadin': 0.0,
+   'zare at fly trap': 0.0,
+   'zarzuela': 0.0,
+   'zen yai thai restaurant': 0.0,
+   'zuni cafe': 0.0,
+   'zushi puzzle': 0.0},
+  'near': {'**NONE**': 0.13300733496332517,
+   'bayview hunters point': 0.0,
+   'dontcare': 0.15859820700896493,
+   'haight': 0.0,
+   'japantown': 0.038712306438467806,
+   'marina cow hollow': 0.0,
+   'mission': 0.0,
+   'nopa': 0.669682151589242,
+   'north beach telegraph hill': 0.0,
+   'soma': 0.0,
+   'union square': 0.0},
+  'price': {'**NONE**': 1.0,
+   '10 dollar': 0.0,
+   '10 euro': 0.0,
+   '11 euro': 0.0,
+   '15 euro': 0.0,
+   '18 euro': 0.0,
+   '20 euro': 0.0,
+   '22 euro': 0.0,
+   '25 euro': 0.0,
+   '26 euro': 0.0,
+   '29 euro': 0.0,
+   '37 euro': 0.0,
+   '6': 0.0,
+   '7': 0.0,
+   '9': 0.0,
+   'between 0 and 15 euro': 0.0,
+   'between 10 and 13 euro': 0.0,
+   'between 10 and 15 euro': 0.0,
+   'between 10 and 18 euro': 0.0,
+   'between 10 and 20 euro': 0.0,
+   'between 10 and 23 euro': 0.0,
+   'between 10 and 30 euro': 0.0,
+   'between 11 and 15 euro': 0.0,
+   'between 11 and 18 euro': 0.0,
+   'between 11 and 22 euro': 0.0,
+   'between 11 and 25 euro': 0.0,
+   'between 11 and 29 euro': 0.0,
+   'between 11 and 35 euro': 0.0,
+   'between 13 and 15 euro': 0.0,
+   'between 13 and 18 euro': 0.0,
+   'between 13 and 24 euro': 0.0,
+   'between 15 and 18 euro': 0.0,
+   'between 15 and 22 euro': 0.0,
+   'between 15 and 26 euro': 0.0,
+   'between 15 and 29 euro': 0.0,
+   'between 15 and 33 euro': 0.0,
+   'between 15 and 44 euro': 0.0,
+   'between 15 and 58 euro': 0.0,
+   'between 18 and 26 euro': 0.0,
+   'between 18 and 29 euro': 0.0,
+   'between 18 and 44 euro': 0.0,
+   'between 18 and 55 euro': 0.0,
+   'between 18 and 58 euro': 0.0,
+   'between 18 and 73 euro': 0.0,
+   'between 18 and 78 euro': 0.0,
+   'between 2 and 15 euro': 0.0,
+   'between 20 and 30 euro': 0.0,
+   'between 21 and 23 euro': 0.0,
+   'between 22 and 29 euro': 0.0,
+   'between 22 and 30 dollar': 0.0,
+   'between 22 and 37 euro': 0.0,
+   'between 22 and 58 euro': 0.0,
+   'between 22 and 73 euro': 0.0,
+   'between 23 and 29': 0.0,
+   'between 23 and 29 euro': 0.0,
+   'between 23 and 37 euro': 0.0,
+   'between 23 and 58': 0.0,
+   'between 23 and 58 euro': 0.0,
+   'between 26 and 33 euro': 0.0,
+   'between 26 and 34 euro': 0.0,
+   'between 26 and 37 euro': 0.0,
+   'between 29 and 37 euro': 0.0,
+   'between 29 and 44 euro': 0.0,
+   'between 29 and 58 euro': 0.0,
+   'between 29 and 73 euro': 0.0,
+   'between 30 and 58': 0.0,
+   'between 30 and 58 euro': 0.0,
+   'between 31 and 50 euro': 0.0,
+   'between 37 and 110 euro': 0.0,
+   'between 37 and 44 euro': 0.0,
+   'between 37 and 58 euro': 0.0,
+   'between 4 and 22 euro': 0.0,
+   'between 4 and 58 euro': 0.0,
+   'between 5 an 30 euro': 0.0,
+   'between 5 and 10 euro': 0.0,
+   'between 5 and 11 euro': 0.0,
+   'between 5 and 15 dollar': 0.0,
+   'between 5 and 20 euro': 0.0,
+   'between 5 and 25 euro': 0.0,
+   'between 6 and 10 euro': 0.0,
+   'between 6 and 11 euro': 0.0,
+   'between 6 and 15 euro': 0.0,
+   'between 6 and 29 euro': 0.0,
+   'between 7 and 11 euro': 0.0,
+   'between 7 and 13 euro': 0.0,
+   'between 7 and 15 euro': 0.0,
+   'between 7 and 37 euro': 0.0,
+   'between 8 and 22 euro': 0.0,
+   'between 9 and 13 dolllar': 0.0,
+   'between 9 and 15 euro': 0.0,
+   'between 9 and 58 euro': 0.0,
+   'bteween 11 and 15 euro': 0.0,
+   'bteween 15 and 22 euro': 0.0,
+   'bteween 22 and 37': 0.0,
+   'bteween 30 and 58 euro': 0.0,
+   'bteween 51 and 73 euro': 0.0,
+   'netween 20 and 30 euro': 0.0},
+  'pricerange': {'**NONE**': 0.22571148184494605,
+   'cheap': 0.0,
+   'dontcare': 0.774288518155054,
+   'expensive': 0.0,
+   'moderate': 0.0},
+  'requested': {'addr': 0.0,
+   'allowedforkids': 0.0,
+   'area': 0.0,
+   'food': 0.0,
+   'goodformeal': 0.0,
+   'name': 0.0,
+   'near': 0.0,
+   'phone': 0.0,
+   'postcode': 0.0,
+   'price': 0.0,
+   'pricerange': 0.0}},
+ 'features': {'inform_info': [False,
+   False,
+   False,
+   True,
+   True,
+   False,
+   False,
+   False,
+   True,
+   True,
+   False,
+   True,
+   False,
+   False,
+   False,
+   False,
+   True,
+   False,
+   False,
+   False,
+   False,
+   True,
+   False,
+   False,
+   False],
+  'informedVenueSinceNone': [],
+  'lastActionInformNone': False,
+  'lastInformedVenue': '',
+  'offerHappened': False},
+ 'userActs': [('inform(allowedforkids="1")', 0.90842356395668944),
+  ('inform(allowedforkids="dontcare")', 0.0091759955955221153),
+  ('inform(allowedforkids="0")', 0.0091759955955221153),
+  ('inform(postcode)', 0.025509267755551478),
+  ('inform(area="stonestown")', 0.024683428151954491),
+  ('null()', 0.023031748944760511)]}
+
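+    # b3: CamRestaurants belief -- food is almost certainly 'mediterranean' (p~0.98),
+    # area leans towards 'centre', pricerange towards 'dontcare'; no venue name yet.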
+    b3 = {'beliefs': {'area': {'**NONE**': 0.12910550615265692,
+   'centre': 0.8338099777773861,
+   'dontcare': 0.0,
+   'east': 0.03708451606995696,
+   'north': 0.0,
+   'south': 0.0,
+   'west': 0.0},
+  'discourseAct': {'ack': 0.0,
+   'bye': 0.0,
+   'hello': 0.0,
+   'none': 1.0,
+   'repeat': 0.0,
+   'silence': 0.0,
+   'thankyou': 0.0},
+  'food': {'**NONE**': 0.020895546925810415,
+   'afghan': 0.0,
+   'african': 0.0,
+   'afternoon tea': 0.0,
+   'asian oriental': 0.0,
+   'australasian': 0.0,
+   'australian': 0.0,
+   'austrian': 0.0,
+   'barbeque': 0.0,
+   'basque': 0.0,
+   'belgian': 0.0,
+   'bistro': 0.0,
+   'brazilian': 0.0,
+   'british': 0.0,
+   'canapes': 0.0,
+   'cantonese': 0.0,
+   'caribbean': 0.0,
+   'catalan': 0.0,
+   'chinese': 0.0,
+   'christmas': 0.0,
+   'corsica': 0.0,
+   'creative': 0.0,
+   'crossover': 0.0,
+   'cuban': 0.0,
+   'danish': 0.0,
+   'dontcare': 0.0,
+   'eastern european': 0.0,
+   'english': 0.0,
+   'eritrean': 0.0,
+   'european': 0.0,
+   'french': 0.0,
+   'fusion': 0.0,
+   'gastropub': 0.0,
+   'german': 0.0,
+   'greek': 0.0,
+   'halal': 0.0,
+   'hungarian': 0.0,
+   'indian': 0.0,
+   'indonesian': 0.0,
+   'international': 0.0,
+   'irish': 0.0,
+   'italian': 0.0,
+   'jamaican': 0.0,
+   'japanese': 0.0,
+   'korean': 0.0,
+   'kosher': 0.0,
+   'latin american': 0.0,
+   'lebanese': 0.0,
+   'light bites': 0.0,
+   'malaysian': 0.0,
+   'mediterranean': 0.9791044530741896,
+   'mexican': 0.0,
+   'middle eastern': 0.0,
+   'modern american': 0.0,
+   'modern eclectic': 0.0,
+   'modern european': 0.0,
+   'modern global': 0.0,
+   'molecular gastronomy': 0.0,
+   'moroccan': 0.0,
+   'new zealand': 0.0,
+   'north african': 0.0,
+   'north american': 0.0,
+   'north indian': 0.0,
+   'northern european': 0.0,
+   'panasian': 0.0,
+   'persian': 0.0,
+   'polish': 0.0,
+   'polynesian': 0.0,
+   'portuguese': 0.0,
+   'romanian': 0.0,
+   'russian': 0.0,
+   'scandinavian': 0.0,
+   'scottish': 0.0,
+   'seafood': 0.0,
+   'singaporean': 0.0,
+   'south african': 0.0,
+   'south indian': 0.0,
+   'spanish': 0.0,
+   'sri lankan': 0.0,
+   'steakhouse': 0.0,
+   'swedish': 0.0,
+   'swiss': 0.0,
+   'thai': 0.0,
+   'the americas': 0.0,
+   'traditional': 0.0,
+   'turkish': 0.0,
+   'tuscan': 0.0,
+   'unusual': 0.0,
+   'vegetarian': 0.0,
+   'venetian': 0.0,
+   'vietnamese': 0.0,
+   'welsh': 0.0,
+   'world': 0.0},
+  'method': {'byalternatives': 0.0,
+   'byconstraints': 0.6359877465366015,
+   'byname': 0.0,
+   'finished': 0.0,
+   'none': 0.0,
+   'restart': 0.0},
+  'name': {'**NONE**': 1.0,
+   'ali baba': 0.0,
+   'anatolia': 0.0,
+   'ask': 0.0,
+   'backstreet bistro': 0.0,
+   'bangkok city': 0.0,
+   'bedouin': 0.0,
+   'bloomsbury restaurant': 0.0,
+   'caffe uno': 0.0,
+   'cambridge lodge restaurant': 0.0,
+   'charlie chan': 0.0,
+   'chiquito restaurant bar': 0.0,
+   'city stop restaurant': 0.0,
+   'clowns cafe': 0.0,
+   'cocum': 0.0,
+   'cote': 0.0,
+   'cotto': 0.0,
+   'curry garden': 0.0,
+   'curry king': 0.0,
+   'curry prince': 0.0,
+   'curry queen': 0.0,
+   'da vince pizzeria': 0.0,
+   'da vinci pizzeria': 0.0,
+   'darrys cookhouse and wine shop': 0.0,
+   'de luca cucina and bar': 0.0,
+   'dojo noodle bar': 0.0,
+   'don pasquale pizzeria': 0.0,
+   'efes restaurant': 0.0,
+   'eraina': 0.0,
+   'fitzbillies restaurant': 0.0,
+   'frankie and bennys': 0.0,
+   'galleria': 0.0,
+   'golden house': 0.0,
+   'golden wok': 0.0,
+   'gourmet burger kitchen': 0.0,
+   'graffiti': 0.0,
+   'grafton hotel restaurant': 0.0,
+   'hakka': 0.0,
+   'hk fusion': 0.0,
+   'hotel du vin and bistro': 0.0,
+   'india house': 0.0,
+   'j restaurant': 0.0,
+   'jinling noodle bar': 0.0,
+   'kohinoor': 0.0,
+   'kymmoy': 0.0,
+   'la margherita': 0.0,
+   'la mimosa': 0.0,
+   'la raza': 0.0,
+   'la tasca': 0.0,
+   'lan hong house': 0.0,
+   'little seoul': 0.0,
+   'loch fyne': 0.0,
+   'mahal of cambridge': 0.0,
+   'maharajah tandoori restaurant': 0.0,
+   'meghna': 0.0,
+   'meze bar restaurant': 0.0,
+   'michaelhouse cafe': 0.0,
+   'midsummer house restaurant': 0.0,
+   'nandos': 0.0,
+   'nandos city centre': 0.0,
+   'panahar': 0.0,
+   'peking restaurant': 0.0,
+   'pipasha restaurant': 0.0,
+   'pizza express': 0.0,
+   'pizza express fen ditton': 0.0,
+   'pizza hut': 0.0,
+   'pizza hut cherry hinton': 0.0,
+   'pizza hut city centre': 0.0,
+   'pizza hut fen ditton': 0.0,
+   'prezzo': 0.0,
+   'rajmahal': 0.0,
+   'restaurant alimentum': 0.0,
+   'restaurant one seven': 0.0,
+   'restaurant two two': 0.0,
+   'rice boat': 0.0,
+   'rice house': 0.0,
+   'riverside brasserie': 0.0,
+   'royal spice': 0.0,
+   'royal standard': 0.0,
+   'saffron brasserie': 0.0,
+   'saigon city': 0.0,
+   'saint johns chop house': 0.0,
+   'sala thong': 0.0,
+   'sesame restaurant and bar': 0.0,
+   'shanghai family restaurant': 0.0,
+   'shiraz restaurant': 0.0,
+   'sitar tandoori': 0.0,
+   'stazione restaurant and coffee bar': 0.0,
+   'taj tandoori': 0.0,
+   'tandoori palace': 0.0,
+   'tang chinese': 0.0,
+   'thanh binh': 0.0,
+   'the cambridge chop house': 0.0,
+   'the copper kettle': 0.0,
+   'the cow pizza kitchen and bar': 0.0,
+   'the gandhi': 0.0,
+   'the gardenia': 0.0,
+   'the golden curry': 0.0,
+   'the good luck chinese food takeaway': 0.0,
+   'the hotpot': 0.0,
+   'the lucky star': 0.0,
+   'the missing sock': 0.0,
+   'the nirala': 0.0,
+   'the oak bistro': 0.0,
+   'the river bar steakhouse and grill': 0.0,
+   'the slug and lettuce': 0.0,
+   'the varsity restaurant': 0.0,
+   'travellers rest': 0.0,
+   'ugly duckling': 0.0,
+   'venue': 0.0,
+   'wagamama': 0.0,
+   'yippee noodle bar': 0.0,
+   'yu garden': 0.0,
+   'zizzi cambridge': 0.0},
+  'pricerange': {'**NONE**': 0.1340777132648503,
+   'cheap': 0.0,
+   'dontcare': 0.8659222867351497,
+   'expensive': 0.0,
+   'moderate': 0.0},
+  'requested': {'addr': 0.0,
+   'area': 0.0,
+   'description': 0.0,
+   'food': 0.0,
+   'name': 0.0,
+   'phone': 0.0,
+   'postcode': 0.0,
+   'pricerange': 0.0,
+   'signature': 0.0}},
+ 'features': {'inform_info': [False,
+   False,
+   True,
+   False,
+   True,
+   False,
+   False,
+   True,
+   False,
+   False,
+   False,
+   False,
+   True,
+   False,
+   False,
+   False,
+   False,
+   True,
+   False,
+   False,
+   False,
+   False,
+   True,
+   False,
+   False],
+  'informedVenueSinceNone': [],
+  'lastActionInformNone': False,
+  'lastInformedVenue': '',
+  'offerHappened': False},
+ 'userActs': [('inform(food="mediterranean")', 0.84415346579983519),
+  ('inform(area="east")', 0.037084516069956962),
+  ('null()', 0.048530354363153554),
+  ('reqmore()', 0.04541708634740408),
+  ('confirm(phone)', 0.024814577419650211)]}
+
+    return b1, b2, b3
+
+
+def main():
+    """
+    unit test
+    :return:
+    """
+
+    Settings.init('config/Tut-gp-Multidomain.cfg', 12345)
+    Ontology.init_global_ontology()
+
+    b1, b2, b3 = get_test_beliefs()
+    '''state1 = DIP_state(b1, domainString='SFRestaurants')
+    state2 = DIP_state(b2, domainString='SFRestaurants')
+    state3 = DIP_state(b3, domainString='CamRestaurants')'''
+    state1 = padded_state(b1, domainString='SFRestaurants')
+    state2 = padded_state(b2, domainString='SFRestaurants')
+    state3 = padded_state(b3, domainString='CamRestaurants')
+    print(state1.get_beliefStateVec('area')[:state1.max_v])
+    print(len(state2.get_beliefStateVec('near'))-state2.max_v)
+    print(len(state3.get_beliefStateVec('pricerange'))-state3.max_v)
+    #print len(state3.get_beliefStateVec('general'))
+    s2 = state2.get_beliefStateVec('food')
+    s3 = state3.get_beliefStateVec('food')
+    a = 1  # no-op placeholder; convenient spot for a debugger breakpoint to inspect s2 and s3
+    #print state3.get_beliefStateVec('general')[:state2.max_v]
+    #print state2.max_v
+    #print state3.max_v
+
+
+if __name__ == '__main__':
+    main()
+
diff --git a/policy/feudalgainRL/DQNPolicy_latest.py b/policy/feudalgainRL/DQNPolicy_latest.py
new file mode 100644
index 0000000000000000000000000000000000000000..559e5bc4f55174efffac683a31cd9c9a6481f198
--- /dev/null
+++ b/policy/feudalgainRL/DQNPolicy_latest.py
@@ -0,0 +1,789 @@
+###############################################################################
+# PyDial: Multi-domain Statistical Spoken Dialogue System Software
+###############################################################################
+#
+# Copyright 2015 - 2019
+# Cambridge University Engineering Department Dialogue Systems Group
+#
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+###############################################################################
+
+'''
+DQNPolicy.py - deep Q network policy
+==================================================
+
+Author: Pei-Hao (Eddy) Su  (Copyright CUED Dialogue Systems Group 2016)
+
+.. seealso:: CUED Imports/Dependencies: 
+
+    import :class:`Policy`
+    import :class:`utils.ContextLogger`
+
+.. warning::
+        Documentation not done.
+
+
+************************
+
+'''
+
+import copy
+import sys
+import os
+import json
+import numpy as np
+import pickle as pickle
+import random
+import utils
+from utils.Settings import config as cfg
+from utils import ContextLogger
+
+import ontology.FlatOntologyManager as FlatOnt
+#from theano_dialogue.util.tool import *
+
+import tensorflow as tf
+from policy.DRL.replay_buffer import ReplayBuffer
+from policy.DRL.replay_prioritised import ReplayPrioritised
+import policy.DRL.utils as drlutils
+import policy.DRL.dqn as dqn
+# aliased so that the Policy.Policy / SummaryAction.SummaryAction references below resolve
+import policy.Policy as Policy
+import policy.SummaryAction as SummaryAction
+from policy.Policy import TerminalAction, TerminalState
+import policy.GPPolicy
+
+logger = utils.ContextLogger.getLogger('')
+
+# --- for flattening the belief --- # 
+domainUtil = FlatOnt.FlatDomainOntology('CamRestaurants')
+
+"""
+def flatten_belief(gpstate):
+    '''
+    Flatten the GP-dictionary-typed belief state to a one-dim vector
+    '''
+
+    if isinstance(gpstate, TerminalState):
+        return [0] * 304 #260 #264
+
+    flat_belief = []
+    for key, value in gpstate._bstate.items():
+        flat_belief += value
+
+    return flat_belief
+"""
+
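+# Flatten the dictionary-typed belief state into a fixed-length vector: the per-slot value
+# distributions ('full'), the 'method', 'discourseAct' and 'requested' distributions, and the
+# 'lastActionInformNone', 'offerHappened' and 'inform_info' features. For CamRestaurants this
+# matches the 260-dimensional input assumed elsewhere in this file.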
+def flatten_belief(belief, domainUtil=FlatOnt.FlatDomainOntology('CamRestaurants'), merge=False):
+    if isinstance(belief, TerminalState):
+        return [0] * 260 #264
+
+    #for key, value in belief.items():
+    #    print key, value
+
+    #policyfeatures = ['full','method','discourseAct','requested']
+    policyfeatures = ['full','method','discourseAct','requested',\
+                'lastActionInformNone','offerHappened','inform_info']
+
+    flat_belief = []
+    for feat in policyfeatures:
+        add_feature = []
+        if feat == 'kbest':
+            # note: this is a module-level function, so there is no self here; use the
+            # domainUtil argument and cap at the top 5 values (the class-level max_k default).
+            for slot in domainUtil.sorted_system_requestable_slots:
+                # print slot, 'belief', belief['beliefs'][slot]
+                temp = [belief['beliefs'][slot][value] for value in domainUtil.ontology['informable'][slot]]
+                temp = sorted(temp, key=lambda b: -b)
+                #temp = [belief['beliefs'][slot]['dontcare']] + [belief['beliefs'][slot]['**NONE**']] + temp
+                temp = temp + [belief['beliefs'][slot]['dontcare']] + [belief['beliefs'][slot]['**NONE**']]
+                temp = temp[0:5]
+                add_feature += temp
+        elif feat == 'full':
+            #for slot in self.sorted_slots:
+            for slot in domainUtil.ontology['informable']:
+                for value in domainUtil.ontology['informable'][slot]:# + ['**NONE**']:
+                #for value in domainUtil.ontology['informable'][slot] + ['**NONE**']:
+                #for value in domainUtil.ontology['informable'][slot] + ['dontcare'] + ['**NONE**']:
+                    add_feature.append(belief['beliefs'][slot][value])
+        elif feat == 'method':
+            add_feature = [belief['beliefs']['method'][method] for method in domainUtil.ontology['method']]
+        elif feat == 'discourseAct':
+            add_feature = [belief['beliefs']['discourseAct'][discourseAct]
+                           for discourseAct in domainUtil.ontology['discourseAct']]
+        elif feat == 'requested':
+            add_feature = [belief['beliefs']['requested'][slot] \
+                            for slot in domainUtil.ontology['requestable']]
+        elif feat == 'lastActionInformNone':
+            add_feature.append(float(belief['features']['lastActionInformNone']))
+        elif feat == 'offerHappened':
+            add_feature.append(float(belief['features']['offerHappened']))
+        elif feat == 'inform_info':
+            add_feature += belief['features']['inform_info']
+        else:
+            logger.error('Invalid feature name in config: ' + feat)
+
+        flat_belief += add_feature
+
+    return flat_belief
+
+    
+    
+    """ 
+    flat_belief = []
+    for feat in policyfeatures:
+        add_feature = []
+        if feat == 'full':
+            #for slot in self.sorted_slots:
+            for slot in domainUtil.ontology['informable']:
+                if slot == 'name':
+                    continue
+                accumProb = 0.0
+                for value in domainUtil.ontology['informable'][slot] + ['**NONE**']:
+                     if value not in ('dontcare', '**NONE**'):
+                        accumProb += float(belief['beliefs'][slot][value])
+                add_feature.append(accumProb)
+                add_feature.append(belief['beliefs'][slot]['dontcare'])
+                add_feature.append(belief['beliefs'][slot]['**NONE**'])
+
+                   #add_feature.append(belief['beliefs'][slot][value])
+        elif feat == 'method':
+            add_feature = [belief['beliefs']['method'][method] \
+                            for method in domainUtil.ontology['method']]
+        elif feat == 'discourseAct':
+            add_feature = [belief['beliefs']['discourseAct'][discourseAct]
+                           for discourseAct in domainUtil.ontology['discourseAct']]
+        elif feat == 'requested':
+            add_feature = [belief['beliefs']['requested'][slot] \
+                            for slot in domainUtil.ontology['requestable']]
+        elif feat == 'lastActionInformNone':
+            add_feature.append(float(belief['features']['lastActionInformNone']))
+        elif feat == 'offerHappened':
+            add_feature.append(float(belief['features']['offerHappened']))
+        elif feat == 'inform_info':
+            add_feature += (belief['features']['inform_info'])
+        else:
+            logger.error('Invalid feature name in config: ' + feat)
+
+        flat_belief += add_feature
+    return flat_belief
+    """ 
+
+
+class DQNPolicy(Policy.Policy):
+    '''Derived from :class:`Policy`
+    '''
+    def __init__(self, in_policy_file, out_policy_file, domainString='CamRestaurants', is_training=False):
+        super(DQNPolicy, self).__init__(domainString, is_training)
+
+        self.in_policy_file = in_policy_file
+        self.out_policy_file = out_policy_file
+        self.is_training = is_training
+        self.accum_belief = []
+        self.stats = [0 for ii in range(14)]
+
+        self.prev_state_check = None
+
+        # parameter settings
+        self.n_in= 260
+        if cfg.has_option('dqnpolicy_'+domainString, 'n_in'):
+            self.n_in = cfg.getint('dqnpolicy_'+domainString, 'n_in')
+
+        self.actor_lr = 0.0001
+        if cfg.has_option('dqnpolicy_'+domainString, 'actor_lr'):
+            self.actor_lr = cfg.getfloat('dqnpolicy_'+domainString, 'actor_lr')
+
+        self.critic_lr = 0.001
+        if cfg.has_option('dqnpolicy_'+domainString, 'critic_lr'):
+            self.critic_lr = cfg.getfloat('dqnpolicy_'+domainString, 'critic_lr')
+
+        self.tau = 0.001
+        if cfg.has_option('dqnpolicy_'+domainString, 'tau'):
+            self.tau = cfg.getfloat('dqnpolicy_'+domainString, 'tau')
+
+        self.randomseed = 1234
+        if cfg.has_option('GENERAL', 'seed'):
+            self.randomseed = cfg.getint('GENERAL', 'seed')
+        
+        self.gamma = 1.0
+        if cfg.has_option('dqnpolicy_'+domainString, 'gamma'):
+            self.gamma = cfg.getfloat('dqnpolicy_'+domainString, 'gamma')
+
+        self.regularisation = 'l2'
+        if cfg.has_option('dqnpolicy_'+domainString, 'regularisation'):
+            self.regularisation = cfg.get('dqnpolicy_'+domainString, 'regularisation')
+
+        self.learning_rate = 0.001
+        if cfg.has_option('dqnpolicy_'+domainString, 'learning_rate'):
+            self.learning_rate = cfg.getfloat('dqnpolicy_'+domainString, 'learning_rate')
+
+        self.exploration_type = 'e-greedy' # Boltzman
+        if cfg.has_option('dqnpolicy_'+domainString, 'exploration_type'):
+            self.exploration_type = cfg.get('dqnpolicy_'+domainString, 'exploration_type')
+
+        self.episodeNum = 1000
+        if cfg.has_option('dqnpolicy_'+domainString, 'episodeNum'):
+            self.episodeNum = cfg.getfloat('dqnpolicy_'+domainString, 'episodeNum')
+
+        self.maxiter = 5000
+        if cfg.has_option('dqnpolicy_'+domainString, 'maxiter'):
+            self.maxiter = cfg.getfloat('dqnpolicy_'+domainString, 'maxiter')
+
+        self.epsilon = 1
+        if cfg.has_option('dqnpolicy_'+domainString, 'epsilon'):
+            self.epsilon = cfg.getfloat('dqnpolicy_'+domainString, 'epsilon')
+        
+        self.epsilon_start = 1
+        if cfg.has_option('dqnpolicy_'+domainString, 'epsilon_start'):
+            self.epsilon_start = cfg.getfloat('dqnpolicy_'+domainString, 'epsilon_start')
+
+        self.epsilon_end = 1
+        if cfg.has_option('dqnpolicy_'+domainString, 'epsilon_end'):
+            self.epsilon_end = cfg.getfloat('dqnpolicy_'+domainString, 'epsilon_end')
+
+        self.priorProbStart = 1.0
+        if cfg.has_option('dqnpolicy_'+domainString, 'prior_sample_prob_start'):
+            self.priorProbStart = cfg.getfloat('dqnpolicy_'+domainString, 'prior_sample_prob_start')
+
+        self.priorProbEnd = 0.1
+        if cfg.has_option('dqnpolicy_'+domainString, 'prior_sample_prob_end'):
+            self.priorProbEnd = cfg.getfloat('dqnpolicy_'+domainString, 'prior_sample_prob_end')
+
+        self.policyfeatures = []
+        if cfg.has_option('dqnpolicy_'+domainString, 'features'):
+            logger.info('Features: ' + str(cfg.get('dqnpolicy_'+domainString, 'features')))
+            self.policyfeatures = json.loads(cfg.get('dqnpolicy_'+domainString, 'features'))
+
+        self.max_k = 5
+        if cfg.has_option('dqnpolicy_'+domainString, 'max_k'):
+            self.max_k = cfg.getint('dqnpolicy_'+domainString, 'max_k')
+
+        self.learning_algorithm = 'drl'
+        if cfg.has_option('dqnpolicy_'+domainString, 'learning_algorithm'):
+            self.learning_algorithm = cfg.get('dqnpolicy_'+domainString, 'learning_algorithm')
+            logger.info('Learning algorithm: ' + self.learning_algorithm)
+
+        self.minibatch_size = 32
+        if cfg.has_option('dqnpolicy_'+domainString, 'minibatch_size'):
+            self.minibatch_size = cfg.getint('dqnpolicy_'+domainString, 'minibatch_size')
+
+        self.capacity = 1000#max(self.minibatch_size, 2000)
+        if cfg.has_option('dqnpolicy_'+domainString, 'capacity'):
+            self.capacity = max(cfg.getint('dqnpolicy_'+domainString,'capacity'), 2000)
+
+        self.replay_type = 'vanilla'
+        if cfg.has_option('dqnpolicy_'+domainString, 'replay_type'):
+            self.replay_type = cfg.get('dqnpolicy_'+domainString, 'replay_type')
+
+        self.architecture = 'vanilla'
+        if cfg.has_option('dqnpolicy_'+domainString, 'architecture'):
+            self.architecture = cfg.get('dqnpolicy_'+domainString, 'architecture')
+
+        self.q_update = 'single'
+        if cfg.has_option('dqnpolicy_'+domainString, 'q_update'):
+            self.q_update = cfg.get('dqnpolicy_'+domainString, 'q_update')
+
+        self.h1_size = 130
+        if cfg.has_option('dqnpolicy_'+domainString, 'h1_size'):
+            self.h1_size = cfg.getint('dqnpolicy_'+domainString, 'h1_size')
+        
+        self.h2_size = 130
+        if cfg.has_option('dqnpolicy_'+domainString, 'h2_size'):
+            self.h2_size = cfg.getint('dqnpolicy_'+domainString, 'h2_size')
+
+        """
+        self.shuffle = False
+        if cfg.has_option('dqnpolicy_'+domainString, 'experience_replay'):
+            self.shuffle = cfg.getboolean('dqnpolicy_'+domainString, 'experience_replay')
+        if not self.shuffle:
+            # If we don't use experience replay, we don't need to maintain
+            # sliding window of experiences with maximum capacity.
+            # We only need to maintain the data of minibatch_size
+            self.capacity = self.minibatch_size
+        """
+
+        self.episode_ave_max_q = []
+
+        os.environ["CUDA_VISIBLE_DEVICES"]=""
+
+        # init session
+        self.sess = tf.Session()
+        with tf.device("/cpu:0"):
+
+            np.random.seed(self.randomseed)
+            tf.set_random_seed(self.randomseed)
+
+            # initialise an replay buffer
+            if self.replay_type == 'vanilla':
+                self.episodes[self.domainString] = ReplayBuffer(self.capacity, self.minibatch_size, self.randomseed)
+            elif self.replay_type == 'prioritized':
+                self.episodes[self.domainString] = ReplayPrioritised(self.capacity, self.minibatch_size, self.randomseed)
+            #replay_buffer = ReplayBuffer(self.capacity, self.randomseed)
+            #self.episodes = []
+            self.samplecount = 0
+            self.episodecount = 0
+
+            # construct the models
+            self.state_dim = self.n_in
+            self.summaryaction = SummaryAction.SummaryAction(domainString)
+            self.action_dim = len(self.summaryaction.action_names)
+            action_bound = len(self.summaryaction.action_names)
+
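+            # Build the Q-network and its target network; the critic learning rate, tau,
+            # architecture and hidden-layer sizes all come from the config options read above.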
+            self.dqn = dqn.DeepQNetwork(self.sess, self.state_dim, self.action_dim, \
+                self.critic_lr, self.tau, action_bound, self.architecture, self.h1_size, self.h2_size)
+
+            # when all models are defined, init all variables
+            init_op = tf.initialize_all_variables()
+            self.sess.run(init_op)
+
+            self.loadPolicy(self.in_policy_file)
+            print('loaded replay size: ', self.episodes[self.domainString].size())
+
+            self.dqn.update_target_network()
+
+    # def record() has been handled...
+
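+    # act_on: on the very first turn (when startwithhello is set) return 'hello()'; otherwise
+    # query the Q-network via nextAction() and cache the chosen summary action index so that
+    # record() can store it together with the next reward.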
+    def act_on(self, beliefstate, hyps=None):
+        if self.lastSystemAction is None and self.startwithhello:
+            systemAct, nextaIdex = 'hello()', -1
+        else:
+            systemAct, nextaIdex = self.nextAction(beliefstate, hyps)
+        self.lastSystemAction = systemAct
+        self.summaryAct = nextaIdex
+        self.prevbelief = beliefstate
+        return systemAct
+
+    def record(self, reward, domainInControl=None, weight=None, state=None, action=None):
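+        '''
+        Store the latest (state, action, reward) transition for domainInControl in the replay
+        buffer, scaling the reward by 1/20. With prioritized replay, the current Q-value and the
+        bootstrapped target are also computed and stored alongside the transition.
+        '''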
+        if domainInControl is None:
+            domainInControl = self.domainString
+        if self.episodes[domainInControl] is None:
+            self.episodes[domainInControl] = Episode(dstring=domainInControl)
+        if self.actToBeRecorded is None:
+            #self.actToBeRecorded = self.lastSystemAction
+            self.actToBeRecorded = self.summaryAct
+
+        if state is None:
+            state = self.prevbelief
+        if action is None:
+            action = self.actToBeRecorded
+
+        cState, cAction = self.convertStateAction(state, action)
+
+        # normalising total return to -1~1
+        #reward /= 40.0
+        reward /= 20.0
+        """
+        reward = float(reward+10.0)/40.0
+        """
+
+        if weight is None:
+            if self.replay_type == 'vanilla':
+                self.episodes[domainInControl].record(state=cState, \
+                        state_ori=state, action=cAction, reward=reward)
+            elif self.replay_type == 'prioritized':
+
+                ##### calculate Q_s_t_a_t_ and gamma_Q_s_tplu1_maxa_ for PER ###
+                ################################################################
+                cur_cState = np.vstack([np.expand_dims(x, 0) for x in [cState]])        
+                cur_cAction_one_hot = np.eye(self.action_dim, self.action_dim)[[cAction]]
+                
+                cur_action_q = self.dqn.predict(cur_cState, cur_cAction_one_hot)
+                execMask = self.summaryaction.getExecutableMask(state, cAction)
+               
+                if self.q_update == 'single':
+                    Qs = []
+                    for idx, v in enumerate(execMask):
+                        if v > -sys.maxsize:
+                            Action_idx = np.eye(self.action_dim, self.action_dim)[[idx]]
+                            Qidx = self.dqn.predict_target(cur_cState, Action_idx)
+                            Qs.append(Qidx[0])
+                            #Qs.append(Qidx[0])
+
+                    Q_s_t_a_t_ = cur_action_q[0]
+                    gamma_Q_s_tplu1_maxa_ = self.gamma * np.max(Qs)
+                elif self.q_update == 'double':
+                    Qs = []
+                    for idx, v in enumerate(execMask):
+                        if v > -sys.maxsize:
+                            Action_idx = np.eye(self.action_dim, self.action_dim)[[idx]]
+                            Qidx = self.dqn.predict(cur_cState, Action_idx)
+                            Qs.append(Qidx[0])
+                        else:
+                            Qs.append(-sys.maxsize)
+
+                    policyQ_argmax_a = np.argmax(Qs)
+                    policyQ_argmax_a_one_hot = np.eye(self.action_dim, self.action_dim)[[policyQ_argmax_a]]
+                    target_value_Q = self.dqn.predict_target(cur_cState, policyQ_argmax_a_one_hot)
+
+                    Q_s_t_a_t_ = cur_action_q[0]
+                    gamma_Q_s_tplu1_maxa_ = self.gamma * target_value_Q
+
+                print('Q_s_t_a_t_', Q_s_t_a_t_)
+                print('gamma_Q_s_tplu1_maxa_', gamma_Q_s_tplu1_maxa_)
+                ################################################################
+
+                # heuristically assign 0.0 to Q_s_t_a_t_ and Q_s_tplu1_maxa_, doesn't matter as it is not used
+                #if self.samplecount >= self.capacity:
+                if True:
+                    self.episodes[domainInControl].record(state=cState, \
+                        state_ori=state, action=cAction, reward=reward, \
+                            Q_s_t_a_t_ = Q_s_t_a_t_, gamma_Q_s_tplu1_maxa_= gamma_Q_s_tplu1_maxa_, uniform=False)
+                else:
+                    self.episodes[domainInControl].record(state=cState, \
+                        state_ori=state, action=cAction, reward=reward, \
+                            Q_s_t_a_t_ = Q_s_t_a_t_, gamma_Q_s_tplu1_maxa_= gamma_Q_s_tplu1_maxa_, uniform=True)
+
+        else:
+            self.episodes[domainInControl].record(state=cState, state_ori=state, action=cAction, reward=reward, ma_weight=weight)
+
+        self.actToBeRecorded = None
+        self.samplecount += 1
+        return
+
+    def finalizeRecord(self, reward, domainInControl=None):
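+        '''
+        Record the terminal transition of the dialogue with the final (scaled by 1/20) reward,
+        and print/save the per-episode statistics.
+        '''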
+        if domainInControl is None:
+            domainInControl = self.domainString
+        if self.episodes[domainInControl] is None:
+            logger.warning("record attempted to be finalized for domain where nothing has been recorded before")
+            return
+
+        #print 'Episode Avg_Max_Q', float(self.episode_ave_max_q)/float(self.episodes[domainInControl].size())
+        print('Episode Avg_Max_Q', np.mean(self.episode_ave_max_q))
+
+        print('saving statistics')
+        self.saveStats()
+        print(self.stats)
+
+        # normalising total return to -1~1
+        #if reward == 0:
+        #    reward = -20.0
+        reward /= 20.0
+        """
+        if reward == 20.0:
+            reward = 1.0
+        else:
+            reward = -0.5
+        """
+        #reward = float(reward+10.0)/40.0
+
+        terminal_state, terminal_action = self.convertStateAction(TerminalState(), TerminalAction())
+
+        if self.replay_type == 'vanilla':
+            self.episodes[domainInControl].record(state=terminal_state, \
+                    state_ori=TerminalState(), action=terminal_action, reward=reward, terminal=True)
+        elif self.replay_type == 'prioritized':
+            # heuristically assign 0.0 to Q_s_t_a_t_ and Q_s_tplu1_maxa_, doesn't matter as it is not used
+            #if self.samplecount >= self.capacity:
+            if True:
+                self.episodes[domainInControl].record(state=terminal_state, \
+                    state_ori=TerminalState(), action=terminal_action, reward=reward, \
+                        Q_s_t_a_t_ = 0.0, gamma_Q_s_tplu1_maxa_= 0.0, uniform=False, terminal=True)
+            else:
+                self.episodes[domainInControl].record(state=terminal_state, \
+                    state_ori=TerminalState(), action=terminal_action, reward=reward, \
+                        Q_s_t_a_t_ = 0.0, gamma_Q_s_tplu1_maxa_= 0.0, uniform=True, terminal=True)
+        return
+
+    def convertStateAction(self, state, action):
+        '''
+        Convert (state, action) into the form stored in the replay buffer: the belief state is
+        flattened to a fixed-length vector (an all-zero vector of length 260 for a TerminalState)
+        and the action is passed through unchanged.
+        '''
+       
+        if isinstance(state, TerminalState):
+            return [0] * 260, action #260 #264
+        else:
+            flat_belief = flatten_belief(state)
+
+            if flat_belief == self.prev_state_check:
+                print('same state')
+            else:
+                print('diff state')
+            self.prev_state_check = flat_belief
+
+            return flat_belief, action
+
+    def nextAction(self, beliefstate, hyps):
+        '''
+        Select the next summary action: flatten the belief, mask out non-executable actions,
+        then either sample a random admissible action (epsilon-greedy exploration during
+        training) or take the argmax over the predicted Q-values.
+
+        :param beliefstate: current belief state
+        :param hyps: N-best list of user act hypotheses (not used here)
+        :returns: (str, int) master action and the chosen summary action index
+        '''
+        #beliefVec = flatten_belief(beliefstate, domainUtil)
+        beliefVec = flatten_belief(beliefstate)
+
+        execMask = self.summaryaction.getExecutableMask(beliefstate, self.lastSystemAction)
+        #print sum([ 1 for i in execMask if i==0.0 ])
+        if self.exploration_type == 'e-greedy':
+            # epsilon greedy
+            if self.is_training and utils.Settings.random.rand() < self.epsilon:
+                admissible = [i for i, x in enumerate(execMask) if x == 0.0]
+                random.shuffle(admissible)
+                nextaIdex = admissible[0]
+            else:
+                admissible = []
+                for idx, v in enumerate(execMask):
+                    if v > -sys.maxsize:
+                        Action_idx = np.eye(self.action_dim, self.action_dim)[[idx]]
+                        Qidx = self.dqn.predict(np.reshape(beliefVec, (1, len(beliefVec))), Action_idx)
+                        admissible.append(Qidx[0])
+                    else:
+                        admissible.append(-sys.maxsize)
+                #action_Q = self.dqn.predict(np.reshape(beliefVec, (1, len(beliefVec))))# + (1. / (1. + i + j))
+                #admissible = np.add(action_Q, np.array(execMask))
+                logger.info('action Q...')
+                print(admissible)
+                nextaIdex = np.argmax(admissible)
+
+                # add current max Q to self.episode_ave_max_q
+                print('current maxQ', np.max(admissible))
+                self.episode_ave_max_q.append(np.max(admissible))
+
+        elif self.exploration_type == 'Boltzman':
+            # Boltzmann exploration is not implemented: fall back to a random admissible action
+            admissible = [i for i, x in enumerate(execMask) if x == 0.0]
+            random.shuffle(admissible)
+            nextaIdex = admissible[0]
+      
+        self.stats[nextaIdex] += 1
+        summaryAct = self.summaryaction.action_names[nextaIdex]
+        masterAct = self.summaryaction.Convert(beliefstate, summaryAct, self.lastSystemAction)
+        return masterAct, nextaIdex
+
+    def train(self):
+        '''
+        call this function when the episode ends
+        '''
+
+        if not self.is_training:
+            logger.info("Not in training mode")
+            return
+        else:
+            logger.info("Update dqn policy parameters.")
+
+        self.episodecount += 1
+        logger.info("Sample Num so far: %s" %(self.samplecount))
+        logger.info("Episode Num so far: %s" %(self.episodecount))
+        #if True:
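+        # train only after at least three minibatches of samples have been collected, and then only every fourth episode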
+        if self.samplecount >= self.minibatch_size * 3 and self.episodecount % 4 == 0:
+        #if self.samplecount >= self.capacity and self.episodecount % 5 == 0:
+        #if self.samplecount > self.minibatch_size:
+        #if self.samplecount > self.capacity:
+            logger.info('start training...')
+
+
+            #################################################
+            #################################################
+            #   update TD error for all experience in PER   #
+            #################################################
+            #################################################
+            """
+            #s_batch, s_ori_batch, a_batch, r_batch, s2_batch, s2_ori_batch, t_batch, idx_batch, _ = \
+            #    self.episodes[self.domainString].all_batch()
+            experience, idx_batch = self.episodes[self.domainString].all_batch()
+                #self.episodes[self.domainString].sample_batch_vanilla_PER()
+           
+            #s_batch = np.vstack([np.expand_dims(x, 0) for x in s_batch])
+            #s2_batch = np.vstack([np.expand_dims(x, 0) for x in s2_batch])
+
+            # self.s_prev, self.s_ori_prev, self.a_prev, self.r_prev, state, state_ori, termina
+    
+            for k in range(len(idx_batch)):
+                Q_bootstrap_label = 0
+                if experience[k][-1]: # terminal
+                    Q_bootstrap_label = experience[k][3] # reward
+                else:
+                    execMask = self.summaryaction.getExecutableMask(experience[k][-2], experience[k][2]) # s_ori, a
+                    if self.q_update == 'single':
+                        admissible = []
+                        for idx, v in enumerate(execMask):
+                            if v > -sys.maxsize:
+                                Action_idx = np.eye(self.action_dim, self.action_dim)[[idx]]
+                                s2_idx = np.vstack([ np.expand_dims(x, 0) for x in [experience[k][0]] ]) # s
+                                Qidx = self.dqn.predict_target(s2_idx, Action_idx)
+                                admissible.append(Qidx[0])
+                        Q_bootstrap_label = experience[k][3] + self.gamma * np.max(admissible) # reward
+                    elif self.q_update == 'double':
+                        Qs = []
+                        for idx, v in enumerate(execMask):
+                            if v > -sys.maxsize:
+                                Action_idx = np.eye(self.action_dim, self.action_dim)[[idx]]
+                                s2_idx = np.vstack([ np.expand_dims(x, 0) for x in [experience[k][-3]] ])
+                                Qidx = self.dqn.predict(s2_idx, Action_idx)
+                                Qs.append(Qidx[0])
+                            else:
+                                Qs.append(-sys.maxsize)
+
+                        policyQ_argmax_a = np.argmax(Qs)
+                        policyQ_argmax_a_one_hot = np.eye(self.action_dim, self.action_dim)[[policyQ_argmax_a]]
+                        s2_idx = np.vstack([ np.expand_dims(x, 0) for x in [experience[k][-3]] ])
+                        target_value_Q = self.dqn.predict_target(s2_idx, policyQ_argmax_a_one_hot)
+
+                        Q_bootstrap_label = experience[k][3] + self.gamma * target_value_Q
+
+                if self.replay_type == 'prioritized':
+                    # update the sum-tree
+                    # update the TD error of the samples in the minibatch
+                    current_a = np.eye(self.action_dim, self.action_dim)[[experience[k][2]]]
+                    current_s = np.vstack([ np.expand_dims(x, 0) for x in [experience[k][0]] ])
+                    currentQ_s_a_ = self.dqn.predict(current_s, current_a)
+                    currentQ_s_a_ = currentQ_s_a_[0]
+                    error = abs(currentQ_s_a_ - Q_bootstrap_label)
+                    self.episodes[self.domainString].update(idx_batch[k], error)
+ 
+            """
+
+            s_batch, s_ori_batch, a_batch, r_batch, s2_batch, s2_ori_batch, t_batch, idx_batch, _ = \
+                self.episodes[self.domainString].sample_batch()
+                #self.episodes[self.domainString].sample_batch_vanilla_PER()
+           
+            #s_batch = np.vstack([np.expand_dims(x, 0) for x in s_batch])
+            #s2_batch = np.vstack([np.expand_dims(x, 0) for x in s2_batch])
+
+            y_i = []
+            for k in range(min(self.minibatch_size, self.episodes[self.domainString].size())):
+                Q_bootstrap_label = 0
+                if t_batch[k]:
+                    Q_bootstrap_label = r_batch[k]
+                else:
+                    execMask = self.summaryaction.getExecutableMask(s2_ori_batch[k], a_batch[k])
+                    if self.q_update == 'single':
+                        admissible = []
+                        for idx, v in enumerate(execMask):
+                            if v > -sys.maxsize:
+                                Action_idx = np.eye(self.action_dim, self.action_dim)[[idx]]
+                                s2_idx = np.vstack([ np.expand_dims(x, 0) for x in [s2_batch[k]] ])
+                                Qidx = self.dqn.predict_target(s2_idx, Action_idx)
+                                admissible.append(Qidx[0])
+                        Q_bootstrap_label = r_batch[k] + self.gamma * np.max(admissible)
+                    elif self.q_update == 'double':
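+                        # double DQN update: the online network picks argmax_a Q(s2, a), the target network supplies the value for that action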
+                        Qs = []
+                        for idx, v in enumerate(execMask):
+                            if v > -sys.maxsize:
+                                Action_idx = np.eye(self.action_dim, self.action_dim)[[idx]]
+                                s2_idx = np.vstack([ np.expand_dims(x, 0) for x in [s2_batch[k]] ])
+                                Qidx = self.dqn.predict(s2_idx, Action_idx)
+                                Qs.append(Qidx[0])
+                            else:
+                                Qs.append(-sys.maxsize)
+
+                        policyQ_argmax_a = np.argmax(Qs)
+                        policyQ_argmax_a_one_hot = np.eye(self.action_dim, self.action_dim)[[policyQ_argmax_a]]
+                        s2_idx = np.vstack([ np.expand_dims(x, 0) for x in [s2_batch[k]] ])
+                        target_value_Q = self.dqn.predict_target(s2_idx, policyQ_argmax_a_one_hot)
+
+                        Q_bootstrap_label = r_batch[k] + self.gamma * target_value_Q
+                y_i.append(Q_bootstrap_label)
+
+                if self.replay_type == 'prioritized':
+                    # update the sum-tree
+                    # update the TD error of the samples in the minibatch
+                    current_a = np.eye(self.action_dim, self.action_dim)[[a_batch[k]]]
+                    current_s = np.vstack([ np.expand_dims(x, 0) for x in [s_batch[k]] ])
+                    currentQ_s_a_ = self.dqn.predict(current_s, current_a)
+                    currentQ_s_a_ = currentQ_s_a_[0]
+                    error = abs(currentQ_s_a_ - Q_bootstrap_label)
+                    self.episodes[self.domainString].update(idx_batch[k], error)
+                                         
+            # change index-based a_batch to one-hot-based a_batch
+            a_batch_one_hot = np.eye(self.action_dim, self.action_dim)[a_batch]
+
+            # Update the critic given the targets
+            reshaped_yi = np.vstack([np.expand_dims(x, 0) for x in y_i])
+
+            s_batch_expand = np.vstack([np.expand_dims(x, 0) for x in s_batch])
+            """
+            print s_batch_expand
+            print 'a_batch', a_batch
+            print a_batch_one_hot
+            print len(a_batch)
+            print len(y_i)
+            """
+            #reshaped_yi = np.reshape(y_i, (min(self.minibatch_size, self.episodes[self.domainString].size()), 1))
+            predicted_q_value, _, currentLoss = self.dqn.train(s_batch, a_batch_one_hot, reshaped_yi)
+            #predicted_q_value, _, currentLoss = self.dqn.train(s_batch_expand, a_batch_one_hot, reshaped_yi)
+            
+            print('y_i')
+            print(y_i)
+            print('currentLoss', currentLoss)
+            print('predict Q')
+            print(predicted_q_value)
+
+            # Update target networks
+            #if self.episodecount % 50 == 0:
+            if self.episodecount % 1 == 0:
+                self.dqn.update_target_network()
+
+            self.savePolicyInc()  # self.out_policy_file)
+
+    def savePolicy(self, FORCE_SAVE=False):
+        """
+        Do not use this; the agent calls it after every episode and
+        we only want to save the policy periodically.
+        """
+        pass
+
+    def savePolicyInc(self, FORCE_SAVE=False):
+        """
+        save model and replay buffer
+        """
+        #save_path = self.saver.save(self.sess, self.out_policy_file+'.ckpt')
+        self.dqn.save_network(self.out_policy_file+'.dqn.ckpt')
+
+        f = open(self.out_policy_file+'.episode', 'wb')
+        for obj in [self.samplecount, self.episodes[self.domainString]]:
+            pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)
+        f.close()
+        #logger.info("Saving model to %s and replay buffer..." % save_path)
+
+    def saveStats(self, FORCE_SAVE=False):
+        f = open(self.out_policy_file + '.stats', 'wb')
+        pickle.dump(self.stats, f, protocol=pickle.HIGHEST_PROTOCOL)
+        f.close()
+
+    def loadPolicy(self, filename):
+        """
+        load model and replay buffer
+        """
+        # load models
+        self.dqn.load_network(filename+'.dqn.ckpt')
+        
+        # load replay buffer
+        try:
+            print('load from: ', filename)
+            f = open(filename+'.episode', 'rb')
+            loaded_objects = []
+            for i in range(2): # load sample count and replay buffer
+                loaded_objects.append(pickle.load(f))
+            self.samplecount = int(loaded_objects[0])
+            self.episodes[self.domainString] = copy.deepcopy(loaded_objects[1])
+            logger.info("Loading both model from %s and replay buffer..." % filename)
+            f.close()
+        except Exception:
+            logger.info("Loading only models...")
+
+    def restart(self):
+        self.summaryAct = None          
+        self.lastSystemAction = None
+        self.prevbelief = None
+        self.actToBeRecorded = None
+        self.epsilon = self.epsilon_start - (self.epsilon_start - self.epsilon_end) * float(self.episodeNum+self.episodecount) / float(self.maxiter)
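+        # linearly anneal epsilon from epsilon_start to epsilon_end over maxiter episodes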
+        print('current eps', self.epsilon)
+        #self.episodes = dict.fromkeys(OntologyUtils.available_domains, None)
+        #self.episodes[self.domainString] = ReplayBuffer(self.capacity, self.randomseed)
+        self.episode_ave_max_q = []
+
+#END OF FILE
diff --git a/policy/feudalgainRL/FeudalACERPolicy.py b/policy/feudalgainRL/FeudalACERPolicy.py
new file mode 100644
index 0000000000000000000000000000000000000000..19d9fccb719b8588567e8e397a9a6f2441230de2
--- /dev/null
+++ b/policy/feudalgainRL/FeudalACERPolicy.py
@@ -0,0 +1,457 @@
+###############################################################################
+# PyDial: Multi-domain Statistical Spoken Dialogue System Software
+###############################################################################
+#
+# Copyright 2015 - 2019
+# Cambridge University Engineering Department Dialogue Systems Group
+#
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+###############################################################################
+
+'''
+FeudalACERPolicy.py - ACER - Actor Critic with Experience Replay
+==================================================
+
+Copyright CUED Dialogue Systems Group 2015 - 2017
+
+.. seealso:: CUED Imports/Dependencies:
+
+    import :class:`Policy`
+    import :class:`utils.ContextLogger`
+
+.. warning::
+        Documentation not done.
+
+
+************************
+
+'''
+import copy
+import os
+import json
+import numpy as np
+import scipy
+import scipy.signal
+import pickle as pickle
+import random
+import utils
+from utils.Settings import config as cfg
+from utils import ContextLogger, DiaAct
+
+import ontology.FlatOntologyManager as FlatOnt
+import tensorflow as tf
+from policy.DRL.replay_buffer_episode_acer import ReplayBufferEpisode
+from policy.DRL.replay_prioritised_episode import ReplayPrioritisedEpisode
+import policy.DRL.utils as drlutils
+from policy.ACERPolicy import ACERPolicy
+import policy.DRL.acer as acer
+import policy.Policy
+import policy.SummaryAction
+from policy.Policy import TerminalAction, TerminalState
+from policy.feudalRL.DIP_parametrisation import DIP_state, padded_state
+
+logger = utils.ContextLogger.getLogger('')
+
+# Discounting function used to calculate discounted returns.
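+# e.g. discount([1., 1., 1.], 0.9) gives [2.71, 1.9, 1.0], i.e. G_t = x_t + gamma * G_{t+1}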
+def discount(x, gamma):
+    return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]
+
+
+class FeudalACERPolicy(ACERPolicy):
+    '''Derived from :class:`Policy`
+    '''
+    def __init__(self, in_policy_file, out_policy_file, domainString='CamRestaurants', is_training=False,
+                 action_names=None, slot=None, sd_state_dim=50):
+        super(FeudalACERPolicy, self).__init__(in_policy_file, out_policy_file, domainString, is_training)
+
+        tf.reset_default_graph()
+
+        self.in_policy_file = in_policy_file
+        self.out_policy_file = out_policy_file
+        self.is_training = is_training
+        self.accum_belief = []
+        self.prev_state_check = None
+        self.sd_state_dim = sd_state_dim
+
+        self.domainString = domainString
+        self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString)
+
+        self.features = 'dip'
+        self.sd_enc_size = 80
+        self.si_enc_size = 40
+        self.dropout_rate = 0.
+        if cfg.has_option('feudalpolicy', 'features'):
+            self.features = cfg.get('feudalpolicy', 'features')
+        if cfg.has_option('feudalpolicy', 'sd_enc_size'):
+            self.sd_enc_size = cfg.getint('feudalpolicy', 'sd_enc_size')
+        if cfg.has_option('feudalpolicy', 'si_enc_size'):
+            self.si_enc_size = cfg.getint('feudalpolicy', 'si_enc_size')
+        if cfg.has_option('feudalpolicy', 'dropout_rate') and self.is_training:
+            self.dropout_rate = cfg.getfloat('feudalpolicy', 'dropout_rate')
+        self.actfreq_ds = False
+        if cfg.has_option('feudalpolicy', 'actfreq_ds'):
+            self.actfreq_ds = cfg.getboolean('feudalpolicy', 'actfreq_ds')
+
+        # init session
+        self.sess = tf.Session()
+        with tf.device("/cpu:0"):
+
+            np.random.seed(self.randomseed)
+            tf.set_random_seed(self.randomseed)
+
+            # initialise a replay buffer
+            if self.replay_type == 'vanilla':
+                self.episodes[self.domainString] = ReplayBufferEpisode(self.capacity, self.minibatch_size, self.randomseed)
+            elif self.replay_type == 'prioritized':
+                self.episodes[self.domainString] = ReplayPrioritisedEpisode(self.capacity, self.minibatch_size, self.randomseed)
+            #replay_buffer = ReplayBuffer(self.capacity, self.randomseed)
+            #self.episodes = []
+            self.samplecount = 0
+            self.episodecount = 0
+
+            # construct the models
+            self.state_dim = 89  # current DIP state dim
+            self.summaryaction = policy.SummaryAction.SummaryAction(domainString)
+            self.action_names = action_names
+            self.action_dim = len(self.action_names)
+            action_bound = len(self.action_names)
+            self.stats = [0 for _ in range(self.action_dim)]
+
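+            # behaviour-policy statistics saved and loaded alongside the replay buffer; when USE_GLOBAL_MU is enabled in train(), this vector is normalised and used in place of the stored per-step behaviour distribution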
+            self.global_mu = [0. for _ in range(self.action_dim)]
+
+            if self.features == 'dip':
+                if self.actfreq_ds:
+                    if self.domainString == 'CamRestaurants':
+                        self.state_dim += 9#16
+                    elif self.domainString == 'SFRestaurants':
+                        self.state_dim += 9#25
+                    elif self.domainString == 'Laptops11':
+                        self.state_dim += 9#40
+                self.acer = acer.ACERNetwork(self.sess, self.state_dim, self.action_dim, self.critic_lr, self.delta,
+                                             self.c, self.alpha, self.h1_size, self.h2_size, self.is_training)
+            elif self.features == 'learned' or self.features == 'rnn':
+                si_state_dim = 73
+                if self.actfreq_ds:
+                    if self.domainString == 'CamRestaurants':
+                        si_state_dim += 9#16
+                    elif self.domainString == 'SFRestaurants':
+                        si_state_dim += 9#25
+                    elif self.domainString == 'Laptops11':
+                        si_state_dim += 9#40
+
+                if 0:  # RNN branch disabled; was: self.features == 'rnn'
+                    self.acer = acer.RNNACERNetwork(self.sess, si_state_dim, sd_state_dim, self.action_dim, self.critic_lr,
+                                                    self.delta, self.c, self.alpha, self.h1_size, self.h2_size, self.is_training,
+                                                    sd_enc_size=25, si_enc_size=25, dropout_rate=0., tn='normal', slot='si')
+                else:
+                    self.state_dim = si_state_dim
+                    self.acer = acer.ACERNetwork(self.sess, self.state_dim, self.action_dim,
+                                                 self.critic_lr, self.delta, self.c, self.alpha, self.h1_size,
+                                                 self.h2_size, self.is_training)
+
+            else:
+                logger.error('features "{}" not implemented'.format(self.features))
+
+
+            # when all models are defined, init all variables
+            init_op = tf.global_variables_initializer()
+            self.sess.run(init_op)
+
+            self.loadPolicy(self.in_policy_file)
+            print('loaded replay size: ', self.episodes[self.domainString].size())
+
+            #self.acer.update_target_network()
+
+    # def record() has been handled...
+
+    def convertStateAction(self, state, action):
+        '''
+        Convert the master state into the DIP belief-state vector of the slot addressed by the action.
+        '''
+        if isinstance(state, TerminalState):
+            return [0] * 89, action
+
+        else:
+            if self.features == 'learned' or self.features == 'rnn':
+                dip_state = padded_state(state.domainStates[state.currentdomain], self.domainString)
+            else:
+                dip_state = DIP_state(state.domainStates[state.currentdomain], self.domainString)
+            action_name = self.actions.action_names[action]
+            act_slot = 'general'
+            for slot in dip_state.slots:
+                if slot in action_name:
+                    act_slot = slot
+            flat_belief = dip_state.get_beliefStateVec(act_slot)
+            self.prev_state_check = flat_belief
+
+            return flat_belief, action
+
+    def record(self, reward, domainInControl=None, weight=None, state=None, action=None):
+        if domainInControl is None:
+            domainInControl = self.domainString
+        if self.actToBeRecorded is None:
+            self.actToBeRecorded = self.summaryAct
+
+        if state is None:
+            state = self.prevbelief
+        if action is None:
+            action = self.actToBeRecorded
+        mu_weight = self.prev_mu
+        mask = self.prev_mask
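+        # if the 'pass' action (last index) was taken, record an all-zero mask and a uniform behaviour distribution instead of the cached ones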
+        if action == self.action_dim-1: # pass action was taken
+            mask = np.zeros(self.action_dim)
+            mu_weight = np.ones(self.action_dim)/self.action_dim
+
+        cState, cAction = state, action
+
+        reward /= 20.0
+
+        value = self.acer.predict_value([cState], [mask])
+
+        if self.replay_type == 'vanilla':
+            self.episodes[domainInControl].record(state=cState, \
+                    state_ori=state, action=cAction, reward=reward, value=value[0], distribution=mu_weight, mask=mask)
+        elif self.replay_type == 'prioritized':
+            self.episodes[domainInControl].record(state=cState, \
+                    state_ori=state, action=cAction, reward=reward, value=value[0], distribution=mu_weight, mask=mask)
+
+        self.actToBeRecorded = None
+        self.samplecount += 1
+        return
+
+    def finalizeRecord(self, reward, domainInControl=None):
+        if domainInControl is None:
+            domainInControl = self.domainString
+        if self.episodes[domainInControl] is None:
+            logger.warning("record attempted to be finalized for domain where nothing has been recorded before")
+            return
+
+        #print 'Episode Avg_Max_Q', float(self.episode_ave_max_q)/float(self.episodes[domainInControl].size())
+        #print 'Episode Avg_Max_Q', np.mean(self.episode_ave_max_q)
+        #print self.stats
+
+        # normalising total return to -1~1
+        reward /= 20.0
+
+        terminal_state, terminal_action = self.convertStateAction(TerminalState(), TerminalAction())
+        value = 0.0  # no effect on experience replay
+
+        def calculate_discountR_advantage(r_episode, v_episode):
+            #########################################################################
+            # Here we take the rewards and values from the rollout, and use them to
+            # generate the advantage and discounted returns.
+            # The advantage function uses "Generalized Advantage Estimation"
+            bootstrap_value = 0.0
+            self.r_episode_plus = np.asarray(r_episode + [bootstrap_value])
+            discounted_r_episode = discount(self.r_episode_plus,self.gamma)[:-1]
+            self.v_episode_plus = np.asarray(v_episode + [bootstrap_value])
+            advantage = r_episode + self.gamma * self.v_episode_plus[1:] - self.v_episode_plus[:-1]
+            advantage = discount(advantage,self.gamma)
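+            # the TD residuals are discounted by gamma, which corresponds to Generalized Advantage Estimation with lambda = 1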
+            #########################################################################
+            return discounted_r_episode, advantage
+
+        if self.replay_type == 'vanilla':
+            self.episodes[domainInControl].record(state=terminal_state, \
+                    state_ori=TerminalState(), action=terminal_action, reward=reward, value=value, terminal=True, distribution=None)
+        elif self.replay_type == 'prioritized':
+            episode_r, episode_v = self.episodes[domainInControl].record_final_and_get_episode(state=terminal_state, \
+                                                                                               state_ori=TerminalState(),
+                                                                                               action=terminal_action,
+                                                                                               reward=reward,
+                                                                                               value=value)
+
+            # TD_error is a list of td error in the current episode
+            _, TD_error = calculate_discountR_advantage(episode_r, episode_v)
+            episodic_TD = np.mean(np.absolute(TD_error))
+            print('episodic_TD')
+            print(episodic_TD)
+            self.episodes[domainInControl].insertPriority(episodic_TD)
+
+        return
+
+    def nextAction(self, beliefstate):
+        '''
+        Select the next action distribution (the feudal policy converts it into an action).
+
+        :param beliefstate: current belief state vector
+        :returns: vector of action scores over the summary actions
+        '''
+
+        #execMask = self.summaryaction.getExecutableMask(beliefstate, self.lastSystemAction)
+        execMask = np.zeros(self.action_dim)
+
+        def apply_mask(prob, maskval, baseline=9.99999975e-06):
+            return prob if maskval == 0.0 else baseline # not quite 0.0 to avoid division by zero
+
+        if self.exploration_type == 'e-greedy' or not self.is_training:
+            if self.is_training and utils.Settings.random.rand() < self.epsilon:
+                action_prob = np.random.rand(len(self.action_names))
+            else:
+                action_prob = self.acer.predict_policy(np.reshape(beliefstate, (1, len(beliefstate))),
+                                                   np.reshape(execMask, (1, len(execMask))))[0]
+        mu = action_prob / sum(action_prob)
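+        # normalised behaviour distribution for this turn, cached so record() can store it with the transition for ACER's off-policy correction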
+        self.prev_mu = mu
+        self.prev_mask = execMask
+        return action_prob
+
+    def train(self):
+        '''
+        call this function when the episode ends
+        '''
+        USE_GLOBAL_MU = False
+        self.episode_ct += 1
+
+        if not self.is_training:
+            logger.info("Not in training mode")
+            return
+        else:
+            logger.info("Update acer policy parameters.")
+
+        self.episodecount += 1
+        logger.info("Sample Num so far: %s" % (self.samplecount))
+        logger.info("Episode Num so far: %s" % (self.episodecount))
+        if self.samplecount >= self.minibatch_size * 3 and self.episodecount % self.training_frequency == 0:
+        #if self.episodecount % self.training_frequency == 0:
+            logger.info('start training...')
+
+            for _ in range(self.train_iters_per_episode):
+
+                if self.replay_type == 'vanilla' or self.replay_type == 'prioritized':
+                    s_batch, s_ori_batch, a_batch, r_batch, s2_batch, s2_ori_batch, t_batch, idx_batch, v_batch, mu_policy, mask_batch = \
+                        self.episodes[self.domainString].sample_batch()
+                    if USE_GLOBAL_MU:
+                        mu_sum = sum(self.global_mu)
+                        mu_normalised = np.array([c / mu_sum for c in self.global_mu])
+                        mu_policy = [[mu_normalised for _ in range(len(mu_policy[i]))] for i in range(len(mu_policy))]
+                else:
+                    assert False  # not implemented yet
+
+                discounted_r_batch = []
+                advantage_batch = []
+                def calculate_discountR_advantage(r_episode, v_episode):
+                    #########################################################################
+                    # Here we take the rewards and values from the rollout, and use them to
+                    # generate the advantage and discounted returns.
+                    # The advantage function uses "Generalized Advantage Estimation"
+                    bootstrap_value = 0.0
+                    # r_episode rescale by rhos?
+                    self.r_episode_plus = np.asarray(r_episode + [bootstrap_value])
+                    discounted_r_episode = discount(self.r_episode_plus, self.gamma)[:-1]
+                    self.v_episode_plus = np.asarray(v_episode + [bootstrap_value])
+                    # change sth here
+                    advantage = r_episode + self.gamma * self.v_episode_plus[1:] - self.v_episode_plus[:-1]
+                    advantage = discount(advantage, self.gamma)
+                    #########################################################################
+                    return discounted_r_episode, advantage
+
+                if self.replay_type == 'prioritized':
+                    for item_r, item_v, item_idx in zip(r_batch, v_batch, idx_batch):
+                        # r, a = calculate_discountR_advantage(item_r, np.concatenate(item_v).ravel().tolist())
+                        r, a = calculate_discountR_advantage(item_r, item_v)
+
+                        # flatten nested numpy array and turn it into list
+                        discounted_r_batch += r.tolist()
+                        advantage_batch += a.tolist()
+
+                        # update the sum-tree
+                        # update the TD error of the samples (episode) in the minibatch
+                        episodic_TD_error = np.mean(np.absolute(a))
+                        self.episodes[self.domainString].update(item_idx, episodic_TD_error)
+                else:
+                    for item_r, item_v in zip(r_batch, v_batch):
+                        # r, a = calculate_discountR_advantage(item_r, np.concatenate(item_v).ravel().tolist())
+                        r, a = calculate_discountR_advantage(item_r, item_v)
+
+                        # flatten nested numpy array and turn it into list
+                        discounted_r_batch += r.tolist()
+                        advantage_batch += a.tolist()
+
+                batch_size = len(s_batch)
+
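+                # a_batch holds one action sequence per sampled episode; concatenating flattens it to turn level before the one-hot conversion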
+                a_batch_one_hot = np.eye(self.action_dim)[np.concatenate(a_batch, axis=0).tolist()]
+
+                loss, entropy, optimize = \
+                            self.acer.train(np.concatenate(np.array(s_batch), axis=0).tolist(), a_batch_one_hot,
+                                            np.concatenate(np.array(mask_batch), axis=0).tolist(),
+                                            np.concatenate(np.array(r_batch), axis=0).tolist(), s_batch, r_batch, self.gamma,
+                                            np.concatenate(np.array(mu_policy), axis=0),
+                                            discounted_r_batch, advantage_batch)
+
+                ent, norm_loss = entropy/float(batch_size), loss/float(batch_size)
+
+
+            self.savePolicyInc()  # self.out_policy_file)
+
+
+    def savePolicy(self, FORCE_SAVE=False):
+        """
+        Do not use this; the agent calls it after every episode and
+        we only want to save the policy periodically.
+        """
+        pass
+
+    def savePolicyInc(self, FORCE_SAVE=False):
+        """
+        save model and replay buffer
+        """
+        if self.episodecount % self.save_step == 0:
+            #save_path = self.saver.save(self.sess, self.out_policy_file+'.ckpt')
+            self.acer.save_network(self.out_policy_file+'.acer.ckpt')
+
+            f = open(self.out_policy_file+'.episode', 'wb')
+            for obj in [self.samplecount, self.episodes[self.domainString], self.global_mu]:
+                pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)
+            f.close()
+            #logger.info("Saving model to %s and replay buffer..." % save_path)
+
+    def loadPolicy(self, filename):
+        """
+        load model and replay buffer
+        """
+        # load models
+        self.acer.load_network(filename+'.acer.ckpt')
+
+        # load replay buffer
+        if self.load_buffer:
+            try:
+                print('load from: ', filename)
+                f = open(filename+'.episode', 'rb')
+                loaded_objects = []
+                for i in range(3): # load sample count, replay buffer and global_mu
+                    loaded_objects.append(pickle.load(f))
+                self.samplecount = int(loaded_objects[0])
+                self.episodes[self.domainString] = copy.deepcopy(loaded_objects[1])
+                self.global_mu = loaded_objects[2]
+                logger.info("Loading both model from %s and replay buffer..." % filename)
+                f.close()
+            except Exception:
+                logger.info("Loading only models...")
+        else:
+            print("We do not load the buffer!")
+
+    def restart(self):
+        self.summaryAct = None
+        self.lastSystemAction = None
+        self.prevbelief = None
+        self.prev_mu = None
+        self.prev_mask = None
+        self.actToBeRecorded = None
+        self.epsilon = self.epsilon_start - (self.epsilon_start - self.epsilon_end) * float(self.episodeNum+self.episodecount) / float(self.maxiter)
+        self.episode_ave_max_q = []
+
+#END OF FILE
diff --git a/policy/feudalgainRL/FeudalBBQNPolicy.py b/policy/feudalgainRL/FeudalBBQNPolicy.py
new file mode 100644
index 0000000000000000000000000000000000000000..01a6275ac7468b716beaa01e76656a7babf15ddf
--- /dev/null
+++ b/policy/feudalgainRL/FeudalBBQNPolicy.py
@@ -0,0 +1,407 @@
+###############################################################################
+# PyDial: Multi-domain Statistical Spoken Dialogue System Software
+###############################################################################
+#
+# Copyright 2015 - 2019
+# Cambridge University Engineering Department Dialogue Systems Group
+#
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+###############################################################################
+
+'''
+FeudalBBQNPolicy.py - feudal BBQN policy
+==================================================
+
+Author: Chris Tegho and Pei-Hao (Eddy) Su  (Copyright CUED Dialogue Systems Group 2016)
+
+.. seealso:: CUED Imports/Dependencies: 
+
+    import :class:`Policy`
+    import :class:`utils.ContextLogger`
+
+.. warning::
+        Documentation not done.
+
+
+************************
+
+'''
+
+import copy
+import os
+import json
+import numpy as np
+import pickle as pickle
+import random
+import sys
+import utils
+from utils.Settings import config as cfg
+from utils import ContextLogger, DiaAct, DialogueState
+
+import ontology.FlatOntologyManager as FlatOnt
+# from theano_dialogue.util.tool import *
+
+import tensorflow as tf
+from policy.DRL.replay_bufferVanilla import ReplayBuffer
+from policy.DRL.replay_prioritisedVanilla import ReplayPrioritised
+import policy.DRL.utils as drlutils
+from policy.DRL import bdqn as bbqn
+import policy.Policy
+import policy.SummaryAction
+import policy.BBQNPolicy
+from policy.Policy import TerminalAction, TerminalState
+from policy.feudalRL.DIP_parametrisation import DIP_state, padded_state
+
+logger = utils.ContextLogger.getLogger('')
+
+# --- for flattening the belief --- # 
+domainUtil = FlatOnt.FlatDomainOntology('CamRestaurants')
+
+
+class FeudalBBQNPolicy(policy.BBQNPolicy.BBQNPolicy):
+    '''Derived from :class:`BBQNPolicy`
+    '''
+    def __init__(self, in_policy_file, out_policy_file, domainString='CamRestaurants', is_training=False,
+                 action_names=None, slot=None):
+        super(FeudalBBQNPolicy, self).__init__(in_policy_file, out_policy_file, domainString, is_training)
+
+        tf.reset_default_graph()
+
+        self.domainString = domainString
+        self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString)
+        self.in_policy_file = in_policy_file
+        self.out_policy_file = out_policy_file
+        self.is_training = is_training
+        self.accum_belief = []
+
+        self.prev_state_check = None
+
+        self.episode_ave_max_q = []
+
+        self.capacity *= 4  # the capacity was set for episode-based methods; scale it up for this turn-based method
+        self.slot = slot
+
+        # init session
+        self.sess = tf.Session()
+        with tf.device("/cpu:0"):
+
+            np.random.seed(self.randomseed)
+            tf.set_random_seed(self.randomseed)
+
+            # initialise a replay buffer
+            if self.replay_type == 'vanilla':
+                self.episodes[self.domainString] = ReplayBuffer(self.capacity, self.minibatch_size, self.randomseed)
+            elif self.replay_type == 'prioritized':
+                self.episodes[self.domainString] = ReplayPrioritised(self.capacity, self.minibatch_size,
+                                                                     self.randomseed)
+            # replay_buffer = ReplayBuffer(self.capacity, self.randomseed)
+            # self.episodes = []
+            self.samplecount = 0
+            self.episodecount = 0
+
+            # construct the models
+            self.state_dim = 89  # current DIP state dim
+            self.summaryaction = policy.SummaryAction.SummaryAction(domainString)
+            self.action_names = action_names
+            self.action_dim = len(self.action_names)
+            action_bound = len(self.action_names)
+            self.stats = [0 for _ in range(self.action_dim)]
+            self.stdVar = []
+            self.meanVar = []
+            self.stdMean = []
+            self.meanMean = []
+            self.td_error = []
+            self.td_errorVar = []
+
+            self.target_update_freq = 1
+            if cfg.has_option('bbqnpolicy', 'target_update_freq'):
+                self.target_update_freq = cfg.getint('bbqnpolicy', 'target_update_freq')
+
+            #feudal params
+            self.features = 'dip'
+            self.sd_enc_size = 25
+            self.si_enc_size = 50
+            self.dropout_rate = 0.
+            if cfg.has_option('feudalpolicy', 'features'):
+                self.features = cfg.get('feudalpolicy', 'features')
+            if cfg.has_option('feudalpolicy', 'sd_enc_size'):
+                self.sd_enc_size = cfg.getint('feudalpolicy', 'sd_enc_size')
+            if cfg.has_option('feudalpolicy', 'si_enc_size'):
+                self.si_enc_size = cfg.getint('feudalpolicy', 'si_enc_size')
+            if cfg.has_option('feudalpolicy', 'dropout_rate') and self.is_training:
+                self.dropout_rate = cfg.getfloat('feudalpolicy', 'dropout_rate')
+            self.actfreq_ds = False
+            if cfg.has_option('feudalpolicy', 'actfreq_ds'):
+                self.actfreq_ds = cfg.getboolean('feudalpolicy', 'actfreq_ds')
+
+            if self.features == 'dip':
+                if self.actfreq_ds:
+                    if self.domainString == 'CamRestaurants':
+                        self.state_dim += 16
+                    elif self.domainString == 'SFRestaurants':
+                        self.state_dim += 25
+                    elif self.domainString == 'Laptops11':
+                        self.state_dim += 40
+
+                self.bbqn = bbqn.DeepQNetwork(self.sess, self.state_dim, self.action_dim, self.learning_rate, self.tau,
+                                              action_bound, self.architecture, self.h1_size, self.h2_size,
+                                              self.n_samples,
+                                              self.minibatch_size, self.sigma_prior, self.n_batches, self.stddev_var_mu,
+                                              self.stddev_var_logsigma, self.mean_log_sigma, self.importance_sampling,
+                                              self.alpha_divergence, self.alpha, self.sigma_eps)
+            elif self.features == 'learned' or self.features == 'rnn':
+                si_state_dim = 72
+                if self.actfreq_ds:
+                    if self.domainString == 'CamRestaurants':
+                        si_state_dim += 16
+                    elif self.domainString == 'SFRestaurants':
+                        si_state_dim += 25
+                    elif self.domainString == 'Laptops11':
+                        si_state_dim += 40
+                if self.domainString == 'CamRestaurants':
+                    sd_state_dim = 94
+                elif self.domainString == 'SFRestaurants':
+                    sd_state_dim = 158
+                elif self.domainString == 'Laptops11':
+                    sd_state_dim = 13
+                else:
+                    logger.error(
+                        'Domain {} not implemented in feudal-DQN yet'.format(self.domainString))  # just find out the size of sd_state_dim for the new domain
+                if self.features == 'rnn':
+                    arch = 'rnn'
+                    self.state_dim = si_state_dim + sd_state_dim
+                    self.bbqn = bbqn.RNNBBQNetwork(self.sess, si_state_dim, sd_state_dim, self.action_dim, self.learning_rate,
+                                                  self.tau, action_bound, arch, self.h1_size, self.h2_size, self.n_samples,
+                                                  self.minibatch_size, self.sigma_prior, self.n_batches, self.stddev_var_mu,
+                                                  self.stddev_var_logsigma, self.mean_log_sigma, self.importance_sampling,
+                                                  self.alpha_divergence, self.alpha, self.sigma_eps, sd_enc_size=self.sd_enc_size,
+                                                   si_enc_size=self.sd_enc_size, dropout_rate=self.dropout_rate, slot=slot)
+                else:
+                    arch = 'vanilla'
+                    self.state_dim = si_state_dim + sd_state_dim
+                    self.bbqn = bbqn.NNBBQNetwork(self.sess, si_state_dim, sd_state_dim, self.action_dim, self.learning_rate,
+                                                  self.tau, action_bound, arch, self.h1_size, self.h2_size, self.n_samples,
+                                                  self.minibatch_size, self.sigma_prior, self.n_batches, self.stddev_var_mu,
+                                                  self.stddev_var_logsigma, self.mean_log_sigma, self.importance_sampling,
+                                                  self.alpha_divergence, self.alpha, self.sigma_eps, sd_enc_size=self.sd_enc_size,
+                                                   si_enc_size=self.sd_enc_size, dropout_rate=self.dropout_rate, slot=slot)
+            else:
+                logger.error('features "{}" not implemented'.format(self.features))
+
+
+
+            # when all models are defined, init all variables
+            init_op = tf.global_variables_initializer()
+            self.sess.run(init_op)
+
+            self.loadPolicy(self.in_policy_file)
+            print('loaded replay size: ', self.episodes[self.domainString].size())
+
+            self.bbqn.update_target_network()
+
+    def record(self, reward, domainInControl=None, weight=None, state=None, action=None, exec_mask=None):
+        if domainInControl is None:
+            domainInControl = self.domainString
+        if self.actToBeRecorded is None:
+            # self.actToBeRecorded = self.lastSystemAction
+            self.actToBeRecorded = self.summaryAct
+
+        if state is None:
+            state = self.prevbelief
+        if action is None:
+            action = self.actToBeRecorded
+
+        cState, cAction = state, action
+
+        reward /= 20.0
+
+        cur_cState = np.vstack([np.expand_dims(x, 0) for x in [cState]])
+        cur_action_q = self.bbqn.predict(cur_cState)
+        cur_target_q = self.bbqn.predict_target(cur_cState)
+
+        if exec_mask is not None:
+            admissible = np.add(cur_target_q, np.array(exec_mask))
+        else:
+            admissible = cur_target_q
+
+        Q_s_t_a_t_ = cur_action_q[0][cAction]
+        gamma_Q_s_tplu1_maxa_ = self.gamma * np.max(admissible)
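+        # Q(s_t, a_t) from the online network and the discounted max over the (optionally masked) target-network values, computed here so they can be stored with the transition for prioritized replay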
+
+        if weight is None:
+            if self.replay_type == 'vanilla':
+                self.episodes[domainInControl].record(state=cState, \
+                                                      state_ori=state, action=cAction, reward=reward)
+            elif self.replay_type == 'prioritized':
+                # store the Q estimates computed above together with the transition
+                if True:
+                    # if self.samplecount >= self.capacity:
+                    self.episodes[domainInControl].record(state=cState, \
+                                                          state_ori=state, action=cAction, reward=reward, \
+                                                          Q_s_t_a_t_=Q_s_t_a_t_,
+                                                          gamma_Q_s_tplu1_maxa_=gamma_Q_s_tplu1_maxa_, uniform=False)
+                else:
+                    self.episodes[domainInControl].record(state=cState, \
+                                                          state_ori=state, action=cAction, reward=reward, \
+                                                          Q_s_t_a_t_=Q_s_t_a_t_,
+                                                          gamma_Q_s_tplu1_maxa_=gamma_Q_s_tplu1_maxa_, uniform=True)
+
+        else:
+            self.episodes[domainInControl].record(state=cState, state_ori=state, action=cAction, reward=reward,
+                                                  ma_weight=weight)
+
+        self.actToBeRecorded = None
+        self.samplecount += 1
+        return
+
+    def finalizeRecord(self, reward, domainInControl=None):
+        if domainInControl is None:
+            domainInControl = self.domainString
+        if self.episodes[domainInControl] is None:
+            logger.warning("record attempted to be finalized for domain where nothing has been recorded before")
+            return
+
+        # normalising total return to -1~1
+        # if reward == 0:
+        #    reward = -20.0
+        reward /= 20.0
+        """
+        if reward == 20.0:
+            reward = 1.0
+        else:
+            reward = -0.5
+        """
+        # reward = float(reward+10.0)/40.0
+
+        terminal_state, terminal_action = self.convertStateAction(TerminalState(), TerminalAction())
+
+        if self.replay_type == 'vanilla':
+            self.episodes[domainInControl].record(state=terminal_state, \
+                                                  state_ori=TerminalState(), action=terminal_action, reward=reward,
+                                                  terminal=True)
+        elif self.replay_type == 'prioritized':
+            # heuristically assign 0.0 to Q_s_t_a_t_ and Q_s_tplu1_maxa_, doesn't matter as it is not used
+            if True:
+                # if self.samplecount >= self.capacity:
+                self.episodes[domainInControl].record(state=terminal_state, \
+                                                      state_ori=TerminalState(), action=terminal_action, reward=reward, \
+                                                      Q_s_t_a_t_=0.0, gamma_Q_s_tplu1_maxa_=0.0, uniform=False,
+                                                      terminal=True)
+            else:
+                self.episodes[domainInControl].record(state=terminal_state, \
+                                                      state_ori=TerminalState(), action=terminal_action, reward=reward, \
+                                                      Q_s_t_a_t_=0.0, gamma_Q_s_tplu1_maxa_=0.0, uniform=True,
+                                                      terminal=True)
+
+    def convertStateAction(self, state, action):
+        '''
+        Convert the master state into the DIP belief-state vector of the slot addressed by the action.
+        '''
+        if isinstance(state, TerminalState):
+            return [0] * 89, action
+
+        else:
+            if self.features == 'learned' or self.features == 'rnn':
+                dip_state = padded_state(state.domainStates[state.currentdomain], self.domainString)
+            else:
+                dip_state = DIP_state(state.domainStates[state.currentdomain], self.domainString)
+            action_name = self.actions.action_names[action]
+            act_slot = 'general'
+            for slot in dip_state.slots:
+                if slot in action_name:
+                    act_slot = slot
+            flat_belief = dip_state.get_beliefStateVec(act_slot)
+            self.prev_state_check = flat_belief
+
+            return flat_belief, action
+
+    def nextAction(self, beliefstate):
+        '''
+        Select the next action Q-values (the feudal policy converts them into an action).
+
+        :param beliefstate: current belief state vector
+        :returns: vector of Q-values over the summary actions
+        '''
+
+        if self.exploration_type == 'e-greedy':
+            # epsilon greedy
+            if self.is_training and utils.Settings.random.rand() < self.epsilon:
+                action_Q = np.random.rand(len(self.action_names))
+            else:
+                action_Q = self.bbqn.predict(np.reshape(beliefstate, (1, len(beliefstate))))  # + (1. / (1. + i + j))
+
+                self.episode_ave_max_q.append(np.max(action_Q))
+
+        # return the Q vect, the action will be converted in the feudal policy
+        return action_Q
+
+
+    def train(self):
+        '''
+        call this function when the episode ends
+        '''
+
+        if not self.is_training:
+            logger.info("Not in training mode")
+            return
+        else:
+            logger.info("Update dqn policy parameters.")
+
+        self.episodecount += 1
+        logger.info("Sample Num so far: %s" % (self.samplecount))
+        logger.info("Episode Num so far: %s" % (self.episodecount))
+
+        if self.samplecount >= self.minibatch_size * 10 and self.episodecount % self.training_frequency == 0:
+            logger.info('start training...')
+
+            s_batch, s_ori_batch, a_batch, r_batch, s2_batch, s2_ori_batch, t_batch, idx_batch, _ = \
+                self.episodes[self.domainString].sample_batch()
+
+            s_batch = np.vstack([np.expand_dims(x, 0) for x in s_batch])
+            s2_batch = np.vstack([np.expand_dims(x, 0) for x in s2_batch])
+
+            a_batch_one_hot = np.eye(self.action_dim, self.action_dim)[a_batch]
+            action_q = self.bbqn.predict_dip(s2_batch, a_batch_one_hot)
+            target_q = self.bbqn.predict_target_dip(s2_batch, a_batch_one_hot)
+            # print 'action Q and target Q:', action_q, target_q
+
+            y_i = []
+            for k in range(min(self.minibatch_size, self.episodes[self.domainString].size())):
+                Q_bootstrap_label = 0
+                if t_batch[k]:
+                    Q_bootstrap_label = r_batch[k]
+                else:
+                    if self.q_update == 'single':
+                        belief = s2_ori_batch[k]
+                        execMask = [0.0] * len(self.action_names)  # TODO: find out how to compute the mask here, or save it when recording the state
+                        execMask[-1] = -sys.maxsize
+                        action_Q = target_q[k]
+                        admissible = np.add(action_Q, np.array(execMask))
+                        Q_bootstrap_label = r_batch[k] + self.gamma * np.max(admissible)
+
+                y_i.append(Q_bootstrap_label)
+
+            # Update the critic given the targets
+            reshaped_yi = np.vstack([np.expand_dims(x, 0) for x in y_i])
+
+            predicted_q_value, _, currentLoss, logLikelihood, varFC2, meanFC2, td_error, KL_div = self.bbqn.train(s_batch, a_batch_one_hot, reshaped_yi, self.episodecount)
+
+        if self.episodecount % self.target_update_freq == 0:
+            self.bbqn.update_target_network()
+        if self.episodecount % self.save_step == 0:
+            self.savePolicyInc()  # self.out_policy_file)
+
+
+# END OF FILE
diff --git a/policy/feudalgainRL/FeudalBBQNPolicyNew.py b/policy/feudalgainRL/FeudalBBQNPolicyNew.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d35709818b1fdeed16592c3bcc0cafb9a21c727
--- /dev/null
+++ b/policy/feudalgainRL/FeudalBBQNPolicyNew.py
@@ -0,0 +1,416 @@
+###############################################################################
+# PyDial: Multi-domain Statistical Spoken Dialogue System Software
+###############################################################################
+#
+# Copyright 2015 - 2019
+# Cambridge University Engineering Department Dialogue Systems Group
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+###############################################################################
+
+'''
+FeudalBBQNPolicyNew.py - feudal deep Q network policy (BBQN variant)
+==================================================
+
+Copyright CUED Dialogue Systems Group 2015 - 2017
+
+.. seealso:: CUED Imports/Dependencies:
+
+    import :class:`Policy`
+    import :class:`utils.ContextLogger`
+
+.. warning::
+        Documentation not done.
+
+
+************************
+
+'''
+
+import copy
+import os
+import sys
+import json
+import numpy as np
+import pickle as pickle
+from itertools import product
+from scipy.stats import entropy
+import utils
+from utils.Settings import config as cfg
+from utils import ContextLogger, DiaAct, DialogueState
+
+import ontology.FlatOntologyManager as FlatOnt
+import tensorflow as tf
+from policy.DRL.replay_buffer import ReplayBuffer
+from policy.DRL.replay_prioritised import ReplayPrioritised
+import policy.DRL.utils as drlutils
+import policy.DRL.dqn as dqn
+import policy.Policy
+import policy.DQNPolicy
+import policy.SummaryAction
+from policy.Policy import TerminalAction, TerminalState
+from policy.feudalRL.DIP_parametrisation import DIP_state, padded_state
+from policy.feudalRL.feudalUtils import get_feudal_masks
+from policy.DRL import bdqn as bbqn
+
+
+logger = utils.ContextLogger.getLogger('')
+
+
+class FeudalDQNPolicy(policy.DQNPolicy.DQNPolicy):
+    '''Derived from :class:`DQNPolicy`
+    '''
+
+    def __init__(self, in_policy_file, out_policy_file, domainString='CamRestaurants', is_training=False,
+                 action_names=None, slot=None, sd_state_dim=50, js_threshold=0, info_reward=0.0, jsd_reward=False,
+                 jsd_function=None):
+        super(FeudalDQNPolicy, self).__init__(in_policy_file, out_policy_file, domainString, is_training)
+
+        tf.reset_default_graph()
+
+        self.domainString = domainString
+        self.sd_state_dim = sd_state_dim
+        self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString)
+        self.in_policy_file = in_policy_file
+        self.out_policy_file = out_policy_file
+        self.is_training = is_training
+        self.accum_belief = []
+        self.info_reward = info_reward
+        self.js_threshold = js_threshold
+        self.jsd_reward = jsd_reward
+        self.jsd_function = jsd_function
+        if self.jsd_function is not None:
+            print("We use the JSD-function", self.jsd_function)
+        if self.js_threshold != 1.0 and not self.jsd_reward:
+            print("We use JS-divergence, threshold =", self.js_threshold)
+        if self.jsd_reward:
+            print("We train with raw JSD reward.")
+        self.slots = slot
+        self.features = 'dip'
+        if cfg.has_option('feudalpolicy', 'features'):
+            self.features = cfg.get('feudalpolicy', 'features')
+        self.actfreq_ds = False
+        if cfg.has_option('feudalpolicy', 'actfreq_ds'):
+            self.actfreq_ds = cfg.getboolean('feudalpolicy', 'actfreq_ds')
+
+        self.prev_state_check = None
+
+        self.max_k = 5
+        if cfg.has_option('dqnpolicy', 'max_k'):
+            self.max_k = cfg.getint('dqnpolicy', 'max_k')
+
+        self.capacity *= 5  # the capacity was tuned for episode-based methods; scale it up for turn-based replay
+
+        # init session
+        self.sess = tf.Session()
+        with tf.device("/cpu:0"):
+
+            np.random.seed(self.randomseed)
+            tf.set_random_seed(self.randomseed)
+
+            # initialise a replay buffer
+            if self.replay_type == 'vanilla':
+                self.episodes[self.domainString] = ReplayBuffer(self.capacity, self.minibatch_size*4, self.randomseed)
+            elif self.replay_type == 'prioritized':
+                self.episodes[self.domainString] = ReplayPrioritised(self.capacity, self.minibatch_size,
+                                                                     self.randomseed)
+            self.samplecount = 0
+            self.episodecount = 0
+
+            # construct the models
+            self.summaryaction = policy.SummaryAction.SummaryAction(domainString)
+            self.action_names = action_names
+            self.action_dim = len(self.action_names)
+            action_bound = len(self.action_names)
+            self.stats = [0 for _ in range(self.action_dim)]
+
+            if self.features == 'learned' or self.features == 'rnn':
+                si_state_dim = 73
+                if self.actfreq_ds:
+                    if self.domainString == 'CamRestaurants':
+                        si_state_dim += 9#16
+                    elif self.domainString == 'SFRestaurants':
+                        si_state_dim += 9#25
+                    elif self.domainString == 'Laptops11':
+                        si_state_dim += 9#40
+                self.sd_enc_size = 50
+                self.si_enc_size = 25
+                self.dropout_rate = 0.
+                if cfg.has_option('feudalpolicy', 'sd_enc_size'):
+                    self.sd_enc_size = cfg.getint('feudalpolicy', 'sd_enc_size')
+                if cfg.has_option('feudalpolicy', 'si_enc_size'):
+                    self.si_enc_size = cfg.getint('feudalpolicy', 'si_enc_size')
+                if cfg.has_option('feudalpolicy', 'dropout_rate') and self.is_training:
+                    self.dropout_rate = cfg.getfloat('feudalpolicy', 'dropout_rate')
+
+                self.state_dim = si_state_dim + sd_state_dim
+                if self.features == 'learned':
+
+                    self.dqn = bbqn.DeepQNetwork(self.sess, self.state_dim, self.action_dim, self.learning_rate,
+                                                  self.tau,
+                                                  action_bound, self.architecture, self.h1_size, self.h2_size,
+                                                  self.n_samples,
+                                                  self.minibatch_size)
+
+                elif self.features == 'rnn':
+                    self.dqn = dqn.RNNFDeepQNetwork(self.sess, si_state_dim, sd_state_dim, self.action_dim,
+                                                   self.learning_rate, self.tau, action_bound, self.minibatch_size,
+                                                   self.architecture, self.h1_size, self.h2_size,
+                                                   sd_enc_size=self.sd_enc_size, si_enc_size=self.si_enc_size,
+                                                   dropout_rate=self.dropout_rate, slot=self.slots)
+            else:  # self.features == 'dip'
+                if self.actfreq_ds:
+                    if self.domainString == 'CamRestaurants':
+                        self.state_dim += 9#16
+                    elif self.domainString == 'SFRestaurants':
+                        self.state_dim += 9#25
+                    elif self.domainString == 'Laptops11':
+                        self.state_dim += 9#40
+                self.dqn = dqn.DeepQNetwork(self.sess, self.state_dim, self.action_dim,
+                                            self.learning_rate, self.tau, action_bound, self.minibatch_size,
+                                            self.architecture, self.h1_size,
+                                            self.h2_size, dropout_rate=self.dropout_rate)
+
+            # when all models are defined, init all variables (this might need to be sent to the main policy too)
+            init_op = tf.global_variables_initializer()
+            self.sess.run(init_op)
+
+            self.loadPolicy(self.in_policy_file)
+            print('loaded replay size: ', self.episodes[self.domainString].size())
+
+            self.dqn.update_target_network()
+
+    def record(self, reward, domainInControl=None, weight=None, state=None, action=None, exec_mask=None):
+        if domainInControl is None:
+            domainInControl = self.domainString
+        if self.actToBeRecorded is None:
+            self.actToBeRecorded = self.summaryAct
+
+        if state is None:
+            state = self.prevbelief
+        if action is None:
+            action = self.actToBeRecorded
+
+        cState, cAction = state, action
+        # normalising total return to -1~1
+        reward /= 20.0
+
+        if self.replay_type == 'vanilla':
+            self.episodes[domainInControl].record(state=cState, \
+                                                  state_ori=state, action=cAction, reward=reward)
+
+        self.actToBeRecorded = None
+        self.samplecount += 1
+
+    def finalizeRecord(self, reward, domainInControl=None):
+        if domainInControl is None:
+            domainInControl = self.domainString
+        if self.episodes[domainInControl] is None:
+            logger.warning("record attempted to be finalized for domain where nothing has been recorded before")
+            return
+
+        reward /= 20.0
+
+        terminal_state, terminal_action = self.convertStateAction(TerminalState(), TerminalAction())
+
+        if self.replay_type == 'vanilla':
+            self.episodes[domainInControl].record(state=terminal_state, \
+                                                  state_ori=TerminalState(), action=terminal_action, reward=reward,
+                                                  terminal=True)
+        elif self.replay_type == 'prioritized':
+            self.episodes[domainInControl].record(state=terminal_state, \
+                                                      state_ori=TerminalState(), action=terminal_action, reward=reward, \
+                                                      Q_s_t_a_t_=0.0, gamma_Q_s_tplu1_maxa_=0.0, uniform=False,
+                                                      terminal=True)
+            print('total TD', self.episodes[self.domainString].tree.total())
+
+    def convertStateAction(self, state, action):
+        '''
+        Convert the full dialogue state into the slot-specific DIP belief vector matching the recorded action.
+        '''
+        if isinstance(state, TerminalState):
+            return [0] * 89, action
+        else:
+            if self.features == 'learned' or self.features == 'rnn':
+                dip_state = padded_state(state.domainStates[state.currentdomain], self.domainString)
+            else:
+                dip_state = DIP_state(state.domainStates[state.currentdomain], self.domainString)
+            action_name = self.actions.action_names[action]
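+            # the feudal policy acts per slot: find the slot (if any) named in the summary action, fall back to
+            # 'general' otherwise, and use that slot's DIP belief vector as the state representation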
+            act_slot = 'general'
+            for slot in dip_state.slots:
+                if slot in action_name:
+                    act_slot = slot
+            flat_belief = dip_state.get_beliefStateVec(act_slot)
+            self.prev_state_check = flat_belief
+
+            return flat_belief, action
+
+    def nextAction(self, beliefstate):
+        '''
+        select next action
+
+        :param beliefstate: already converted to dipstatevec of the specific slot (or general)
+        :returns: (int) next summary action
+        '''
+
+        if self.exploration_type == 'e-greedy':
+            # epsilon greedy
+            if self.is_training and utils.Settings.random.rand() < self.epsilon:
+                action_Q = np.random.rand(len(self.action_names))
+            else:
+                if len(beliefstate.shape) == 1:
+                    action_Q = self.dqn.predict(np.reshape(beliefstate, (1, -1)))
+                else:
+                    action_Q = self.dqn.predict(beliefstate)
+                # add current max Q to self.episode_ave_max_q
+                self.episode_ave_max_q.append(np.max(action_Q))
+
+        # return the Q vector; the action will be converted in the feudal policy
+        return action_Q
+
+    def train(self):
+        '''
+        call this function when the episode ends
+        '''
+
+        if not self.is_training:
+            logger.info("Not in training mode")
+            return
+        else:
+            logger.info("Update dqn policy parameters.")
+
+        self.episodecount += 1
+        logger.info("Sample Num so far: %s" % (self.samplecount))
+        logger.info("Episode Num so far: %s" % (self.episodecount))
+
+        s_batch_new, s_batch_beliefstate, s_batch_chosen_slot, s2_batch_dipstate, s2_batch_beliefstate, t_batch_new, r_batch_new = \
+            [], [], [], [], [], [], []
+
+        if self.samplecount >= self.minibatch_size * 8 and self.episodecount % self.training_frequency == 0:
+            logger.info('start training...')
+
+            a_batch_one_hot_new = None
+            # we only update state-action pairs where the action is not "pass()", which complicates things:
+            # each sampled batch is filtered down to non-pass() actions, so we keep sampling until we have enough valid samples
+
+            while len(s_batch_new) < self.minibatch_size:
+
+                s_batch, s_ori_batch, a_batch, r_batch, s2_batch, s2_ori_batch, t_batch, idx_batch, _ = \
+                    self.episodes[self.domainString].sample_batch()
+
+                a_batch_one_hot = np.eye(self.action_dim, self.action_dim)[a_batch]
+                # keep only state-action pairs where action != pass() (pass() is the last action index)
+                valid_steps = [action[-1] != 1 for action in a_batch_one_hot]
+                a_batch_one_hot = a_batch_one_hot[valid_steps]
+
+                s_batch_new += [s[0] for i, s in enumerate(s_batch) if valid_steps[i]]
+                s_batch_beliefstate += [s[1] for i, s in enumerate(s_batch) if valid_steps[i]]
+                s_batch_chosen_slot += [s[2] for i, s in enumerate(s_batch) if valid_steps[i]]
+
+                s2_batch_dipstate += [s[3] for s, valid in zip(s2_batch, valid_steps) if valid]
+                s2_batch_beliefstate += [s[1] for s, valid in zip(s2_batch, valid_steps) if valid]
+
+                r_batch_new += [r for r, valid in zip(r_batch, valid_steps) if valid]
+                t_batch_new += [t for t, valid in zip(t_batch, valid_steps) if valid]
+
+                if a_batch_one_hot_new is None:
+                    a_batch_one_hot_new = a_batch_one_hot
+                else:
+                    a_batch_one_hot_new = np.vstack((a_batch_one_hot_new, a_batch_one_hot))
+
+            s_batch_new = np.vstack(s_batch_new)
+            s2_batch_dipstate = np.vstack(s2_batch_dipstate)
+
+            if self.js_threshold < 1.0 or self.jsd_reward:
+                #TODO: This is highly inefficient
+                js_divergence_batch = []
+                for belief, belief2, slot in zip(s_batch_beliefstate, s2_batch_beliefstate, s_batch_chosen_slot):
+                    if slot != "None":
+                        keys = belief['beliefs'][slot].keys()
+
+                        b = [belief['beliefs'][slot]['**NONE**']] + \
+                            [belief['beliefs'][slot][value] for value in list(keys) if value != '**NONE**']
+
+                        b_2 = [belief2['beliefs'][slot]['**NONE**']] + \
+                              [belief2['beliefs'][slot][value] for value in list(keys) if value != '**NONE**']
+
+                        js_divergence = self.compute_js_divergence(b, b_2)
+                        js_divergence_batch.append(js_divergence)
+                    else:
+                        js_divergence_batch.append(0.0)
+            else:
+                js_divergence_batch = [0] * len(r_batch_new)
+
+            tanh_n = np.tanh(1)
+            if self.jsd_reward:
+                if self.jsd_function == 'tanh':
+                    js_divergence_batch = np.tanh(np.array(js_divergence_batch)) / tanh_n
+                # rescale the JSD linearly from [0, 1] to [-1, 1]
+                js_divergence_batch = (-1 + 2 * np.array(js_divergence_batch)).tolist()
+            elif self.js_threshold < 1.0:
+                # threshold bonus: 2/20 (i.e. 0.1) for each step whose JSD exceeds the threshold, matching the /20 reward scaling
+                js_divergence_batch = [2/20 * int(x > self.js_threshold) for x in js_divergence_batch]
+
+            action_q = self.dqn.predict_dip(s2_batch_dipstate, a_batch_one_hot_new)
+            target_q = self.dqn.predict_target_dip(s2_batch_dipstate, a_batch_one_hot_new)
+
+            action_q = np.reshape(action_q, (s_batch_new.shape[0], -1, self.action_dim))
+            target_q = np.reshape(target_q, (s_batch_new.shape[0], -1, self.action_dim))
+
+            y_i = []
+            for k in range(min(s_batch_new.shape[0], self.episodes[self.domainString].size())):
+                Q_bootstrap_label = 0
+                if t_batch_new[k]:
+                    Q_bootstrap_label = r_batch_new[k]
+                else:
+                    if self.q_update == 'single':
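+                        # vanilla DQN target: bootstrap from the maximum of the target network's Q-values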
+                        action_Q = target_q[k]
+                        if self.jsd_reward:
+                            Q_bootstrap_label = js_divergence_batch[k] + self.gamma * np.max(action_Q)
+                        else:
+                            Q_bootstrap_label = r_batch_new[k] + js_divergence_batch[k] + self.gamma * np.max(action_Q)
+                    elif self.q_update == 'double':
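+                        # double DQN target: the greedy action is chosen with the online network and evaluated with the target network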
+                        action_Q = action_q[k]
+                        argmax_tuple = np.unravel_index(np.argmax(action_Q, axis=None), action_Q.shape)
+                        value_Q = target_q[k][argmax_tuple]
+                        Q_bootstrap_label = r_batch_new[k] + js_divergence_batch[k] + self.gamma * value_Q
+                y_i.append(Q_bootstrap_label)
+
+                if self.replay_type == 'prioritized':
+                    # update the sum-tree
+                    # update the TD error of the samples in the minibatch
+                    currentQ_s_a_ = action_q[k][a_batch[k]]
+                    error = abs(currentQ_s_a_ - Q_bootstrap_label)
+                    self.episodes[self.domainString].update(idx_batch[k], error)
+
+            reshaped_yi = np.vstack([np.expand_dims(x, 0) for x in y_i])
+
+            predicted_q_value, _, currentLoss = self.dqn.train(s_batch_new, a_batch_one_hot_new, reshaped_yi)
+
+            if self.episodecount % 1 == 0:
+                # Update target networks
+                self.dqn.update_target_network()
+
+        self.savePolicyInc()
+
+    def compute_js_divergence(self, P, Q):
+
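+        # Jensen-Shannon divergence: JSD(P||Q) = 0.5*KL(P||M) + 0.5*KL(Q||M) with mixture M = (P+Q)/2;
+        # scipy.stats.entropy normalises its inputs, so passing the unnormalised mixture p + q is equivalent.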
+        M = [p + q for p, q in zip(P, Q)]
+        return 0.5 * (entropy(P, M, base=2) + entropy(Q, M, base=2))
+
+# END OF FILE
diff --git a/policy/feudalgainRL/FeudalENACPolicy.py b/policy/feudalgainRL/FeudalENACPolicy.py
new file mode 100644
index 0000000000000000000000000000000000000000..216c90e3120f66aa13e49ca2f3db4204711b442a
--- /dev/null
+++ b/policy/feudalgainRL/FeudalENACPolicy.py
@@ -0,0 +1,514 @@
+###############################################################################
+# PyDial: Multi-domain Statistical Spoken Dialogue System Software
+###############################################################################
+#
+# Copyright 2015 - 2019
+# Cambridge University Engineering Department Dialogue Systems Group
+#
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+###############################################################################
+
+'''
+FeudalENACPolicy.py - feudal episodic Natural Actor-Critic (eNAC) policy
+==================================================
+
+Copyright CUED Dialogue Systems Group 2015 - 2017
+
+.. seealso:: CUED Imports/Dependencies: 
+
+    import :class:`Policy`
+    import :class:`utils.ContextLogger`
+
+.. warning::
+        Documentation not done.
+
+
+************************
+
+'''
+
+import copy
+import os
+import json
+import numpy as np
+import scipy
+import scipy.signal
+import pickle
+import random
+import utils
+from utils.Settings import config as cfg
+from utils import ContextLogger, DiaAct
+
+import ontology.FlatOntologyManager as FlatOnt
+#from theano_dialogue.util.tool import *
+
+import tensorflow as tf
+from policy.DRL.replay_buffer_episode_enac import ReplayBufferEpisode
+from policy.DRL.replay_prioritised_episode import ReplayPrioritisedEpisode
+import policy.DRL.utils as drlutils
+import policy.DRL.enac as enac
+import policy.Policy
+from policy.ENACPolicy import ENACPolicy
+import policy.SummaryAction
+from policy.Policy import TerminalAction, TerminalState
+from policy.feudalRL.DIP_parametrisation import DIP_state, padded_state
+from policy.feudalRL.feudalUtils import get_feudal_masks
+
+logger = utils.ContextLogger.getLogger('')
+
+
+# Discounting function used to calculate discounted returns.
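+# lfilter with b=[1], a=[1, -gamma] applied to the time-reversed sequence implements the recursion
+# y_t = x_t + gamma * y_{t+1}, i.e. the discounted return G_t = sum_k gamma^k * x_{t+k}.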
+def discount(x, gamma):
+    return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]
+
+class FeudalENACPolicy(ENACPolicy):
+    '''Derived from :class:`Policy`
+    '''
+    def __init__(self, in_policy_file, out_policy_file, domainString='CamRestaurants', is_training=False, action_names=None, slot=None):
+        super(FeudalENACPolicy, self).__init__(in_policy_file, out_policy_file, domainString=domainString, is_training=is_training)
+
+        tf.reset_default_graph()
+
+        self.in_policy_file = in_policy_file
+        self.out_policy_file = out_policy_file
+        self.is_training = is_training
+        self.accum_belief = []
+        self.prev_state_check = None
+
+        self.domainString = domainString
+        self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString)
+
+        self.features = 'dip'
+        self.sd_enc_size = 80
+        self.si_enc_size = 40
+        self.dropout_rate = 0.
+        if cfg.has_option('feudalpolicy', 'features'):
+            self.features = cfg.get('feudalpolicy', 'features')
+        if cfg.has_option('feudalpolicy', 'sd_enc_size'):
+            self.sd_enc_size = cfg.getint('feudalpolicy', 'sd_enc_size')
+        if cfg.has_option('feudalpolicy', 'si_enc_size'):
+            self.si_enc_size = cfg.getint('feudalpolicy', 'si_enc_size')
+        if cfg.has_option('feudalpolicy', 'dropout_rate') and self.is_training:
+            self.dropout_rate = cfg.getfloat('feudalpolicy', 'dropout_rate')
+
+
+        # init session
+        self.sess = tf.Session()
+        with tf.device("/cpu:0"):
+
+            np.random.seed(self.randomseed)
+            tf.set_random_seed(self.randomseed)
+
+            # initialise a replay buffer
+            if self.replay_type == 'vanilla':
+                self.episodes[self.domainString] = ReplayBufferEpisode(self.capacity, self.minibatch_size, self.randomseed)
+            elif self.replay_type == 'prioritized':
+                self.episodes[self.domainString] = ReplayPrioritisedEpisode(self.capacity, self.minibatch_size, self.randomseed)
+            #replay_buffer = ReplayBuffer(self.capacity, self.randomseed)
+            #self.episodes = []
+            self.samplecount = 0
+            self.episodecount = 0
+
+            # construct the models
+            self.state_dim = 89  # current DIP state dim
+            self.summaryaction = policy.SummaryAction.SummaryAction(domainString)
+            self.action_names = action_names
+            self.action_dim = len(self.action_names)
+            action_bound = len(self.action_names)
+            self.stats = [0 for _ in range(self.action_dim)]
+
+            if self.features == 'dip':
+                self.enac = enac.ENACNetwork(self.sess, self.state_dim, self.action_dim, self.critic_lr, self.tau,
+                                         action_bound, self.architecture, self.h1_size, self.h2_size, self.is_training)
+            elif self.features == 'learned' or self.features == 'rnn':
+                si_state_dim = 72
+                if self.domainString == 'CamRestaurants':
+                    sd_state_dim = 94
+                elif self.domainString == 'SFRestaurants':
+                    sd_state_dim = 158
+                elif self.domainString == 'Laptops11':
+                    sd_state_dim = 13
+                else:
+                    logger.error(
+                        'Domain {} not implemented in feudal-DQN yet'.format(self.domainString))  # just find out the size of sd_state_dim for the new domain
+                if self.features == 'rnn':
+                    arch = 'rnn'
+                else:
+                    arch = 'vanilla'
+                self.state_dim = si_state_dim + sd_state_dim
+                self.enac = enac.ENACNetwork(self.sess, self.state_dim, self.action_dim, self.critic_lr, self.tau,
+                                             action_bound, self.architecture, self.h1_size, self.h2_size, self.is_training)
+            else:
+                logger.error('features "{}" not implemented'.format(self.features))
+
+            # when all models are defined, init all variables
+            init_op = tf.global_variables_initializer()
+            self.sess.run(init_op)
+
+            self.loadPolicy(self.in_policy_file)
+            print('loaded replay size: ', self.episodes[self.domainString].size())
+
+
+    def convertStateAction(self, state, action):
+        '''
+
+        '''
+        if isinstance(state, TerminalState):
+            return [0] * 89, action
+
+        else:
+            if self.features == 'learned' or self.features == 'rnn':
+                dip_state = padded_state(state.domainStates[state.currentdomain], self.domainString)
+            else:
+                dip_state = DIP_state(state.domainStates[state.currentdomain], self.domainString)
+            action_name = self.actions.action_names[action]
+            act_slot = 'general'
+            for slot in dip_state.slots:
+                if slot in action_name:
+                    act_slot = slot
+            flat_belief = dip_state.get_beliefStateVec(act_slot)
+            self.prev_state_check = flat_belief
+
+            return flat_belief, action
+
+    def record(self, reward, domainInControl=None, weight=None, state=None, action=None):
+        if domainInControl is None:
+            domainInControl = self.domainString
+        if self.actToBeRecorded is None:
+            #self.actToBeRecorded = self.lastSystemAction
+            self.actToBeRecorded = self.summaryAct
+
+        if state is None:
+            state = self.prevbelief
+        if action is None:
+            action = self.actToBeRecorded
+
+        cState, cAction = state, action
+
+        # normalising total return to -1~1
+        reward /= 20.0
+
+        #value = self.a2c.predict_value([cState])
+        value = np.array([[0.0]])
+        policy_mu = self.mu_prob
+
+        if weight is None:
+            if self.replay_type == 'vanilla':
+                self.episodes[domainInControl].record(state=cState, \
+                        state_ori=state, action=cAction, reward=reward, value=value[0][0], distribution=policy_mu)
+            elif self.replay_type == 'prioritized':
+                self.episodes[domainInControl].record(state=cState, \
+                        state_ori=state, action=cAction, reward=reward, value=value[0][0], distribution=policy_mu)
+        else:
+            self.episodes[domainInControl].record(state=cState, state_ori=state, action=cAction, reward=reward, ma_weight=weight)
+
+        self.actToBeRecorded = None
+        self.samplecount += 1
+        return
+
+    def nextAction(self, beliefstate):
+        '''
+        select next action
+
+        :param beliefstate:
+        :returns: (int) next summary action
+        '''
+
+        if self.exploration_type == 'e-greedy':
+
+            # epsilon greedy
+            if self.is_training and utils.Settings.random.rand() < self.epsilon:
+                action_prob = np.random.rand(len(self.action_names))
+
+                # Importance sampling (should be turned off)
+                #if nextaIdex == greedyNextaIdex:
+                #    self.mu_prob = self.epsilon / float(self.action_dim) + 1 - self.epsilon
+                #else:
+                #    self.mu_prob = self.epsilon / float(self.action_dim)
+            else:
+                action_prob = self.enac.predict_policy(np.reshape(beliefstate, (1, len(beliefstate))))
+
+                # add current max Q to self.episode_ave_max_q
+                #print 'current maxQ', np.max(admissible)
+                #self.episode_ave_max_q.append(np.max(admissible))
+                
+                # Importance sampling
+                #self.mu_prob = self.epsilon / float(self.action_dim) + 1 - self.epsilon
+
+        return action_prob
+
+    def train(self):
+        '''
+        call this function when the episode ends
+        '''
+
+        if not self.is_training:
+            logger.info("Not in training mode")
+            return
+        else:
+            logger.info("Update enac policy parameters.")
+
+        self.episodecount += 1
+        logger.info("Sample Num so far: %s" %(self.samplecount))
+        logger.info("Episode Num so far: %s" %(self.episodecount))
+
+        if self.samplecount >= self.minibatch_size and self.episodecount % self.training_frequency == 0:
+            logger.info('start training...')
+
+            s_batch, s_ori_batch, a_batch, r_batch, s2_batch, s2_ori_batch, t_batch, idx_batch, v_batch, mu_policy = \
+                self.episodes[self.domainString].sample_batch()
+
+            discounted_return_batch = []
+        
+
+            def weightsImportanceSampling(mu_policy, r_batch):
+                mu_policy = np.asarray(mu_policy)
+                mu_cum = []
+                lengths = []  # dialogue lengths, used later to split the flattened pi_policy back into dialogues
+                for mu in mu_policy:
+                    lengths.append(len(mu))
+                    mu = np.asarray(mu).astype(np.longdouble)
+                    mu_cum.append(np.cumprod(mu[::-1])[::-1])  # going forward with cumulative product
+                # mu_cum = np.concatenate(np.array(mu_cum), axis=0).tolist()
+                mu_policy = np.concatenate(np.array(mu_policy), axis=0).tolist()  # concatenate all behavioral probs
+                lengths = np.cumsum(lengths)  # time steps for ends of dialogues
+                lengths = np.concatenate((np.array([0]), lengths), axis=0)  # add first dialogue
+
+                if self.importance_sampling == 'max':
+                    pass
+                elif self.importance_sampling == "soft":
+                    # get the probabilities of actions taken from the batch
+                    pi_policy = self.enac.getPolicy(np.concatenate(np.array(s_batch), axis=0).tolist())[0]  # policy given s_t
+                    columns = np.asarray([np.concatenate(a_batch, axis=0).tolist()]).astype(int)  # actions taken at s_t
+                    rows = np.asarray([ii for ii in range(len(pi_policy))])
+                    pi_policy = pi_policy[rows, columns][0].astype(np.longdouble)  # getting probabilities for current policy
+
+                #####################################
+                # Weights for importance sampling
+                # it goes through each dialogue and computes in reverse order cumulative prod:
+                # rho_n = pi_n / mu_n
+                # ...
+                # rho_1 = pi_1 / mu_1 *  ... * pi_n / mu_n
+                # using dialogue and weight_cum lists
+                #####################################
+
+                rho_forward = []  # rho_forward from eq. 3.3 (the first one)
+                rho_whole = []  # product across the whole dialogue from eq. 3.3 (the second one)
+                #pi_cum2 = []  # stats to compare
+                #mu_cum2 = []  # stats to compare
+                #pi_cum = []  # stats to compare
+
+                # Precup version
+                r_vector = np.concatenate(np.array(r_batch), axis=0).tolist()
+                r_weighted = []
+
+                for ii in range(len(lengths) - 1):  # over dialogues
+                    weight_cum = 1.
+                    dialogue = []
+
+                    for pi, mu in zip(pi_policy[lengths[ii]:lengths[ii + 1]], mu_policy[lengths[ii]:lengths[ii + 1]]):
+                        weight_cum *= pi / mu
+                        dialogue.append(weight_cum)
+
+                    dialogue = np.array(dialogue)
+                    dialogue = np.clip(dialogue, 0.5, 1)  # clipping the weights
+                    dialogue = dialogue.tolist()
+
+                    rho_forward.extend(dialogue)
+                    #rho_whole.append(dialogue[-1])
+                    rho_whole.extend(np.ones(len(dialogue)) * dialogue[-1])
+                    r_weighted.extend(r_vector[lengths[ii]: lengths[ii + 1]] * np.asarray(dialogue))
+
+                # go back to original form:
+                ind = 0
+                r_new = copy.deepcopy(r_batch)
+                for id, batch in enumerate(r_new):
+                    for id2, _ in enumerate(batch):
+                        r_new[id][id2] = r_weighted[ind]
+                        ind += 1
+
+                # ONE STEP WEIGHTS
+                weights = np.asarray(pi_policy) / np.asarray(mu_policy)
+                weights = np.clip(weights, 0.5, 1)  # clipping the weights
+
+                return weights, rho_forward, rho_whole, r_new
+
+            weights, rho_forward, rho_whole, r_new = weightsImportanceSampling(mu_policy, r_batch)
+
+            weights = np.nan_to_num(weights)
+            rho_forward = np.nan_to_num(rho_forward)
+            rho_whole = np.nan_to_num(rho_whole)
+            """
+            print 'w',weights
+            print 'rho_for',rho_forward
+            print 'rho_who',rho_whole
+            """
+
+            def calculate_discountR(r_episode, idx):
+                #########################################################################
+                # Here we take the rewards and values from the rollouts, and use them to
+                # generate the advantage and discounted returns.
+                # The advantage function uses "Generalized Advantage Estimation"
+                bootstrap_value = 0.0
+                # r_episode rescale by rhos?
+                self.r_episode_plus = np.asarray(r_episode[idx:] + [bootstrap_value])
+                if self.importance_sampling:
+                    self.r_episode_plus = self.r_episode_plus
+                else:
+                    self.r_episode_plus = self.r_episode_plus/rho_forward[idx]
+                discounted_r_episode = discount(self.r_episode_plus, self.gamma)[:-1]
+                #########################################################################
+                return discounted_r_episode[0]
+
+            if self.replay_type == 'prioritized':
+                for item_r, item_v, item_idx in zip(r_new, v_batch, idx_batch):
+                    rlist = []
+                    for idx in range(len(item_r)):
+                        r = calculate_discountR(item_r, idx)
+                        rlist.append(r)
+
+                    discounted_return_batch.append(rlist[-1])
+            else:
+                for item_r, item_v in zip(r_new, v_batch):
+                    rlist = []
+                    for idx in range(len(item_r)):
+                        r = calculate_discountR(item_r, idx)
+                        rlist.append(r)
+
+                    discounted_return_batch.append(rlist[-1]) 
+
+            batch_size = len(s_batch)
+
+            if self.importance_sampling:
+                discounted_return_batch = np.clip(discounted_return_batch, -1, 1)
+
+            # get gradient info and create matrix
+            gradient_matrix = []
+            for item_s, item_a in zip(s_batch, a_batch):
+                item_a_one_hot = np.eye(self.action_dim)[item_a]
+                policy_gradient = self.enac.get_policy_gradient(item_s, item_a_one_hot)
+                policy_gradient = [(policy_gradient_idv.flatten()).tolist() for policy_gradient_idv in policy_gradient]
+                policy_gradient_flatten = np.hstack(policy_gradient)
+                policy_gradient_flatten = np.append(policy_gradient_flatten, [1.0])
+                gradient_matrix.append(policy_gradient_flatten.tolist())
+            
+            gradient_matrix = np.matrix(gradient_matrix)
+            return_matrix = np.matrix(discounted_return_batch)
+
+            logger.info("Updating eNAC policy parameters, before calculate eNac matrix")
+            try: 
+                natural_gradient = np.dot(np.linalg.pinv(gradient_matrix), return_matrix.T)
+                # convert a matrix to list-like array
+                natural_gradient = np.array(natural_gradient.flatten()).ravel()
+                natural_gradient = natural_gradient[:-1] # discard the last element
+            except np.linalg.LinAlgError:
+                natural_gradient = self.natural_gradient_prev 
+                print('SVD problem')
+
+            logger.info("Updating eNAC policy parameters, after calculate eNac matrix")
+
+            self.natural_gradient_prev = natural_gradient
+
+            all_params = self.enac.get_params()
+
+            cnt = 0
+            modelW = []
+            modelB = []
+            for variable in all_params:
+                       
+                shape = variable.shape
+                # bias vector
+                if np.array(variable).ndim == 1:
+                    until = np.array(variable).shape[0]
+                    subNG = np.reshape(natural_gradient[cnt:cnt+until],shape)
+                    cnt += until
+                    modelB.append(subNG)
+                # weight matrix
+                elif np.array(variable).ndim == 2:
+                    until = np.array(variable).shape[0]*np.array(variable).shape[1]
+                    subNG = np.reshape(natural_gradient[cnt:cnt+until],shape)
+                    cnt += until
+                    modelW.append(subNG)
+
+            a_batch_one_hot = np.eye(self.action_dim)[np.concatenate(a_batch, axis=0).tolist()]
+
+            policy_loss, entropy, all_loss, optimise = self.enac.train( \
+                    np.concatenate(np.array(s_batch), axis=0).tolist(), a_batch_one_hot, \
+                    modelW[0], modelB[0], modelW[1], modelB[1], modelW[2], modelB[2] \
+            )
+
+            norm_p_l, ent, norm_all_l = \
+                    policy_loss/float(batch_size), \
+                    entropy/float(batch_size), all_loss/float(batch_size)
+
+        self.savePolicyInc()  # self.out_policy_file)
+
+    def savePolicy(self, FORCE_SAVE=False):
+        """
+        Not used here: the agent calls this after every episode, but we only want to save the policy periodically (see savePolicyInc).
+        """
+        pass
+
+    def savePolicyInc(self, FORCE_SAVE=False):
+        """
+        save model and replay buffer
+        """
+        if self.episodecount % self.save_step == 0:
+            self.enac.save_network(self.out_policy_file+'.enac.ckpt')
+
+            f = open(self.out_policy_file+'.episode', 'wb')
+            for obj in [self.samplecount, self.episodes[self.domainString]]:
+                pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)
+            f.close()
+            #logger.info("Saving model to %s and replay buffer..." % save_path)
+
+    def saveStats(self, FORCE_SAVE=False):
+        f = open(self.out_policy_file + '.stats', 'wb')
+        pickle.dump(self.stats, f, protocol=pickle.HIGHEST_PROTOCOL)
+        f.close()
+
+    def loadPolicy(self, filename):
+        """
+        load model and replay buffer
+        """
+        # load models
+        self.enac.load_network(filename+'.enac.ckpt')
+        
+        # load replay buffer
+        try:
+            print('load from: ', filename)
+            f = open(filename+'.episode', 'rb')
+            loaded_objects = []
+            for i in range(2): # load nn params and collected data
+                loaded_objects.append(pickle.load(f))
+            self.samplecount = int(loaded_objects[0])
+            self.episodes[self.domainString] = copy.deepcopy(loaded_objects[1])
+            logger.info("Loading both model from %s and replay buffer..." % filename)
+            f.close()
+        except Exception:
+            logger.info("Loading only models...")
+
+    def restart(self):
+        self.summaryAct = None          
+        self.lastSystemAction = None
+        self.prevbelief = None
+        self.actToBeRecorded = None
+        self.epsilon = self.epsilon_start - (self.epsilon_start - self.epsilon_end) * float(self.episodeNum+self.episodecount) / float(self.maxiter)
+        self.episode_ave_max_q = []
+
+#END OF FILE
diff --git a/policy/feudalgainRL/FeudalNoisyACERPolicy.py b/policy/feudalgainRL/FeudalNoisyACERPolicy.py
new file mode 100644
index 0000000000000000000000000000000000000000..732ee8a0d2528e5773a271c3e915db312cbfd6d2
--- /dev/null
+++ b/policy/feudalgainRL/FeudalNoisyACERPolicy.py
@@ -0,0 +1,561 @@
+###############################################################################
+# PyDial: Multi-domain Statistical Spoken Dialogue System Software
+###############################################################################
+#
+# Copyright 2015 - 2019
+# Cambridge University Engineering Department Dialogue Systems Group
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+###############################################################################
+
+'''
+FeudalNoisyACERPolicy.py - feudal noisy ACER (Actor Critic with Experience Replay) policy
+==================================================
+
+Copyright CUED Dialogue Systems Group 2015 - 2017
+
+.. seealso:: CUED Imports/Dependencies:
+
+    import :class:`Policy`
+    import :class:`utils.ContextLogger`
+
+.. warning::
+        Documentation not done.
+
+
+************************
+
+'''
+import copy
+import os
+import json
+import numpy as np
+import scipy
+import scipy.signal
+from scipy.stats import entropy
+import pickle
+import random
+import utils
+from policy.feudalgainRL.NoisyACERPolicy import NoisyACERPolicy
+from utils.Settings import config as cfg
+from utils import ContextLogger, DiaAct
+
+import ontology.FlatOntologyManager as FlatOnt
+import tensorflow as tf
+from policy.DRL.replay_buffer_episode_acer import ReplayBufferEpisode
+from policy.DRL.replay_prioritised_episode import ReplayPrioritisedEpisode
+import policy.DRL.utils as drlutils
+#from policy.SACERPolicy import SACERPolicy
+import policy.feudalgainRL.noisyacer as noisy_acer
+import policy.Policy
+import policy.SummaryAction
+from policy.Policy import TerminalAction, TerminalState
+from policy.feudalgainRL.DIP_parametrisation import DIP_state, padded_state
+
+logger = utils.ContextLogger.getLogger('')
+
+# Discounting function used to calculate discounted returns.
+def discount(x, gamma):
+    return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]
+
+
+class FeudalNoisyACERPolicy(NoisyACERPolicy):
+    '''Derived from :class:`Policy`
+    '''
+    def __init__(self, in_policy_file, out_policy_file, domainString='CamRestaurants', is_training=False,
+                 action_names=None, slot=None, sd_state_dim=50, js_threshold=1.0, info_reward=0.0, load_policy=True,
+                 critic_regularizer_weight=0):
+        super(FeudalNoisyACERPolicy, self).__init__(in_policy_file, out_policy_file, domainString, is_training)
+
+        tf.reset_default_graph()
+
+        self.in_policy_file = in_policy_file
+        self.out_policy_file = out_policy_file
+        self.is_training = is_training
+        self.accum_belief = []
+        self.prev_state_check = None
+        self.sd_state_dim = sd_state_dim
+        self.info_reward = info_reward
+        self.js_threshold = js_threshold
+        if self.js_threshold != 1.0:
+            print("We train with JS-divergence, threshold =", self.js_threshold)
+
+        self.domainString = domainString
+        self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString)
+        self.critic_regularizer_weight = critic_regularizer_weight
+
+        self.features = 'dip'
+        self.sd_enc_size = 80
+        self.si_enc_size = 40
+        self.dropout_rate = 0.
+        if cfg.has_option('feudalpolicy', 'features'):
+            self.features = cfg.get('feudalpolicy', 'features')
+        if cfg.has_option('feudalpolicy', 'sd_enc_size'):
+            self.sd_enc_size = cfg.getint('feudalpolicy', 'sd_enc_size')
+        if cfg.has_option('feudalpolicy', 'si_enc_size'):
+            self.si_enc_size = cfg.getint('feudalpolicy', 'si_enc_size')
+        if cfg.has_option('feudalpolicy', 'dropout_rate') and self.is_training:
+            self.dropout_rate = cfg.getfloat('feudalpolicy', 'dropout_rate')
+        self.actfreq_ds = False
+        if cfg.has_option('feudalpolicy', 'actfreq_ds'):
+            self.actfreq_ds = cfg.getboolean('feudalpolicy', 'actfreq_ds')
+        self.noisy_acer = False
+        if cfg.has_option('policy', 'noisy_acer'):
+            self.noisy_acer = cfg.getboolean('policy', 'noisy_acer')
+
+        self.sample_argmax = False
+        if cfg.has_option('policy', 'sample_argmax'):
+            self.sample_argmax = cfg.getboolean('policy', 'sample_argmax')
+
+        if self.sample_argmax:
+            print("We sample argmax")
+
+        #self.log_path = cfg.get('exec_config', 'logfiledir')
+        #self.log_path = self.log_path + f"/{in_policy_file.split('/')[-1].split('.')[0]}-seed{self.randomseed}.npy"
+
+        self.load_policy = load_policy
+
+        # init session
+        self.sess = tf.Session()
+        with tf.device("/cpu:0"):
+
+            np.random.seed(self.randomseed)
+            tf.set_random_seed(self.randomseed)
+
+            # initialise a replay buffer
+            if self.replay_type == 'vanilla':
+                self.episodes[self.domainString] = ReplayBufferEpisode(self.capacity, self.minibatch_size, self.randomseed)
+            elif self.replay_type == 'prioritized':
+                self.episodes[self.domainString] = ReplayPrioritisedEpisode(self.capacity, self.minibatch_size, self.randomseed)
+            #replay_buffer = ReplayBuffer(self.capacity, self.randomseed)
+            #self.episodes = []
+            self.samplecount = 0
+            self.episodecount = 0
+
+            # construct the models
+            self.state_dim = 89  # current DIP state dim
+            self.summaryaction = policy.SummaryAction.SummaryAction(domainString)
+            self.action_names = action_names
+            self.action_dim = len(self.action_names)
+            action_bound = len(self.action_names)
+            self.stats = [0 for _ in range(self.action_dim)]
+
+            self.global_mu = [0. for _ in range(self.action_dim)]
+
+            if self.features == 'dip':
+                if self.actfreq_ds:
+                    if self.domainString == 'CamRestaurants':
+                        self.state_dim += 9#16
+                    elif self.domainString == 'SFRestaurants':
+                        self.state_dim += 9#25
+                    elif self.domainString == 'Laptops11':
+                        self.state_dim += 9#40
+                self.sacer = noisy_acer.NoisyACERNetwork(self.sess, self.state_dim, self.action_dim, self.critic_lr, self.delta,
+                                             self.c, self.alpha, self.h1_size, self.h2_size, self.is_training)
+            elif self.features == 'learned' or self.features == 'rnn':
+                si_state_dim = 73
+                if self.actfreq_ds:
+                    if self.domainString == 'CamRestaurants':
+                        si_state_dim += 9#16
+                    elif self.domainString == 'SFRestaurants':
+                        si_state_dim += 9#25
+                    elif self.domainString == 'Laptops11':
+                        si_state_dim += 9#40
+
+                self.state_dim = si_state_dim
+                self.sacer = noisy_acer.NoisyACERNetwork(self.sess, self.state_dim, self.action_dim,
+                                                    self.critic_lr, self.delta, self.c, self.alpha, self.h1_size,
+                                                    self.h2_size, self.is_training, temperature=self.temperature,
+                                                    critic_regularizer_weight=self.critic_regularizer_weight,
+                                                    noisy_acer=self.noisy_acer)
+
+            else:
+                logger.error('features "{}" not implemented'.format(self.features))
+
+            # when all models are defined, init all variables
+            init_op = tf.global_variables_initializer()
+            self.sess.run(init_op)
+
+            if self.load_policy:
+                self.loadPolicy(self.in_policy_file)
+                print('loaded replay size: ', self.episodes[self.domainString].size())
+            else:
+                print("We do not load a previous policy.")
+
+            #self.acer.update_target_network()
+
+    # def record() has been handled...
+
+    def convertStateAction(self, state, action):
+        '''
+
+        '''
+        if isinstance(state, TerminalState):
+            return [0] * 89, action
+
+        else:
+            if self.features == 'learned' or self.features == 'rnn':
+                dip_state = padded_state(state.domainStates[state.currentdomain], self.domainString)
+            else:
+                dip_state = DIP_state(state.domainStates[state.currentdomain], self.domainString)
+            action_name = self.actions.action_names[action]
+            act_slot = 'general'
+            for slot in dip_state.slots:
+                if slot in action_name:
+                    act_slot = slot
+            flat_belief = dip_state.get_beliefStateVec(act_slot)
+            self.prev_state_check = flat_belief
+
+            return flat_belief, action
+
+    def record(self, reward, domainInControl=None, weight=None, state=None, action=None):
+        if domainInControl is None:
+            domainInControl = self.domainString
+        if self.actToBeRecorded is None:
+            self.actToBeRecorded = self.summaryAct
+
+        if state is None:
+            state = self.prevbelief
+        if action is None:
+            action = self.actToBeRecorded
+        mu_weight = self.prev_mu
+        mask = self.prev_mask
+        if action == self.action_dim-1: # pass action was taken
+            mask = np.zeros(self.action_dim)
+            mu_weight = np.ones(self.action_dim)/self.action_dim
+
+        cState, cAction = state, action
+
+        reward /= 20.0
+
+        value = self.sacer.predict_value([cState[0]], [mask])
+
+        if self.replay_type == 'vanilla':
+            self.episodes[domainInControl].record(state=cState, \
+                    state_ori=state, action=cAction, reward=reward, value=value[0], distribution=mu_weight, mask=mask)
+        elif self.replay_type == 'prioritized':
+            self.episodes[domainInControl].record(state=cState, \
+                    state_ori=state, action=cAction, reward=reward, value=value[0], distribution=mu_weight, mask=mask)
+
+        self.actToBeRecorded = None
+        self.samplecount += 1
+        return
+
+    def finalizeRecord(self, reward, domainInControl=None):
+        if domainInControl is None:
+            domainInControl = self.domainString
+        if self.episodes[domainInControl] is None:
+            logger.warning("record attempted to be finalized for domain where nothing has been recorded before")
+            return
+
+        #print 'Episode Avg_Max_Q', float(self.episode_ave_max_q)/float(self.episodes[domainInControl].size())
+        #print 'Episode Avg_Max_Q', np.mean(self.episode_ave_max_q)
+        #print self.stats
+
+        # normalising total return to -1~1
+        reward /= 20.0
+
+        terminal_state, terminal_action = self.convertStateAction(TerminalState(), TerminalAction())
+        value = 0.0 # not effect on experience replay
+
+        def calculate_discountR_advantage(r_episode, v_episode):
+            #########################################################################
+            # Here we take the rewards and values from the rollout, and use them to
+            # generate the advantage and discounted returns.
+            # The advantage function uses "Generalized Advantage Estimation"
+            bootstrap_value = 0.0
+            self.r_episode_plus = np.asarray(r_episode + [bootstrap_value])
+            discounted_r_episode = discount(self.r_episode_plus,self.gamma)[:-1]
+            self.v_episode_plus = np.asarray(v_episode + [bootstrap_value])
+            advantage = r_episode + self.gamma * self.v_episode_plus[1:] - self.v_episode_plus[:-1]
+            advantage = discount(advantage,self.gamma)
+            #########################################################################
+            return discounted_r_episode, advantage
+
+        if self.replay_type == 'vanilla':
+            self.episodes[domainInControl].record(state=terminal_state, \
+                    state_ori=TerminalState(), action=terminal_action, reward=reward, value=value, terminal=True, distribution=None)
+        elif self.replay_type == 'prioritized':
+            episode_r, episode_v = self.episodes[domainInControl].record_final_and_get_episode(state=terminal_state, \
+                                                                                               state_ori=TerminalState(),
+                                                                                               action=terminal_action,
+                                                                                               reward=reward,
+                                                                                               value=value)
+
+            # TD_error is a list of td error in the current episode
+            _, TD_error = calculate_discountR_advantage(episode_r, episode_v)
+            episodic_TD = np.mean(np.absolute(TD_error))
+            print('episodic_TD')
+            print(episodic_TD)
+            self.episodes[domainInControl].insertPriority(episodic_TD)
+
+        return
+
+    def compute_responsible_q(self, inputs, actions, mask):
+        return self.sacer.compute_responsible_q(inputs, actions, mask)
+
+    def nextAction(self, beliefstate, execMask):
+        '''
+        select next action
+
+        :param beliefstate:
+        :param execMask: mask of executable summary actions
+        :returns: one-hot numpy array of the selected summary action
+        '''
+
+        action_prob = self.sacer.predict_policy(np.reshape(beliefstate, (1, len(beliefstate))),
+                                                np.reshape(execMask, (1, len(execMask))))[0]
+
+        if (self.exploration_type == 'e-greedy' or not self.is_training) and not self.noisy_acer:
+
+            if not self.sample_argmax:
+                epsilon = self.epsilon if self.is_training else 0.
+                eps_prob = np.ones(len(action_prob)) / len(action_prob)
+
+                best_index = np.argmax(action_prob)
+                best_prob = [1. if i == best_index else 0. for i in range(len(action_prob))]
+
+                # we sample a random action with probability epsilon and sample from the policy distribution with probability 1-epsilon
+                action_prob = epsilon * np.array(eps_prob) + (1. - epsilon) * action_prob
+
+                if not self.is_training:
+                    # take the greedy action during evaluation
+                    action_prob = np.array(best_prob)
+            else:
+                if self.is_training and utils.Settings.random.rand() < self.epsilon:
+                    action_prob = np.random.rand(len(self.action_names))
+
+        if not self.is_training:
+            # take the greedy action during evaluation
+            best_index = np.argmax(action_prob)
+            best_prob = [1. if i == best_index else 0. for i in range(len(action_prob))]
+            action_prob = np.array(best_prob)
+
+        if not self.sample_argmax:
+            nextaIdex = np.random.choice(len(action_prob), p=action_prob / sum(action_prob))
+        else:
+            nextaIdex = np.argmax(action_prob)
+        mu = action_prob / sum(action_prob)
+
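+        # remember the behaviour policy probabilities (mu) and the execution mask; record() stores them with
+        # the transition for the off-policy (ACER) update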
+        self.prev_mu = mu
+        self.prev_mask = execMask
+
+        return np.array([1. if i == nextaIdex else 0. for i in range(len(action_prob))])
+
+    def train(self, critic_regularizer=None):
+        '''
+        call this function when the episode ends
+        '''
+        USE_GLOBAL_MU = False
+        self.episode_ct += 1
+
+      #  new_noise_man_array = np.expand_dims(np.array(self.sacer.compute_mean_noisy()), axis=0)
+      #  if os.path.exists(self.log_path):
+      #      noise_mean_array = np.load(self.log_path)
+      #      new_noise_man_array = np.concatenate((noise_mean_array, new_noise_man_array), axis=0)
+      #  np.save(self.log_path, new_noise_man_array)
+
+        if not self.is_training:
+            logger.info("Not in training mode")
+            return
+        else:
+            logger.info("Update acer policy parameters.")
+
+        self.episodecount += 1
+        logger.info("Sample Num so far: %s" % (self.samplecount))
+        logger.info("Episode Num so far: %s" % (self.episodecount))
+        if self.samplecount >= self.minibatch_size * 3 and self.episodecount % self.training_frequency == 0:
+        #if self.episodecount % self.training_frequency == 0:
+            logger.info('start training...')
+
+            for _ in range(self.train_iters_per_episode):
+
+                if self.replay_type == 'vanilla' or self.replay_type == 'prioritized':
+                    s_batch_full, s_ori_batch, a_batch, r_batch, s2_batch_full, s2_ori_batch, t_batch, idx_batch, v_batch, mu_policy, mask_batch = \
+                        self.episodes[self.domainString].sample_batch()
+                    if USE_GLOBAL_MU:
+                        mu_sum = sum(self.global_mu)
+                        mu_normalised = np.array([c / mu_sum for c in self.global_mu])
+                        mu_policy = [[mu_normalised for _ in range(len(mu_policy[i]))] for i in range(len(mu_policy))]
+                else:
+                    assert False  # not implemented yet
+
+                s_batch = [[state_tuple[0] for state_tuple in epi] for epi in s_batch_full]
+                s_batch_beliefstate = [[state_tuple[1] for state_tuple in epi] for epi in s_batch_full]
+                s_batch_chosen_slot = [[state_tuple[2] for state_tuple in epi] for epi in s_batch_full]
+
+                s2_batch_beliefstate = [[state_tuple[1] for state_tuple in epi] for epi in s2_batch_full]
+
+                js_divergence_batch = []
+
+                if self.js_threshold < 1.0:
+                    #TODO: This is probably highly inefficient
+                    for epi_s, epi_s2, epi_slot in zip(s_batch_beliefstate, s2_batch_beliefstate, s_batch_chosen_slot):
+                        for belief, belief2, slot in zip(epi_s, epi_s2, epi_slot):
+                            if slot != "None":
+                                keys = belief['beliefs'][slot].keys()
+
+                                b = [belief['beliefs'][slot]['**NONE**']] + \
+                                    [belief['beliefs'][slot][value] for value in list(keys) if value != '**NONE**']
+
+                                b_2 = [belief2['beliefs'][slot]['**NONE**']] + \
+                                      [belief2['beliefs'][slot][value] for value in list(keys) if value != '**NONE**']
+
+                                js_divergence = self.compute_js_divergence(b, b_2)
+                                js_divergence_batch.append(js_divergence)
+                            else:
+                                js_divergence_batch.append(0.0)
+
+                    js_divergence_batch = [int(x > self.js_threshold) for x in js_divergence_batch]
+                    js_divergence_batch = 2/20 * np.array(js_divergence_batch) #normalizing bound to [0, 2] and then /20
+
+                discounted_r_batch = []
+                advantage_batch = []
+                def calculate_discountR_advantage(r_episode, v_episode):
+                    #########################################################################
+                    # Here we take the rewards and values from the rollout, and use them to
+                    # generate the advantage and discounted returns.
+                    # The advantage function uses "Generalized Advantage Estimation"
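+                    # (with lambda = 1: the advantage is the gamma-discounted sum of the TD residuals below)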
+                    bootstrap_value = 0.0
+                    # r_episode rescale by rhos?
+                    self.r_episode_plus = np.asarray(r_episode + [bootstrap_value])
+                    discounted_r_episode = discount(self.r_episode_plus, self.gamma)[:-1]
+                    self.v_episode_plus = np.asarray(v_episode + [bootstrap_value])
+                    # change sth here
+                    advantage = r_episode + self.gamma * self.v_episode_plus[1:] - self.v_episode_plus[:-1]
+                    advantage = discount(advantage, self.gamma)
+                    #########################################################################
+                    return discounted_r_episode, advantage
+
+                if self.replay_type == 'prioritized':
+                    for item_r, item_v, item_idx in zip(r_batch, v_batch, idx_batch):
+                        # r, a = calculate_discountR_advantage(item_r, np.concatenate(item_v).ravel().tolist())
+                        r, a = calculate_discountR_advantage(item_r, item_v)
+
+                        # flatten nested numpy array and turn it into list
+                        discounted_r_batch += r.tolist()
+                        advantage_batch += a.tolist()
+
+                        # update the sum-tree
+                        # update the TD error of the samples (episode) in the minibatch
+                        episodic_TD_error = np.mean(np.absolute(a))
+                        self.episodes[self.domainString].update(item_idx, episodic_TD_error)
+                else:
+                    for item_r, item_v in zip(r_batch, v_batch):
+                        # r, a = calculate_discountR_advantage(item_r, np.concatenate(item_v).ravel().tolist())
+                        r, a = calculate_discountR_advantage(item_r, item_v)
+
+                        # flatten nested numpy array and turn it into list
+                        discounted_r_batch += r.tolist()
+                        advantage_batch += a.tolist()
+
+                batch_size = len(s_batch)
+
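+                # one-hot encode the flattened per-step action indices: shape (total steps, action_dim)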
+                a_batch_one_hot = np.eye(self.action_dim)[np.concatenate(a_batch, axis=0).tolist()]
+
+                if self.js_threshold < 1.0:
+                    r_batch_concatenated = np.concatenate(np.array(r_batch), axis=0) + js_divergence_batch
+                else:
+                    r_batch_concatenated = np.concatenate(np.array(r_batch), axis=0)
+
+                if critic_regularizer is not None:
+                    critic_regularizer_q = critic_regularizer.compute_responsible_q(
+                        np.concatenate(np.array(s_batch), axis=0).tolist(), a_batch_one_hot,
+                        np.concatenate(np.array(mask_batch), axis=0).tolist())
+
+                    loss, entropy, optimize = \
+                                self.sacer.train(np.concatenate(np.array(s_batch), axis=0).tolist(), a_batch_one_hot,
+                                                 np.concatenate(np.array(mask_batch), axis=0).tolist(),
+                                                 r_batch_concatenated, s_batch, r_batch, self.gamma,
+                                                 np.concatenate(np.array(mu_policy), axis=0),
+                                                 discounted_r_batch, advantage_batch,
+                                                 critic_regularizer_output=critic_regularizer_q)
+                else:
+                    loss, entropy, optimize = \
+                        self.sacer.train(np.concatenate(np.array(s_batch), axis=0).tolist(), a_batch_one_hot,
+                                         np.concatenate(np.array(mask_batch), axis=0).tolist(),
+                                         r_batch_concatenated, s_batch, r_batch, self.gamma,
+                                         np.concatenate(np.array(mu_policy), axis=0),
+                                         discounted_r_batch, advantage_batch)
+
+                ent, norm_loss = entropy/float(batch_size), loss/float(batch_size)
+
+            self.savePolicyInc()
+
+    def savePolicy(self, FORCE_SAVE=False):
+        """
+        Do not use this; it is called by the agent after every episode.
+        We want to save the policy only periodically.
+        """
+        pass
+
+    def compute_js_divergence(self, P, Q):
+
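+        # scipy.stats.entropy normalises its inputs, so using p + q here is equivalent to the mixture (P + Q) / 2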
+        M = [p + q for p, q in zip(P, Q)]
+        return 0.5 * (entropy(P, M, base=2) + entropy(Q, M, base=2))
+
+    def savePolicyInc(self, FORCE_SAVE=False):
+        """
+        save model and replay buffer
+        """
+        if self.episodecount % self.save_step == 0:
+            #save_path = self.saver.save(self.sess, self.out_policy_file+'.ckpt')
+            self.sacer.save_network(self.out_policy_file+'.acer.ckpt')
+
+            f = open(self.out_policy_file+'.episode', 'wb')
+            for obj in [self.samplecount, self.episodes[self.domainString], self.global_mu]:
+                pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)
+            f.close()
+            #logger.info("Saving model to %s and replay buffer..." % save_path)
+
+    def loadPolicy(self, filename):
+        """
+        load model and replay buffer
+        """
+        # load models
+        self.sacer.load_network(filename+'.acer.ckpt')
+
+        # load replay buffer
+        if self.load_buffer:
+            try:
+                print('load from: ', filename)
+                f = open(filename+'.episode', 'rb')
+                loaded_objects = []
+                for i in range(3):  # sample count, replay buffer episodes and global_mu
+                    loaded_objects.append(pickle.load(f))
+                self.samplecount = int(loaded_objects[0])
+                self.episodes[self.domainString] = copy.deepcopy(loaded_objects[1])
+                self.global_mu = loaded_objects[2]
+                logger.info("Loading both the model from %s and the replay buffer..." % filename)
+                f.close()
+            except:
+                logger.info("Loading only models...")
+        else:
+            print("We do not load the buffer!")
+
+    def restart(self):
+        self.summaryAct = None
+        self.lastSystemAction = None
+        self.prevbelief = None
+        self.prev_mu = None
+        self.prev_mask = None
+        self.actToBeRecorded = None
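+        # linearly anneal epsilon from epsilon_start to epsilon_end over maxiter episodes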
+        self.epsilon = self.epsilon_start - (self.epsilon_start - self.epsilon_end) * float(self.episodeNum+self.episodecount) / float(self.maxiter)
+        self.episode_ave_max_q = []
+
+
+#END OF FILE
diff --git a/policy/feudalgainRL/FeudalNoisyDQNPolicy.py b/policy/feudalgainRL/FeudalNoisyDQNPolicy.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ada20841ad370836f976327ba8e6c2c4422a4f2
--- /dev/null
+++ b/policy/feudalgainRL/FeudalNoisyDQNPolicy.py
@@ -0,0 +1,554 @@
+###############################################################################
+# PyDial: Multi-domain Statistical Spoken Dialogue System Software
+###############################################################################
+#
+# Copyright 2015 - 2019
+# Cambridge University Engineering Department Dialogue Systems Group
+#
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+###############################################################################
+
+'''
+DQNPolicy.py - deep Q network policy
+==================================================
+
+Copyright CUED Dialogue Systems Group 2015 - 2017
+
+.. seealso:: CUED Imports/Dependencies: 
+
+    import :class:`Policy`
+    import :class:`utils.ContextLogger`
+
+.. warning::
+        Documentation not done.
+
+
+************************
+
+'''
+
+import copy
+import os
+import sys
+import json
+import numpy as np
+import pickle as pickle
+from itertools import product
+from scipy.stats import entropy
+import utils
+#from pydial import log_dir
+from utils.Settings import config as cfg
+from utils import ContextLogger, DiaAct, DialogueState
+
+import ontology.FlatOntologyManager as FlatOnt
+import tensorflow as tf
+from policy.DRL.replay_buffer import ReplayBuffer
+from policy.DRL.replay_prioritised import ReplayPrioritised
+import policy.feudalgainRL.noisydqn as dqn
+import policy.Policy
+import policy.DQNPolicy
+import policy.SummaryAction
+from policy.Policy import TerminalAction, TerminalState
+from policy.feudalgainRL.DIP_parametrisation import DIP_state, padded_state
+
+
+logger = utils.ContextLogger.getLogger('')
+
+
+class FeudalDQNPolicy(policy.DQNPolicy.DQNPolicy):
+    '''Derived from :class:`DQNPolicy`
+    '''
+
+    def __init__(self, in_policy_file, out_policy_file, domainString='CamRestaurants', is_training=False,
+                 action_names=None, slot=None, sd_state_dim=50, js_threshold=0, info_reward=0.0, jsd_reward=False,
+                 jsd_function=None):
+        super(FeudalDQNPolicy, self).__init__(in_policy_file, out_policy_file, domainString, is_training)
+
+        tf.reset_default_graph()
+
+        self.domainString = domainString
+        self.sd_state_dim = sd_state_dim
+        self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString)
+        self.in_policy_file = in_policy_file
+        self.out_policy_file = out_policy_file
+        self.is_training = is_training
+        self.accum_belief = []
+        self.info_reward = info_reward
+        self.js_threshold = js_threshold
+        self.jsd_reward = jsd_reward
+        self.jsd_function = jsd_function
+        self.log_path = cfg.get('exec_config', 'logfiledir')
+        self.log_path = self.log_path + f"/{in_policy_file.split('/')[-1].split('.')[0]}-seed{self.randomseed}.txt"
+
+        if self.jsd_function is not None:
+            print("We use the JSD-function", self.jsd_function)
+        if self.js_threshold != 1.0 and not self.jsd_reward:
+            print("We use JS-divergence, threshold =", self.js_threshold)
+        if self.jsd_reward:
+            print("We train with raw JSD reward.")
+        self.slots = slot
+        self.features = 'dip'
+        if cfg.has_option('feudalpolicy', 'features'):
+            self.features = cfg.get('feudalpolicy', 'features')
+        self.actfreq_ds = False
+        if cfg.has_option('feudalpolicy', 'actfreq_ds'):
+            self.actfreq_ds = cfg.getboolean('feudalpolicy', 'actfreq_ds')
+
+        self.use_pass = True
+        if cfg.has_option('feudalpolicy', 'use_pass'):
+            self.use_pass = cfg.getboolean('feudalpolicy', 'use_pass')
+
+        if self.use_pass:
+            print("We work with pass action in DQN training")
+        else:
+            print("We work without pass action in DQN training")
+
+        self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString)
+        self.prev_state_check = None
+
+        self.max_k = 5
+        if cfg.has_option('dqnpolicy', 'max_k'):
+            self.max_k = cfg.getint('dqnpolicy', 'max_k')
+
+        self.capacity *= 5  # capacity for episode methods, multiply it to adjust to turn based methods
+
+        # init session
+        self.sess = tf.Session()
+        with tf.device("/cpu:0"):
+
+            np.random.seed(self.randomseed)
+            tf.set_random_seed(self.randomseed)
+
+            # initialise a replay buffer
+            if self.replay_type == 'vanilla':
+                self.episodes[self.domainString] = ReplayBuffer(self.capacity, self.minibatch_size*4, self.randomseed)
+            elif self.replay_type == 'prioritized':
+                self.episodes[self.domainString] = ReplayPrioritised(self.capacity, self.minibatch_size,
+                                                                     self.randomseed)
+            self.samplecount = 0
+            self.episodecount = 0
+
+            # construct the models
+            self.summaryaction = policy.SummaryAction.SummaryAction(domainString)
+            self.action_names = action_names
+            self.action_dim = len(self.action_names)
+            action_bound = len(self.action_names)
+            self.stats = [0 for _ in range(self.action_dim)]
+
+            if self.features == 'learned' or self.features == 'rnn':
+                si_state_dim = 73
+                if self.actfreq_ds:
+                    if self.domainString == 'CamRestaurants':
+                        si_state_dim += 9#16
+                    elif self.domainString == 'SFRestaurants':
+                        si_state_dim += 9#25
+                    elif self.domainString == 'Laptops11':
+                        si_state_dim += 9#40
+                self.sd_enc_size = 50
+                self.si_enc_size = 25
+                self.dropout_rate = 0.
+                if cfg.has_option('feudalpolicy', 'sd_enc_size'):
+                    self.sd_enc_size = cfg.getint('feudalpolicy', 'sd_enc_size')
+                if cfg.has_option('feudalpolicy', 'si_enc_size'):
+                    self.si_enc_size = cfg.getint('feudalpolicy', 'si_enc_size')
+                if cfg.has_option('dqnpolicy', 'dropout_rate') and self.is_training:
+                    self.dropout_rate = cfg.getfloat('feudalpolicy', 'dropout_rate')
+
+                self.state_dim = si_state_dim + sd_state_dim
+                if self.features == 'learned':
+                    self.dqn = dqn.NNFDeepQNetwork(self.sess, si_state_dim, sd_state_dim, self.action_dim,
+                                            self.learning_rate, self.tau, action_bound, self.minibatch_size,
+                                            self.architecture, self.h1_size, self.h2_size, sd_enc_size=self.sd_enc_size,
+                                               si_enc_size=self.si_enc_size, dropout_rate=self.dropout_rate)
+
+                elif self.features == 'rnn':
+                    self.dqn = dqn.RNNFDeepQNetwork(self.sess, si_state_dim, sd_state_dim, self.action_dim,
+                                                   self.learning_rate, self.tau, action_bound, self.minibatch_size,
+                                                   self.architecture, self.h1_size, self.h2_size,
+                                                   sd_enc_size=self.sd_enc_size, si_enc_size=self.si_enc_size,
+                                                   dropout_rate=self.dropout_rate, slot=self.slot)
+            else: # self.features = 'dip'
+                if self.actfreq_ds:
+                    if self.domainString == 'CamRestaurants':
+                        self.state_dim += 9#16
+                    elif self.domainString == 'SFRestaurants':
+                        self.state_dim += 9#25
+                    elif self.domainString == 'Laptops11':
+                        self.state_dim += 9#40
+                self.dqn = dqn.DeepQNetwork(self.sess, self.state_dim, self.action_dim,
+                                            self.learning_rate, self.tau, action_bound, self.minibatch_size,
+                                            self.architecture, self.h1_size,
+                                            self.h2_size, dropout_rate=self.dropout_rate)
+
+            # when all models are defined, init all variables (this might need to be moved to the main policy too)
+            init_op = tf.global_variables_initializer()
+            self.sess.run(init_op)
+
+            self.loadPolicy(self.in_policy_file)
+            print('loaded replay size: ', self.episodes[self.domainString].size())
+
+            self.dqn.update_target_network()
+
+    def record(self, reward, domainInControl=None, weight=None, state=None, action=None, exec_mask=None):
+        if domainInControl is None:
+            domainInControl = self.domainString
+        if self.actToBeRecorded is None:
+            self.actToBeRecorded = self.summaryAct
+
+        if state is None:
+            state = self.prevbelief
+        if action is None:
+            action = self.actToBeRecorded
+
+        cState, cAction = state, action
+        # normalising total return to -1~1
+        reward /= 20.0
+
+        if self.replay_type == 'vanilla':
+            self.episodes[domainInControl].record(state=cState, \
+                                                  state_ori=state, action=cAction, reward=reward)
+
+        self.actToBeRecorded = None
+        self.samplecount += 1
+
+    def finalizeRecord(self, reward, domainInControl=None):
+        if domainInControl is None:
+            domainInControl = self.domainString
+        if self.episodes[domainInControl] is None:
+            logger.warning("record attempted to be finalized for domain where nothing has been recorded before")
+            return
+
+        reward /= 20.0
+
+        terminal_state, terminal_action = self.convertStateAction(TerminalState(), TerminalAction())
+
+        if self.replay_type == 'vanilla':
+            self.episodes[domainInControl].record(state=terminal_state, \
+                                                  state_ori=TerminalState(), action=terminal_action, reward=reward,
+                                                  terminal=True)
+        elif self.replay_type == 'prioritized':
+            self.episodes[domainInControl].record(state=terminal_state, \
+                                                      state_ori=TerminalState(), action=terminal_action, reward=reward, \
+                                                      Q_s_t_a_t_=0.0, gamma_Q_s_tplu1_maxa_=0.0, uniform=False,
+                                                      terminal=True)
+            print('total TD', self.episodes[self.domainString].tree.total())
+
+    def convertStateAction(self, state, action):
+        '''
+
+        '''
+        if isinstance(state, TerminalState):
+            return [0] * 89, action
+        else:
+            if self.features == 'learned' or self.features == 'rnn':
+                dip_state = padded_state(state.domainStates[state.currentdomain], self.domainString)
+            else:
+                dip_state = DIP_state(state.domainStates[state.currentdomain], self.domainString)
+            action_name = self.actions.action_names[action]
+            act_slot = 'general'
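+            # use the DIP features of the slot named in the action, falling back to the 'general' feature set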
+            for slot in dip_state.slots:
+                if slot in action_name:
+                    act_slot = slot
+            flat_belief = dip_state.get_beliefStateVec(act_slot)
+            self.prev_state_check = flat_belief
+
+            return flat_belief, action
+
+    def nextAction(self, beliefstate):
+        '''
+        select next action
+
+        :param beliefstate: already converted to dipstatevec of the specific slot (or general)
+        :returns: (int) next summary action
+        '''
+
+        if self.exploration_type == 'e-greedy' and self.architecture != 'noisy_duel':
+            # epsilon greedy
+            if self.is_training and utils.Settings.random.rand() < self.epsilon:
+                action_Q = np.random.rand(len(self.action_names))
+            else:
+                if len(beliefstate.shape) == 1:
+                    action_Q = self.dqn.predict(np.reshape(beliefstate, (1, -1)))
+                else:
+                    action_Q = self.dqn.predict(beliefstate)
+                # add current max Q to self.episode_ave_max_q
+                self.episode_ave_max_q.append(np.max(action_Q))
+        elif self.architecture == 'noisy_duel':
+            if len(beliefstate.shape) == 1:
+                action_Q = self.dqn.predict(np.reshape(beliefstate, (1, -1)))
+            else:
+                action_Q = self.dqn.predict(beliefstate)
+            # add current max Q to self.episode_ave_max_q
+            self.episode_ave_max_q.append(np.max(action_Q))
+
+        #return the Q vect, the action will be converted in the feudal policy
+        return action_Q
+
+    def train(self):
+        '''
+        call this function when the episode ends
+        '''
+
+        if not self.is_training:
+            logger.info("Not in training mode")
+            return
+        else:
+            logger.info("Update dqn policy parameters.")
+
+        self.episodecount += 1
+        logger.info("Sample Num so far: %s" % (self.samplecount))
+        logger.info("Episode Num so far: %s" % (self.episodecount))
+
+        s_batch_new, s_batch_beliefstate, s_batch_chosen_slot, s2_batch_dipstate, s2_batch_beliefstate, t_batch_new, r_batch_new = \
+            [], [], [], [], [], [], []
+
+        if self.samplecount >= self.minibatch_size * 10 and self.episodecount % self.training_frequency == 0:
+            logger.info('start training...')
+
+            a_batch_one_hot_new = None
+            #updating only states where the action is not "pass()" complicates things:
+            #since we keep only the non-pass() actions of each sampled batch, we loop until we have enough samples
+
+            if self.js_threshold < 1.0 or not self.use_pass:
+                while len(s_batch_new) < self.minibatch_size:
+
+                    s_batch, s_ori_batch, a_batch, r_batch, s2_batch, s2_ori_batch, t_batch, idx_batch, _ = \
+                        self.episodes[self.domainString].sample_batch()
+
+                    a_batch_one_hot = np.eye(self.action_dim, self.action_dim)[a_batch]
+                    #we only wanna update state-action pairs, where action != pass()
+                    #we only want to update state-action pairs where the action is not pass()
+                    a_batch_one_hot = a_batch_one_hot[valid_steps]
+
+                    s_batch_new += [s[0] for i, s in enumerate(s_batch) if valid_steps[i]]
+                    s_batch_beliefstate += [s[1] for i, s in enumerate(s_batch) if valid_steps[i]]
+                    s_batch_chosen_slot += [s[2] for i, s in enumerate(s_batch) if valid_steps[i]]
+
+                    s2_batch_dipstate += [s[3] for s, valid in zip(s2_batch, valid_steps) if valid]
+                    s2_batch_beliefstate += [s[1] for s, valid in zip(s2_batch, valid_steps) if valid]
+
+                    r_batch_new += [r for r, valid in zip(r_batch, valid_steps) if valid]
+                    t_batch_new += [t for t, valid in zip(t_batch, valid_steps) if valid]
+
+                    if a_batch_one_hot_new is None:
+                        a_batch_one_hot_new = a_batch_one_hot
+                    else:
+                        a_batch_one_hot_new = np.vstack((a_batch_one_hot_new, a_batch_one_hot))
+
+                s_batch_new = np.vstack(s_batch_new)
+                s2_batch_dipstate = np.vstack(s2_batch_dipstate)
+
+            else:
+                s_batch, s_ori_batch, a_batch, r_batch, s2_batch, s2_ori_batch, t_batch, idx_batch, _ = \
+                    self.episodes[self.domainString].sample_batch()
+
+                a_batch_one_hot_new = np.eye(self.action_dim, self.action_dim)[a_batch]
+                s_batch_new = np.vstack([s[0] for s in s_batch])
+                r_batch_new = r_batch
+                s2_batch_dipstate = np.vstack([s[3] for s in s2_batch])
+                t_batch_new = t_batch
+
+            if self.js_threshold < 1.0 or self.jsd_reward:
+                #TODO: This is highly inefficient
+                js_divergence_batch = []
+                for belief, belief2, slot in zip(s_batch_beliefstate, s2_batch_beliefstate, s_batch_chosen_slot):
+                    if slot != "None":
+                        keys = belief['beliefs'][slot].keys()
+
+                        b = [belief['beliefs'][slot]['**NONE**']] + \
+                            [belief['beliefs'][slot][value] for value in list(keys) if value != '**NONE**']
+
+                        b_2 = [belief2['beliefs'][slot]['**NONE**']] + \
+                              [belief2['beliefs'][slot][value] for value in list(keys) if value != '**NONE**']
+
+                        js_divergence = self.compute_js_divergence(b, b_2)
+                        js_divergence_batch.append(js_divergence)
+                    else:
+                        js_divergence_batch.append(0.0)
+            else:
+                js_divergence_batch = [0] * len(r_batch_new)
+
+            tanh_n = np.tanh(1)
+            if self.jsd_reward:
+                if self.jsd_function == 'tanh':
+                    js_divergence_batch = np.tanh(np.array(js_divergence_batch)) / tanh_n
+                #normalize jsd between -1 and 1
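+                # (base-2 JSD lies in [0, 1], so -1 + 2 * jsd maps it to [-1, 1])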
+                js_divergence_batch = (-1 + 2 * np.array(js_divergence_batch)).tolist()
+            elif self.js_threshold < 1.0:
+                # normalizing bound to [0, 2] and then /20
+                js_divergence_batch = [2/20 * int(x > self.js_threshold) for x in js_divergence_batch]
+
+            action_q = self.dqn.predict_dip(s2_batch_dipstate, a_batch_one_hot_new)
+            target_q = self.dqn.predict_target_dip(s2_batch_dipstate, a_batch_one_hot_new)
+
+            action_q = np.reshape(action_q, (s_batch_new.shape[0], -1, self.action_dim))
+            target_q = np.reshape(target_q, (s_batch_new.shape[0], -1, self.action_dim))
+
+            y_i = []
+            for k in range(min(s_batch_new.shape[0], self.episodes[self.domainString].size())):
+                Q_bootstrap_label = 0
+                if t_batch_new[k]:
+                    Q_bootstrap_label = r_batch_new[k]
+                else:
+                    if self.q_update == 'single':
+                        action_Q = target_q[k]
+                        if self.jsd_reward:
+                            Q_bootstrap_label = js_divergence_batch[k] + self.gamma * np.max(action_Q)
+                        else:
+                            Q_bootstrap_label = r_batch_new[k] + js_divergence_batch[k] + self.gamma * np.max(action_Q)
+                    elif self.q_update == 'double':
+                        action_Q = action_q[k]
+                        argmax_tuple = np.unravel_index(np.argmax(action_Q, axis=None), action_Q.shape)
+                        value_Q = target_q[k][argmax_tuple]
+                        if not self.jsd_reward:
+                            Q_bootstrap_label = r_batch_new[k] + js_divergence_batch[k] + self.gamma * value_Q
+                        else:
+                            Q_bootstrap_label = js_divergence_batch[k] + self.gamma * value_Q
+
+                y_i.append(Q_bootstrap_label)
+
+                if self.replay_type == 'prioritized':
+                    # update the sum-tree
+                    # update the TD error of the samples in the minibatch
+                    currentQ_s_a_ = action_q[k][a_batch[k]]
+                    error = abs(currentQ_s_a_ - Q_bootstrap_label)
+                    self.episodes[self.domainString].update(idx_batch[k], error)
+
+            reshaped_yi = np.vstack([np.expand_dims(x, 0) for x in y_i])
+
+            predicted_q_value, _, currentLoss = self.dqn.train(s_batch_new, a_batch_one_hot_new, reshaped_yi)
+
+            self.log_loss()
+
+            if self.episodecount % 1 == 0:
+                # Update target networks
+                self.dqn.update_target_network()
+
+        self.savePolicyInc()
+
+    def log_loss(self):
+
+        s_batch_new, s_batch_beliefstate, s_batch_chosen_slot, s2_batch_dipstate, s2_batch_beliefstate, t_batch_new, r_batch_new = \
+            [], [], [], [], [], [], []
+
+        if self.samplecount >= self.minibatch_size * 8 and self.episodecount % self.training_frequency == 0:
+            logger.info('start training...')
+
+            a_batch_one_hot_new = None
+            #updating only states where the action is not "pass()" complicates things:
+            #since we keep only the non-pass() actions of each sampled batch, we loop until we have enough samples
+
+            while len(s_batch_new) < 512:
+
+                s_batch, s_ori_batch, a_batch, r_batch, s2_batch, s2_ori_batch, t_batch, idx_batch, _ = \
+                    self.episodes[self.domainString].sample_batch()
+
+                a_batch_one_hot = np.eye(self.action_dim, self.action_dim)[a_batch]
+                #we only want to update state-action pairs where the action is not pass()
+                valid_steps = [action[-1] != 1 for action in a_batch_one_hot]
+                a_batch_one_hot = a_batch_one_hot[valid_steps]
+
+                s_batch_new += [s[0] for i, s in enumerate(s_batch) if valid_steps[i]]
+                s_batch_beliefstate += [s[1] for i, s in enumerate(s_batch) if valid_steps[i]]
+                s_batch_chosen_slot += [s[2] for i, s in enumerate(s_batch) if valid_steps[i]]
+
+                s2_batch_dipstate += [s[3] for s, valid in zip(s2_batch, valid_steps) if valid]
+                s2_batch_beliefstate += [s[1] for s, valid in zip(s2_batch, valid_steps) if valid]
+
+                r_batch_new += [r for r, valid in zip(r_batch, valid_steps) if valid]
+                t_batch_new += [t for t, valid in zip(t_batch, valid_steps) if valid]
+
+                if a_batch_one_hot_new is None:
+                    a_batch_one_hot_new = a_batch_one_hot
+                else:
+                    a_batch_one_hot_new = np.vstack((a_batch_one_hot_new, a_batch_one_hot))
+
+            s_batch_new = np.vstack(s_batch_new)
+            s2_batch_dipstate = np.vstack(s2_batch_dipstate)
+
+            if self.js_threshold < 1.0 or self.jsd_reward:
+                #TODO: This is highly inefficient
+                js_divergence_batch = []
+                for belief, belief2, slot in zip(s_batch_beliefstate, s2_batch_beliefstate, s_batch_chosen_slot):
+                    if slot != "None":
+                        keys = belief['beliefs'][slot].keys()
+
+                        b = [belief['beliefs'][slot]['**NONE**']] + \
+                            [belief['beliefs'][slot][value] for value in list(keys) if value != '**NONE**']
+
+                        b_2 = [belief2['beliefs'][slot]['**NONE**']] + \
+                              [belief2['beliefs'][slot][value] for value in list(keys) if value != '**NONE**']
+
+                        js_divergence = self.compute_js_divergence(b, b_2)
+                        js_divergence_batch.append(js_divergence)
+                    else:
+                        js_divergence_batch.append(0.0)
+            else:
+                js_divergence_batch = [0] * len(r_batch_new)
+
+            tanh_n = np.tanh(1)
+            if self.jsd_reward:
+                if self.jsd_function == 'tanh':
+                    js_divergence_batch = np.tanh(np.array(js_divergence_batch)) / tanh_n
+                #normalize jsd between -1 and 1
+                js_divergence_batch = (-1 + 2 * np.array(js_divergence_batch)).tolist()
+            elif self.js_threshold < 1.0:
+                # normalizing bound to [0, 2] and then /20
+                js_divergence_batch = [2/20 * int(x > self.js_threshold) for x in js_divergence_batch]
+
+            action_q = self.dqn.predict_dip(s2_batch_dipstate, a_batch_one_hot_new)
+            target_q = self.dqn.predict_target_dip(s2_batch_dipstate, a_batch_one_hot_new)
+
+            action_q = np.reshape(action_q, (s_batch_new.shape[0], -1, self.action_dim))
+            target_q = np.reshape(target_q, (s_batch_new.shape[0], -1, self.action_dim))
+
+            y_i = []
+            for k in range(s_batch_new.shape[0]):
+                Q_bootstrap_label = 0
+                if t_batch_new[k]:
+                    Q_bootstrap_label = r_batch_new[k]
+                else:
+                    if self.q_update == 'single':
+                        action_Q = target_q[k]
+                        if self.jsd_reward:
+                            Q_bootstrap_label = js_divergence_batch[k] + self.gamma * np.max(action_Q)
+                        else:
+                            Q_bootstrap_label = r_batch_new[k] + js_divergence_batch[k] + self.gamma * np.max(action_Q)
+                    elif self.q_update == 'double':
+                        action_Q = action_q[k]
+                        argmax_tuple = np.unravel_index(np.argmax(action_Q, axis=None), action_Q.shape)
+                        value_Q = target_q[k][argmax_tuple]
+                        if not self.jsd_reward:
+                            Q_bootstrap_label = r_batch_new[k] + js_divergence_batch[k] + self.gamma * value_Q
+                        else:
+                            Q_bootstrap_label = js_divergence_batch[k] + self.gamma * value_Q
+
+                y_i.append(Q_bootstrap_label)
+
+            reshaped_yi = np.vstack([np.expand_dims(x, 0) for x in y_i])
+
+            currentLoss = self.dqn.compute_loss(s_batch_new, a_batch_one_hot_new, reshaped_yi)
+
+            with open(self.log_path, 'a') as file:
+                file.write(str(currentLoss) + "\n")
+
+    def compute_js_divergence(self, P, Q):
+
+        M = [p + q for p, q in zip(P, Q)]
+        return 0.5 * (entropy(P, M, base=2) + entropy(Q, M, base=2))
+
+# END OF FILE
diff --git a/policy/feudalgainRL/NoisyACERPolicy.py b/policy/feudalgainRL/NoisyACERPolicy.py
new file mode 100644
index 0000000000000000000000000000000000000000..5854756136445216cb3f58ce3ffb1569d576f1f4
--- /dev/null
+++ b/policy/feudalgainRL/NoisyACERPolicy.py
@@ -0,0 +1,963 @@
+###############################################################################
+# PyDial: Multi-domain Statistical Spoken Dialogue System Software
+###############################################################################
+#
+# Copyright 2015 - 2019
+# Cambridge University Engineering Department Dialogue Systems Group
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+###############################################################################
+
+'''
+ACERPolicy.py - Sample Efficient Actor Critic with Experience Replay
+==================================================
+
+Copyright CUED Dialogue Systems Group 2015 - 2017
+
+This module implements the sample-efficient actor critic (ACER) with truncated importance sampling with bias
+correction, the trust region policy optimization method and a RETRACE-like multi-step estimate of the value function.
+Its behaviour is controlled by ACERPolicy.c, ACERPolicy.alpha and related ACERPolicy parameters.
+The details of the implementation can be found here: https://arxiv.org/abs/1802.03753
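+
+Truncated importance sampling clips the off-policy weight rho_t = pi(a_t|s_t) / mu(a_t|s_t) at a constant c,
+i.e. rho_bar_t = min(c, rho_t), with a bias-correction term accounting for the truncated probability mass.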
+
+See also:
+https://arxiv.org/abs/1611.01224
+https://arxiv.org/abs/1606.02647
+
+.. seealso:: CUED Imports/Dependencies:
+
+    import :class:`Policy`
+    import :class:`utils.ContextLogger`
+
+
+
+************************
+
+'''
+import pickle as pickle
+import copy
+import json
+import numpy as np
+import os
+import random
+import scipy
+import scipy.signal
+import tensorflow as tf
+
+import policy.feudalgainRL.noisyacer as noisy_acer
+#from policy.DRL import replay_policy as replay_policy
+from policy.DRL import utils as drlutils
+from policy import Policy
+from policy import SummaryAction
+from policy import MasterAction  # used when the master_space option is enabled
+import ontology.FlatOntologyManager as FlatOnt
+import utils
+from policy.DRL.replay_buffer_episode_acer import ReplayBufferEpisode
+from policy.DRL.replay_prioritised_episode import ReplayPrioritisedEpisode
+from policy.Policy import TerminalAction, TerminalState
+from curiosity.curiosity_module import Curious
+from utils import ContextLogger, DiaAct
+from utils.Settings import config as cfg
+
+logger = utils.ContextLogger.getLogger('')
+
+
+# --- for flattening the belief --- #
+def flatten_belief(belief, domainUtil, merge=False):
+    belief = belief.getDomainState(domainUtil.domainString)
+    if isinstance(belief, TerminalState):
+        if domainUtil.domainString == 'CamRestaurants':
+            return [0] * 268
+        elif domainUtil.domainString == 'CamHotels':
+            return [0] * 111
+        elif domainUtil.domainString == 'SFRestaurants':
+            return [0] * 633
+        elif domainUtil.domainString == 'SFHotels':
+            return [0] * 438
+        elif domainUtil.domainString == 'Laptops11':
+            return [0] * 257
+        elif domainUtil.domainString == 'TV':
+            return [0] * 188
+
+    policyfeatures = ['full', 'method', 'discourseAct', 'requested', \
+                      'lastActionInformNone', 'offerHappened', 'inform_info']
+
+    flat_belief = []
+    for feat in policyfeatures:
+        add_feature = []
+        if feat == 'full':
+            # for slot in self.sorted_slots:
+            for slot in domainUtil.ontology['informable']:
+                for value in domainUtil.ontology['informable'][slot]:  # + ['**NONE**']:
+                    add_feature.append(belief['beliefs'][slot][value])
+
+                # pfb30 11.03.2017
+                try:
+                    add_feature.append(belief['beliefs'][slot]['**NONE**'])
+                except:
+                    add_feature.append(0.)  # for NONE
+                try:
+                    add_feature.append(belief['beliefs'][slot]['dontcare'])
+                except:
+                    add_feature.append(0.)  # for dontcare
+
+        elif feat == 'method':
+            add_feature = [belief['beliefs']['method'][method] for method in domainUtil.ontology['method']]
+        elif feat == 'discourseAct':
+            add_feature = [belief['beliefs']['discourseAct'][discourseAct]
+                           for discourseAct in domainUtil.ontology['discourseAct']]
+        elif feat == 'requested':
+            add_feature = [belief['beliefs']['requested'][slot] \
+                           for slot in domainUtil.ontology['requestable']]
+        elif feat == 'lastActionInformNone':
+            add_feature.append(float(belief['features']['lastActionInformNone']))
+        elif feat == 'offerHappened':
+            add_feature.append(float(belief['features']['offerHappened']))
+        elif feat == 'inform_info':
+            add_feature += belief['features']['inform_info']
+        else:
+            logger.error('Invalid feature name in config: ' + feat)
+
+        flat_belief += add_feature
+
+    return flat_belief
+
+
+# Discounting function used to calculate discounted returns.
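+# Implements y_t = x_t + gamma * y_{t+1}; e.g. discount([1., 1., 1.], 0.9) -> [2.71, 1.9, 1.]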
+def discount(x, gamma):
+    return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]
+
+
+class NoisyACERPolicy(Policy.Policy):
+    '''
+    Derived from :class:`Policy`
+    '''
+    def __init__(self, in_policy_file, out_policy_file, domainString='CamRestaurants', is_training=False):
+        super(NoisyACERPolicy, self).__init__(domainString, is_training)
+
+        tf.reset_default_graph()
+
+        self.file = in_policy_file
+        self.in_policy_file = self.file
+        self.out_policy_file = out_policy_file
+        self.is_training = is_training
+        self.accum_belief = []
+        self.prev_state_check = None
+
+        self.domainString = domainString
+        self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString)
+
+        self.load_buffer = True
+        if cfg.has_option('policy', 'bootstrap_buffer'):
+            self.load_buffer = cfg.getboolean('policy', 'bootstrap_buffer')
+            print("SACER: BOOTSTRAP BUFFER: ", self.load_buffer)
+
+        self.load_policy = True
+        if cfg.has_option('policy', 'bootstrap_master_policy'):
+            self.load_policy = cfg.getboolean('policy', 'bootstrap_master_policy')
+            print("SACER: BOOTSTRAP Policy: ", self.load_policy)
+
+        # parameter settings
+
+        if 0:  # cfg.has_option('dqnpolicy', 'n_in'): #ic304: this was giving me a weird error, disabled it until i can check it deeper
+            self.n_in = cfg.getint('dqnpolicy', 'n_in')
+        else:
+            self.n_in = self.get_n_in(domainString)
+
+        self.actor_lr = 0.0001
+        if cfg.has_option('dqnpolicy', 'actor_lr'):
+            self.actor_lr = cfg.getfloat('dqnpolicy', 'actor_lr')
+
+        self.critic_lr = 0.001
+        if cfg.has_option('dqnpolicy', 'critic_lr'):
+            self.critic_lr = cfg.getfloat('dqnpolicy', 'critic_lr')
+
+        self.delta = 1.
+        if cfg.has_option('dqnpolicy', 'delta'):
+            self.delta = cfg.getfloat('dqnpolicy', 'delta')
+
+        self.alpha = 0.99
+        if cfg.has_option('dqnpolicy', 'beta'):
+            self.alpha = cfg.getfloat('dqnpolicy', 'beta')
+
+        self.c = 10.
+        if cfg.has_option('dqnpolicy', 'is_threshold'):
+            self.c = cfg.getfloat('dqnpolicy', 'is_threshold')
+
+        self.randomseed = 1234
+        if cfg.has_option('GENERAL', 'seed'):
+            self.randomseed = cfg.getint('GENERAL', 'seed')
+
+        self.gamma = 0.99
+        if cfg.has_option('dqnpolicy', 'gamma'):
+            self.gamma = cfg.getfloat('dqnpolicy', 'gamma')
+
+        self.regularisation = 'l2'
+        if cfg.has_option('dqnpolicy', 'regularisation'):
+            self.regularisation = cfg.get('dqnpolicy', 'regularisation')
+
+        self.learning_rate = 0.001
+        if cfg.has_option('dqnpolicy', 'learning_rate'):
+            self.learning_rate = cfg.getfloat('dqnpolicy', 'learning_rate')
+
+        self.exploration_type = 'e-greedy'  # Boltzmann
+        if cfg.has_option('dqnpolicy', 'exploration_type'):
+            self.exploration_type = cfg.get('dqnpolicy', 'exploration_type')
+
+        self.episodeNum = 1000
+        if cfg.has_option('dqnpolicy', 'episodeNum'):
+            self.episodeNum = cfg.getfloat('dqnpolicy', 'episodeNum')
+
+        self.maxiter = 4000
+        if cfg.has_option('dqnpolicy', 'maxiter'):
+            self.maxiter = cfg.getfloat('dqnpolicy', 'maxiter')
+
+        self.curiosityreward = False
+        if cfg.has_option('eval', 'curiosityreward'):
+            self.curiosityreward = cfg.getboolean('eval', 'curiosityreward')
+
+        self.epsilon = 1
+        if cfg.has_option('dqnpolicy', 'epsilon'):
+            self.epsilon = cfg.getfloat('dqnpolicy', 'epsilon')
+
+        if not self.curiosityreward:  # no eps-greedy exploration when curious expl. is used
+            self.epsilon_start = 1
+            if cfg.has_option('dqnpolicy', 'epsilon_start'):
+                self.epsilon_start = cfg.getfloat('dqnpolicy', 'epsilon_start')
+        else:
+            self.epsilon_start = 0
+
+        self.epsilon_end = 1
+        if cfg.has_option('dqnpolicy', 'epsilon_end'):
+            self.epsilon_end = cfg.getfloat('dqnpolicy', 'epsilon_end')
+
+        self.priorProbStart = 1.0
+        if cfg.has_option('dqnpolicy', 'prior_sample_prob_start'):
+            self.priorProbStart = cfg.getfloat('dqnpolicy', 'prior_sample_prob_start')
+
+        self.priorProbEnd = 0.1
+        if cfg.has_option('dqnpolicy', 'prior_sample_prob_end'):
+            self.priorProbEnd = cfg.getfloat('dqnpolicy', 'prior_sample_prob_end')
+
+        self.policyfeatures = []
+        if cfg.has_option('dqnpolicy', 'features'):
+            logger.info('Features: ' + str(cfg.get('dqnpolicy', 'features')))
+            self.policyfeatures = json.loads(cfg.get('dqnpolicy', 'features'))
+
+        self.max_k = 5
+        if cfg.has_option('dqnpolicy', 'max_k'):
+            self.max_k = cfg.getint('dqnpolicy', 'max_k')
+
+        self.learning_algorithm = 'drl'
+        if cfg.has_option('dqnpolicy', 'learning_algorithm'):
+            self.learning_algorithm = cfg.get('dqnpolicy', 'learning_algorithm')
+            logger.info('Learning algorithm: ' + self.learning_algorithm)
+
+        self.minibatch_size = 32
+        if cfg.has_option('dqnpolicy', 'minibatch_size'):
+            self.minibatch_size = cfg.getint('dqnpolicy', 'minibatch_size')
+
+        self.capacity = 1000
+        if cfg.has_option('dqnpolicy', 'capacity'):
+            self.capacity = cfg.getint('dqnpolicy','capacity')
+
+        self.replay_type = 'vanilla'
+        if cfg.has_option('dqnpolicy', 'replay_type'):
+            self.replay_type = cfg.get('dqnpolicy', 'replay_type')
+
+        self.architecture = 'vanilla'
+        if cfg.has_option('dqnpolicy', 'architecture'):
+            self.architecture = cfg.get('dqnpolicy', 'architecture')
+
+        self.q_update = 'single'
+        if cfg.has_option('dqnpolicy', 'q_update'):
+            self.q_update = cfg.get('dqnpolicy', 'q_update')
+
+        self.h1_size = 130
+        if cfg.has_option('dqnpolicy', 'h1_size'):
+            self.h1_size = cfg.getint('dqnpolicy', 'h1_size')
+
+        self.h2_size = 50
+        if cfg.has_option('dqnpolicy', 'h2_size'):
+            self.h2_size = cfg.getint('dqnpolicy', 'h2_size')
+
+        self.save_step = 200
+        if cfg.has_option('policy', 'save_step'):
+            self.save_step = cfg.getint('policy', 'save_step')
+
+        self.temperature = 0.0
+        if cfg.has_option('policy', 'temperature'):
+            self.temperature = cfg.getfloat('policy', 'temperature')
+
+        self.behaviour_cloning = False
+        if cfg.has_option('policy', 'behaviour_cloning'):
+            self.behaviour_cloning = cfg.getboolean('policy', 'behaviour_cloning')
+            if self.behaviour_cloning:
+                print("We use behaviour cloning in addition.")
+
+        self.combined_ER = False
+        if cfg.has_option('policy', 'combined_ER'):
+            self.combined_ER = cfg.getboolean('policy', 'combined_ER')
+
+        self.master_space = False
+        if cfg.has_option('policy', 'master_space'):
+            self.master_space = cfg.getboolean('policy', 'master_space')
+
+        self.optimize_ER = False
+        if cfg.has_option('policy', 'optimize_ER'):
+            self.optimize_ER = cfg.getboolean('policy', 'optimize_ER')
+
+        self.importance_sampling = 'soft'
+        if cfg.has_option('dqnpolicy', 'importance_sampling'):
+            self.importance_sampling = cfg.get('dqnpolicy', 'importance_sampling')
+
+        self.train_iters_per_episode = 1
+        if cfg.has_option('dqnpolicy', 'train_iters_per_episode'):
+            self.train_iters_per_episode = cfg.getint('dqnpolicy', 'train_iters_per_episode')
+
+        self.training_frequency = 2
+        if cfg.has_option('dqnpolicy', 'training_frequency'):
+            self.training_frequency = cfg.getint('dqnpolicy', 'training_frequency')
+
+        # domain specific parameter settings (overrides general policy parameter settings)
+        if cfg.has_option('dqnpolicy_'+domainString, 'n_in'):
+            self.n_in = cfg.getint('dqnpolicy_'+domainString, 'n_in')
+
+        if cfg.has_option('dqnpolicy_'+domainString, 'actor_lr'):
+            self.actor_lr = cfg.getfloat('dqnpolicy_'+domainString, 'actor_lr')
+
+        if cfg.has_option('dqnpolicy_'+domainString, 'critic_lr'):
+            self.critic_lr = cfg.getfloat('dqnpolicy_'+domainString, 'critic_lr')
+
+        if cfg.has_option('dqnpolicy_'+domainString, 'delta'):
+            self.delta = cfg.getfloat('dqnpolicy_'+domainString, 'delta')
+
+        if cfg.has_option('dqnpolicy_' + domainString, 'beta'):
+            self.alpha = cfg.getfloat('dqnpolicy_' + domainString, 'beta')
+
+        if cfg.has_option('dqnpolicy_' + domainString, 'is_threshold'):
+            self.c = cfg.getfloat('dqnpolicy_' + domainString, 'is_threshold')
+
+        if cfg.has_option('dqnpolicy_'+domainString, 'gamma'):
+            self.gamma = cfg.getfloat('dqnpolicy_'+domainString, 'gamma')
+
+        if cfg.has_option('dqnpolicy_'+domainString, 'regularisation'):
+            self.regularisation = cfg.get('dqnpolicy_'+domainString, 'regularisation')
+
+        if cfg.has_option('dqnpolicy_'+domainString, 'learning_rate'):
+            self.learning_rate = cfg.getfloat('dqnpolicy_'+domainString, 'learning_rate')
+
+        if cfg.has_option('dqnpolicy_'+domainString, 'exploration_type'):
+            self.exploration_type = cfg.get('dqnpolicy_'+domainString, 'exploration_type')
+
+        if cfg.has_option('dqnpolicy_'+domainString, 'episodeNum'):
+            self.episodeNum = cfg.getfloat('dqnpolicy_'+domainString, 'episodeNum')
+
+        if cfg.has_option('dqnpolicy_'+domainString, 'maxiter'):
+            self.maxiter = cfg.getfloat('dqnpolicy_'+domainString, 'maxiter')
+
+        if cfg.has_option('dqnpolicy_'+domainString, 'epsilon'):
+            self.epsilon = cfg.getfloat('dqnpolicy_'+domainString, 'epsilon')
+
+        if cfg.has_option('dqnpolicy_'+domainString, 'epsilon_start'):
+            self.epsilon_start = cfg.getfloat('dqnpolicy_'+domainString, 'epsilon_start')
+
+        if cfg.has_option('dqnpolicy_'+domainString, 'epsilon_end'):
+            self.epsilon_end = cfg.getfloat('dqnpolicy_'+domainString, 'epsilon_end')
+
+        if cfg.has_option('dqnpolicy_'+domainString, 'prior_sample_prob_start'):
+            self.priorProbStart = cfg.getfloat('dqnpolicy_'+domainString, 'prior_sample_prob_start')
+
+        if cfg.has_option('dqnpolicy_'+domainString, 'prior_sample_prob_end'):
+            self.priorProbEnd = cfg.getfloat('dqnpolicy_'+domainString, 'prior_sample_prob_end')
+
+        if cfg.has_option('dqnpolicy_'+domainString, 'features'):
+            logger.info('Features: ' + str(cfg.get('dqnpolicy_'+domainString, 'features')))
+            self.policyfeatures = json.loads(cfg.get('dqnpolicy_'+domainString, 'features'))
+
+        if cfg.has_option('dqnpolicy_'+domainString, 'max_k'):
+            self.max_k = cfg.getint('dqnpolicy_'+domainString, 'max_k')
+
+        if cfg.has_option('dqnpolicy_'+domainString, 'learning_algorithm'):
+            self.learning_algorithm = cfg.get('dqnpolicy_'+domainString, 'learning_algorithm')
+            logger.info('Learning algorithm: ' + self.learning_algorithm)
+
+        if cfg.has_option('dqnpolicy_'+domainString, 'minibatch_size'):
+            self.minibatch_size = cfg.getint('dqnpolicy_'+domainString, 'minibatch_size')
+
+        if cfg.has_option('dqnpolicy_'+domainString, 'capacity'):
+            self.capacity = cfg.getint('dqnpolicy_'+domainString,'capacity')
+
+        if cfg.has_option('dqnpolicy_'+domainString, 'replay_type'):
+            self.replay_type = cfg.get('dqnpolicy_'+domainString, 'replay_type')
+
+        if cfg.has_option('dqnpolicy_'+domainString, 'architecture'):
+            self.architecture = cfg.get('dqnpolicy_'+domainString, 'architecture')
+
+        if cfg.has_option('dqnpolicy_'+domainString, 'q_update'):
+            self.q_update = cfg.get('dqnpolicy_'+domainString, 'q_update')
+
+        if cfg.has_option('dqnpolicy_'+domainString, 'h1_size'):
+            self.h1_size = cfg.getint('dqnpolicy_'+domainString, 'h1_size')
+
+        if cfg.has_option('dqnpolicy_'+domainString, 'h2_size'):
+            self.h2_size = cfg.getint('dqnpolicy_'+domainString, 'h2_size')
+
+        if cfg.has_option('policy_' + domainString, 'save_step'):
+            self.save_step = cfg.getint('policy_' + domainString, 'save_step')
+
+        if cfg.has_option('dqnpolicy_'+domainString, 'importance_sampling'):
+            self.importance_sampling = cfg.get('dqnpolicy_'+domainString, 'importance_sampling')
+
+        if cfg.has_option('dqnpolicy_' + domainString, 'train_iters_per_episode'):
+            self.train_iters_per_episode = cfg.getint('dqnpolicy_' + domainString, 'train_iters_per_episode')
+
+        if cfg.has_option('dqnpolicy_'+domainString, 'training_frequency'):
+            self.training_frequency = cfg.getint('dqnpolicy_'+domainString, 'training_frequency')
+
+        self.episode_ct = 0
+
+        self.episode_ave_max_q = []
+        self.mu_prob = 0.  # behavioral policy
+
+        # os.environ["CUDA_VISIBLE_DEVICES"]=""
+
+        # init session
+        self.sess = tf.Session()
+
+        with tf.device("/cpu:0"):
+
+            np.random.seed(self.randomseed)
+            tf.set_random_seed(self.randomseed)
+            random.seed(self.randomseed)
+
+            # initialise a replay buffer
+            if self.replay_type == 'vanilla':
+                self.episodes[self.domainString] = ReplayBufferEpisode(self.capacity, self.minibatch_size,
+                                                                       self.randomseed)
+            elif self.replay_type == 'prioritized':
+                self.episodes[self.domainString] = ReplayPrioritisedEpisode(self.capacity, self.minibatch_size, self.randomseed)
+            #replay_buffer = ReplayBuffer(self.capacity, self.randomseed)
+            #self.episodes = []
+            self.samplecount = 0
+            self.episodecount = 0
+
+            # construct the models
+            self.state_dim = self.n_in
+            if self.master_space:
+                self.masteraction = MasterAction.MasterAction(domainString)
+                self.inform_ways = len(self.masteraction.inform_ways)
+                self.summary_action_dim = len(self.masteraction.summary_action_names)
+                self.payload_dim = len(self.masteraction.inform_names)
+                self.action_dim = [self.summary_action_dim, self.payload_dim, self.inform_ways]
+                self.global_mu = [0. for _ in range(self.action_dim[0])]
+                #dimension of master space should then be:
+                # summary_action_dim - inform_ways + inform_ways * payload_dim
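+                # e.g. with summary_action_dim=20, inform_ways=3, payload_dim=100 (illustrative numbers):
+                # 20 - 3 + 3 * 100 = 317 master actions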
+            else:
+                self.summaryaction = SummaryAction.SummaryAction(domainString)
+                self.action_dim = len(self.summaryaction.action_names)
+                action_bound = len(self.summaryaction.action_names)
+            #self.stats = [0 for _ in range(self.action_dim)]
+                self.global_mu = [0. for _ in range(self.action_dim)]
+
+            self.sacer = noisy_acer.NoisyACERNetwork(self.sess, self.state_dim, self.action_dim, self.critic_lr, self.delta,
+                                                self.c, self.alpha, self.h1_size, self.h2_size, self.is_training,
+                                                temperature=self.temperature)
+
+            #if self.optimize_ER:
+            #    self.replay_policy = replay_policy.ReplayPolicy(self.sess, seed=self.randomseed)
+
+            # when all models are defined, init all variables
+            init_op = tf.global_variables_initializer()
+            self.sess.run(init_op)
+
+            if self.load_policy:
+                self.loadPolicy(self.in_policy_file)
+                print('loaded replay size: ', self.episodes[self.domainString].size())
+            else:
+                print("We do not load a previous policy.")
+
+            if self.curiosityreward:
+                self.curiosityFunctions = Curious()
+            #self.acer.update_target_network()
+
+    def get_n_in(self, domain_string):
+        if domain_string == 'CamRestaurants':
+            return 268
+        elif domain_string == 'CamHotels':
+            return 111
+        elif domain_string == 'SFRestaurants':
+            return 636
+        elif domain_string == 'SFHotels':
+            return 438
+        elif domain_string == 'Laptops6':
+            return 268 # ic340: this is wrong
+        elif domain_string == 'Laptops11':
+            return 257
+        elif domain_string == 'TV':
+            return 188
+        else:
+            print('DOMAIN {} SIZE NOT SPECIFIED, PLEASE DEFINE n_in'.format(domain_string))
+
+    def act_on(self, state, hyps=None):
+        if self.lastSystemAction is None and self.startwithhello:
+            systemAct, nextaIdex, mu, mask = 'hello()', -1, None, None
+        else:
+            systemAct, nextaIdex, mu, mask = self.nextAction(state)
+        self.lastSystemAction = systemAct
+        self.summaryAct = nextaIdex
+        self.prev_mu = mu
+        self.prev_mask = mask
+        self.prevbelief = state
+
+        systemAct = DiaAct.DiaAct(systemAct)
+        return systemAct
+
+    def record(self, reward, domainInControl=None, weight=None, state=None, action=None):
+        if domainInControl is None:
+            domainInControl = self.domainString
+        if self.actToBeRecorded is None:
+            #self.actToBeRecorded = self.lastSystemAction
+            self.actToBeRecorded = self.summaryAct
+
+        if state is None:
+            state = self.prevbelief
+        if action is None:
+            action = self.actToBeRecorded
+        mu_weight = self.prev_mu
+        mask = self.prev_mask
+
+        cState, cAction = self.convertStateAction(state, action)
+
+        # normalising total return to -1~1
+        #reward /= 40.0
+        reward /= 20.0
+        """
+        reward = float(reward+10.0)/40.0
+        """
+        value = self.sacer.predict_value([cState], [mask])
+
+        if self.replay_type == 'vanilla':
+            self.episodes[domainInControl].record(state=cState, \
+                    state_ori=state, action=cAction, reward=reward, value=value[0], distribution=mu_weight, mask=mask)
+        elif self.replay_type == 'prioritized':
+            self.episodes[domainInControl].record(state=cState, \
+                    state_ori=state, action=cAction, reward=reward, value=value[0], distribution=mu_weight, mask=mask)
+
+        self.actToBeRecorded = None
+        self.samplecount += 1
+        return
+
+    def finalizeRecord(self, reward, domainInControl=None):
+        if domainInControl is None:
+            domainInControl = self.domainString
+        if self.episodes[domainInControl] is None:
+            logger.warning("record attempted to be finalized for domain where nothing has been recorded before")
+            return
+
+        #print 'Episode Avg_Max_Q', float(self.episode_ave_max_q)/float(self.episodes[domainInControl].size())
+        #print 'Episode Avg_Max_Q', np.mean(self.episode_ave_max_q)
+        #print self.stats
+
+        # normalising total return to -1~1
+        reward /= 20.0
+
+        terminal_state, terminal_action = self.convertStateAction(TerminalState(), TerminalAction())
+        value = 0.0  # no effect on experience replay
+
+        def calculate_discountR_advantage(r_episode, v_episode):
+            #########################################################################
+            # Here we take the rewards and values from the rollout, and use them to
+            # generate the advantage and discounted returns.
+            # The advantage function uses "Generalized Advantage Estimation"
+            bootstrap_value = 0.0
+            self.r_episode_plus = np.asarray(r_episode + [bootstrap_value])
+            discounted_r_episode = discount(self.r_episode_plus,self.gamma)[:-1]
+            self.v_episode_plus = np.asarray(v_episode + [bootstrap_value])
+            advantage = r_episode + self.gamma * self.v_episode_plus[1:] - self.v_episode_plus[:-1]
+            advantage = discount(advantage,self.gamma)
+            #########################################################################
+            return discounted_r_episode, advantage
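+        # Sketch of the assumed discount() helper (defined elsewhere in this module):
+        # discount(x, gamma)[t] = sum_k gamma**k * x[t+k]. For example, with
+        # r_episode = [0, 0, 1] and gamma = 0.9 the discounted returns are
+        # [0.81, 0.9, 1.0] (a terminal bootstrap value of 0 is appended first).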
+
+        if self.replay_type == 'vanilla':
+            self.episodes[domainInControl].record(state=terminal_state, \
+                    state_ori=TerminalState(), action=terminal_action, reward=reward, value=value, terminal=True, distribution=None)
+        elif self.replay_type == 'prioritized':
+            episode_r, episode_v = self.episodes[domainInControl].record_final_and_get_episode(state=terminal_state, \
+                                                                                               state_ori=TerminalState(),
+                                                                                               action=terminal_action,
+                                                                                               reward=reward,
+                                                                                               value=value)
+
+            # TD_error is a list of td error in the current episode
+            _, TD_error = calculate_discountR_advantage(episode_r, episode_v)
+            episodic_TD = np.mean(np.absolute(TD_error))
+            print('episodic_TD:', episodic_TD)
+            self.episodes[domainInControl].insertPriority(episodic_TD)
+
+        return
+
+    def convertStateAction(self, state, action):
+        if isinstance(state, TerminalState):
+            if self.domainUtil.domainString == 'CamRestaurants':
+                return [0] * 268, action
+            elif self.domainUtil.domainString == 'CamHotels':
+                return [0] * 111, action
+            elif self.domainUtil.domainString == 'SFRestaurants':
+                return [0] * 636, action  # keep the dimension consistent with get_n_in
+            elif self.domainUtil.domainString == 'SFHotels':
+                return [0] * 438, action
+            elif self.domainUtil.domainString == 'Laptops11':
+                return [0] * 257, action
+            elif self.domainUtil.domainString == 'TV':
+                return [0] * 188, action
+        else:
+            flat_belief = flatten_belief(state, self.domainUtil)
+            self.prev_state_check = flat_belief
+
+            return flat_belief, action
+
+    def nextAction(self, beliefstate):
+        '''
+        select next action
+
+        :param beliefstate: current belief state
+        :returns: (masterAct, nextaIdex, mu, execMask) for the selected summary action
+        '''
+        beliefVec = flatten_belief(beliefstate, self.domainUtil)
+        if self.master_space:
+            execMask = self.masteraction.getExecutableMask()
+        else:
+            execMask = self.summaryaction.getExecutableMask(beliefstate, self.lastSystemAction)
+        #execMask = np.zeros(self.action_dim)
+
+        def apply_mask(prob, maskval, baseline=9.99999975e-06):
+            return prob if maskval == 0.0 else baseline # not quite 0.0 to avoid division by zero
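+        # e.g. apply_mask(0.3, 0.0) == 0.3, while any non-zero mask value maps the
+        # probability to the tiny baseline rather than exactly 0.0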
+
+        action_prob = self.sacer.predict_policy(np.reshape(beliefVec, (1, len(beliefVec))),
+                                               np.reshape(execMask, (1, len(execMask))))[0]
+
+        if self.exploration_type == 'e-greedy' or not self.is_training:
+            # epsilon greedy
+            epsilon = self.epsilon if self.is_training else 0.
+            if not self.master_space:
+                # a bit hacky here because execMask has a different shape than action_prob
+                eps_prob = [apply_mask(prob, admissible) for prob, admissible in zip(np.ones(len(action_prob)), execMask)]
+            else:
+                #this is fine because we have no execMask for master space at the moment
+                eps_prob = np.ones(len(action_prob))
+            eps_prob /= sum(eps_prob)
+
+            #action_prob = [apply_mask(prob, admissible) for prob, admissible in zip(action_prob, execMask)]
+            best_index = np.argmax(action_prob)
+            best_prob = [1. if i == best_index else 0. for i in range(len(action_prob))]
+
+            #we sample a random action with probability epsilon and sample from the policy distribution with probability 1-epsilon
+            action_prob = epsilon * np.array(eps_prob) + (1. - epsilon) * action_prob
+
+            #take the greedy action during evaluation
+            if not self.is_training:
+                action_prob = np.array(best_prob)
+
+        elif self.exploration_type == 'standard':
+            #action_prob = [apply_mask(prob, admissible) for prob, admissible in zip(action_prob, execMask)]
+            print(action_prob)
+
+        if not self.is_training:
+            best_index = np.argmax(action_prob)
+            best_prob = [1. if i == best_index else 0. for i in range(len(action_prob))]
+            action_prob = np.array(best_prob)
+
+        nextaIdex = np.random.choice(len(action_prob), p=action_prob / sum(action_prob))
+        mu = action_prob / sum(action_prob)
+
+        if self.master_space:
+            beliefstate = beliefstate.getDomainState(self.domainUtil.domainString)
+            print("MASTER ACTION: ", self.masteraction.action_names[nextaIdex])
+            masterAct = self.masteraction.Convert(beliefstate, self.masteraction.action_names[nextaIdex], self.lastSystemAction)
+            print("MASTER ACT: ", masterAct)
+        else:
+            summaryAct = self.summaryaction.action_names[nextaIdex]
+            beliefstate = beliefstate.getDomainState(self.domainUtil.domainString)
+            masterAct = self.summaryaction.Convert(beliefstate, summaryAct, self.lastSystemAction)
+        return masterAct, nextaIdex, mu, execMask
+
+    def train(self):
+        '''
+        call this function when the episode ends
+        '''
+        USE_GLOBAL_MU = False
+        self.episode_ct += 1
+
+        if not self.is_training:
+            logger.info("Not in training mode")
+            return
+        else:
+            logger.info("Update acer policy parameters.")
+
+        self.episodecount += 1
+        logger.info("Sample Num so far: %s" % (self.samplecount))
+        logger.info("Episode Num so far: %s" % (self.episodecount))
+        #if True:
+        if self.samplecount >= self.minibatch_size * 3 and self.episodecount % self.training_frequency == 0:
+        # if self.episodecount >= self.minibatch_size  and self.episodecount % 2 == 0:
+        # if self.episodecount >= self.minibatch_size * 3 and self.episodecount % 2 == 0:
+        # if self.samplecount >= self.capacity and self.episodecount % 5 == 0:
+            logger.info('start training...')
+
+            for _ in range(self.train_iters_per_episode):
+
+                if self.optimize_ER:
+                    episode_features = self.compute_episode_features()
+                    sub_buffer = self.replay_policy.sample_buffer(episode_features, self.episodes[self.domainString].buffer)
+                else:
+                    sub_buffer = []
+
+                if self.replay_type == 'vanilla' or self.replay_type == 'prioritized':
+                    s_batch, s_ori_batch, a_batch, r_batch, s2_batch, s2_ori_batch, t_batch, idx_batch, v_batch, mu_policy, mask_batch = \
+                        self.episodes[self.domainString].sample_batch(sub_buffer)
+                    if USE_GLOBAL_MU:
+                        mu_sum = sum(self.global_mu)
+                        mu_normalised = np.array([c / mu_sum for c in self.global_mu])
+                        #print >> sys.stderr, len(mu_policy), len(mu_policy[0]), mu_policy[0][0]
+                        mu_policy = [[mu_normalised for _ in range(len(mu_policy[i]))] for i in range(len(mu_policy))]
+                else:
+                    assert False  # not implemented yet
+
+                discounted_r_batch = []
+                advantage_batch = []
+
+                def calculate_discountR_advantage(r_episode, v_episode):
+                    #########################################################################
+                    # Here we take the rewards and values from the rollout, and use them to
+                    # generate the advantage and discounted returns.
+                    # The advantage function uses "Generalized Advantage Estimation"
+                    bootstrap_value = 0.0
+                    # r_episode rescale by rhos?
+                    self.r_episode_plus = np.asarray(r_episode + [bootstrap_value])
+                    discounted_r_episode = discount(self.r_episode_plus, self.gamma)[:-1]
+                    self.v_episode_plus = np.asarray(v_episode + [bootstrap_value])
+                    # change sth here
+                    advantage = r_episode + self.gamma * self.v_episode_plus[1:] - self.v_episode_plus[:-1]
+                    advantage = discount(advantage, self.gamma)
+                    #########################################################################
+                    return discounted_r_episode, advantage
+
+                if self.replay_type == 'prioritized':
+                    for item_r, item_v, item_idx in zip(r_batch, v_batch, idx_batch):
+                        # r, a = calculate_discountR_advantage(item_r, np.concatenate(item_v).ravel().tolist())
+                        r, a = calculate_discountR_advantage(item_r, item_v)
+
+                        # flatten nested numpy array and turn it into list
+                        discounted_r_batch += r.tolist()
+                        advantage_batch += a.tolist()
+
+                        # update the sum-tree
+                        # update the TD error of the samples (episode) in the minibatch
+                        episodic_TD_error = np.mean(np.absolute(a))
+                        self.episodes[self.domainString].update(item_idx, episodic_TD_error)
+                else:
+                    for item_r, item_v in zip(r_batch, v_batch):
+                        # r, a = calculate_discountR_advantage(item_r, np.concatenate(item_v).ravel().tolist())
+                        r, a = calculate_discountR_advantage(item_r, item_v)
+
+                        # flatten nested numpy array and turn it into list
+                        discounted_r_batch += r.tolist()
+                        advantage_batch += a.tolist()
+
+                batch_size = len(s_batch)
+
+                if self.optimize_ER:
+
+                    if self.episodes[self.domainString].count < self.minibatch_size:
+                        random_indices = list(range(len(self.episodes[self.domainString].buffer)))
+                    else:
+                        random_indices = random.sample(range(len(self.episodes[self.domainString].buffer)), self.minibatch_size)
+                    if len(self.replay_policy.sampled_indices) < self.minibatch_size:
+                        random_sampled_indices = self.replay_policy.sampled_indices
+                    else:
+                        random_sampled_indices = random.sample(self.replay_policy.sampled_indices, self.minibatch_size)
+                    sampled_indices = random_indices + random_sampled_indices
+
+                    start_states = [self.episodes[self.domainString].buffer[i][0][0] for i in sampled_indices]
+                    start_masks = [self.episodes[self.domainString].buffer[i][0][9] for i in sampled_indices]
+
+                    start_values = self.sacer.predict_value(start_states, start_masks)
+                    average_start_value = np.mean(start_values)
+
+                if self.master_space:
+                    a_dim = self.action_dim[0] - self.action_dim[2] + self.action_dim[2] * self.action_dim[1]
+                else:
+                    a_dim = self.action_dim
+                a_batch_one_hot = np.eye(a_dim)[np.concatenate(a_batch, axis=0).tolist()]
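+                # e.g. np.eye(4)[[0, 2]] gives [[1, 0, 0, 0], [0, 0, 1, 0]]:
+                # one one-hot row per recorded action index (illustrative shapes)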
+
+                if self.behaviour_cloning:
+                    behaviour_mask = []
+                    for r in r_batch:
+                        if r[-1] > 0:
+                            #episode was successful
+                            behaviour_mask = behaviour_mask + [1] * len(r)
+                        else:
+                            behaviour_mask = behaviour_mask + [0] * len(r)
+                    behaviour_mask = np.array(behaviour_mask, dtype=np.float32)
+                else:
+                    behaviour_mask = np.zeros(shape=[sum([len(l) for l in s_batch])], dtype=np.float32)
+
+                # train curiosity model (Paula)
+                if self.curiosityreward:
+                    self.curiosityFunctions.training(np.concatenate(np.array(s2_batch), axis=0).tolist(),
+                                                     np.concatenate(np.array(s_batch), axis=0).tolist(),
+                                                     a_batch_one_hot)
+
+                loss, entropy, optimize = \
+                    self.sacer.train(np.concatenate(np.array(s_batch), axis=0).tolist(), a_batch_one_hot,
+                                    np.concatenate(np.array(mask_batch), axis=0).tolist(),
+                                    np.concatenate(np.array(r_batch), axis=0).tolist(), s_batch, r_batch, self.gamma,
+                                    np.concatenate(np.array(mu_policy), axis=0),
+                                    discounted_r_batch, advantage_batch,
+                                     mu_values=np.concatenate(np.array(v_batch), axis=0),
+                                     behaviour_mask=behaviour_mask)
+
+                ent, norm_loss = entropy/float(batch_size), loss/float(batch_size)
+
+                if self.optimize_ER:
+                    start_values = self.sacer.predict_value(start_states, start_masks)
+                    new_average_start_value = np.mean(start_values)
+
+                    for number, index in enumerate(random_indices):
+                        self.episodes[self.domainString].buffer[index][0][6] = start_values[number]
+                    for number, index in enumerate(random_sampled_indices):
+                        self.episodes[self.domainString].buffer[index][0][6] = start_values[number + len(random_indices)]
+
+                    if self.minibatch_size < self.episodecount:
+                        print("REWARD SIGNAL ER ACTOR: ", new_average_start_value - average_start_value)
+                        self.replay_policy.train_ER_actor(new_average_start_value - average_start_value)
+
+            self.savePolicyInc()  # self.out_policy_file)
+
+    def savePolicy(self, FORCE_SAVE=False):
+        """
+        Does not use this, cause it will be called from agent after every episode.
+        we want to save the policy only periodically.
+        """
+        pass
+
+    def savePolicyInc(self, FORCE_SAVE=False):
+        """
+        save model and replay buffer
+        """
+        if self.episodecount % self.save_step == 0:
+            # save_path = self.saver.save(self.sess, self.out_policy_file+'.ckpt')
+            self.sacer.save_network(self.out_policy_file+'.acer.ckpt')
+            if self.curiosityreward:
+                self.curiosityFunctions.save_ICM('_curiosity_model/ckpt-curiosity')
+
+            f = open(self.out_policy_file+'.episode', 'wb')
+            for obj in [self.episodecount, self.episodes[self.domainString], self.global_mu]:
+                pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)
+            f.close()
+            # logger.info("Saving model to %s and replay buffer..." % save_path)
+
+    def loadPolicy(self, filename):
+        """
+        load model and replay buffer
+        """
+        # load models
+        self.sacer.load_network(filename+'.acer.ckpt')
+        # load replay buffer
+        if self.load_buffer:
+            try:
+                print('load from: ', filename)
+                f = open(filename+'.episode', 'rb')
+                loaded_objects = []
+                for i in range(3):  # episode count, replay buffer and global_mu
+                    loaded_objects.append(pickle.load(f))
+                self.episodecount = int(loaded_objects[0])
+                self.episodes[self.domainString] = copy.deepcopy(loaded_objects[1])
+                self.global_mu = loaded_objects[2]
+                logger.info("Loading both model from %s and replay buffer..." % filename)
+                f.close()
+            except:
+                logger.info("Loading only models...")
+        else:
+            print("SACER: We do not load the buffer.")
+
+    def restart(self):
+        self.summaryAct = None
+        self.lastSystemAction = None
+        self.prevbelief = None
+        self.prev_mu = None
+        self.prev_mask = None
+        self.actToBeRecorded = None
+        self.epsilon = self.epsilon_start - (self.epsilon_start - self.epsilon_end) * float(self.episodeNum+self.episodecount) / float(self.maxiter)
+        # print 'current eps', self.epsilon
+        # self.episodes = dict.fromkeys(OntologyUtils.available_domains, None)
+        # self.episodes[self.domainString] = ReplayBuffer(self.capacity, self.randomseed)
+        self.episode_ave_max_q = []
+
+    def compute_episode_features(self):
+
+        episode_features = []
+
+        self.update_buffer_divergence()
+
+        for index, episode in enumerate(self.episodes[self.domainString].buffer):
+
+            success = 1 if episode[-1][3] > 0 else 0
+            total_return = len(episode)
+            if success == 1:
+                total_return += 1
+
+            timestep = index / len(self.episodes[self.domainString].buffer)
+
+            episode_features.append([success, total_return, timestep, episode[0][6], episode[-1][10]])
+
+        successful_dialogs = [1 for epi in episode_features if epi[0]==1]
+        print("NUMBER OF SUCCESSFUL DIALOGS: ", len(successful_dialogs))
+
+        return episode_features
+
+    def update_buffer_divergence(self):
+        if self.episodes[self.domainString].count < self.minibatch_size:
+            random_indices = list(range(len(self.episodes[self.domainString].buffer)))
+        else:
+            random_indices = random.sample(range(len(self.episodes[self.domainString].buffer)), self.minibatch_size)
+
+        episodes = [self.episodes[self.domainString].buffer[i] for i in random_indices]
+
+        s_batch = [timestep[0] for epi in episodes for timestep in epi]
+        a_batch = [timestep[2] for epi in episodes for timestep in epi]
+        mu_policy = [timestep[8] for epi in episodes for timestep in epi]
+        mask_batch = [timestep[9] for epi in episodes for timestep in epi]
+
+        a_batch_one_hot = np.eye(self.action_dim)[a_batch]
+
+        rho = self.sacer.compute_rho(s_batch, a_batch_one_hot, mu_policy, mask_batch)
+
+        #pi_prob = self.sacer.compute_responsible_output(s_batch, a_batch_one_hot, mask_batch)
+        #product = rho * pi_prob
+
+        #TODO: normalize by c?
+        product = np.minimum(self.c, rho)
+
+        offset = 0
+        for index in random_indices:
+            length = len(self.episodes[self.domainString].buffer[index])
+            episode_divergence = product[offset:length+offset]
+            episode_divergence = sum(episode_divergence) / length
+
+            offset = offset + length
+
+            self.episodes[self.domainString].buffer[index][-1][10] = episode_divergence
diff --git a/policy/feudalgainRL/__init__.py b/policy/feudalgainRL/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/policy/feudalgainRL/dqn_latest.py b/policy/feudalgainRL/dqn_latest.py
new file mode 100644
index 0000000000000000000000000000000000000000..f945067231ef7176b671fd6c5d35dea2599586e4
--- /dev/null
+++ b/policy/feudalgainRL/dqn_latest.py
@@ -0,0 +1,197 @@
+###############################################################################
+# PyDial: Multi-domain Statistical Spoken Dialogue System Software
+###############################################################################
+#
+# Copyright 2015 - 2019
+# Cambridge University Engineering Department Dialogue Systems Group
+#
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+###############################################################################
+
+""" 
+Implementation of DQAN -  Deep Q Action Network
+
+The algorithm is developed with tflearn + Tensorflow
+
+Author: Pei-Hao Su
+"""
+import tensorflow as tf
+import numpy as np
+import tflearn
+
+from policy.DRL.replay_buffer import ReplayBuffer
+
+# ===========================
+#   Deep Q Action Network
+# ===========================
+class DeepQNetwork(object):
+    """ 
+    Input to the network is the state and action, output is Q(s,a).
+    """
+    def __init__(self, sess, state_dim, action_dim, learning_rate, tau, \
+                    num_actor_vars, architecture = 'duel', h1_size = 130, h2_size = 50):
+        self.sess = sess
+        self.s_dim = state_dim
+        self.a_dim = action_dim
+        self.learning_rate = learning_rate
+        self.tau = tau
+        self.architecture = architecture
+        self.h1_size = h1_size
+        self.h2_size = h2_size
+
+        # Create the deep Q network
+        self.inputs, self.action, self.Qout = \
+                        self.create_ddq_network(self.architecture, self.h1_size, self.h2_size)
+        self.network_params = tf.trainable_variables()
+
+        # Target Network
+        self.target_inputs, self.target_action, self.target_Qout = \
+                        self.create_ddq_network(self.architecture, self.h1_size, self.h2_size)
+        self.target_network_params = tf.trainable_variables()[len(self.network_params):]
+
+        # Op for periodically updating target network
+        self.update_target_network_params = \
+            [self.target_network_params[i].assign(\
+                tf.multiply(self.network_params[i], self.tau) + tf.multiply(self.target_network_params[i], 1. - self.tau))
+                for i in range(len(self.target_network_params))]
+
+        # Network target (y_i)
+        self.sampled_q = tf.placeholder(tf.float32, [None, 1])
+        
+        # Predicted Q given state and chosen action
+        #actions_one_hot = self.action
+        #self.pred_q = tf.reduce_sum(self.Qout * actions_one_hot, reduction_indices=1, name='q_acted')
+        self.pred_q = self.Qout
+
+        self.diff = self.sampled_q - self.pred_q
+
+        self.loss = tf.reduce_mean(self.clipped_error(self.diff), name='loss')
+
+        self.optimizer = tf.train.AdamOptimizer(self.learning_rate)
+        self.optimize = self.optimizer.minimize(self.loss)
+       
+    def create_ddq_network(self, architecture = 'duel', h1_size = 130, h2_size = 50):
+        inputs = tf.placeholder(tf.float32, [None, self.s_dim])
+        action = tf.placeholder(tf.float32, [None, self.a_dim])
+
+        # state network
+        W_fc1_s = tf.Variable(tf.truncated_normal([self.s_dim, h1_size], stddev=0.01))
+        b_fc1_s = tf.Variable(tf.zeros([h1_size]))
+        h_fc1_s = tf.nn.relu(tf.matmul(inputs, W_fc1_s) + b_fc1_s)
+
+        # action network
+        W_fc1_a = tf.Variable(tf.truncated_normal([self.a_dim, h1_size], stddev=0.01))
+        b_fc1_a = tf.Variable(tf.zeros([h1_size]))
+        h_fc1_a = tf.nn.relu(tf.matmul(action, W_fc1_a) + b_fc1_a)
+
+
+        #h_fc1 = tf.nn.tanh(tf.matmul(inputs, W_fc1) + b_fc1)
+        #if architecture == 'duel':
+        if False:
+
+            """
+            W_fc2_s = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01))
+            b_fc2_s = tf.Variable(tf.zeros([h2_size]))
+            h_fc2_s = tf.nn.relu(tf.matmul(h_fc1_s, W_fc2_s) + b_fc2_s)
+
+            W_value = tf.Variable(tf.truncated_normal([h2_size, 1], stddev=0.01))
+            b_value = tf.Variable(tf.zeros([1]))
+            value_out  = tf.matmul(h_fc2_s, W_value) + b_value
+
+
+
+            W_fc2_a = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01))
+            b_fc2_a = tf.Variable(tf.zeros([h2_size]))
+            h_fc2_a = tf.nn.relu(tf.matmul(h_fc1_a, W_fc2_a) + b_fc2_a)
+
+            Qout = tf.reduce_sum( tf.mul( h_fc2_s,h_fc2_a ), 1)
+            """
+
+
+            # value function
+            W_value = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01))
+            b_value = tf.Variable(tf.zeros([h2_size]))
+            h_value = tf.nn.relu(tf.matmul(h_fc1, W_value) + b_value)
+
+            W_value = tf.Variable(tf.truncated_normal([h2_size, 1], stddev=0.01))
+            b_value = tf.Variable(tf.zeros([1]))
+            value_out  = tf.matmul(h_value, W_value) + b_value
+
+            # advantage function
+            W_advantage = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01))
+            b_advantage = tf.Variable(tf.zeros([h2_size]))
+            h_advantage = tf.nn.relu(tf.matmul(h_fc1, W_advantage) + b_advantage)
+
+            W_advantage = tf.Variable(tf.truncated_normal([h2_size, self.a_dim], stddev=0.01))
+            b_advantage = tf.Variable(tf.zeros([self.a_dim]))
+            Advantage_out  = tf.matmul(h_advantage, W_advantage) + b_advantage
+
+            Qout = value_out + (Advantage_out - tf.reduce_mean(Advantage_out, reduction_indices=1, keep_dims=True))
+
+        else:
+            W_fc2_s = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01))
+            b_fc2_s = tf.Variable(tf.zeros([h2_size]))
+            h_fc2_s = tf.nn.relu(tf.matmul(h_fc1_s, W_fc2_s) + b_fc2_s)
+            
+            W_fc2_a = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01))
+            b_fc2_a = tf.Variable(tf.zeros([h2_size]))
+            h_fc2_a = tf.nn.relu(tf.matmul(h_fc1_a, W_fc2_a) + b_fc2_a)
+
+            # inner product of state s and action a
+            #Qout = tf.mul(h_fc2_s,h_fc2_a)
+            Qout = tf.reduce_sum(tf.multiply(h_fc2_s, h_fc2_a), 1)
+            #Qout = tf.reduce_sum( tf.mul( h_fc2_s,h_fc2_a ), 1, keep_dims=True )
+            #Qout = tf.reduce_sum(tf.mul(h_fc2_s,h_fc2_a))
+        
+        return inputs, action, Qout
+
+    def train(self, inputs, action, sampled_q):
+        return self.sess.run([self.pred_q, self.optimize, self.loss], feed_dict={
+            self.inputs: inputs,
+            self.action: action,
+            self.sampled_q: sampled_q
+        })
+
+    def predict(self, inputs, action):
+        #return self.sess.run(self.pred_q, feed_dict={
+        return self.sess.run(self.Qout, feed_dict={
+            self.inputs: inputs,
+            self.action: action
+        })
+
+    def predict_target(self, inputs, action):
+        #return self.sess.run(self.pred_q, feed_dict={
+        return self.sess.run(self.target_Qout, feed_dict={
+            self.target_inputs: inputs,
+            self.target_action: action
+        })
+
+    def update_target_network(self):
+        self.sess.run(self.update_target_network_params)
+
+    def load_network(self, load_filename):
+        self.saver = tf.train.Saver()
+        try:
+            self.saver.restore(self.sess, load_filename)
+            print("Successfully loaded:", load_filename)
+        except:
+            print("Could not find old network weights")
+
+    def save_network(self, save_filename):
+        print('Saving deepq-network...')
+        self.saver.save(self.sess, save_filename)
+
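+    # Huber-style loss: quadratic for |x| < 1, linear (|x| - 0.5) otherwise, which
+    # caps the gradient magnitude of large TD errors at 1.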
+    def clipped_error(self, x): 
+        return tf.where(tf.abs(x) < 1.0, 0.5 * tf.square(x), tf.abs(x) - 0.5)  # condition, true, false
diff --git a/policy/feudalgainRL/feudalUtils.py b/policy/feudalgainRL/feudalUtils.py
new file mode 100644
index 0000000000000000000000000000000000000000..27d7b625013bc256fe24ad22102f733e2ae57132
--- /dev/null
+++ b/policy/feudalgainRL/feudalUtils.py
@@ -0,0 +1,128 @@
+###############################################################################
+# PyDial: Multi-domain Statistical Spoken Dialogue System Software
+###############################################################################
+#
+# Copyright 2015 - 2019
+# Cambridge University Engineering Department Dialogue Systems Group
+#
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+###############################################################################
+
+import sys
+import numpy as np
+
+import ontology.FlatOntologyManager as FlatOnt
+
+
+def get_feudal_masks(non_exec, slots, slot_independent_actions, slot_specific_actions):
+
+    feudal_masks = {'req_info': {}, 'give_info': None, 'master': None}
+    give_info_masks = np.zeros(len(slot_independent_actions))
+    give_info_masks[-1] = -sys.maxsize
+    for i, action in enumerate(slot_independent_actions):
+        if action in non_exec:
+            give_info_masks[i] = -sys.maxsize
+    feudal_masks['give_info'] = give_info_masks
+    for slot in slots:
+        feudal_masks['req_info'][slot] = np.zeros(len(slot_specific_actions))
+        feudal_masks['req_info'][slot][-1] = -sys.maxsize
+        for i, action in enumerate(slot_specific_actions):
+            if action == 'reqmore':
+                if action in non_exec:
+                    feudal_masks['req_info'][slot][i] = -sys.maxsize
+            elif action + '_' + slot in non_exec:
+                feudal_masks['req_info'][slot][i] = -sys.maxsize
+    master_masks = np.zeros(3)
+    master_masks[:] = -sys.maxsize
+    if 0 in give_info_masks:
+        master_masks[0] = 0
+    for slot in slots:
+        if 0 in feudal_masks['req_info'][slot]:
+            master_masks[1] = 0
+    feudal_masks['master'] = master_masks
+    # print(non_exec)
+    # print(feudal_masks)
+    return feudal_masks
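+
+# Minimal usage sketch (hypothetical action lists): with
+# slot_independent_actions = ['inform', 'reqmore', 'pass'] and
+# slot_specific_actions = ['request', 'confirm', 'pass'], get_feudal_masks returns
+# {'give_info': 3 logit offsets, 'req_info': 3 offsets per slot, 'master': 3 offsets},
+# where 0 marks an executable action and -sys.maxsize masks it out before the softmax.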
+
+def get_feudalAC_masks(non_exec, slots, slot_independent_actions, slot_specific_actions, only_master=True):
+
+    if only_master:
+
+        feudal_masks = {'req_info': {}, 'give_info': None, 'master': None}
+        give_info_masks = np.zeros(len(slot_independent_actions))
+        give_info_masks[-1] = -sys.maxsize
+        for i, action in enumerate(slot_independent_actions):
+            if action in non_exec:
+                give_info_masks[i] = -sys.maxsize
+        feudal_masks['give_info'] = give_info_masks
+        for slot in slots:
+            feudal_masks['req_info'][slot] = np.zeros(len(slot_specific_actions))
+            feudal_masks['req_info'][slot][-1] = -sys.maxsize
+            for i, action in enumerate(slot_specific_actions):
+                if action + '_' + slot in non_exec:
+                    feudal_masks['req_info'][slot][i] = -sys.maxsize
+        #master_masks = np.zeros(len(slot_independent_actions))
+        #master_masks[:] = -sys.maxsize
+        #if 0 in give_info_masks:
+        #    master_masks[-2] = 0
+        for i, slot in enumerate(slots):
+            if 0 in feudal_masks['req_info'][slot]:
+                give_info_masks[-1] = 0
+        feudal_masks['master'] = give_info_masks
+        # print(non_exec)
+        # print(feudal_masks)
+        return feudal_masks
+
+    else:
+        feudal_masks = {'req_info': {}, 'give_info': None, 'master': None}
+        give_info_masks = np.zeros(len(slot_independent_actions))
+        give_info_masks[-1] = -sys.maxsize
+        for i, action in enumerate(slot_independent_actions):
+            if action in non_exec:
+                give_info_masks[i] = -sys.maxsize
+        feudal_masks['give_info'] = give_info_masks
+        for slot in slots:
+            feudal_masks['req_info'][slot] = np.zeros(len(slot_specific_actions))
+            feudal_masks['req_info'][slot][-1] = -sys.maxsize
+            for i, action in enumerate(slot_specific_actions):
+                if action + '_' + slot in non_exec:
+                    feudal_masks['req_info'][slot][i] = -sys.maxsize
+        master_masks = np.zeros(2)
+        master_masks[:] = -sys.maxsize
+        if 0 in give_info_masks:
+            master_masks[0] = 0
+        for i, slot in enumerate(slots):
+            if 0 in feudal_masks['req_info'][slot]:
+                master_masks[1] = 0
+        feudal_masks['master'] = master_masks
+        # print(non_exec)
+        # print(feudal_masks)
+        return feudal_masks
+
+
+
+def get_feudal_slot_mask(non_exec, slot, slot_actions):
+    slot_masks = np.zeros(len(slot_actions))
+    slot_masks[-1] = -sys.maxsize
+    if slot == 'master' or slot == 'give_info':
+        for i, action in enumerate(slot_actions):
+            if action in non_exec:
+                slot_masks[i] = -sys.maxsize
+    else:
+        for i, action in enumerate(slot_actions):
+            action = action+'_'+slot
+            if action in non_exec:
+                slot_masks[i] = -sys.maxsize
+    return slot_masks
\ No newline at end of file
diff --git a/policy/feudalgainRL/noisyacer.py b/policy/feudalgainRL/noisyacer.py
new file mode 100644
index 0000000000000000000000000000000000000000..da52ad6ad9af57907094797589ee5cb9b954ab00
--- /dev/null
+++ b/policy/feudalgainRL/noisyacer.py
@@ -0,0 +1,588 @@
+###############################################################################
+# PyDial: Multi-domain Statistical Spoken Dialogue System Software
+###############################################################################
+#
+# Copyright 2015 - 2019
+# Cambridge University Engineering Department Dialogue Systems Group
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+###############################################################################
+
+"""
+Implementation of ACER
+
+The algorithm is developed with Tensorflow
+
+Author: Gellert Weisz
+"""
+
+
+import numpy as np
+import tensorflow as tf
+
+from random import choice
+from time import sleep
+from time import time
+
+import sys # todo remove later
+
+# ===========================
+#   Soft Actor Critic with Experience Replay
+# ===========================
+
+
+class NoisyACERNetwork(object):
+    def __init__(self, sess, state_dim, action_dim, learning_rate, delta, c, alpha, h1_size=130, h2_size=50,
+                 is_training = True, actfreq_loss=None, temperature=0, critic_regularizer_weight=0, noisy_acer=False):
+        self.sess = sess
+        self.s_dim = state_dim
+        self.a_dim = action_dim
+        if isinstance(action_dim, list):
+            self.master_space = True
+            self.master_space_dim = self.a_dim[0] - self.a_dim[2] + self.a_dim[2] * self.a_dim[1]
+        else:
+            self.master_space = False
+        self.learning_rate = learning_rate
+        self.critic_regularizer_weight = critic_regularizer_weight
+        if self.critic_regularizer_weight != 0:
+            print(f"We use a regularizer for the critic with weight {self.critic_regularizer_weight}.")
+        self.delta = delta
+        self.c = c
+        self.noisy_acer = noisy_acer
+        self.alpha = alpha
+        self.h1_size = h1_size
+        self.h2_size = h2_size
+        self.is_training = is_training
+        self.temperature = temperature
+        if self.temperature != 0:
+            print("Using soft ACER, temperature set to: ", self.temperature)
+        else:
+            print("Temperature of Maximum Entropy set to 0, using ACER.")
+
+        #Input and hidden layers
+        self.inputs = tf.placeholder(tf.float32, [None, self.s_dim])
+        self.actions = tf.placeholder(tf.float32, [None, self.a_dim])
+        self.execMask = tf.placeholder(tf.float32, [None, self.a_dim])
+        self.behaviour_mask = tf.placeholder_with_default(tf.zeros(tf.shape(self.actions)[0], dtype=tf.float32), shape=[None])
+        self.mu_values = tf.placeholder(tf.float32, [None])
+        self.mu = tf.placeholder(tf.float32, [None, self.a_dim])
+
+        if self.noisy_acer:
+            print("WE USE NOISY ACER")
+            self.policy, self.q = self.construct_noisy_network()
+            self.network_params = tf.trainable_variables()
+            self.avg_policy, _ = self.construct_noisy_network()
+            self.target_network_params = tf.trainable_variables()[len(self.network_params):]
+        else:
+            self.policy, self.q = self.construct_network()
+            self.network_params = tf.trainable_variables()
+            self.avg_policy, _ = self.construct_network()
+            self.target_network_params = tf.trainable_variables()[len(self.network_params):]
+
+        self.avg_policy = tf.stop_gradient(self.avg_policy)
+
+        # weighted average over q-values according to current policy gives the value of the state
+        self.value = tf.reduce_sum((self.q - self.temperature * tf.log(self.policy)) * self.policy, 1)
+
+        self.actions_onehot = self.actions
+        self.responsible_outputs = tf.reduce_sum(self.policy * self.actions_onehot, [1])
+        self.responsible_q = tf.reduce_sum(self.q * self.actions_onehot, [1])
+
+        # IS weights
+        self.responsible_mu = tf.reduce_sum(self.mu * self.actions_onehot, [1])
+        self.rho = self.responsible_outputs / self.responsible_mu
+        self.rho_all = self.policy / self.mu
+        self.rho_bar = tf.minimum(1., self.rho)
+        self.rho_bar_c = tf.minimum(self.c, self.rho)
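+        # rho = pi(a|s) / mu(a|s) is the per-step importance weight of the current
+        # policy w.r.t. the behaviour policy mu; rho_bar truncates it at 1 (used in
+        # the retrace target) and rho_bar_c truncates it at c (used in the policy
+        # gradient with bias correction below).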
+
+        self.q_ret = tf.placeholder(tf.float32, [None])
+
+
+        # step 1 from pawel
+        self.advantages_qret = self.q_ret - self.value
+        self.wrt_theta_step1 = -tf.reduce_sum(tf.log(self.responsible_outputs) * tf.stop_gradient(self.rho * self.advantages_qret))
+
+        # step 2 from pawel
+        self.wrt_theta = tf.reduce_sum(
+            tf.log(self.responsible_outputs) *
+            tf.stop_gradient(self.rho_bar_c * (self.advantages_qret - self.temperature * (1 + tf.log(self.responsible_outputs)))) +
+            tf.reduce_sum(tf.log(self.policy) *
+                          tf.stop_gradient(tf.maximum(0., 1. - self.c / self.rho_all) *
+                                           self.policy *
+                                           (self.q - tf.reshape(self.value, [-1, 1]) - self.temperature * (1 + tf.log(self.policy)))), [1]))
+
+        self.q_regularizer = tf.placeholder(tf.float32, [None])
+        if self.critic_regularizer_weight != 0:
+            self.wrt_theta_v = tf.reduce_sum(tf.square(self.q_ret - self.responsible_q)) + \
+                               self.critic_regularizer_weight * tf.reduce_sum(tf.square(self.q_regularizer - self.responsible_q))
+        else:
+            self.wrt_theta_v = tf.reduce_sum(tf.square(self.q_ret - self.responsible_q))
+
+        self.entropy = -tf.reduce_sum(self.policy * tf.log(self.policy))
+        #self.loss = self.wrt_theta_v + self.wrt_theta - self.entropy * 0.01
+
+        self.target_v = tf.placeholder(tf.float32, [None])
+        self.advantages = tf.placeholder(tf.float32, [None])
+        self.advantage_qret_diff = tf.reduce_mean(tf.square(self.advantages - self.advantages_qret))
+
+        self.q_loss = 0.5 * self.wrt_theta_v
+        self.policy_loss = -self.wrt_theta
+        self.entropy = - tf.reduce_sum(self.policy * tf.log(self.policy))
+        self.loss = self.q_loss + self.policy_loss - 0.01 * self.entropy
+
+        self.optimizer = tf.train.AdamOptimizer(self.learning_rate)
+        self.optimize = self.optimizer.minimize(self.loss)
+
+        # TRPO in theta-space
+        use_trpo = True  # can switch off TRPO here
+        self.value_gradients = self.optimizer.compute_gradients(self.q_loss)
+        self.entropy_gradients = self.optimizer.compute_gradients(-0.01 * self.entropy)
+        #self.behaviour_gradients = self.optimizer.compute_gradients(self.behaviour_loss)
+        self.g = self.optimizer.compute_gradients(-self.policy_loss)
+        self.kl = tf.reduce_sum(tf.reduce_sum(self.avg_policy * tf.log(self.avg_policy / self.policy), [1])) # this is total KL divergence, per batch
+        self.k = self.optimizer.compute_gradients(self.kl)
+        self.g = [(grad, var) for grad, var in self.g if grad is not None]
+        self.k = [(grad, var) for grad, var in self.k if grad is not None]
+        assert len(self.g) == len(self.k)
+        self.klprod = tf.reduce_sum([tf.reduce_sum(tf.reshape(k[0], [-1]) * tf.reshape(g[0], [-1])) for k, g in zip(self.k, self.g)])
+        self.klen = tf.reduce_sum([tf.reduce_sum(tf.reshape(k[0], [-1]) * tf.reshape(k[0], [-1])) for k, g in zip(self.k, self.g)])
+        self.trpo_scale = tf.maximum(0., (self.klprod - self.delta) / self.klen)
+        self.final_gradients = []
+        for i in range(len(self.g)):
+            if use_trpo:
+                self.final_gradients.append((-(self.g[i][0] - self.trpo_scale * self.k[i][0]), self.g[i][1])) # negative because this is loss
+            else:
+                self.final_gradients.append((-self.g[i][0], self.g[i][1])) # negative because this is loss
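+        # This mirrors ACER's efficient trust-region step: with g the policy gradient
+        # and k the gradient of KL(avg_policy || policy), the applied direction is
+        # g - max(0, (k.g - delta) / ||k||^2) * k, so the slowly-updated average
+        # policy limits how far a single update can move the current policy.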
+
+        if self.temperature == 0 and not self.noisy_acer:
+            self.optimize = [self.optimizer.apply_gradients(self.final_gradients),
+                             self.optimizer.apply_gradients(self.entropy_gradients),
+                             self.optimizer.apply_gradients(self.value_gradients)
+                             ]
+        else:
+            self.optimize = [self.optimizer.apply_gradients(self.final_gradients),
+                             self.optimizer.apply_gradients(self.value_gradients)
+                             ]
+
+        self.update_avg_theta = \
+            [self.target_network_params[i].assign(tf.multiply(self.network_params[i], 1. - self.alpha)
+                                                  + tf.multiply(self.target_network_params[i], self.alpha))
+             for i in range(len(self.target_network_params))]
+
+        self.saver = tf.train.Saver()
+
+    def construct_network(self):
+        W_fc1 = tf.Variable(tf.truncated_normal([self.s_dim, self.h1_size], stddev=0.01))
+        b_fc1 = tf.Variable(tf.zeros([self.h1_size]))
+        h_fc1 = tf.nn.relu(tf.matmul(self.inputs, W_fc1) + b_fc1)
+
+        W_h2 = tf.Variable(tf.truncated_normal([self.h1_size, self.h2_size], stddev=0.01))
+        b_h2 = tf.Variable(tf.zeros([self.h2_size]))
+        h_h2 = tf.nn.relu(tf.matmul(h_fc1, W_h2) + b_h2)
+
+        W_q = tf.Variable(tf.truncated_normal([self.h2_size, self.a_dim], stddev=0.01))
+        b_q = tf.Variable(tf.zeros([self.a_dim]))
+        q = tf.matmul(h_h2, W_q) + b_q
+
+        W_policy = tf.Variable(tf.truncated_normal([self.h2_size, self.a_dim], stddev=0.01))
+        b_policy = tf.Variable(tf.zeros([self.a_dim]))
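+        # execMask holds 0 for executable actions and a large negative value for
+        # masked ones, so adding it to the logits drives masked probabilities
+        # towards 0; the +0.00001 keeps log(policy) finite.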
+        policy = tf.nn.softmax(tf.matmul(h_h2, W_policy) + b_policy + self.execMask) + 0.00001
+
+        return policy, q
+
+    def construct_noisy_network(self):
+        self.mean_noisy_w = []
+        self.mean_noisy_b = []
+
+        h_fc1 = self.noisy_dense_layer(self.inputs, self.s_dim, self.h1_size, activation=tf.nn.relu)
+
+        h_h2 = self.noisy_dense_layer(h_fc1, self.h1_size, self.h2_size, activation=tf.nn.relu)
+        # Q function
+        q = self.noisy_dense_layer(h_h2, self.h2_size, self.a_dim)
+
+        policy = self.noisy_dense_layer(h_h2, self.h2_size, self.a_dim)
+        # prevent problem when calling log(self.policy)
+        policy = tf.nn.softmax(policy + self.execMask) + 0.00001
+
+        return policy, q
+
+    def getPolicy(self, inputs, execMask):
+        return self.sess.run([self.policy], feed_dict={
+            self.inputs: inputs,
+            self.execMask: execMask,
+        })
+
+    def train(self, inputs, actions, execMask, rewards, unflattened_inputs, unflattened_rewards, gamma, mu,
+              discounted_rewards, advantages, mu_values=None, behaviour_mask=None, critic_regularizer_output=None):
+        value, responsible_q, rho_bar, responsible_outputs = self.sess.run(
+            [self.value, self.responsible_q, self.rho_bar, self.responsible_outputs], feed_dict={
+            self.inputs: inputs,
+            self.actions: actions,
+            self.execMask: execMask,
+            self.mu: mu,
+        })
+
+        q_rets, offset = [], 0
+        #print >> sys.stderr, rho_bar[0], value[0], responsible_q[0]
+        for j in range(0, len(unflattened_inputs)):  # todo implement retrace for lambda other than one
+            q_ret, new_q_ret = [], 0
+            for i in range(len(unflattened_inputs[j])-1, -1, -1):
+                new_q_ret = rewards[offset+i] + gamma * new_q_ret
+                q_ret.append(new_q_ret)
+                new_q_ret = rho_bar[offset+i] * (new_q_ret - responsible_q[offset+i]) + value[offset+i]
+                #new_q_ret = value[offset+i] # debug
+            q_ret = list(reversed(q_ret))
+            q_rets.append(q_ret)
+            offset += len(unflattened_inputs[j])
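+        # Backwards retrace (lambda = 1) recursion per episode:
+        #   Q_ret(t) = r(t) + gamma * (rho_bar(t+1) * (Q_ret(t+1) - Q(s_{t+1}, a_{t+1}))
+        #                              + V(s_{t+1}))
+        # with the term in parentheses taken as 0 at the end of the episode.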
+
+        q_ret_flat = np.concatenate(np.array(q_rets), axis=0).tolist()
+
+        feed_dict = {
+            self.inputs: inputs,
+            self.actions: actions,
+            self.execMask: execMask,
+            self.mu: mu,
+            self.q_ret: q_ret_flat,
+            self.target_v: discounted_rewards,
+            self.advantages: advantages,
+            #self.mu_values: mu_values,
+            #self.behaviour_mask: behaviour_mask
+        }
+
+        if self.critic_regularizer_weight != 0:
+            feed_dict[self.q_regularizer] = critic_regularizer_output
+
+        trpo_scale, klprod, kl, diff, entropy, loss, optimize = self.sess.run([self.trpo_scale, self.klprod, self.kl, self.advantage_qret_diff, self.entropy, self.loss, self.optimize], feed_dict=feed_dict)
+        update_avg_theta = self.sess.run([self.update_avg_theta], feed_dict=feed_dict)
+
+        return loss, entropy, optimize
+
+    def predict_policy(self, inputs, execMask):
+        return self.sess.run(self.policy, feed_dict={
+            self.inputs: inputs,
+            self.execMask: execMask,
+        })
+
+    def compute_rho(self, inputs, actions, mu, mask):
+        return self.sess.run(self.rho,
+                             feed_dict={self.inputs: inputs, self.actions: actions, self.mu: mu, self.execMask: mask})
+
+    def compute_responsible_output(self, inputs, actions, mask):
+        return self.sess.run(self.responsible_outputs,
+                             feed_dict={self.inputs: inputs, self.actions: actions, self.execMask: mask})
+
+    def compute_responsible_q(self, inputs, actions, mask):
+        return self.sess.run(self.responsible_q,
+                             feed_dict={self.inputs: inputs, self.actions: actions, self.execMask: mask})
+
+    def predict_value(self, inputs, execMask):
+        return self.sess.run(self.value, feed_dict={
+            self.inputs: inputs,
+            self.execMask: execMask,
+        })
+
+    def predict_action_value(self, inputs, execMask):
+        return self.sess.run([self.policy, self.value], feed_dict={
+            self.inputs: inputs,
+            self.execMask: execMask,
+        })
+
+    def compute_mean_noisy(self):
+        return self.sess.run([self.mean_noisy_w, self.mean_noisy_b])
+
+    def noisy_dense_layer(self, input, input_neurons, output_neurons, activation=tf.identity):
+
+        W_mu = tf.Variable(tf.truncated_normal([input_neurons, output_neurons], stddev=0.01))
+        W_sigma = tf.Variable(tf.truncated_normal([input_neurons, output_neurons], stddev=0.01))
+        W_eps = tf.random_normal(shape=[input_neurons, output_neurons])
+        W = W_mu + tf.multiply(W_sigma, W_eps)
+
+        b_mu = tf.Variable(tf.zeros([output_neurons]))
+        b_sigma = tf.Variable(tf.zeros([output_neurons]))
+        b_eps = tf.random_normal(shape=[output_neurons])
+        b = b_mu + tf.multiply(b_sigma, b_eps)
+
+        self.mean_noisy_w.append(tf.reduce_mean(tf.abs(W_sigma)))
+        self.mean_noisy_b.append(tf.reduce_mean(tf.abs(b_sigma)))
+
+        return activation(tf.matmul(input, W) + b)
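+
+    # noisy_dense_layer appears to follow the NoisyNet idea with independent (rather
+    # than factorised) Gaussian noise: W = W_mu + W_sigma * eps, with eps resampled
+    # on every forward pass, so exploration comes from parameter noise; the mean
+    # |sigma| values are tracked only for monitoring.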
+
+    def load_network(self, load_filename):
+        self.saver = tf.train.Saver()
+        if load_filename.split('.')[-3] != '0':
+            try:
+                self.saver.restore(self.sess, load_filename)
+                print("Successfully loaded:", load_filename)
+            except:
+                print("Could not find old network weights")
+        else:
+            print('nothing loaded in first iteration')
+
+    def save_network(self, save_filename):
+        print('Saving acer-network...')
+        self.saver.save(self.sess, save_filename)
+
+
+class RNNACERNetwork(object):
+    def __init__(self, sess, si_state_dim, sd_state_dim, action_dim, learning_rate, delta, c, alpha, h1_size = 130, h2_size = 50, is_training = True, sd_enc_size=25,
+                                    si_enc_size=25, dropout_rate=0., tn='normal', slot='si'):
+        self.sess = sess
+        self.s_dim = si_state_dim + sd_state_dim
+        self.a_dim = action_dim
+        self.learning_rate = learning_rate
+        self.delta = delta
+        self.c = c
+        self.alpha = alpha
+        self.h1_size = h1_size
+        self.h2_size = h2_size
+        self.is_training = is_training
+        self.sd_dim = sd_state_dim
+        self.si_dim = si_state_dim
+        self.sd_enc_size = sd_enc_size
+
+        #Input and hidden layers
+        self.inputs = tf.placeholder(tf.float32, [None, self.s_dim])
+        self.actions = tf.placeholder(tf.float32, [None, self.a_dim])
+        self.execMask = tf.placeholder(tf.float32, [None, self.a_dim])
+
+        keep_prob = 1 - dropout_rate
+        sd_inputs, si_inputs = tf.split(self.inputs, [self.sd_dim, self.si_dim], 1)
+
+        if slot == 'sd':
+            sd_inputs = tf.reshape(sd_inputs, (tf.shape(sd_inputs)[0], 1, self.sd_dim))
+
+            # slots encoder
+            with tf.variable_scope(tn):
+                # try:
+                lstm_cell = tf.nn.rnn_cell.GRUCell(self.sd_enc_size)
+                if keep_prob < 1:
+                    lstm_cell = tf.nn.rnn_cell.DropoutWrapper(lstm_cell, output_keep_prob=keep_prob)
+                hidden_state = lstm_cell.zero_state(tf.shape(sd_inputs)[0], tf.float32)
+                _, h_sdfe = tf.nn.dynamic_rnn(lstm_cell, sd_inputs, initial_state=hidden_state)
+                # except:
+                #    lstm_cell = tf.contrib.rnn.GRUCell(self.sd_enc_size)
+                #    hidden_state = lstm_cell.zero_state(tf.shape(sd_inputs)[0], tf.float32)
+                #    _, h_sdfe = tf.contrib.rnn.dynamic_rnn(lstm_cell, sd_inputs, initial_state=hidden_state)
+            h1_inputs = tf.concat((si_inputs, h_sdfe), 1)
+        else:
+            '''W_sdfe = tf.Variable(tf.truncated_normal([self.sd_dim, sd_enc_size], stddev=0.01))
+            b_sdfe = tf.Variable(tf.zeros([sd_enc_size]))
+            h_sdfe = tf.nn.relu(tf.matmul(sd_inputs, W_sdfe) + b_sdfe)
+            if keep_prob < 1:
+                h_sdfe = tf.nn.dropout(h_sdfe, keep_prob)'''
+            h1_inputs = self.inputs
+
+        def construct_theta():
+            W_fc1 = tf.Variable(tf.truncated_normal([self.s_dim, self.h1_size], stddev=0.01))
+            b_fc1 = tf.Variable(0.0 * tf.ones([self.h1_size]))
+            if self.h2_size > 0:  # todo layer 2 should be shared between policy and q-function?
+                W_h2 = tf.Variable(tf.truncated_normal([self.h1_size, self.h2_size], stddev=0.01))
+                b_h2 = tf.Variable(0.0 * tf.ones([self.h2_size]))
+
+                W_q = tf.Variable(tf.truncated_normal([self.h2_size, self.a_dim], stddev=0.01))
+                b_q = tf.Variable(0.0 * tf.ones([self.a_dim]))
+                W_policy = tf.Variable(tf.truncated_normal([self.h2_size, self.a_dim], stddev=0.01))
+                b_policy = tf.Variable(0.0 * tf.ones([self.a_dim]))
+
+                theta = [W_fc1, b_fc1, W_h2, b_h2, W_q, b_q, W_policy, b_policy]
+            else:
+                W_q = tf.Variable(tf.truncated_normal([self.h1_size, self.a_dim], stddev=0.01))
+                b_q = tf.Variable(0.0 * tf.ones([self.a_dim]))
+                W_policy = tf.Variable(tf.truncated_normal([self.h1_size, self.a_dim], stddev=0.01))
+                b_policy = tf.Variable(0.0 * tf.ones([self.a_dim]))
+
+                theta = [W_fc1, b_fc1, W_q, b_q, W_policy, b_policy]
+            return theta
+
+        self.theta = construct_theta()
+        self.avg_theta = construct_theta()
+
+        def construct_network(theta):
+            if self.h2_size > 0:
+                W_fc1, b_fc1, W_h2, b_h2, W_q, b_q, W_policy, b_policy = theta
+            else:
+                W_fc1, b_fc1, W_q, b_q, W_policy, b_policy = theta
+
+            h_fc1 = tf.nn.relu(tf.matmul(h1_inputs, W_fc1) + b_fc1)
+
+            if self.h2_size > 0:
+                h_h2 = tf.nn.relu(tf.matmul(h_fc1, W_h2) + b_h2)
+                # Q function
+                q = tf.matmul(h_h2, W_q) + b_q
+                # prevent problem when calling log(self.policy)
+                policy = tf.nn.softmax(tf.matmul(h_h2, W_policy) + b_policy + self.execMask) + 0.00001
+            else:  # 1 hidden layer
+                # value function
+                q = tf.matmul(h_fc1, W_q) + b_q
+                # policy function
+                policy = tf.nn.softmax(tf.matmul(h_fc1, W_policy) + b_policy + self.execMask) + 0.00001
+            return policy, q
+
+        self.policy, self.q = construct_network(self.theta)
+        self.avg_policy, _ = construct_network(self.avg_theta)
+        self.avg_policy = tf.stop_gradient(self.avg_policy)
+
+        # weighted average over q-values according to current policy gives the value of the state
+        self.value = tf.reduce_sum(self.q * self.policy, 1)
+
+        self.actions_onehot = self.actions
+        self.responsible_outputs = tf.reduce_sum(self.policy * self.actions_onehot, [1])
+        self.responsible_q = tf.reduce_sum(self.q * self.actions_onehot, [1])
+
+        # IS weights
+        self.mu = tf.placeholder(tf.float32, [None, self.a_dim])
+        self.responsible_mu = tf.reduce_sum(self.mu * self.actions_onehot, [1])
+        self.rho = self.responsible_outputs / self.responsible_mu
+        self.rho_all = self.policy / self.mu
+        self.rho_bar = tf.minimum(1., self.rho)
+        self.rho_bar_c = tf.minimum(self.c, self.rho)
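+        # rho_bar = min(1, rho) is used for the Retrace targets; rho_bar_c = min(c, rho)
+        # truncates the weight in the policy-gradient term, with a bias correction over
+        # all actions added in wrt_theta below.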
+
+        self.q_ret = tf.placeholder(tf.float32, [None])
+
+        # step 1: plain importance-weighted policy gradient (kept for reference, not used in the final loss)
+        self.advantages_qret = self.q_ret - self.value
+        self.wrt_theta_step1 = -tf.reduce_sum(tf.log(self.responsible_outputs) * tf.stop_gradient(self.rho *  self.advantages_qret))
+
+        # step 2: ACER policy gradient with importance weights truncated at c plus a bias-correction term over all actions
+        self.wrt_theta = tf.reduce_sum(
+            tf.log(self.responsible_outputs) * tf.stop_gradient(self.rho_bar_c *  self.advantages_qret) +
+            tf.reduce_sum(tf.log(self.policy) *
+                          tf.stop_gradient(tf.maximum(0., 1. - self.c / self.rho_all) *
+                                           self.policy * (self.q - tf.reshape(self.value, [-1, 1]))), [1]))
+
+        self.wrt_theta_v = tf.reduce_sum(tf.square(self.q_ret - self.responsible_q))
+        self.entropy = -tf.reduce_sum(self.policy * tf.log(self.policy))
+        #self.loss = self.wrt_theta_v + self.wrt_theta - self.entropy * 0.01
+
+        self.target_v = tf.placeholder(tf.float32, [None])
+        self.advantages = tf.placeholder(tf.float32, [None])
+        self.advantage_qret_diff = tf.reduce_mean(tf.square(self.advantages - self.advantages_qret))
+
+        # DEBUG (A2C)
+        #self.value_loss = 0.5 * tf.reduce_sum(tf.square(self.target_v - tf.reshape(self.value, [-1]))) # original a2c
+        self.q_loss = 0.5 * self.wrt_theta_v
+        self.policy_loss = -self.wrt_theta
+        self.entropy = - tf.reduce_sum(self.policy * tf.log(self.policy))
+        self.loss = self.q_loss + self.policy_loss - 0.01 * self.entropy
+
+        self.optimizer = tf.train.AdamOptimizer(self.learning_rate)
+        self.optimize = self.optimizer.minimize(self.loss)
+
+        # TRPO in theta-space
+        use_trpo = True  # can switch off TRPO here
+        self.value_gradients = self.optimizer.compute_gradients(self.q_loss)
+        self.entropy_gradients = self.optimizer.compute_gradients(-0.01 * self.entropy)
+        self.g = self.optimizer.compute_gradients(-self.policy_loss)
+        self.kl = tf.reduce_sum(tf.reduce_sum(self.avg_policy * tf.log(self.avg_policy / self.policy), [1])) # this is total KL divergence, per batch
+        self.k = self.optimizer.compute_gradients(self.kl)
+        self.g = [(grad, var) for grad, var in self.g if grad is not None]
+        self.k = [(grad, var) for grad, var in self.k if grad is not None]
+        assert len(self.g) == len(self.k)
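+        # trust-region correction in parameter space: with g the policy-gradient
+        # ascent direction and k the gradient of KL(avg_policy || policy), g is
+        # replaced by g - max(0, (k.g - delta) / |k|^2) * k, which caps movement
+        # away from the average policy (negated below since the optimizer minimizes).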
+        self.klprod = tf.reduce_sum([tf.reduce_sum(tf.reshape(k[0], [-1]) * tf.reshape(g[0], [-1])) for k, g in zip(self.k, self.g)])
+        self.klen = tf.reduce_sum([tf.reduce_sum(tf.reshape(k[0], [-1]) * tf.reshape(k[0], [-1])) for k, g in zip(self.k, self.g)])
+        self.trpo_scale = tf.maximum(0., (self.klprod - self.delta) / self.klen)
+        self.final_gradients = []
+        for i in range(len(self.g)):
+            if use_trpo:
+                self.final_gradients.append((-(self.g[i][0] - self.trpo_scale * self.k[i][0]), self.g[i][1])) # negative because this is loss
+            else:
+                self.final_gradients.append((-self.g[i][0], self.g[i][1])) # negative because this is loss
+
+        self.optimize = [self.optimizer.apply_gradients(self.final_gradients),
+                         self.optimizer.apply_gradients(self.entropy_gradients),
+                         self.optimizer.apply_gradients(self.value_gradients)]
+
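+        # running average of the network parameters ("average policy network"):
+        # theta_avg <- alpha * theta_avg + (1 - alpha) * theta; the KL term above is
+        # measured against the policy produced by these averaged parameters.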
+        self.update_avg_theta = [avg_w.assign(self.alpha * avg_w + (1. - self.alpha) * w)
+                                 for avg_w, w in zip(self.avg_theta, self.theta)]
+
+
+    def getPolicy(self, inputs, execMask):
+        return self.sess.run([self.policy], feed_dict={
+            self.inputs: inputs,
+            self.execMask: execMask,
+        })
+
+    def train(self, inputs, actions, execMask, rewards, unflattened_inputs, unflattened_rewards, gamma, mu, discounted_rewards, advantages):
+        value, responsible_q, rho_bar, responsible_outputs = self.sess.run(
+            [self.value, self.responsible_q, self.rho_bar, self.responsible_outputs], feed_dict={
+            self.inputs: inputs,
+            self.actions: actions,
+            self.execMask: execMask,
+            self.mu: mu,
+        })
+
+        q_rets, offset = [], 0
+        #print >> sys.stderr, rho_bar[0], value[0], responsible_q[0]
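+        # Retrace(lambda=1) targets, computed backwards through each episode with a
+        # zero bootstrap at the terminal step:
+        #   Q_ret_i = r_i + gamma * ( rho_bar_{i+1} * (Q_ret_{i+1} - Q(s_{i+1}, a_{i+1})) + V(s_{i+1}) )
+        # where rho_bar = min(1, pi/mu) truncates the importance weight.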
+        for j in range(0, len(unflattened_inputs)):  # todo implement retrace for lambda other than one
+            q_ret, new_q_ret = [], 0
+            for i in range(len(unflattened_inputs[j])-1, -1, -1):
+                new_q_ret = rewards[offset+i] + gamma * new_q_ret
+                q_ret.append(new_q_ret)
+                new_q_ret = rho_bar[offset+i] * (new_q_ret - responsible_q[offset+i]) + value[offset+i]
+                #new_q_ret = value[offset+i] # debug
+            q_ret = list(reversed(q_ret))
+            q_rets.append(q_ret)
+            offset += len(unflattened_inputs[j])
+
+        q_ret_flat = np.concatenate(q_rets, axis=0).tolist()  # episodes have different lengths, so concatenate the plain lists
+
+        feed_dict = {
+            self.inputs: inputs,
+            self.actions: actions,
+            self.execMask: execMask,
+            self.mu: mu,
+            self.q_ret: q_ret_flat,
+            self.target_v: discounted_rewards,
+            self.advantages: advantages,
+        }
+
+        trpo_scale, klprod, kl, diff, entropy, loss, optimize = self.sess.run([self.trpo_scale, self.klprod, self.kl, self.advantage_qret_diff, self.entropy, self.loss, self.optimize], feed_dict=feed_dict)
+        update_avg_theta = self.sess.run([self.update_avg_theta], feed_dict=feed_dict)
+
+        return loss, entropy, optimize
+
+    def predict_policy(self, inputs, execMask):
+        return self.sess.run(self.policy, feed_dict={
+            self.inputs: inputs,
+            self.execMask: execMask,
+        })
+
+    def predict_value(self, inputs, execMask):
+        return self.sess.run(self.value, feed_dict={
+            self.inputs: inputs,
+            self.execMask: execMask,
+        })
+
+    def predict_action_value(self, inputs, execMask):
+        return self.sess.run([self.policy, self.value], feed_dict={
+            self.inputs: inputs,
+            self.execMask: execMask,
+        })
+
+    def load_network(self, load_filename):
+        self.saver = tf.train.Saver()
+        if load_filename.split('.')[-3] != '0':
+            try:
+                self.saver.restore(self.sess, load_filename)
+                print("Successfully loaded:", load_filename)
+            except Exception:
+                print("Could not find old network weights")
+        else:
+            print('nothing loaded in first iteration')
+
+    def save_network(self, save_filename):
+        print('Saving sacer-network...')
+        #self.saver = tf.train.Saver()
+        self.saver.save(self.sess, save_filename)
diff --git a/policy/feudalgainRL/noisydqn.py b/policy/feudalgainRL/noisydqn.py
new file mode 100644
index 0000000000000000000000000000000000000000..03bdb48dd6116ddbe3992390dcf0165dc4732da9
--- /dev/null
+++ b/policy/feudalgainRL/noisydqn.py
@@ -0,0 +1,632 @@
+###############################################################################
+# PyDial: Multi-domain Statistical Spoken Dialogue System Software
+###############################################################################
+#
+# Copyright 2015 - 2019
+# Cambridge University Engineering Department Dialogue Systems Group
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+###############################################################################
+
+"""
+Implementation of DQN -  Deep Q Network
+
+The algorithm is developed with tflearn + Tensorflow
+
+Author: Pei-Hao Su
+"""
+import tensorflow as tf
+
+# ===========================
+#   Deep Q Network
+# ===========================
+class DeepQNetwork(object):
+    """
+    Input to the network is the state and action, output is Q(s,a).
+    """
+    def __init__(self, sess, state_dim, action_dim, learning_rate, tau, num_actor_vars, minibatch_size=64,
+                 architecture='duel', h1_size=130, h2_size=50, dropout_rate=0.):
+        self.sess = sess
+        self.s_dim = state_dim
+        self.a_dim = action_dim
+        self.learning_rate = learning_rate
+        self.tau = tau
+        self.architecture = architecture
+        self.h1_size = h1_size
+        self.h2_size = h2_size
+        self.minibatch_size = minibatch_size
+
+        # Create the deep Q network
+        self.inputs, self.action, self.Qout = \
+                        self.create_ddq_network(self.architecture, self.h1_size, self.h2_size, dropout_rate=dropout_rate)
+        self.network_params = tf.trainable_variables()
+
+        # Target Network
+        self.target_inputs, self.target_action, self.target_Qout = \
+                        self.create_ddq_network(self.architecture, self.h1_size, self.h2_size, dropout_rate=dropout_rate)
+        self.target_network_params = tf.trainable_variables()[len(self.network_params):]
+
+        # Op for periodically updating target network
+        self.update_target_network_params = \
+            [self.target_network_params[i].assign(\
+                tf.multiply(self.network_params[i], self.tau) + tf.multiply(self.target_network_params[i], 1. - self.tau))
+                for i in range(len(self.target_network_params))]
+
+        # Network target (y_i)
+        self.sampled_q = tf.placeholder(tf.float32, [None, 1])
+        #self.temperature = tf.placeholder(shape=None,dtype=tf.float32)
+
+        # for Boltzman exploration
+        #self.softmax_Q = tf.nn.softmax(self.self.Qout/self.temperature)
+
+        # Predicted Q given state and chosen action
+        #actions_one_hot = tf.one_hot(self.action, self.a_dim, 1.0, 0.0, name='action_one_hot')
+        actions_one_hot = self.action
+
+        if architecture != 'dip':
+            self.pred_q = tf.reshape(tf.reduce_sum(self.Qout * actions_one_hot, axis=1, name='q_acted'),
+                                 [self.minibatch_size, 1])
+        else:
+            self.pred_q = self.Qout  # DIP case: Qout is already Q(s, a) for the fed action (untested)
+
+        #self.pred_q = tf.reduce_sum(self.Qout * actions_one_hot, reduction_indices=1, name='q_acted_target')
+
+        # double-DQN style target: target-network Q evaluated at argmax_a Q(s, a)
+        # (used by predict_target_with_action_maxQ below)
+        self.a_maxQ = tf.argmax(self.Qout, 1)
+        action_maxQ_one_hot = tf.one_hot(self.a_maxQ, self.a_dim, 1.0, 0.0, name='action_maxQ_one_hot')
+        self.action_maxQ_target = tf.reduce_sum(self.target_Qout * action_maxQ_one_hot, reduction_indices=1, name='a_maxQ_target')
+
+        # Define loss and optimization Op
+        self.diff = self.sampled_q - self.pred_q
+        self.loss = tf.reduce_mean(self.clipped_error(self.diff), name='loss')
+
+        self.optimizer = tf.train.AdamOptimizer(self.learning_rate)
+        self.optimize = self.optimizer.minimize(self.loss)
+
+        # gs = tf.gradients(self.loss, self.network_params)
+        # capped_gvs = [(tf.clip_by_value(grad, -3., 3.), var) for grad, var in zip(gs, self.network_params)]
+        #
+        # self.optimize = self.optimizer.apply_gradients(capped_gvs)
+
+    def create_ddq_network(self, architecture='duel', h1_size=130, h2_size=50, dropout_rate=0.):
+        keep_prob = 1 - dropout_rate
+        inputs = tf.placeholder(tf.float32, [None, self.s_dim])
+        action = tf.placeholder(tf.float32, [None, self.a_dim])
+
+        if architecture == 'duel':
+            W_fc1 = tf.Variable(tf.truncated_normal([self.s_dim, h1_size], stddev=0.01))
+            b_fc1 = tf.Variable(tf.zeros([h1_size]))
+            h_fc1 = tf.nn.relu(tf.matmul(inputs, W_fc1) + b_fc1)
+
+            # value function
+            W_value = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01))
+            b_value = tf.Variable(tf.zeros([h2_size]))
+            h_value = tf.nn.relu(tf.matmul(h_fc1, W_value) + b_value)
+
+            W_value = tf.Variable(tf.truncated_normal([h2_size, 1], stddev=0.01))
+            b_value = tf.Variable(tf.zeros([1]))
+            value_out = tf.matmul(h_value, W_value) + b_value
+
+            # advantage function
+            W_advantage = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01))
+            b_advantage = tf.Variable(tf.zeros([h2_size]))
+            h_advantage = tf.nn.relu(tf.matmul(h_fc1, W_advantage) + b_advantage)
+
+            W_advantage = tf.Variable(tf.truncated_normal([h2_size, self.a_dim], stddev=0.01))
+            b_advantage = tf.Variable(tf.zeros([self.a_dim]))
+            Advantage_out  = tf.matmul(h_advantage, W_advantage) + b_advantage
+
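+            # dueling aggregation: Q(s, a) = V(s) + A(s, a) - mean_a A(s, a)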
+            Qout = value_out + (Advantage_out - tf.reduce_mean(Advantage_out, axis=1, keep_dims=True))
+
+        elif architecture == 'noisy_duel':
+            print("WE USE DUEL NOISY ARCHITECTURE")
+            h_fc1 = self.noisy_dense_layer(inputs, self.s_dim, h1_size, activation=tf.nn.relu)
+            # value function
+            h_value = self.noisy_dense_layer(h_fc1, h1_size, h2_size, activation=tf.nn.relu)
+            value_out = self.noisy_dense_layer(h_value, h2_size, 1)
+
+            # advantage function
+            h_advantage = self.noisy_dense_layer(h_fc1, h1_size, h2_size, activation=tf.nn.relu)
+            Advantage_out = self.noisy_dense_layer(h_advantage, h2_size, self.a_dim)
+
+            Qout = value_out + (Advantage_out - tf.reduce_mean(Advantage_out, axis=1, keep_dims=True))
+
+        elif architecture == 'dip':
+
+            # state network
+            W_fc1_s = tf.Variable(tf.truncated_normal([self.s_dim, h1_size], stddev=0.01))
+            b_fc1_s = tf.Variable(tf.zeros([h1_size]))
+            h_fc1_s = tf.nn.relu(tf.matmul(inputs, W_fc1_s) + b_fc1_s)
+
+            # action network
+            W_fc1_a = tf.Variable(tf.truncated_normal([self.a_dim, h1_size], stddev=0.01))
+            b_fc1_a = tf.Variable(tf.zeros([h1_size]))
+            h_fc1_a = tf.nn.relu(tf.matmul(action, W_fc1_a) + b_fc1_a)
+
+            W_fc2_s = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01))
+            b_fc2_s = tf.Variable(tf.zeros([h2_size]))
+            h_fc2_s = tf.nn.relu(tf.matmul(h_fc1_s, W_fc2_s) + b_fc2_s)
+
+            W_fc2_a = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01))
+            b_fc2_a = tf.Variable(tf.zeros([h2_size]))
+            h_fc2_a = tf.nn.relu(tf.matmul(h_fc1_a, W_fc2_a) + b_fc2_a)
+
+            Qout = tf.reduce_sum(tf.multiply(h_fc2_s, h_fc2_a), 1)
+
+        else:
+            W_fc1 = tf.Variable(tf.truncated_normal([self.s_dim, h1_size], stddev=0.01))
+            b_fc1 = tf.Variable(tf.zeros([h1_size]))
+            h_fc1 = tf.nn.relu(tf.matmul(inputs, W_fc1) + b_fc1)
+            if keep_prob < 1:
+                h_fc1 = tf.nn.dropout(h_fc1, keep_prob)
+
+            W_fc2 = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01))
+            b_fc2 = tf.Variable(tf.zeros([h2_size]))
+            h_fc2 = tf.nn.relu(tf.matmul(h_fc1, W_fc2) + b_fc2)
+            if keep_prob < 1:
+                h_fc2 = tf.nn.dropout(h_fc2, keep_prob)
+
+            W_out = tf.Variable(tf.truncated_normal([h2_size, self.a_dim], stddev=0.01))
+            b_out = tf.Variable(tf.zeros([self.a_dim]))
+            Qout = tf.matmul(h_fc2, W_out) + b_out
+
+        return inputs, action, Qout
+
+    def noisy_dense_layer(self, input, input_neurons, output_neurons, activation=tf.identity):
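+        # NoisyNet-style dense layer: each weight is w = mu + sigma * eps with
+        # eps ~ N(0, 1) resampled on every forward pass, so exploration comes from
+        # the learned noise scale rather than from epsilon-greedy action selection.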
+
+        W_mu = tf.Variable(tf.truncated_normal([input_neurons, output_neurons], stddev=0.01))
+        W_sigma = tf.Variable(tf.truncated_normal([input_neurons, output_neurons], stddev=0.01))
+        W_eps = tf.random_normal(shape=[input_neurons, output_neurons])
+        W = W_mu + tf.multiply(W_sigma, W_eps)
+
+        b_mu = tf.Variable(tf.zeros([output_neurons]))
+        b_sigma = tf.Variable(tf.zeros([output_neurons]))
+        b_eps = tf.random_normal(shape=[output_neurons])
+        b = b_mu + tf.multiply(b_sigma, b_eps)
+
+        return activation(tf.matmul(input, W) + b)
+
+    def train(self, inputs, action, sampled_q):
+        return self.sess.run([self.pred_q, self.optimize, self.loss], feed_dict={
+            self.inputs: inputs,
+            self.action: action,
+            self.sampled_q: sampled_q
+        })
+
+
+    def predict(self, inputs):
+        return self.sess.run(self.Qout, feed_dict={
+            self.inputs: inputs
+        })
+
+    def predict_dip(self, inputs, action):
+        return self.sess.run(self.Qout, feed_dict={
+            self.inputs: inputs,
+            self.action: action
+        })
+
+    def predict_action(self, inputs):
+        return self.sess.run(self.pred_q, feed_dict={
+            self.inputs: inputs
+        })
+
+    def predict_target(self, inputs):
+        return self.sess.run(self.target_Qout, feed_dict={
+            self.target_inputs: inputs
+        })
+
+    def predict_target_dip(self, inputs, action):
+        return self.sess.run(self.target_Qout, feed_dict={
+            self.target_inputs: inputs,
+            self.target_action: action
+        })
+
+    def predict_target_with_action_maxQ(self, inputs):
+        return self.sess.run(self.action_maxQ_target, feed_dict={
+            self.target_inputs: inputs,
+            self.inputs: inputs
+        })
+
+    def update_target_network(self):
+        self.sess.run(self.update_target_network_params)  # soft-update the target network towards the online network
+
+    def load_network(self, load_filename):
+        self.saver = tf.train.Saver()
+        if load_filename.split('.')[-3] != '0':
+            try:
+                self.saver.restore(self.sess, './' + load_filename)
+                print("Successfully loaded:", load_filename)
+            except Exception:
+                print("Could not find old network weights")
+        else:
+            print('nothing loaded in first iteration')
+
+    def save_network(self, save_filename):
+        print('Saving deepq-network...')
+        self.saver.save(self.sess, './' + save_filename)
+
+    def clipped_error(self, x):
+        return tf.where(tf.abs(x) < 1.0, 0.5 * tf.square(x), tf.abs(x) - 0.5)  # Huber-style loss: quadratic for |x| < 1, linear beyond
+
+
+class NNFDeepQNetwork(object):
+    """
+    Input to the network is the state and action, output is Q(s,a).
+    """
+    def __init__(self, sess, si_state_dim, sd_state_dim, action_dim, learning_rate, tau, num_actor_vars, minibatch_size=64,
+                 architecture='duel', h1_size=130, h2_size=50, sd_enc_size=40, si_enc_size=80, dropout_rate=0.):
+        #super(NNFDeepQNetwork, self).__init__(sess, si_state_dim + sd_state_dim, action_dim, learning_rate, tau, num_actor_vars,
+        #                                      minibatch_size=64, architecture='duel', h1_size=130, h2_size=50)
+        self.sess = sess
+        self.si_dim = si_state_dim
+        self.sd_dim = sd_state_dim
+        self.s_dim = self.si_dim + self.sd_dim
+        self.a_dim = action_dim
+        self.learning_rate = learning_rate
+        self.tau = tau
+        self.architecture = architecture
+        self.h1_size = h1_size
+        self.h2_size = h2_size
+        self.minibatch_size = minibatch_size
+        self.sd_enc_size = sd_enc_size
+        self.si_enc_size = si_enc_size
+        self.dropout_rate = dropout_rate
+
+        # Create the deep Q network
+        self.inputs, self.action, self.Qout = \
+                        self.create_nnfdq_network(self.h1_size, self.h2_size, self.sd_enc_size, self.si_enc_size, self.dropout_rate)
+        self.network_params = tf.trainable_variables()
+
+        # Target Network
+        self.target_inputs, self.target_action, self.target_Qout = \
+                        self.create_nnfdq_network(self.h1_size, self.h2_size, self.sd_enc_size, self.si_enc_size, self.dropout_rate)
+        self.target_network_params = tf.trainable_variables()[len(self.network_params):]
+
+        # Op for periodically updating target network
+        self.update_target_network_params = \
+            [self.target_network_params[i].assign(tf.multiply(self.network_params[i], self.tau)
+                                                  + tf.multiply(self.target_network_params[i], 1. - self.tau))
+             for i in range(len(self.target_network_params))]
+
+        # Network target (y_i)
+        self.sampled_q = tf.placeholder(tf.float32, [None, 1])
+
+        # Predicted Q given state and chosen action
+        actions_one_hot = self.action
+
+        if architecture != 'dip':
+            self.pred_q = tf.reshape(tf.reduce_sum(self.Qout * actions_one_hot, axis=1, name='q_acted'),
+                                 [-1, 1])
+        else:
+            self.pred_q = self.Qout
+
+        # Define loss and optimization Op
+        self.diff = self.sampled_q - self.pred_q
+        self.loss = tf.reduce_mean(self.clipped_error(self.diff), name='loss')
+
+        self.optimizer = tf.train.AdamOptimizer(self.learning_rate)
+        self.optimize = self.optimizer.minimize(self.loss)
+
+    def create_nnfdq_network(self, h1_size=130, h2_size=50, sd_enc_size=40, si_enc_size=80, dropout_rate=0.):
+
+        keep_prob = 1 - dropout_rate
+        inputs = tf.placeholder(tf.float32, [None, self.s_dim])
+        action = tf.placeholder(tf.float32, [None, self.a_dim])
+
+        if self.architecture == 'duel':
+            print("WE USE THE DUELING ARCHITECTURE!")
+            W_fc1 = tf.Variable(tf.truncated_normal([self.s_dim, h1_size], stddev=0.01))
+            b_fc1 = tf.Variable(tf.zeros([h1_size]))
+            h_fc1 = tf.nn.relu(tf.matmul(inputs, W_fc1) + b_fc1)
+
+            # value function
+            W_value = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01))
+            b_value = tf.Variable(tf.zeros([h2_size]))
+            h_value = tf.nn.relu(tf.matmul(h_fc1, W_value) + b_value)
+
+            W_value = tf.Variable(tf.truncated_normal([h2_size, 1], stddev=0.01))
+            b_value = tf.Variable(tf.zeros([1]))
+            value_out = tf.matmul(h_value, W_value) + b_value
+
+            # advantage function
+            W_advantage = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01))
+            b_advantage = tf.Variable(tf.zeros([h2_size]))
+            h_advantage = tf.nn.relu(tf.matmul(h_fc1, W_advantage) + b_advantage)
+
+            W_advantage = tf.Variable(tf.truncated_normal([h2_size, self.a_dim], stddev=0.01))
+            b_advantage = tf.Variable(tf.zeros([self.a_dim]))
+            Advantage_out = tf.matmul(h_advantage, W_advantage) + b_advantage
+
+            Qout = value_out + (Advantage_out - tf.reduce_mean(Advantage_out, axis=1, keep_dims=True))
+
+        elif self.architecture == 'noisy_duel':
+            print("WE USE THE NOISY DUELING ARCHITECTURE!")
+            self.mean_noisy_w = []
+            self.mean_noisy_b = []
+            h_fc1 = self.noisy_dense_layer(inputs, self.s_dim, h1_size, activation=tf.nn.relu)
+            # value function
+            h_value = self.noisy_dense_layer(h_fc1, h1_size, h2_size, activation=tf.nn.relu)
+            value_out = self.noisy_dense_layer(h_value, h2_size, 1)
+
+            # advantage function
+            h_advantage = self.noisy_dense_layer(h_fc1, h1_size, h2_size, activation=tf.nn.relu)
+            Advantage_out = self.noisy_dense_layer(h_advantage, h2_size, self.a_dim)
+
+            Qout = value_out + (Advantage_out - tf.reduce_mean(Advantage_out, axis=1, keep_dims=True))
+
+        else:
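+            # plain (non-dueling) feudal architecture: slot-dependent and
+            # slot-independent features are encoded by separate dense layers before
+            # the joint Q-head; the placeholders are re-created here so the state
+            # can be split into the two parts.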
+            inputs = tf.placeholder(tf.float32, [None, self.sd_dim + self.si_dim])
+            keep_prob = 1 - dropout_rate
+            sd_inputs, si_inputs = tf.split(inputs, [self.sd_dim, self.si_dim], 1)
+            action = tf.placeholder(tf.float32, [None, self.a_dim])
+
+            W_sdfe = tf.Variable(tf.truncated_normal([self.sd_dim, sd_enc_size], stddev=0.01))
+            b_sdfe = tf.Variable(tf.zeros([sd_enc_size]))
+            h_sdfe = tf.nn.relu(tf.matmul(sd_inputs, W_sdfe) + b_sdfe)
+            if keep_prob < 1:
+                h_sdfe = tf.nn.dropout(h_sdfe, keep_prob)
+
+            W_sife = tf.Variable(tf.truncated_normal([self.si_dim, si_enc_size], stddev=0.01))
+            b_sife = tf.Variable(tf.zeros([si_enc_size]))
+            h_sife = tf.nn.relu(tf.matmul(si_inputs, W_sife) + b_sife)
+            if keep_prob < 1:
+                h_sife = tf.nn.dropout(h_sife, keep_prob)
+
+            W_fc1 = tf.Variable(tf.truncated_normal([sd_enc_size+si_enc_size, h1_size], stddev=0.01))
+            b_fc1 = tf.Variable(tf.zeros([h1_size]))
+            h_fc1 = tf.nn.relu(tf.matmul(tf.concat((h_sdfe, h_sife), 1), W_fc1) + b_fc1)
+
+            W_fc2 = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01))
+            b_fc2 = tf.Variable(tf.zeros([h2_size]))
+            h_fc2 = tf.nn.relu(tf.matmul(h_fc1, W_fc2) + b_fc2)
+
+            W_out = tf.Variable(tf.truncated_normal([h2_size, self.a_dim], stddev=0.01))
+            b_out = tf.Variable(tf.zeros([self.a_dim]))
+            Qout = tf.matmul(h_fc2, W_out) + b_out
+
+        return inputs, action, Qout
+
+    def predict(self, inputs):
+        return self.sess.run(self.Qout, feed_dict={  # inputs is a single flat_bstate (flattened belief state)
+            self.inputs: inputs
+        })
+
+    def predict_dip(self, inputs, action):
+        return self.sess.run(self.Qout, feed_dict={  # inputs and action are minibatch arrays (batch size 64)
+            self.inputs: inputs,
+            self.action: action
+        })
+
+    def predict_target(self, inputs):
+        return self.sess.run(self.target_Qout, feed_dict={  # inputs is a single flat_bstate (flattened belief state)
+            self.target_inputs: inputs
+        })
+
+    def predict_target_dip(self, inputs, action):
+        return self.sess.run(self.target_Qout, feed_dict={  # inputs and action are minibatch arrays (batch size 64)
+            self.target_inputs: inputs,
+            self.target_action: action
+        })
+
+    def train(self, inputs, action, sampled_q):
+        return self.sess.run([self.pred_q, self.optimize, self.loss], feed_dict={ #all the inputs are arrays of 64
+            self.inputs: inputs,
+            self.action: action,
+            self.sampled_q: sampled_q
+        })
+
+    def compute_loss(self, inputs, action, sampled_q):
+
+        return self.sess.run(self.loss, feed_dict={
+            self.inputs: inputs,
+            self.action: action,
+            self.sampled_q: sampled_q
+        })
+
+    def clipped_error(self, x):
+        return tf.where(tf.abs(x) < 1.0, 0.5 * tf.square(x), tf.abs(x) - 0.5)  # Huber-style loss: quadratic for |x| < 1, linear beyond
+
+    def save_network(self, save_filename):
+        print('Saving deepq-network...')
+        self.saver.save(self.sess, './' +  save_filename)
+
+    def update_target_network(self):
+        self.sess.run(self.update_target_network_params)
+
+    def load_network(self, load_filename):
+        self.saver = tf.train.Saver()
+        if load_filename.split('.')[-3] != '0':
+            try:
+                self.saver.restore(self.sess, './' + load_filename)
+                print("Successfully loaded:", load_filename)
+            except Exception:
+                print("Could not find old network weights")
+        else:
+            print('nothing loaded in first iteration')
+
+    def compute_mean_noisy(self):
+        return self.sess.run([self.mean_noisy_w, self.mean_noisy_b])
+
+    def noisy_dense_layer(self, input, input_neurons, output_neurons, activation=tf.identity):
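+        # NoisyNet-style dense layer (see DeepQNetwork.noisy_dense_layer); in addition,
+        # the mean |sigma| of weights and biases is recorded so the amount of learned
+        # noise can be monitored via compute_mean_noisy().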
+
+        W_mu = tf.Variable(tf.truncated_normal([input_neurons, output_neurons], stddev=0.01))
+        W_sigma = tf.Variable(tf.truncated_normal([input_neurons, output_neurons], stddev=0.01))
+        W_eps = tf.random_normal(shape=[input_neurons, output_neurons])
+        W = W_mu + tf.multiply(W_sigma, W_eps)
+
+        b_mu = tf.Variable(tf.zeros([output_neurons]))
+        b_sigma = tf.Variable(tf.zeros([output_neurons]))
+        b_eps = tf.random_normal(shape=[output_neurons])
+        b = b_mu + tf.multiply(b_sigma, b_eps)
+
+        self.mean_noisy_w.append(tf.reduce_mean(tf.abs(W_sigma)))
+        self.mean_noisy_b.append(tf.reduce_mean(tf.abs(b_sigma)))
+
+        return activation(tf.matmul(input, W) + b)
+
+class RNNFDeepQNetwork(object):
+    """
+    Input to the network is the state and action, output is Q(s,a).
+    """
+    def __init__(self, sess, si_state_dim, sd_state_dim, action_dim, learning_rate, tau, num_actor_vars, minibatch_size=64,
+                 architecture='duel', h1_size=130, h2_size=50, sd_enc_size=40, si_enc_size=80, dropout_rate=0., slot='si'):
+        #super(NNFDeepQNetwork, self).__init__(sess, si_state_dim + sd_state_dim, action_dim, learning_rate, tau, num_actor_vars,
+        #                                      minibatch_size=64, architecture='duel', h1_size=130, h2_size=50)
+        self.sess = sess
+        self.si_dim = si_state_dim
+        self.sd_dim = sd_state_dim
+        self.a_dim = action_dim
+        self.learning_rate = learning_rate
+        self.tau = tau
+        self.architecture = architecture
+        self.h1_size = h1_size
+        self.h2_size = h2_size
+        self.minibatch_size = minibatch_size
+        self.sd_enc_size = sd_enc_size
+        self.si_enc_size = si_enc_size
+        self.dropout_rate = dropout_rate
+
+        # Create the deep Q network
+        self.inputs, self.action, self.Qout = \
+                        self.create_rnnfdq_network(self.h1_size, self.h2_size, self.sd_enc_size, self.si_enc_size, self.dropout_rate, slot=slot)
+        self.network_params = tf.trainable_variables()
+
+        # Target Network
+        self.target_inputs, self.target_action, self.target_Qout = \
+                        self.create_rnnfdq_network(self.h1_size, self.h2_size, self.sd_enc_size, self.si_enc_size, self.dropout_rate, tn='target', slot=slot)
+        self.target_network_params = tf.trainable_variables()[len(self.network_params):]
+
+        # Op for periodically updating target network
+        self.update_target_network_params = \
+            [self.target_network_params[i].assign(tf.multiply(self.network_params[i], self.tau)
+                                                  + tf.multiply(self.target_network_params[i], 1. - self.tau))
+             for i in range(len(self.target_network_params))]
+
+        # Network target (y_i)
+        self.sampled_q = tf.placeholder(tf.float32, [None, 1])
+
+        # Predicted Q given state and chosen action
+        actions_one_hot = self.action
+
+        if architecture != 'dip':
+            self.pred_q = tf.reshape(tf.reduce_sum(self.Qout * actions_one_hot, axis=1, name='q_acted'),
+                                 [self.minibatch_size, 1])
+        else:
+            self.pred_q = self.Qout
+
+        # Define loss and optimization Op
+        self.diff = self.sampled_q - self.pred_q
+        self.loss = tf.reduce_mean(self.clipped_error(self.diff), name='loss')
+
+        self.optimizer = tf.train.AdamOptimizer(self.learning_rate)
+        self.optimize = self.optimizer.minimize(self.loss)
+
+    #def create_slot_encoder(self):
+
+
+    def create_rnnfdq_network(self, h1_size=130, h2_size=50, sd_enc_size=40, si_enc_size=80, dropout_rate=0.,
+                              tn='normal', slot='si'):
+        inputs = tf.placeholder(tf.float32, [None, self.sd_dim + self.si_dim])
+        keep_prob = 1 - dropout_rate
+        sd_inputs, si_inputs = tf.split(inputs, [self.sd_dim, self.si_dim], 1)
+        action = tf.placeholder(tf.float32, [None, self.a_dim])
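+        # For slot == 'sd' the slot-dependent features are fed as a length-1 sequence
+        # to a GRU encoder (scoped by `tn` so online and target networks get separate
+        # variables); otherwise a dense encoder is used. Slot-independent features
+        # always go through a dense encoder before the joint Q-head.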
+        if slot == 'sd':
+            sd_inputs = tf.reshape(sd_inputs, (tf.shape(sd_inputs)[0], 1, self.sd_dim))
+
+            # slot-dependent feature encoder: run a GRU over the slot-dependent part of the state
+            with tf.variable_scope(tn):
+                lstm_cell = tf.nn.rnn_cell.GRUCell(self.sd_enc_size)
+                hidden_state = lstm_cell.zero_state(tf.shape(sd_inputs)[0], tf.float32)
+                _, h_sdfe = tf.nn.dynamic_rnn(lstm_cell, sd_inputs, initial_state=hidden_state)
+                # alternative using tf.contrib, kept commented out:
+                # lstm_cell = tf.contrib.rnn.GRUCell(self.sd_enc_size)
+                # hidden_state = lstm_cell.zero_state(tf.shape(sd_inputs)[0], tf.float32)
+                # _, h_sdfe = tf.contrib.rnn.dynamic_rnn(lstm_cell, sd_inputs, initial_state=hidden_state)
+        else:
+            W_sdfe = tf.Variable(tf.truncated_normal([self.sd_dim, sd_enc_size], stddev=0.01))
+            b_sdfe = tf.Variable(tf.zeros([sd_enc_size]))
+            h_sdfe = tf.nn.relu(tf.matmul(sd_inputs, W_sdfe) + b_sdfe)
+            if keep_prob < 1:
+                h_sdfe = tf.nn.dropout(h_sdfe, keep_prob)
+
+        W_sife = tf.Variable(tf.truncated_normal([self.si_dim, si_enc_size], stddev=0.01))
+        b_sife = tf.Variable(tf.zeros([si_enc_size]))
+        h_sife = tf.nn.relu(tf.matmul(si_inputs, W_sife) + b_sife)
+        if keep_prob < 1:
+            h_sife = tf.nn.dropout(h_sife, keep_prob)
+
+        W_fc1 = tf.Variable(tf.truncated_normal([sd_enc_size+si_enc_size, h1_size], stddev=0.01))
+        b_fc1 = tf.Variable(tf.zeros([h1_size]))
+        h_fc1 = tf.nn.relu(tf.matmul(tf.concat((h_sdfe, h_sife), 1), W_fc1) + b_fc1)
+
+        W_fc2 = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01))
+        b_fc2 = tf.Variable(tf.zeros([h2_size]))
+        h_fc2 = tf.nn.relu(tf.matmul(h_fc1, W_fc2) + b_fc2)
+
+        W_out = tf.Variable(tf.truncated_normal([h2_size, self.a_dim], stddev=0.01))
+        b_out = tf.Variable(tf.zeros([self.a_dim]))
+        Qout = tf.matmul(h_fc2, W_out) + b_out
+
+        return inputs, action, Qout
+
+    def predict(self, inputs):
+        return self.sess.run(self.Qout, feed_dict={  # inputs is a single flat_bstate (flattened belief state)
+            self.inputs: inputs
+        })
+
+    def predict_dip(self, inputs, action):
+        return self.sess.run(self.Qout, feed_dict={  # inputs and action are minibatch arrays (batch size 64)
+            self.inputs: inputs,
+            self.action: action
+        })
+
+    def predict_target(self, inputs):
+        return self.sess.run(self.target_Qout, feed_dict={  # inputs is a single flat_bstate (flattened belief state)
+            self.target_inputs: inputs
+        })
+
+    def predict_target_dip(self, inputs, action):
+        return self.sess.run(self.target_Qout, feed_dict={  # inputs and action are minibatch arrays (batch size 64)
+            self.target_inputs: inputs,
+            self.target_action: action
+        })
+
+    def train(self, inputs, action, sampled_q):
+        return self.sess.run([self.pred_q, self.optimize, self.loss], feed_dict={ #all the inputs are arrays of 64
+            self.inputs: inputs,
+            self.action: action,
+            self.sampled_q: sampled_q
+        })
+
+    def clipped_error(self, x):
+        return tf.where(tf.abs(x) < 1.0, 0.5 * tf.square(x), tf.abs(x) - 0.5)  # Huber-style loss: quadratic for |x| < 1, linear beyond
+
+    def save_network(self, save_filename):
+        print('Saving deepq-network...')
+        self.saver.save(self.sess, save_filename)
+
+    def update_target_network(self):
+        self.sess.run(self.update_target_network_params)
+
+    def load_network(self, load_filename):
+        self.saver = tf.train.Saver()
+        if load_filename.split('.')[-3] != '0':
+            try:
+                self.saver.restore(self.sess, load_filename)
+                print("Successfully loaded:", load_filename)
+            except Exception:
+                print("Could not find old network weights")
+        else:
+            print('nothing loaded in first iteration')