diff --git a/policy/FeudalGainPolicy.py b/policy/FeudalGainPolicy.py
index 9a767ab2be8993f9ab07668968d9ba81535978f0..5ea4b20fc4f741e665315ca0b858e4e3b8a59984 100644
--- a/policy/FeudalGainPolicy.py
+++ b/policy/FeudalGainPolicy.py
@@ -21,6 +21,14 @@
 ###############################################################################
 
 
+"""
+Implementation of FeudalGain
+
+Paper: arXiv reference
+
+Author: Christian Geishauser
+"""
+
 import numpy as np
 import random
 import utils
@@ -96,9 +104,6 @@ class FeudalGainPolicy(Policy.Policy):
         if cfg.has_option('feudalpolicy', 'info_reward_master'):
             self.info_reward_master = cfg.getfloat('feudalpolicy', 'info_reward_master')
             print("Master policy trains with info_gain reward")
-        self.js_threshold_master = 1.0
-        if cfg.has_option('feudalpolicy', 'js_threshold_master'):
-            self.js_threshold_master = cfg.getfloat('feudalpolicy', 'js_threshold_master')
         self.only_master = False
         if cfg.has_option('feudalpolicy', 'only_master'):
             self.only_master = cfg.getboolean('feudalpolicy', 'only_master')
@@ -108,16 +113,6 @@ class FeudalGainPolicy(Policy.Policy):
         self.bye_mask = False
         if cfg.has_option('summaryacts', 'byemask'):
             self.bye_mask = cfg.getboolean('summaryacts', 'byemask')
-            print("WE USE BYEMASK: ", self.bye_mask)
-
-        self.critic_regularizer_path = None
-        if cfg.has_option('policy', 'critic_regularizer'):
-            self.critic_regularizer_path = cfg.get('policy', 'critic_regularizer')
-            print(f"We use {self.critic_regularizer_path} as a critic regularizer.")
-
-        self.critic_regularizer_weight = 0
-        if cfg.has_option('policy', 'critic_regularizer_weight'):
-            self.critic_regularizer_weight = cfg.getfloat('policy', 'critic_regularizer_weight')
 
         self.randomseed = 1234
         if cfg.has_option('GENERAL', 'seed'):
@@ -150,51 +145,26 @@ class FeudalGainPolicy(Policy.Policy):
         self.chosen = False
 
         if self.only_master:
-            print("Using ACER with merged policy.")
+            print("Using merged policy pi_mg")
             self.master_actions = self.slot_independent_actions[:-1] + ['slot_dep']
             self.master_policy = FeudalNoisyACERPolicy(self._modify_policyfile('master', in_policy_file),
                                                    self._modify_policyfile('master', out_policy_file),
                                                    domainString=self.domainString, is_training=self.is_training,
                                                    action_names=self.master_actions, sd_state_dim=self.probability_max,
-                                                   slot='si', js_threshold=self.js_threshold_master,
-                                                   info_reward=self.info_reward_master, load_policy=self.load_master_policy,
-                                                   critic_regularizer_weight=self.critic_regularizer_weight)
+                                                   slot='si', load_policy=self.load_master_policy)
 
         elif self.si_policy_type == 'acer':
-            print("Using ACER with give_info and master_policy.")
+            print("Using policies pi_m and pi_g")
             self.master_policy = FeudalNoisyACERPolicy(self._modify_policyfile('master', in_policy_file),
                                                   self._modify_policyfile('master', out_policy_file),
                                                   domainString=self.domainString, is_training=self.is_training,
                                                   action_names=self.master_actions, sd_state_dim=self.probability_max,
-                                                  slot='si', js_threshold=self.js_threshold_master,
-                                                   info_reward=self.info_reward_master)
+                                                  slot='si')
             self.give_info_policy = FeudalNoisyACERPolicy(self._modify_policyfile('gi', in_policy_file),
                                                      self._modify_policyfile('gi', out_policy_file),
                                                      domainString=self.domainString, is_training=self.is_training,
                                                      action_names=self.slot_independent_actions, slot='si',
                                                      sd_state_dim=self.probability_max)
-        elif self.si_policy_type == 'dqn':
-            self.master_policy = FeudalNoisyACERPolicy(self._modify_policyfile('master', in_policy_file),
-                                                  self._modify_policyfile('master', out_policy_file),
-                                                  domainString=self.domainString, is_training=self.is_training,
-                                                  action_names=self.master_actions, sd_state_dim=self.probability_max,
-                                                  slot='si')
-            self.give_info_policy = FeudalDQNPolicy(self._modify_policyfile('gi', in_policy_file),
-                                                       self._modify_policyfile('gi', out_policy_file),
-                                                       domainString=self.domainString, is_training=self.is_training,
-                                                       action_names=self.slot_independent_actions, slot='si',
-                                                       sd_state_dim=0)
-
-        else:
-            self.master_policy = FeudalDQNPolicy(self._modify_policyfile('master', in_policy_file),
-                                                 self._modify_policyfile('master', out_policy_file),
-                                                     domainString=self.domainString, is_training=self.is_training,
-                                                     action_names=self.master_actions,
-                                                 slot='si')#pass is always masked, but its needed for implementation
-            self.give_info_policy = FeudalDQNPolicy(self._modify_policyfile('gi', in_policy_file),
-                                                    self._modify_policyfile('gi', out_policy_file),
-                                                     domainString=self.domainString, is_training=self.is_training,
-                                                     action_names=self.slot_independent_actions, slot='si')
 
         self.request_info_policy = FeudalDQNPolicy(self._modify_policyfile('ri', in_policy_file),
                                                    self._modify_policyfile('ri', out_policy_file),
diff --git a/policy/MasterAction.py b/policy/MasterAction.py
new file mode 100644
index 0000000000000000000000000000000000000000..74309722fb1ede44e334d4b164116a10cae0dd51
--- /dev/null
+++ b/policy/MasterAction.py
@@ -0,0 +1,381 @@
+###############################################################################
+# PyDial: Multi-domain Statistical Spoken Dialogue System Software
+###############################################################################
+#
+# Copyright 2015 - 2019
+# Cambridge University Engineering Department Dialogue Systems Group
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+###############################################################################
+
+'''
+MasterAction.py - Mapping between summary and master actions
+=============================================================
+
+Copyright CUED Dialogue Systems Group 2015 - 2017
+
+.. seealso:: CUED Imports/Dependencies:
+
+    import :mod:`policy.SummaryUtils` |.|
+    import :mod:`ontology.Ontology` |.|
+    import :mod:`utils.ContextLogger` |.|
+    import :mod:`utils.Settings`
+
+************************
+
+'''
+
+__author__ = "Christian Geishauser"
+
+from policy import SummaryUtils
+from utils import ContextLogger, Settings
+from ontology import Ontology
+
+logger = ContextLogger.getLogger('')
+
+MAX_NUM_ACCEPTED = 10
+
+
+class MasterAction(object):
+    '''
+    The master action class encapsulates the functionality of a master action.
+
+    .. Note::
+        The list of all possible master actions is defined in this class.
+    '''
+
+    def __init__(self, domainString, empty=False, confreq=False):
+        '''
+        Records what domain the class is instantiated for, and what actions are available
+
+        :param domainString: domain tag
+        :type domainString: string
+        :param empty: if True, the action set is left empty instead of being populated from the ontology
+        :type empty: bool
+        :param confreq: whether confreq actions are included in the action set
+        :type confreq: bool
+        '''
+
+        self.domainString = domainString
+        self.summary_action_names = []
+        self.inform_names = []
+        self.action_names = []
+        self._array_slot_summary = None
+        self._global_summary = None
+
+        self.inform_mask = False
+        if Settings.config.has_option("summaryacts", "informmask"):
+            self.inform_mask = Settings.config.getboolean('summaryacts', 'informmask')
+        self.inform_count_accepted = 4
+        if Settings.config.has_option("summaryacts", "informcountaccepted"):
+            self.inform_count_accepted = Settings.config.getint('summaryacts', 'informcountaccepted')
+        elif Settings.config.has_option("goalgenerator", "maxconstraints"):
+            self.inform_count_accepted = Settings.config.getint('goalgenerator', 'maxconstraints') + 1
+        self.request_mask = False
+        if Settings.config.has_option("summaryacts", "requestmask"):
+            self.request_mask = Settings.config.getboolean('summaryacts', 'requestmask')
+        self.bye_mask = False
+        if Settings.config.has_option("summaryacts", "byemask"):
+            self.bye_mask = Settings.config.getboolean('summaryacts', 'byemask')
+
+        if not empty:
+            for slot in Ontology.global_ontology.get_system_requestable_slots(domainString):
+                self.summary_action_names.append("request_" + slot)
+                self.summary_action_names.append("confirm_" + slot)
+                self.summary_action_names.append("select_" + slot)
+                if confreq:
+                    for slot2 in Ontology.global_ontology.get_system_requestable_slots(domainString):
+                        self.summary_action_names.append("confreq_" + slot + "_" + slot2)
+            self.summary_action_names += ["inform",
+                                  "inform_byname",
+                                  "inform_alternatives",
+                                  "bye",
+                                  "repeat",
+                                  "reqmore",
+                                  "restart"
+                                  ]
+
+            informable_slots = Ontology.global_ontology.get_requestable_slots(domainString)
+            informable_slots.remove("name")
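+            # Enumerate every subset of the informable slots: index i is read as a
+            # reversed bit mask over informable_slots, e.g. with hypothetical slots
+            # ['area', 'food'], i=3 yields '-area-food' and i=0 yields '' (plain inform).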
+            for i in range(0, 2**len(informable_slots)):
+                slots_to_inform = ""
+                #get binary representation and reverse it
+                binary_rep = format(i, "b")[::-1]
+                for index, j in enumerate(binary_rep):
+                    if int(j) == 1:
+                        slots_to_inform += f"-{informable_slots[index]}"
+
+                self.inform_names.append(slots_to_inform)
+
+            self.inform_ways = ["inform", "inform_byname", "inform_alternatives"]
+
+            for inform_way in self.inform_ways:
+                for slots_to_inform in self.inform_names:
+                    self.action_names.append(inform_way + slots_to_inform)
+
+            for name in self.summary_action_names:
+                if not name.startswith("inform"):
+                    self.action_names.append(name)
+
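+            # The master action space thus contains len(self.inform_ways) * 2**len(informable_slots)
+            # inform actions plus the non-inform summary actions (request/confirm/select per
+            # system-requestable slot, confreq if enabled, bye, repeat, reqmore, restart).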
+            print("WE WORK IN MASTER ACTION SPACE DIRECTLY!")
+            print("NUMBER OF POSSIBLE MASTER ACTIONS: ", len(self.action_names))
+            print("NOTE: MASKING IS NOT IMPLEMENTED YET!")
+
+        self.reset()
+
+    def reset(self):
+        self.alternatives_requested = False
+
+    def Convert(self, belief, action, lastSystemAction=None):
+        '''
+        Converts the given summary action into a master action based on the current belief and the last system action.
+
+        :param belief: the current master belief
+        :type belief: dict
+        :param action: the summary action to be converted to master action
+        :type action: string
+        :param lastSystemAction: the system action of the previous turn
+        :type lastSystemAction: string
+        '''
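+        # Example (hypothetical slot names): an action such as 'inform_byname-area-phone'
+        # is split on '-' and informs the area and phone slots of the entity in focus;
+        # the concrete dialogue act string is built by the getInform* helpers below.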
+
+        self._array_slot_summary = SummaryUtils.arraySlotSummary(belief, self.domainString)
+        self._global_summary = SummaryUtils.globalSummary(belief, self.domainString)
+        logger.dial('system summary act: {}.'.format(action))
+
+        if action.startswith("inform"):
+            if "byname" in action:
+                output = self.getInformByName(belief, action)
+            elif "alternatives" in action:
+                output = self.getInformAlternatives(belief, action)
+            else:
+                # otherwise, only inform by constraints is possible here
+                output = self.getInformByConstraints(belief, action)
+        elif "request_" in action:
+            output = self.getRequest(action.split("_")[1])
+        elif "select_" in action:
+            output = self.getSelect(action.split("_")[1])
+        elif "confirm_" in action:
+            output = self.getConfirm(action.split("_")[1])
+        elif "confreq_" in action:
+            output = self.getConfReq(action.split("_")[1], action.split("_")[2])
+        elif action == "bye":
+            output = self.getBye()
+        elif action == "repeat":
+            output = lastSystemAction
+        elif action == "reqmore":
+            output = self.getReqMore()
+        elif action == "restart":
+            output = self.getRestart()
+        else:
+            output = ""
+            logger.error("Unknown action: " + action)
+        return output
+
+    # MASK OVER MASTER ACTION SET
+    # ------------------------------------------------------------------------------------
+
+    def getNonExecutable(self, belief, lastSystemAction):
+        '''
+        Set of rules defining the mask over the action set, given the current belief state
+        :param belief: the current master belief
+        :type belief: dict
+        :param lastSystemAction: the system action of the previous turn
+        :type lastSystemAction: string
+        :return: list of non-executable (masked) actions
+        '''
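+        # Most rules below only take effect if the corresponding config flag is set
+        # (informmask, requestmask, byemask); repeat and restart are always masked.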
+
+        array_slot_summary = SummaryUtils.arraySlotSummary(belief, self.domainString)
+        global_summary = SummaryUtils.globalSummary(belief, self.domainString)
+        if global_summary['GLOBAL_BYALTERNATIVES'] and not global_summary['GLOBAL_THANKYOU'] \
+                and not global_summary['GLOBAL_ACK']:
+            self.alternatives_requested = True
+
+        nonexec = []
+
+        for action in self.action_names:
+            mask_action = False
+
+            if action == "inform":
+                acceptance_list = SummaryUtils.getTopBeliefs(belief, domainString=self.domainString)
+                discriminable = SummaryUtils.acceptanceListCanBeDiscriminated(acceptance_list,
+                                                                              self.domainString)
+                if not global_summary['GLOBAL_BYCONSTRAINTS']:
+                    mask_action = True
+                if global_summary['GLOBAL_COUNTACCEPTED'] < self.inform_count_accepted and discriminable:
+                    mask_action = True
+                if mask_action and self.inform_mask:
+                    nonexec.append(action)
+
+            elif action == "inform_byname":
+                if not global_summary['GLOBAL_BYNAME']:
+                    mask_action = True
+                if belief['features']['lastInformedVenue'] == '' \
+                        and SummaryUtils.getTopBelief(belief['beliefs']['name'])[0] == '**NONE**':
+                    mask_action = True
+                if mask_action and self.inform_mask:
+                    nonexec.append(action)
+
+            elif action == "inform_alternatives":
+                if not self.alternatives_requested:
+                    mask_action = True
+                if belief['features']['lastInformedVenue'] == '':
+                    mask_action = True
+                if mask_action and self.inform_mask:
+                    nonexec.append(action)
+
+            elif action == "bye":
+                if not global_summary['GLOBAL_FINISHED']:
+                    mask_action = True
+                if mask_action and self.bye_mask:
+                    nonexec.append(action)
+
+            elif action == "repeat":
+                if not global_summary['GLOBAL_REPEAT'] or lastSystemAction is None:
+                    mask_action = True
+                mask_action = True  # ic340: this action is "deactivated" because the simuser doesn't know how to react to it
+                if mask_action:
+                    nonexec.append(action)
+
+            elif action == "reqmore":
+                if belief['features']['lastInformedVenue'] == '':
+                    mask_action = True
+                if mask_action and self.request_mask:
+                    nonexec.append(action)
+
+            elif action == "restart":
+                if not global_summary['GLOBAL_RESTART']:
+                    mask_action = True
+                mask_action = True  # ic340: this action is "deactivated" because the simuser doesn't know how to react to it
+                if mask_action:
+                    nonexec.append(action)
+
+            elif "request_" in action:
+                pass
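+                # note: mask_action is never set for request actions, so they are never masked here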
+                if mask_action and self.request_mask:
+                    nonexec.append(action)
+
+            elif "select_" in action:
+                slot_summary = array_slot_summary[action.split("_")[1]]
+                top_prob = slot_summary['TOPHYPS'][0][1]
+                sec_prob = slot_summary['TOPHYPS'][1][1]
+                if top_prob == 0 or sec_prob == 0:
+                    mask_action = True
+                if mask_action and self.request_mask:
+                    nonexec.append(action)
+
+            elif "confirm_" in action:
+                slot_summary = array_slot_summary[action.split("_")[1]]
+                top_prob = slot_summary['TOPHYPS'][0][1]
+                if top_prob == 0:
+                    mask_action = True
+                if mask_action and self.request_mask:
+                    nonexec.append(action)
+
+            elif "confreq_" in action:
+                slot_summary = array_slot_summary[action.split("_")[1]]
+                top_prob = slot_summary['TOPHYPS'][0][1]
+                if top_prob == 0:
+                    mask_action = True
+                if mask_action and self.request_mask:
+                    nonexec.append(action)
+
+        logger.info('masked inform actions:' + str([act for act in nonexec if 'inform' in act]))
+        return nonexec
+
+    # added by phs26, 4 Nov 2016
+    def getExecutableMask(self):
+        '''
+        Returns an all-zero mask, i.e. no action is masked here;
+        rule-based masking is handled by getNonExecutable.
+        '''
+        # hack, make every action executable:
+        # return [0.0] * len(self.action_names)
+        return [0.0] * len(self.summary_action_names)
+
+
+    # CONVERTING METHODS FOR EACH SPECIFIC ACT:
+    # ------------------------------------------------------------------------------------
+
+    def getRequest(self, slot):
+        return 'request({})'.format(slot)
+
+    def getConfirm(self, slot):
+        summary = self._array_slot_summary[slot]
+        top_value = summary['TOPHYPS'][0][0]
+        return 'confirm({}="{}")'.format(slot, top_value)
+
+    def getConfReq(self, cslot, rslot):
+        summary = self._array_slot_summary[cslot]
+        top_value = summary['TOPHYPS'][0][0]
+        return 'confreq({}="{}",{})'.format(cslot, top_value, rslot)
+
+    def getSelect(self, slot):
+        summary = self._array_slot_summary[slot]
+        top_value = summary['TOPHYPS'][0][0]
+        sec_value = summary['TOPHYPS'][1][0]
+        return 'select({}="{}",{}="{}")'.format(slot, top_value, slot, sec_value)
+
+    def getInformByConstraints(self, belief, action):
+        accepted_values = SummaryUtils.getTopBeliefs(belief, domainString=self.domainString)
+
+        inform_slots = action.split("-")
+        if len(inform_slots) == 1:
+            inform_slots = []
+        else:
+            inform_slots = inform_slots[1:]
+        constraints = SummaryUtils.get_constraints(accepted_values)
+        return SummaryUtils.getInformByConstraints(constraints, self.domainString,
+                                                   belief['features']['lastInformedVenue'], inform_slots)
+
+    def getInformByName(self, belief, action):
+
+        requested_slots = action.split("-")
+        if len(requested_slots) == 1:
+            # the policy decided to inform only the name without any slots; a random slot will be informed instead
+            requested_slots = []
+        else:
+            requested_slots = requested_slots[1:]
+        name = SummaryUtils.getTopBelief(belief['beliefs']['name'])[0]
+        if name == '**NONE**':
+            name = belief['features']['lastInformedVenue']
+        return SummaryUtils.getInformRequestedSlots(requested_slots, name, self.domainString)
+
+    def getInformAlternatives(self, belief, action):
+        self.alternatives_requested = False
+        informedVenueSinceNone = set(belief['features']['informedVenueSinceNone'])
+        accepted_values = SummaryUtils.getTopBeliefs(belief, domainString=self.domainString)
+
+        inform_slots = action.split("-")
+        if len(inform_slots) == 1:
+            inform_slots = []
+        else:
+            inform_slots = inform_slots[1:]
+        return SummaryUtils.getInformAlternativeEntities(accepted_values, informedVenueSinceNone, self.domainString, inform_slots)
+
+    def getBye(self):
+        return 'bye()'
+
+    def getReqMore(self):
+        return 'reqmore()'
+
+    def getInformRepeat(self):
+        # TODO: implement the proper action, this was not implemented in PolicyUtils.py
+        return 'null()'
+
+    def getRestart(self):
+        # TODO: implement the proper action, this was not implemented in PolicyUtils.py
+        return 'null()'
+
+# END OF FILE
diff --git a/policy/PolicyManager.py b/policy/PolicyManager.py
index 3ca85b89159b3f27f93341d03aaba96f2b28583b..5671814dfa283407b72724459b9126195cfb0527 100644
--- a/policy/PolicyManager.py
+++ b/policy/PolicyManager.py
@@ -51,6 +51,8 @@ class PolicyManager(object):
         self.domainPolicies = dict.fromkeys(OntologyUtils.available_domains, None)
         self.committees = self._load_committees()
         self.shared_params = None
+
+
         
         self.SPECIAL_DOMAINS = ['topicmanager','wikipedia']
         
diff --git a/policy/feudalgainRL/DQNPolicy_latest.py b/policy/feudalgainRL/DQNPolicy_latest.py
deleted file mode 100644
index 559e5bc4f55174efffac683a31cd9c9a6481f198..0000000000000000000000000000000000000000
--- a/policy/feudalgainRL/DQNPolicy_latest.py
+++ /dev/null
@@ -1,789 +0,0 @@
-###############################################################################
-# PyDial: Multi-domain Statistical Spoken Dialogue System Software
-###############################################################################
-#
-# Copyright 2015 - 2019
-# Cambridge University Engineering Department Dialogue Systems Group
-#
-# 
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-###############################################################################
-
-'''
-DQNPolicy.py - deep Q network policy
-==================================================
-
-Author: Pei-Hao (Eddy) Su  (Copyright CUED Dialogue Systems Group 2016)
-
-.. seealso:: CUED Imports/Dependencies: 
-
-    import :class:`Policy`
-    import :class:`utils.ContextLogger`
-
-.. warning::
-        Documentation not done.
-
-
-************************
-
-'''
-
-import copy
-import sys
-import os
-import json
-import numpy as np
-import pickle as pickle
-import random
-import utils
-from utils.Settings import config as cfg
-from utils import ContextLogger
-
-import ontology.FlatOntologyManager as FlatOnt
-#from theano_dialogue.util.tool import *
-
-import tensorflow as tf
-from policy.DRL.replay_buffer import ReplayBuffer
-from policy.DRL.replay_prioritised import ReplayPrioritised
-import policy.DRL.utils as drlutils
-import policy.DRL.dqn as dqn
-import policy.Policy
-import policy.SummaryAction
-from policy.Policy import TerminalAction, TerminalState
-import policy.GPPolicy
-
-logger = utils.ContextLogger.getLogger('')
-
-# --- for flattening the belief --- # 
-domainUtil = FlatOnt.FlatDomainOntology('CamRestaurants')
-
-"""
-def flatten_belief(gpstate):
-    '''
-    Flatten the GP-dictionary-typed belief state to a one-dim vector
-    '''
-
-    if isinstance(gpstate, TerminalState):
-        return [0] * 304 #260 #264
-
-    flat_belief = []
-    for key, value in gpstate._bstate.items():
-        flat_belief += value
-
-    return flat_belief
-"""
-
-def flatten_belief(belief,domainUtil=FlatOnt.FlatDomainOntology('CamRestaurants'), merge=False):
-    if isinstance(belief, TerminalState):
-        return [0] * 260 #264
-
-    #for key, value in belief.items():
-    #    print key, value
-
-    #policyfeatures = ['full','method','discourseAct','requested']
-    policyfeatures = ['full','method','discourseAct','requested',\
-                'lastActionInformNone','offerHappened','inform_info']
-
-    flat_belief = []
-    for feat in policyfeatures:
-        add_feature = []
-        if feat == 'kbest':
-            for slot in self.domainUtil.sorted_system_requestable_slots:
-                # print slot, 'belief', belief['beliefs'][slot]
-                temp = [belief['beliefs'][slot][value] for value in domainUtil.ontology['informable'][slot]]
-                temp = sorted(temp, key=lambda b: -b)
-               #temp = [belief['beliefs'][slot]['dontcare']] + [belief['beliefs'][slot]['**NONE**']] + temp
-                temp = temp + [belief['beliefs'][slot]['dontcare']] + [belief['beliefs'][slot]['**NONE**']]
-                temp = temp[0:self.max_k]
-                add_feature += temp
-        elif feat == 'full':
-            #for slot in self.sorted_slots:
-            for slot in domainUtil.ontology['informable']:
-                for value in domainUtil.ontology['informable'][slot]:# + ['**NONE**']:
-                #for value in domainUtil.ontology['informable'][slot] + ['**NONE**']:
-                #for value in domainUtil.ontology['informable'][slot] + ['dontcare'] + ['**NONE**']:
-                    add_feature.append(belief['beliefs'][slot][value])
-        elif feat == 'method':
-            add_feature = [belief['beliefs']['method'][method] for method in domainUtil.ontology['method']]
-        elif feat == 'discourseAct':
-            add_feature = [belief['beliefs']['discourseAct'][discourseAct]
-                           for discourseAct in domainUtil.ontology['discourseAct']]
-        elif feat == 'requested':
-            add_feature = [belief['beliefs']['requested'][slot] \
-                            for slot in domainUtil.ontology['requestable']]
-        elif feat == 'lastActionInformNone':
-            add_feature.append(float(belief['features']['lastActionInformNone']))
-        elif feat == 'offerHappened':
-            add_feature.append(float(belief['features']['offerHappened']))
-        elif feat == 'inform_info':
-            add_feature += belief['features']['inform_info']
-        else:
-            logger.error('Invalid feature name in config: ' + feat)
-
-        flat_belief += add_feature
-
-    return flat_belief
-
-    
-    
-    """ 
-    flat_belief = []
-    for feat in policyfeatures:
-        add_feature = []
-        if feat == 'full':
-            #for slot in self.sorted_slots:
-            for slot in domainUtil.ontology['informable']:
-                if slot == 'name':
-                    continue
-                accumProb = 0.0
-                for value in domainUtil.ontology['informable'][slot] + ['**NONE**']:
-                     if value not in ('dontcare', '**NONE**'):
-                        accumProb += float(belief['beliefs'][slot][value])
-                add_feature.append(accumProb)
-                add_feature.append(belief['beliefs'][slot]['dontcare'])
-                add_feature.append(belief['beliefs'][slot]['**NONE**'])
-
-                   #add_feature.append(belief['beliefs'][slot][value])
-        elif feat == 'method':
-            add_feature = [belief['beliefs']['method'][method] \
-                            for method in domainUtil.ontology['method']]
-        elif feat == 'discourseAct':
-            add_feature = [belief['beliefs']['discourseAct'][discourseAct]
-                           for discourseAct in domainUtil.ontology['discourseAct']]
-        elif feat == 'requested':
-            add_feature = [belief['beliefs']['requested'][slot] \
-                            for slot in domainUtil.ontology['requestable']]
-        elif feat == 'lastActionInformNone':
-            add_feature.append(float(belief['features']['lastActionInformNone']))
-        elif feat == 'offerHappened':
-            add_feature.append(float(belief['features']['offerHappened']))
-        elif feat == 'inform_info':
-            add_feature += (belief['features']['inform_info'])
-        else:
-            logger.error('Invalid feature name in config: ' + feat)
-
-        flat_belief += add_feature
-    return flat_belief
-    """ 
-
-
-class DQNPolicy(Policy.Policy):
-    '''Derived from :class:`Policy`
-    '''
-    def __init__(self, in_policy_file, out_policy_file, domainString='CamRestaurants', is_training=False):
-        super(DQNPolicy, self).__init__(domainString, is_training)
-
-        self.in_policy_file = in_policy_file
-        self.out_policy_file = out_policy_file
-        self.is_training = is_training
-        self.accum_belief = []
-        self.stats = [0 for ii in range(14)]
-
-        self.prev_state_check = None
-
-        # parameter settings
-        self.n_in= 260
-        if cfg.has_option('dqnpolicy_'+domainString, 'n_in'):
-            self.n_in = cfg.getint('dqnpolicy_'+domainString, 'n_in')
-
-        self.actor_lr = 0.0001
-        if cfg.has_option('dqnpolicy_'+domainString, 'actor_lr'):
-            self.actor_lr = cfg.getfloat('dqnpolicy_'+domainString, 'actor_lr')
-
-        self.critic_lr = 0.001
-        if cfg.has_option('dqnpolicy_'+domainString, 'critic_lr'):
-            self.critic_lr = cfg.getfloat('dqnpolicy_'+domainString, 'critic_lr')
-
-        self.tau = 0.001
-        if cfg.has_option('dqnpolicy_'+domainString, 'tau'):
-            self.tau = cfg.getfloat('dqnpolicy_'+domainString, 'tau')
-
-        self.randomseed = 1234
-        if cfg.has_option('GENERAL', 'seed'):
-            self.randomseed = cfg.getint('GENERAL', 'seed')
-        
-        self.gamma = 1.0
-        if cfg.has_option('dqnpolicy_'+domainString, 'gamma'):
-            self.gamma = cfg.getfloat('dqnpolicy_'+domainString, 'gamma')
-
-        self.regularisation = 'l2'
-        if cfg.has_option('dqnpolicy_'+domainString, 'regularisation'):
-            self.regularisation = cfg.get('dqnpolicy_'+domainString, 'regulariser')
-
-        self.learning_rate = 0.001
-        if cfg.has_option('dqnpolicy_'+domainString, 'learning_rate'):
-            self.learning_rate = cfg.getfloat('dqnpolicy_'+domainString, 'learning_rate')
-
-        self.exploration_type = 'e-greedy' # Boltzman
-        if cfg.has_option('dqnpolicy_'+domainString, 'exploration_type'):
-            self.exploration_type = cfg.get('dqnpolicy_'+domainString, 'exploration_type')
-
-        self.episodeNum = 1000
-        if cfg.has_option('dqnpolicy_'+domainString, 'episodeNum'):
-            self.episodeNum = cfg.getfloat('dqnpolicy_'+domainString, 'episodeNum')
-
-        self.maxiter = 5000
-        if cfg.has_option('dqnpolicy_'+domainString, 'maxiter'):
-            self.maxiter = cfg.getfloat('dqnpolicy_'+domainString, 'maxiter')
-
-        self.epsilon = 1
-        if cfg.has_option('dqnpolicy_'+domainString, 'epsilon'):
-            self.epsilon = cfg.getfloat('dqnpolicy_'+domainString, 'epsilon')
-        
-        self.epsilon_start = 1
-        if cfg.has_option('dqnpolicy_'+domainString, 'epsilon_start'):
-            self.epsilon_start = cfg.getfloat('dqnpolicy_'+domainString, 'epsilon_start')
-
-        self.epsilon_end = 1
-        if cfg.has_option('dqnpolicy_'+domainString, 'epsilon_end'):
-            self.epsilon_end = cfg.getfloat('dqnpolicy_'+domainString, 'epsilon_end')
-
-        self.priorProbStart = 1.0
-        if cfg.has_option('dqnpolicy_'+domainString, 'prior_sample_prob_start'):
-            self.priorProbStart = cfg.getfloat('dqnpolicy_'+domainString, 'prior_sample_prob_start')
-
-        self.priorProbEnd = 0.1
-        if cfg.has_option('dqnpolicy_'+domainString, 'prior_sample_prob_end'):
-            self.priorProbEnd = cfg.getfloat('dqnpolicy_'+domainString, 'prior_sample_prob_end')
-
-        self.policyfeatures = []
-        if cfg.has_option('dqnpolicy_'+domainString, 'features'):
-            logger.info('Features: ' + str(cfg.get('dqnpolicy_'+domainString, 'features')))
-            self.policyfeatures = json.loads(cfg.get('dqnpolicy_'+domainString, 'features'))
-
-        self.max_k = 5
-        if cfg.has_option('dqnpolicy_'+domainString, 'max_k'):
-            self.max_k = cfg.getint('dqnpolicy_'+domainString, 'max_k')
-
-        self.learning_algorithm = 'drl'
-        if cfg.has_option('dqnpolicy_'+domainString, 'learning_algorithm'):
-            self.learning_algorithm = cfg.get('dqnpolicy_'+domainString, 'learning_algorithm')
-            logger.info('Learning algorithm: ' + self.learning_algorithm)
-
-        self.minibatch_size = 32
-        if cfg.has_option('dqnpolicy_'+domainString, 'minibatch_size'):
-            self.minibatch_size = cfg.getint('dqnpolicy_'+domainString, 'minibatch_size')
-
-        self.capacity = 1000#max(self.minibatch_size, 2000)
-        if cfg.has_option('dqnpolicy_'+domainString, 'capacity'):
-            self.capacity = max(cfg.getint('dqnpolicy_'+domainString,'capacity'), 2000)
-
-        self.replay_type = 'vanilla'
-        if cfg.has_option('dqnpolicy_'+domainString, 'replay_type'):
-            self.replay_type = cfg.get('dqnpolicy_'+domainString, 'replay_type')
-
-        self.architecture = 'vanilla'
-        if cfg.has_option('dqnpolicy_'+domainString, 'architecture'):
-            self.architecture = cfg.get('dqnpolicy_'+domainString, 'architecture')
-
-        self.q_update = 'single'
-        if cfg.has_option('dqnpolicy_'+domainString, 'q_update'):
-            self.q_update = cfg.get('dqnpolicy_'+domainString, 'q_update')
-
-        self.h1_size = 130
-        if cfg.has_option('dqnpolicy_'+domainString, 'h1_size'):
-            self.h1_size = cfg.getint('dqnpolicy_'+domainString, 'h1_size')
-        
-        self.h2_size = 130
-        if cfg.has_option('dqnpolicy_'+domainString, 'h2_size'):
-            self.h2_size = cfg.getint('dqnpolicy_'+domainString, 'h2_size')
-
-        """
-        self.shuffle = False
-        if cfg.has_option('dqnpolicy_'+domainString, 'experience_replay'):
-            self.shuffle = cfg.getboolean('dqnpolicy_'+domainString, 'experience_replay')
-        if not self.shuffle:
-            # If we don't use experience replay, we don't need to maintain
-            # sliding window of experiences with maximum capacity.
-            # We only need to maintain the data of minibatch_size
-            self.capacity = self.minibatch_size
-        """
-
-        self.episode_ave_max_q = []
-
-        os.environ["CUDA_VISIBLE_DEVICES"]=""
-
-        # init session
-        self.sess = tf.Session()
-        with tf.device("/cpu:0"):
-
-            np.random.seed(self.randomseed)
-            tf.set_random_seed(self.randomseed)
-
-            # initialise an replay buffer
-            if self.replay_type == 'vanilla':
-                self.episodes[self.domainString] = ReplayBuffer(self.capacity, self.minibatch_size, self.randomseed)
-            elif self.replay_type == 'prioritized':
-                self.episodes[self.domainString] = ReplayPrioritised(self.capacity, self.minibatch_size, self.randomseed)
-            #replay_buffer = ReplayBuffer(self.capacity, self.randomseed)
-            #self.episodes = []
-            self.samplecount = 0
-            self.episodecount = 0
-
-            # construct the models
-            self.state_dim = self.n_in
-            self.summaryaction = SummaryAction.SummaryAction(domainString)
-            self.action_dim = len(self.summaryaction.action_names)
-            action_bound = len(self.summaryaction.action_names)
-
-            self.dqn = dqn.DeepQNetwork(self.sess, self.state_dim, self.action_dim, \
-                self.critic_lr, self.tau, action_bound, self.architecture, self.h1_size, self.h2_size)
-
-            # when all models are defined, init all variables
-            init_op = tf.initialize_all_variables()
-            self.sess.run(init_op)
-
-            self.loadPolicy(self.in_policy_file)
-            print('loaded replay size: ', self.episodes[self.domainString].size())
-
-            self.dqn.update_target_network()
-
-    # def record() has been handled...
-
-    def act_on(self, beliefstate, hyps=None):
-        if self.lastSystemAction is None and self.startwithhello:
-            systemAct, nextaIdex = 'hello()', -1
-        else:
-            systemAct, nextaIdex = self.nextAction(beliefstate, hyps)
-        self.lastSystemAction = systemAct
-        self.summaryAct = nextaIdex
-        self.prevbelief = beliefstate
-        return systemAct
-
-    def record(self, reward, domainInControl=None, weight=None, state=None, action=None):
-        if domainInControl is None:
-            domainInControl = self.domainString
-        if self.episodes[domainInControl] is None:
-            self.episodes[domainInControl] = Episode(dstring=domainInControl)
-        if self.actToBeRecorded is None:
-            #self.actToBeRecorded = self.lastSystemAction
-            self.actToBeRecorded = self.summaryAct
-
-        if state is None:
-            state = self.prevbelief
-        if action is None:
-            action = self.actToBeRecorded
-
-        cState, cAction = self.convertStateAction(state, action)
-
-        # normalising total return to -1~1
-        #reward /= 40.0
-        reward /= 20.0
-        """
-        reward = float(reward+10.0)/40.0
-        """
-
-        if weight == None:
-            if self.replay_type == 'vanilla':
-                self.episodes[domainInControl].record(state=cState, \
-                        state_ori=state, action=cAction, reward=reward)
-            elif self.replay_type == 'prioritized':
-
-                ##### calculate Q_s_t_a_t_ and gamma_Q_s_tplu1_maxa_ for PER ###
-                ################################################################
-                cur_cState = np.vstack([np.expand_dims(x, 0) for x in [cState]])        
-                cur_cAction_one_hot = np.eye(self.action_dim, self.action_dim)[[cAction]]
-                
-                cur_action_q = self.dqn.predict(cur_cState, cur_cAction_one_hot)
-                execMask = self.summaryaction.getExecutableMask(state, cAction)
-               
-                if self.q_update == 'single':
-                    Qs = []
-                    for idx, v in enumerate(execMask):
-                        if v > -sys.maxsize:
-                            Action_idx = np.eye(self.action_dim, self.action_dim)[[idx]]
-                            Qidx = self.dqn.predict_target(cur_cState, Action_idx)
-                            Qs.append(Qidx[0])
-                            #Qs.append(Qidx[0])
-
-                    Q_s_t_a_t_ = cur_action_q[0]
-                    gamma_Q_s_tplu1_maxa_ = self.gamma * np.max(Qs)
-                elif self.q_update == 'double':
-                    Qs = []
-                    for idx, v in enumerate(execMask):
-                        if v > -sys.maxsize:
-                            Action_idx = np.eye(self.action_dim, self.action_dim)[[idx]]
-                            Qidx = self.dqn.predict(cur_cState, Action_idx)
-                            Qs.append(Qidx[0])
-                        else:
-                            Qs.append(-sys.maxsize)
-
-                    policyQ_argmax_a = np.argmax(Qs)
-                    policyQ_argmax_a_one_hot = np.eye(self.action_dim, self.action_dim)[[policyQ_argmax_a]]
-                    target_value_Q = self.dqn.predict_target(cur_cState, policyQ_argmax_a_one_hot)
-
-                    Q_s_t_a_t_ = cur_action_q[0]
-                    gamma_Q_s_tplu1_maxa_ = self.gamma * target_value_Q
-
-                print('Q_s_t_a_t_', Q_s_t_a_t_)
-                print('gamma_Q_s_tplu1_maxa_', gamma_Q_s_tplu1_maxa_)
-                ################################################################
-
-                # heuristically assign 0.0 to Q_s_t_a_t_ and Q_s_tplu1_maxa_, doesn't matter as it is not used
-                #if self.samplecount >= self.capacity:
-                if True:
-                    self.episodes[domainInControl].record(state=cState, \
-                        state_ori=state, action=cAction, reward=reward, \
-                            Q_s_t_a_t_ = Q_s_t_a_t_, gamma_Q_s_tplu1_maxa_= gamma_Q_s_tplu1_maxa_, uniform=False)
-                else:
-                    self.episodes[domainInControl].record(state=cState, \
-                        state_ori=state, action=cAction, reward=reward, \
-                            Q_s_t_a_t_ = Q_s_t_a_t_, gamma_Q_s_tplu1_maxa_= gamma_Q_s_tplu1_maxa_, uniform=True)
-
-        else:
-            self.episodes[domainInControl].record(state=cState, state_ori=state, action=cAction, reward=reward, ma_weight=weight)
-
-        self.actToBeRecorded = None
-        self.samplecount += 1
-        return
-
-    def finalizeRecord(self, reward, domainInControl=None):
-        if domainInControl is None:
-            domainInControl = self.domainString
-        if self.episodes[domainInControl] is None:
-            logger.warning("record attempted to be finalized for domain where nothing has been recorded before")
-            return
-
-        #print 'Episode Avg_Max_Q', float(self.episode_ave_max_q)/float(self.episodes[domainInControl].size())
-        print('Episode Avg_Max_Q', np.mean(self.episode_ave_max_q))
-
-        print('saving statics')
-        self.saveStats()
-        print(self.stats)
-
-        # normalising total return to -1~1
-        #if reward == 0:
-        #    reward = -20.0
-        reward /= 20.0
-        """
-        if reward == 20.0:
-            reward = 1.0
-        else:
-            reward = -0.5
-        """
-        #reward = float(reward+10.0)/40.0
-
-        terminal_state, terminal_action = self.convertStateAction(TerminalState(), TerminalAction())
-
-        if self.replay_type == 'vanilla':
-            self.episodes[domainInControl].record(state=terminal_state, \
-                    state_ori=TerminalState(), action=terminal_action, reward=reward, terminal=True)
-        elif self.replay_type == 'prioritized':
-            # heuristically assign 0.0 to Q_s_t_a_t_ and Q_s_tplu1_maxa_, doesn't matter as it is not used
-                #if self.samplecount >= self.capacity:
-                if True:
-                    self.episodes[domainInControl].record(state=terminal_state, \
-                        state_ori=TerminalState(), action=terminal_action, reward=reward, \
-                            Q_s_t_a_t_ = 0.0, gamma_Q_s_tplu1_maxa_= 0.0, uniform=False, terminal=True)
-                else:
-                    self.episodes[domainInControl].record(state=terminal_state, \
-                        state_ori=TerminalState(), action=terminal_action, reward=reward, \
-                            Q_s_t_a_t_ = 0.0, gamma_Q_s_tplu1_maxa_= 0.0, uniform=True, terminal=True)
-        return
-
-    def convertStateAction(self, state, action):
-        '''
-        nnType = 'dnn'
-        #nnType = 'rnn'
-        # expand one dimension to match the batch size of 1 at axis 0
-        if nnType == 'rnn':
-            belief = np.expand_dims(belief,axis=0)
-        '''
-       
-        if isinstance(state, TerminalState):
-            return [0] * 260, action #260 #264
-        else:
-            flat_belief = flatten_belief(state)
-
-            if flat_belief == self.prev_state_check:
-                print('same state')
-            else:
-                print('diff state')
-            self.prev_state_check = flat_belief
-
-            return flat_belief, action
-
-    def nextAction(self, beliefstate, hyps):
-        '''
-        select next action
-
-        :param beliefstate: 
-        :param hyps:
-        :returns: (int) next summary action
-        '''
-        #beliefVec = flatten_belief(beliefstate, domainUtil)
-        beliefVec = flatten_belief(beliefstate)
-
-        execMask = self.summaryaction.getExecutableMask(beliefstate, self.lastSystemAction)
-        #print sum([ 1 for i in execMask if i==0.0 ])
-        if self.exploration_type == 'e-greedy':
-            # epsilon greedy
-            if self.is_training and utils.Settings.random.rand() < self.epsilon:
-                admissible = [i for i, x in enumerate(execMask) if x == 0.0]
-                random.shuffle(admissible)
-                nextaIdex = admissible[0]
-            else:
-                admissible = []
-                for idx, v in enumerate(execMask):
-                    if v > -sys.maxsize:
-                        Action_idx = np.eye(self.action_dim, self.action_dim)[[idx]]
-                        Qidx = self.dqn.predict(np.reshape(beliefVec, (1, len(beliefVec))), Action_idx)
-                        admissible.append(Qidx[0])
-                    else:
-                        admissible.append(-sys.maxsize)
-                #action_Q = self.dqn.predict(np.reshape(beliefVec, (1, len(beliefVec))))# + (1. / (1. + i + j))
-                #admissible = np.add(action_Q, np.array(execMask))
-                logger.info('action Q...')
-                print(admissible)
-                nextaIdex = np.argmax(admissible)
-
-                # add current max Q to self.episode_ave_max_q
-                print('current maxQ', np.max(admissible))
-                self.episode_ave_max_q.append(np.max(admissible))
-
-        elif self.exploration_type == 'Boltzman':
-            # randomly assign, not complete
-            admissible = [i for i, x in enumerate(execMask) if x == 0.0]
-            random.shuffle(admissible)
-            nextaIdex = admissible[0]
-      
-        self.stats[nextaIdex] += 1
-        summaryAct = self.summaryaction.action_names[nextaIdex]
-        masterAct = self.summaryaction.Convert(beliefstate, summaryAct, self.lastSystemAction)
-        return masterAct, nextaIdex
-
-    def train(self):
-        '''
-        call this function when the episode ends
-        '''
-
-        if not self.is_training:
-            logger.info("Not in training mode")
-            return
-        else:
-            logger.info("Update dqn policy parameters.")
-
-        self.episodecount += 1
-        logger.info("Sample Num so far: %s" %(self.samplecount))
-        logger.info("Episode Num so far: %s" %(self.episodecount))
-        #if True:
-        if self.samplecount >= self.minibatch_size * 3 and self.episodecount % 4 == 0:
-        #if self.samplecount >= self.capacity and self.episodecount % 5 == 0:
-        #if self.samplecount > self.minibatch_size:
-        #if self.samplecount > self.capacity:
-            logger.info('start traninig...')
-
-
-            #################################################
-            #################################################
-            #   update TD error for all experience in PER   #
-            #################################################
-            #################################################
-            """
-            #s_batch, s_ori_batch, a_batch, r_batch, s2_batch, s2_ori_batch, t_batch, idx_batch, _ = \
-            #    self.episodes[self.domainString].all_batch()
-            experience, idx_batch = self.episodes[self.domainString].all_batch()
-                #self.episodes[self.domainString].sample_batch_vanilla_PER()
-           
-            #s_batch = np.vstack([np.expand_dims(x, 0) for x in s_batch])
-            #s2_batch = np.vstack([np.expand_dims(x, 0) for x in s2_batch])
-
-            # self.s_prev, self.s_ori_prev, self.a_prev, self.r_prev, state, state_ori, termina
-    
-            for k in xrange(len(idx_batch)):
-                Q_bootstrap_label = 0
-                if experience[k][-1]: # terminal
-                    Q_bootstrap_label = experience[k][3] # reward
-                else:
-                    execMask = self.summaryaction.getExecutableMask(experience[k][-2], experience[k][2]) # s_ori, a
-                    if self.q_update == 'single':
-                        admissible = []
-                        for idx, v in enumerate(execMask):
-                            if v > -sys.maxint:
-                                Action_idx = np.eye(self.action_dim, self.action_dim)[[idx]]
-                                s2_idx = np.vstack([ np.expand_dims(x, 0) for x in [experience[k][0]] ]) # s
-                                Qidx = self.dqn.predict_target(s2_idx, Action_idx)
-                                admissible.append(Qidx[0])
-                        Q_bootstrap_label = experience[k][3] + self.gamma * np.max(admissible) # reward
-                    elif self.q_update == 'double':
-                        Qs = []
-                        for idx, v in enumerate(execMask):
-                            if v > -sys.maxint:
-                                Action_idx = np.eye(self.action_dim, self.action_dim)[[idx]]
-                                s2_idx = np.vstack([ np.expand_dims(x, 0) for x in [experience[k][-3]] ])
-                                Qidx = self.dqn.predict(s2_idx, Action_idx)
-                                Qs.append(Qidx[0])
-                            else:
-                                Qs.append(-sys.maxint)
-
-                        policyQ_argmax_a = np.argmax(Qs)
-                        policyQ_argmax_a_one_hot = np.eye(self.action_dim, self.action_dim)[[policyQ_argmax_a]]
-                        s2_idx = np.vstack([ np.expand_dims(x, 0) for x in [experience[k][-3]] ])
-                        target_value_Q = self.dqn.predict_target(s2_idx, policyQ_argmax_a_one_hot)
-
-                        Q_bootstrap_label = experience[k][3] + self.gamma * target_value_Q
-
-                if self.replay_type == 'prioritized':
-                    # update the sum-tree
-                    # update the TD error of the samples in the minibatch
-                    current_a = np.eye(self.action_dim, self.action_dim)[[experience[k][2]]]
-                    current_s = np.vstack([ np.expand_dims(x, 0) for x in [experience[k][0]] ])
-                    currentQ_s_a_ = self.dqn.predict(current_s, current_a)
-                    currentQ_s_a_ = currentQ_s_a_[0]
-                    error = abs(currentQ_s_a_ - Q_bootstrap_label)
-                    self.episodes[self.domainString].update(idx_batch[k], error)
- 
-            """
-
-            s_batch, s_ori_batch, a_batch, r_batch, s2_batch, s2_ori_batch, t_batch, idx_batch, _ = \
-                self.episodes[self.domainString].sample_batch()
-                #self.episodes[self.domainString].sample_batch_vanilla_PER()
-           
-            #s_batch = np.vstack([np.expand_dims(x, 0) for x in s_batch])
-            #s2_batch = np.vstack([np.expand_dims(x, 0) for x in s2_batch])
-
-            y_i = []
-            for k in range(min(self.minibatch_size, self.episodes[self.domainString].size())):
-                Q_bootstrap_label = 0
-                if t_batch[k]:
-                    Q_bootstrap_label = r_batch[k]
-                else:
-                    execMask = self.summaryaction.getExecutableMask(s2_ori_batch[k], a_batch[k])
-                    if self.q_update == 'single':
-                        admissible = []
-                        for idx, v in enumerate(execMask):
-                            if v > -sys.maxsize:
-                                Action_idx = np.eye(self.action_dim, self.action_dim)[[idx]]
-                                s2_idx = np.vstack([ np.expand_dims(x, 0) for x in [s2_batch[k]] ])
-                                Qidx = self.dqn.predict_target(s2_idx, Action_idx)
-                                admissible.append(Qidx[0])
-                        Q_bootstrap_label = r_batch[k] + self.gamma * np.max(admissible)
-                    elif self.q_update == 'double':
-                        Qs = []
-                        for idx, v in enumerate(execMask):
-                            if v > -sys.maxsize:
-                                Action_idx = np.eye(self.action_dim, self.action_dim)[[idx]]
-                                s2_idx = np.vstack([ np.expand_dims(x, 0) for x in [s2_batch[k]] ])
-                                Qidx = self.dqn.predict(s2_idx, Action_idx)
-                                Qs.append(Qidx[0])
-                            else:
-                                Qs.append(-sys.maxsize)
-
-                        policyQ_argmax_a = np.argmax(Qs)
-                        policyQ_argmax_a_one_hot = np.eye(self.action_dim, self.action_dim)[[policyQ_argmax_a]]
-                        s2_idx = np.vstack([ np.expand_dims(x, 0) for x in [s2_batch[k]] ])
-                        target_value_Q = self.dqn.predict_target(s2_idx, policyQ_argmax_a_one_hot)
-
-                        Q_bootstrap_label = r_batch[k] + self.gamma * target_value_Q
-                y_i.append(Q_bootstrap_label)
-
-                if self.replay_type == 'prioritized':
-                    # update the sum-tree
-                    # update the TD error of the samples in the minibatch
-                    current_a = np.eye(self.action_dim, self.action_dim)[[a_batch[k]]]
-                    current_s = np.vstack([ np.expand_dims(x, 0) for x in [s_batch[k]] ])
-                    currentQ_s_a_ = self.dqn.predict(current_s, current_a)
-                    currentQ_s_a_ = currentQ_s_a_[0]
-                    error = abs(currentQ_s_a_ - Q_bootstrap_label)
-                    self.episodes[self.domainString].update(idx_batch[k], error)
-                                         
-            # change index-based a_batch to one-hot-based a_batch
-            a_batch_one_hot = np.eye(self.action_dim, self.action_dim)[a_batch]
-
-            # Update the critic given the targets
-            reshaped_yi = np.vstack([np.expand_dims(x, 0) for x in y_i])
-
-            s_batch_expand = np.vstack([np.expand_dims(x, 0) for x in s_batch])
-            """
-            print s_batch_expand
-            print 'a_batch', a_batch
-            print a_batch_one_hot
-            print len(a_batch)
-            print len(y_i)
-            """
-            #reshaped_yi = np.reshape(y_i, (min(self.minibatch_size, self.episodes[self.domainString].size()), 1))
-            predicted_q_value, _, currentLoss = self.dqn.train(s_batch, a_batch_one_hot, reshaped_yi)
-            #predicted_q_value, _, currentLoss = self.dqn.train(s_batch_expand, a_batch_one_hot, reshaped_yi)
-            
-            print('y_i')
-            print(y_i)
-            print('currentLoss', currentLoss)
-            print('predict Q')
-            print(predicted_q_value)
-
-            # Update target networks
-            #if self.episodecount % 50 == 0:
-            if self.episodecount % 1 == 0:
-                self.dqn.update_target_network()
-
-            self.savePolicyInc()  # self.out_policy_file)
-
-    def savePolicy(self, FORCE_SAVE=False):
-        """
-        Not used here: the agent calls this after every episode,
-        but we only want to save the policy periodically.
-        """
-        pass
-
-    def savePolicyInc(self, FORCE_SAVE=False):
-        """
-        save model and replay buffer
-        """
-        #save_path = self.saver.save(self.sess, self.out_policy_file+'.ckpt')
-        self.dqn.save_network(self.out_policy_file+'.dqn.ckpt')
-
-        f = open(self.out_policy_file+'.episode', 'wb')
-        for obj in [self.samplecount, self.episodes[self.domainString]]:
-            pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)
-        f.close()
-        #logger.info("Saving model to %s and replay buffer..." % save_path)
-
-    def saveStats(self, FORCE_SAVE=False):
-        f = open(self.out_policy_file + '.stats', 'wb')
-        pickle.dump(self.stats, f, protocol=pickle.HIGHEST_PROTOCOL)
-        f.close()
-
-    def loadPolicy(self, filename):
-        """
-        load model and replay buffer
-        """
-        # load models
-        self.dqn.load_network(filename+'.dqn.ckpt')
-        
-        # load replay buffer
-        try:
-            print('load from: ', filename)
-            f = open(filename+'.episode', 'rb')
-            loaded_objects = []
-            for i in range(2):  # load samplecount and replay buffer
-                loaded_objects.append(pickle.load(f))
-            self.samplecount = int(loaded_objects[0])
-            self.episodes[self.domainString] = copy.deepcopy(loaded_objects[1])
-            logger.info("Loading both model from %s and replay buffer..." % filename)
-            f.close()
-        except:
-            logger.info("Loading only models...")
-
-    def restart(self):
-        self.summaryAct = None          
-        self.lastSystemAction = None
-        self.prevbelief = None
-        self.actToBeRecorded = None
-        self.epsilon = self.epsilon_start - (self.epsilon_start - self.epsilon_end) * float(self.episodeNum+self.episodecount) / float(self.maxiter)
-        print('current eps', self.epsilon)
-        #self.episodes = dict.fromkeys(OntologyUtils.available_domains, None)
-        #self.episodes[self.domainString] = ReplayBuffer(self.capacity, self.randomseed)
-        self.episode_ave_max_q = []
-
-#END OF FILE
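For reference, a minimal self-contained sketch of the masked single/double Q-learning bootstrap target computed in the training loop above; the -sys.maxsize mask convention and the predict/predict_target split follow the surrounding code, while the helper name and the toy values are illustrative only.

import sys
import numpy as np

def bootstrap_target(r, terminal, q_next, q_next_target, exec_mask, gamma, double=True):
    """Compute the Q-learning bootstrap label for one transition.

    q_next / q_next_target are 1-D arrays of next-state Q-values from the
    online and target networks; exec_mask holds 0.0 for admissible actions
    and -sys.maxsize for masked ones, as in the code above."""
    if terminal:
        return r
    q_next = np.asarray(q_next, dtype=float)
    q_next_target = np.asarray(q_next_target, dtype=float)
    admissible = np.asarray(exec_mask) > -sys.maxsize
    if double:
        # double Q-update: argmax with the online net, evaluate with the target net
        best_a = int(np.argmax(np.where(admissible, q_next, -np.inf)))
        return r + gamma * q_next_target[best_a]
    # single Q-update: max over admissible target-net Q-values
    return r + gamma * np.max(q_next_target[admissible])

# toy usage: the last action is inadmissible
label = bootstrap_target(0.05, False, [0.2, 0.5, 0.1], [0.3, 0.4, 0.9],
                         [0.0, 0.0, -sys.maxsize], gamma=0.99)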
diff --git a/policy/feudalgainRL/FeudalACERPolicy.py b/policy/feudalgainRL/FeudalACERPolicy.py
deleted file mode 100644
index 19d9fccb719b8588567e8e397a9a6f2441230de2..0000000000000000000000000000000000000000
--- a/policy/feudalgainRL/FeudalACERPolicy.py
+++ /dev/null
@@ -1,457 +0,0 @@
-###############################################################################
-# PyDial: Multi-domain Statistical Spoken Dialogue System Software
-###############################################################################
-#
-# Copyright 2015 - 2019
-# Cambridge University Engineering Department Dialogue Systems Group
-#
-# 
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-###############################################################################
-
-'''
-FeudalACERPolicy.py - ACER - Actor Critic with Experience Replay
-=================================================================
-
-Copyright CUED Dialogue Systems Group 2015 - 2017
-
-.. seealso:: CUED Imports/Dependencies:
-
-    import :class:`Policy`
-    import :class:`utils.ContextLogger`
-
-.. warning::
-        Documentation not done.
-
-
-************************
-
-'''
-import copy
-import os
-import json
-import numpy as np
-import scipy
-import scipy.signal
-import pickle as pickle
-import random
-import utils
-from utils.Settings import config as cfg
-from utils import ContextLogger, DiaAct
-
-import ontology.FlatOntologyManager as FlatOnt
-import tensorflow as tf
-from policy.DRL.replay_buffer_episode_acer import ReplayBufferEpisode
-from policy.DRL.replay_prioritised_episode import ReplayPrioritisedEpisode
-import policy.DRL.utils as drlutils
-from policy.ACERPolicy import ACERPolicy
-import policy.DRL.acer as acer
-import policy.Policy
-import policy.SummaryAction
-from policy.Policy import TerminalAction, TerminalState
-from policy.feudalRL.DIP_parametrisation import DIP_state, padded_state
-
-logger = utils.ContextLogger.getLogger('')
-
-# Discounting function used to calculate discounted returns.
-def discount(x, gamma):
-    return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]
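The lfilter one-liner above computes the usual right-to-left recursion for discounted returns, G_t = x_t + gamma * G_{t+1}; the small check below (illustrative only, not part of the original file) makes the equivalence explicit.

import numpy as np
import scipy.signal

def discount_naive(x, gamma):
    # G_t = x_t + gamma * G_{t+1}, accumulated from the end of the episode
    out, running = np.zeros(len(x)), 0.0
    for t in reversed(range(len(x))):
        running = x[t] + gamma * running
        out[t] = running
    return out

rewards = [0.0, 0.0, 1.0]
lfiltered = scipy.signal.lfilter([1], [1, -0.99], rewards[::-1], axis=0)[::-1]
assert np.allclose(lfiltered, discount_naive(rewards, 0.99))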
-
-
-class FeudalACERPolicy(ACERPolicy):
-    '''Derived from :class:`Policy`
-    '''
-    def __init__(self, in_policy_file, out_policy_file, domainString='CamRestaurants', is_training=False,
-                 action_names=None, slot=None, sd_state_dim=50):
-        super(FeudalACERPolicy, self).__init__(in_policy_file, out_policy_file, domainString, is_training)
-
-        tf.reset_default_graph()
-
-        self.in_policy_file = in_policy_file
-        self.out_policy_file = out_policy_file
-        self.is_training = is_training
-        self.accum_belief = []
-        self.prev_state_check = None
-        self.sd_state_dim = sd_state_dim
-
-        self.domainString = domainString
-        self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString)
-
-        self.features = 'dip'
-        self.sd_enc_size = 80
-        self.si_enc_size = 40
-        self.dropout_rate = 0.
-        if cfg.has_option('feudalpolicy', 'features'):
-            self.features = cfg.get('feudalpolicy', 'features')
-        if cfg.has_option('feudalpolicy', 'sd_enc_size'):
-            self.sd_enc_size = cfg.getint('feudalpolicy', 'sd_enc_size')
-        if cfg.has_option('feudalpolicy', 'si_enc_size'):
-            self.si_enc_size = cfg.getint('feudalpolicy', 'si_enc_size')
-        if cfg.has_option('feudalpolicy', 'dropout_rate') and self.is_training:
-            self.dropout_rate = cfg.getfloat('feudalpolicy', 'dropout_rate')
-        self.actfreq_ds = False
-        if cfg.has_option('feudalpolicy', 'actfreq_ds'):
-            self.actfreq_ds = cfg.getboolean('feudalpolicy', 'actfreq_ds')
-
-        # init session
-        self.sess = tf.Session()
-        with tf.device("/cpu:0"):
-
-            np.random.seed(self.randomseed)
-            tf.set_random_seed(self.randomseed)
-
-            # initialise a replay buffer
-            if self.replay_type == 'vanilla':
-                self.episodes[self.domainString] = ReplayBufferEpisode(self.capacity, self.minibatch_size, self.randomseed)
-            elif self.replay_type == 'prioritized':
-                self.episodes[self.domainString] = ReplayPrioritisedEpisode(self.capacity, self.minibatch_size, self.randomseed)
-            #replay_buffer = ReplayBuffer(self.capacity, self.randomseed)
-            #self.episodes = []
-            self.samplecount = 0
-            self.episodecount = 0
-
-            # construct the models
-            self.state_dim = 89  # current DIP state dim
-            self.summaryaction = policy.SummaryAction.SummaryAction(domainString)
-            self.action_names = action_names
-            self.action_dim = len(self.action_names)
-            action_bound = len(self.action_names)
-            self.stats = [0 for _ in range(self.action_dim)]
-
-            self.global_mu = [0. for _ in range(self.action_dim)]
-
-            if self.features == 'dip':
-                if self.actfreq_ds:
-                    if self.domainString == 'CamRestaurants':
-                        self.state_dim += 9#16
-                    elif self.domainString == 'SFRestaurants':
-                        self.state_dim += 9#25
-                    elif self.domainString == 'Laptops11':
-                        self.state_dim += 9#40
-                self.acer = acer.ACERNetwork(self.sess, self.state_dim, self.action_dim, self.critic_lr, self.delta,
-                                             self.c, self.alpha, self.h1_size, self.h2_size, self.is_training)
-            elif self.features == 'learned' or self.features == 'rnn':
-                si_state_dim = 73
-                if self.actfreq_ds:
-                    if self.domainString == 'CamRestaurants':
-                        si_state_dim += 9#16
-                    elif self.domainString == 'SFRestaurants':
-                        si_state_dim += 9#25
-                    elif self.domainString == 'Laptops11':
-                        si_state_dim += 9#40
-
-                if 0:  # RNN branch disabled; 'rnn' features fall through to the standard ACER network below
-                    self.acer = acer.RNNACERNetwork(self.sess, si_state_dim, sd_state_dim, self.action_dim, self.critic_lr,
-                                                    self.delta, self.c, self.alpha, self.h1_size, self.h2_size, self.is_training,
-                                                    sd_enc_size=25, si_enc_size=25, dropout_rate=0., tn='normal', slot='si')
-                else:
-                    self.state_dim = si_state_dim
-                    self.acer = acer.ACERNetwork(self.sess, self.state_dim, self.action_dim,
-                                                 self.critic_lr, self.delta, self.c, self.alpha, self.h1_size,
-                                                 self.h2_size, self.is_training)
-
-            else:
-                logger.error('features "{}" not implemented'.format(self.features))
-
-
-            # when all models are defined, init all variables
-            init_op = tf.global_variables_initializer()
-            self.sess.run(init_op)
-
-            self.loadPolicy(self.in_policy_file)
-            print('loaded replay size: ', self.episodes[self.domainString].size())
-
-            #self.acer.update_target_network()
-
-    # def record() has been handled...
-
-    def convertStateAction(self, state, action):
-        '''
-
-        '''
-        if isinstance(state, TerminalState):
-            return [0] * 89, action
-
-        else:
-            if self.features == 'learned' or self.features == 'rnn':
-                dip_state = padded_state(state.domainStates[state.currentdomain], self.domainString)
-            else:
-                dip_state = DIP_state(state.domainStates[state.currentdomain], self.domainString)
-            action_name = self.actions.action_names[action]
-            act_slot = 'general'
-            for slot in dip_state.slots:
-                if slot in action_name:
-                    act_slot = slot
-            flat_belief = dip_state.get_beliefStateVec(act_slot)
-            self.prev_state_check = flat_belief
-
-            return flat_belief, action
-
-    def record(self, reward, domainInControl=None, weight=None, state=None, action=None):
-        if domainInControl is None:
-            domainInControl = self.domainString
-        if self.actToBeRecorded is None:
-            self.actToBeRecorded = self.summaryAct
-
-        if state is None:
-            state = self.prevbelief
-        if action is None:
-            action = self.actToBeRecorded
-        mu_weight = self.prev_mu
-        mask = self.prev_mask
-        if action == self.action_dim-1: # pass action was taken
-            mask = np.zeros(self.action_dim)
-            mu_weight = np.ones(self.action_dim)/self.action_dim
-
-        cState, cAction = state, action
-
-        reward /= 20.0
-
-        value = self.acer.predict_value([cState], [mask])
-
-        if self.replay_type == 'vanilla':
-            self.episodes[domainInControl].record(state=cState, \
-                    state_ori=state, action=cAction, reward=reward, value=value[0], distribution=mu_weight, mask=mask)
-        elif self.replay_type == 'prioritized':
-            self.episodes[domainInControl].record(state=cState, \
-                    state_ori=state, action=cAction, reward=reward, value=value[0], distribution=mu_weight, mask=mask)
-
-        self.actToBeRecorded = None
-        self.samplecount += 1
-        return
-
-    def finalizeRecord(self, reward, domainInControl=None):
-        if domainInControl is None:
-            domainInControl = self.domainString
-        if self.episodes[domainInControl] is None:
-            logger.warning("record attempted to be finalized for domain where nothing has been recorded before")
-            return
-
-        #print 'Episode Avg_Max_Q', float(self.episode_ave_max_q)/float(self.episodes[domainInControl].size())
-        #print 'Episode Avg_Max_Q', np.mean(self.episode_ave_max_q)
-        #print self.stats
-
-        # normalising total return to -1~1
-        reward /= 20.0
-
-        terminal_state, terminal_action = self.convertStateAction(TerminalState(), TerminalAction())
-        value = 0.0  # no effect on the experience replay
-
-        def calculate_discountR_advantage(r_episode, v_episode):
-            #########################################################################
-            # Here we take the rewards and values from the rollout, and use them to
-            # generate the advantage and discounted returns.
-            # The advantage function uses "Generalized Advantage Estimation"
-            bootstrap_value = 0.0
-            self.r_episode_plus = np.asarray(r_episode + [bootstrap_value])
-            discounted_r_episode = discount(self.r_episode_plus,self.gamma)[:-1]
-            self.v_episode_plus = np.asarray(v_episode + [bootstrap_value])
-            advantage = r_episode + self.gamma * self.v_episode_plus[1:] - self.v_episode_plus[:-1]
-            advantage = discount(advantage,self.gamma)
-            #########################################################################
-            return discounted_r_episode, advantage
-
-        if self.replay_type == 'vanilla':
-            self.episodes[domainInControl].record(state=terminal_state, \
-                    state_ori=TerminalState(), action=terminal_action, reward=reward, value=value, terminal=True, distribution=None)
-        elif self.replay_type == 'prioritized':
-            episode_r, episode_v = self.episodes[domainInControl].record_final_and_get_episode(state=terminal_state, \
-                                                                                               state_ori=TerminalState(),
-                                                                                               action=terminal_action,
-                                                                                               reward=reward,
-                                                                                               value=value)
-
-            # TD_error is a list of td error in the current episode
-            _, TD_error = calculate_discountR_advantage(episode_r, episode_v)
-            episodic_TD = np.mean(np.absolute(TD_error))
-            print('episodic_TD')
-            print(episodic_TD)
-            self.episodes[domainInControl].insertPriority(episodic_TD)
-
-        return
-
-    def nextAction(self, beliefstate):
-        '''
-        select next action
-
-        :param beliefstate:
-        :param hyps:
-        :returns: (int) next summary action
-        '''
-
-        #execMask = self.summaryaction.getExecutableMask(beliefstate, self.lastSystemAction)
-        execMask = np.zeros(self.action_dim)
-
-        def apply_mask(prob, maskval, baseline=9.99999975e-06):
-            return prob if maskval == 0.0 else baseline # not quite 0.0 to avoid division by zero
-
-        if self.exploration_type == 'e-greedy' or not self.is_training:
-            if self.is_training and utils.Settings.random.rand() < self.epsilon:
-                action_prob = np.random.rand(len(self.action_names))
-            else:
-                action_prob = self.acer.predict_policy(np.reshape(beliefstate, (1, len(beliefstate))),
-                                                   np.reshape(execMask, (1, len(execMask))))[0]
-        mu = action_prob / sum(action_prob)
-        self.prev_mu = mu
-        self.prev_mask = execMask
-        return action_prob
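A compact sketch of the behaviour-policy bookkeeping in nextAction above: epsilon-greedy scores (random under exploration, the network's probabilities otherwise) are renormalised into the distribution mu that ACER later uses for its importance weights. Names and values here are illustrative only.

import numpy as np

def behaviour_policy(policy_probs, epsilon, is_training, rng=np.random):
    """Return the raw action scores and the renormalised behaviour policy mu."""
    if is_training and rng.rand() < epsilon:
        scores = rng.rand(len(policy_probs))   # uniform exploration scores
    else:
        scores = np.asarray(policy_probs, dtype=float)
    return scores, scores / scores.sum()

scores, mu = behaviour_policy([0.7, 0.2, 0.1], epsilon=0.1, is_training=True)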
-
-    def train(self):
-        '''
-        call this function when the episode ends
-        '''
-        USE_GLOBAL_MU = False
-        self.episode_ct += 1
-
-        if not self.is_training:
-            logger.info("Not in training mode")
-            return
-        else:
-            logger.info("Update acer policy parameters.")
-
-        self.episodecount += 1
-        logger.info("Sample Num so far: %s" % (self.samplecount))
-        logger.info("Episode Num so far: %s" % (self.episodecount))
-        if self.samplecount >= self.minibatch_size * 3 and self.episodecount % self.training_frequency == 0:
-        #if self.episodecount % self.training_frequency == 0:
-            logger.info('start training...')
-
-            for _ in range(self.train_iters_per_episode):
-
-                if self.replay_type == 'vanilla' or self.replay_type == 'prioritized':
-                    s_batch, s_ori_batch, a_batch, r_batch, s2_batch, s2_ori_batch, t_batch, idx_batch, v_batch, mu_policy, mask_batch = \
-                        self.episodes[self.domainString].sample_batch()
-                    if USE_GLOBAL_MU:
-                        mu_sum = sum(self.global_mu)
-                        mu_normalised = np.array([c / mu_sum for c in self.global_mu])
-                        mu_policy = [[mu_normalised for _ in range(len(mu_policy[i]))] for i in range(len(mu_policy))]
-                else:
-                    assert False  # not implemented yet
-
-                discounted_r_batch = []
-                advantage_batch = []
-                def calculate_discountR_advantage(r_episode, v_episode):
-                    #########################################################################
-                    # Here we take the rewards and values from the rollout, and use them to
-                    # generate the advantage and discounted returns.
-                    # The advantage function uses "Generalized Advantage Estimation"
-                    bootstrap_value = 0.0
-                    # r_episode rescale by rhos?
-                    self.r_episode_plus = np.asarray(r_episode + [bootstrap_value])
-                    discounted_r_episode = discount(self.r_episode_plus, self.gamma)[:-1]
-                    self.v_episode_plus = np.asarray(v_episode + [bootstrap_value])
-                    # change sth here
-                    advantage = r_episode + self.gamma * self.v_episode_plus[1:] - self.v_episode_plus[:-1]
-                    advantage = discount(advantage, self.gamma)
-                    #########################################################################
-                    return discounted_r_episode, advantage
-
-                if self.replay_type == 'prioritized':
-                    for item_r, item_v, item_idx in zip(r_batch, v_batch, idx_batch):
-                        # r, a = calculate_discountR_advantage(item_r, np.concatenate(item_v).ravel().tolist())
-                        r, a = calculate_discountR_advantage(item_r, item_v)
-
-                        # flatten nested numpy array and turn it into list
-                        discounted_r_batch += r.tolist()
-                        advantage_batch += a.tolist()
-
-                        # update the sum-tree
-                        # update the TD error of the samples (episode) in the minibatch
-                        episodic_TD_error = np.mean(np.absolute(a))
-                        self.episodes[self.domainString].update(item_idx, episodic_TD_error)
-                else:
-                    for item_r, item_v in zip(r_batch, v_batch):
-                        # r, a = calculate_discountR_advantage(item_r, np.concatenate(item_v).ravel().tolist())
-                        r, a = calculate_discountR_advantage(item_r, item_v)
-
-                        # flatten nested numpy array and turn it into list
-                        discounted_r_batch += r.tolist()
-                        advantage_batch += a.tolist()
-
-                batch_size = len(s_batch)
-
-                a_batch_one_hot = np.eye(self.action_dim)[np.concatenate(a_batch, axis=0).tolist()]
-
-                loss, entropy, optimize = \
-                            self.acer.train(np.concatenate(np.array(s_batch), axis=0).tolist(), a_batch_one_hot,
-                                            np.concatenate(np.array(mask_batch), axis=0).tolist(),
-                                            np.concatenate(np.array(r_batch), axis=0).tolist(), s_batch, r_batch, self.gamma,
-                                            np.concatenate(np.array(mu_policy), axis=0),
-                                            discounted_r_batch, advantage_batch)
-
-                ent, norm_loss = entropy/float(batch_size), loss/float(batch_size)
-
-
-            self.savePolicyInc()  # self.out_policy_file)
-
-
-    def savePolicy(self, FORCE_SAVE=False):
-        """
-        Not used here: the agent calls this after every episode,
-        but we only want to save the policy periodically.
-        """
-        pass
-
-    def savePolicyInc(self, FORCE_SAVE=False):
-        """
-        save model and replay buffer
-        """
-        if self.episodecount % self.save_step == 0:
-            #save_path = self.saver.save(self.sess, self.out_policy_file+'.ckpt')
-            self.acer.save_network(self.out_policy_file+'.acer.ckpt')
-
-            f = open(self.out_policy_file+'.episode', 'wb')
-            for obj in [self.samplecount, self.episodes[self.domainString], self.global_mu]:
-                pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)
-            f.close()
-            #logger.info("Saving model to %s and replay buffer..." % save_path)
-
-    def loadPolicy(self, filename):
-        """
-        load model and replay buffer
-        """
-        # load models
-        self.acer.load_network(filename+'.acer.ckpt')
-
-        # load replay buffer
-        if self.load_buffer:
-            try:
-                print('load from: ', filename)
-                f = open(filename+'.episode', 'rb')
-                loaded_objects = []
-                for i in range(3):  # samplecount, replay buffer and global_mu
-                    loaded_objects.append(pickle.load(f))
-                self.samplecount = int(loaded_objects[0])
-                self.episodes[self.domainString] = copy.deepcopy(loaded_objects[1])
-                self.global_mu = loaded_objects[2]
-                logger.info("Loading both model from %s and replay buffer..." % filename)
-                f.close()
-            except:
-                logger.info("Loading only models...")
-        else:
-            print("We do not load the buffer!")
-
-    def restart(self):
-        self.summaryAct = None
-        self.lastSystemAction = None
-        self.prevbelief = None
-        self.prev_mu = None
-        self.prev_mask = None
-        self.actToBeRecorded = None
-        self.epsilon = self.epsilon_start - (self.epsilon_start - self.epsilon_end) * float(self.episodeNum+self.episodecount) / float(self.maxiter)
-        self.episode_ave_max_q = []
-
-#END OF FILE
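Both calculate_discountR_advantage helpers in the file above compute discounted returns plus a GAE-style advantage (lambda = 1) with a zero bootstrap value; the standalone sketch below shows the same computation without the class plumbing. Function names are illustrative.

import numpy as np
import scipy.signal

def discount(x, gamma):
    # discounted suffix sums, as in the module-level helper above
    return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]

def discounted_return_and_advantage(r_episode, v_episode, gamma=0.99):
    bootstrap_value = 0.0
    r_plus = np.asarray(list(r_episode) + [bootstrap_value])
    discounted_r = discount(r_plus, gamma)[:-1]
    v_plus = np.asarray(list(v_episode) + [bootstrap_value])
    # one-step TD errors, then discounted to obtain the advantage estimate
    deltas = np.asarray(r_episode) + gamma * v_plus[1:] - v_plus[:-1]
    return discounted_r, discount(deltas, gamma)

R, A = discounted_return_and_advantage([0.0, 0.0, 1.0], [0.4, 0.6, 0.9])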
diff --git a/policy/feudalgainRL/FeudalBBQNPolicy.py b/policy/feudalgainRL/FeudalBBQNPolicy.py
deleted file mode 100644
index 01a6275ac7468b716beaa01e76656a7babf15ddf..0000000000000000000000000000000000000000
--- a/policy/feudalgainRL/FeudalBBQNPolicy.py
+++ /dev/null
@@ -1,407 +0,0 @@
-###############################################################################
-# PyDial: Multi-domain Statistical Spoken Dialogue System Software
-###############################################################################
-#
-# Copyright 2015 - 2019
-# Cambridge University Engineering Department Dialogue Systems Group
-#
-# 
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-###############################################################################
-
-'''
-FeudalBBQNPolicy.py - Bayes-by-Backprop deep Q network (BBQN) policy
-=====================================================================
-
-Author: Chris Tegho and Pei-Hao (Eddy) Su  (Copyright CUED Dialogue Systems Group 2016)
-
-.. seealso:: CUED Imports/Dependencies: 
-
-    import :class:`Policy`
-    import :class:`utils.ContextLogger`
-
-.. warning::
-        Documentation not done.
-
-
-************************
-
-'''
-
-import copy
-import os
-import json
-import numpy as np
-import pickle as pickle
-import random
-import sys
-import utils
-from utils.Settings import config as cfg
-from utils import ContextLogger, DiaAct, DialogueState
-
-import ontology.FlatOntologyManager as FlatOnt
-# from theano_dialogue.util.tool import *
-
-import tensorflow as tf
-from policy.DRL.replay_bufferVanilla import ReplayBuffer
-from policy.DRL.replay_prioritisedVanilla import ReplayPrioritised
-import policy.DRL.utils as drlutils
-from policy.DRL import bdqn as bbqn
-import policy.Policy
-import policy.SummaryAction
-import policy.BBQNPolicy
-from policy.Policy import TerminalAction, TerminalState
-from policy.feudalRL.DIP_parametrisation import DIP_state, padded_state
-
-logger = utils.ContextLogger.getLogger('')
-
-# --- for flattening the belief --- # 
-domainUtil = FlatOnt.FlatDomainOntology('CamRestaurants')
-
-
-class FeudalBBQNPolicy(policy.BBQNPolicy.BBQNPolicy):
-    '''Derived from :class:`BBQNPolicy`
-    '''
-    def __init__(self, in_policy_file, out_policy_file, domainString='CamRestaurants', is_training=False,
-                 action_names=None, slot=None):
-        super(FeudalBBQNPolicy, self).__init__(in_policy_file, out_policy_file, domainString, is_training)
-
-        tf.reset_default_graph()
-
-        self.domainString = domainString
-        self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString)
-        self.in_policy_file = in_policy_file
-        self.out_policy_file = out_policy_file
-        self.is_training = is_training
-        self.accum_belief = []
-
-        self.prev_state_check = None
-
-        self.episode_ave_max_q = []
-
-        self.capacity *= 4  # capacity is set for episode-based methods; scale it up for turn-based updates
-        self.slot = slot
-
-        # init session
-        self.sess = tf.Session()
-        with tf.device("/cpu:0"):
-
-            np.random.seed(self.randomseed)
-            tf.set_random_seed(self.randomseed)
-
-            # initialise a replay buffer
-            if self.replay_type == 'vanilla':
-                self.episodes[self.domainString] = ReplayBuffer(self.capacity, self.minibatch_size, self.randomseed)
-            elif self.replay_type == 'prioritized':
-                self.episodes[self.domainString] = ReplayPrioritised(self.capacity, self.minibatch_size,
-                                                                     self.randomseed)
-            # replay_buffer = ReplayBuffer(self.capacity, self.randomseed)
-            # self.episodes = []
-            self.samplecount = 0
-            self.episodecount = 0
-
-            # construct the models
-            self.state_dim = 89  # current DIP state dim
-            self.summaryaction = policy.SummaryAction.SummaryAction(domainString)
-            self.action_names = action_names
-            self.action_dim = len(self.action_names)
-            action_bound = len(self.action_names)
-            self.stats = [0 for _ in range(self.action_dim)]
-            self.stdVar = []
-            self.meanVar = []
-            self.stdMean = []
-            self.meanMean = []
-            self.td_error = []
-            self.td_errorVar = []
-
-            self.target_update_freq = 1
-            if cfg.has_option('bbqnpolicy', 'target_update_freq'):
-                self.target_update_freq = cfg.getint('bbqnpolicy', 'target_update_freq')
-
-            #feudal params
-            self.features = 'dip'
-            self.sd_enc_size = 25
-            self.si_enc_size = 50
-            self.dropout_rate = 0.
-            if cfg.has_option('feudalpolicy', 'features'):
-                self.features = cfg.get('feudalpolicy', 'features')
-            if cfg.has_option('feudalpolicy', 'sd_enc_size'):
-                self.sd_enc_size = cfg.getint('feudalpolicy', 'sd_enc_size')
-            if cfg.has_option('feudalpolicy', 'si_enc_size'):
-                self.si_enc_size = cfg.getint('feudalpolicy', 'si_enc_size')
-            if cfg.has_option('feudalpolicy', 'dropout_rate') and self.is_training:
-                self.dropout_rate = cfg.getfloat('feudalpolicy', 'dropout_rate')
-            self.actfreq_ds = False
-            if cfg.has_option('feudalpolicy', 'actfreq_ds'):
-                self.actfreq_ds = cfg.getboolean('feudalpolicy', 'actfreq_ds')
-
-            if self.features == 'dip':
-                if self.actfreq_ds:
-                    if self.domainString == 'CamRestaurants':
-                        self.state_dim += 16
-                    elif self.domainString == 'SFRestaurants':
-                        self.state_dim += 25
-                    elif self.domainString == 'Laptops11':
-                        self.state_dim += 40
-
-                self.bbqn = bbqn.DeepQNetwork(self.sess, self.state_dim, self.action_dim, self.learning_rate, self.tau,
-                                              action_bound, self.architecture, self.h1_size, self.h2_size,
-                                              self.n_samples,
-                                              self.minibatch_size, self.sigma_prior, self.n_batches, self.stddev_var_mu,
-                                              self.stddev_var_logsigma, self.mean_log_sigma, self.importance_sampling,
-                                              self.alpha_divergence, self.alpha, self.sigma_eps)
-            elif self.features == 'learned' or self.features == 'rnn':
-                si_state_dim = 72
-                if self.actfreq_ds:
-                    if self.domainString == 'CamRestaurants':
-                        si_state_dim += 16
-                    elif self.domainString == 'SFRestaurants':
-                        si_state_dim += 25
-                    elif self.domainString == 'Laptops11':
-                        si_state_dim += 40
-                if self.domainString == 'CamRestaurants':
-                    sd_state_dim = 94
-                elif self.domainString == 'SFRestaurants':
-                    sd_state_dim = 158
-                elif self.domainString == 'Laptops11':
-                    sd_state_dim = 13
-                else:
-                    logger.error(
-                        'Domain {} not implemented in feudal-DQN yet'.format(self.domainString))  # determine sd_state_dim for the new domain first
-                if self.features == 'rnn':
-                    arch = 'rnn'
-                    self.state_dim = si_state_dim + sd_state_dim
-                    self.bbqn = bbqn.RNNBBQNetwork(self.sess, si_state_dim, sd_state_dim, self.action_dim, self.learning_rate,
-                                                  self.tau, action_bound, arch, self.h1_size, self.h2_size, self.n_samples,
-                                                  self.minibatch_size, self.sigma_prior, self.n_batches, self.stddev_var_mu,
-                                                  self.stddev_var_logsigma, self.mean_log_sigma, self.importance_sampling,
-                                                  self.alpha_divergence, self.alpha, self.sigma_eps, sd_enc_size=self.sd_enc_size,
-                                                   si_enc_size=self.sd_enc_size, dropout_rate=self.dropout_rate, slot=slot)
-                else:
-                    arch = 'vanilla'
-                    self.state_dim = si_state_dim + sd_state_dim
-                    self.bbqn = bbqn.NNBBQNetwork(self.sess, si_state_dim, sd_state_dim, self.action_dim, self.learning_rate,
-                                                  self.tau, action_bound, arch, self.h1_size, self.h2_size, self.n_samples,
-                                                  self.minibatch_size, self.sigma_prior, self.n_batches, self.stddev_var_mu,
-                                                  self.stddev_var_logsigma, self.mean_log_sigma, self.importance_sampling,
-                                                  self.alpha_divergence, self.alpha, self.sigma_eps, sd_enc_size=self.sd_enc_size,
-                                                   si_enc_size=self.sd_enc_size, dropout_rate=self.dropout_rate, slot=slot)
-            else:
-                logger.error('features "{}" not implemented'.format(self.features))
-
-
-
-            # when all models are defined, init all variables
-            init_op = tf.global_variables_initializer()
-            self.sess.run(init_op)
-
-            self.loadPolicy(self.in_policy_file)
-            print('loaded replay size: ', self.episodes[self.domainString].size())
-
-            self.bbqn.update_target_network()
-
-    def record(self, reward, domainInControl=None, weight=None, state=None, action=None, exec_mask=None):
-        if domainInControl is None:
-            domainInControl = self.domainString
-        if self.actToBeRecorded is None:
-            # self.actToBeRecorded = self.lastSystemAction
-            self.actToBeRecorded = self.summaryAct
-
-        if state is None:
-            state = self.prevbelief
-        if action is None:
-            action = self.actToBeRecorded
-
-        cState, cAction = state, action
-
-        reward /= 20.0
-
-        cur_cState = np.vstack([np.expand_dims(x, 0) for x in [cState]])
-        cur_action_q = self.bbqn.predict(cur_cState)
-        cur_target_q = self.bbqn.predict_target(cur_cState)
-
-        if exec_mask is not None:
-            admissible = np.add(cur_target_q, np.array(exec_mask))
-        else:
-            admissible = cur_target_q
-
-        Q_s_t_a_t_ = cur_action_q[0][cAction]
-        gamma_Q_s_tplu1_maxa_ = self.gamma * np.max(admissible)
-
-        if weight is None:
-            if self.replay_type == 'vanilla':
-                self.episodes[domainInControl].record(state=cState, \
-                                                      state_ori=state, action=cAction, reward=reward)
-            elif self.replay_type == 'prioritized':
-                # heuristically assign 0.0 to Q_s_t_a_t_ and Q_s_tplu1_maxa_, doesn't matter as it is not used
-                if True:
-                    # if self.samplecount >= self.capacity:
-                    self.episodes[domainInControl].record(state=cState, \
-                                                          state_ori=state, action=cAction, reward=reward, \
-                                                          Q_s_t_a_t_=Q_s_t_a_t_,
-                                                          gamma_Q_s_tplu1_maxa_=gamma_Q_s_tplu1_maxa_, uniform=False)
-                else:
-                    self.episodes[domainInControl].record(state=cState, \
-                                                          state_ori=state, action=cAction, reward=reward, \
-                                                          Q_s_t_a_t_=Q_s_t_a_t_,
-                                                          gamma_Q_s_tplu1_maxa_=gamma_Q_s_tplu1_maxa_, uniform=True)
-
-        else:
-            self.episodes[domainInControl].record(state=cState, state_ori=state, action=cAction, reward=reward,
-                                                  ma_weight=weight)
-
-        self.actToBeRecorded = None
-        self.samplecount += 1
-        return
-
-    def finalizeRecord(self, reward, domainInControl=None):
-        if domainInControl is None:
-            domainInControl = self.domainString
-        if self.episodes[domainInControl] is None:
-            logger.warning("record attempted to be finalized for domain where nothing has been recorded before")
-            return
-
-        # normalising total return to -1~1
-        # if reward == 0:
-        #    reward = -20.0
-        reward /= 20.0
-        """
-        if reward == 20.0:
-            reward = 1.0
-        else:
-            reward = -0.5
-        """
-        # reward = float(reward+10.0)/40.0
-
-        terminal_state, terminal_action = self.convertStateAction(TerminalState(), TerminalAction())
-
-        if self.replay_type == 'vanilla':
-            self.episodes[domainInControl].record(state=terminal_state, \
-                                                  state_ori=TerminalState(), action=terminal_action, reward=reward,
-                                                  terminal=True)
-        elif self.replay_type == 'prioritized':
-            # heuristically assign 0.0 to Q_s_t_a_t_ and Q_s_tplu1_maxa_, doesn't matter as it is not used
-            if True:
-                # if self.samplecount >= self.capacity:
-                self.episodes[domainInControl].record(state=terminal_state, \
-                                                      state_ori=TerminalState(), action=terminal_action, reward=reward, \
-                                                      Q_s_t_a_t_=0.0, gamma_Q_s_tplu1_maxa_=0.0, uniform=False,
-                                                      terminal=True)
-            else:
-                self.episodes[domainInControl].record(state=terminal_state, \
-                                                      state_ori=TerminalState(), action=terminal_action, reward=reward, \
-                                                      Q_s_t_a_t_=0.0, gamma_Q_s_tplu1_maxa_=0.0, uniform=True,
-                                                      terminal=True)
-
-    def convertStateAction(self, state, action):
-        '''
-
-        '''
-        if isinstance(state, TerminalState):
-            return [0] * 89, action
-
-        else:
-            if self.features == 'learned' or self.features == 'rnn':
-                dip_state = padded_state(state.domainStates[state.currentdomain], self.domainString)
-            else:
-                dip_state = DIP_state(state.domainStates[state.currentdomain], self.domainString)
-            action_name = self.actions.action_names[action]
-            act_slot = 'general'
-            for slot in dip_state.slots:
-                if slot in action_name:
-                    act_slot = slot
-            flat_belief = dip_state.get_beliefStateVec(act_slot)
-            self.prev_state_check = flat_belief
-
-            return flat_belief, action
-
-    def nextAction(self, beliefstate):
-        '''
-        select next action
-
-        :param beliefstate:
-        :param hyps:
-        :returns: (int) next summary action
-        '''
-
-        if self.exploration_type == 'e-greedy':
-            # epsilon greedy
-            if self.is_training and utils.Settings.random.rand() < self.epsilon:
-                action_Q = np.random.rand(len(self.action_names))
-            else:
-                action_Q = self.bbqn.predict(np.reshape(beliefstate, (1, len(beliefstate))))  # + (1. / (1. + i + j))
-
-                self.episode_ave_max_q.append(np.max(action_Q))
-
-        # return the Q vect, the action will be converted in the feudal policy
-        return action_Q
-
-
-    def train(self):
-        '''
-        call this function when the episode ends
-        '''
-
-        if not self.is_training:
-            logger.info("Not in training mode")
-            return
-        else:
-            logger.info("Update dqn policy parameters.")
-
-        self.episodecount += 1
-        logger.info("Sample Num so far: %s" % (self.samplecount))
-        logger.info("Episode Num so far: %s" % (self.episodecount))
-
-        if self.samplecount >= self.minibatch_size * 10 and self.episodecount % self.training_frequency == 0:
-            logger.info('start training...')
-
-            s_batch, s_ori_batch, a_batch, r_batch, s2_batch, s2_ori_batch, t_batch, idx_batch, _ = \
-                self.episodes[self.domainString].sample_batch()
-
-            s_batch = np.vstack([np.expand_dims(x, 0) for x in s_batch])
-            s2_batch = np.vstack([np.expand_dims(x, 0) for x in s2_batch])
-
-            a_batch_one_hot = np.eye(self.action_dim, self.action_dim)[a_batch]
-            action_q = self.bbqn.predict_dip(s2_batch, a_batch_one_hot)
-            target_q = self.bbqn.predict_target_dip(s2_batch, a_batch_one_hot)
-            # print 'action Q and target Q:', action_q, target_q
-
-            y_i = []
-            for k in range(min(self.minibatch_size, self.episodes[self.domainString].size())):
-                Q_bootstrap_label = 0
-                if t_batch[k]:
-                    Q_bootstrap_label = r_batch[k]
-                else:
-                    if self.q_update == 'single':
-                        belief = s2_ori_batch[k]
-                        execMask = [0.0] * len(self.action_names)  # TODO: find out how to compute the mask here, or save it when recording the state
-                        execMask[-1] = -sys.maxsize
-                        action_Q = target_q[k]
-                        admissible = np.add(action_Q, np.array(execMask))
-                        Q_bootstrap_label = r_batch[k] + self.gamma * np.max(admissible)
-
-                y_i.append(Q_bootstrap_label)
-
-            # Update the critic given the targets
-            reshaped_yi = np.vstack([np.expand_dims(x, 0) for x in y_i])
-
-            predicted_q_value, _, currentLoss, logLikelihood, varFC2, meanFC2, td_error, KL_div = self.bbqn.train(s_batch, a_batch_one_hot, reshaped_yi, self.episodecount)
-
-        if self.episodecount % self.target_update_freq == 0:
-            self.bbqn.update_target_network()
-        if self.episodecount % self.save_step == 0:
-            self.savePolicyInc()  # self.out_policy_file)
-
-
-# END OF FILE
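The e-greedy branches in these policies draw on an epsilon that the restart() methods anneal linearly with the episode count; a small sketch of that schedule, with illustrative parameter values:

def linear_epsilon(episode, epsilon_start=0.9, epsilon_end=0.0, maxiter=4000):
    """Linear annealing from epsilon_start towards epsilon_end over maxiter
    episodes, mirroring the expression used in the restart() methods above."""
    return epsilon_start - (epsilon_start - epsilon_end) * float(episode) / float(maxiter)

eps_values = [linear_epsilon(e) for e in (0, 1000, 4000)]   # 0.9, 0.675, 0.0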
diff --git a/policy/feudalgainRL/FeudalBBQNPolicyNew.py b/policy/feudalgainRL/FeudalBBQNPolicyNew.py
deleted file mode 100644
index 6d35709818b1fdeed16592c3bcc0cafb9a21c727..0000000000000000000000000000000000000000
--- a/policy/feudalgainRL/FeudalBBQNPolicyNew.py
+++ /dev/null
@@ -1,416 +0,0 @@
-###############################################################################
-# PyDial: Multi-domain Statistical Spoken Dialogue System Software
-###############################################################################
-#
-# Copyright 2015 - 2019
-# Cambridge University Engineering Department Dialogue Systems Group
-#
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-###############################################################################
-
-'''
-FeudalBBQNPolicyNew.py - feudal deep Q network policy
-======================================================
-
-Copyright CUED Dialogue Systems Group 2015 - 2017
-
-.. seealso:: CUED Imports/Dependencies:
-
-    import :class:`Policy`
-    import :class:`utils.ContextLogger`
-
-.. warning::
-        Documentation not done.
-
-
-************************
-
-'''
-
-import copy
-import os
-import sys
-import json
-import numpy as np
-import pickle as pickle
-from itertools import product
-from scipy.stats import entropy
-import utils
-from utils.Settings import config as cfg
-from utils import ContextLogger, DiaAct, DialogueState
-
-import ontology.FlatOntologyManager as FlatOnt
-import tensorflow as tf
-from policy.DRL.replay_buffer import ReplayBuffer
-from policy.DRL.replay_prioritised import ReplayPrioritised
-import policy.DRL.utils as drlutils
-import policy.DRL.dqn as dqn
-import policy.Policy
-import policy.DQNPolicy
-import policy.SummaryAction
-from policy.Policy import TerminalAction, TerminalState
-from policy.feudalRL.DIP_parametrisation import DIP_state, padded_state
-from policy.feudalRL.feudalUtils import get_feudal_masks
-from policy.DRL import bdqn as bbqn
-
-
-logger = utils.ContextLogger.getLogger('')
-
-
-class FeudalDQNPolicy(policy.DQNPolicy.DQNPolicy):
-    '''Derived from :class:`DQNPolicy`
-    '''
-
-    def __init__(self, in_policy_file, out_policy_file, domainString='CamRestaurants', is_training=False,
-                 action_names=None, slot=None, sd_state_dim=50, js_threshold=0, info_reward=0.0, jsd_reward=False,
-                 jsd_function=None):
-        super(FeudalDQNPolicy, self).__init__(in_policy_file, out_policy_file, domainString, is_training)
-
-        tf.reset_default_graph()
-
-        self.domainString = domainString
-        self.sd_state_dim = sd_state_dim
-        self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString)
-        self.in_policy_file = in_policy_file
-        self.out_policy_file = out_policy_file
-        self.is_training = is_training
-        self.accum_belief = []
-        self.info_reward = info_reward
-        self.js_threshold = js_threshold
-        self.jsd_reward = jsd_reward
-        self.jsd_function = jsd_function
-        if self.jsd_function is not None:
-            print("We use the JSD-function", self.jsd_function)
-        if self.js_threshold != 1.0 and not self.jsd_reward:
-            print("We use JS-divergence, threshold =", self.js_threshold)
-        if self.jsd_reward:
-            print("We train with raw JSD reward.")
-        self.slot = slot
-        self.features = 'dip'
-        if cfg.has_option('feudalpolicy', 'features'):
-            self.features = cfg.get('feudalpolicy', 'features')
-        self.actfreq_ds = False
-        if cfg.has_option('feudalpolicy', 'actfreq_ds'):
-            self.actfreq_ds = cfg.getboolean('feudalpolicy', 'actfreq_ds')
-
-        self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString)
-        self.prev_state_check = None
-
-        self.max_k = 5
-        if cfg.has_option('dqnpolicy', 'max_k'):
-            self.max_k = cfg.getint('dqnpolicy', 'max_k')
-
-        self.capacity *= 5  # capacity for episode methods, multiply it to adjust to turn based methods
-
-        # init session
-        self.sess = tf.Session()
-        with tf.device("/cpu:0"):
-
-            np.random.seed(self.randomseed)
-            tf.set_random_seed(self.randomseed)
-
-            # initialise a replay buffer
-            if self.replay_type == 'vanilla':
-                self.episodes[self.domainString] = ReplayBuffer(self.capacity, self.minibatch_size*4, self.randomseed)
-            elif self.replay_type == 'prioritized':
-                self.episodes[self.domainString] = ReplayPrioritised(self.capacity, self.minibatch_size,
-                                                                     self.randomseed)
-            self.samplecount = 0
-            self.episodecount = 0
-
-            # construct the models
-            self.summaryaction = policy.SummaryAction.SummaryAction(domainString)
-            self.action_names = action_names
-            self.action_dim = len(self.action_names)
-            action_bound = len(self.action_names)
-            self.stats = [0 for _ in range(self.action_dim)]
-
-            if self.features == 'learned' or self.features == 'rnn':
-                si_state_dim = 73
-                if self.actfreq_ds:
-                    if self.domainString == 'CamRestaurants':
-                        si_state_dim += 9#16
-                    elif self.domainString == 'SFRestaurants':
-                        si_state_dim += 9#25
-                    elif self.domainString == 'Laptops11':
-                        si_state_dim += 9#40
-                self.sd_enc_size = 50
-                self.si_enc_size = 25
-                self.dropout_rate = 0.
-                if cfg.has_option('feudalpolicy', 'sd_enc_size'):
-                    self.sd_enc_size = cfg.getint('feudalpolicy', 'sd_enc_size')
-                if cfg.has_option('feudalpolicy', 'si_enc_size'):
-                    self.si_enc_size = cfg.getint('feudalpolicy', 'si_enc_size')
-                if cfg.has_option('feudalpolicy', 'dropout_rate') and self.is_training:
-                    self.dropout_rate = cfg.getfloat('feudalpolicy', 'dropout_rate')
-
-                self.state_dim = si_state_dim + sd_state_dim
-                if self.features == 'learned':
-
-                    self.dqn = bbqn.DeepQNetwork(self.sess, self.state_dim, self.action_dim, self.learning_rate,
-                                                  self.tau,
-                                                  action_bound, self.architecture, self.h1_size, self.h2_size,
-                                                  self.n_samples,
-                                                  self.minibatch_size)
-
-                elif self.features == 'rnn':
-                    self.dqn = dqn.RNNFDeepQNetwork(self.sess, si_state_dim, sd_state_dim, self.action_dim,
-                                                   self.learning_rate, self.tau, action_bound, self.minibatch_size,
-                                                   self.architecture, self.h1_size, self.h2_size,
-                                                   sd_enc_size=self.sd_enc_size, si_enc_size=self.si_enc_size,
-                                                   dropout_rate=self.dropout_rate, slot=self.slot)
-            else: # self.features = 'dip'
-                if self.actfreq_ds:
-                    if self.domainString == 'CamRestaurants':
-                        self.state_dim += 9#16
-                    elif self.domainString == 'SFRestaurants':
-                        self.state_dim += 9#25
-                    elif self.domainString == 'Laptops11':
-                        self.state_dim += 9#40
-                self.dqn = dqn.DeepQNetwork(self.sess, self.state_dim, self.action_dim,
-                                            self.learning_rate, self.tau, action_bound, self.minibatch_size,
-                                            self.architecture, self.h1_size,
-                                            self.h2_size, dropout_rate=self.dropout_rate)
-
-            # when all models are defined, init all variables (this might to be sent to the main policy too)
-            init_op = tf.global_variables_initializer()
-            self.sess.run(init_op)
-
-            self.loadPolicy(self.in_policy_file)
-            print('loaded replay size: ', self.episodes[self.domainString].size())
-
-            self.dqn.update_target_network()
-
-    def record(self, reward, domainInControl=None, weight=None, state=None, action=None, exec_mask=None):
-        if domainInControl is None:
-            domainInControl = self.domainString
-        if self.actToBeRecorded is None:
-            self.actToBeRecorded = self.summaryAct
-
-        if state is None:
-            state = self.prevbelief
-        if action is None:
-            action = self.actToBeRecorded
-
-        cState, cAction = state, action
-        # normalising total return to -1~1
-        reward /= 20.0
-
-        if self.replay_type == 'vanilla':
-            self.episodes[domainInControl].record(state=cState, \
-                                                  state_ori=state, action=cAction, reward=reward)
-
-        self.actToBeRecorded = None
-        self.samplecount += 1
-
-    def finalizeRecord(self, reward, domainInControl=None):
-        if domainInControl is None:
-            domainInControl = self.domainString
-        if self.episodes[domainInControl] is None:
-            logger.warning("record attempted to be finalized for domain where nothing has been recorded before")
-            return
-
-        reward /= 20.0
-
-        terminal_state, terminal_action = self.convertStateAction(TerminalState(), TerminalAction())
-
-        if self.replay_type == 'vanilla':
-            self.episodes[domainInControl].record(state=terminal_state, \
-                                                  state_ori=TerminalState(), action=terminal_action, reward=reward,
-                                                  terminal=True)
-        elif self.replay_type == 'prioritized':
-            self.episodes[domainInControl].record(state=terminal_state, \
-                                                      state_ori=TerminalState(), action=terminal_action, reward=reward, \
-                                                      Q_s_t_a_t_=0.0, gamma_Q_s_tplu1_maxa_=0.0, uniform=False,
-                                                      terminal=True)
-            print('total TD', self.episodes[self.domainString].tree.total())
-
-    def convertStateAction(self, state, action):
-        '''
-
-        '''
-        if isinstance(state, TerminalState):
-            return [0] * 89, action
-        else:
-            if self.features == 'learned' or self.features == 'rnn':
-                dip_state = padded_state(state.domainStates[state.currentdomain], self.domainString)
-            else:
-                dip_state = DIP_state(state.domainStates[state.currentdomain], self.domainString)
-            action_name = self.actions.action_names[action]
-            act_slot = 'general'
-            for slot in dip_state.slots:
-                if slot in action_name:
-                    act_slot = slot
-            flat_belief = dip_state.get_beliefStateVec(act_slot)
-            self.prev_state_check = flat_belief
-
-            return flat_belief, action
-
-    def nextAction(self, beliefstate):
-        '''
-        select next action
-
-        :param beliefstate: already converted to dipstatevec of the specific slot (or general)
-        :returns: (int) next summary action
-        '''
-
-        if self.exploration_type == 'e-greedy':
-            # epsilon greedy
-            if self.is_training and utils.Settings.random.rand() < self.epsilon:
-                action_Q = np.random.rand(len(self.action_names))
-            else:
-                if len(beliefstate.shape) == 1:
-                    action_Q = self.dqn.predict(np.reshape(beliefstate, (1, -1)))
-                else:
-                    action_Q = self.dqn.predict(beliefstate)
-                # add current max Q to self.episode_ave_max_q
-                self.episode_ave_max_q.append(np.max(action_Q))
-
-        #return the Q vect, the action will be converted in the feudal policy
-        return action_Q
-
-    def train(self):
-        '''
-        call this function when the episode ends
-        '''
-
-        if not self.is_training:
-            logger.info("Not in training mode")
-            return
-        else:
-            logger.info("Update dqn policy parameters.")
-
-        self.episodecount += 1
-        logger.info("Sample Num so far: %s" % (self.samplecount))
-        logger.info("Episode Num so far: %s" % (self.episodecount))
-
-        s_batch_new, s_batch_beliefstate, s_batch_chosen_slot, s2_batch_dipstate, s2_batch_beliefstate, t_batch_new, r_batch_new = \
-            [], [], [], [], [], [], []
-
-        if self.samplecount >= self.minibatch_size * 8 and self.episodecount % self.training_frequency == 0:
-            logger.info('start training...')
-
-            a_batch_one_hot_new = None
-            #updating only states where the action is not "pass()" complicates things :/
-            #since in a batch we can take only non-pass() actions, we have to loop a bit until we get enough samples
-
-            while len(s_batch_new) < self.minibatch_size:
-
-                s_batch, s_ori_batch, a_batch, r_batch, s2_batch, s2_ori_batch, t_batch, idx_batch, _ = \
-                    self.episodes[self.domainString].sample_batch()
-
-                a_batch_one_hot = np.eye(self.action_dim, self.action_dim)[a_batch]
-                #we only wanna update state-action pairs, where action != pass()
-                valid_steps = [action[-1] != 1 for action in a_batch_one_hot]
-                a_batch_one_hot = a_batch_one_hot[valid_steps]
-
-                s_batch_new += [s[0] for i, s in enumerate(s_batch) if valid_steps[i]]
-                s_batch_beliefstate += [s[1] for i, s in enumerate(s_batch) if valid_steps[i]]
-                s_batch_chosen_slot += [s[2] for i, s in enumerate(s_batch) if valid_steps[i]]
-
-                s2_batch_dipstate += [s[3] for s, valid in zip(s2_batch, valid_steps) if valid]
-                s2_batch_beliefstate += [s[1] for s, valid in zip(s2_batch, valid_steps) if valid]
-
-                r_batch_new += [r for r, valid in zip(r_batch, valid_steps) if valid]
-                t_batch_new += [t for t, valid in zip(t_batch, valid_steps) if valid]
-
-                if a_batch_one_hot_new is None:
-                    a_batch_one_hot_new = a_batch_one_hot
-                else:
-                    a_batch_one_hot_new = np.vstack((a_batch_one_hot_new, a_batch_one_hot))
-
-            s_batch_new = np.vstack(s_batch_new)
-            s2_batch_dipstate = np.vstack(s2_batch_dipstate)
-
-            if self.js_threshold < 1.0 or self.jsd_reward:
-                #TODO: This is highly inefficient
-                js_divergence_batch = []
-                for belief, belief2, slot in zip(s_batch_beliefstate, s2_batch_beliefstate, s_batch_chosen_slot):
-                    if slot != "None":
-                        keys = belief['beliefs'][slot].keys()
-
-                        b = [belief['beliefs'][slot]['**NONE**']] + \
-                            [belief['beliefs'][slot][value] for value in list(keys) if value != '**NONE**']
-
-                        b_2 = [belief2['beliefs'][slot]['**NONE**']] + \
-                              [belief2['beliefs'][slot][value] for value in list(keys) if value != '**NONE**']
-
-                        js_divergence = self.compute_js_divergence(b, b_2)
-                        js_divergence_batch.append(js_divergence)
-                    else:
-                        js_divergence_batch.append(0.0)
-            else:
-                js_divergence_batch = [0] * len(r_batch_new)
-
-            tanh_n = np.tanh(1)
-            if self.jsd_reward:
-                if self.jsd_function == 'tanh':
-                    js_divergence_batch = np.tanh(np.array(js_divergence_batch)) / tanh_n
-                #normalize jsd between -1 and 1
-                js_divergence_batch = (-1 + 2 * np.array(js_divergence_batch)).tolist()
-            elif self.js_threshold < 1.0:
-                # normalizing bound to [0, 2] and then /20
-                js_divergence_batch = [2/20 * int(x > self.js_threshold) for x in js_divergence_batch]
-
-            action_q = self.dqn.predict_dip(s2_batch_dipstate, a_batch_one_hot_new)
-            target_q = self.dqn.predict_target_dip(s2_batch_dipstate, a_batch_one_hot_new)
-
-            action_q = np.reshape(action_q, (s_batch_new.shape[0], -1, self.action_dim))
-            target_q = np.reshape(target_q, (s_batch_new.shape[0], -1, self.action_dim))
-
-            y_i = []
-            for k in range(min(s_batch_new.shape[0], self.episodes[self.domainString].size())):
-                Q_bootstrap_label = 0
-                if t_batch_new[k]:
-                    Q_bootstrap_label = r_batch_new[k]
-                else:
-                    if self.q_update == 'single':
-                        action_Q = target_q[k]
-                        if self.jsd_reward:
-                            Q_bootstrap_label = js_divergence_batch[k] + self.gamma * np.max(action_Q)
-                        else:
-                            Q_bootstrap_label = r_batch_new[k] + js_divergence_batch[k] + self.gamma * np.max(action_Q)
-                    elif self.q_update == 'double':
-                        action_Q = action_q[k]
-                        argmax_tuple = np.unravel_index(np.argmax(action_Q, axis=None), action_Q.shape)
-                        value_Q = target_q[k][argmax_tuple]
-                        Q_bootstrap_label = r_batch_new[k] + js_divergence_batch[k] + self.gamma * value_Q
-                y_i.append(Q_bootstrap_label)
-
-                if self.replay_type == 'prioritized':
-                    # update the sum-tree
-                    # update the TD error of the samples in the minibatch
-                    currentQ_s_a_ = action_q[k][a_batch[k]]
-                    error = abs(currentQ_s_a_ - Q_bootstrap_label)
-                    self.episodes[self.domainString].update(idx_batch[k], error)
-
-            reshaped_yi = np.vstack([np.expand_dims(x, 0) for x in y_i])
-
-            predicted_q_value, _, currentLoss = self.dqn.train(s_batch_new, a_batch_one_hot_new, reshaped_yi)
-
-            if self.episodecount % 1 == 0:
-                # Update target networks
-                self.dqn.update_target_network()
-
-        self.savePolicyInc()
-
-    def compute_js_divergence(self, P, Q):
-
-        M = [p + q for p, q in zip(P, Q)]
-        return 0.5 * (entropy(P, M, base=2) + entropy(Q, M, base=2))
-
-# END OF FILE
diff --git a/policy/feudalgainRL/FeudalENACPolicy.py b/policy/feudalgainRL/FeudalENACPolicy.py
deleted file mode 100644
index 216c90e3120f66aa13e49ca2f3db4204711b442a..0000000000000000000000000000000000000000
--- a/policy/feudalgainRL/FeudalENACPolicy.py
+++ /dev/null
@@ -1,514 +0,0 @@
-###############################################################################
-# PyDial: Multi-domain Statistical Spoken Dialogue System Software
-###############################################################################
-#
-# Copyright 2015 - 2019
-# Cambridge University Engineering Department Dialogue Systems Group
-#
-# 
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-###############################################################################
-
-'''
-ENACPolicy.py - Advantage Actor-Critic policy
-==================================================
-
-Copyright CUED Dialogue Systems Group 2015 - 2017
-
-.. seealso:: CUED Imports/Dependencies: 
-
-    import :class:`Policy`
-    import :class:`utils.ContextLogger`
-
-.. warning::
-        Documentation not done.
-
-
-************************
-
-'''
-
-import copy
-import os
-import json
-import numpy as np
-import scipy
-import scipy.signal
-import pickle as pickle
-import random
-import utils
-from utils.Settings import config as cfg
-from utils import ContextLogger, DiaAct
-
-import ontology.FlatOntologyManager as FlatOnt
-#from theano_dialogue.util.tool import *
-
-import tensorflow as tf
-from policy.DRL.replay_buffer_episode_enac import ReplayBufferEpisode
-from policy.DRL.replay_prioritised_episode import ReplayPrioritisedEpisode
-import policy.DRL.utils as drlutils
-import policy.DRL.enac as enac
-import policy.Policy
-from policy.ENACPolicy import ENACPolicy
-import policy.SummaryAction
-from policy.Policy import TerminalAction, TerminalState
-from policy.feudalRL.DIP_parametrisation import DIP_state, padded_state
-from policy.feudalRL.feudalUtils import get_feudal_masks
-
-logger = utils.ContextLogger.getLogger('')
-
-
-# Discounting function used to calculate discounted returns.
-def discount(x, gamma):
-    return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]
-
-class FeudalENACPolicy(ENACPolicy):
-    '''Derived from :class:`Policy`
-    '''
-    def __init__(self, in_policy_file, out_policy_file, domainString='CamRestaurants', is_training=False, action_names=None, slot=None):
-        super(FeudalENACPolicy, self).__init__(in_policy_file, out_policy_file, domainString=domainString, is_training=is_training)
-
-        tf.reset_default_graph()
-
-        self.in_policy_file = in_policy_file
-        self.out_policy_file = out_policy_file
-        self.is_training = is_training
-        self.accum_belief = []
-        self.prev_state_check = None
-
-        self.domainString = domainString
-        self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString)
-
-        self.features = 'dip'
-        self.sd_enc_size = 80
-        self.si_enc_size = 40
-        self.dropout_rate = 0.
-        if cfg.has_option('feudalpolicy', 'features'):
-            self.features = cfg.get('feudalpolicy', 'features')
-        if cfg.has_option('feudalpolicy', 'sd_enc_size'):
-            self.sd_enc_size = cfg.getint('feudalpolicy', 'sd_enc_size')
-        if cfg.has_option('feudalpolicy', 'si_enc_size'):
-            self.si_enc_size = cfg.getint('feudalpolicy', 'si_enc_size')
-        if cfg.has_option('dqnpolicy', 'dropout_rate') and self.is_training:
-            self.dropout_rate = cfg.getfloat('feudalpolicy', 'dropout_rate')
-        if cfg.has_option('dqnpolicy', 'dropout_rate') and self.is_training:
-            self.dropout_rate = cfg.getfloat('feudalpolicy', 'dropout_rate')
-
-
-        # init session
-        self.sess = tf.Session()
-        with tf.device("/cpu:0"):
-
-            np.random.seed(self.randomseed)
-            tf.set_random_seed(self.randomseed)
-
-            # initialise an replay buffer
-            if self.replay_type == 'vanilla':
-                self.episodes[self.domainString] = ReplayBufferEpisode(self.capacity, self.minibatch_size, self.randomseed)
-            elif self.replay_type == 'prioritized':
-                self.episodes[self.domainString] = ReplayPrioritisedEpisode(self.capacity, self.minibatch_size, self.randomseed)
-            #replay_buffer = ReplayBuffer(self.capacity, self.randomseed)
-            #self.episodes = []
-            self.samplecount = 0
-            self.episodecount = 0
-
-            # construct the models
-            self.state_dim = 89  # current DIP state dim
-            self.summaryaction = policy.SummaryAction.SummaryAction(domainString)
-            self.action_names = action_names
-            self.action_dim = len(self.action_names)
-            action_bound = len(self.action_names)
-            self.stats = [0 for _ in range(self.action_dim)]
-
-            if self.features == 'dip':
-                self.enac = enac.ENACNetwork(self.sess, self.state_dim, self.action_dim, self.critic_lr, self.tau,
-                                         action_bound, self.architecture, self.h1_size, self.h2_size, self.is_training)
-            elif self.features == 'learned' or self.features == 'rnn':
-                si_state_dim = 72
-                if self.domainString == 'CamRestaurants':
-                    sd_state_dim = 94
-                elif self.domainString == 'SFRestaurants':
-                    sd_state_dim = 158
-                elif self.domainString == 'Laptops11':
-                    sd_state_dim = 13
-                else:
-                    logger.error(
-                        'Domain {} not implemented in feudal-DQN yet')  # just find out the size of sd_state_dim for the new domain
-                if self.features == 'rnn':
-                    arch = 'rnn'
-                else:
-                    arch = 'vanilla'
-                self.state_dim = si_state_dim + sd_state_dim
-                self.enac = enac.ENACNetwork(self.sess, self.state_dim, self.action_dim, self.critic_lr, self.tau,
-                                             action_bound, self.architecture, self.h1_size, self.h2_size, self.is_training)
-            else:
-                logger.error('features "{}" not implemented'.format(self.features))
-
-            # when all models are defined, init all variables
-            init_op = tf.global_variables_initializer()
-            self.sess.run(init_op)
-
-            self.loadPolicy(self.in_policy_file)
-            print('loaded replay size: ', self.episodes[self.domainString].size())
-
-
-    def convertStateAction(self, state, action):
-        '''
-
-        '''
-        if isinstance(state, TerminalState):
-            return [0] * 89, action
-
-        else:
-            if self.features == 'learned' or self.features == 'rnn':
-                dip_state = padded_state(state.domainStates[state.currentdomain], self.domainString)
-            else:
-                dip_state = DIP_state(state.domainStates[state.currentdomain], self.domainString)
-            action_name = self.actions.action_names[action]
-            act_slot = 'general'
-            for slot in dip_state.slots:
-                if slot in action_name:
-                    act_slot = slot
-            flat_belief = dip_state.get_beliefStateVec(act_slot)
-            self.prev_state_check = flat_belief
-
-            return flat_belief, action
-
-    def record(self, reward, domainInControl=None, weight=None, state=None, action=None):
-        if domainInControl is None:
-            domainInControl = self.domainString
-        if self.actToBeRecorded is None:
-            #self.actToBeRecorded = self.lastSystemAction
-            self.actToBeRecorded = self.summaryAct
-
-        if state is None:
-            state = self.prevbelief
-        if action is None:
-            action = self.actToBeRecorded
-
-        cState, cAction = state, action
-
-        # normalising total return to -1~1
-        reward /= 20.0
-
-        #value = self.a2c.predict_value([cState])
-        value = np.array([[0.0]])
-        policy_mu = self.mu_prob
-
-        if weight == None:
-            if self.replay_type == 'vanilla':
-                self.episodes[domainInControl].record(state=cState, \
-                        state_ori=state, action=cAction, reward=reward, value=value[0][0], distribution=policy_mu)
-            elif self.replay_type == 'prioritized':
-                self.episodes[domainInControl].record(state=cState, \
-                        state_ori=state, action=cAction, reward=reward, value=value[0][0], distribution=policy_mu)
-        else:
-            self.episodes[domainInControl].record(state=cState, state_ori=state, action=cAction, reward=reward, ma_weight=weight)
-
-        self.actToBeRecorded = None
-        self.samplecount += 1
-        return
-
-    def nextAction(self, beliefstate):
-        '''
-        select next action
-
-        :param beliefstate:
-        :returns: (int) next summary action
-        '''
-
-        if self.exploration_type == 'e-greedy':
-
-            # epsilon greedy
-            if self.is_training and utils.Settings.random.rand() < self.epsilon:
-                action_prob = np.random.rand(len(self.action_names))
-
-                # Importance sampling (should be turned off)
-                #if nextaIdex == greedyNextaIdex:
-                #    self.mu_prob = self.epsilon / float(self.action_dim) + 1 - self.epsilon
-                #else:
-                #    self.mu_prob = self.epsilon / float(self.action_dim)
-            else:
-                action_prob = self.enac.predict_policy(np.reshape(beliefstate, (1, len(beliefstate))))
-
-                # add current max Q to self.episode_ave_max_q
-                #print 'current maxQ', np.max(admissible)
-                #self.episode_ave_max_q.append(np.max(admissible))
-                
-                # Importance sampling
-                #self.mu_prob = self.epsilon / float(self.action_dim) + 1 - self.epsilon
-
-        return action_prob
-
-    def train(self):
-        '''
-        call this function when the episode ends
-        '''
-
-        if not self.is_training:
-            logger.info("Not in training mode")
-            return
-        else:
-            logger.info("Update enac policy parameters.")
-
-        self.episodecount += 1
-        logger.info("Sample Num so far: %s" %(self.samplecount))
-        logger.info("Episode Num so far: %s" %(self.episodecount))
-
-        if self.samplecount >= self.minibatch_size and self.episodecount % self.training_frequency == 0:
-            logger.info('start training...')
-
-            s_batch, s_ori_batch, a_batch, r_batch, s2_batch, s2_ori_batch, t_batch, idx_batch, v_batch, mu_policy = \
-                self.episodes[self.domainString].sample_batch()
-
-            discounted_return_batch = []
-        
-
-            def weightsImportanceSampling(mu_policy, r_batch):
-                mu_policy = np.asarray(mu_policy)
-                mu_cum = []
-                lenghts = []  # to properly divide on dialogues pi_policy later on
-                for mu in mu_policy:
-                    lenghts.append(len(mu))
-                    mu = np.asarray(mu).astype(np.longdouble)
-                    mu_cum.append(np.cumprod(mu[::-1])[::-1])  # going forward with cumulative product
-                # mu_cum = np.concatenate(np.array(mu_cum), axis=0).tolist()
-                mu_policy = np.concatenate(np.array(mu_policy), axis=0).tolist()  # concatenate all behavioral probs
-                lengths = np.cumsum(lenghts)  # time steps for ends of dialogues
-                lengths = np.concatenate((np.array([0]), lengths), axis=0)  # add first dialogue
-
-                if self.importance_sampling == 'max':
-                    pass
-                elif self.importance_sampling == "soft":
-                    # get the probabilities of actions taken from the batch
-                    pi_policy = self.enac.getPolicy(np.concatenate(np.array(s_batch), axis=0).tolist())[0]  # policy given s_t
-                    columns = np.asarray([np.concatenate(a_batch, axis=0).tolist()]).astype(int)  # actions taken at s_t
-                    rows = np.asarray([ii for ii in range(len(pi_policy))])
-                    pi_policy = pi_policy[rows, columns][0].astype(np.longdouble)  # getting probabilities for current policy
-
-                #####################################
-                # Weights for importance sampling
-                # it goes through each dialogue and computes in reverse order cumulative prod:
-                # rho_n = pi_n / mu_n
-                # ...
-                # rho_1 = pi_1 / mu_1 *  ... * pi_n / mu_n
-                # using dialogue and weight_cum lists
-                #####################################
-
-                rho_forward = []  # rho_forward from eq. 3.3 (the first one)
-                rho_whole = []  # product across the whole dialogue from eq. 3.3 (the second one)
-                #pi_cum2 = []  # stats to compare
-                #mu_cum2 = []  # stats to compare
-                #pi_cum = []  # stats to compare
-
-                # Precup version
-                r_vector = np.concatenate(np.array(r_batch), axis=0).tolist()
-                r_weighted = []
-
-                for ii in range(len(lengths) - 1):  # over dialogues
-                    weight_cum = 1.
-                    dialogue = []
-
-                    for pi, mu in zip(pi_policy[lengths[ii]:lengths[ii + 1]], mu_policy[lengths[ii]:lengths[ii + 1]]):
-                        weight_cum *= pi / mu
-                        dialogue.append(weight_cum)
-
-                    dialogue = np.array(dialogue)
-                    dialogue = np.clip(dialogue, 0.5, 1)  # clipping the weights
-                    dialogue = dialogue.tolist()
-
-                    rho_forward.extend(dialogue)
-                    #rho_whole.append(dialogue[-1])
-                    rho_whole.extend(np.ones(len(dialogue)) * dialogue[-1])
-                    r_weighted.extend(r_vector[lengths[ii]: lengths[ii + 1]] * np.asarray(dialogue))
-
-                # go back to original form:
-                ind = 0
-                r_new = copy.deepcopy(r_batch)
-                for id, batch in enumerate(r_new):
-                    for id2, _ in enumerate(batch):
-                        r_new[id][id2] = r_weighted[ind]
-                        ind += 1
-
-                # ONE STEP WEIGHTS
-                weights = np.asarray(pi_policy) / np.asarray(mu_policy)
-                weights = np.clip(weights, 0.5, 1)  # clipping the weights
-
-                return weights, rho_forward, rho_whole, r_new
-
-            weights, rho_forward, rho_whole, r_new = weightsImportanceSampling(mu_policy, r_batch)
-
-            weights = np.nan_to_num(weights)
-            rho_forward = np.nan_to_num(rho_forward)
-            rho_whole = np.nan_to_num(rho_whole)
-            """
-            print 'w',weights
-            print 'rho_for',rho_forward
-            print 'rho_who',rho_whole
-            """
-
-            def calculate_discountR(r_episode, idx):
-                #########################################################################
-                # Here we take the rewards and values from the rollouts, and use them to
-                # generate the advantage and discounted returns.
-                # The advantage function uses "Generalized Advantage Estimation"
-                bootstrap_value = 0.0
-                # r_episode rescale by rhos?
-                self.r_episode_plus = np.asarray(r_episode[idx:] + [bootstrap_value])
-                if self.importance_sampling:
-                    self.r_episode_plus = self.r_episode_plus
-                else:
-                    self.r_episode_plus = self.r_episode_plus/rho_forward[idx]
-                discounted_r_episode = discount(self.r_episode_plus, self.gamma)[:-1]
-                #########################################################################
-                return discounted_r_episode[0]
-
-            if self.replay_type == 'prioritized':
-                for item_r, item_v, item_idx in zip(r_new, v_batch, idx_batch):
-                    rlist = []
-                    for idx in range(len(item_r)):
-                        r = calculate_discountR(item_r, idx)
-                        rlist.append(r)
-
-                    discounted_return_batch.append(rlist[-1])
-            else:
-                for item_r, item_v in zip(r_new, v_batch):
-                    rlist = []
-                    for idx in range(len(item_r)):
-                        r = calculate_discountR(item_r, idx)
-                        rlist.append(r)
-
-                    discounted_return_batch.append(rlist[-1]) 
-
-            batch_size = len(s_batch)
-
-            if self.importance_sampling:
-                discounted_return_batch = np.clip(discounted_return_batch, -1, 1)
-
-            # get gradient info and create matrix
-            gradient_matrix = []
-            for item_s, item_a in zip(s_batch, a_batch):
-                item_a_one_hot = np.eye(self.action_dim)[item_a]
-                policy_gradient = self.enac.get_policy_gradient(item_s, item_a_one_hot)
-                policy_gradient = [(policy_gradient_idv.flatten()).tolist() for policy_gradient_idv in policy_gradient]
-                policy_gradient_flatten = np.hstack(policy_gradient)
-                policy_gradient_flatten = np.append(policy_gradient_flatten, [1.0])
-                gradient_matrix.append(policy_gradient_flatten.tolist())
-            
-            gradient_matrix = np.matrix(gradient_matrix)
-            return_matrix = np.matrix(discounted_return_batch)
-
-            logger.info("Updating eNAC policy parameters, before calculate eNac matrix")
-            try: 
-                natural_gradient = np.dot(np.linalg.pinv(gradient_matrix), return_matrix.T)
-                # convert a matrix to list-like array
-                natural_gradient = np.array(natural_gradient.flatten()).ravel()
-                natural_gradient = natural_gradient[:-1] # discard the last element
-            except:
-                natural_gradient = self.natural_gradient_prev 
-                print('SVD problem')
-
-            logger.info("Updating eNAC policy parameters, after calculate eNac matrix")
-
-            self.natural_gradient_prev = natural_gradient
-
-            all_params = self.enac.get_params()
-
-            cnt = 0
-            modelW = []
-            modelB = []
-            for variable in all_params:
-                       
-                shape = variable.shape
-                # weight matrix
-                if np.array(variable).ndim == 1:
-                    until = np.array(variable).shape[0]
-                    subNG = np.reshape(natural_gradient[cnt:cnt+until],shape)
-                    cnt += until
-                    modelB.append(subNG)
-                # bias vector
-                elif np.array(variable).ndim == 2:
-                    until = np.array(variable).shape[0]*np.array(variable).shape[1]
-                    subNG = np.reshape(natural_gradient[cnt:cnt+until],shape)
-                    cnt += until
-                    modelW.append(subNG)
-
-            a_batch_one_hot = np.eye(self.action_dim)[np.concatenate(a_batch, axis=0).tolist()]
-
-            policy_loss, entropy, all_loss, optimise = self.enac.train( \
-                    np.concatenate(np.array(s_batch), axis=0).tolist(), a_batch_one_hot, \
-                    modelW[0], modelB[0], modelW[1], modelB[1], modelW[2], modelB[2] \
-            )
-
-            norm_p_l, ent, norm_all_l = \
-                    policy_loss/float(batch_size), \
-                    entropy/float(batch_size), all_loss/float(batch_size)
-
-        self.savePolicyInc()  # self.out_policy_file)
-
-    def savePolicy(self, FORCE_SAVE=False):
-        """
-        Does not use this, cause it will be called from agent after every episode.
-        we want to save the policy only periodically.
-        """
-        pass
-
-    def savePolicyInc(self, FORCE_SAVE=False):
-        """
-        save model and replay buffer
-        """
-        if self.episodecount % self.save_step == 0:
-            self.enac.save_network(self.out_policy_file+'.enac.ckpt')
-
-            f = open(self.out_policy_file+'.episode', 'wb')
-            for obj in [self.samplecount, self.episodes[self.domainString]]:
-                pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)
-            f.close()
-            #logger.info("Saving model to %s and replay buffer..." % save_path)
-
-    def saveStats(self, FORCE_SAVE=False):
-        f = open(self.out_policy_file + '.stats', 'wb')
-        pickle.dump(self.stats, f, protocol=pickle.HIGHEST_PROTOCOL)
-        f.close()
-
-    def loadPolicy(self, filename):
-        """
-        load model and replay buffer
-        """
-        # load models
-        self.enac.load_network(filename+'.enac.ckpt')
-        
-        # load replay buffer
-        try:
-            print('load from: ', filename)
-            f = open(filename+'.episode', 'rb')
-            loaded_objects = []
-            for i in range(2): # load nn params and collected data
-                loaded_objects.append(pickle.load(f))
-            self.samplecount = int(loaded_objects[0])
-            self.episodes[self.domainString] = copy.deepcopy(loaded_objects[1])
-            logger.info("Loading both model from %s and replay buffer..." % filename)
-            f.close()
-        except:
-            logger.info("Loading only models...")
-
-    def restart(self):
-        self.summaryAct = None          
-        self.lastSystemAction = None
-        self.prevbelief = None
-        self.actToBeRecorded = None
-        self.epsilon = self.epsilon_start - (self.epsilon_start - self.epsilon_end) * float(self.episodeNum+self.episodecount) / float(self.maxiter)
-        self.episode_ave_max_q = []
-
-#END OF FILE
diff --git a/policy/feudalgainRL/FeudalNoisyACERPolicy.py b/policy/feudalgainRL/FeudalNoisyACERPolicy.py
index 732ee8a0d2528e5773a271c3e915db312cbfd6d2..f0608074ea428fe12d47a51c3b35fe606d2a0053 100644
--- a/policy/feudalgainRL/FeudalNoisyACERPolicy.py
+++ b/policy/feudalgainRL/FeudalNoisyACERPolicy.py
@@ -20,47 +20,24 @@
 #
 ###############################################################################
 
-'''
-ACERPolicy.py - ACER - Actor Critic with Experience Replay
-==================================================
-
-Copyright CUED Dialogue Systems Group 2015 - 2017
-
-.. seealso:: CUED Imports/Dependencies:
-
-    import :class:`Policy`
-    import :class:`utils.ContextLogger`
-
-.. warning::
-        Documentation not done.
-
-
-************************
-
-'''
 import copy
-import os
-import json
 import numpy as np
 import scipy
 import scipy.signal
-from scipy.stats import entropy
 import pickle as pickle
-import random
 import utils
-from policy.feudalgainRL.NoisyACERPolicy import NoisyACERPolicy
-from utils.Settings import config as cfg
-from utils import ContextLogger, DiaAct
-
 import ontology.FlatOntologyManager as FlatOnt
 import tensorflow as tf
-from policy.DRL.replay_buffer_episode_acer import ReplayBufferEpisode
-from policy.DRL.replay_prioritised_episode import ReplayPrioritisedEpisode
-import policy.DRL.utils as drlutils
-#from policy.SACERPolicy import SACERPolicy
 import policy.feudalgainRL.noisyacer as noisy_acer
 import policy.Policy
 import policy.SummaryAction
+
+from policy.feudalgainRL.NoisyACERPolicy import NoisyACERPolicy
+from scipy.stats import entropy
+from utils.Settings import config as cfg
+from utils import ContextLogger
+from policy.DRL.replay_buffer_episode_acer import ReplayBufferEpisode
+from policy.DRL.replay_prioritised_episode import ReplayPrioritisedEpisode
 from policy.Policy import TerminalAction, TerminalState
 from policy.feudalgainRL.DIP_parametrisation import DIP_state, padded_state
 
@@ -75,8 +52,7 @@ class FeudalNoisyACERPolicy(NoisyACERPolicy):
     '''Derived from :class:`Policy`
     '''
     def __init__(self, in_policy_file, out_policy_file, domainString='CamRestaurants', is_training=False,
-                 action_names=None, slot=None, sd_state_dim=50, js_threshold=1.0, info_reward=0.0, load_policy=True,
-                 critic_regularizer_weight=0):
+                 action_names=None, slot=None, sd_state_dim=50, load_policy=True):
         super(FeudalNoisyACERPolicy, self).__init__(in_policy_file, out_policy_file, domainString, is_training)
 
         tf.reset_default_graph()
@@ -87,14 +63,9 @@ class FeudalNoisyACERPolicy(NoisyACERPolicy):
         self.accum_belief = []
         self.prev_state_check = None
         self.sd_state_dim = sd_state_dim
-        self.info_reward = info_reward
-        self.js_threshold = js_threshold
-        if self.js_threshold != 1.0:
-            print("We train with JS-divergence, threshold =", self.js_threshold)
 
         self.domainString = domainString
         self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString)
-        self.critic_regularizer_weight = critic_regularizer_weight
 
         self.features = 'dip'
         self.sd_enc_size = 80
@@ -124,9 +95,6 @@ class FeudalNoisyACERPolicy(NoisyACERPolicy):
         if self.sample_argmax:
             print("We sample argmax")
 
-        #self.log_path = cfg.get('exec_config', 'logfiledir')
-        #self.log_path = self.log_path + f"/{in_policy_file.split('/')[-1].split('.')[0]}-seed{self.randomseed}.npy"
-
         self.load_policy = load_policy
 
         # init session
@@ -141,8 +109,7 @@ class FeudalNoisyACERPolicy(NoisyACERPolicy):
                 self.episodes[self.domainString] = ReplayBufferEpisode(self.capacity, self.minibatch_size, self.randomseed)
             elif self.replay_type == 'prioritized':
                 self.episodes[self.domainString] = ReplayPrioritisedEpisode(self.capacity, self.minibatch_size, self.randomseed)
-            #replay_buffer = ReplayBuffer(self.capacity, self.randomseed)
-            #self.episodes = []
+
             self.samplecount = 0
             self.episodecount = 0
 
@@ -151,40 +118,24 @@ class FeudalNoisyACERPolicy(NoisyACERPolicy):
             self.summaryaction = policy.SummaryAction.SummaryAction(domainString)
             self.action_names = action_names
             self.action_dim = len(self.action_names)
-            action_bound = len(self.action_names)
             self.stats = [0 for _ in range(self.action_dim)]
 
             self.global_mu = [0. for _ in range(self.action_dim)]
 
-            if self.features == 'dip':
-                if self.actfreq_ds:
-                    if self.domainString == 'CamRestaurants':
-                        self.state_dim += 9#16
-                    elif self.domainString == 'SFRestaurants':
-                        self.state_dim += 9#25
-                    elif self.domainString == 'Laptops11':
-                        self.state_dim += 9#40
-                self.sacer = noisy_acer.NoisyACERNetwork(self.sess, self.state_dim, self.action_dim, self.critic_lr, self.delta,
-                                             self.c, self.alpha, self.h1_size, self.h2_size, self.is_training)
-            elif self.features == 'learned' or self.features == 'rnn':
-                si_state_dim = 73
-                if self.actfreq_ds:
-                    if self.domainString == 'CamRestaurants':
-                        si_state_dim += 9#16
-                    elif self.domainString == 'SFRestaurants':
-                        si_state_dim += 9#25
-                    elif self.domainString == 'Laptops11':
-                        si_state_dim += 9#40
-
-                self.state_dim = si_state_dim
-                self.sacer = noisy_acer.NoisyACERNetwork(self.sess, self.state_dim, self.action_dim,
-                                                    self.critic_lr, self.delta, self.c, self.alpha, self.h1_size,
-                                                    self.h2_size, self.is_training, temperature=self.temperature,
-                                                    critic_regularizer_weight=self.critic_regularizer_weight,
-                                                    noisy_acer=self.noisy_acer)
-
-            else:
-                logger.error('features "{}" not implemented'.format(self.features))
+            si_state_dim = 73
+            if self.actfreq_ds:
+                if self.domainString == 'CamRestaurants':
+                    si_state_dim += 9#16
+                elif self.domainString == 'SFRestaurants':
+                    si_state_dim += 9#25
+                elif self.domainString == 'Laptops11':
+                    si_state_dim += 9#40
+
+            self.state_dim = si_state_dim
+            self.sacer = noisy_acer.NoisyACERNetwork(self.sess, self.state_dim, self.action_dim,
+                                                self.critic_lr, self.delta, self.c, self.alpha, self.h1_size,
+                                                self.h2_size, self.is_training,
+                                                noisy_acer=self.noisy_acer)
 
             # when all models are defined, init all variables
             init_op = tf.global_variables_initializer()
@@ -262,10 +213,6 @@ class FeudalNoisyACERPolicy(NoisyACERPolicy):
             logger.warning("record attempted to be finalized for domain where nothing has been recorded before")
             return
 
-        #print 'Episode Avg_Max_Q', float(self.episode_ave_max_q)/float(self.episodes[domainInControl].size())
-        #print 'Episode Avg_Max_Q', np.mean(self.episode_ave_max_q)
-        #print self.stats
-
         # normalising total return to -1~1
         reward /= 20.0
 
@@ -286,24 +233,8 @@ class FeudalNoisyACERPolicy(NoisyACERPolicy):
             #########################################################################
             return discounted_r_episode, advantage
 
-        if self.replay_type == 'vanilla':
-            self.episodes[domainInControl].record(state=terminal_state, \
-                    state_ori=TerminalState(), action=terminal_action, reward=reward, value=value, terminal=True, distribution=None)
-        elif self.replay_type == 'prioritized':
-            episode_r, episode_v = self.episodes[domainInControl].record_final_and_get_episode(state=terminal_state, \
-                                                                                               state_ori=TerminalState(),
-                                                                                               action=terminal_action,
-                                                                                               reward=reward,
-                                                                                               value=value)
-
-            # TD_error is a list of td error in the current episode
-            _, TD_error = calculate_discountR_advantage(episode_r, episode_v)
-            episodic_TD = np.mean(np.absolute(TD_error))
-            print('episodic_TD')
-            print(episodic_TD)
-            self.episodes[domainInControl].insertPriority(episodic_TD)
-
-        return
+        self.episodes[domainInControl].record(state=terminal_state, \
+                state_ori=TerminalState(), action=terminal_action, reward=reward, value=value, terminal=True, distribution=None)
 
     def compute_responsible_q(self, inputs, actions, mask):
         return self.sacer.compute_responsible_q(inputs, actions, mask)
@@ -363,12 +294,6 @@ class FeudalNoisyACERPolicy(NoisyACERPolicy):
         USE_GLOBAL_MU = False
         self.episode_ct += 1
 
-      #  new_noise_man_array = np.expand_dims(np.array(self.sacer.compute_mean_noisy()), axis=0)
-      #  if os.path.exists(self.log_path):
-      #      noise_mean_array = np.load(self.log_path)
-      #      new_noise_man_array = np.concatenate((noise_mean_array, new_noise_man_array), axis=0)
-      #  np.save(self.log_path, new_noise_man_array)
-
         if not self.is_training:
             logger.info("Not in training mode")
             return
@@ -379,7 +304,6 @@ class FeudalNoisyACERPolicy(NoisyACERPolicy):
         logger.info("Sample Num so far: %s" % (self.samplecount))
         logger.info("Episode Num so far: %s" % (self.episodecount))
         if self.samplecount >= self.minibatch_size * 3 and self.episodecount % self.training_frequency == 0:
-        #if self.episodecount % self.training_frequency == 0:
             logger.info('start trainig...')
 
             for _ in range(self.train_iters_per_episode):
@@ -395,33 +319,6 @@ class FeudalNoisyACERPolicy(NoisyACERPolicy):
                     assert False  # not implemented yet
 
                 s_batch = [[state_tuple[0] for state_tuple in epi] for epi in s_batch_full]
-                s_batch_beliefstate = [[state_tuple[1] for state_tuple in epi] for epi in s_batch_full]
-                s_batch_chosen_slot = [[state_tuple[2] for state_tuple in epi] for epi in s_batch_full]
-
-                s2_batch_beliefstate = [[state_tuple[1] for state_tuple in epi] for epi in s2_batch_full]
-
-                js_divergence_batch = []
-
-                if self.js_threshold < 1.0:
-                    #TODO: This is probably highly inefficient
-                    for epi_s, epi_s2, epi_slot in zip(s_batch_beliefstate, s2_batch_beliefstate, s_batch_chosen_slot):
-                        for belief, belief2, slot in zip(epi_s, epi_s2, epi_slot):
-                            if slot != "None":
-                                keys = belief['beliefs'][slot].keys()
-
-                                b = [belief['beliefs'][slot]['**NONE**']] + \
-                                    [belief['beliefs'][slot][value] for value in list(keys) if value != '**NONE**']
-
-                                b_2 = [belief2['beliefs'][slot]['**NONE**']] + \
-                                      [belief2['beliefs'][slot][value] for value in list(keys) if value != '**NONE**']
-
-                                js_divergence = self.compute_js_divergence(b, b_2)
-                                js_divergence_batch.append(js_divergence)
-                            else:
-                                js_divergence_batch.append(0.0)
-
-                    js_divergence_batch = [int(x > self.js_threshold) for x in js_divergence_batch]
-                    js_divergence_batch = 2/20 * np.array(js_divergence_batch) #normalizing bound to [0, 2] and then /20
 
                 discounted_r_batch = []
                 advantage_batch = []
@@ -467,30 +364,14 @@ class FeudalNoisyACERPolicy(NoisyACERPolicy):
 
                 a_batch_one_hot = np.eye(self.action_dim)[np.concatenate(a_batch, axis=0).tolist()]
 
-                if self.js_threshold < 1.0:
-                    r_batch_concatenated = np.concatenate(np.array(r_batch), axis=0) + js_divergence_batch
-                else:
-                    r_batch_concatenated = np.concatenate(np.array(r_batch), axis=0)
-
-                if critic_regularizer is not None:
-                    critic_regularizer_q = critic_regularizer.compute_responsible_q(
-                        np.concatenate(np.array(s_batch), axis=0).tolist(), a_batch_one_hot,
-                        np.concatenate(np.array(mask_batch), axis=0).tolist())
-
-                    loss, entropy, optimize = \
-                                self.sacer.train(np.concatenate(np.array(s_batch), axis=0).tolist(), a_batch_one_hot,
-                                                 np.concatenate(np.array(mask_batch), axis=0).tolist(),
-                                                 r_batch_concatenated, s_batch, r_batch, self.gamma,
-                                                 np.concatenate(np.array(mu_policy), axis=0),
-                                                 discounted_r_batch, advantage_batch,
-                                                 critic_regularizer_output=critic_regularizer_q)
-                else:
-                    loss, entropy, optimize = \
-                        self.sacer.train(np.concatenate(np.array(s_batch), axis=0).tolist(), a_batch_one_hot,
-                                         np.concatenate(np.array(mask_batch), axis=0).tolist(),
-                                         r_batch_concatenated, s_batch, r_batch, self.gamma,
-                                         np.concatenate(np.array(mu_policy), axis=0),
-                                         discounted_r_batch, advantage_batch)
+                r_batch_concatenated = np.concatenate(np.array(r_batch), axis=0)
+
+                loss, entropy, optimize = \
+                    self.sacer.train(np.concatenate(np.array(s_batch), axis=0).tolist(), a_batch_one_hot,
+                                     np.concatenate(np.array(mask_batch), axis=0).tolist(),
+                                     r_batch_concatenated, s_batch, r_batch, self.gamma,
+                                     np.concatenate(np.array(mu_policy), axis=0),
+                                     discounted_r_batch, advantage_batch)
 
                 ent, norm_loss = entropy/float(batch_size), loss/float(batch_size)
 
diff --git a/policy/feudalgainRL/FeudalNoisyDQNPolicy.py b/policy/feudalgainRL/FeudalNoisyDQNPolicy.py
index 3fffadbb49839298c03157c5d4e11cdab6add668..7ba55b8b42d243eb29d99210f7918c669611a6fe 100644
--- a/policy/feudalgainRL/FeudalNoisyDQNPolicy.py
+++ b/policy/feudalgainRL/FeudalNoisyDQNPolicy.py
@@ -95,7 +95,7 @@ class FeudalDQNPolicy(policy.DQNPolicy.DQNPolicy):
         if self.jsd_function is not None:
             print("We use the JSD-function", self.jsd_function)
         if self.js_threshold != 1.0 and not self.jsd_reward:
-            print("We use JS-divergence, threshold =", self.js_threshold)
+            print("We use Information Gain with JS-divergence, threshold =", self.js_threshold)
         if self.jsd_reward:
             print("We train with raw JSD reward.")
         self.slots = slot
@@ -361,7 +361,7 @@ class FeudalDQNPolicy(policy.DQNPolicy.DQNPolicy):
                 s2_batch_dipstate = np.vstack([s[3] for s in s2_batch])
                 t_batch_new = t_batch
 
-            if self.js_threshold < 1.0 or self.jsd_reward:
+            if self.js_threshold < 1.0:
                 js_divergence_batch = []
                 for belief, belief2, slot in zip(s_batch_beliefstate, s2_batch_beliefstate, s_batch_chosen_slot):
                     if slot != "None":
@@ -380,13 +380,7 @@ class FeudalDQNPolicy(policy.DQNPolicy.DQNPolicy):
             else:
                 js_divergence_batch = [0] * len(r_batch_new)
 
-            tanh_n = np.tanh(1)
-            if self.jsd_reward:
-                if self.jsd_function == 'tanh':
-                    js_divergence_batch = np.tanh(np.array(js_divergence_batch)) / tanh_n
-                #normalize jsd between -1 and 1
-                js_divergence_batch = (-1 + 2 * np.array(js_divergence_batch)).tolist()
-            elif self.js_threshold < 1.0:
+            if self.js_threshold < 1.0:
                 # normalizing bound to [0, 2] and then /20
                 js_divergence_batch = [2/20 * int(x > self.js_threshold) for x in js_divergence_batch]
 
@@ -404,18 +398,12 @@ class FeudalDQNPolicy(policy.DQNPolicy.DQNPolicy):
                 else:
                     if self.q_update == 'single':
                         action_Q = target_q[k]
-                        if self.jsd_reward:
-                            Q_bootstrap_label = js_divergence_batch[k] + self.gamma * np.max(action_Q)
-                        else:
-                            Q_bootstrap_label = r_batch_new[k] + js_divergence_batch[k] + self.gamma * np.max(action_Q)
+                        Q_bootstrap_label = r_batch_new[k] + js_divergence_batch[k] + self.gamma * np.max(action_Q)
                     elif self.q_update == 'double':
                         action_Q = action_q[k]
                         argmax_tuple = np.unravel_index(np.argmax(action_Q, axis=None), action_Q.shape)
                         value_Q = target_q[k][argmax_tuple]
-                        if not self.jsd_reward:
-                            Q_bootstrap_label = r_batch_new[k] + js_divergence_batch[k] + self.gamma * value_Q
-                        else:
-                            Q_bootstrap_label = js_divergence_batch[k] + self.gamma * value_Q
+                        Q_bootstrap_label = r_batch_new[k] + js_divergence_batch[k] + self.gamma * value_Q
 
                 y_i.append(Q_bootstrap_label)
 
diff --git a/policy/feudalgainRL/NoisyACERPolicy.py b/policy/feudalgainRL/NoisyACERPolicy.py
index 5854756136445216cb3f58ce3ffb1569d576f1f4..6d5e5920386991907e2b72665cc1dc5066e83d1c 100644
--- a/policy/feudalgainRL/NoisyACERPolicy.py
+++ b/policy/feudalgainRL/NoisyACERPolicy.py
@@ -21,6 +21,9 @@
 ###############################################################################
 
 '''
+
+The following implementation is extended with Noisy Networks:
+
 ACERPolicy.py - Sample Efficient Actor Critic with Experience Replay
 ==================================================
 
@@ -49,19 +52,17 @@ import pickle as pickle
 import copy
 import json
 import numpy as np
-import os
 import random
 import scipy
 import scipy.signal
 import tensorflow as tf
-
+import ontology.FlatOntologyManager as FlatOnt
+import utils
 import policy.feudalgainRL.noisyacer as noisy_acer
-#from policy.DRL import replay_policy as replay_policy
-from policy.DRL import utils as drlutils
+
 from policy import Policy
 from policy import SummaryAction
-import ontology.FlatOntologyManager as FlatOnt
-import utils
+from policy import MasterAction
 from policy.DRL.replay_buffer_episode_acer import ReplayBufferEpisode
 from policy.DRL.replay_prioritised_episode import ReplayPrioritisedEpisode
 from policy.Policy import TerminalAction, TerminalState
@@ -158,21 +159,11 @@ class NoisyACERPolicy(Policy.Policy):
         self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString)
 
         self.load_buffer = True
-        if cfg.has_option('policy', 'bootstrap_buffer'):
-            self.load_buffer = cfg.getboolean('policy', 'bootstrap_buffer')
-            print("SACER: BOOTSTRAP BUFFER: ", self.load_buffer)
-
         self.load_policy = True
-        if cfg.has_option('policy', 'bootstrap_master_policy'):
-            self.load_policy = cfg.getboolean('policy', 'bootstrap_master_policy')
-            print("SACER: BOOTSTRAP Policy: ", self.load_policy)
 
         # parameter settings
 
-        if 0:  # cfg.has_option('dqnpolicy', 'n_in'): #ic304: this was giving me a weird error, disabled it until i can check it deeper
-            self.n_in = cfg.getint('dqnpolicy', 'n_in')
-        else:
-            self.n_in = self.get_n_in(domainString)
+        self.n_in = self.get_n_in(domainString)
 
         self.actor_lr = 0.0001
         if cfg.has_option('dqnpolicy', 'actor_lr'):
@@ -295,10 +286,6 @@ class NoisyACERPolicy(Policy.Policy):
         if cfg.has_option('policy', 'save_step'):
             self.save_step = cfg.getint('policy', 'save_step')
 
-        self.temperature = 0.0
-        if cfg.has_option('policy', 'temperature'):
-            self.temperature = cfg.getfloat('policy', 'temperature')
-
         self.behaviour_cloning = False
         if cfg.has_option('policy', 'behaviour_cloning'):
             self.behaviour_cloning = cfg.getboolean('policy', 'behaviour_cloning')
@@ -471,8 +458,7 @@ class NoisyACERPolicy(Policy.Policy):
                 self.global_mu = [0. for _ in range(self.action_dim)]
 
             self.sacer = noisy_acer.NoisyACERNetwork(self.sess, self.state_dim, self.action_dim, self.critic_lr, self.delta,
-                                                self.c, self.alpha, self.h1_size, self.h2_size, self.is_training,
-                                                temperature=self.temperature)
+                                                self.c, self.alpha, self.h1_size, self.h2_size, self.is_training)
 
             #if self.optimize_ER:
             #    self.replay_policy = replay_policy.ReplayPolicy(self.sess, seed=self.randomseed)
@@ -685,9 +671,7 @@ class NoisyACERPolicy(Policy.Policy):
 
         if self.master_space:
             beliefstate = beliefstate.getDomainState(self.domainUtil.domainString)
-            print("MASTER ACTION: ", self.masteraction.action_names[nextaIdex])
             masterAct = self.masteraction.Convert(beliefstate, self.masteraction.action_names[nextaIdex], self.lastSystemAction)
-            print("MASTER ACT: ", masterAct)
         else:
             summaryAct = self.summaryaction.action_names[nextaIdex]
             beliefstate = beliefstate.getDomainState(self.domainUtil.domainString)
diff --git a/policy/feudalgainRL/README.md b/policy/feudalgainRL/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..242b15337485f14caa150a3029c92eacdf0926fe
--- /dev/null
+++ b/policy/feudalgainRL/README.md
@@ -0,0 +1,45 @@
+## FeudalGain
+
+This is the implementation of the work **What does the User want? Information Gain for Hierarchical Dialogue Policy Optimisation**, published at ASRU 2021.
+
+Reference: 
+
+#### Train a FeudalGain policy
+
+First, create a virtual environment with Python 3 and run **pip install -r requirements** to install the required Python packages.
+
+Config files for all environments are provided in the folder pydial3-public/policy/feudalgainRL/configs. To start a training run, choose one of the config files and run the following command from the main repository:
+
+```
+python3 pydial.py train config_path/config.cfg
+```
+
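+For example, to train in environment 1 on CamRestaurants with the config added in this folder (assuming the command is run from the repository root):
+
+```
+python3 pydial.py train policy/feudalgainRL/configs/env1-CR-FeudalGain.cfg
+```
+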
+You can change parameter settings in the config files as needed. The most important parameters to set are:
+
+```
+[policy]
+noisy_acer=True/False: Use noisy networks for the policies \pi_mg (or \pi_m and \pi_g)
+use_pass=True/False: Train the information policy \pi_i also on transitions where the action is pass(). Deactivate this when information gain is used; activate it when vanilla Feudal is used for training.
+
+[feudalpolicy]
+only_master = True/False: True uses only the merged policy \pi_mg; set to False to use \pi_m and \pi_g
+js_threshold = 0.2: threshold for the information gain reward computed from the JS-divergence (see the sketch below). If set to 1.0, information gain is disabled and \pi_i is trained with the external reward only.
+
+[dqnpolicy]
+architecture = noisy_duel/duel: use noisy_duel for the noisy network architecture, duel for vanilla networks
+```
+
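+The information gain reward itself is computed from the Jensen-Shannon divergence between the beliefs over the chosen slot before and after a turn, and it is only paid out when that divergence exceeds `js_threshold`. Below is a minimal, self-contained sketch of this computation; the helper names and belief vectors are illustrative, and in the code the corresponding logic is `compute_js_divergence` together with the `js_threshold` check in `policy/feudalgainRL/FeudalNoisyDQNPolicy.py`:
+
+```python
+from scipy.stats import entropy
+
+def js_divergence(p, q):
+    # Jensen-Shannon divergence with base-2 logs, bounded in [0, 1].
+    # entropy() normalises its arguments, so p + q acts as the mixture (p + q) / 2.
+    m = [pi + qi for pi, qi in zip(p, q)]
+    return 0.5 * (entropy(p, m, base=2) + entropy(q, m, base=2))
+
+def information_gain_reward(belief_before, belief_after, js_threshold=0.2):
+    # Reward of 2 on the usual success-reward scale, divided by 20 like all other rewards.
+    jsd = js_divergence(belief_before, belief_after)
+    return 2 / 20 * int(jsd > js_threshold)
+
+# Illustrative belief distributions over one slot's values (first entry: **NONE**).
+print(information_gain_reward([0.8, 0.1, 0.1], [0.2, 0.7, 0.1]))  # 0.1
+```
+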
+If you want to use the vanilla Feudal algorithm, set the parameters in the config as follows:
+
+```
+[policy]
+noisy_acer=False
+use_pass=True
+
+[feudalpolicy]
+only_master = False
+js_threshold = 1.0
+
+[dqnpolicy]
+architecture = duel
+```
\ No newline at end of file
diff --git a/policy/feudalgainRL/configs/env1-CR-FeudalGain.cfg b/policy/feudalgainRL/configs/env1-CR-FeudalGain.cfg
new file mode 100644
index 0000000000000000000000000000000000000000..c393c0c1a9c21aa616cfd4d02d863e693b5ce68f
--- /dev/null
+++ b/policy/feudalgainRL/configs/env1-CR-FeudalGain.cfg
@@ -0,0 +1,105 @@
+# Error model: 15% error rate, DSTC2 confscorer, DSTC2 nbestgenerator
+# User model: standard sampled params, sampled patience
+# Masks: off
+
+###### General parameters ######
+[GENERAL]
+# Set to "SFRestaurants" or "Laptops11"
+domains = CamRestaurants
+singledomain = True
+tracedialog = 0
+seed = 0
+
+[exec_config]
+configdir = _benchmarkpolicies/env1-feudalgain
+logfiledir = _benchmarklogs/env1-feudalgain
+numtrainbatches = 20
+traindialogsperbatch = 200
+numbatchtestdialogs =  500
+trainsourceiteration = 0
+numtestdialogs =  500
+trainerrorrate = 0
+testerrorrate  = 0
+testeverybatch = True
+deleteprevpolicy = True
+
+[logging]
+usecolor = False
+screen_level = results
+file_level = results
+file = auto
+
+###### Environment parameters ######
+
+[agent]
+maxturns = 25
+
+[usermodel]
+usenewgoalscenarios = True
+oldstylepatience = False
+patience = 4,6
+configfile = config/sampledUM.cfg
+
+[errormodel]
+nbestsize = 1
+confusionmodel = RandomConfusions
+nbestgeneratormodel = SampledNBestGenerator
+confscorer = additive
+
+[summaryacts]
+maxinformslots = 5
+informmask = True
+requestmask = True
+informcountaccepted = 4
+byemask = True
+
+###### Dialogue Manager parameters ######
+[policy]
+policydir = _benchmarkpolicies/env1-feudalgain
+belieftype = focus
+useconfreq = False
+learning = True
+policytype = feudalgain
+startwithhello = False
+inpolicyfile = auto
+outpolicyfile = auto
+# Set noisy_acer=False for vanilla neural networks
+noisy_acer = True
+# Set use_pass=True to train on transitions where the pass() action was taken. Keep it False when information gain is used.
+use_pass = False
+
+[feudalpolicy]
+features=learned
+si_policy_type=acer
+# only_master=True uses only the merged policy pi_mg; set it to False to use pi_m and pi_g
+only_master = True
+# Threshold for the information gain reward computed from the JS-divergence. If set to 1.0, information gain is not used.
+js_threshold = 0.2
+
+[dqnpolicy]
+q_update = double
+# set architecture=duel for vanilla neural networks
+architecture = noisy_duel
+h1_size = 300
+h2_size = 100
+capacity = 2000
+beta = 0.95
+epsilon_start = 0.3
+maxiter = 4000
+minibatch_size = 64
+is_threshold = 5.0
+episodeNum = 0.0
+epsilon_end = 0.0
+n_in = 268
+features = ["discourseAct", "method", "requested", "full", "lastActionInformNone", "offerHappened", "inform_info"]
+
+###### Evaluation parameters ######
+
+[eval]
+rewardvenuerecommended=0
+penaliseallturns = True
+wrongvenuepenalty = 0
+notmentionedvaluepenalty = 0
+successmeasure = objective
+successreward = 20
+
diff --git a/policy/feudalgainRL/configs/env2-CR-FeudalGain.cfg b/policy/feudalgainRL/configs/env2-CR-FeudalGain.cfg
new file mode 100644
index 0000000000000000000000000000000000000000..779f0eeb8f558f4827836a273e1ddf59a345f47a
--- /dev/null
+++ b/policy/feudalgainRL/configs/env2-CR-FeudalGain.cfg
@@ -0,0 +1,106 @@
+# Error model: 15% error rate, DSTC2 confscorer, DSTC2 nbestgenerator
+# User model: standard sampled params, sampled patience
+# Masks: off
+
+###### General parameters ######
+[GENERAL]
+# Set to "SFRestaurants" or "Laptops11"
+domains = CamRestaurants
+singledomain = True
+tracedialog = 0
+seed = 0
+
+[exec_config]
+configdir = _benchmarkpolicies/env2-feudalgain
+logfiledir = _benchmarklogs/env2-feudalgain
+numtrainbatches = 20
+traindialogsperbatch = 200
+numbatchtestdialogs =  500
+trainsourceiteration = 0
+numtestdialogs =  500
+trainerrorrate = 0
+testerrorrate  = 0
+testeverybatch = True
+deleteprevpolicy = True
+
+[logging]
+usecolor = False
+screen_level = results
+file_level = results
+file = auto
+
+###### Environment parameters ######
+
+[agent]
+maxturns = 25
+
+[usermodel]
+usenewgoalscenarios = True
+oldstylepatience = False
+patience = 4,6
+configfile = config/sampledUM.cfg
+
+[errormodel]
+nbestsize = 1
+confusionmodel = RandomConfusions
+nbestgeneratormodel = SampledNBestGenerator
+confscorer = additive
+
+
+[summaryacts]
+maxinformslots = 5
+informmask = False
+requestmask = False
+informcountaccepted = 4
+byemask = False
+
+###### Dialogue Manager parameters ######
+[policy]
+policydir = _benchmarkpolicies/env2-feudalgain
+belieftype = focus
+useconfreq = False
+learning = True
+policytype = feudalgain
+startwithhello = False
+inpolicyfile = auto
+outpolicyfile = auto
+# Set noisy_acer=False for vanilla neural networks
+noisy_acer = True
+# Set use_pass=True if transitions in which the pass() action was taken should be used. Always set to False if InformationGain is used.
+use_pass = False
+
+[feudalpolicy]
+features=learned
+si_policy_type=acer
+# only_master=True means that only the merged policy pi_mg is used; set to False to use pi_m and pi_g
+only_master = True
+# Set the threshold for the information gain reward calculated from the JS-divergence. If set to 1.0, InformationGain is not used.
+js_threshold = 0.2
+
+[dqnpolicy]
+q_update = double
+# set architecture=duel for vanilla neural networks
+architecture = noisy_duel
+h1_size = 300
+h2_size = 100
+capacity = 2000
+beta = 0.95
+epsilon_start = 0.3
+maxiter = 4000
+minibatch_size = 64
+is_threshold = 5.0
+episodeNum = 0.0
+epsilon_end = 0.0
+n_in = 268
+features = ["discourseAct", "method", "requested", "full", "lastActionInformNone", "offerHappened", "inform_info"]
+
+###### Evaluation parameters ######
+
+[eval]
+rewardvenuerecommended=0
+penaliseallturns = True
+wrongvenuepenalty = 0
+notmentionedvaluepenalty = 0
+successmeasure = objective
+successreward = 20
+
diff --git a/feudalconfig.cfg b/policy/feudalgainRL/configs/env3-CR-FeudalGain.cfg
similarity index 73%
rename from feudalconfig.cfg
rename to policy/feudalgainRL/configs/env3-CR-FeudalGain.cfg
index cb24e8f4c4fc8173412f33b7db0eacc779d297f9..c85b4e8d71aa192d8d35a505ceca2baa5bab6f25 100644
--- a/feudalconfig.cfg
+++ b/policy/feudalgainRL/configs/env3-CR-FeudalGain.cfg
@@ -4,14 +4,15 @@
 
 ###### General parameters ######
 [GENERAL]
+# Set to "SFRestaurants" or "Laptops11"
 domains = CamRestaurants
 singledomain = True
 tracedialog = 0
-seed = 1
+seed = 0
 
 [exec_config]
-configdir = _benchmarkpolicies/env3-feudal
-logfiledir = _benchmarklogs/env3-feudal
+configdir = _benchmarkpolicies/env3-feudalgain
+logfiledir = _benchmarklogs/env3-feudalgain
 numtrainbatches = 20
 traindialogsperbatch = 200
 numbatchtestdialogs =  500
@@ -56,7 +57,7 @@ byemask = True
 
 ###### Dialogue Manager parameters ######
 [policy]
-policydir = _benchmarkpolicies/env3-feudal
+policydir = _benchmarkpolicies/env3-feudalgain
 belieftype = focus
 useconfreq = False
 learning = True
@@ -64,33 +65,23 @@ policytype = feudalgain
 startwithhello = False
 inpolicyfile = auto
 outpolicyfile = auto
-temperature = 0.0
+# Set noisy_acer=False for vanilla neural networks
 noisy_acer = True
-sample_argmax = False
+# Set use_pass=True if transitions in which the pass() action was taken should be used. Always set to False if InformationGain is used.
+use_pass = False
 
 [feudalpolicy]
 features=learned
 si_policy_type=acer
+# only_master=True means that only the merged policy pi_mg is used; set to False to use pi_m and pi_g
 only_master = True
-jsd_reward = False
-#jsd_function = tanh
+# Set the threshold for the information gain reward calculated from the JS-divergence. If set to 1.0, InformationGain is not used.
 js_threshold = 0.2
-js_threshold_master = 1
-
-[i2a]
-is_imaging = False
-deepmind = False
-load_pretrain_data = False
-improve_env = False
-share_layer = 2
-new_q_loss = False
-device = cpu
-env_model_path = env_model/env1_acer_200.pkl
 
 [dqnpolicy]
 q_update = double
+# set architecture=duel for vanilla neural networks
 architecture = noisy_duel
-#architecture = duel
 h1_size = 300
 h2_size = 100
 capacity = 2000
diff --git a/policy/feudalgainRL/configs/env4-CR-FeudalGain.cfg b/policy/feudalgainRL/configs/env4-CR-FeudalGain.cfg
new file mode 100644
index 0000000000000000000000000000000000000000..1b9053b16ea9a46abf6b3eaf014e946f66f9134d
--- /dev/null
+++ b/policy/feudalgainRL/configs/env4-CR-FeudalGain.cfg
@@ -0,0 +1,107 @@
+# Error model: 15% error rate, DSTC2 confscorer, DSTC2 nbestgenerator
+# User model: standard sampled params, sampled patience
+# Masks: off
+
+###### General parameters ######
+[GENERAL]
+# Set to "SFRestaurants" or "Laptops11"
+domains = CamRestaurants
+singledomain = True
+tracedialog = 0
+seed = 0
+
+[exec_config]
+configdir = _benchmarkpolicies/env4-feudalgain
+logfiledir = _benchmarklogs/env4-feudalgain
+numtrainbatches = 20
+traindialogsperbatch = 200
+numbatchtestdialogs =  500
+trainsourceiteration = 0
+numtestdialogs =  500
+trainerrorrate = 15
+testerrorrate  = 15
+testeverybatch = True
+deleteprevpolicy = True
+
+[logging]
+usecolor = False
+screen_level = results
+file_level = results
+file = auto
+
+###### Environment parameters ######
+
+[agent]
+maxturns = 25
+
+[usermodel]
+usenewgoalscenarios = True
+oldstylepatience = False
+patience = 4,6
+configfile = config/sampledUM.cfg
+
+[errormodel]
+nbestsize = 5
+confusionmodel = LevenshteinConfusions
+nbestgeneratormodel = DSTC2NBestGenerator
+confscorer = DSTC2
+configfile = config/set1-ErrorModel.cfg
+
+
+[summaryacts]
+maxinformslots = 5
+informmask = False
+requestmask = False
+informcountaccepted = 4
+byemask = False
+
+###### Dialogue Manager parameters ######
+[policy]
+policydir = _benchmarkpolicies/env4-feudalgain
+belieftype = focus
+useconfreq = False
+learning = True
+policytype = feudalgain
+startwithhello = False
+inpolicyfile = auto
+outpolicyfile = auto
+# Set noisy_acer=False for vanilla neural networks
+noisy_acer = True
+# Set use_pass=True if transitions in which the pass() action was taken should be used. Always set to False if InformationGain is used.
+use_pass = False
+
+[feudalpolicy]
+features=learned
+si_policy_type=acer
+# only_master=True means that only the merged policy pi_mg is used; set to False to use pi_m and pi_g
+only_master = True
+# Set the threshold for the information gain reward calculated from the JS-divergence. If set to 1.0, InformationGain is not used.
+js_threshold = 0.2
+
+[dqnpolicy]
+q_update = double
+# set architecture=duel for vanilla neural networks
+architecture = noisy_duel
+h1_size = 300
+h2_size = 100
+capacity = 2000
+beta = 0.95
+epsilon_start = 0.3
+maxiter = 4000
+minibatch_size = 64
+is_threshold = 5.0
+episodeNum = 0.0
+epsilon_end = 0.0
+n_in = 268
+features = ["discourseAct", "method", "requested", "full", "lastActionInformNone", "offerHappened", "inform_info"]
+
+###### Evaluation parameters ######
+
+[eval]
+rewardvenuerecommended=0
+penaliseallturns = True
+wrongvenuepenalty = 0
+notmentionedvaluepenalty = 0
+successmeasure = objective
+successreward = 20
+
diff --git a/policy/feudalgainRL/configs/env5-CR-FeudalGain.cfg b/policy/feudalgainRL/configs/env5-CR-FeudalGain.cfg
new file mode 100644
index 0000000000000000000000000000000000000000..61bf9b1f5ed71d870e835a3ff4bb3075af2687e7
--- /dev/null
+++ b/policy/feudalgainRL/configs/env5-CR-FeudalGain.cfg
@@ -0,0 +1,107 @@
+# Error model: 15% error rate, DSTC2 confscorer, DSTC2 nbestgenerator
+# User model: unfriendly sampled params, sampled patience
+# Masks: on
+
+###### General parameters ######
+[GENERAL]
+# Set to "SFRestaurants" or "Laptops11"
+domains = CamRestaurants
+singledomain = True
+tracedialog = 0
+seed = 0
+
+[exec_config]
+configdir = _benchmarkpolicies/env5-feudalgain
+logfiledir = _benchmarklogs/env5-feudalgain
+numtrainbatches = 20
+traindialogsperbatch = 200
+numbatchtestdialogs =  500
+trainsourceiteration = 0
+numtestdialogs =  500
+trainerrorrate = 15
+testerrorrate  = 15
+testeverybatch = True
+deleteprevpolicy = True
+
+[logging]
+usecolor = False
+screen_level = results
+file_level = results
+file = auto
+
+###### Environment parameters ######
+
+[agent]
+maxturns = 25
+
+[usermodel]
+usenewgoalscenarios = True
+oldstylepatience = False
+patience = 4,6
+configfile = config/unfriendlyUM.cfg
+
+[errormodel]
+nbestsize = 5
+confusionmodel = LevenshteinConfusions
+nbestgeneratormodel = DSTC2NBestGenerator
+confscorer = DSTC2
+configfile = config/set1-ErrorModel.cfg
+
+
+[summaryacts]
+maxinformslots = 5
+informmask = True
+requestmask = True
+informcountaccepted = 4
+byemask = True
+
+###### Dialogue Manager parameters ######
+[policy]
+policydir = _benchmarkpolicies/env5-feudalgain
+belieftype = focus
+useconfreq = False
+learning = True
+policytype = feudalgain
+startwithhello = False
+inpolicyfile = auto
+outpolicyfile = auto
+# Set noisy_acer=False for vanilla neural networks
+noisy_acer = True
+# Set use_pass=True if transitions in which the pass() action was taken should be used. Always set to False if InformationGain is used.
+use_pass = False
+
+[feudalpolicy]
+features=learned
+si_policy_type=acer
+# only_master=True means that only the merged policy pi_mg is used; set to False to use pi_m and pi_g
+only_master = True
+# Set the threshold for the information gain reward calculated from the JS-divergence. If set to 1.0, InformationGain is not used.
+js_threshold = 0.2
+
+[dqnpolicy]
+q_update = double
+# set architecture=duel for vanilla neural networks
+architecture = noisy_duel
+h1_size = 300
+h2_size = 100
+capacity = 2000
+beta = 0.95
+epsilon_start = 0.3
+maxiter = 4000
+minibatch_size = 64
+is_threshold = 5.0
+episodeNum = 0.0
+epsilon_end = 0.0
+n_in = 268
+features = ["discourseAct", "method", "requested", "full", "lastActionInformNone", "offerHappened", "inform_info"]
+
+###### Evaluation parameters ######
+
+[eval]
+rewardvenuerecommended=0
+penaliseallturns = True
+wrongvenuepenalty = 0
+notmentionedvaluepenalty = 0
+successmeasure = objective
+successreward = 20
+
diff --git a/policy/feudalgainRL/configs/env6-CR-FeudalGain.cfg b/policy/feudalgainRL/configs/env6-CR-FeudalGain.cfg
new file mode 100644
index 0000000000000000000000000000000000000000..75fc7db5b663cd9172fe0ac987cd05177e0bf72f
--- /dev/null
+++ b/policy/feudalgainRL/configs/env6-CR-FeudalGain.cfg
@@ -0,0 +1,106 @@
+# Error model: 30% error rate, DSTC2 confscorer, DSTC2 nbestgenerator
+# User model: standard sampled params, sampled patience
+# Masks: on
+
+###### General parameters ######
+[GENERAL]
+# Set to "SFRestaurants" or "Laptops11" for SanFrancisco or Laptops domain
+domains = CamRestaurants
+singledomain = True
+tracedialog = 0
+seed = 0
+
+[exec_config]
+configdir = _benchmarkpolicies/env6-feudalgain
+logfiledir = _benchmarklogs/env6-feudalgain
+numtrainbatches = 20
+traindialogsperbatch = 200
+numbatchtestdialogs =  500
+trainsourceiteration = 0
+numtestdialogs =  500
+trainerrorrate = 30
+testerrorrate  = 30
+testeverybatch = True
+deleteprevpolicy = True
+
+[logging]
+usecolor = False
+screen_level = results
+file_level = results
+file = auto
+
+###### Environment parameters ######
+
+[agent]
+maxturns = 25
+
+[usermodel]
+usenewgoalscenarios = True
+oldstylepatience = False
+patience = 4,6
+configfile = config/sampledUM.cfg
+
+[errormodel]
+nbestsize = 5
+confusionmodel = LevenshteinConfusions
+nbestgeneratormodel = DSTC2NBestGenerator
+confscorer = DSTC2
+configfile = config/set3-ErrorModel.cfg
+
+[summaryacts]
+maxinformslots = 5
+informmask = True
+requestmask = True
+informcountaccepted = 4
+byemask = True
+
+###### Dialogue Manager parameters ######
+[policy]
+policydir = _benchmarkpolicies/env6-feudalgain
+belieftype = focus
+useconfreq = False
+learning = True
+policytype = feudalgain
+startwithhello = False
+inpolicyfile = auto
+outpolicyfile = auto
+# Set noisy_acer=False for vanilla neural networks
+noisy_acer = True
+# Set use_pass=True if transitions in which the pass() action was taken should be used. Always set to False if InformationGain is used.
+use_pass = False
+
+[feudalpolicy]
+features=learned
+si_policy_type=acer
+# only_master=True means that only the merged policy pi_mg is used; set to False to use pi_m and pi_g
+only_master = True
+# Set the threshold for the information gain reward calculated from the JS-divergence. If set to 1.0, InformationGain is not used.
+js_threshold = 0.2
+
+[dqnpolicy]
+q_update = double
+# set architecture=duel for vanilla neural networks
+architecture = noisy_duel
+h1_size = 300
+h2_size = 100
+capacity = 2000
+beta = 0.95
+epsilon_start = 0.3
+maxiter = 4000
+minibatch_size = 64
+is_threshold = 5.0
+episodeNum = 0.0
+epsilon_end = 0.0
+n_in = 268
+features = ["discourseAct", "method", "requested", "full", "lastActionInformNone", "offerHappened", "inform_info"]
+
+###### Evaluation parameters ######
+
+[eval]
+rewardvenuerecommended=0
+penaliseallturns = True
+wrongvenuepenalty = 0
+notmentionedvaluepenalty = 0
+successmeasure = objective
+successreward = 20
+
diff --git a/policy/feudalgainRL/dqn_latest.py b/policy/feudalgainRL/dqn_latest.py
deleted file mode 100644
index f945067231ef7176b671fd6c5d35dea2599586e4..0000000000000000000000000000000000000000
--- a/policy/feudalgainRL/dqn_latest.py
+++ /dev/null
@@ -1,197 +0,0 @@
-###############################################################################
-# PyDial: Multi-domain Statistical Spoken Dialogue System Software
-###############################################################################
-#
-# Copyright 2015 - 2019
-# Cambridge University Engineering Department Dialogue Systems Group
-#
-# 
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-###############################################################################
-
-""" 
-Implementation of DQAN -  Deep Q Action Network
-
-The algorithm is developed with tflearn + Tensorflow
-
-Author: Pei-Hao Su
-"""
-import tensorflow as tf
-import numpy as np
-import tflearn
-
-from policy.DRL.replay_buffer import ReplayBuffer
-
-# ===========================
-#   Deep Q Action Network
-# ===========================
-class DeepQNetwork(object):
-    """ 
-    Input to the network is the state and action, output is Q(s,a).
-    """
-    def __init__(self, sess, state_dim, action_dim, learning_rate, tau, \
-                    num_actor_vars, architecture = 'duel', h1_size = 130, h2_size = 50):
-        self.sess = sess
-        self.s_dim = state_dim
-        self.a_dim = action_dim
-        self.learning_rate = learning_rate
-        self.tau = tau
-        self.architecture = architecture
-        self.h1_size = h1_size
-        self.h2_size = h2_size
-
-        # Create the deep Q network
-        self.inputs, self.action, self.Qout = \
-                        self.create_ddq_network(self.architecture, self.h1_size, self.h2_size)
-        self.network_params = tf.trainable_variables()
-
-        # Target Network
-        self.target_inputs, self.target_action, self.target_Qout = \
-                        self.create_ddq_network(self.architecture, self.h1_size, self.h2_size)
-        self.target_network_params = tf.trainable_variables()[len(self.network_params):]
-
-        # Op for periodically updating target network
-        self.update_target_network_params = \
-            [self.target_network_params[i].assign(\
-                tf.mul(self.network_params[i], self.tau) + tf.mul(self.target_network_params[i], 1. - self.tau))
-                for i in range(len(self.target_network_params))]
-
-        # Network target (y_i)
-        self.sampled_q = tf.placeholder(tf.float32, [None, 1])
-        
-        # Predicted Q given state and chosed action
-        #actions_one_hot = self.action
-        #self.pred_q = tf.reduce_sum(self.Qout * actions_one_hot, reduction_indices=1, name='q_acted')
-        self.pred_q = self.Qout
-
-        self.diff = self.sampled_q - self.pred_q
-
-        self.loss = tf.reduce_mean(self.clipped_error(self.diff), name='loss')
-
-        self.optimizer = tf.train.AdamOptimizer(self.learning_rate)
-        self.optimize = self.optimizer.minimize(self.loss)
-       
-    def create_ddq_network(self, architecture = 'duel', h1_size = 130, h2_size = 50):
-        inputs = tf.placeholder(tf.float32, [None, self.s_dim])
-        action = tf.placeholder(tf.float32, [None, self.a_dim])
-
-        # state network
-        W_fc1_s = tf.Variable(tf.truncated_normal([self.s_dim, h1_size], stddev=0.01))
-        b_fc1_s = tf.Variable(tf.zeros([h1_size]))
-        h_fc1_s = tf.nn.relu(tf.matmul(inputs, W_fc1_s) + b_fc1_s)
-
-        # action network
-        W_fc1_a = tf.Variable(tf.truncated_normal([self.a_dim, h1_size], stddev=0.01))
-        b_fc1_a = tf.Variable(tf.zeros([h1_size]))
-        h_fc1_a = tf.nn.relu(tf.matmul(action, W_fc1_a) + b_fc1_a)
-
-
-        #h_fc1 = tf.nn.tanh(tf.matmul(inputs, W_fc1) + b_fc1)
-        #if architecture == 'duel':
-        if False:
-
-            """
-            W_fc2_s = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01))
-            b_fc2_s = tf.Variable(tf.zeros([h2_size]))
-            h_fc2_s = tf.nn.relu(tf.matmul(h_fc1_s, W_fc2_s) + b_fc2_s)
-
-            W_value = tf.Variable(tf.truncated_normal([h2_size, 1], stddev=0.01))
-            b_value = tf.Variable(tf.zeros([1]))
-            value_out  = tf.matmul(h_fc2_s, W_value) + b_value
-
-
-
-            W_fc2_a = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01))
-            b_fc2_a = tf.Variable(tf.zeros([h2_size]))
-            h_fc2_a = tf.nn.relu(tf.matmul(h_fc1_a, W_fc2_a) + b_fc2_a)
-
-            Qout = tf.reduce_sum( tf.mul( h_fc2_s,h_fc2_a ), 1)
-            """
-
-
-            # value function
-            W_value = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01))
-            b_value = tf.Variable(tf.zeros([h2_size]))
-            h_value = tf.nn.relu(tf.matmul(h_fc1, W_value) + b_value)
-
-            W_value = tf.Variable(tf.truncated_normal([h2_size, 1], stddev=0.01))
-            b_value = tf.Variable(tf.zeros([1]))
-            value_out  = tf.matmul(h_value, W_value) + b_value
-
-            # advantage function
-            W_advantage = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01))
-            b_advantage = tf.Variable(tf.zeros([h2_size]))
-            h_advantage = tf.nn.relu(tf.matmul(h_fc1, W_advantage) + b_advantage)
-
-            W_advantage = tf.Variable(tf.truncated_normal([h2_size, self.a_dim], stddev=0.01))
-            b_advantage = tf.Variable(tf.zeros([self.a_dim]))
-            Advantage_out  = tf.matmul(h_advantage, W_advantage) + b_advantage
-
-            Qout = value_out + (Advantage_out - tf.reduce_mean(Advantage_out, reduction_indices=1, keep_dims=True))
-
-        else:
-            W_fc2_s = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01))
-            b_fc2_s = tf.Variable(tf.zeros([h2_size]))
-            h_fc2_s = tf.nn.relu(tf.matmul(h_fc1_s, W_fc2_s) + b_fc2_s)
-            
-            W_fc2_a = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01))
-            b_fc2_a = tf.Variable(tf.zeros([h2_size]))
-            h_fc2_a = tf.nn.relu(tf.matmul(h_fc1_a, W_fc2_a) + b_fc2_a)
-
-            # inner product of state s and action a
-            #Qout = tf.mul(h_fc2_s,h_fc2_a)
-            Qout = tf.reduce_sum( tf.mul( h_fc2_s,h_fc2_a ), 1)
-            #Qout = tf.reduce_sum( tf.mul( h_fc2_s,h_fc2_a ), 1, keep_dims=True )
-            #Qout = tf.reduce_sum(tf.mul(h_fc2_s,h_fc2_a))
-        
-        return inputs, action, Qout
-
-    def train(self, inputs, action, sampled_q):
-        return self.sess.run([self.pred_q, self.optimize, self.loss], feed_dict={
-            self.inputs: inputs,
-            self.action: action,
-            self.sampled_q: sampled_q
-        })
-
-    def predict(self, inputs, action):
-        #return self.sess.run(self.pred_q, feed_dict={
-        return self.sess.run(self.Qout, feed_dict={
-            self.inputs: inputs,
-            self.action: action
-        })
-
-    def predict_target(self, inputs, action):
-        #return self.sess.run(self.pred_q, feed_dict={
-        return self.sess.run(self.target_Qout, feed_dict={
-            self.target_inputs: inputs,
-            self.target_action: action
-        })
-
-    def update_target_network(self):
-        self.sess.run(self.update_target_network_params)
-
-    def load_network(self, load_filename):
-        self.saver = tf.train.Saver()
-        try:
-            self.saver.restore(self.sess, load_filename)
-            print("Successfully loaded:", load_filename)
-        except:
-            print("Could not find old network weights")
-
-    def save_network(self, save_filename):
-        print('Saving deepq-network...')
-        self.saver.save(self.sess, save_filename)
-
-    def clipped_error(self, x): 
-        return tf.select(tf.abs(x) < 1.0, 0.5 * tf.square(x), tf.abs(x) - 0.5) # condition, true, false
diff --git a/policy/feudalgainRL/noisyacer.py b/policy/feudalgainRL/noisyacer.py
index da52ad6ad9af57907094797589ee5cb9b954ab00..decaffab6d9d2cc3f15dc5134447d129b7202dc9 100644
--- a/policy/feudalgainRL/noisyacer.py
+++ b/policy/feudalgainRL/noisyacer.py
@@ -21,31 +21,26 @@
 ###############################################################################
 
 """
-Implementation of ACER
+Implementation of ACER with Noisy Networks
 
 The algorithm is developed with Tensorflow
 
-Author: Gellert Weisz
+Author: Gellert Weisz/Christian Geishauser
 """
 
 
 import numpy as np
 import tensorflow as tf
 
-from random import choice
-from time import sleep
-from time import time
-
-import sys # todo remove later
 
 # ===========================
-#   Soft Actor Critic with Experience Replay
+#   Actor Critic with Experience Replay
 # ===========================
 
 
 class NoisyACERNetwork(object):
     def __init__(self, sess, state_dim, action_dim, learning_rate, delta, c, alpha, h1_size=130, h2_size=50,
-                 is_training = True, actfreq_loss=None, temperature=0, critic_regularizer_weight=0, noisy_acer=False):
+                 is_training=True, actfreq_loss=None, noisy_acer=False):
         self.sess = sess
         self.s_dim = state_dim
         self.a_dim = action_dim
@@ -55,9 +50,7 @@ class NoisyACERNetwork(object):
         else:
             self.master_space = False
         self.learning_rate = learning_rate
-        self.critic_regularizer_weight = critic_regularizer_weight
-        if self.critic_regularizer_weight != 0:
-            print(f"We use a regularizer for the critic with weight {self.critic_regularizer_weight}.")
+
         self.delta = delta
         self.c = c
         self.noisy_acer = noisy_acer
@@ -65,11 +58,6 @@ class NoisyACERNetwork(object):
         self.h1_size = h1_size
         self.h2_size = h2_size
         self.is_training = is_training
-        self.temperature = temperature
-        if self.temperature != 0:
-            print("Using soft ACER, temperature set to: ", self.temperature)
-        else:
-            print("Temperature of Maximum Entropy set to 0, using ACER.")
 
         #Input and hidden layers
         self.inputs = tf.placeholder(tf.float32, [None, self.s_dim])
@@ -94,7 +82,7 @@ class NoisyACERNetwork(object):
         self.avg_policy = tf.stop_gradient(self.avg_policy)
 
         # weighted average over q-values according to current policy gives the value of the state
-        self.value = tf.reduce_sum((self.q - self.temperature * tf.log(self.policy)) * self.policy, 1)
+        self.value = tf.reduce_sum(self.q * self.policy, 1)
 
         self.actions_onehot = self.actions
         self.responsible_outputs = tf.reduce_sum(self.policy * self.actions_onehot, [1])
@@ -109,7 +97,6 @@ class NoisyACERNetwork(object):
 
         self.q_ret = tf.placeholder(tf.float32, [None])
 
-
         # step 1 from pawel
         self.advantages_qret = self.q_ret - self.value
         self.wrt_theta_step1 = -tf.reduce_sum(tf.log(self.responsible_outputs) * tf.stop_gradient(self.rho * self.advantages_qret))
@@ -117,18 +104,12 @@ class NoisyACERNetwork(object):
         # step 2 from pawel
         self.wrt_theta = tf.reduce_sum(
             tf.log(self.responsible_outputs) *
-            tf.stop_gradient(self.rho_bar_c * (self.advantages_qret - self.temperature * (1 + tf.log(self.responsible_outputs)))) +
-            tf.reduce_sum(tf.log(self.policy) *
-                          tf.stop_gradient(tf.maximum(0., 1. - self.c / self.rho_all) *
-                                           self.policy *
-                                           (self.q - tf.reshape(self.value, [-1, 1]) - self.temperature * (1 + tf.log(self.policy)))), [1]))
+            tf.stop_gradient(self.rho_bar_c * self.advantages_qret) +
+            tf.reduce_sum(tf.log(self.policy) * tf.stop_gradient(tf.maximum(0., 1. - self.c / self.rho_all) *
+                          self.policy * (self.q - tf.reshape(self.value, [-1, 1]))), [1]))
 
         self.q_regularizer = tf.placeholder(tf.float32, [None])
-        if self.critic_regularizer_weight != 0:
-            self.wrt_theta_v = tf.reduce_sum(tf.square(self.q_ret - self.responsible_q)) + \
-                               self.critic_regularizer_weight * tf.reduce_sum(tf.square(self.q_regularizer - self.responsible_q))
-        else:
-            self.wrt_theta_v = tf.reduce_sum(tf.square(self.q_ret - self.responsible_q))
+        self.wrt_theta_v = tf.reduce_sum(tf.square(self.q_ret - self.responsible_q))
 
         self.entropy = -tf.reduce_sum(self.policy * tf.log(self.policy))
         #self.loss = self.wrt_theta_v + self.wrt_theta - self.entropy * 0.01
@@ -166,7 +147,7 @@ class NoisyACERNetwork(object):
             else:
                 self.final_gradients.append((-self.g[i][0], self.g[i][1])) # negative because this is loss
 
-        if self.temperature == 0 and not self.noisy_acer:
+        if not self.noisy_acer:
             self.optimize = [self.optimizer.apply_gradients(self.final_gradients),
                              self.optimizer.apply_gradients(self.entropy_gradients),
                              self.optimizer.apply_gradients(self.value_gradients)
@@ -261,9 +242,6 @@ class NoisyACERNetwork(object):
             #self.behaviour_mask: behaviour_mask
         }
 
-        if self.critic_regularizer_weight != 0:
-            feed_dict[self.q_regularizer] = critic_regularizer_output
-
         trpo_scale, klprod, kl, diff, entropy, loss, optimize = self.sess.run([self.trpo_scale, self.klprod, self.kl, self.advantage_qret_diff, self.entropy, self.loss, self.optimize], feed_dict=feed_dict)
         update_avg_theta = self.sess.run([self.update_avg_theta], feed_dict=feed_dict)
 
diff --git a/pydial.py b/pydial.py
index 606eb2382bddbcaa4e8645204de98c0102c77f50..ee827a042efcc02bd0ad57350972e486f6dcb415 100644
--- a/pydial.py
+++ b/pydial.py
@@ -825,6 +825,10 @@ def train_command(configfile, seed=None, trainerrorrate=None,trainsourceiteratio
         Optional parameters over-ride the corresponding config parameters of the same name.
     """
 
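+    # Create the default output directories used by the benchmark configs if they do not already exist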
+    os.makedirs("_benchmarklogs", exist_ok=True)
+    os.makedirs("_benchmarkpolicies", exist_ok=True)
+
     try:
         if seed and seed.startswith('('):
             seeds = seed.replace('(', '').replace(')', '').split(',')