diff --git a/policy/FeudalGainPolicy.py b/policy/FeudalGainPolicy.py index 9a767ab2be8993f9ab07668968d9ba81535978f0..5ea4b20fc4f741e665315ca0b858e4e3b8a59984 100644 --- a/policy/FeudalGainPolicy.py +++ b/policy/FeudalGainPolicy.py @@ -21,6 +21,14 @@ ############################################################################### +""" +Implementation of FeudalGain + +Paper: Arxiv reference + +Author: Christian Geishauser +""" + import numpy as np import random import utils @@ -96,9 +104,6 @@ class FeudalGainPolicy(Policy.Policy): if cfg.has_option('feudalpolicy', 'info_reward_master'): self.info_reward_master = cfg.getfloat('feudalpolicy', 'info_reward_master') print("Master policy trains with info_gain reward") - self.js_threshold_master = 1.0 - if cfg.has_option('feudalpolicy', 'js_threshold_master'): - self.js_threshold_master = cfg.getfloat('feudalpolicy', 'js_threshold_master') self.only_master = False if cfg.has_option('feudalpolicy', 'only_master'): self.only_master = cfg.getboolean('feudalpolicy', 'only_master') @@ -108,16 +113,6 @@ class FeudalGainPolicy(Policy.Policy): self.bye_mask = False if cfg.has_option('summaryacts', 'byemask'): self.bye_mask = cfg.getboolean('summaryacts', 'byemask') - print("WE USE BYEMASK: ", self.bye_mask) - - self.critic_regularizer_path = None - if cfg.has_option('policy', 'critic_regularizer'): - self.critic_regularizer_path = cfg.get('policy', 'critic_regularizer') - print(f"We use {self.critic_regularizer_path} as a critic regularizer.") - - self.critic_regularizer_weight = 0 - if cfg.has_option('policy', 'critic_regularizer_weight'): - self.critic_regularizer_weight = cfg.getfloat('policy', 'critic_regularizer_weight') self.randomseed = 1234 if cfg.has_option('GENERAL', 'seed'): @@ -150,51 +145,26 @@ class FeudalGainPolicy(Policy.Policy): self.chosen = False if self.only_master: - print("Using ACER with merged policy.") + print("Using merged policy pi_mg") self.master_actions = self.slot_independent_actions[:-1] + ['slot_dep'] self.master_policy = FeudalNoisyACERPolicy(self._modify_policyfile('master', in_policy_file), self._modify_policyfile('master', out_policy_file), domainString=self.domainString, is_training=self.is_training, action_names=self.master_actions, sd_state_dim=self.probability_max, - slot='si', js_threshold=self.js_threshold_master, - info_reward=self.info_reward_master, load_policy=self.load_master_policy, - critic_regularizer_weight=self.critic_regularizer_weight) + slot='si', load_policy=self.load_master_policy) elif self.si_policy_type == 'acer': - print("Using ACER with give_info and master_policy.") + print("Using policies pi_m and pi_g") self.master_policy = FeudalNoisyACERPolicy(self._modify_policyfile('master', in_policy_file), self._modify_policyfile('master', out_policy_file), domainString=self.domainString, is_training=self.is_training, action_names=self.master_actions, sd_state_dim=self.probability_max, - slot='si', js_threshold=self.js_threshold_master, - info_reward=self.info_reward_master) + slot='si') self.give_info_policy = FeudalNoisyACERPolicy(self._modify_policyfile('gi', in_policy_file), self._modify_policyfile('gi', out_policy_file), domainString=self.domainString, is_training=self.is_training, action_names=self.slot_independent_actions, slot='si', sd_state_dim=self.probability_max) - elif self.si_policy_type == 'dqn': - self.master_policy = FeudalNoisyACERPolicy(self._modify_policyfile('master', in_policy_file), - self._modify_policyfile('master', out_policy_file), - domainString=self.domainString, 
is_training=self.is_training, - action_names=self.master_actions, sd_state_dim=self.probability_max, - slot='si') - self.give_info_policy = FeudalDQNPolicy(self._modify_policyfile('gi', in_policy_file), - self._modify_policyfile('gi', out_policy_file), - domainString=self.domainString, is_training=self.is_training, - action_names=self.slot_independent_actions, slot='si', - sd_state_dim=0) - - else: - self.master_policy = FeudalDQNPolicy(self._modify_policyfile('master', in_policy_file), - self._modify_policyfile('master', out_policy_file), - domainString=self.domainString, is_training=self.is_training, - action_names=self.master_actions, - slot='si')#pass is always masked, but its needed for implementation - self.give_info_policy = FeudalDQNPolicy(self._modify_policyfile('gi', in_policy_file), - self._modify_policyfile('gi', out_policy_file), - domainString=self.domainString, is_training=self.is_training, - action_names=self.slot_independent_actions, slot='si') self.request_info_policy = FeudalDQNPolicy(self._modify_policyfile('ri', in_policy_file), self._modify_policyfile('ri', out_policy_file), diff --git a/policy/MasterAction.py b/policy/MasterAction.py new file mode 100644 index 0000000000000000000000000000000000000000..74309722fb1ede44e334d4b164116a10cae0dd51 --- /dev/null +++ b/policy/MasterAction.py @@ -0,0 +1,381 @@ +############################################################################### +# PyDial: Multi-domain Statistical Spoken Dialogue System Software +############################################################################### +# +# Copyright 2015 - 2019 +# Cambridge University Engineering Department Dialogue Systems Group +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +############################################################################### + +''' +MasterAction.py - Mapping between summary and master actions +============================================================= + +Copyright CUED Dialogue Systems Group 2015 - 2017 + +.. seealso:: CUED Imports/Dependencies: + + import :mod:`policy.SummaryUtils` |.| + import :mod:`ontology.Ontology` |.| + import :mod:`utils.ContextLogger` |.| + import :mod:`utils.Settings` + +************************ + +''' + +__author__ = "Christian Geishauser" + +from policy import SummaryUtils +from utils import ContextLogger, Settings +from ontology import Ontology + +logger = ContextLogger.getLogger('') + +MAX_NUM_ACCEPTED = 10 + + +class MasterAction(object): + ''' + The master action class encapsulates the functionality of a master action. + + .. Note:: + The list of all possible master actions is defined in this class.
+ ''' + + def __init__(self, domainString, empty=False, confreq=False): + ''' + Records what domain the class is instantiated for, and what actions are available + + :param domainString: domain tag + :type domainString: string + :param empty: None + :type empty: bool + :param confreq: representing if the action confreq is used + :type confreq: bool + ''' + + self.domainString = domainString + self.summary_action_names = [] + self.inform_names = [] + self.action_names = [] + self._array_slot_summary = None + self._global_summary = None + + self.inform_mask = False + if Settings.config.has_option("summaryacts", "informmask"): + self.inform_mask = Settings.config.getboolean('summaryacts', 'informmask') + self.inform_count_accepted = 4 + if Settings.config.has_option("summaryacts", "informcountaccepted"): + self.inform_count_accepted = Settings.config.getint('summaryacts', 'informcountaccepted') + elif Settings.config.has_option("goalgenerator", "maxconstraints"): + self.inform_count_accepted = Settings.config.getint('goalgenerator', 'maxconstraints') + 1 + self.request_mask = False + if Settings.config.has_option("summaryacts", "requestmask"): + self.request_mask = Settings.config.getboolean('summaryacts', 'requestmask') + self.bye_mask = False + if Settings.config.has_option("summaryacts", "byemask"): + self.bye_mask = Settings.config.getboolean('summaryacts', 'byemask') + + if not empty: + for slot in Ontology.global_ontology.get_system_requestable_slots(domainString): + self.summary_action_names.append("request_" + slot) + self.summary_action_names.append("confirm_" + slot) + self.summary_action_names.append("select_" + slot) + if confreq: + for slot2 in Ontology.global_ontology.get_system_requestable_slots(domainString): + self.summary_action_names.append("confreq_" + slot + "_" + slot2) + self.summary_action_names += ["inform", + "inform_byname", + "inform_alternatives", + "bye", + "repeat", + "reqmore", + "restart" + ] + + informable_slots = Ontology.global_ontology.get_requestable_slots(domainString) + informable_slots.remove("name") + for i in range(0, 2**len(informable_slots)): + slots_to_inform = "" + #get binary representation and reverse it + binary_rep = format(i, "b")[::-1] + for index, j in enumerate(binary_rep): + if int(j) == 1: + slots_to_inform += f"-{informable_slots[index]}" + + self.inform_names.append(slots_to_inform) + + self.inform_ways = ["inform", "inform_byname", "inform_alternatives"] + + for inform_way in self.inform_ways: + for slots_to_inform in self.inform_names: + self.action_names.append(inform_way + slots_to_inform) + + for name in self.summary_action_names: + if not name.startswith("inform"): + self.action_names.append(name) + + print("WE WORK IN MASTER ACTION SPACE DIRECTLY!") + print("NUMBER OF POSSIBLE MASTER ACTIONS: ", len(self.action_names)) + print("NOTE: MASKING IS NOT IMPLEMENTED YET!") + + self.reset() + + def reset(self): + self.alternatives_requested = False + + def Convert(self, belief, action, lastSystemAction=None): + ''' + Converts the given summary action into a master action based on the current belief and the last system action. 
+ + :param belief: the current master belief + :type belief: dict + :param action: the summary action to be converted to master action + :type action: string + :param lastSystemAction: the system action of the previous turn + :type lastSystemAction: string + ''' + + self._array_slot_summary = SummaryUtils.arraySlotSummary(belief, self.domainString) + self._global_summary = SummaryUtils.globalSummary(belief, self.domainString) + logger.dial('system summary act: {}.'.format(action)) + + if action.startswith("inform"): + if "byname" in action: + output = self.getInformByName(belief, action) + elif "alternatives" in action: + output = self.getInformAlternatives(belief, action) + else: + #just inform by constraints possible now + output = self.getInformByConstraints(belief, action) + elif "request_" in action: + output = self.getRequest(action.split("_")[1]) + elif "select_" in action: + output = self.getSelect(action.split("_")[1]) + elif "confirm_" in action: + output = self.getConfirm(action.split("_")[1]) + elif "confreq_" in action: + output = self.getConfReq(action.split("_")[1], action.split("_")[2]) + elif action == "bye": + output = self.getBye() + elif action == "repeat": + output = lastSystemAction + elif action == "reqmore": + output = self.getReqMore() + elif action == "restart": + output = self.getRestart() + else: + output = "" + logger.error("Unknown action: " + action) + return output + + # MASK OVER SUMMARY ACTION SET + # ------------------------------------------------------------------------------------ + + def getNonExecutable(self, belief, lastSystemAction): + ''' + Set of rules defining the mask over the action set, given the current belief state + :param belief: the current master belief + :type belief: dict + :param lastSystemAction: the system action of the previous turn + :type lastSystemAction: string + :return: list of non-executable (masked) actions + ''' + + array_slot_summary = SummaryUtils.arraySlotSummary(belief, self.domainString) + global_summary = SummaryUtils.globalSummary(belief, self.domainString) + if global_summary['GLOBAL_BYALTERNATIVES'] and not global_summary['GLOBAL_THANKYOU'] and not global_summary[ + 'GLOBAL_ACK']: + self.alternatives_requested = True + + nonexec = [] + + for action in self.action_names: + mask_action = False + + if action == "inform": + acceptance_list = SummaryUtils.getTopBeliefs(belief, domainString=self.domainString) + discriminable = SummaryUtils.acceptanceListCanBeDiscriminated(acceptance_list, + self.domainString) + if not global_summary['GLOBAL_BYCONSTRAINTS']: + mask_action = True + if global_summary['GLOBAL_COUNTACCEPTED'] < self.inform_count_accepted and discriminable: + mask_action = True + if mask_action and self.inform_mask: + nonexec.append(action) + + elif action == "inform_byname": + if not global_summary['GLOBAL_BYNAME']: + mask_action = True + if belief['features']['lastInformedVenue'] == '' \ + and SummaryUtils.getTopBelief(belief['beliefs']['name'])[0] == '**NONE**': + mask_action = True + if mask_action and self.inform_mask: + nonexec.append(action) + + elif action == "inform_alternatives": + if not self.alternatives_requested: + mask_action = True + if belief['features']['lastInformedVenue'] == '': + mask_action = True + if mask_action and self.inform_mask: + nonexec.append(action) + + elif action == "bye": + if not global_summary['GLOBAL_FINISHED']: + mask_action = True + if mask_action and self.bye_mask: + nonexec.append(action) + + elif action == "repeat": + if not global_summary['GLOBAL_REPEAT'] or 
lastSystemAction is None: + mask_action = True + mask_action = True # ic340: this action is "deactivated" because simuser doesnt know how to react to it + if mask_action: + nonexec.append(action) + + elif action == "reqmore": + if belief['features']['lastInformedVenue'] == '': + mask_action = True + if mask_action and self.request_mask: + nonexec.append(action) + + elif action == "restart": + if not global_summary['GLOBAL_RESTART']: + mask_action = True + mask_action = True # ic340: this action is "deactivated" because simuser doesnt know how to react to it + if mask_action: + nonexec.append(action) + + elif "request_" in action: + pass + if mask_action and self.request_mask: + nonexec.append(action) + + elif "select_" in action: + slot_summary = array_slot_summary[action.split("_")[1]] + top_prob = slot_summary['TOPHYPS'][0][1] + sec_prob = slot_summary['TOPHYPS'][1][1] + if top_prob == 0 or sec_prob == 0: + mask_action = True + if mask_action and self.request_mask: + nonexec.append(action) + + elif "confirm_" in action: + slot_summary = array_slot_summary[action.split("_")[1]] + top_prob = slot_summary['TOPHYPS'][0][1] + if top_prob == 0: + mask_action = True + if mask_action and self.request_mask: + nonexec.append(action) + + elif "confreq_" in action: + slot_summary = array_slot_summary[action.split("_")[1]] + top_prob = slot_summary['TOPHYPS'][0][1] + if top_prob == 0: + mask_action = True + if mask_action and self.request_mask: + nonexec.append(action) + + logger.info('masked inform actions:' + str([act for act in nonexec if 'inform' in act])) + return nonexec + + # added by phs26, 4 Nov 2016 + def getExecutableMask(self): + ''' + ''' + """ + # hack, make every action executable + return [0.0] * len(self.action_names) + """ + return [0.0] * len(self.summary_action_names) + + + # CONVERTING METHODS FOR EACH SPECIFIC ACT: + # ------------------------------------------------------------------------------------ + + def getRequest(self, slot): + return 'request({})'.format(slot) + + def getConfirm(self, slot): + summary = self._array_slot_summary[slot] + top_value = summary['TOPHYPS'][0][0] + return 'confirm({}="{}")'.format(slot, top_value) + + def getConfReq(self, cslot, rslot): + summary = self._array_slot_summary[cslot] + top_value = summary['TOPHYPS'][0][0] + return 'confreq({}="{}",{})'.format(cslot, top_value, rslot) + + def getSelect(self, slot): + summary = self._array_slot_summary[slot] + top_value = summary['TOPHYPS'][0][0] + sec_value = summary['TOPHYPS'][1][0] + return 'select({}="{}",{}="{}")'.format(slot, top_value, slot, sec_value) + + def getInformByConstraints(self, belief, action): + accepted_values = SummaryUtils.getTopBeliefs(belief, domainString=self.domainString) + + inform_slots = action.split("-") + if len(inform_slots) == 1: + inform_slots = [] + else: + inform_slots = inform_slots[1:] + constraints = SummaryUtils.get_constraints(accepted_values) + return SummaryUtils.getInformByConstraints(constraints, self.domainString, + belief['features']['lastInformedVenue'], inform_slots) + + def getInformByName(self, belief, action): + + requested_slots = action.split("-") + if len(requested_slots) == 1: + #the policy decided to inform just the name and no slots, we will inform on a random slot then + requested_slots = [] + else: + requested_slots = requested_slots[1:] + name = SummaryUtils.getTopBelief(belief['beliefs']['name'])[0] + if name == '**NONE**': + name = belief['features']['lastInformedVenue'] + return SummaryUtils.getInformRequestedSlots(requested_slots, name, 
self.domainString) + + def getInformAlternatives(self, belief, action): + self.alternatives_requested = False + informedVenueSinceNone = set(belief['features']['informedVenueSinceNone']) + accepted_values = SummaryUtils.getTopBeliefs(belief, domainString=self.domainString) + + inform_slots = action.split("-") + if len(inform_slots) == 1: + inform_slots = [] + else: + inform_slots = inform_slots[1:] + return SummaryUtils.getInformAlternativeEntities(accepted_values, informedVenueSinceNone, self.domainString, inform_slots) + + def getBye(self): + return 'bye()' + + def getReqMore(self): + return 'reqmore()' + + def getInformRepeat(self): + # TODO: implement the proper action, this was not implemented in PolicyUtils.py + return 'null()' + + def getRestart(self): + # TODO: implement the proper action, this was not implemented in PolicyUtils.py + return 'null()' + +# END OF FILE diff --git a/policy/PolicyManager.py b/policy/PolicyManager.py index 3ca85b89159b3f27f93341d03aaba96f2b28583b..5671814dfa283407b72724459b9126195cfb0527 100644 --- a/policy/PolicyManager.py +++ b/policy/PolicyManager.py @@ -51,6 +51,8 @@ class PolicyManager(object): self.domainPolicies = dict.fromkeys(OntologyUtils.available_domains, None) self.committees = self._load_committees() self.shared_params = None + + self.SPECIAL_DOMAINS = ['topicmanager','wikipedia'] diff --git a/policy/feudalgainRL/DQNPolicy_latest.py b/policy/feudalgainRL/DQNPolicy_latest.py deleted file mode 100644 index 559e5bc4f55174efffac683a31cd9c9a6481f198..0000000000000000000000000000000000000000 --- a/policy/feudalgainRL/DQNPolicy_latest.py +++ /dev/null @@ -1,789 +0,0 @@ -############################################################################### -# PyDial: Multi-domain Statistical Spoken Dialogue System Software -############################################################################### -# -# Copyright 2015 - 2019 -# Cambridge University Engineering Department Dialogue Systems Group -# -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -############################################################################### - -''' -DQNPolicy.py - deep Q network policy -================================================== - -Author: Pei-Hao (Eddy) Su (Copyright CUED Dialogue Systems Group 2016) - -.. seealso:: CUED Imports/Dependencies: - - import :class:`Policy` - import :class:`utils.ContextLogger` - -.. warning:: - Documentation not done. 
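For reference, a minimal standalone sketch (not part of the patch) of how the MasterAction constructor added above enumerates the inform master actions: one action name per subset of the informable slots (excluding 'name') for each of the three inform ways. The slot list below is hypothetical; the real one is read from the domain ontology, and the non-inform summary actions are appended afterwards as in the patch.

# Illustrative sketch, not part of the patch: mirrors the enumeration in MasterAction.__init__.
# The slot list is hypothetical; PyDial reads it from Ontology.global_ontology.
informable_slots = ["area", "food", "pricerange"]

inform_names = []
for i in range(2 ** len(informable_slots)):
    slots_to_inform = ""
    binary_rep = format(i, "b")[::-1]  # reversed binary mask over the slot list
    for index, bit in enumerate(binary_rep):
        if bit == "1":
            slots_to_inform += f"-{informable_slots[index]}"
    inform_names.append(slots_to_inform)

inform_ways = ["inform", "inform_byname", "inform_alternatives"]
action_names = [way + suffix for way in inform_ways for suffix in inform_names]

# 3 inform ways x 2^3 slot subsets = 24 inform actions for this hypothetical domain,
# e.g. "inform", "inform-area", "inform_byname-area-food", ...
assert len(action_names) == len(inform_ways) * 2 ** len(informable_slots)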
- - -************************ - -''' - -import copy -import sys -import os -import json -import numpy as np -import pickle as pickle -import random -import utils -from utils.Settings import config as cfg -from utils import ContextLogger - -import ontology.FlatOntologyManager as FlatOnt -#from theano_dialogue.util.tool import * - -import tensorflow as tf -from policy.DRL.replay_buffer import ReplayBuffer -from policy.DRL.replay_prioritised import ReplayPrioritised -import policy.DRL.utils as drlutils -import policy.DRL.dqn as dqn -import policy.Policy -import policy.SummaryAction -from policy.Policy import TerminalAction, TerminalState -import policy.GPPolicy - -logger = utils.ContextLogger.getLogger('') - -# --- for flattening the belief --- # -domainUtil = FlatOnt.FlatDomainOntology('CamRestaurants') - -""" -def flatten_belief(gpstate): - ''' - Flatten the GP-dictionary-typed belief state to a one-dim vector - ''' - - if isinstance(gpstate, TerminalState): - return [0] * 304 #260 #264 - - flat_belief = [] - for key, value in gpstate._bstate.items(): - flat_belief += value - - return flat_belief -""" - -def flatten_belief(belief,domainUtil=FlatOnt.FlatDomainOntology('CamRestaurants'), merge=False): - if isinstance(belief, TerminalState): - return [0] * 260 #264 - - #for key, value in belief.items(): - # print key, value - - #policyfeatures = ['full','method','discourseAct','requested'] - policyfeatures = ['full','method','discourseAct','requested',\ - 'lastActionInformNone','offerHappened','inform_info'] - - flat_belief = [] - for feat in policyfeatures: - add_feature = [] - if feat == 'kbest': - for slot in self.domainUtil.sorted_system_requestable_slots: - # print slot, 'belief', belief['beliefs'][slot] - temp = [belief['beliefs'][slot][value] for value in domainUtil.ontology['informable'][slot]] - temp = sorted(temp, key=lambda b: -b) - #temp = [belief['beliefs'][slot]['dontcare']] + [belief['beliefs'][slot]['**NONE**']] + temp - temp = temp + [belief['beliefs'][slot]['dontcare']] + [belief['beliefs'][slot]['**NONE**']] - temp = temp[0:self.max_k] - add_feature += temp - elif feat == 'full': - #for slot in self.sorted_slots: - for slot in domainUtil.ontology['informable']: - for value in domainUtil.ontology['informable'][slot]:# + ['**NONE**']: - #for value in domainUtil.ontology['informable'][slot] + ['**NONE**']: - #for value in domainUtil.ontology['informable'][slot] + ['dontcare'] + ['**NONE**']: - add_feature.append(belief['beliefs'][slot][value]) - elif feat == 'method': - add_feature = [belief['beliefs']['method'][method] for method in domainUtil.ontology['method']] - elif feat == 'discourseAct': - add_feature = [belief['beliefs']['discourseAct'][discourseAct] - for discourseAct in domainUtil.ontology['discourseAct']] - elif feat == 'requested': - add_feature = [belief['beliefs']['requested'][slot] \ - for slot in domainUtil.ontology['requestable']] - elif feat == 'lastActionInformNone': - add_feature.append(float(belief['features']['lastActionInformNone'])) - elif feat == 'offerHappened': - add_feature.append(float(belief['features']['offerHappened'])) - elif feat == 'inform_info': - add_feature += belief['features']['inform_info'] - else: - logger.error('Invalid feature name in config: ' + feat) - - flat_belief += add_feature - - return flat_belief - - - - """ - flat_belief = [] - for feat in policyfeatures: - add_feature = [] - if feat == 'full': - #for slot in self.sorted_slots: - for slot in domainUtil.ontology['informable']: - if slot == 'name': - continue - accumProb = 0.0 
- for value in domainUtil.ontology['informable'][slot] + ['**NONE**']: - if value not in ('dontcare', '**NONE**'): - accumProb += float(belief['beliefs'][slot][value]) - add_feature.append(accumProb) - add_feature.append(belief['beliefs'][slot]['dontcare']) - add_feature.append(belief['beliefs'][slot]['**NONE**']) - - #add_feature.append(belief['beliefs'][slot][value]) - elif feat == 'method': - add_feature = [belief['beliefs']['method'][method] \ - for method in domainUtil.ontology['method']] - elif feat == 'discourseAct': - add_feature = [belief['beliefs']['discourseAct'][discourseAct] - for discourseAct in domainUtil.ontology['discourseAct']] - elif feat == 'requested': - add_feature = [belief['beliefs']['requested'][slot] \ - for slot in domainUtil.ontology['requestable']] - elif feat == 'lastActionInformNone': - add_feature.append(float(belief['features']['lastActionInformNone'])) - elif feat == 'offerHappened': - add_feature.append(float(belief['features']['offerHappened'])) - elif feat == 'inform_info': - add_feature += (belief['features']['inform_info']) - else: - logger.error('Invalid feature name in config: ' + feat) - - flat_belief += add_feature - return flat_belief - """ - - -class DQNPolicy(Policy.Policy): - '''Derived from :class:`Policy` - ''' - def __init__(self, in_policy_file, out_policy_file, domainString='CamRestaurants', is_training=False): - super(DQNPolicy, self).__init__(domainString, is_training) - - self.in_policy_file = in_policy_file - self.out_policy_file = out_policy_file - self.is_training = is_training - self.accum_belief = [] - self.stats = [0 for ii in range(14)] - - self.prev_state_check = None - - # parameter settings - self.n_in= 260 - if cfg.has_option('dqnpolicy_'+domainString, 'n_in'): - self.n_in = cfg.getint('dqnpolicy_'+domainString, 'n_in') - - self.actor_lr = 0.0001 - if cfg.has_option('dqnpolicy_'+domainString, 'actor_lr'): - self.actor_lr = cfg.getfloat('dqnpolicy_'+domainString, 'actor_lr') - - self.critic_lr = 0.001 - if cfg.has_option('dqnpolicy_'+domainString, 'critic_lr'): - self.critic_lr = cfg.getfloat('dqnpolicy_'+domainString, 'critic_lr') - - self.tau = 0.001 - if cfg.has_option('dqnpolicy_'+domainString, 'tau'): - self.tau = cfg.getfloat('dqnpolicy_'+domainString, 'tau') - - self.randomseed = 1234 - if cfg.has_option('GENERAL', 'seed'): - self.randomseed = cfg.getint('GENERAL', 'seed') - - self.gamma = 1.0 - if cfg.has_option('dqnpolicy_'+domainString, 'gamma'): - self.gamma = cfg.getfloat('dqnpolicy_'+domainString, 'gamma') - - self.regularisation = 'l2' - if cfg.has_option('dqnpolicy_'+domainString, 'regularisation'): - self.regularisation = cfg.get('dqnpolicy_'+domainString, 'regulariser') - - self.learning_rate = 0.001 - if cfg.has_option('dqnpolicy_'+domainString, 'learning_rate'): - self.learning_rate = cfg.getfloat('dqnpolicy_'+domainString, 'learning_rate') - - self.exploration_type = 'e-greedy' # Boltzman - if cfg.has_option('dqnpolicy_'+domainString, 'exploration_type'): - self.exploration_type = cfg.get('dqnpolicy_'+domainString, 'exploration_type') - - self.episodeNum = 1000 - if cfg.has_option('dqnpolicy_'+domainString, 'episodeNum'): - self.episodeNum = cfg.getfloat('dqnpolicy_'+domainString, 'episodeNum') - - self.maxiter = 5000 - if cfg.has_option('dqnpolicy_'+domainString, 'maxiter'): - self.maxiter = cfg.getfloat('dqnpolicy_'+domainString, 'maxiter') - - self.epsilon = 1 - if cfg.has_option('dqnpolicy_'+domainString, 'epsilon'): - self.epsilon = cfg.getfloat('dqnpolicy_'+domainString, 'epsilon') - - 
self.epsilon_start = 1 - if cfg.has_option('dqnpolicy_'+domainString, 'epsilon_start'): - self.epsilon_start = cfg.getfloat('dqnpolicy_'+domainString, 'epsilon_start') - - self.epsilon_end = 1 - if cfg.has_option('dqnpolicy_'+domainString, 'epsilon_end'): - self.epsilon_end = cfg.getfloat('dqnpolicy_'+domainString, 'epsilon_end') - - self.priorProbStart = 1.0 - if cfg.has_option('dqnpolicy_'+domainString, 'prior_sample_prob_start'): - self.priorProbStart = cfg.getfloat('dqnpolicy_'+domainString, 'prior_sample_prob_start') - - self.priorProbEnd = 0.1 - if cfg.has_option('dqnpolicy_'+domainString, 'prior_sample_prob_end'): - self.priorProbEnd = cfg.getfloat('dqnpolicy_'+domainString, 'prior_sample_prob_end') - - self.policyfeatures = [] - if cfg.has_option('dqnpolicy_'+domainString, 'features'): - logger.info('Features: ' + str(cfg.get('dqnpolicy_'+domainString, 'features'))) - self.policyfeatures = json.loads(cfg.get('dqnpolicy_'+domainString, 'features')) - - self.max_k = 5 - if cfg.has_option('dqnpolicy_'+domainString, 'max_k'): - self.max_k = cfg.getint('dqnpolicy_'+domainString, 'max_k') - - self.learning_algorithm = 'drl' - if cfg.has_option('dqnpolicy_'+domainString, 'learning_algorithm'): - self.learning_algorithm = cfg.get('dqnpolicy_'+domainString, 'learning_algorithm') - logger.info('Learning algorithm: ' + self.learning_algorithm) - - self.minibatch_size = 32 - if cfg.has_option('dqnpolicy_'+domainString, 'minibatch_size'): - self.minibatch_size = cfg.getint('dqnpolicy_'+domainString, 'minibatch_size') - - self.capacity = 1000#max(self.minibatch_size, 2000) - if cfg.has_option('dqnpolicy_'+domainString, 'capacity'): - self.capacity = max(cfg.getint('dqnpolicy_'+domainString,'capacity'), 2000) - - self.replay_type = 'vanilla' - if cfg.has_option('dqnpolicy_'+domainString, 'replay_type'): - self.replay_type = cfg.get('dqnpolicy_'+domainString, 'replay_type') - - self.architecture = 'vanilla' - if cfg.has_option('dqnpolicy_'+domainString, 'architecture'): - self.architecture = cfg.get('dqnpolicy_'+domainString, 'architecture') - - self.q_update = 'single' - if cfg.has_option('dqnpolicy_'+domainString, 'q_update'): - self.q_update = cfg.get('dqnpolicy_'+domainString, 'q_update') - - self.h1_size = 130 - if cfg.has_option('dqnpolicy_'+domainString, 'h1_size'): - self.h1_size = cfg.getint('dqnpolicy_'+domainString, 'h1_size') - - self.h2_size = 130 - if cfg.has_option('dqnpolicy_'+domainString, 'h2_size'): - self.h2_size = cfg.getint('dqnpolicy_'+domainString, 'h2_size') - - """ - self.shuffle = False - if cfg.has_option('dqnpolicy_'+domainString, 'experience_replay'): - self.shuffle = cfg.getboolean('dqnpolicy_'+domainString, 'experience_replay') - if not self.shuffle: - # If we don't use experience replay, we don't need to maintain - # sliding window of experiences with maximum capacity. 
- # We only need to maintain the data of minibatch_size - self.capacity = self.minibatch_size - """ - - self.episode_ave_max_q = [] - - os.environ["CUDA_VISIBLE_DEVICES"]="" - - # init session - self.sess = tf.Session() - with tf.device("/cpu:0"): - - np.random.seed(self.randomseed) - tf.set_random_seed(self.randomseed) - - # initialise an replay buffer - if self.replay_type == 'vanilla': - self.episodes[self.domainString] = ReplayBuffer(self.capacity, self.minibatch_size, self.randomseed) - elif self.replay_type == 'prioritized': - self.episodes[self.domainString] = ReplayPrioritised(self.capacity, self.minibatch_size, self.randomseed) - #replay_buffer = ReplayBuffer(self.capacity, self.randomseed) - #self.episodes = [] - self.samplecount = 0 - self.episodecount = 0 - - # construct the models - self.state_dim = self.n_in - self.summaryaction = SummaryAction.SummaryAction(domainString) - self.action_dim = len(self.summaryaction.action_names) - action_bound = len(self.summaryaction.action_names) - - self.dqn = dqn.DeepQNetwork(self.sess, self.state_dim, self.action_dim, \ - self.critic_lr, self.tau, action_bound, self.architecture, self.h1_size, self.h2_size) - - # when all models are defined, init all variables - init_op = tf.initialize_all_variables() - self.sess.run(init_op) - - self.loadPolicy(self.in_policy_file) - print('loaded replay size: ', self.episodes[self.domainString].size()) - - self.dqn.update_target_network() - - # def record() has been handled... - - def act_on(self, beliefstate, hyps=None): - if self.lastSystemAction is None and self.startwithhello: - systemAct, nextaIdex = 'hello()', -1 - else: - systemAct, nextaIdex = self.nextAction(beliefstate, hyps) - self.lastSystemAction = systemAct - self.summaryAct = nextaIdex - self.prevbelief = beliefstate - return systemAct - - def record(self, reward, domainInControl=None, weight=None, state=None, action=None): - if domainInControl is None: - domainInControl = self.domainString - if self.episodes[domainInControl] is None: - self.episodes[domainInControl] = Episode(dstring=domainInControl) - if self.actToBeRecorded is None: - #self.actToBeRecorded = self.lastSystemAction - self.actToBeRecorded = self.summaryAct - - if state is None: - state = self.prevbelief - if action is None: - action = self.actToBeRecorded - - cState, cAction = self.convertStateAction(state, action) - - # normalising total return to -1~1 - #reward /= 40.0 - reward /= 20.0 - """ - reward = float(reward+10.0)/40.0 - """ - - if weight == None: - if self.replay_type == 'vanilla': - self.episodes[domainInControl].record(state=cState, \ - state_ori=state, action=cAction, reward=reward) - elif self.replay_type == 'prioritized': - - ##### calculate Q_s_t_a_t_ and gamma_Q_s_tplu1_maxa_ for PER ### - ################################################################ - cur_cState = np.vstack([np.expand_dims(x, 0) for x in [cState]]) - cur_cAction_one_hot = np.eye(self.action_dim, self.action_dim)[[cAction]] - - cur_action_q = self.dqn.predict(cur_cState, cur_cAction_one_hot) - execMask = self.summaryaction.getExecutableMask(state, cAction) - - if self.q_update == 'single': - Qs = [] - for idx, v in enumerate(execMask): - if v > -sys.maxsize: - Action_idx = np.eye(self.action_dim, self.action_dim)[[idx]] - Qidx = self.dqn.predict_target(cur_cState, Action_idx) - Qs.append(Qidx[0]) - #Qs.append(Qidx[0]) - - Q_s_t_a_t_ = cur_action_q[0] - gamma_Q_s_tplu1_maxa_ = self.gamma * np.max(Qs) - elif self.q_update == 'double': - Qs = [] - for idx, v in enumerate(execMask): - if v 
> -sys.maxsize: - Action_idx = np.eye(self.action_dim, self.action_dim)[[idx]] - Qidx = self.dqn.predict(cur_cState, Action_idx) - Qs.append(Qidx[0]) - else: - Qs.append(-sys.maxsize) - - policyQ_argmax_a = np.argmax(Qs) - policyQ_argmax_a_one_hot = np.eye(self.action_dim, self.action_dim)[[policyQ_argmax_a]] - target_value_Q = self.dqn.predict_target(cur_cState, policyQ_argmax_a_one_hot) - - Q_s_t_a_t_ = cur_action_q[0] - gamma_Q_s_tplu1_maxa_ = self.gamma * target_value_Q - - print('Q_s_t_a_t_', Q_s_t_a_t_) - print('gamma_Q_s_tplu1_maxa_', gamma_Q_s_tplu1_maxa_) - ################################################################ - - # heuristically assign 0.0 to Q_s_t_a_t_ and Q_s_tplu1_maxa_, doesn't matter as it is not used - #if self.samplecount >= self.capacity: - if True: - self.episodes[domainInControl].record(state=cState, \ - state_ori=state, action=cAction, reward=reward, \ - Q_s_t_a_t_ = Q_s_t_a_t_, gamma_Q_s_tplu1_maxa_= gamma_Q_s_tplu1_maxa_, uniform=False) - else: - self.episodes[domainInControl].record(state=cState, \ - state_ori=state, action=cAction, reward=reward, \ - Q_s_t_a_t_ = Q_s_t_a_t_, gamma_Q_s_tplu1_maxa_= gamma_Q_s_tplu1_maxa_, uniform=True) - - else: - self.episodes[domainInControl].record(state=cState, state_ori=state, action=cAction, reward=reward, ma_weight=weight) - - self.actToBeRecorded = None - self.samplecount += 1 - return - - def finalizeRecord(self, reward, domainInControl=None): - if domainInControl is None: - domainInControl = self.domainString - if self.episodes[domainInControl] is None: - logger.warning("record attempted to be finalized for domain where nothing has been recorded before") - return - - #print 'Episode Avg_Max_Q', float(self.episode_ave_max_q)/float(self.episodes[domainInControl].size()) - print('Episode Avg_Max_Q', np.mean(self.episode_ave_max_q)) - - print('saving statics') - self.saveStats() - print(self.stats) - - # normalising total return to -1~1 - #if reward == 0: - # reward = -20.0 - reward /= 20.0 - """ - if reward == 20.0: - reward = 1.0 - else: - reward = -0.5 - """ - #reward = float(reward+10.0)/40.0 - - terminal_state, terminal_action = self.convertStateAction(TerminalState(), TerminalAction()) - - if self.replay_type == 'vanilla': - self.episodes[domainInControl].record(state=terminal_state, \ - state_ori=TerminalState(), action=terminal_action, reward=reward, terminal=True) - elif self.replay_type == 'prioritized': - # heuristically assign 0.0 to Q_s_t_a_t_ and Q_s_tplu1_maxa_, doesn't matter as it is not used - #if self.samplecount >= self.capacity: - if True: - self.episodes[domainInControl].record(state=terminal_state, \ - state_ori=TerminalState(), action=terminal_action, reward=reward, \ - Q_s_t_a_t_ = 0.0, gamma_Q_s_tplu1_maxa_= 0.0, uniform=False, terminal=True) - else: - self.episodes[domainInControl].record(state=terminal_state, \ - state_ori=TerminalState(), action=terminal_action, reward=reward, \ - Q_s_t_a_t_ = 0.0, gamma_Q_s_tplu1_maxa_= 0.0, uniform=True, terminal=True) - return - - def convertStateAction(self, state, action): - ''' - nnType = 'dnn' - #nnType = 'rnn' - # expand one dimension to match the batch size of 1 at axis 0 - if nnType == 'rnn': - belief = np.expand_dims(belief,axis=0) - ''' - - if isinstance(state, TerminalState): - return [0] * 260, action #260 #264 - else: - flat_belief = flatten_belief(state) - - if flat_belief == self.prev_state_check: - print('same state') - else: - print('diff state') - self.prev_state_check = flat_belief - - return flat_belief, action - - def 
nextAction(self, beliefstate, hyps): - ''' - select next action - - :param beliefstate: - :param hyps: - :returns: (int) next summary action - ''' - #beliefVec = flatten_belief(beliefstate, domainUtil) - beliefVec = flatten_belief(beliefstate) - - execMask = self.summaryaction.getExecutableMask(beliefstate, self.lastSystemAction) - #print sum([ 1 for i in execMask if i==0.0 ]) - if self.exploration_type == 'e-greedy': - # epsilon greedy - if self.is_training and utils.Settings.random.rand() < self.epsilon: - admissible = [i for i, x in enumerate(execMask) if x == 0.0] - random.shuffle(admissible) - nextaIdex = admissible[0] - else: - admissible = [] - for idx, v in enumerate(execMask): - if v > -sys.maxsize: - Action_idx = np.eye(self.action_dim, self.action_dim)[[idx]] - Qidx = self.dqn.predict(np.reshape(beliefVec, (1, len(beliefVec))), Action_idx) - admissible.append(Qidx[0]) - else: - admissible.append(-sys.maxsize) - #action_Q = self.dqn.predict(np.reshape(beliefVec, (1, len(beliefVec))))# + (1. / (1. + i + j)) - #admissible = np.add(action_Q, np.array(execMask)) - logger.info('action Q...') - print(admissible) - nextaIdex = np.argmax(admissible) - - # add current max Q to self.episode_ave_max_q - print('current maxQ', np.max(admissible)) - self.episode_ave_max_q.append(np.max(admissible)) - - elif self.exploration_type == 'Boltzman': - # randomly assign, not complete - admissible = [i for i, x in enumerate(execMask) if x == 0.0] - random.shuffle(admissible) - nextaIdex = admissible[0] - - self.stats[nextaIdex] += 1 - summaryAct = self.summaryaction.action_names[nextaIdex] - masterAct = self.summaryaction.Convert(beliefstate, summaryAct, self.lastSystemAction) - return masterAct, nextaIdex - - def train(self): - ''' - call this function when the episode ends - ''' - - if not self.is_training: - logger.info("Not in training mode") - return - else: - logger.info("Update dqn policy parameters.") - - self.episodecount += 1 - logger.info("Sample Num so far: %s" %(self.samplecount)) - logger.info("Episode Num so far: %s" %(self.episodecount)) - #if True: - if self.samplecount >= self.minibatch_size * 3 and self.episodecount % 4 == 0: - #if self.samplecount >= self.capacity and self.episodecount % 5 == 0: - #if self.samplecount > self.minibatch_size: - #if self.samplecount > self.capacity: - logger.info('start traninig...') - - - ################################################# - ################################################# - # update TD error for all experience in PER # - ################################################# - ################################################# - """ - #s_batch, s_ori_batch, a_batch, r_batch, s2_batch, s2_ori_batch, t_batch, idx_batch, _ = \ - # self.episodes[self.domainString].all_batch() - experience, idx_batch = self.episodes[self.domainString].all_batch() - #self.episodes[self.domainString].sample_batch_vanilla_PER() - - #s_batch = np.vstack([np.expand_dims(x, 0) for x in s_batch]) - #s2_batch = np.vstack([np.expand_dims(x, 0) for x in s2_batch]) - - # self.s_prev, self.s_ori_prev, self.a_prev, self.r_prev, state, state_ori, termina - - for k in xrange(len(idx_batch)): - Q_bootstrap_label = 0 - if experience[k][-1]: # terminal - Q_bootstrap_label = experience[k][3] # reward - else: - execMask = self.summaryaction.getExecutableMask(experience[k][-2], experience[k][2]) # s_ori, a - if self.q_update == 'single': - admissible = [] - for idx, v in enumerate(execMask): - if v > -sys.maxint: - Action_idx = np.eye(self.action_dim, self.action_dim)[[idx]] - 
s2_idx = np.vstack([ np.expand_dims(x, 0) for x in [experience[k][0]] ]) # s - Qidx = self.dqn.predict_target(s2_idx, Action_idx) - admissible.append(Qidx[0]) - Q_bootstrap_label = experience[k][3] + self.gamma * np.max(admissible) # reward - elif self.q_update == 'double': - Qs = [] - for idx, v in enumerate(execMask): - if v > -sys.maxint: - Action_idx = np.eye(self.action_dim, self.action_dim)[[idx]] - s2_idx = np.vstack([ np.expand_dims(x, 0) for x in [experience[k][-3]] ]) - Qidx = self.dqn.predict(s2_idx, Action_idx) - Qs.append(Qidx[0]) - else: - Qs.append(-sys.maxint) - - policyQ_argmax_a = np.argmax(Qs) - policyQ_argmax_a_one_hot = np.eye(self.action_dim, self.action_dim)[[policyQ_argmax_a]] - s2_idx = np.vstack([ np.expand_dims(x, 0) for x in [experience[k][-3]] ]) - target_value_Q = self.dqn.predict_target(s2_idx, policyQ_argmax_a_one_hot) - - Q_bootstrap_label = experience[k][3] + self.gamma * target_value_Q - - if self.replay_type == 'prioritized': - # update the sum-tree - # update the TD error of the samples in the minibatch - current_a = np.eye(self.action_dim, self.action_dim)[[experience[k][2]]] - current_s = np.vstack([ np.expand_dims(x, 0) for x in [experience[k][0]] ]) - currentQ_s_a_ = self.dqn.predict(current_s, current_a) - currentQ_s_a_ = currentQ_s_a_[0] - error = abs(currentQ_s_a_ - Q_bootstrap_label) - self.episodes[self.domainString].update(idx_batch[k], error) - - """ - - s_batch, s_ori_batch, a_batch, r_batch, s2_batch, s2_ori_batch, t_batch, idx_batch, _ = \ - self.episodes[self.domainString].sample_batch() - #self.episodes[self.domainString].sample_batch_vanilla_PER() - - #s_batch = np.vstack([np.expand_dims(x, 0) for x in s_batch]) - #s2_batch = np.vstack([np.expand_dims(x, 0) for x in s2_batch]) - - y_i = [] - for k in range(min(self.minibatch_size, self.episodes[self.domainString].size())): - Q_bootstrap_label = 0 - if t_batch[k]: - Q_bootstrap_label = r_batch[k] - else: - execMask = self.summaryaction.getExecutableMask(s2_ori_batch[k], a_batch[k]) - if self.q_update == 'single': - admissible = [] - for idx, v in enumerate(execMask): - if v > -sys.maxsize: - Action_idx = np.eye(self.action_dim, self.action_dim)[[idx]] - s2_idx = np.vstack([ np.expand_dims(x, 0) for x in [s2_batch[k]] ]) - Qidx = self.dqn.predict_target(s2_idx, Action_idx) - admissible.append(Qidx[0]) - Q_bootstrap_label = r_batch[k] + self.gamma * np.max(admissible) - elif self.q_update == 'double': - Qs = [] - for idx, v in enumerate(execMask): - if v > -sys.maxsize: - Action_idx = np.eye(self.action_dim, self.action_dim)[[idx]] - s2_idx = np.vstack([ np.expand_dims(x, 0) for x in [s2_batch[k]] ]) - Qidx = self.dqn.predict(s2_idx, Action_idx) - Qs.append(Qidx[0]) - else: - Qs.append(-sys.maxsize) - - policyQ_argmax_a = np.argmax(Qs) - policyQ_argmax_a_one_hot = np.eye(self.action_dim, self.action_dim)[[policyQ_argmax_a]] - s2_idx = np.vstack([ np.expand_dims(x, 0) for x in [s2_batch[k]] ]) - target_value_Q = self.dqn.predict_target(s2_idx, policyQ_argmax_a_one_hot) - - Q_bootstrap_label = r_batch[k] + self.gamma * target_value_Q - y_i.append(Q_bootstrap_label) - - if self.replay_type == 'prioritized': - # update the sum-tree - # update the TD error of the samples in the minibatch - current_a = np.eye(self.action_dim, self.action_dim)[[a_batch[k]]] - current_s = np.vstack([ np.expand_dims(x, 0) for x in [s_batch[k]] ]) - currentQ_s_a_ = self.dqn.predict(current_s, current_a) - currentQ_s_a_ = currentQ_s_a_[0] - error = abs(currentQ_s_a_ - Q_bootstrap_label) - 
self.episodes[self.domainString].update(idx_batch[k], error) - - # change index-based a_batch to one-hot-based a_batch - a_batch_one_hot = np.eye(self.action_dim, self.action_dim)[a_batch] - - # Update the critic given the targets - reshaped_yi = np.vstack([np.expand_dims(x, 0) for x in y_i]) - - s_batch_expand = np.vstack([np.expand_dims(x, 0) for x in s_batch]) - """ - print s_batch_expand - print 'a_batch', a_batch - print a_batch_one_hot - print len(a_batch) - print len(y_i) - """ - #reshaped_yi = np.reshape(y_i, (min(self.minibatch_size, self.episodes[self.domainString].size()), 1)) - predicted_q_value, _, currentLoss = self.dqn.train(s_batch, a_batch_one_hot, reshaped_yi) - #predicted_q_value, _, currentLoss = self.dqn.train(s_batch_expand, a_batch_one_hot, reshaped_yi) - - print('y_i') - print(y_i) - print('currentLoss', currentLoss) - print('predict Q') - print(predicted_q_value) - - if self.episodecount % 1 == 0: - #if self.episodecount % 50 == 0: - # Update target networks - self.dqn.update_target_network() - - self.savePolicyInc() # self.out_policy_file) - - def savePolicy(self, FORCE_SAVE=False): - """ - Does not use this, cause it will be called from agent after every episode. - we want to save the policy only periodically. - """ - pass - - def savePolicyInc(self, FORCE_SAVE=False): - """ - save model and replay buffer - """ - #save_path = self.saver.save(self.sess, self.out_policy_file+'.ckpt') - self.dqn.save_network(self.out_policy_file+'.dqn.ckpt') - - f = open(self.out_policy_file+'.episode', 'wb') - for obj in [self.samplecount, self.episodes[self.domainString]]: - pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL) - f.close() - #logger.info("Saving model to %s and replay buffer..." % save_path) - - def saveStats(self, FORCE_SAVE=False): - f = open(self.out_policy_file + '.stats', 'wb') - pickle.dump(self.stats, f, protocol=pickle.HIGHEST_PROTOCOL) - f.close() - - def loadPolicy(self, filename): - """ - load model and replay buffer - """ - # load models - self.dqn.load_network(filename+'.dqn.ckpt') - - # load replay buffer - try: - print('laod from: ', filename) - f = open(filename+'.episode', 'rb') - loaded_objects = [] - for i in range(2): # load nn params and collected data - loaded_objects.append(pickle.load(f)) - self.samplecount = int(loaded_objects[0]) - self.episodes[self.domainString] = copy.deepcopy(loaded_objects[1]) - logger.info("Loading both model from %s and replay buffer..." 
% filename) - f.close() - except: - logger.info("Loading only models...") - - def restart(self): - self.summaryAct = None - self.lastSystemAction = None - self.prevbelief = None - self.actToBeRecorded = None - self.epsilon = self.epsilon_start - (self.epsilon_start - self.epsilon_end) * float(self.episodeNum+self.episodecount) / float(self.maxiter) - print('current eps', self.epsilon) - #self.episodes = dict.fromkeys(OntologyUtils.available_domains, None) - #self.episodes[self.domainString] = ReplayBuffer(self.capacity, self.randomseed) - self.episode_ave_max_q = [] - -#END OF FILE diff --git a/policy/feudalgainRL/FeudalACERPolicy.py b/policy/feudalgainRL/FeudalACERPolicy.py deleted file mode 100644 index 19d9fccb719b8588567e8e397a9a6f2441230de2..0000000000000000000000000000000000000000 --- a/policy/feudalgainRL/FeudalACERPolicy.py +++ /dev/null @@ -1,457 +0,0 @@ -############################################################################### -# PyDial: Multi-domain Statistical Spoken Dialogue System Software -############################################################################### -# -# Copyright 2015 - 2019 -# Cambridge University Engineering Department Dialogue Systems Group -# -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -############################################################################### - -''' -ACERPolicy.py - ACER - Actor Critic with Experience Replay -================================================== - -Copyright CUED Dialogue Systems Group 2015 - 2017 - -.. seealso:: CUED Imports/Dependencies: - - import :class:`Policy` - import :class:`utils.ContextLogger` - -.. warning:: - Documentation not done. - - -************************ - -''' -import copy -import os -import json -import numpy as np -import scipy -import scipy.signal -import pickle as pickle -import random -import utils -from utils.Settings import config as cfg -from utils import ContextLogger, DiaAct - -import ontology.FlatOntologyManager as FlatOnt -import tensorflow as tf -from policy.DRL.replay_buffer_episode_acer import ReplayBufferEpisode -from policy.DRL.replay_prioritised_episode import ReplayPrioritisedEpisode -import policy.DRL.utils as drlutils -from policy.ACERPolicy import ACERPolicy -import policy.DRL.acer as acer -import policy.Policy -import policy.SummaryAction -from policy.Policy import TerminalAction, TerminalState -from policy.feudalRL.DIP_parametrisation import DIP_state, padded_state - -logger = utils.ContextLogger.getLogger('') - -# Discounting function used to calculate discounted returns. 
-def discount(x, gamma): - return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1] - - -class FeudalACERPolicy(ACERPolicy): - '''Derived from :class:`Policy` - ''' - def __init__(self, in_policy_file, out_policy_file, domainString='CamRestaurants', is_training=False, - action_names=None, slot=None, sd_state_dim=50): - super(FeudalACERPolicy, self).__init__(in_policy_file, out_policy_file, domainString, is_training) - - tf.reset_default_graph() - - self.in_policy_file = in_policy_file - self.out_policy_file = out_policy_file - self.is_training = is_training - self.accum_belief = [] - self.prev_state_check = None - self.sd_state_dim = sd_state_dim - - self.domainString = domainString - self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString) - - self.features = 'dip' - self.sd_enc_size = 80 - self.si_enc_size = 40 - self.dropout_rate = 0. - if cfg.has_option('feudalpolicy', 'features'): - self.features = cfg.get('feudalpolicy', 'features') - if cfg.has_option('feudalpolicy', 'sd_enc_size'): - self.sd_enc_size = cfg.getint('feudalpolicy', 'sd_enc_size') - if cfg.has_option('feudalpolicy', 'si_enc_size'): - self.si_enc_size = cfg.getint('feudalpolicy', 'si_enc_size') - if cfg.has_option('dqnpolicy', 'dropout_rate') and self.is_training: - self.dropout_rate = cfg.getfloat('feudalpolicy', 'dropout_rate') - if cfg.has_option('dqnpolicy', 'dropout_rate') and self.is_training: - self.dropout_rate = cfg.getfloat('feudalpolicy', 'dropout_rate') - self.actfreq_ds = False - if cfg.has_option('feudalpolicy', 'actfreq_ds'): - self.actfreq_ds = cfg.getboolean('feudalpolicy', 'actfreq_ds') - - # init session - self.sess = tf.Session() - with tf.device("/cpu:0"): - - np.random.seed(self.randomseed) - tf.set_random_seed(self.randomseed) - - # initialise an replay buffer - if self.replay_type == 'vanilla': - self.episodes[self.domainString] = ReplayBufferEpisode(self.capacity, self.minibatch_size, self.randomseed) - elif self.replay_type == 'prioritized': - self.episodes[self.domainString] = ReplayPrioritisedEpisode(self.capacity, self.minibatch_size, self.randomseed) - #replay_buffer = ReplayBuffer(self.capacity, self.randomseed) - #self.episodes = [] - self.samplecount = 0 - self.episodecount = 0 - - # construct the models - self.state_dim = 89 # current DIP state dim - self.summaryaction = policy.SummaryAction.SummaryAction(domainString) - self.action_names = action_names - self.action_dim = len(self.action_names) - action_bound = len(self.action_names) - self.stats = [0 for _ in range(self.action_dim)] - - self.global_mu = [0. 
for _ in range(self.action_dim)] - - if self.features == 'dip': - if self.actfreq_ds: - if self.domainString == 'CamRestaurants': - self.state_dim += 9#16 - elif self.domainString == 'SFRestaurants': - self.state_dim += 9#25 - elif self.domainString == 'Laptops11': - self.state_dim += 9#40 - self.acer = acer.ACERNetwork(self.sess, self.state_dim, self.action_dim, self.critic_lr, self.delta, - self.c, self.alpha, self.h1_size, self.h2_size, self.is_training) - elif self.features == 'learned' or self.features == 'rnn': - si_state_dim = 73 - if self.actfreq_ds: - if self.domainString == 'CamRestaurants': - si_state_dim += 9#16 - elif self.domainString == 'SFRestaurants': - si_state_dim += 9#25 - elif self.domainString == 'Laptops11': - si_state_dim += 9#40 - - if 0:#self.features == 'rnn': - self.acer = acer.RNNACERNetwork(self.sess, si_state_dim, sd_state_dim, self.action_dim, self.critic_lr, - self.delta, self.c, self.alpha, self.h1_size, self.h2_size, self.is_training, - sd_enc_size=25, si_enc_size=25, dropout_rate=0., tn='normal', slot='si') - else: - self.state_dim = si_state_dim - self.acer = acer.ACERNetwork(self.sess, self.state_dim, self.action_dim, - self.critic_lr, self.delta, self.c, self.alpha, self.h1_size, - self.h2_size, self.is_training) - - else: - logger.error('features "{}" not implemented'.format(self.features)) - - - # when all models are defined, init all variables - init_op = tf.global_variables_initializer() - self.sess.run(init_op) - - self.loadPolicy(self.in_policy_file) - print('loaded replay size: ', self.episodes[self.domainString].size()) - - #self.acer.update_target_network() - - # def record() has been handled... - - def convertStateAction(self, state, action): - ''' - - ''' - if isinstance(state, TerminalState): - return [0] * 89, action - - else: - if self.features == 'learned' or self.features == 'rnn': - dip_state = padded_state(state.domainStates[state.currentdomain], self.domainString) - else: - dip_state = DIP_state(state.domainStates[state.currentdomain], self.domainString) - action_name = self.actions.action_names[action] - act_slot = 'general' - for slot in dip_state.slots: - if slot in action_name: - act_slot = slot - flat_belief = dip_state.get_beliefStateVec(act_slot) - self.prev_state_check = flat_belief - - return flat_belief, action - - def record(self, reward, domainInControl=None, weight=None, state=None, action=None): - if domainInControl is None: - domainInControl = self.domainString - if self.actToBeRecorded is None: - self.actToBeRecorded = self.summaryAct - - if state is None: - state = self.prevbelief - if action is None: - action = self.actToBeRecorded - mu_weight = self.prev_mu - mask = self.prev_mask - if action == self.action_dim-1: # pass action was taken - mask = np.zeros(self.action_dim) - mu_weight = np.ones(self.action_dim)/self.action_dim - - cState, cAction = state, action - - reward /= 20.0 - - value = self.acer.predict_value([cState], [mask]) - - if self.replay_type == 'vanilla': - self.episodes[domainInControl].record(state=cState, \ - state_ori=state, action=cAction, reward=reward, value=value[0], distribution=mu_weight, mask=mask) - elif self.replay_type == 'prioritized': - self.episodes[domainInControl].record(state=cState, \ - state_ori=state, action=cAction, reward=reward, value=value[0], distribution=mu_weight, mask=mask) - - self.actToBeRecorded = None - self.samplecount += 1 - return - - def finalizeRecord(self, reward, domainInControl=None): - if domainInControl is None: - domainInControl = self.domainString - if 
self.episodes[domainInControl] is None: - logger.warning("record attempted to be finalized for domain where nothing has been recorded before") - return - - #print 'Episode Avg_Max_Q', float(self.episode_ave_max_q)/float(self.episodes[domainInControl].size()) - #print 'Episode Avg_Max_Q', np.mean(self.episode_ave_max_q) - #print self.stats - - # normalising total return to -1~1 - reward /= 20.0 - - terminal_state, terminal_action = self.convertStateAction(TerminalState(), TerminalAction()) - value = 0.0 # not effect on experience replay - - def calculate_discountR_advantage(r_episode, v_episode): - ######################################################################### - # Here we take the rewards and values from the rollout, and use them to - # generate the advantage and discounted returns. - # The advantage function uses "Generalized Advantage Estimation" - bootstrap_value = 0.0 - self.r_episode_plus = np.asarray(r_episode + [bootstrap_value]) - discounted_r_episode = discount(self.r_episode_plus,self.gamma)[:-1] - self.v_episode_plus = np.asarray(v_episode + [bootstrap_value]) - advantage = r_episode + self.gamma * self.v_episode_plus[1:] - self.v_episode_plus[:-1] - advantage = discount(advantage,self.gamma) - ######################################################################### - return discounted_r_episode, advantage - - if self.replay_type == 'vanilla': - self.episodes[domainInControl].record(state=terminal_state, \ - state_ori=TerminalState(), action=terminal_action, reward=reward, value=value, terminal=True, distribution=None) - elif self.replay_type == 'prioritized': - episode_r, episode_v = self.episodes[domainInControl].record_final_and_get_episode(state=terminal_state, \ - state_ori=TerminalState(), - action=terminal_action, - reward=reward, - value=value) - - # TD_error is a list of td error in the current episode - _, TD_error = calculate_discountR_advantage(episode_r, episode_v) - episodic_TD = np.mean(np.absolute(TD_error)) - print('episodic_TD') - print(episodic_TD) - self.episodes[domainInControl].insertPriority(episodic_TD) - - return - - def nextAction(self, beliefstate): - ''' - select next action - - :param beliefstate: - :param hyps: - :returns: (int) next summarye action - ''' - - #execMask = self.summaryaction.getExecutableMask(beliefstate, self.lastSystemAction) - execMask = np.zeros(self.action_dim) - - def apply_mask(prob, maskval, baseline=9.99999975e-06): - return prob if maskval == 0.0 else baseline # not quite 0.0 to avoid division by zero - - if self.exploration_type == 'e-greedy' or not self.is_training: - if self.is_training and utils.Settings.random.rand() < self.epsilon: - action_prob = np.random.rand(len(self.action_names)) - else: - action_prob = self.acer.predict_policy(np.reshape(beliefstate, (1, len(beliefstate))), - np.reshape(execMask, (1, len(execMask))))[0] - mu = action_prob / sum(action_prob) - self.prev_mu = mu - self.prev_mask = execMask - return action_prob - - def train(self): - ''' - call this function when the episode ends - ''' - USE_GLOBAL_MU = False - self.episode_ct += 1 - - if not self.is_training: - logger.info("Not in training mode") - return - else: - logger.info("Update acer policy parameters.") - - self.episodecount += 1 - logger.info("Sample Num so far: %s" % (self.samplecount)) - logger.info("Episode Num so far: %s" % (self.episodecount)) - if self.samplecount >= self.minibatch_size * 3 and self.episodecount % self.training_frequency == 0: - #if self.episodecount % self.training_frequency == 0: - logger.info('start 
trainig...') - - for _ in range(self.train_iters_per_episode): - - if self.replay_type == 'vanilla' or self.replay_type == 'prioritized': - s_batch, s_ori_batch, a_batch, r_batch, s2_batch, s2_ori_batch, t_batch, idx_batch, v_batch, mu_policy, mask_batch = \ - self.episodes[self.domainString].sample_batch() - if USE_GLOBAL_MU: - mu_sum = sum(self.global_mu) - mu_normalised = np.array([c / mu_sum for c in self.global_mu]) - mu_policy = [[mu_normalised for _ in range(len(mu_policy[i]))] for i in range(len(mu_policy))] - else: - assert False # not implemented yet - - discounted_r_batch = [] - advantage_batch = [] - def calculate_discountR_advantage(r_episode, v_episode): - ######################################################################### - # Here we take the rewards and values from the rolloutv, and use them to - # generate the advantage and discounted returns. - # The advantage function uses "Generalized Advantage Estimation" - bootstrap_value = 0.0 - # r_episode rescale by rhos? - self.r_episode_plus = np.asarray(r_episode + [bootstrap_value]) - discounted_r_episode = discount(self.r_episode_plus, self.gamma)[:-1] - self.v_episode_plus = np.asarray(v_episode + [bootstrap_value]) - # change sth here - advantage = r_episode + self.gamma * self.v_episode_plus[1:] - self.v_episode_plus[:-1] - advantage = discount(advantage, self.gamma) - ######################################################################### - return discounted_r_episode, advantage - - if self.replay_type == 'prioritized': - for item_r, item_v, item_idx in zip(r_batch, v_batch, idx_batch): - # r, a = calculate_discountR_advantage(item_r, np.concatenate(item_v).ravel().tolist()) - r, a = calculate_discountR_advantage(item_r, item_v) - - # flatten nested numpy array and turn it into list - discounted_r_batch += r.tolist() - advantage_batch += a.tolist() - - # update the sum-tree - # update the TD error of the samples (episode) in the minibatch - episodic_TD_error = np.mean(np.absolute(a)) - self.episodes[self.domainString].update(item_idx, episodic_TD_error) - else: - for item_r, item_v in zip(r_batch, v_batch): - # r, a = calculate_discountR_advantage(item_r, np.concatenate(item_v).ravel().tolist()) - r, a = calculate_discountR_advantage(item_r, item_v) - - # flatten nested numpy array and turn it into list - discounted_r_batch += r.tolist() - advantage_batch += a.tolist() - - batch_size = len(s_batch) - - a_batch_one_hot = np.eye(self.action_dim)[np.concatenate(a_batch, axis=0).tolist()] - - loss, entropy, optimize = \ - self.acer.train(np.concatenate(np.array(s_batch), axis=0).tolist(), a_batch_one_hot, - np.concatenate(np.array(mask_batch), axis=0).tolist(), - np.concatenate(np.array(r_batch), axis=0).tolist(), s_batch, r_batch, self.gamma, - np.concatenate(np.array(mu_policy), axis=0), - discounted_r_batch, advantage_batch) - - ent, norm_loss = entropy/float(batch_size), loss/float(batch_size) - - - self.savePolicyInc() # self.out_policy_file) - - - def savePolicy(self, FORCE_SAVE=False): - """ - Does not use this, cause it will be called from agent after every episode. - we want to save the policy only periodically. 
- """ - pass - - def savePolicyInc(self, FORCE_SAVE=False): - """ - save model and replay buffer - """ - if self.episodecount % self.save_step == 0: - #save_path = self.saver.save(self.sess, self.out_policy_file+'.ckpt') - self.acer.save_network(self.out_policy_file+'.acer.ckpt') - - f = open(self.out_policy_file+'.episode', 'wb') - for obj in [self.samplecount, self.episodes[self.domainString], self.global_mu]: - pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL) - f.close() - #logger.info("Saving model to %s and replay buffer..." % save_path) - - def loadPolicy(self, filename): - """ - load model and replay buffer - """ - # load models - self.acer.load_network(filename+'.acer.ckpt') - - # load replay buffer - if self.load_buffer: - try: - print('load from: ', filename) - f = open(filename+'.episode', 'rb') - loaded_objects = [] - for i in range(2): # load nn params and collected data - loaded_objects.append(pickle.load(f)) - self.samplecount = int(loaded_objects[0]) - self.episodes[self.domainString] = copy.deepcopy(loaded_objects[1]) - self.global_mu = loaded_objects[2] - logger.info("Loading both model from %s and replay buffer..." % filename) - f.close() - except: - logger.info("Loading only models...") - else: - print("We do not load the buffer!") - - def restart(self): - self.summaryAct = None - self.lastSystemAction = None - self.prevbelief = None - self.prev_mu = None - self.prev_mask = None - self.actToBeRecorded = None - self.epsilon = self.epsilon_start - (self.epsilon_start - self.epsilon_end) * float(self.episodeNum+self.episodecount) / float(self.maxiter) - self.episode_ave_max_q = [] - -#END OF FILE diff --git a/policy/feudalgainRL/FeudalBBQNPolicy.py b/policy/feudalgainRL/FeudalBBQNPolicy.py deleted file mode 100644 index 01a6275ac7468b716beaa01e76656a7babf15ddf..0000000000000000000000000000000000000000 --- a/policy/feudalgainRL/FeudalBBQNPolicy.py +++ /dev/null @@ -1,407 +0,0 @@ -############################################################################### -# PyDial: Multi-domain Statistical Spoken Dialogue System Software -############################################################################### -# -# Copyright 2015 - 2019 -# Cambridge University Engineering Department Dialogue Systems Group -# -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -############################################################################### - -''' -DQNPolicy.py - deep Q network policy -================================================== - -Author: Chris Tegho and Pei-Hao (Eddy) Su (Copyright CUED Dialogue Systems Group 2016) - -.. seealso:: CUED Imports/Dependencies: - - import :class:`Policy` - import :class:`utils.ContextLogger` - -.. warning:: - Documentation not done. 
- - -************************ - -''' - -import copy -import os -import json -import numpy as np -import pickle as pickle -import random -import sys -import utils -from utils.Settings import config as cfg -from utils import ContextLogger, DiaAct, DialogueState - -import ontology.FlatOntologyManager as FlatOnt -# from theano_dialogue.util.tool import * - -import tensorflow as tf -from policy.DRL.replay_bufferVanilla import ReplayBuffer -from policy.DRL.replay_prioritisedVanilla import ReplayPrioritised -import policy.DRL.utils as drlutils -from policy.DRL import bdqn as bbqn -import policy.Policy -import policy.SummaryAction -import policy.BBQNPolicy -from policy.Policy import TerminalAction, TerminalState -from policy.feudalRL.DIP_parametrisation import DIP_state, padded_state - -logger = utils.ContextLogger.getLogger('') - -# --- for flattening the belief --- # -domainUtil = FlatOnt.FlatDomainOntology('CamRestaurants') - - -class FeudalBBQNPolicy(policy.BBQNPolicy.BBQNPolicy): - '''Derived from :class:`BBQNPolicy` - ''' - def __init__(self, in_policy_file, out_policy_file, domainString='CamRestaurants', is_training=False, - action_names=None, slot=None): - super(FeudalBBQNPolicy, self).__init__(in_policy_file, out_policy_file, domainString, is_training) - - tf.reset_default_graph() - - self.domainString = domainString - self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString) - self.in_policy_file = in_policy_file - self.out_policy_file = out_policy_file - self.is_training = is_training - self.accum_belief = [] - - self.prev_state_check = None - - self.episode_ave_max_q = [] - - self.capacity *= 4 #set the capacity for episode methods, multiply it to adjust to turn based methods - self.slot = slot - - # init session - self.sess = tf.Session() - with tf.device("/cpu:0"): - - np.random.seed(self.randomseed) - tf.set_random_seed(self.randomseed) - - # initialise an replay buffer - if self.replay_type == 'vanilla': - self.episodes[self.domainString] = ReplayBuffer(self.capacity, self.minibatch_size, self.randomseed) - elif self.replay_type == 'prioritized': - self.episodes[self.domainString] = ReplayPrioritised(self.capacity, self.minibatch_size, - self.randomseed) - # replay_buffer = ReplayBuffer(self.capacity, self.randomseed) - # self.episodes = [] - self.samplecount = 0 - self.episodecount = 0 - - # construct the models - self.state_dim = 89 # current DIP state dim - self.summaryaction = policy.SummaryAction.SummaryAction(domainString) - self.action_names = action_names - self.action_dim = len(self.action_names) - action_bound = len(self.action_names) - self.stats = [0 for _ in range(self.action_dim)] - self.stdVar = [] - self.meanVar = [] - self.stdMean = [] - self.meanMean = [] - self.td_error = [] - self.td_errorVar = [] - - self.target_update_freq = 1 - if cfg.has_option('bbqnpolicy', 'target_update_freq'): - self.target_update_freq = cfg.get('bbqnpolicy', 'target_update_freq') - - #feudal params - self.features = 'dip' - self.sd_enc_size = 25 - self.si_enc_size = 50 - self.dropout_rate = 0. 
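# Editor's note (illustrative sketch, not part of this patch): the block below reads
# the feudal options one at a time with cfg.has_option/cfg.get*, falling back to the
# hard-coded defaults assigned above. A hypothetical helper such as read_option
# captures the same pattern; the name and signature are assumptions for illustration.
def read_option(cfg, section, key, default, getter='get'):
    # return the configured value if the option is present, otherwise the default
    if cfg.has_option(section, key):
        return getattr(cfg, getter)(section, key)
    return default

# e.g. features    = read_option(cfg, 'feudalpolicy', 'features', 'dip')
#      sd_enc_size = read_option(cfg, 'feudalpolicy', 'sd_enc_size', 25, getter='getint')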
- if cfg.has_option('feudalpolicy', 'features'): - self.features = cfg.get('feudalpolicy', 'features') - if cfg.has_option('feudalpolicy', 'sd_enc_size'): - self.sd_enc_size = cfg.getint('feudalpolicy', 'sd_enc_size') - if cfg.has_option('feudalpolicy', 'si_enc_size'): - self.si_enc_size = cfg.getint('feudalpolicy', 'si_enc_size') - if cfg.has_option('feudalpolicy', 'dropout_rate') and self.is_training: - self.dropout_rate = cfg.getfloat('feudalpolicy', 'dropout_rate') - self.actfreq_ds = False - if cfg.has_option('feudalpolicy', 'actfreq_ds'): - self.actfreq_ds = cfg.getboolean('feudalpolicy', 'actfreq_ds') - - if self.features == 'dip': - if self.actfreq_ds: - if self.domainString == 'CamRestaurants': - self.state_dim += 16 - elif self.domainString == 'SFRestaurants': - self.state_dim += 25 - elif self.domainString == 'Laptops11': - self.state_dim += 40 - - self.bbqn = bbqn.DeepQNetwork(self.sess, self.state_dim, self.action_dim, self.learning_rate, self.tau, - action_bound, self.architecture, self.h1_size, self.h2_size, - self.n_samples, - self.minibatch_size, self.sigma_prior, self.n_batches, self.stddev_var_mu, - self.stddev_var_logsigma, self.mean_log_sigma, self.importance_sampling, - self.alpha_divergence, self.alpha, self.sigma_eps) - elif self.features == 'learned' or self.features == 'rnn': - si_state_dim = 72 - if self.actfreq_ds: - if self.domainString == 'CamRestaurants': - si_state_dim += 16 - elif self.domainString == 'SFRestaurants': - si_state_dim += 25 - elif self.domainString == 'Laptops11': - si_state_dim += 40 - if self.domainString == 'CamRestaurants': - sd_state_dim = 94 - elif self.domainString == 'SFRestaurants': - sd_state_dim = 158 - elif self.domainString == 'Laptops11': - sd_state_dim = 13 - else: - logger.error( - 'Domain {} not implemented in feudal-DQN yet') # just find out the size of sd_state_dim for the new domain - if self.features == 'rnn': - arch = 'rnn' - self.state_dim = si_state_dim + sd_state_dim - self.bbqn = bbqn.RNNBBQNetwork(self.sess, si_state_dim, sd_state_dim, self.action_dim, self.learning_rate, - self.tau, action_bound, arch, self.h1_size, self.h2_size, self.n_samples, - self.minibatch_size, self.sigma_prior, self.n_batches, self.stddev_var_mu, - self.stddev_var_logsigma, self.mean_log_sigma, self.importance_sampling, - self.alpha_divergence, self.alpha, self.sigma_eps, sd_enc_size=self.sd_enc_size, - si_enc_size=self.sd_enc_size, dropout_rate=self.dropout_rate, slot=slot) - else: - arch = 'vanilla' - self.state_dim = si_state_dim + sd_state_dim - self.bbqn = bbqn.NNBBQNetwork(self.sess, si_state_dim, sd_state_dim, self.action_dim, self.learning_rate, - self.tau, action_bound, arch, self.h1_size, self.h2_size, self.n_samples, - self.minibatch_size, self.sigma_prior, self.n_batches, self.stddev_var_mu, - self.stddev_var_logsigma, self.mean_log_sigma, self.importance_sampling, - self.alpha_divergence, self.alpha, self.sigma_eps, sd_enc_size=self.sd_enc_size, - si_enc_size=self.sd_enc_size, dropout_rate=self.dropout_rate, slot=slot) - else: - logger.error('features "{}" not implemented'.format(self.features)) - - - - # when all models are defined, init all variables - init_op = tf.global_variables_initializer() - self.sess.run(init_op) - - self.loadPolicy(self.in_policy_file) - print('loaded replay size: ', self.episodes[self.domainString].size()) - - self.bbqn.update_target_network() - - def record(self, reward, domainInControl=None, weight=None, state=None, action=None, exec_mask=None): - if domainInControl is None: - domainInControl = 
self.domainString - if self.actToBeRecorded is None: - # self.actToBeRecorded = self.lastSystemAction - self.actToBeRecorded = self.summaryAct - - if state is None: - state = self.prevbelief - if action is None: - action = self.actToBeRecorded - - cState, cAction = state, action - - reward /= 20.0 - - cur_cState = np.vstack([np.expand_dims(x, 0) for x in [cState]]) - cur_action_q = self.bbqn.predict(cur_cState) - cur_target_q = self.bbqn.predict_target(cur_cState) - - if exec_mask is not None: - admissible = np.add(cur_target_q, np.array(exec_mask)) - else: - admissible = cur_target_q - - Q_s_t_a_t_ = cur_action_q[0][cAction] - gamma_Q_s_tplu1_maxa_ = self.gamma * np.max(admissible) - - if weight == None: - if self.replay_type == 'vanilla': - self.episodes[domainInControl].record(state=cState, \ - state_ori=state, action=cAction, reward=reward) - elif self.replay_type == 'prioritized': - # heuristically assign 0.0 to Q_s_t_a_t_ and Q_s_tplu1_maxa_, doesn't matter as it is not used - if True: - # if self.samplecount >= self.capacity: - self.episodes[domainInControl].record(state=cState, \ - state_ori=state, action=cAction, reward=reward, \ - Q_s_t_a_t_=Q_s_t_a_t_, - gamma_Q_s_tplu1_maxa_=gamma_Q_s_tplu1_maxa_, uniform=False) - else: - self.episodes[domainInControl].record(state=cState, \ - state_ori=state, action=cAction, reward=reward, \ - Q_s_t_a_t_=Q_s_t_a_t_, - gamma_Q_s_tplu1_maxa_=gamma_Q_s_tplu1_maxa_, uniform=True) - - else: - self.episodes[domainInControl].record(state=cState, state_ori=state, action=cAction, reward=reward, - ma_weight=weight) - - self.actToBeRecorded = None - self.samplecount += 1 - return - - def finalizeRecord(self, reward, domainInControl=None): - if domainInControl is None: - domainInControl = self.domainString - if self.episodes[domainInControl] is None: - logger.warning("record attempted to be finalized for domain where nothing has been recorded before") - return - - # normalising total return to -1~1 - # if reward == 0: - # reward = -20.0 - reward /= 20.0 - """ - if reward == 20.0: - reward = 1.0 - else: - reward = -0.5 - """ - # reward = float(reward+10.0)/40.0 - - terminal_state, terminal_action = self.convertStateAction(TerminalState(), TerminalAction()) - - if self.replay_type == 'vanilla': - self.episodes[domainInControl].record(state=terminal_state, \ - state_ori=TerminalState(), action=terminal_action, reward=reward, - terminal=True) - elif self.replay_type == 'prioritized': - # heuristically assign 0.0 to Q_s_t_a_t_ and Q_s_tplu1_maxa_, doesn't matter as it is not used - if True: - # if self.samplecount >= self.capacity: - self.episodes[domainInControl].record(state=terminal_state, \ - state_ori=TerminalState(), action=terminal_action, reward=reward, \ - Q_s_t_a_t_=0.0, gamma_Q_s_tplu1_maxa_=0.0, uniform=False, - terminal=True) - else: - self.episodes[domainInControl].record(state=terminal_state, \ - state_ori=TerminalState(), action=terminal_action, reward=reward, \ - Q_s_t_a_t_=0.0, gamma_Q_s_tplu1_maxa_=0.0, uniform=True, - terminal=True) - - def convertStateAction(self, state, action): - ''' - - ''' - if isinstance(state, TerminalState): - return [0] * 89, action - - else: - if self.features == 'learned' or self.features == 'rnn': - dip_state = padded_state(state.domainStates[state.currentdomain], self.domainString) - else: - dip_state = DIP_state(state.domainStates[state.currentdomain], self.domainString) - action_name = self.actions.action_names[action] - act_slot = 'general' - for slot in dip_state.slots: - if slot in action_name: - act_slot 
= slot - flat_belief = dip_state.get_beliefStateVec(act_slot) - self.prev_state_check = flat_belief - - return flat_belief, action - - def nextAction(self, beliefstate): - ''' - select next action - - :param beliefstate: - :param hyps: - :returns: (int) next summary action - ''' - - if self.exploration_type == 'e-greedy': - # epsilon greedy - if self.is_training and utils.Settings.random.rand() < self.epsilon: - action_Q = np.random.rand(len(self.action_names)) - else: - action_Q = self.bbqn.predict(np.reshape(beliefstate, (1, len(beliefstate)))) # + (1. / (1. + i + j)) - - self.episode_ave_max_q.append(np.max(action_Q)) - - # return the Q vect, the action will be converted in the feudal policy - return action_Q - - - def train(self): - ''' - call this function when the episode ends - ''' - - if not self.is_training: - logger.info("Not in training mode") - return - else: - logger.info("Update dqn policy parameters.") - - self.episodecount += 1 - logger.info("Sample Num so far: %s" % (self.samplecount)) - logger.info("Episode Num so far: %s" % (self.episodecount)) - - if self.samplecount >= self.minibatch_size * 10 and self.episodecount % self.training_frequency == 0: - logger.info('start training...') - - s_batch, s_ori_batch, a_batch, r_batch, s2_batch, s2_ori_batch, t_batch, idx_batch, _ = \ - self.episodes[self.domainString].sample_batch() - - s_batch = np.vstack([np.expand_dims(x, 0) for x in s_batch]) - s2_batch = np.vstack([np.expand_dims(x, 0) for x in s2_batch]) - - a_batch_one_hot = np.eye(self.action_dim, self.action_dim)[a_batch] - action_q = self.bbqn.predict_dip(s2_batch, a_batch_one_hot) - target_q = self.bbqn.predict_target_dip(s2_batch, a_batch_one_hot) - # print 'action Q and target Q:', action_q, target_q - - y_i = [] - for k in range(min(self.minibatch_size, self.episodes[self.domainString].size())): - Q_bootstrap_label = 0 - if t_batch[k]: - Q_bootstrap_label = r_batch[k] - else: - if self.q_update == 'single': - belief = s2_ori_batch[k] - execMask = [0.0] * len(self.action_names) # TODO: find out how to compute the mask here, or save it when recording the state - execMask[-1] = -sys.maxsize - action_Q = target_q[k] - admissible = np.add(action_Q, np.array(execMask)) - Q_bootstrap_label = r_batch[k] + self.gamma * np.max(admissible) - - y_i.append(Q_bootstrap_label) - - # Update the critic given the targets - reshaped_yi = np.vstack([np.expand_dims(x, 0) for x in y_i]) - - predicted_q_value, _, currentLoss, logLikelihood, varFC2, meanFC2, td_error, KL_div = self.bbqn.train(s_batch, a_batch_one_hot, reshaped_yi, self.episodecount) - - if self.episodecount % self.target_update_freq == 0: - self.bbqn.update_target_network() - if self.episodecount % self.save_step == 0: - self.savePolicyInc() # self.out_policy_file) - - -# END OF FILE diff --git a/policy/feudalgainRL/FeudalBBQNPolicyNew.py b/policy/feudalgainRL/FeudalBBQNPolicyNew.py deleted file mode 100644 index 6d35709818b1fdeed16592c3bcc0cafb9a21c727..0000000000000000000000000000000000000000 --- a/policy/feudalgainRL/FeudalBBQNPolicyNew.py +++ /dev/null @@ -1,416 +0,0 @@ -############################################################################### -# PyDial: Multi-domain Statistical Spoken Dialogue System Software -############################################################################### -# -# Copyright 2015 - 2019 -# Cambridge University Engineering Department Dialogue Systems Group -# -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with 
the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -############################################################################### - -''' -DQNPolicy.py - deep Q network policy -================================================== - -Copyright CUED Dialogue Systems Group 2015 - 2017 - -.. seealso:: CUED Imports/Dependencies: - - import :class:`Policy` - import :class:`utils.ContextLogger` - -.. warning:: - Documentation not done. - - -************************ - -''' - -import copy -import os -import sys -import json -import numpy as np -import pickle as pickle -from itertools import product -from scipy.stats import entropy -import utils -from utils.Settings import config as cfg -from utils import ContextLogger, DiaAct, DialogueState - -import ontology.FlatOntologyManager as FlatOnt -import tensorflow as tf -from policy.DRL.replay_buffer import ReplayBuffer -from policy.DRL.replay_prioritised import ReplayPrioritised -import policy.DRL.utils as drlutils -import policy.DRL.dqn as dqn -import policy.Policy -import policy.DQNPolicy -import policy.SummaryAction -from policy.Policy import TerminalAction, TerminalState -from policy.feudalRL.DIP_parametrisation import DIP_state, padded_state -from policy.feudalRL.feudalUtils import get_feudal_masks -from policy.DRL import bdqn as bbqn - - -logger = utils.ContextLogger.getLogger('') - - -class FeudalDQNPolicy(policy.DQNPolicy.DQNPolicy): - '''Derived from :class:`DQNPolicy` - ''' - - def __init__(self, in_policy_file, out_policy_file, domainString='CamRestaurants', is_training=False, - action_names=None, slot=None, sd_state_dim=50, js_threshold=0, info_reward=0.0, jsd_reward=False, - jsd_function=None): - super(FeudalDQNPolicy, self).__init__(in_policy_file, out_policy_file, domainString, is_training) - - tf.reset_default_graph() - - self.domainString = domainString - self.sd_state_dim = sd_state_dim - self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString) - self.in_policy_file = in_policy_file - self.out_policy_file = out_policy_file - self.is_training = is_training - self.accum_belief = [] - self.info_reward = info_reward - self.js_threshold = js_threshold - self.jsd_reward = jsd_reward - self.jsd_function = jsd_function - if self.jsd_function is not None: - print("We use the JSD-function", self.jsd_function) - if self.js_threshold != 1.0 and not self.jsd_reward: - print("We use JS-divergence, threshold =", self.js_threshold) - if self.jsd_reward: - print("We train with raw JSD reward.") - self.slots = slot - self.features = 'dip' - if cfg.has_option('feudalpolicy', 'features'): - self.features = cfg.get('feudalpolicy', 'features') - self.actfreq_ds = False - if cfg.has_option('feudalpolicy', 'actfreq_ds'): - self.actfreq_ds = cfg.getboolean('feudalpolicy', 'actfreq_ds') - - self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString) - self.prev_state_check = None - - self.max_k = 5 - if cfg.has_option('dqnpolicy', 'max_k'): - self.max_k = cfg.getint('dqnpolicy', 'max_k') - - self.capacity *= 5 # capacity for episode methods, multiply it to adjust to turn based methods - - # init session - self.sess = tf.Session() - with tf.device("/cpu:0"): - - 
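# Editor's note (illustrative sketch, not part of this patch): the js_threshold and
# jsd_reward options handled above shape an information-gain reward from the
# Jensen-Shannon divergence between the belief over a slot before and after a turn.
# A minimal stand-alone version of that computation, mirroring compute_js_divergence
# defined later in this class:
from scipy.stats import entropy

def js_divergence(p, q):
    # scipy's entropy(pk, qk) normalises both arguments, so passing p + q as the
    # mixture is equivalent to using m = (p + q) / 2
    m = [p_i + q_i for p_i, q_i in zip(p, q)]
    return 0.5 * (entropy(p, m, base=2) + entropy(q, m, base=2))

# e.g. js_divergence([1.0, 0.0], [0.5, 0.5]) is roughly 0.31 bits, while two
# identical belief vectors give 0.0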
np.random.seed(self.randomseed) - tf.set_random_seed(self.randomseed) - - # initialise a replay buffer - if self.replay_type == 'vanilla': - self.episodes[self.domainString] = ReplayBuffer(self.capacity, self.minibatch_size*4, self.randomseed) - elif self.replay_type == 'prioritized': - self.episodes[self.domainString] = ReplayPrioritised(self.capacity, self.minibatch_size, - self.randomseed) - self.samplecount = 0 - self.episodecount = 0 - - # construct the models - self.summaryaction = policy.SummaryAction.SummaryAction(domainString) - self.action_names = action_names - self.action_dim = len(self.action_names) - action_bound = len(self.action_names) - self.stats = [0 for _ in range(self.action_dim)] - - if self.features == 'learned' or self.features == 'rnn': - si_state_dim = 73 - if self.actfreq_ds: - if self.domainString == 'CamRestaurants': - si_state_dim += 9#16 - elif self.domainString == 'SFRestaurants': - si_state_dim += 9#25 - elif self.domainString == 'Laptops11': - si_state_dim += 9#40 - self.sd_enc_size = 50 - self.si_enc_size = 25 - self.dropout_rate = 0. - if cfg.has_option('feudalpolicy', 'sd_enc_size'): - self.sd_enc_size = cfg.getint('feudalpolicy', 'sd_enc_size') - if cfg.has_option('feudalpolicy', 'si_enc_size'): - self.si_enc_size = cfg.getint('feudalpolicy', 'si_enc_size') - if cfg.has_option('dqnpolicy', 'dropout_rate') and self.is_training: - self.dropout_rate = cfg.getfloat('feudalpolicy', 'dropout_rate') - if cfg.has_option('dqnpolicy', 'dropout_rate') and self.is_training: - self.dropout_rate = cfg.getfloat('feudalpolicy', 'dropout_rate') - - self.state_dim = si_state_dim + sd_state_dim - if self.features == 'learned': - - self.dqn = bbqn.DeepQNetwork(self.sess, self.state_dim, self.action_dim, self.learning_rate, - self.tau, - action_bound, self.architecture, self.h1_size, self.h2_size, - self.n_samples, - self.minibatch_size) - - elif self.features == 'rnn': - self.dqn = dqn.RNNFDeepQNetwork(self.sess, si_state_dim, sd_state_dim, self.action_dim, - self.learning_rate, self.tau, action_bound, self.minibatch_size, - self.architecture, self.h1_size, self.h2_size, - sd_enc_size=self.sd_enc_size, si_enc_size=self.si_enc_size, - dropout_rate=self.dropout_rate, slot=self.slot) - else: # self.features = 'dip' - if self.actfreq_ds: - if self.domainString == 'CamRestaurants': - self.state_dim += 9#16 - elif self.domainString == 'SFRestaurants': - self.state_dim += 9#25 - elif self.domainString == 'Laptops11': - self.state_dim += 9#40 - self.dqn = dqn.DeepQNetwork(self.sess, self.state_dim, self.action_dim, - self.learning_rate, self.tau, action_bound, self.minibatch_size, - self.architecture, self.h1_size, - self.h2_size, dropout_rate=self.dropout_rate) - - # when all models are defined, init all variables (this might to be sent to the main policy too) - init_op = tf.global_variables_initializer() - self.sess.run(init_op) - - self.loadPolicy(self.in_policy_file) - print('loaded replay size: ', self.episodes[self.domainString].size()) - - self.dqn.update_target_network() - - def record(self, reward, domainInControl=None, weight=None, state=None, action=None, exec_mask=None): - if domainInControl is None: - domainInControl = self.domainString - if self.actToBeRecorded is None: - self.actToBeRecorded = self.summaryAct - - if state is None: - state = self.prevbelief - if action is None: - action = self.actToBeRecorded - - cState, cAction = state, action - # normalising total return to -1~1 - reward /= 20.0 - - if self.replay_type == 'vanilla': - 
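# Editor's note (added comment): only the vanilla branch stores per-turn transitions
# here, with the reward already scaled to the -1~1 range used throughout these
# policies; the prioritized buffer is written to when the episode is finalised below.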
self.episodes[domainInControl].record(state=cState, \ - state_ori=state, action=cAction, reward=reward) - - self.actToBeRecorded = None - self.samplecount += 1 - - def finalizeRecord(self, reward, domainInControl=None): - if domainInControl is None: - domainInControl = self.domainString - if self.episodes[domainInControl] is None: - logger.warning("record attempted to be finalized for domain where nothing has been recorded before") - return - - reward /= 20.0 - - terminal_state, terminal_action = self.convertStateAction(TerminalState(), TerminalAction()) - - if self.replay_type == 'vanilla': - self.episodes[domainInControl].record(state=terminal_state, \ - state_ori=TerminalState(), action=terminal_action, reward=reward, - terminal=True) - elif self.replay_type == 'prioritized': - self.episodes[domainInControl].record(state=terminal_state, \ - state_ori=TerminalState(), action=terminal_action, reward=reward, \ - Q_s_t_a_t_=0.0, gamma_Q_s_tplu1_maxa_=0.0, uniform=False, - terminal=True) - print('total TD', self.episodes[self.domainString].tree.total()) - - def convertStateAction(self, state, action): - ''' - - ''' - if isinstance(state, TerminalState): - return [0] * 89, action - else: - if self.features == 'learned' or self.features == 'rnn': - dip_state = padded_state(state.domainStates[state.currentdomain], self.domainString) - else: - dip_state = DIP_state(state.domainStates[state.currentdomain], self.domainString) - action_name = self.actions.action_names[action] - act_slot = 'general' - for slot in dip_state.slots: - if slot in action_name: - act_slot = slot - flat_belief = dip_state.get_beliefStateVec(act_slot) - self.prev_state_check = flat_belief - - return flat_belief, action - - def nextAction(self, beliefstate): - ''' - select next action - - :param beliefstate: already converted to dipstatevec of the specific slot (or general) - :returns: (int) next summary action - ''' - - if self.exploration_type == 'e-greedy': - # epsilon greedy - if self.is_training and utils.Settings.random.rand() < self.epsilon: - action_Q = np.random.rand(len(self.action_names)) - else: - if len(beliefstate.shape) == 1: - action_Q = self.dqn.predict(np.reshape(beliefstate, (1, -1))) - else: - action_Q = self.dqn.predict(beliefstate) - # add current max Q to self.episode_ave_max_q - self.episode_ave_max_q.append(np.max(action_Q)) - - #return the Q vect, the action will be converted in the feudal policy - return action_Q - - def train(self): - ''' - call this function when the episode ends - ''' - - if not self.is_training: - logger.info("Not in training mode") - return - else: - logger.info("Update dqn policy parameters.") - - self.episodecount += 1 - logger.info("Sample Num so far: %s" % (self.samplecount)) - logger.info("Episode Num so far: %s" % (self.episodecount)) - - s_batch_new, s_batch_beliefstate, s_batch_chosen_slot, s2_batch_dipstate, s2_batch_beliefstate, t_batch_new, r_batch_new = \ - [], [], [], [], [], [], [] - - if self.samplecount >= self.minibatch_size * 8 and self.episodecount % self.training_frequency == 0: - logger.info('start training...') - - a_batch_one_hot_new = None - #updating only states where the action is not "pass()" complicates things :/ - #since in a batch we can take only non-pass() actions, we have to loop a bit until we get enough samples - - while len(s_batch_new) < self.minibatch_size: - - s_batch, s_ori_batch, a_batch, r_batch, s2_batch, s2_ori_batch, t_batch, idx_batch, _ = \ - self.episodes[self.domainString].sample_batch() - - a_batch_one_hot = 
np.eye(self.action_dim, self.action_dim)[a_batch] - #we only wanna update state-action pairs, where action != pass() - valid_steps = [action[-1] != 1 for action in a_batch_one_hot] - a_batch_one_hot = a_batch_one_hot[valid_steps] - - s_batch_new += [s[0] for i, s in enumerate(s_batch) if valid_steps[i]] - s_batch_beliefstate += [s[1] for i, s in enumerate(s_batch) if valid_steps[i]] - s_batch_chosen_slot += [s[2] for i, s in enumerate(s_batch) if valid_steps[i]] - - s2_batch_dipstate += [s[3] for s, valid in zip(s2_batch, valid_steps) if valid] - s2_batch_beliefstate += [s[1] for s, valid in zip(s2_batch, valid_steps) if valid] - - r_batch_new += [r for r, valid in zip(r_batch, valid_steps) if valid] - t_batch_new += [t for t, valid in zip(t_batch, valid_steps) if valid] - - if a_batch_one_hot_new is None: - a_batch_one_hot_new = a_batch_one_hot - else: - a_batch_one_hot_new = np.vstack((a_batch_one_hot_new, a_batch_one_hot)) - - s_batch_new = np.vstack(s_batch_new) - s2_batch_dipstate = np.vstack(s2_batch_dipstate) - - if self.js_threshold < 1.0 or self.jsd_reward: - #TODO: This is highly inefficient - js_divergence_batch = [] - for belief, belief2, slot in zip(s_batch_beliefstate, s2_batch_beliefstate, s_batch_chosen_slot): - if slot != "None": - keys = belief['beliefs'][slot].keys() - - b = [belief['beliefs'][slot]['**NONE**']] + \ - [belief['beliefs'][slot][value] for value in list(keys) if value != '**NONE**'] - - b_2 = [belief2['beliefs'][slot]['**NONE**']] + \ - [belief2['beliefs'][slot][value] for value in list(keys) if value != '**NONE**'] - - js_divergence = self.compute_js_divergence(b, b_2) - js_divergence_batch.append(js_divergence) - else: - js_divergence_batch.append(0.0) - else: - js_divergence_batch = [0] * len(r_batch_new) - - tanh_n = np.tanh(1) - if self.jsd_reward: - if self.jsd_function == 'tanh': - js_divergence_batch = np.tanh(np.array(js_divergence_batch)) / tanh_n - #normalize jsd between -1 and 1 - js_divergence_batch = (-1 + 2 * np.array(js_divergence_batch)).tolist() - elif self.js_threshold < 1.0: - # normalizing bound to [0, 2] and then /20 - js_divergence_batch = [2/20 * int(x > self.js_threshold) for x in js_divergence_batch] - - action_q = self.dqn.predict_dip(s2_batch_dipstate, a_batch_one_hot_new) - target_q = self.dqn.predict_target_dip(s2_batch_dipstate, a_batch_one_hot_new) - - action_q = np.reshape(action_q, (s_batch_new.shape[0], -1, self.action_dim)) - target_q = np.reshape(target_q, (s_batch_new.shape[0], -1, self.action_dim)) - - y_i = [] - for k in range(min(s_batch_new.shape[0], self.episodes[self.domainString].size())): - Q_bootstrap_label = 0 - if t_batch_new[k]: - Q_bootstrap_label = r_batch_new[k] - else: - if self.q_update == 'single': - action_Q = target_q[k] - if self.jsd_reward: - Q_bootstrap_label = js_divergence_batch[k] + self.gamma * np.max(action_Q) - else: - Q_bootstrap_label = r_batch_new[k] + js_divergence_batch[k] + self.gamma * np.max(action_Q) - elif self.q_update == 'double': - action_Q = action_q[k] - argmax_tuple = np.unravel_index(np.argmax(action_Q, axis=None), action_Q.shape) - value_Q = target_q[k][argmax_tuple] - Q_bootstrap_label = r_batch_new[k] + js_divergence_batch[k] + self.gamma * value_Q - y_i.append(Q_bootstrap_label) - - if self.replay_type == 'prioritized': - # update the sum-tree - # update the TD error of the samples in the minibatch - currentQ_s_a_ = action_q[k][a_batch[k]] - error = abs(currentQ_s_a_ - Q_bootstrap_label) - self.episodes[self.domainString].update(idx_batch[k], error) - - reshaped_yi = 
np.vstack([np.expand_dims(x, 0) for x in y_i]) - - predicted_q_value, _, currentLoss = self.dqn.train(s_batch_new, a_batch_one_hot_new, reshaped_yi) - - if self.episodecount % 1 == 0: - # Update target networks - self.dqn.update_target_network() - - self.savePolicyInc() - - def compute_js_divergence(self, P, Q): - - M = [p + q for p, q in zip(P, Q)] - return 0.5 * (entropy(P, M, base=2) + entropy(Q, M, base=2)) - -# END OF FILE diff --git a/policy/feudalgainRL/FeudalENACPolicy.py b/policy/feudalgainRL/FeudalENACPolicy.py deleted file mode 100644 index 216c90e3120f66aa13e49ca2f3db4204711b442a..0000000000000000000000000000000000000000 --- a/policy/feudalgainRL/FeudalENACPolicy.py +++ /dev/null @@ -1,514 +0,0 @@ -############################################################################### -# PyDial: Multi-domain Statistical Spoken Dialogue System Software -############################################################################### -# -# Copyright 2015 - 2019 -# Cambridge University Engineering Department Dialogue Systems Group -# -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -############################################################################### - -''' -ENACPolicy.py - Advantage Actor-Critic policy -================================================== - -Copyright CUED Dialogue Systems Group 2015 - 2017 - -.. seealso:: CUED Imports/Dependencies: - - import :class:`Policy` - import :class:`utils.ContextLogger` - -.. warning:: - Documentation not done. - - -************************ - -''' - -import copy -import os -import json -import numpy as np -import scipy -import scipy.signal -import pickle as pickle -import random -import utils -from utils.Settings import config as cfg -from utils import ContextLogger, DiaAct - -import ontology.FlatOntologyManager as FlatOnt -#from theano_dialogue.util.tool import * - -import tensorflow as tf -from policy.DRL.replay_buffer_episode_enac import ReplayBufferEpisode -from policy.DRL.replay_prioritised_episode import ReplayPrioritisedEpisode -import policy.DRL.utils as drlutils -import policy.DRL.enac as enac -import policy.Policy -from policy.ENACPolicy import ENACPolicy -import policy.SummaryAction -from policy.Policy import TerminalAction, TerminalState -from policy.feudalRL.DIP_parametrisation import DIP_state, padded_state -from policy.feudalRL.feudalUtils import get_feudal_masks - -logger = utils.ContextLogger.getLogger('') - - -# Discounting function used to calculate discounted returns. 
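# Editor's note (added comment): lfilter([1], [1, -gamma], x[::-1])[::-1] applies the
# recursion G_t = x_t + gamma * G_{t+1} to the reversed sequence, so a single pass
# yields the discounted return for every time step of the episode.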
-def discount(x, gamma): - return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1] - -class FeudalENACPolicy(ENACPolicy): - '''Derived from :class:`Policy` - ''' - def __init__(self, in_policy_file, out_policy_file, domainString='CamRestaurants', is_training=False, action_names=None, slot=None): - super(FeudalENACPolicy, self).__init__(in_policy_file, out_policy_file, domainString=domainString, is_training=is_training) - - tf.reset_default_graph() - - self.in_policy_file = in_policy_file - self.out_policy_file = out_policy_file - self.is_training = is_training - self.accum_belief = [] - self.prev_state_check = None - - self.domainString = domainString - self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString) - - self.features = 'dip' - self.sd_enc_size = 80 - self.si_enc_size = 40 - self.dropout_rate = 0. - if cfg.has_option('feudalpolicy', 'features'): - self.features = cfg.get('feudalpolicy', 'features') - if cfg.has_option('feudalpolicy', 'sd_enc_size'): - self.sd_enc_size = cfg.getint('feudalpolicy', 'sd_enc_size') - if cfg.has_option('feudalpolicy', 'si_enc_size'): - self.si_enc_size = cfg.getint('feudalpolicy', 'si_enc_size') - if cfg.has_option('dqnpolicy', 'dropout_rate') and self.is_training: - self.dropout_rate = cfg.getfloat('feudalpolicy', 'dropout_rate') - if cfg.has_option('dqnpolicy', 'dropout_rate') and self.is_training: - self.dropout_rate = cfg.getfloat('feudalpolicy', 'dropout_rate') - - - # init session - self.sess = tf.Session() - with tf.device("/cpu:0"): - - np.random.seed(self.randomseed) - tf.set_random_seed(self.randomseed) - - # initialise an replay buffer - if self.replay_type == 'vanilla': - self.episodes[self.domainString] = ReplayBufferEpisode(self.capacity, self.minibatch_size, self.randomseed) - elif self.replay_type == 'prioritized': - self.episodes[self.domainString] = ReplayPrioritisedEpisode(self.capacity, self.minibatch_size, self.randomseed) - #replay_buffer = ReplayBuffer(self.capacity, self.randomseed) - #self.episodes = [] - self.samplecount = 0 - self.episodecount = 0 - - # construct the models - self.state_dim = 89 # current DIP state dim - self.summaryaction = policy.SummaryAction.SummaryAction(domainString) - self.action_names = action_names - self.action_dim = len(self.action_names) - action_bound = len(self.action_names) - self.stats = [0 for _ in range(self.action_dim)] - - if self.features == 'dip': - self.enac = enac.ENACNetwork(self.sess, self.state_dim, self.action_dim, self.critic_lr, self.tau, - action_bound, self.architecture, self.h1_size, self.h2_size, self.is_training) - elif self.features == 'learned' or self.features == 'rnn': - si_state_dim = 72 - if self.domainString == 'CamRestaurants': - sd_state_dim = 94 - elif self.domainString == 'SFRestaurants': - sd_state_dim = 158 - elif self.domainString == 'Laptops11': - sd_state_dim = 13 - else: - logger.error( - 'Domain {} not implemented in feudal-DQN yet') # just find out the size of sd_state_dim for the new domain - if self.features == 'rnn': - arch = 'rnn' - else: - arch = 'vanilla' - self.state_dim = si_state_dim + sd_state_dim - self.enac = enac.ENACNetwork(self.sess, self.state_dim, self.action_dim, self.critic_lr, self.tau, - action_bound, self.architecture, self.h1_size, self.h2_size, self.is_training) - else: - logger.error('features "{}" not implemented'.format(self.features)) - - # when all models are defined, init all variables - init_op = tf.global_variables_initializer() - self.sess.run(init_op) - - self.loadPolicy(self.in_policy_file) - 
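# Editor's note (illustrative sketch, not part of this patch): the
# calculate_discountR_advantage helpers used by the ACER policies earlier in this
# diff (and calculate_discountR further below) follow the "Generalized Advantage
# Estimation" recipe with a zero bootstrap value. A compact stand-alone version,
# re-stating the discount helper defined above:
import numpy as np
import scipy.signal

def discount(x, gamma):
    # same helper as above: discounted cumulative sum over a trajectory
    return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]

def discounted_returns_and_gae(rewards, values, gamma):
    r_plus = np.asarray(list(rewards) + [0.0])                        # bootstrap value 0
    v_plus = np.asarray(list(values) + [0.0])
    discounted_r = discount(r_plus, gamma)[:-1]                       # per-step discounted returns
    deltas = np.asarray(rewards) + gamma * v_plus[1:] - v_plus[:-1]   # TD residuals
    advantages = discount(deltas, gamma)                              # GAE with lambda = 1
    return discounted_r, advantages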
print('loaded replay size: ', self.episodes[self.domainString].size()) - - - def convertStateAction(self, state, action): - ''' - - ''' - if isinstance(state, TerminalState): - return [0] * 89, action - - else: - if self.features == 'learned' or self.features == 'rnn': - dip_state = padded_state(state.domainStates[state.currentdomain], self.domainString) - else: - dip_state = DIP_state(state.domainStates[state.currentdomain], self.domainString) - action_name = self.actions.action_names[action] - act_slot = 'general' - for slot in dip_state.slots: - if slot in action_name: - act_slot = slot - flat_belief = dip_state.get_beliefStateVec(act_slot) - self.prev_state_check = flat_belief - - return flat_belief, action - - def record(self, reward, domainInControl=None, weight=None, state=None, action=None): - if domainInControl is None: - domainInControl = self.domainString - if self.actToBeRecorded is None: - #self.actToBeRecorded = self.lastSystemAction - self.actToBeRecorded = self.summaryAct - - if state is None: - state = self.prevbelief - if action is None: - action = self.actToBeRecorded - - cState, cAction = state, action - - # normalising total return to -1~1 - reward /= 20.0 - - #value = self.a2c.predict_value([cState]) - value = np.array([[0.0]]) - policy_mu = self.mu_prob - - if weight == None: - if self.replay_type == 'vanilla': - self.episodes[domainInControl].record(state=cState, \ - state_ori=state, action=cAction, reward=reward, value=value[0][0], distribution=policy_mu) - elif self.replay_type == 'prioritized': - self.episodes[domainInControl].record(state=cState, \ - state_ori=state, action=cAction, reward=reward, value=value[0][0], distribution=policy_mu) - else: - self.episodes[domainInControl].record(state=cState, state_ori=state, action=cAction, reward=reward, ma_weight=weight) - - self.actToBeRecorded = None - self.samplecount += 1 - return - - def nextAction(self, beliefstate): - ''' - select next action - - :param beliefstate: - :returns: (int) next summary action - ''' - - if self.exploration_type == 'e-greedy': - - # epsilon greedy - if self.is_training and utils.Settings.random.rand() < self.epsilon: - action_prob = np.random.rand(len(self.action_names)) - - # Importance sampling (should be turned off) - #if nextaIdex == greedyNextaIdex: - # self.mu_prob = self.epsilon / float(self.action_dim) + 1 - self.epsilon - #else: - # self.mu_prob = self.epsilon / float(self.action_dim) - else: - action_prob = self.enac.predict_policy(np.reshape(beliefstate, (1, len(beliefstate)))) - - # add current max Q to self.episode_ave_max_q - #print 'current maxQ', np.max(admissible) - #self.episode_ave_max_q.append(np.max(admissible)) - - # Importance sampling - #self.mu_prob = self.epsilon / float(self.action_dim) + 1 - self.epsilon - - return action_prob - - def train(self): - ''' - call this function when the episode ends - ''' - - if not self.is_training: - logger.info("Not in training mode") - return - else: - logger.info("Update enac policy parameters.") - - self.episodecount += 1 - logger.info("Sample Num so far: %s" %(self.samplecount)) - logger.info("Episode Num so far: %s" %(self.episodecount)) - - if self.samplecount >= self.minibatch_size and self.episodecount % self.training_frequency == 0: - logger.info('start training...') - - s_batch, s_ori_batch, a_batch, r_batch, s2_batch, s2_ori_batch, t_batch, idx_batch, v_batch, mu_policy = \ - self.episodes[self.domainString].sample_batch() - - discounted_return_batch = [] - - - def weightsImportanceSampling(mu_policy, r_batch): - 
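# Editor's note (added comment): this helper computes per-step importance ratios
# rho_t = pi(a_t|s_t) / mu(a_t|s_t) between the current policy pi and the behaviour
# policy mu stored with each dialogue in the replay buffer (only the 'soft'
# importance-sampling branch below fills these in). Within every dialogue the ratios
# are accumulated as a cumulative product (rho_forward), the product over the whole
# dialogue is kept in rho_whole, and both the cumulative and the one-step weights
# are clipped to [0.5, 1] before the rewards are reweighted with them.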
mu_policy = np.asarray(mu_policy) - mu_cum = [] - lenghts = [] # to properly divide on dialogues pi_policy later on - for mu in mu_policy: - lenghts.append(len(mu)) - mu = np.asarray(mu).astype(np.longdouble) - mu_cum.append(np.cumprod(mu[::-1])[::-1]) # going forward with cumulative product - # mu_cum = np.concatenate(np.array(mu_cum), axis=0).tolist() - mu_policy = np.concatenate(np.array(mu_policy), axis=0).tolist() # concatenate all behavioral probs - lengths = np.cumsum(lenghts) # time steps for ends of dialogues - lengths = np.concatenate((np.array([0]), lengths), axis=0) # add first dialogue - - if self.importance_sampling == 'max': - pass - elif self.importance_sampling == "soft": - # get the probabilities of actions taken from the batch - pi_policy = self.enac.getPolicy(np.concatenate(np.array(s_batch), axis=0).tolist())[0] # policy given s_t - columns = np.asarray([np.concatenate(a_batch, axis=0).tolist()]).astype(int) # actions taken at s_t - rows = np.asarray([ii for ii in range(len(pi_policy))]) - pi_policy = pi_policy[rows, columns][0].astype(np.longdouble) # getting probabilities for current policy - - ##################################### - # Weights for importance sampling - # it goes through each dialogue and computes in reverse order cumulative prod: - # rho_n = pi_n / mu_n - # ... - # rho_1 = pi_1 / mu_1 * ... * pi_n / mu_n - # using dialogue and weight_cum lists - ##################################### - - rho_forward = [] # rho_forward from eq. 3.3 (the first one) - rho_whole = [] # product across the whole dialogue from eq. 3.3 (the second one) - #pi_cum2 = [] # stats to compare - #mu_cum2 = [] # stats to compare - #pi_cum = [] # stats to compare - - # Precup version - r_vector = np.concatenate(np.array(r_batch), axis=0).tolist() - r_weighted = [] - - for ii in range(len(lengths) - 1): # over dialogues - weight_cum = 1. - dialogue = [] - - for pi, mu in zip(pi_policy[lengths[ii]:lengths[ii + 1]], mu_policy[lengths[ii]:lengths[ii + 1]]): - weight_cum *= pi / mu - dialogue.append(weight_cum) - - dialogue = np.array(dialogue) - dialogue = np.clip(dialogue, 0.5, 1) # clipping the weights - dialogue = dialogue.tolist() - - rho_forward.extend(dialogue) - #rho_whole.append(dialogue[-1]) - rho_whole.extend(np.ones(len(dialogue)) * dialogue[-1]) - r_weighted.extend(r_vector[lengths[ii]: lengths[ii + 1]] * np.asarray(dialogue)) - - # go back to original form: - ind = 0 - r_new = copy.deepcopy(r_batch) - for id, batch in enumerate(r_new): - for id2, _ in enumerate(batch): - r_new[id][id2] = r_weighted[ind] - ind += 1 - - # ONE STEP WEIGHTS - weights = np.asarray(pi_policy) / np.asarray(mu_policy) - weights = np.clip(weights, 0.5, 1) # clipping the weights - - return weights, rho_forward, rho_whole, r_new - - weights, rho_forward, rho_whole, r_new = weightsImportanceSampling(mu_policy, r_batch) - - weights = np.nan_to_num(weights) - rho_forward = np.nan_to_num(rho_forward) - rho_whole = np.nan_to_num(rho_whole) - """ - print 'w',weights - print 'rho_for',rho_forward - print 'rho_who',rho_whole - """ - - def calculate_discountR(r_episode, idx): - ######################################################################### - # Here we take the rewards and values from the rollouts, and use them to - # generate the advantage and discounted returns. - # The advantage function uses "Generalized Advantage Estimation" - bootstrap_value = 0.0 - # r_episode rescale by rhos? 
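# Editor's note (added comment): calculate_discountR returns the discounted return
# from step idx to the end of the dialogue with a bootstrap value of 0; when
# importance sampling is switched on the return is left as computed (the rewards in
# r_new were already reweighted above), otherwise it is divided by rho_forward[idx].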
- self.r_episode_plus = np.asarray(r_episode[idx:] + [bootstrap_value]) - if self.importance_sampling: - self.r_episode_plus = self.r_episode_plus - else: - self.r_episode_plus = self.r_episode_plus/rho_forward[idx] - discounted_r_episode = discount(self.r_episode_plus, self.gamma)[:-1] - ######################################################################### - return discounted_r_episode[0] - - if self.replay_type == 'prioritized': - for item_r, item_v, item_idx in zip(r_new, v_batch, idx_batch): - rlist = [] - for idx in range(len(item_r)): - r = calculate_discountR(item_r, idx) - rlist.append(r) - - discounted_return_batch.append(rlist[-1]) - else: - for item_r, item_v in zip(r_new, v_batch): - rlist = [] - for idx in range(len(item_r)): - r = calculate_discountR(item_r, idx) - rlist.append(r) - - discounted_return_batch.append(rlist[-1]) - - batch_size = len(s_batch) - - if self.importance_sampling: - discounted_return_batch = np.clip(discounted_return_batch, -1, 1) - - # get gradient info and create matrix - gradient_matrix = [] - for item_s, item_a in zip(s_batch, a_batch): - item_a_one_hot = np.eye(self.action_dim)[item_a] - policy_gradient = self.enac.get_policy_gradient(item_s, item_a_one_hot) - policy_gradient = [(policy_gradient_idv.flatten()).tolist() for policy_gradient_idv in policy_gradient] - policy_gradient_flatten = np.hstack(policy_gradient) - policy_gradient_flatten = np.append(policy_gradient_flatten, [1.0]) - gradient_matrix.append(policy_gradient_flatten.tolist()) - - gradient_matrix = np.matrix(gradient_matrix) - return_matrix = np.matrix(discounted_return_batch) - - logger.info("Updating eNAC policy parameters, before calculate eNac matrix") - try: - natural_gradient = np.dot(np.linalg.pinv(gradient_matrix), return_matrix.T) - # convert a matrix to list-like array - natural_gradient = np.array(natural_gradient.flatten()).ravel() - natural_gradient = natural_gradient[:-1] # discard the last element - except: - natural_gradient = self.natural_gradient_prev - print('SVD problem') - - logger.info("Updating eNAC policy parameters, after calculate eNac matrix") - - self.natural_gradient_prev = natural_gradient - - all_params = self.enac.get_params() - - cnt = 0 - modelW = [] - modelB = [] - for variable in all_params: - - shape = variable.shape - # weight matrix - if np.array(variable).ndim == 1: - until = np.array(variable).shape[0] - subNG = np.reshape(natural_gradient[cnt:cnt+until],shape) - cnt += until - modelB.append(subNG) - # bias vector - elif np.array(variable).ndim == 2: - until = np.array(variable).shape[0]*np.array(variable).shape[1] - subNG = np.reshape(natural_gradient[cnt:cnt+until],shape) - cnt += until - modelW.append(subNG) - - a_batch_one_hot = np.eye(self.action_dim)[np.concatenate(a_batch, axis=0).tolist()] - - policy_loss, entropy, all_loss, optimise = self.enac.train( \ - np.concatenate(np.array(s_batch), axis=0).tolist(), a_batch_one_hot, \ - modelW[0], modelB[0], modelW[1], modelB[1], modelW[2], modelB[2] \ - ) - - norm_p_l, ent, norm_all_l = \ - policy_loss/float(batch_size), \ - entropy/float(batch_size), all_loss/float(batch_size) - - self.savePolicyInc() # self.out_policy_file) - - def savePolicy(self, FORCE_SAVE=False): - """ - Does not use this, cause it will be called from agent after every episode. - we want to save the policy only periodically. 
- """ - pass - - def savePolicyInc(self, FORCE_SAVE=False): - """ - save model and replay buffer - """ - if self.episodecount % self.save_step == 0: - self.enac.save_network(self.out_policy_file+'.enac.ckpt') - - f = open(self.out_policy_file+'.episode', 'wb') - for obj in [self.samplecount, self.episodes[self.domainString]]: - pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL) - f.close() - #logger.info("Saving model to %s and replay buffer..." % save_path) - - def saveStats(self, FORCE_SAVE=False): - f = open(self.out_policy_file + '.stats', 'wb') - pickle.dump(self.stats, f, protocol=pickle.HIGHEST_PROTOCOL) - f.close() - - def loadPolicy(self, filename): - """ - load model and replay buffer - """ - # load models - self.enac.load_network(filename+'.enac.ckpt') - - # load replay buffer - try: - print('load from: ', filename) - f = open(filename+'.episode', 'rb') - loaded_objects = [] - for i in range(2): # load nn params and collected data - loaded_objects.append(pickle.load(f)) - self.samplecount = int(loaded_objects[0]) - self.episodes[self.domainString] = copy.deepcopy(loaded_objects[1]) - logger.info("Loading both model from %s and replay buffer..." % filename) - f.close() - except: - logger.info("Loading only models...") - - def restart(self): - self.summaryAct = None - self.lastSystemAction = None - self.prevbelief = None - self.actToBeRecorded = None - self.epsilon = self.epsilon_start - (self.epsilon_start - self.epsilon_end) * float(self.episodeNum+self.episodecount) / float(self.maxiter) - self.episode_ave_max_q = [] - -#END OF FILE diff --git a/policy/feudalgainRL/FeudalNoisyACERPolicy.py b/policy/feudalgainRL/FeudalNoisyACERPolicy.py index 732ee8a0d2528e5773a271c3e915db312cbfd6d2..f0608074ea428fe12d47a51c3b35fe606d2a0053 100644 --- a/policy/feudalgainRL/FeudalNoisyACERPolicy.py +++ b/policy/feudalgainRL/FeudalNoisyACERPolicy.py @@ -20,47 +20,24 @@ # ############################################################################### -''' -ACERPolicy.py - ACER - Actor Critic with Experience Replay -================================================== - -Copyright CUED Dialogue Systems Group 2015 - 2017 - -.. seealso:: CUED Imports/Dependencies: - - import :class:`Policy` - import :class:`utils.ContextLogger` - -.. warning:: - Documentation not done. 
- - -************************ - -''' import copy -import os -import json import numpy as np import scipy import scipy.signal -from scipy.stats import entropy import pickle as pickle -import random import utils -from policy.feudalgainRL.NoisyACERPolicy import NoisyACERPolicy -from utils.Settings import config as cfg -from utils import ContextLogger, DiaAct - import ontology.FlatOntologyManager as FlatOnt import tensorflow as tf -from policy.DRL.replay_buffer_episode_acer import ReplayBufferEpisode -from policy.DRL.replay_prioritised_episode import ReplayPrioritisedEpisode -import policy.DRL.utils as drlutils -#from policy.SACERPolicy import SACERPolicy import policy.feudalgainRL.noisyacer as noisy_acer import policy.Policy import policy.SummaryAction + +from policy.feudalgainRL.NoisyACERPolicy import NoisyACERPolicy +from scipy.stats import entropy +from utils.Settings import config as cfg +from utils import ContextLogger +from policy.DRL.replay_buffer_episode_acer import ReplayBufferEpisode +from policy.DRL.replay_prioritised_episode import ReplayPrioritisedEpisode from policy.Policy import TerminalAction, TerminalState from policy.feudalgainRL.DIP_parametrisation import DIP_state, padded_state @@ -75,8 +52,7 @@ class FeudalNoisyACERPolicy(NoisyACERPolicy): '''Derived from :class:`Policy` ''' def __init__(self, in_policy_file, out_policy_file, domainString='CamRestaurants', is_training=False, - action_names=None, slot=None, sd_state_dim=50, js_threshold=1.0, info_reward=0.0, load_policy=True, - critic_regularizer_weight=0): + action_names=None, slot=None, sd_state_dim=50, load_policy=True): super(FeudalNoisyACERPolicy, self).__init__(in_policy_file, out_policy_file, domainString, is_training) tf.reset_default_graph() @@ -87,14 +63,9 @@ class FeudalNoisyACERPolicy(NoisyACERPolicy): self.accum_belief = [] self.prev_state_check = None self.sd_state_dim = sd_state_dim - self.info_reward = info_reward - self.js_threshold = js_threshold - if self.js_threshold != 1.0: - print("We train with JS-divergence, threshold =", self.js_threshold) self.domainString = domainString self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString) - self.critic_regularizer_weight = critic_regularizer_weight self.features = 'dip' self.sd_enc_size = 80 @@ -124,9 +95,6 @@ class FeudalNoisyACERPolicy(NoisyACERPolicy): if self.sample_argmax: print("We sample argmax") - #self.log_path = cfg.get('exec_config', 'logfiledir') - #self.log_path = self.log_path + f"/{in_policy_file.split('/')[-1].split('.')[0]}-seed{self.randomseed}.npy" - self.load_policy = load_policy # init session @@ -141,8 +109,7 @@ class FeudalNoisyACERPolicy(NoisyACERPolicy): self.episodes[self.domainString] = ReplayBufferEpisode(self.capacity, self.minibatch_size, self.randomseed) elif self.replay_type == 'prioritized': self.episodes[self.domainString] = ReplayPrioritisedEpisode(self.capacity, self.minibatch_size, self.randomseed) - #replay_buffer = ReplayBuffer(self.capacity, self.randomseed) - #self.episodes = [] + self.samplecount = 0 self.episodecount = 0 @@ -151,40 +118,24 @@ class FeudalNoisyACERPolicy(NoisyACERPolicy): self.summaryaction = policy.SummaryAction.SummaryAction(domainString) self.action_names = action_names self.action_dim = len(self.action_names) - action_bound = len(self.action_names) self.stats = [0 for _ in range(self.action_dim)] self.global_mu = [0. 
for _ in range(self.action_dim)] - if self.features == 'dip': - if self.actfreq_ds: - if self.domainString == 'CamRestaurants': - self.state_dim += 9#16 - elif self.domainString == 'SFRestaurants': - self.state_dim += 9#25 - elif self.domainString == 'Laptops11': - self.state_dim += 9#40 - self.sacer = noisy_acer.NoisyACERNetwork(self.sess, self.state_dim, self.action_dim, self.critic_lr, self.delta, - self.c, self.alpha, self.h1_size, self.h2_size, self.is_training) - elif self.features == 'learned' or self.features == 'rnn': - si_state_dim = 73 - if self.actfreq_ds: - if self.domainString == 'CamRestaurants': - si_state_dim += 9#16 - elif self.domainString == 'SFRestaurants': - si_state_dim += 9#25 - elif self.domainString == 'Laptops11': - si_state_dim += 9#40 - - self.state_dim = si_state_dim - self.sacer = noisy_acer.NoisyACERNetwork(self.sess, self.state_dim, self.action_dim, - self.critic_lr, self.delta, self.c, self.alpha, self.h1_size, - self.h2_size, self.is_training, temperature=self.temperature, - critic_regularizer_weight=self.critic_regularizer_weight, - noisy_acer=self.noisy_acer) - - else: - logger.error('features "{}" not implemented'.format(self.features)) + si_state_dim = 73 + if self.actfreq_ds: + if self.domainString == 'CamRestaurants': + si_state_dim += 9#16 + elif self.domainString == 'SFRestaurants': + si_state_dim += 9#25 + elif self.domainString == 'Laptops11': + si_state_dim += 9#40 + + self.state_dim = si_state_dim + self.sacer = noisy_acer.NoisyACERNetwork(self.sess, self.state_dim, self.action_dim, + self.critic_lr, self.delta, self.c, self.alpha, self.h1_size, + self.h2_size, self.is_training, + noisy_acer=self.noisy_acer) # when all models are defined, init all variables init_op = tf.global_variables_initializer() @@ -262,10 +213,6 @@ class FeudalNoisyACERPolicy(NoisyACERPolicy): logger.warning("record attempted to be finalized for domain where nothing has been recorded before") return - #print 'Episode Avg_Max_Q', float(self.episode_ave_max_q)/float(self.episodes[domainInControl].size()) - #print 'Episode Avg_Max_Q', np.mean(self.episode_ave_max_q) - #print self.stats - # normalising total return to -1~1 reward /= 20.0 @@ -286,24 +233,8 @@ class FeudalNoisyACERPolicy(NoisyACERPolicy): ######################################################################### return discounted_r_episode, advantage - if self.replay_type == 'vanilla': - self.episodes[domainInControl].record(state=terminal_state, \ - state_ori=TerminalState(), action=terminal_action, reward=reward, value=value, terminal=True, distribution=None) - elif self.replay_type == 'prioritized': - episode_r, episode_v = self.episodes[domainInControl].record_final_and_get_episode(state=terminal_state, \ - state_ori=TerminalState(), - action=terminal_action, - reward=reward, - value=value) - - # TD_error is a list of td error in the current episode - _, TD_error = calculate_discountR_advantage(episode_r, episode_v) - episodic_TD = np.mean(np.absolute(TD_error)) - print('episodic_TD') - print(episodic_TD) - self.episodes[domainInControl].insertPriority(episodic_TD) - - return + self.episodes[domainInControl].record(state=terminal_state, \ + state_ori=TerminalState(), action=terminal_action, reward=reward, value=value, terminal=True, distribution=None) def compute_responsible_q(self, inputs, actions, mask): return self.sacer.compute_responsible_q(inputs, actions, mask) @@ -363,12 +294,6 @@ class FeudalNoisyACERPolicy(NoisyACERPolicy): USE_GLOBAL_MU = False self.episode_ct += 1 - # new_noise_man_array = 
np.expand_dims(np.array(self.sacer.compute_mean_noisy()), axis=0) - # if os.path.exists(self.log_path): - # noise_mean_array = np.load(self.log_path) - # new_noise_man_array = np.concatenate((noise_mean_array, new_noise_man_array), axis=0) - # np.save(self.log_path, new_noise_man_array) - if not self.is_training: logger.info("Not in training mode") return @@ -379,7 +304,6 @@ class FeudalNoisyACERPolicy(NoisyACERPolicy): logger.info("Sample Num so far: %s" % (self.samplecount)) logger.info("Episode Num so far: %s" % (self.episodecount)) if self.samplecount >= self.minibatch_size * 3 and self.episodecount % self.training_frequency == 0: - #if self.episodecount % self.training_frequency == 0: logger.info('start trainig...') for _ in range(self.train_iters_per_episode): @@ -395,33 +319,6 @@ class FeudalNoisyACERPolicy(NoisyACERPolicy): assert False # not implemented yet s_batch = [[state_tuple[0] for state_tuple in epi] for epi in s_batch_full] - s_batch_beliefstate = [[state_tuple[1] for state_tuple in epi] for epi in s_batch_full] - s_batch_chosen_slot = [[state_tuple[2] for state_tuple in epi] for epi in s_batch_full] - - s2_batch_beliefstate = [[state_tuple[1] for state_tuple in epi] for epi in s2_batch_full] - - js_divergence_batch = [] - - if self.js_threshold < 1.0: - #TODO: This is probably highly inefficient - for epi_s, epi_s2, epi_slot in zip(s_batch_beliefstate, s2_batch_beliefstate, s_batch_chosen_slot): - for belief, belief2, slot in zip(epi_s, epi_s2, epi_slot): - if slot != "None": - keys = belief['beliefs'][slot].keys() - - b = [belief['beliefs'][slot]['**NONE**']] + \ - [belief['beliefs'][slot][value] for value in list(keys) if value != '**NONE**'] - - b_2 = [belief2['beliefs'][slot]['**NONE**']] + \ - [belief2['beliefs'][slot][value] for value in list(keys) if value != '**NONE**'] - - js_divergence = self.compute_js_divergence(b, b_2) - js_divergence_batch.append(js_divergence) - else: - js_divergence_batch.append(0.0) - - js_divergence_batch = [int(x > self.js_threshold) for x in js_divergence_batch] - js_divergence_batch = 2/20 * np.array(js_divergence_batch) #normalizing bound to [0, 2] and then /20 discounted_r_batch = [] advantage_batch = [] @@ -467,30 +364,14 @@ class FeudalNoisyACERPolicy(NoisyACERPolicy): a_batch_one_hot = np.eye(self.action_dim)[np.concatenate(a_batch, axis=0).tolist()] - if self.js_threshold < 1.0: - r_batch_concatenated = np.concatenate(np.array(r_batch), axis=0) + js_divergence_batch - else: - r_batch_concatenated = np.concatenate(np.array(r_batch), axis=0) - - if critic_regularizer is not None: - critic_regularizer_q = critic_regularizer.compute_responsible_q( - np.concatenate(np.array(s_batch), axis=0).tolist(), a_batch_one_hot, - np.concatenate(np.array(mask_batch), axis=0).tolist()) - - loss, entropy, optimize = \ - self.sacer.train(np.concatenate(np.array(s_batch), axis=0).tolist(), a_batch_one_hot, - np.concatenate(np.array(mask_batch), axis=0).tolist(), - r_batch_concatenated, s_batch, r_batch, self.gamma, - np.concatenate(np.array(mu_policy), axis=0), - discounted_r_batch, advantage_batch, - critic_regularizer_output=critic_regularizer_q) - else: - loss, entropy, optimize = \ - self.sacer.train(np.concatenate(np.array(s_batch), axis=0).tolist(), a_batch_one_hot, - np.concatenate(np.array(mask_batch), axis=0).tolist(), - r_batch_concatenated, s_batch, r_batch, self.gamma, - np.concatenate(np.array(mu_policy), axis=0), - discounted_r_batch, advantage_batch) + r_batch_concatenated = np.concatenate(np.array(r_batch), axis=0) + + loss, 
entropy, optimize = \ + self.sacer.train(np.concatenate(np.array(s_batch), axis=0).tolist(), a_batch_one_hot, + np.concatenate(np.array(mask_batch), axis=0).tolist(), + r_batch_concatenated, s_batch, r_batch, self.gamma, + np.concatenate(np.array(mu_policy), axis=0), + discounted_r_batch, advantage_batch) ent, norm_loss = entropy/float(batch_size), loss/float(batch_size) diff --git a/policy/feudalgainRL/FeudalNoisyDQNPolicy.py b/policy/feudalgainRL/FeudalNoisyDQNPolicy.py index 3fffadbb49839298c03157c5d4e11cdab6add668..7ba55b8b42d243eb29d99210f7918c669611a6fe 100644 --- a/policy/feudalgainRL/FeudalNoisyDQNPolicy.py +++ b/policy/feudalgainRL/FeudalNoisyDQNPolicy.py @@ -95,7 +95,7 @@ class FeudalDQNPolicy(policy.DQNPolicy.DQNPolicy): if self.jsd_function is not None: print("We use the JSD-function", self.jsd_function) if self.js_threshold != 1.0 and not self.jsd_reward: - print("We use JS-divergence, threshold =", self.js_threshold) + print("We use Information Gain with JS-divergence, threshold =", self.js_threshold) if self.jsd_reward: print("We train with raw JSD reward.") self.slots = slot @@ -361,7 +361,7 @@ class FeudalDQNPolicy(policy.DQNPolicy.DQNPolicy): s2_batch_dipstate = np.vstack([s[3] for s in s2_batch]) t_batch_new = t_batch - if self.js_threshold < 1.0 or self.jsd_reward: + if self.js_threshold < 1.0: js_divergence_batch = [] for belief, belief2, slot in zip(s_batch_beliefstate, s2_batch_beliefstate, s_batch_chosen_slot): if slot != "None": @@ -380,13 +380,7 @@ class FeudalDQNPolicy(policy.DQNPolicy.DQNPolicy): else: js_divergence_batch = [0] * len(r_batch_new) - tanh_n = np.tanh(1) - if self.jsd_reward: - if self.jsd_function == 'tanh': - js_divergence_batch = np.tanh(np.array(js_divergence_batch)) / tanh_n - #normalize jsd between -1 and 1 - js_divergence_batch = (-1 + 2 * np.array(js_divergence_batch)).tolist() - elif self.js_threshold < 1.0: + if self.js_threshold < 1.0: # normalizing bound to [0, 2] and then /20 js_divergence_batch = [2/20 * int(x > self.js_threshold) for x in js_divergence_batch] @@ -404,18 +398,12 @@ class FeudalDQNPolicy(policy.DQNPolicy.DQNPolicy): else: if self.q_update == 'single': action_Q = target_q[k] - if self.jsd_reward: - Q_bootstrap_label = js_divergence_batch[k] + self.gamma * np.max(action_Q) - else: - Q_bootstrap_label = r_batch_new[k] + js_divergence_batch[k] + self.gamma * np.max(action_Q) + Q_bootstrap_label = r_batch_new[k] + js_divergence_batch[k] + self.gamma * np.max(action_Q) elif self.q_update == 'double': action_Q = action_q[k] argmax_tuple = np.unravel_index(np.argmax(action_Q, axis=None), action_Q.shape) value_Q = target_q[k][argmax_tuple] - if not self.jsd_reward: - Q_bootstrap_label = r_batch_new[k] + js_divergence_batch[k] + self.gamma * value_Q - else: - Q_bootstrap_label = js_divergence_batch[k] + self.gamma * value_Q + Q_bootstrap_label = r_batch_new[k] + js_divergence_batch[k] + self.gamma * value_Q y_i.append(Q_bootstrap_label) diff --git a/policy/feudalgainRL/NoisyACERPolicy.py b/policy/feudalgainRL/NoisyACERPolicy.py index 5854756136445216cb3f58ce3ffb1569d576f1f4..6d5e5920386991907e2b72665cc1dc5066e83d1c 100644 --- a/policy/feudalgainRL/NoisyACERPolicy.py +++ b/policy/feudalgainRL/NoisyACERPolicy.py @@ -21,6 +21,9 @@ ############################################################################### ''' + +Using Noisy Networks for the following implementation: + ACERPolicy.py - Sample Efficient Actor Critic with Experience Replay ================================================== @@ -49,19 +52,17 @@ import pickle as 
pickle import copy import json import numpy as np -import os import random import scipy import scipy.signal import tensorflow as tf - +import ontology.FlatOntologyManager as FlatOnt +import utils import policy.feudalgainRL.noisyacer as noisy_acer -#from policy.DRL import replay_policy as replay_policy -from policy.DRL import utils as drlutils + from policy import Policy from policy import SummaryAction -import ontology.FlatOntologyManager as FlatOnt -import utils +from policy import MasterAction from policy.DRL.replay_buffer_episode_acer import ReplayBufferEpisode from policy.DRL.replay_prioritised_episode import ReplayPrioritisedEpisode from policy.Policy import TerminalAction, TerminalState @@ -158,21 +159,11 @@ class NoisyACERPolicy(Policy.Policy): self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString) self.load_buffer = True - if cfg.has_option('policy', 'bootstrap_buffer'): - self.load_buffer = cfg.getboolean('policy', 'bootstrap_buffer') - print("SACER: BOOTSTRAP BUFFER: ", self.load_buffer) - self.load_policy = True - if cfg.has_option('policy', 'bootstrap_master_policy'): - self.load_policy = cfg.getboolean('policy', 'bootstrap_master_policy') - print("SACER: BOOTSTRAP Policy: ", self.load_policy) # parameter settings - if 0: # cfg.has_option('dqnpolicy', 'n_in'): #ic304: this was giving me a weird error, disabled it until i can check it deeper - self.n_in = cfg.getint('dqnpolicy', 'n_in') - else: - self.n_in = self.get_n_in(domainString) + self.n_in = self.get_n_in(domainString) self.actor_lr = 0.0001 if cfg.has_option('dqnpolicy', 'actor_lr'): @@ -295,10 +286,6 @@ class NoisyACERPolicy(Policy.Policy): if cfg.has_option('policy', 'save_step'): self.save_step = cfg.getint('policy', 'save_step') - self.temperature = 0.0 - if cfg.has_option('policy', 'temperature'): - self.temperature = cfg.getfloat('policy', 'temperature') - self.behaviour_cloning = False if cfg.has_option('policy', 'behaviour_cloning'): self.behaviour_cloning = cfg.getboolean('policy', 'behaviour_cloning') @@ -471,8 +458,7 @@ class NoisyACERPolicy(Policy.Policy): self.global_mu = [0. for _ in range(self.action_dim)] self.sacer = noisy_acer.NoisyACERNetwork(self.sess, self.state_dim, self.action_dim, self.critic_lr, self.delta, - self.c, self.alpha, self.h1_size, self.h2_size, self.is_training, - temperature=self.temperature) + self.c, self.alpha, self.h1_size, self.h2_size, self.is_training) #if self.optimize_ER: # self.replay_policy = replay_policy.ReplayPolicy(self.sess, seed=self.randomseed) @@ -685,9 +671,7 @@ class NoisyACERPolicy(Policy.Policy): if self.master_space: beliefstate = beliefstate.getDomainState(self.domainUtil.domainString) - print("MASTER ACTION: ", self.masteraction.action_names[nextaIdex]) masterAct = self.masteraction.Convert(beliefstate, self.masteraction.action_names[nextaIdex], self.lastSystemAction) - print("MASTER ACT: ", masterAct) else: summaryAct = self.summaryaction.action_names[nextaIdex] beliefstate = beliefstate.getDomainState(self.domainUtil.domainString) diff --git a/policy/feudalgainRL/README.md b/policy/feudalgainRL/README.md new file mode 100644 index 0000000000000000000000000000000000000000..242b15337485f14caa150a3029c92eacdf0926fe --- /dev/null +++ b/policy/feudalgainRL/README.md @@ -0,0 +1,45 @@ +## FeudalGain + +This is the implementation to the work **What does the User want? Information Gain for Hierarchical Dialogue Policy Optimisation**, published at ASRU2021. 
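+
+FeudalGain gives the information policy \pi_i an intrinsic information gain reward: the Jensen-Shannon divergence between a slot's belief distribution before and after a turn is thresholded by js_threshold and scaled by 2/20 to match the normalised turn reward (see FeudalNoisyDQNPolicy.py). Below is a minimal sketch of that computation; the helper names are illustrative (the repository uses its own compute_js_divergence) and belief distributions are assumed to be plain probability lists:
+
+```python
+import numpy as np
+from scipy.stats import entropy  # entropy(p, q) computes the KL divergence KL(p || q)
+
+def js_divergence(p, q):
+    # Jensen-Shannon divergence between two discrete distributions
+    p, q = np.asarray(p, dtype=float), np.asarray(q, dtype=float)
+    m = 0.5 * (p + q)
+    return 0.5 * entropy(p, m) + 0.5 * entropy(q, m)
+
+def information_gain_reward(belief_before, belief_after, js_threshold=0.2):
+    # Thresholded JS-divergence, scaled by 2/20 like the normalised environment reward
+    jsd = js_divergence(belief_before, belief_after)
+    return 2.0 / 20.0 * float(jsd > js_threshold)
+
+# Example: the belief over a slot sharpens after the user informs a value
+print(information_gain_reward([0.9, 0.05, 0.05], [0.1, 0.85, 0.05]))  # 0.1
+```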
+
+Reference:
+
+#### Train a FeudalGain policy
+
+First, create a python3 virtual environment and run **pip install -r requirements** to install the required Python packages.
+
+Config files for all environments are located in pydial3-public/policy/feudalgainRL/configs. To start a training run, choose one of the config files and run the following command from the main repository:
+
+```
+python3 pydial.py train config_path/config.cfg
+```
+
+You can change parameter settings in the config files as needed. The most important parameters are:
+
+```
+[policy]
+noisy_acer=True/False: use noisy networks for policy \pi_mg (or for \pi_m and \pi_g)
+use_pass=True/False: train the information policy \pi_i on transitions where the action pass() was taken. Deactivate this if information gain is used; activate it if vanilla Feudal is used for training.
+
+[feudalpolicy]
+only_master = True/False: True means that only the merged policy \pi_mg is used; set to False to use \pi_m and \pi_g
+js_threshold = 0.2: threshold for the information gain reward calculated from the JS-divergence. If set to 1.0, information gain is disabled and \pi_i is trained with the external reward.
+
+[dqnpolicy]
+architecture = noisy_duel/duel: use noisy_duel for the noisy network architecture
+```
+
+If you want to use the vanilla Feudal algorithm, set the parameters in the config as follows:
+
+```
+[policy]
+noisy_acer=False
+use_pass=True
+
+[feudalpolicy]
+only_master = False
+js_threshold = 1.0
+
+[dqnpolicy]
+architecture = duel
+```
\ No newline at end of file
diff --git a/policy/feudalgainRL/configs/env1-CR-FeudalGain.cfg b/policy/feudalgainRL/configs/env1-CR-FeudalGain.cfg
new file mode 100644
index 0000000000000000000000000000000000000000..c393c0c1a9c21aa616cfd4d02d863e693b5ce68f
--- /dev/null
+++ b/policy/feudalgainRL/configs/env1-CR-FeudalGain.cfg
@@ -0,0 +1,105 @@
+# Error model: 0% error rate, additive confscorer, sampled nbestgenerator
+# User model: standard sampled params, sampled patience
+# Masks: on
+
+###### General parameters ######
+[GENERAL]
+# Set to "SFRestaurants" or "Laptops11"
+domains = CamRestaurants
+singledomain = True
+tracedialog = 0
+seed = 0
+
+[exec_config]
+configdir = _benchmarkpolicies/env1-feudalgain
+logfiledir = _benchmarklogs/env1-feudalgain
+numtrainbatches = 20
+traindialogsperbatch = 200
+numbatchtestdialogs = 500
+trainsourceiteration = 0
+numtestdialogs = 500
+trainerrorrate = 0
+testerrorrate = 0
+testeverybatch = True
+deleteprevpolicy = True
+
+[logging]
+usecolor = False
+screen_level = results
+file_level = results
+file = auto
+
+###### Environment parameters ######
+
+[agent]
+maxturns = 25
+
+[usermodel]
+usenewgoalscenarios = True
+oldstylepatience = False
+patience = 4,6
+configfile = config/sampledUM.cfg
+
+[errormodel]
+nbestsize = 1
+confusionmodel = RandomConfusions
+nbestgeneratormodel = SampledNBestGenerator
+confscorer = additive
+
+[summaryacts]
+maxinformslots = 5
+informmask = True
+requestmask = True
+informcountaccepted = 4
+byemask = True
+
+###### Dialogue Manager parameters ######
+[policy]
+policydir = _benchmarkpolicies/env1-feudalgain
+belieftype = focus
+useconfreq = False
+learning = True
+policytype = feudalgain
+startwithhello = False
+inpolicyfile = auto
+outpolicyfile = auto
+# Set noisy_acer=False for vanilla neural networks
+noisy_acer = True
+# Set use_pass=True if transitions where pass() action were taken should be used. Always False if InformationGain is used.
+use_pass = False + +[feudalpolicy] +features=learned +si_policy_type=acer +# only_master=True means that we use only policy pi_mg, set to False if you want to use pi_m and pi_g +only_master = True +# Set the threshold for information gain reward calculated by JS-divergence. If set to 1.0, we do not use InformationGain. +js_threshold = 0.2 + +[dqnpolicy] +q_update = double +# set architecture=duel for vanilla neural networks +architecture = noisy_duel +h1_size = 300 +h2_size = 100 +capacity = 2000 +beta = 0.95 +epsilon_start = 0.3 +maxiter = 4000 +minibatch_size = 64 +is_threshold = 5.0 +episodeNum = 0.0 +epsilon_end = 0.0 +n_in = 268 +features = ["discourseAct", "method", "requested", "full", "lastActionInformNone", "offerHappened", "inform_info"] + +###### Evaluation parameters ###### + +[eval] +rewardvenuerecommended=0 +penaliseallturns = True +wrongvenuepenalty = 0 +notmentionedvaluepenalty = 0 +successmeasure = objective +successreward = 20 + diff --git a/policy/feudalgainRL/configs/env2-CR-FeudalGain.cfg b/policy/feudalgainRL/configs/env2-CR-FeudalGain.cfg new file mode 100644 index 0000000000000000000000000000000000000000..779f0eeb8f558f4827836a273e1ddf59a345f47a --- /dev/null +++ b/policy/feudalgainRL/configs/env2-CR-FeudalGain.cfg @@ -0,0 +1,106 @@ +# Error model: 15% error rate, DSTC2 confscorer, DSTC2 nbestgenerator +# User model: standard sampled params, sampled patience +# Masks: off + +###### General parameters ###### +[GENERAL] +# Set to "SFRestaurants" or "Laptops11" +domains = CamRestaurants +singledomain = True +tracedialog = 0 +seed = 0 + +[exec_config] +configdir = _benchmarkpolicies/env2-feudalgain +logfiledir = _benchmarklogs/env2-feudalgain +numtrainbatches = 20 +traindialogsperbatch = 200 +numbatchtestdialogs = 500 +trainsourceiteration = 0 +numtestdialogs = 500 +trainerrorrate = 0 +testerrorrate = 0 +testeverybatch = True +deleteprevpolicy = True + +[logging] +usecolor = False +screen_level = results +file_level = results +file = auto + +###### Environment parameters ###### + +[agent] +maxturns = 25 + +[usermodel] +usenewgoalscenarios = True +oldstylepatience = False +patience = 4,6 +configfile = config/sampledUM.cfg + +[errormodel] +nbestsize = 1 +confusionmodel = RandomConfusions +nbestgeneratormodel = SampledNBestGenerator +confscorer = additive + + +[summaryacts] +maxinformslots = 5 +informmask = False +requestmask = False +informcountaccepted = 4 +byemask = False + +###### Dialogue Manager parameters ###### +[policy] +policydir = _benchmarkpolicies/env2-feudalgain +belieftype = focus +useconfreq = False +learning = True +policytype = feudalgain +startwithhello = False +inpolicyfile = auto +outpolicyfile = auto +# Set noisy_acer=False for vanilla neural networks +noisy_acer = True +# Set use_pass=True if transitions where pass() action were taken should be used. Always False if InformationGain is used. +use_pass = False + +[feudalpolicy] +features=learned +si_policy_type=acer +# only_master=True means that we use only policy pi_mg, set to False if you want to use pi_m and pi_g +only_master = True +# Set the threshold for information gain reward calculated by JS-divergence. If set to 1.0, we do not use InformationGain. 
+js_threshold = 0.2 + +[dqnpolicy] +q_update = double +# set architecture=duel for vanilla neural networks +architecture = noisy_duel +h1_size = 300 +h2_size = 100 +capacity = 2000 +beta = 0.95 +epsilon_start = 0.3 +maxiter = 4000 +minibatch_size = 64 +is_threshold = 5.0 +episodeNum = 0.0 +epsilon_end = 0.0 +n_in = 268 +features = ["discourseAct", "method", "requested", "full", "lastActionInformNone", "offerHappened", "inform_info"] + +###### Evaluation parameters ###### + +[eval] +rewardvenuerecommended=0 +penaliseallturns = True +wrongvenuepenalty = 0 +notmentionedvaluepenalty = 0 +successmeasure = objective +successreward = 20 + diff --git a/feudalconfig.cfg b/policy/feudalgainRL/configs/env3-CR-FeudalGain.cfg similarity index 73% rename from feudalconfig.cfg rename to policy/feudalgainRL/configs/env3-CR-FeudalGain.cfg index cb24e8f4c4fc8173412f33b7db0eacc779d297f9..c85b4e8d71aa192d8d35a505ceca2baa5bab6f25 100644 --- a/feudalconfig.cfg +++ b/policy/feudalgainRL/configs/env3-CR-FeudalGain.cfg @@ -4,14 +4,15 @@ ###### General parameters ###### [GENERAL] +# Set to "SFRestaurants" or "Laptops11" domains = CamRestaurants singledomain = True tracedialog = 0 -seed = 1 +seed = 0 [exec_config] -configdir = _benchmarkpolicies/env3-feudal -logfiledir = _benchmarklogs/env3-feudal +configdir = _benchmarkpolicies/env3-feudalgain +logfiledir = _benchmarklogs/env3-feudalgain numtrainbatches = 20 traindialogsperbatch = 200 numbatchtestdialogs = 500 @@ -56,7 +57,7 @@ byemask = True ###### Dialogue Manager parameters ###### [policy] -policydir = _benchmarkpolicies/env3-feudal +policydir = _benchmarkpolicies/env3-feudalgain belieftype = focus useconfreq = False learning = True @@ -64,33 +65,23 @@ policytype = feudalgain startwithhello = False inpolicyfile = auto outpolicyfile = auto -temperature = 0.0 +# Set noisy_acer=False for vanilla neural networks noisy_acer = True -sample_argmax = False +# Set use_pass=True if transitions where pass() action were taken should be used. Always False if InformationGain is used. +use_pass = False [feudalpolicy] features=learned si_policy_type=acer +# only_master=True means that we use only policy pi_mg, set to False if you want to use pi_m and pi_g only_master = True -jsd_reward = False -#jsd_function = tanh +# Set the threshold for information gain reward calculated by JS-divergence. If set to 1.0, we do not use InformationGain. 
js_threshold = 0.2 -js_threshold_master = 1 - -[i2a] -is_imaging = False -deepmind = False -load_pretrain_data = False -improve_env = False -share_layer = 2 -new_q_loss = False -device = cpu -env_model_path = env_model/env1_acer_200.pkl [dqnpolicy] q_update = double +# set architecture=duel for vanilla neural networks architecture = noisy_duel -#architecture = duel h1_size = 300 h2_size = 100 capacity = 2000 diff --git a/policy/feudalgainRL/configs/env4-CR-FeudalGain.cfg b/policy/feudalgainRL/configs/env4-CR-FeudalGain.cfg new file mode 100644 index 0000000000000000000000000000000000000000..1b9053b16ea9a46abf6b3eaf014e946f66f9134d --- /dev/null +++ b/policy/feudalgainRL/configs/env4-CR-FeudalGain.cfg @@ -0,0 +1,107 @@ +# Error model: 15% error rate, DSTC2 confscorer, DSTC2 nbestgenerator +# User model: standard sampled params, sampled patience +# Masks: off + +###### General parameters ###### +[GENERAL] +# Set to "SFRestaurants" or "Laptops11" +domains = CamRestaurants +singledomain = True +tracedialog = 0 +seed = 0 + +[exec_config] +configdir = _benchmarkpolicies/env4-feudalgain +logfiledir = _benchmarklogs/env4-feudalgain +numtrainbatches = 20 +traindialogsperbatch = 200 +numbatchtestdialogs = 500 +trainsourceiteration = 0 +numtestdialogs = 500 +trainerrorrate = 15 +testerrorrate = 15 +testeverybatch = True +deleteprevpolicy = True + +[logging] +usecolor = False +screen_level = results +file_level = results +file = auto + +###### Environment parameters ###### + +[agent] +maxturns = 25 + +[usermodel] +usenewgoalscenarios = True +oldstylepatience = False +patience = 4,6 +configfile = config/sampledUM.cfg + +[errormodel] +nbestsize = 5 +confusionmodel = LevenshteinConfusions +nbestgeneratormodel = DSTC2NBestGenerator +confscorer = DSTC2 +configfile = config/set1-ErrorModel.cfg + + +[summaryacts] +maxinformslots = 5 +informmask = False +requestmask = False +informcountaccepted = 4 +byemask = False + +###### Dialogue Manager parameters ###### +[policy] +policydir = _benchmarkpolicies/env4-feudalgain +belieftype = focus +useconfreq = False +learning = True +policytype = feudalgain +startwithhello = False +inpolicyfile = auto +outpolicyfile = auto +# Set noisy_acer=False for vanilla neural networks +noisy_acer = True +# Set use_pass=True if transitions where pass() action were taken should be used. Always False if InformationGain is used. +use_pass = False + +[feudalpolicy] +features=learned +si_policy_type=acer +# only_master=True means that we use only policy pi_mg, set to False if you want to use pi_m and pi_g +only_master = True +# Set the threshold for information gain reward calculated by JS-divergence. If set to 1.0, we do not use InformationGain. 
+js_threshold = 0.2 + +[dqnpolicy] +q_update = double +# set architecture=duel for vanilla neural networks +architecture = noisy_duel +h1_size = 300 +h2_size = 100 +capacity = 2000 +beta = 0.95 +epsilon_start = 0.3 +maxiter = 4000 +minibatch_size = 64 +is_threshold = 5.0 +episodeNum = 0.0 +epsilon_end = 0.0 +n_in = 268 +features = ["discourseAct", "method", "requested", "full", "lastActionInformNone", "offerHappened", "inform_info"] + +###### Evaluation parameters ###### + +[eval] +rewardvenuerecommended=0 +penaliseallturns = True +wrongvenuepenalty = 0 +notmentionedvaluepenalty = 0 +successmeasure = objective +successreward = 20 + diff --git a/policy/feudalgainRL/configs/env5-CR-FeudalGain.cfg b/policy/feudalgainRL/configs/env5-CR-FeudalGain.cfg new file mode 100644 index 0000000000000000000000000000000000000000..61bf9b1f5ed71d870e835a3ff4bb3075af2687e7 --- /dev/null +++ b/policy/feudalgainRL/configs/env5-CR-FeudalGain.cfg @@ -0,0 +1,107 @@ +# Error model: 15% error rate, DSTC2 confscorer, DSTC2 nbestgenerator +# User model: standard sampled params, sampled patience +# Masks: off + +###### General parameters ###### +[GENERAL] +# Set to "SFRestaurants" or "Laptops11" +domains = CamRestaurants +singledomain = True +tracedialog = 0 +seed = 0 + +[exec_config] +configdir = _benchmarkpolicies/env5-feudalgain +logfiledir = _benchmarklogs/env5-feudalgain +numtrainbatches = 20 +traindialogsperbatch = 200 +numbatchtestdialogs = 500 +trainsourceiteration = 0 +numtestdialogs = 500 +trainerrorrate = 15 +testerrorrate = 15 +testeverybatch = True +deleteprevpolicy = True + +[logging] +usecolor = False +screen_level = results +file_level = results +file = auto + +###### Environment parameters ###### + +[agent] +maxturns = 25 + +[usermodel] +usenewgoalscenarios = True +oldstylepatience = False +patience = 4,6 +configfile = config/unfriendlyUM.cfg + +[errormodel] +nbestsize = 5 +confusionmodel = LevenshteinConfusions +nbestgeneratormodel = DSTC2NBestGenerator +confscorer = DSTC2 +configfile = config/set1-ErrorModel.cfg + + +[summaryacts] +maxinformslots = 5 +informmask = True +requestmask = True +informcountaccepted = 4 +byemask = True + +###### Dialogue Manager parameters ###### +[policy] +policydir = _benchmarkpolicies/env5-feudalgain +belieftype = focus +useconfreq = False +learning = True +policytype = feudalgain +startwithhello = False +inpolicyfile = auto +outpolicyfile = auto +# Set noisy_acer=False for vanilla neural networks +noisy_acer = True +# Set use_pass=True if transitions where pass() action were taken should be used. Always False if InformationGain is used. +use_pass = False + +[feudalpolicy] +features=learned +si_policy_type=acer +# only_master=True means that we use only policy pi_mg, set to False if you want to use pi_m and pi_g +only_master = True +# Set the threshold for information gain reward calculated by JS-divergence. If set to 1.0, we do not use InformationGain. 
+js_threshold = 0.2 + +[dqnpolicy] +q_update = double +# set architecture=duel for vanilla neural networks +architecture = noisy_duel +h1_size = 300 +h2_size = 100 +capacity = 2000 +beta = 0.95 +epsilon_start = 0.3 +maxiter = 4000 +minibatch_size = 64 +is_threshold = 5.0 +episodeNum = 0.0 +epsilon_end = 0.0 +n_in = 268 +features = ["discourseAct", "method", "requested", "full", "lastActionInformNone", "offerHappened", "inform_info"] + +###### Evaluation parameters ###### + +[eval] +rewardvenuerecommended=0 +penaliseallturns = True +wrongvenuepenalty = 0 +notmentionedvaluepenalty = 0 +successmeasure = objective +successreward = 20 + diff --git a/policy/feudalgainRL/configs/env6-CR-FeudalGain.cfg b/policy/feudalgainRL/configs/env6-CR-FeudalGain.cfg new file mode 100644 index 0000000000000000000000000000000000000000..75fc7db5b663cd9172fe0ac987cd05177e0bf72f --- /dev/null +++ b/policy/feudalgainRL/configs/env6-CR-FeudalGain.cfg @@ -0,0 +1,106 @@ +# Error model: 15% error rate, DSTC2 confscorer, DSTC2 nbestgenerator +# User model: standard sampled params, sampled patience +# Masks: off + +###### General parameters ###### +[GENERAL] +# Set to "SFRestaurants" or "Laptops11" for SanFrancisco or Laptops domain +domains = CamRestaurants +singledomain = True +tracedialog = 0 +seed = 0 + +[exec_config] +configdir = _benchmarkpolicies/env6-feudalgain +logfiledir = _benchmarklogs/env6-feudalgain +numtrainbatches = 20 +traindialogsperbatch = 200 +numbatchtestdialogs = 500 +trainsourceiteration = 0 +numtestdialogs = 500 +trainerrorrate = 30 +testerrorrate = 30 +testeverybatch = True +deleteprevpolicy = True + +[logging] +usecolor = False +screen_level = results +file_level = results +file = auto + +###### Environment parameters ###### + +[agent] +maxturns = 25 + +[usermodel] +usenewgoalscenarios = True +oldstylepatience = False +patience = 4,6 +configfile = config/sampledUM.cfg + +[errormodel] +nbestsize = 5 +confusionmodel = LevenshteinConfusions +nbestgeneratormodel = DSTC2NBestGenerator +confscorer = DSTC2 +configfile = config/set3-ErrorModel.cfg + +[summaryacts] +maxinformslots = 5 +informmask = True +requestmask = True +informcountaccepted = 4 +byemask = True + +###### Dialogue Manager parameters ###### +[policy] +policydir = _benchmarkpolicies/env3-feudalgain +belieftype = focus +useconfreq = False +learning = True +policytype = feudalgain +startwithhello = False +inpolicyfile = auto +outpolicyfile = auto +# Set noisy_acer=False for vanilla neural networks +noisy_acer = True +# Set use_pass=True if transitions where pass() action were taken should be used. Always False if InformationGain is used. +use_pass = False + +[feudalpolicy] +features=learned +si_policy_type=acer +# only_master=True means that we use only policy pi_mg, set to False if you want to use pi_m and pi_g +only_master = True +# Set the threshold for information gain reward calculated by JS-divergence. If set to 1.0, we do not use InformationGain. 
+js_threshold = 0.2 + +[dqnpolicy] +q_update = double +# set architecture=duel for vanilla neural networks +architecture = noisy_duel +h1_size = 300 +h2_size = 100 +capacity = 2000 +beta = 0.95 +epsilon_start = 0.3 +maxiter = 4000 +minibatch_size = 64 +is_threshold = 5.0 +episodeNum = 0.0 +epsilon_end = 0.0 +n_in = 268 +features = ["discourseAct", "method", "requested", "full", "lastActionInformNone", "offerHappened", "inform_info"] + +###### Evaluation parameters ###### + +[eval] +rewardvenuerecommended=0 +penaliseallturns = True +wrongvenuepenalty = 0 +notmentionedvaluepenalty = 0 +successmeasure = objective +successreward = 20 + diff --git a/policy/feudalgainRL/dqn_latest.py b/policy/feudalgainRL/dqn_latest.py deleted file mode 100644 index f945067231ef7176b671fd6c5d35dea2599586e4..0000000000000000000000000000000000000000 --- a/policy/feudalgainRL/dqn_latest.py +++ /dev/null @@ -1,197 +0,0 @@ -############################################################################### -# PyDial: Multi-domain Statistical Spoken Dialogue System Software -############################################################################### -# -# Copyright 2015 - 2019 -# Cambridge University Engineering Department Dialogue Systems Group -# -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -############################################################################### - -""" -Implementation of DQAN - Deep Q Action Network - -The algorithm is developed with tflearn + Tensorflow - -Author: Pei-Hao Su -""" -import tensorflow as tf -import numpy as np -import tflearn - -from policy.DRL.replay_buffer import ReplayBuffer - -# =========================== -# Deep Q Action Network -# =========================== -class DeepQNetwork(object): - """ - Input to the network is the state and action, output is Q(s,a). - """ - def __init__(self, sess, state_dim, action_dim, learning_rate, tau, \ - num_actor_vars, architecture = 'duel', h1_size = 130, h2_size = 50): - self.sess = sess - self.s_dim = state_dim - self.a_dim = action_dim - self.learning_rate = learning_rate - self.tau = tau - self.architecture = architecture - self.h1_size = h1_size - self.h2_size = h2_size - - # Create the deep Q network - self.inputs, self.action, self.Qout = \ - self.create_ddq_network(self.architecture, self.h1_size, self.h2_size) - self.network_params = tf.trainable_variables() - - # Target Network - self.target_inputs, self.target_action, self.target_Qout = \ - self.create_ddq_network(self.architecture, self.h1_size, self.h2_size) - self.target_network_params = tf.trainable_variables()[len(self.network_params):] - - # Op for periodically updating target network - self.update_target_network_params = \ - [self.target_network_params[i].assign(\ - tf.mul(self.network_params[i], self.tau) + tf.mul(self.target_network_params[i], 1. 
- self.tau)) - for i in range(len(self.target_network_params))] - - # Network target (y_i) - self.sampled_q = tf.placeholder(tf.float32, [None, 1]) - - # Predicted Q given state and chosed action - #actions_one_hot = self.action - #self.pred_q = tf.reduce_sum(self.Qout * actions_one_hot, reduction_indices=1, name='q_acted') - self.pred_q = self.Qout - - self.diff = self.sampled_q - self.pred_q - - self.loss = tf.reduce_mean(self.clipped_error(self.diff), name='loss') - - self.optimizer = tf.train.AdamOptimizer(self.learning_rate) - self.optimize = self.optimizer.minimize(self.loss) - - def create_ddq_network(self, architecture = 'duel', h1_size = 130, h2_size = 50): - inputs = tf.placeholder(tf.float32, [None, self.s_dim]) - action = tf.placeholder(tf.float32, [None, self.a_dim]) - - # state network - W_fc1_s = tf.Variable(tf.truncated_normal([self.s_dim, h1_size], stddev=0.01)) - b_fc1_s = tf.Variable(tf.zeros([h1_size])) - h_fc1_s = tf.nn.relu(tf.matmul(inputs, W_fc1_s) + b_fc1_s) - - # action network - W_fc1_a = tf.Variable(tf.truncated_normal([self.a_dim, h1_size], stddev=0.01)) - b_fc1_a = tf.Variable(tf.zeros([h1_size])) - h_fc1_a = tf.nn.relu(tf.matmul(action, W_fc1_a) + b_fc1_a) - - - #h_fc1 = tf.nn.tanh(tf.matmul(inputs, W_fc1) + b_fc1) - #if architecture == 'duel': - if False: - - """ - W_fc2_s = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01)) - b_fc2_s = tf.Variable(tf.zeros([h2_size])) - h_fc2_s = tf.nn.relu(tf.matmul(h_fc1_s, W_fc2_s) + b_fc2_s) - - W_value = tf.Variable(tf.truncated_normal([h2_size, 1], stddev=0.01)) - b_value = tf.Variable(tf.zeros([1])) - value_out = tf.matmul(h_fc2_s, W_value) + b_value - - - - W_fc2_a = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01)) - b_fc2_a = tf.Variable(tf.zeros([h2_size])) - h_fc2_a = tf.nn.relu(tf.matmul(h_fc1_a, W_fc2_a) + b_fc2_a) - - Qout = tf.reduce_sum( tf.mul( h_fc2_s,h_fc2_a ), 1) - """ - - - # value function - W_value = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01)) - b_value = tf.Variable(tf.zeros([h2_size])) - h_value = tf.nn.relu(tf.matmul(h_fc1, W_value) + b_value) - - W_value = tf.Variable(tf.truncated_normal([h2_size, 1], stddev=0.01)) - b_value = tf.Variable(tf.zeros([1])) - value_out = tf.matmul(h_value, W_value) + b_value - - # advantage function - W_advantage = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01)) - b_advantage = tf.Variable(tf.zeros([h2_size])) - h_advantage = tf.nn.relu(tf.matmul(h_fc1, W_advantage) + b_advantage) - - W_advantage = tf.Variable(tf.truncated_normal([h2_size, self.a_dim], stddev=0.01)) - b_advantage = tf.Variable(tf.zeros([self.a_dim])) - Advantage_out = tf.matmul(h_advantage, W_advantage) + b_advantage - - Qout = value_out + (Advantage_out - tf.reduce_mean(Advantage_out, reduction_indices=1, keep_dims=True)) - - else: - W_fc2_s = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01)) - b_fc2_s = tf.Variable(tf.zeros([h2_size])) - h_fc2_s = tf.nn.relu(tf.matmul(h_fc1_s, W_fc2_s) + b_fc2_s) - - W_fc2_a = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01)) - b_fc2_a = tf.Variable(tf.zeros([h2_size])) - h_fc2_a = tf.nn.relu(tf.matmul(h_fc1_a, W_fc2_a) + b_fc2_a) - - # inner product of state s and action a - #Qout = tf.mul(h_fc2_s,h_fc2_a) - Qout = tf.reduce_sum( tf.mul( h_fc2_s,h_fc2_a ), 1) - #Qout = tf.reduce_sum( tf.mul( h_fc2_s,h_fc2_a ), 1, keep_dims=True ) - #Qout = tf.reduce_sum(tf.mul(h_fc2_s,h_fc2_a)) - - return inputs, action, Qout - - def train(self, inputs, action, sampled_q): - 
return self.sess.run([self.pred_q, self.optimize, self.loss], feed_dict={ - self.inputs: inputs, - self.action: action, - self.sampled_q: sampled_q - }) - - def predict(self, inputs, action): - #return self.sess.run(self.pred_q, feed_dict={ - return self.sess.run(self.Qout, feed_dict={ - self.inputs: inputs, - self.action: action - }) - - def predict_target(self, inputs, action): - #return self.sess.run(self.pred_q, feed_dict={ - return self.sess.run(self.target_Qout, feed_dict={ - self.target_inputs: inputs, - self.target_action: action - }) - - def update_target_network(self): - self.sess.run(self.update_target_network_params) - - def load_network(self, load_filename): - self.saver = tf.train.Saver() - try: - self.saver.restore(self.sess, load_filename) - print("Successfully loaded:", load_filename) - except: - print("Could not find old network weights") - - def save_network(self, save_filename): - print('Saving deepq-network...') - self.saver.save(self.sess, save_filename) - - def clipped_error(self, x): - return tf.select(tf.abs(x) < 1.0, 0.5 * tf.square(x), tf.abs(x) - 0.5) # condition, true, false diff --git a/policy/feudalgainRL/noisyacer.py b/policy/feudalgainRL/noisyacer.py index da52ad6ad9af57907094797589ee5cb9b954ab00..decaffab6d9d2cc3f15dc5134447d129b7202dc9 100644 --- a/policy/feudalgainRL/noisyacer.py +++ b/policy/feudalgainRL/noisyacer.py @@ -21,31 +21,26 @@ ############################################################################### """ -Implementation of ACER +Implementation of ACER with Noisy Networks The algorithm is developed with Tensorflow -Author: Gellert Weisz +Author: Gellert Weisz/Christian Geishauser """ import numpy as np import tensorflow as tf -from random import choice -from time import sleep -from time import time - -import sys # todo remove later # =========================== -# Soft Actor Critic with Experience Replay +# Actor Critic with Experience Replay # =========================== class NoisyACERNetwork(object): def __init__(self, sess, state_dim, action_dim, learning_rate, delta, c, alpha, h1_size=130, h2_size=50, - is_training = True, actfreq_loss=None, temperature=0, critic_regularizer_weight=0, noisy_acer=False): + is_training=True, actfreq_loss=None, noisy_acer=False): self.sess = sess self.s_dim = state_dim self.a_dim = action_dim @@ -55,9 +50,7 @@ class NoisyACERNetwork(object): else: self.master_space = False self.learning_rate = learning_rate - self.critic_regularizer_weight = critic_regularizer_weight - if self.critic_regularizer_weight != 0: - print(f"We use a regularizer for the critic with weight {self.critic_regularizer_weight}.") + self.delta = delta self.c = c self.noisy_acer = noisy_acer @@ -65,11 +58,6 @@ class NoisyACERNetwork(object): self.h1_size = h1_size self.h2_size = h2_size self.is_training = is_training - self.temperature = temperature - if self.temperature != 0: - print("Using soft ACER, temperature set to: ", self.temperature) - else: - print("Temperature of Maximum Entropy set to 0, using ACER.") #Input and hidden layers self.inputs = tf.placeholder(tf.float32, [None, self.s_dim]) @@ -94,7 +82,7 @@ class NoisyACERNetwork(object): self.avg_policy = tf.stop_gradient(self.avg_policy) # weighted average over q-values according to current policy gives the value of the state - self.value = tf.reduce_sum((self.q - self.temperature * tf.log(self.policy)) * self.policy, 1) + self.value = tf.reduce_sum(self.q * self.policy, 1) self.actions_onehot = self.actions self.responsible_outputs = tf.reduce_sum(self.policy * 
self.actions_onehot, [1]) @@ -109,7 +97,6 @@ class NoisyACERNetwork(object): self.q_ret = tf.placeholder(tf.float32, [None]) - # step 1 from pawel self.advantages_qret = self.q_ret - self.value self.wrt_theta_step1 = -tf.reduce_sum(tf.log(self.responsible_outputs) * tf.stop_gradient(self.rho * self.advantages_qret)) @@ -117,18 +104,12 @@ class NoisyACERNetwork(object): # step 2 from pawel self.wrt_theta = tf.reduce_sum( tf.log(self.responsible_outputs) * - tf.stop_gradient(self.rho_bar_c * (self.advantages_qret - self.temperature * (1 + tf.log(self.responsible_outputs)))) + - tf.reduce_sum(tf.log(self.policy) * - tf.stop_gradient(tf.maximum(0., 1. - self.c / self.rho_all) * - self.policy * - (self.q - tf.reshape(self.value, [-1, 1]) - self.temperature * (1 + tf.log(self.policy)))), [1])) + tf.stop_gradient(self.rho_bar_c * self.advantages_qret) + tf.reduce_sum(tf.log(self.policy) * + tf.stop_gradient(tf.maximum(0., 1. - self.c / self.rho_all) * self.policy * + (self.q - tf.reshape(self.value, [-1, 1]))), [1])) self.q_regularizer = tf.placeholder(tf.float32, [None]) - if self.critic_regularizer_weight != 0: - self.wrt_theta_v = tf.reduce_sum(tf.square(self.q_ret - self.responsible_q)) + \ - self.critic_regularizer_weight * tf.reduce_sum(tf.square(self.q_regularizer - self.responsible_q)) - else: - self.wrt_theta_v = tf.reduce_sum(tf.square(self.q_ret - self.responsible_q)) + self.wrt_theta_v = tf.reduce_sum(tf.square(self.q_ret - self.responsible_q)) self.entropy = -tf.reduce_sum(self.policy * tf.log(self.policy)) #self.loss = self.wrt_theta_v + self.wrt_theta - self.entropy * 0.01 @@ -166,7 +147,7 @@ class NoisyACERNetwork(object): else: self.final_gradients.append((-self.g[i][0], self.g[i][1])) # negative because this is loss - if self.temperature == 0 and not self.noisy_acer: + if not self.noisy_acer: self.optimize = [self.optimizer.apply_gradients(self.final_gradients), self.optimizer.apply_gradients(self.entropy_gradients), self.optimizer.apply_gradients(self.value_gradients) @@ -261,9 +242,6 @@ class NoisyACERNetwork(object): #self.behaviour_mask: behaviour_mask } - if self.critic_regularizer_weight != 0: - feed_dict[self.q_regularizer] = critic_regularizer_output - trpo_scale, klprod, kl, diff, entropy, loss, optimize = self.sess.run([self.trpo_scale, self.klprod, self.kl, self.advantage_qret_diff, self.entropy, self.loss, self.optimize], feed_dict=feed_dict) update_avg_theta = self.sess.run([self.update_avg_theta], feed_dict=feed_dict) diff --git a/pydial.py b/pydial.py index 606eb2382bddbcaa4e8645204de98c0102c77f50..ee827a042efcc02bd0ad57350972e486f6dcb415 100644 --- a/pydial.py +++ b/pydial.py @@ -825,6 +825,9 @@ def train_command(configfile, seed=None, trainerrorrate=None,trainsourceiteratio Optional parameters over-ride the corresponding config parameters of the same name. """ + os.makedirs("_benchmarklogs", exist_ok=True) + os.makedirs("_benchmarkpolicies", exist_ok=True) + try: if seed and seed.startswith('('): seeds = seed.replace('(', '').replace(')', '').split(',')
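The two os.makedirs calls added above ensure that the top-level _benchmarklogs and _benchmarkpolicies directories referenced by the configdir, logfiledir and policydir settings of the FeudalGain configs exist before training starts. A training run with one of the added configs can then be launched from the repository root, for example with `python3 pydial.py train policy/feudalgainRL/configs/env1-CR-FeudalGain.cfg`.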