diff --git a/config/set1-ErrorModel.cfg b/config/set1-ErrorModel.cfg new file mode 100644 index 0000000000000000000000000000000000000000..faf5d1a67b39d44e52326b21673bf532d78b7d4d --- /dev/null +++ b/config/set1-ErrorModel.cfg @@ -0,0 +1,10 @@ +### Error model parameters ### +### set1 ### +correctNBLenDist = [0.264, 0.215, 0.138, 0.099, 0.08] +correctMean = [0.871, 0.103, 0.045, 0.032, 0.027, 0.049] +correctVar = [0.029, 0.01, 0.002, 0.001, 0.0, 0.001] +incorrectNBLenDist = [0.3, 0.068, 0.085, 0.072, 0.079] +incorrectNBPosDist = [0.0, 0.179, 0.063, 0.027, 0.017] +incorrectMean = [0.757, 0.179, 0.077, 0.048, 0.037, 0.065] +incorrectVar = [0.052, 0.014, 0.003, 0.001, 0.0, 0.002] + diff --git a/config/set2-ErrorModel.cfg b/config/set2-ErrorModel.cfg new file mode 100644 index 0000000000000000000000000000000000000000..d508eda07ef7a079432da231ea5a5b3aafd7b7e0 --- /dev/null +++ b/config/set2-ErrorModel.cfg @@ -0,0 +1,9 @@ +### Error model parameters ### +### set2 ### +correctNBLenDist = [0.322, 0.455, 0.183, 0.033, 0.006] +correctMean = [0.922, 0.105, 0.024, 0.021, 0.018, 0.022] +correctVar = [0.014, 0.013, 0.002, 0.001, 0.001, 0.001] +incorrectNBLenDist = [0.248, 0.462, 0.189, 0.073, 0.02] +incorrectNBPosDist = [0.0, 0.297, 0.045, 0.008, 0.002] +incorrectMean = [0.819, 0.201, 0.084, 0.052, 0.036, 0.042] +incorrectVar = [0.034, 0.02, 0.005, 0.002, 0.001, 0.004] diff --git a/config/set3-ErrorModel.cfg b/config/set3-ErrorModel.cfg new file mode 100644 index 0000000000000000000000000000000000000000..8707fbcc7ad81eff85e9c209b9e6724b58743def --- /dev/null +++ b/config/set3-ErrorModel.cfg @@ -0,0 +1,10 @@ +### Error model parameters ### +### set3 ### +correctNBLenDist = [0.147, 0.43, 0.212, 0.147, 0.048] +correctMean = [0.905, 0.101, 0.017, 0.008, 0.005, 0.01] +correctVar = [0.016, 0.013, 0.002, 0.001, 0.0, 0.001] +incorrectNBLenDist = [0.351, 0.388, 0.158, 0.071, 0.02] +incorrectNBPosDist = [0.0, 0.116, 0.027, 0.007, 0.002] +incorrectMean = [0.871, 0.164, 0.065, 0.043, 0.03, 0.054] +incorrectVar = [0.029, 0.02, 0.005, 0.002, 0.001, 0.003] + diff --git a/config/set4-ErrorModel.cfg b/config/set4-ErrorModel.cfg new file mode 100644 index 0000000000000000000000000000000000000000..a492c587d602e6dc84820a66c930780311540f84 --- /dev/null +++ b/config/set4-ErrorModel.cfg @@ -0,0 +1,9 @@ +### Error model parameters ### +### set4 ### +correctNBLenDist = [0.143, 0.405, 0.24, 0.155, 0.043] +correctMean = [0.9, 0.104, 0.02, 0.01, 0.004, 0.004] +correctVar = [0.016, 0.012, 0.002, 0.001, 0.0, 0.0] +incorrectNBLenDist = [0.315, 0.387, 0.175, 0.089, 0.024] +incorrectNBPosDist = [0.0, 0.128, 0.042, 0.01, 0.0] +incorrectMean = [0.868, 0.155, 0.064, 0.043, 0.029, 0.038] +incorrectVar = [0.03, 0.02, 0.004, 0.002, 0.001, 0.001] diff --git a/feudalconfig.cfg b/feudalconfig.cfg new file mode 100644 index 0000000000000000000000000000000000000000..3b310f37132adf5716e39f4f3cae7878c5b4738b --- /dev/null +++ b/feudalconfig.cfg @@ -0,0 +1,116 @@ +# Error model: 15% error rate, DSTC2 confscorer, DSTC2 nbestgenerator +# User model: standard sampled params, sampled patience +# Masks: off + +###### General parameters ###### +[GENERAL] +domains = CamRestaurants +singledomain = True +tracedialog = 0 +seed = 1 + +[exec_config] +configdir = _benchmarkpolicies/env3-feudal +logfiledir = _benchmarklogs/env3-feudal +numtrainbatches = 20 +traindialogsperbatch = 200 +numbatchtestdialogs = 500 +trainsourceiteration = 0 +numtestdialogs = 500 +trainerrorrate = 15 +testerrorrate = 15 +testeverybatch = True +deleteprevpolicy = True + +[logging] 
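+# results-level messages are written both to the screen and to an auto-named log file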
+usecolor = False +screen_level = results +file_level = results +file = auto + +###### Environment parameters ###### + +[agent] +maxturns = 25 + +[usermodel] +usenewgoalscenarios = True +oldstylepatience = False +patience = 4,6 +configfile = config/sampledUM.cfg + +[errormodel] +nbestsize = 5 +confusionmodel = LevenshteinConfusions +nbestgeneratormodel = DSTC2NBestGenerator +confscorer = DSTC2 +configfile = config/set1-ErrorModel.cfg + + +[summaryacts] +maxinformslots = 5 +informmask = True +requestmask = True +informcountaccepted = 4 +byemask = True + +###### Dialogue Manager parameters ###### +[policy] +policydir = _benchmarkpolicies/env3-feudal +belieftype = focus +useconfreq = False +learning = True +policytype = feudalgain +startwithhello = False +inpolicyfile = auto +outpolicyfile = auto +temperature = 0.0 +noisy_acer = True +sample_argmax = False + +[feudalpolicy] +features=learned +si_policy_type=acer +only_master = True +jsd_reward = True +#jsd_function = tanh +js_threshold = 0.2 +js_threshold_master = 1 + +[i2a] +is_imaging = False +deepmind = False +load_pretrain_data = False +improve_env = False +share_layer = 2 +new_q_loss = False +device = cpu +env_model_path = env_model/env1_acer_200.pkl + +[dqnpolicy] +q_update = double +architecture = duel +#architecture = duel +h1_size = 300 +h2_size = 100 +capacity = 2000 +beta = 0.95 +epsilon_start = 0.3 +maxiter = 4000 +minibatch_size = 64 +is_threshold = 5.0 +episodeNum = 0.0 +epsilon_end = 0.0 +n_in = 268 +features = ["discourseAct", "method", "requested", "full", "lastActionInformNone", "offerHappened", "inform_info"] + +###### Evaluation parameters ###### + +[eval] +rewardvenuerecommended=0 +penaliseallturns = True +wrongvenuepenalty = 0 +notmentionedvaluepenalty = 0 +successmeasure = objective +successreward = 20 + diff --git a/policy/FeudalGainPolicy.py b/policy/FeudalGainPolicy.py new file mode 100644 index 0000000000000000000000000000000000000000..9a767ab2be8993f9ab07668968d9ba81535978f0 --- /dev/null +++ b/policy/FeudalGainPolicy.py @@ -0,0 +1,425 @@ +############################################################################### +# PyDial: Multi-domain Statistical Spoken Dialogue System Software +############################################################################### +# +# Copyright 2015 - 2019 +# Cambridge University Engineering Department Dialogue Systems Group +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +############################################################################### + + +import numpy as np +import random +import utils +from utils.Settings import config as cfg +from utils import ContextLogger, DiaAct + +import ontology.FlatOntologyManager as FlatOnt +from ontology import Ontology +from policy import Policy +from policy import SummaryAction +from policy.feudalgainRL.DIP_parametrisation import DIP_state, padded_state +from policy.feudalgainRL.FeudalNoisyDQNPolicy import FeudalDQNPolicy +from policy.feudalgainRL.FeudalNoisyACERPolicy import FeudalNoisyACERPolicy +from policy.feudalgainRL.feudalUtils import get_feudalAC_masks + +logger = utils.ContextLogger.getLogger('') + + +class FeudalGainPolicy(Policy.Policy): + '''Derived from :class:`Policy` + ''' + + def __init__(self, in_policy_file, out_policy_file, domainString='CamRestaurants', is_training=False): + super(FeudalGainPolicy, self).__init__(domainString, is_training) + + self.domainString = domainString + self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString) + self.in_policy_file = in_policy_file + self.out_policy_file = out_policy_file + self.is_training = is_training + self.accum_belief = [] + + self.prev_state_check = None + #feudalRL variables + self.prev_sub_policy = None + self.prev_master_act = None + self.prev_master_belief = None + self.prev_child_act = None + self.prev_child_belief = None + + self.slots = list(Ontology.global_ontology.get_informable_slots(domainString)) + + if 'price' in self.slots: + self.slots.remove('price') # remove price from SFR ont, its not used + if 'name' in self.slots: + self.slots.remove('name') + + self.features = 'dip' + if cfg.has_option('feudalpolicy', 'features'): + self.features = cfg.get('feudalpolicy', 'features') + self.si_policy_type = 'dqn' + if cfg.has_option('feudalpolicy', 'si_policy_type'): + self.si_policy_type = cfg.get('feudalpolicy', 'si_policy_type') + self.sd_policy_type = 'dqn' + if cfg.has_option('feudalpolicy', 'sd_policy_type'): + self.sd_policy_type = cfg.get('feudalpolicy', 'sd_policy_type') + self.probability_max = 50 + if cfg.has_option('feudalpolicy', 'probability_max'): + self.probability_max = cfg.get('feudalpolicy', 'probability_max') + self.info_reward = 0.0 + if cfg.has_option('feudalpolicy', 'info_reward'): + self.info_reward = cfg.getfloat('feudalpolicy', 'info_reward') + self.js_threshold = 1.0 + if cfg.has_option('feudalpolicy', 'js_threshold'): + self.js_threshold = cfg.getfloat('feudalpolicy', 'js_threshold') + self.jsd_reward = False + if cfg.has_option('feudalpolicy', 'jsd_reward'): + self.jsd_reward = cfg.getboolean('feudalpolicy', 'jsd_reward') + self.jsd_function = None + if cfg.has_option('feudalpolicy', 'jsd_function'): + self.jsd_function = cfg.get('feudalpolicy', 'jsd_function') + self.info_reward_master = 0.0 + if cfg.has_option('feudalpolicy', 'info_reward_master'): + self.info_reward_master = cfg.getfloat('feudalpolicy', 'info_reward_master') + print("Master policy trains with info_gain reward") + self.js_threshold_master = 1.0 + if cfg.has_option('feudalpolicy', 'js_threshold_master'): + self.js_threshold_master = cfg.getfloat('feudalpolicy', 'js_threshold_master') + self.only_master = False + if cfg.has_option('feudalpolicy', 'only_master'): + self.only_master = cfg.getboolean('feudalpolicy', 'only_master') + if self.only_master: + print("We train with merged master!") + + self.bye_mask = False + if cfg.has_option('summaryacts', 'byemask'): + self.bye_mask = cfg.getboolean('summaryacts', 'byemask') + print("WE USE 
BYEMASK: ", self.bye_mask) + + self.critic_regularizer_path = None + if cfg.has_option('policy', 'critic_regularizer'): + self.critic_regularizer_path = cfg.get('policy', 'critic_regularizer') + print(f"We use {self.critic_regularizer_path} as a critic regularizer.") + + self.critic_regularizer_weight = 0 + if cfg.has_option('policy', 'critic_regularizer_weight'): + self.critic_regularizer_weight = cfg.getfloat('policy', 'critic_regularizer_weight') + + self.randomseed = 1234 + if cfg.has_option('GENERAL', 'seed'): + self.randomseed = cfg.getint('GENERAL', 'seed') + + self.load_master_policy = True + if cfg.has_option('policy', 'bootstrap_master_policy'): + self.load_master_policy = cfg.getboolean('policy', 'bootstrap_master_policy') + print("FeudalAC: BOOTSTRAP MASTER Policy: ", self.load_master_policy) + + # Create the feudal structure (including feudal masks) + + self.summaryaction = SummaryAction.SummaryAction(domainString) + self.full_action_list = self.summaryaction.action_names + self.slot_independent_actions = ["inform", + "inform_byname", + "inform_alternatives", + "reqmore", + 'bye', + 'pass' + ] + + self.slot_specific_actions = ["request", + "confirm", + "select", + 'pass'] + + self.master_actions = ['slot_ind', 'slot_dep'] + + self.chosen = False + + if self.only_master: + print("Using ACER with merged policy.") + self.master_actions = self.slot_independent_actions[:-1] + ['slot_dep'] + self.master_policy = FeudalNoisyACERPolicy(self._modify_policyfile('master', in_policy_file), + self._modify_policyfile('master', out_policy_file), + domainString=self.domainString, is_training=self.is_training, + action_names=self.master_actions, sd_state_dim=self.probability_max, + slot='si', js_threshold=self.js_threshold_master, + info_reward=self.info_reward_master, load_policy=self.load_master_policy, + critic_regularizer_weight=self.critic_regularizer_weight) + + elif self.si_policy_type == 'acer': + print("Using ACER with give_info and master_policy.") + self.master_policy = FeudalNoisyACERPolicy(self._modify_policyfile('master', in_policy_file), + self._modify_policyfile('master', out_policy_file), + domainString=self.domainString, is_training=self.is_training, + action_names=self.master_actions, sd_state_dim=self.probability_max, + slot='si', js_threshold=self.js_threshold_master, + info_reward=self.info_reward_master) + self.give_info_policy = FeudalNoisyACERPolicy(self._modify_policyfile('gi', in_policy_file), + self._modify_policyfile('gi', out_policy_file), + domainString=self.domainString, is_training=self.is_training, + action_names=self.slot_independent_actions, slot='si', + sd_state_dim=self.probability_max) + elif self.si_policy_type == 'dqn': + self.master_policy = FeudalNoisyACERPolicy(self._modify_policyfile('master', in_policy_file), + self._modify_policyfile('master', out_policy_file), + domainString=self.domainString, is_training=self.is_training, + action_names=self.master_actions, sd_state_dim=self.probability_max, + slot='si') + self.give_info_policy = FeudalDQNPolicy(self._modify_policyfile('gi', in_policy_file), + self._modify_policyfile('gi', out_policy_file), + domainString=self.domainString, is_training=self.is_training, + action_names=self.slot_independent_actions, slot='si', + sd_state_dim=0) + + else: + self.master_policy = FeudalDQNPolicy(self._modify_policyfile('master', in_policy_file), + self._modify_policyfile('master', out_policy_file), + domainString=self.domainString, is_training=self.is_training, + action_names=self.master_actions, + slot='si')#pass 
is always masked, but its needed for implementation + self.give_info_policy = FeudalDQNPolicy(self._modify_policyfile('gi', in_policy_file), + self._modify_policyfile('gi', out_policy_file), + domainString=self.domainString, is_training=self.is_training, + action_names=self.slot_independent_actions, slot='si') + + self.request_info_policy = FeudalDQNPolicy(self._modify_policyfile('ri', in_policy_file), + self._modify_policyfile('ri', out_policy_file), + domainString=self.domainString, is_training=self.is_training, + action_names=self.slot_specific_actions, slot='sd', + sd_state_dim=self.probability_max, + js_threshold=self.js_threshold, info_reward=self.info_reward, + jsd_reward=self.jsd_reward, jsd_function=self.jsd_function) + self.critic_regularizer = None + + def _modify_policyfile(self, mod, policyfile): + pf_split = policyfile.split('/') + pf_split[-1] = mod + '_' + pf_split[-1] + return '/'.join(pf_split) + + def act_on(self, state, hyps=None): + if self.lastSystemAction is None and self.startwithhello: + systemAct, nextaIdex = 'hello()', -1 + self.chosen_slot_ = None + else: + systemAct, nextaIdex = self.nextAction(state) + self.lastSystemAction = systemAct + self.summaryAct = nextaIdex + self.prevbelief = state + + systemAct = DiaAct.DiaAct(systemAct) + return systemAct + + def record(self, reward, domainInControl=None, weight=None, state=None, action=None): + self.record_master(reward) + self.record_childs(reward) + + def finalizeRecord(self, reward, domainInControl=None): + if domainInControl is None: + domainInControl = self.domainString + self.master_policy.finalizeRecord(reward) + if not self.only_master: + self.give_info_policy.finalizeRecord(reward) + self.request_info_policy.finalizeRecord(reward) + + #print("DIALOGUE FINISHED") + #print("REWARD:", reward) + #print("\n") + + def record_master(self, reward): + if self.only_master or self.si_policy_type == 'acer': + self.master_policy.record(reward, domainInControl=self.domainString, + state=[self.prev_master_belief, self.beliefstate, self.chosen_slot], + action=self.prev_master_act) + else: + self.master_policy.record(reward, domainInControl=self.domainString, + state=self.prev_master_belief, action=self.prev_master_act) + + def record_childs(self, reward): + if self.prev_sub_policy == 'si': + if not self.only_master: + self.give_info_policy.record(reward, domainInControl=self.domainString, + state=[self.prev_master_belief, 0 , 0], + action=self.prev_child_act) + + state_for_pi_d = np.concatenate([np.zeros(self.probability_max), self.prev_master_belief]) + state_for_pi_d[0] = 1.0 + + self.request_info_policy.record(reward, domainInControl=self.domainString, + state=[state_for_pi_d, + self.beliefstate, self.chosen_slot, self.dipstatevec_slots], + action=len(self.slot_specific_actions) - 1) + elif self.prev_sub_policy == 'sd': + self.request_info_policy.record(reward, domainInControl=self.domainString, + state=[self.prev_child_belief, self.beliefstate, self.chosen_slot, self.dipstatevec_slots], + action=self.prev_child_act) + if not self.only_master: + self.give_info_policy.record(reward, domainInControl=self.domainString, + state=[self.prev_master_belief, 0 , 0], + action=len(self.slot_independent_actions) - 1) + + def convertStateAction(self, state, action): + pass + + def nextAction(self, beliefstate): + ''' + select next action + + :param beliefstate: + :returns: (int) next summary action + ''' + + # compute main belief + + if self.features == 'learned' or self.features == 'rnn': + dipstate = padded_state(beliefstate, 
domainString=self.domainString, probability_max=self.probability_max) + else: + dipstate = DIP_state(beliefstate,domainString=self.domainString) + dipstatevec = dipstate.get_beliefStateVec('general') + + non_exec = self.summaryaction.getNonExecutable(beliefstate.domainStates[beliefstate.currentdomain], self.lastSystemAction) + masks = get_feudalAC_masks(non_exec, self.slots, self.slot_independent_actions, self.slot_specific_actions, + only_master=self.only_master) + + master_Q_values = self.master_policy.nextAction(dipstatevec, masks["master"]) + #TODO: MASTER ACTIONS ARE NOT MASKED, ONLY COMPLETELY VALID FOR ENV4 ATM + master_decision = np.argmax(master_Q_values) + self.prev_master_act = master_decision + self.prev_master_belief = dipstatevec + self.beliefstate = beliefstate.domainStates[beliefstate.currentdomain] + + self.dipstatevec_slots, self.maskvec_slots = self.get_dipstate_vec_slots_and_masks(dipstate, masks) + self.slot_beliefs = self.get_slot_beliefs(dipstate) + + if self.master_actions[master_decision] != 'slot_dep': + # drop to give_info policy + self.prev_sub_policy = 'si' + if not self.only_master: + child_Q_values = self.give_info_policy.nextAction(dipstatevec, masks['give_info']) + child_Q_values = np.add(child_Q_values, masks['give_info']) + #TODO: sample from the distribution instead of argmax.. + child_decision = np.argmax(child_Q_values) + summaryAct = self.slot_independent_actions[child_decision] + self.prev_child_act = child_decision + self.prev_child_belief = dipstatevec + else: + summaryAct = self.master_actions[master_decision] + self.chosen_slot = "None" + else: + self.prev_sub_policy = 'sd' + + child_Q_values = self.request_info_policy.nextAction(self.dipstatevec_slots) + #if we chose randomly, child_Q_values is of shape len(actions), else shape=(number_slots, len(actions)) + if len(child_Q_values.shape) == 1: + #we chose a random action, now we need a random slot to it + random_slot = random.choice(self.slots) + child_Q_values = np.add(child_Q_values, masks['req_info'][random_slot]) + child_decision = np.argmax(child_Q_values) + self.prev_child_act = child_decision + self.prev_child_belief = dipstate.get_beliefStateVec(random_slot) + self.chosen_slot = random_slot + summaryAct = self.slot_specific_actions[child_decision] + "_" + random_slot + else: + child_Q_values = np.add(child_Q_values, self.maskvec_slots) + child_decision = np.unravel_index(np.argmax(child_Q_values, axis=None), child_Q_values.shape) + #child_decision is tuple of length 2! 
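+                # np.unravel_index gives (slot_index, action_index) into the (num_slots, num_actions) Q-value matrix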
+ chosen_slot = child_decision[0] + chosen_action = child_decision[1] + self.chosen_slot = self.slots[chosen_slot] + self.chosen_slot_ = self.slots[chosen_slot] + self.prev_child_act = chosen_action + self.prev_child_belief = dipstate.get_beliefStateVec(self.slots[chosen_slot]) + summaryAct = self.slot_specific_actions[chosen_action] + "_" + self.slots[chosen_slot] + self.chosen = True + + #if self.chosen_slot_: + # print(self.chosen_slot_) + # keys = self.beliefstate['beliefs'][self.chosen_slot_].keys() + # b = [self.beliefstate['beliefs'][self.chosen_slot_]['**NONE**']] + \ + # [self.beliefstate['beliefs'][self.chosen_slot_][value] for value in list(keys) if value != '**NONE**'] + # print(f"DISTRIBUTION FOR SLOT {self.chosen_slot_}:", b) + + beliefstate = beliefstate.getDomainState(self.domainUtil.domainString) + masterAct = self.summaryaction.Convert(beliefstate, summaryAct, self.lastSystemAction) + nextaIdex = self.full_action_list.index(summaryAct) + + return masterAct, nextaIdex + + def train(self): + ''' + call this function when the episode ends + ''' + self.master_policy.train(self.critic_regularizer) + if not self.only_master: + self.give_info_policy.train() + self.request_info_policy.train() + + def get_slot_beliefs(self, dipstate): + + slot_beliefs = [] + for slot in self.slots: + slot_dependent_vec = dipstate.get_beliefStateVec(slot) + slot_beliefs.append(slot_dependent_vec) + return np.concatenate(slot_beliefs, axis=0) + + def get_dipstate_vec_slots_and_masks(self, dipstate, masks): + + dipstatevec_slots = [] + maskvec_slots = [] + for slot in self.slots: + slot_dependent_vec = dipstate.get_beliefStateVec(slot) + dipstatevec_slots.append(slot_dependent_vec) + maskvec_slots.append(masks['req_info'][slot]) + dipstatevec_slots = np.vstack(dipstatevec_slots) + maskvec_slots = np.asarray(maskvec_slots) + + return dipstatevec_slots, maskvec_slots + + def savePolicy(self, FORCE_SAVE=False): + """ + Does not use this, cause it will be called from agent after every episode. + we want to save the policy only periodically. 
+ """ + pass + + def savePolicyInc(self, FORCE_SAVE=False): + """ + save model and replay buffer + """ + # just save each sub-policy + self.master_policy.savePolicyInc() + if not self.only_master: + self.give_info_policy.savePolicyInc() + self.request_info_policy.savePolicyInc() + + def loadPolicy(self, filename): + """ + load model and replay buffer + """ + # load policy models one by one + pass + + def restart(self): + self.summaryAct = None + self.lastSystemAction = None + self.prevbelief = None + self.actToBeRecorded = None + self.master_policy.restart() + if not self.only_master: + self.give_info_policy.restart() + self.request_info_policy.restart() + +# END OF FILE diff --git a/policy/PolicyManager.py b/policy/PolicyManager.py index 6e9d754d8be7b4fb988ee1df6355dcdd8da24e7e..3ca85b89159b3f27f93341d03aaba96f2b28583b 100644 --- a/policy/PolicyManager.py +++ b/policy/PolicyManager.py @@ -303,6 +303,9 @@ class PolicyManager(object): elif policy_type == 'feudalAC': from policy import FeudalACPolicy self.domainPolicies[domainString] = FeudalACPolicy.FeudalACPolicy(in_policy_file, out_policy_file, domainString, learning) + elif policy_type == 'feudalgain': + from policy import FeudalGainPolicy + self.domainPolicies[domainString] = FeudalGainPolicy.FeudalGainPolicy(in_policy_file, out_policy_file, domainString, learning) else: try: # try to view the config string as a complete module path to the class to be instantiated diff --git a/policy/feudalgainRL/DIP_parametrisation.py b/policy/feudalgainRL/DIP_parametrisation.py new file mode 100644 index 0000000000000000000000000000000000000000..82db843c5e2e4d7336497c64368e3fbe321d3a6f --- /dev/null +++ b/policy/feudalgainRL/DIP_parametrisation.py @@ -0,0 +1,2022 @@ +############################################################################### +# PyDial: Multi-domain Statistical Spoken Dialogue System Software +############################################################################### +# +# Copyright 2015 - 2019 +# Cambridge University Engineering Department Dialogue Systems Group +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +############################################################################### + +''' +Class to convert belief states into DIP parametrisations +''' + +import numpy as np +import copy +from itertools import product +from scipy.stats import entropy + +from policy.Policy import Policy, Action, State, TerminalAction, TerminalState +from ontology import Ontology +from utils import Settings, ContextLogger, DialogueState +logger = ContextLogger.getLogger('') + +class DIP_state(State): + def __init__(self, belief, domainString=None, action_freq=None): + #params + self.domainString = domainString + self.N_bins = 10 + self.slots = list(Ontology.global_ontology.get_informable_slots(domainString)) + if 'price' in self.slots: + self.slots.remove('price') #remove price from SFR ont, its not used + + if 'name' in self.slots: + self.slots.remove('name') + self.DIP_state = {'general':None, 'joint':None} + for slot in self.slots: + self.DIP_state[slot]=None + + # convert belief state into DIP params + if action_freq is not None: + self.DIP_state['general'] = np.concatenate((action_freq,self.convert_general_b(belief))) + else: + self.DIP_state['general'] = self.convert_general_b(belief) + self.DIP_state['joint'] = self.convert_joint_slot_b(belief) + for slot in self.slots: + self.DIP_state[slot] = self.convert_slot_b(belief, slot) + + # create DIP vector and masks + self.get_DIP_vector() + self.beliefStateVec = None #for compatibility with GP sarsa implementation + + def get_DIP_vector(self): + """ + convert the DIP state into a numpy vector and a set of masks per slot + :return: + """ + pad_v = np.zeros(len(self.DIP_state[self.slots[0]])) + slot_len = len(pad_v) + general_len = len(self.DIP_state['general']) + len(self.DIP_state['joint']) + pad_v[0] = 1. + self.DIP_vector = [pad_v] + self.DIP_masks = {} + mask_template = [False] * (slot_len * (len(self.slots) + 1)) + [True] * general_len + i = 1 + for slot in self.slots: + self.DIP_vector.append(self.DIP_state[slot]) + self.DIP_masks[slot] = np.array(mask_template) + self.DIP_masks[slot][slot_len*i:slot_len*(i+1)] = True + i += 1 + self.DIP_vector.append(self.DIP_state['general']) + self.DIP_vector.append(self.DIP_state['joint']) + self.DIP_masks['general'] = np.array(mask_template) + self.DIP_masks['general'][:slot_len] = True + + self.DIP_vector = np.concatenate(self.DIP_vector) + + def get_beliefStateVec(self, slot): + return self.DIP_vector[self.DIP_masks[slot]] + + def get_DIP_state(self, slot): + return np.array([self.DIP_state['general'] + self.DIP_state['joint'] + self.DIP_state[slot]]) + + def get_full_DIP_state(self): + full_slot_bstate = [] + for slot in self.slots: + full_slot_bstate += self.DIP_state[slot] + full_DIP_state = np.array([full_slot_bstate + self.DIP_state['general'] + self.DIP_state['joint']]) + DIP_mask = [True]*(len(self.DIP_state['general']) + len(self.DIP_state['joint'])) + [False] * len(full_slot_bstate) + return full_DIP_state, DIP_mask + + def convert_general_b(self, belief): + """ + Extracts from the belief state the DIP vector corresponding to the general features (e.g. method, user act...) 
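+        The returned vector concatenates the discourseAct and method belief distributions, a binned count of requested slots, the offerHappened / lastActionInformNone / lastInformedVenue flags, the inform_info indicators, and two ontology statistics (1/#slots and 1/mean #values per slot).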
+ :param belief: The full belief state + :return: The DIP general vector + """ + if type(belief) == DialogueState.DialogueState: + belief = belief.domainStates[belief.currentdomain] + + dial_act = list(belief['beliefs']['discourseAct'].values()) + + requested = self._get_DIP_requested_vector(belief) + method = list(belief['beliefs']['method'].values()) + features = [int(belief['features']['offerHappened']), int(belief['features']['lastActionInformNone']), int(bool(belief['features']['lastInformedVenue']))] + discriminable = [int(x) for x in belief['features']['inform_info']] + slot_n = 1/len(self.slots) + val_n = [] + for slot in self.slots: + val_n.append(len(Ontology.global_ontology.get_informable_slot_values(self.domainString, slot))) + avg_value_n = 1/np.mean(val_n) + + + return dial_act + requested + method + features + discriminable + [slot_n, avg_value_n] + + + def _get_DIP_requested_vector(self, belief): + n_requested = sum([x>0.5 for x in list(belief['beliefs']['requested'].values())]) + ret_vec = [0] * 5 + if n_requested > 4: + n_requested = 4 + ret_vec[n_requested] = 1. + return ret_vec + + def convert_joint_slot_b(self, belief): + """ + Extracts the features for the joint DIP vector for all the slots + :param belief: The full belief state + :return: The DIP joint slot vector + """ + if type(belief) == DialogueState.DialogueState: + belief = belief.domainStates[belief.currentdomain] + + joint_beliefs = [] + joint_none = 1. + informable_beliefs = [copy.deepcopy(belief['beliefs'][x]) for x in list(belief['beliefs'].keys()) if x in self.slots] # this might be inneficent + for i, b in enumerate(informable_beliefs): + joint_none *= b['**NONE**'] + del b['**NONE**'] # should I put **NONE** prob mass to dontcare? + informable_beliefs[i] = sorted([x for x in list(b.values()) if x != 0], reverse=True)[:2] + while len(informable_beliefs[i]) < 2: + informable_beliefs[i].append(0.) + for probs in product(*informable_beliefs): + joint_beliefs.append(np.prod(probs)) + j_top = joint_beliefs[0] + j_2nd = joint_beliefs[1] + j_3rd = joint_beliefs[2] + first_joint_beliefs = joint_beliefs[:8] + if sum(first_joint_beliefs) == 0: + first_joint_beliefs = np.ones(len(first_joint_beliefs)) / len(first_joint_beliefs) + else: + first_joint_beliefs = np.array(first_joint_beliefs) / sum(first_joint_beliefs) # why normalise? + + # difference between 1st and 2dn values + j_ent = entropy(first_joint_beliefs) + j_dif = joint_beliefs[0] - joint_beliefs[1] + j_dif_bin = [0.] * 5 + idx = int((j_dif) * 5) + if idx == 5: + idx = 4 + j_dif_bin[idx] = 1 + + # number of slots which are not **NONE** + n = 0 + for key in belief['beliefs']: + if key in self.slots: + none_val = belief['beliefs'][key]['**NONE**'] + top_val = np.max([belief['beliefs'][key][value] for value in list(belief['beliefs'][key].keys()) if value != '**NONE**']) + if top_val > none_val: + n += 1 + not_none = [0.] * 5 + if n > 4: + n = 4 + not_none[n] = 1. + + return [j_top, j_2nd, j_3rd, joint_none, j_ent, j_dif] + j_dif_bin + not_none + + def convert_slot_b(self, belief, slot): + """ + Extracts the slot DIP features. 
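+        Features include the top three sorted belief values, the belief entropy, the **NONE** mass, the fraction of non-zero values, the requested probability, a binned top-1/top-2 difference, and ontology/DB statistics (normalised value count, value-distribution entropy, binned log2 value count).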
+ :param belief: The full belief state + :return: The slot DIP vector + """ + if type(belief) == DialogueState.DialogueState: + belief = belief.domainStates[belief.currentdomain] + b = [belief['beliefs'][slot]['**NONE**']] + sorted([belief['beliefs'][slot][value] for value in list(belief['beliefs'][slot].keys()) if value != '**NONE**'], reverse=True) + b_top = b[1] + b_2nd = b[2] + b_3rd = b[3] + b_ent = entropy(b) + b_none = b[0] + b_dif = b[1] - b[2] + b_dif_bin = [0.] * 5 + idx = int((b_dif) * 5) + if idx == 5: + idx = 4 + b_dif_bin[idx] = 1 + non_zero_rate = [x != 0 for x in b[1:]] + non_zero_rate = sum(non_zero_rate) / len(non_zero_rate) + requested_prob = belief['beliefs']['requested'][slot] + + # Ontology and DB based features + V_len = len(Ontology.global_ontology.get_informable_slot_values(self.domainString, slot)) + norm_N_values = 1 / V_len + v_len_bin_vector = [0.] * self.N_bins + v_len_bin_vector[int(np.log2(V_len))] = 1. + #ocurr_prob, not_occur_prob, first_prob, second_prob, later_prob = self._get_importance_and_priority(slot) # this was manually set in the original DIP paper, I think it can be learned from the other features + val_dist_in_DB = self._get_val_dist_in_DB(slot) + # potential_contr_to_DB_search = self._get_potential_contr_to_DB_search(slot, belief) + #potential_contr_to_DB_search = [0, 0, 0, 0] # the implementation of this method is too slow right now, dont knwo how useful these features are (but they seem quite useful) + return [0, b_top, b_2nd, b_3rd, b_ent, b_none, non_zero_rate, requested_prob, norm_N_values, val_dist_in_DB] + b_dif_bin + v_len_bin_vector + + def _get_val_dist_in_DB(self, slot): + # The entropy of the normalised histogram (|DB(s=v)|/|DB|) \forall v \in V_s + values = Ontology.global_ontology.get_informable_slot_values(self.domainString, slot) + entities = Ontology.global_ontology.entity_by_features(self.domainString, {}) + val_dist = np.zeros(len(values)) + n = 0 + for ent in entities: + if ent[slot] != 'not available': + val_dist[values.index(ent[slot])] += 1 + n += 1 + return entropy(val_dist/n) + + +class padded_state(State): + def __init__(self, belief, domainString=None, action_freq=None, probability_max=50): + #params + self.domainString = domainString + self.sortbelief = True + self.probability_max = probability_max + #self.action_freq = False + if Settings.config.has_option('feudalpolicy', 'sortbelief'): + self.sortbelief = Settings.config.getboolean('feudalpolicy', 'sortbelief') + #if Settings.config.has_option('feudalpolicy', 'action_freq'): + # self.action_freq = Settings.config.getboolean('feudalpolicy', 'action_freq') + self.slots = list(Ontology.global_ontology.get_informable_slots(domainString)) + if 'price' in self.slots: + self.slots.remove('price') #remove price from SFR ont, its not used + + if 'name' in self.slots: + self.slots.remove('name') + + slot_values = Ontology.global_ontology.get_informable_slots_and_values(domainString) + self.max_v = np.max([len(slot_values[s]) for s in self.slots]) + 3 # (+**NONE**+dontcare+pad) + self.max_v = 158 + self.si_size = 72 # size of general plus joint vectors + self.sd_size = self.max_v + + self.DIP_state = {'general':None, 'joint':None} + for slot in self.slots: + self.DIP_state[slot]=None + + # convert belief state into DIP params + if action_freq is not None: + self.DIP_state['general'] = np.concatenate((action_freq,self.convert_general_b(belief))) + else: + self.DIP_state['general'] = self.convert_general_b(belief) + self.DIP_state['joint'] = self.convert_joint_slot_b(belief) + 
for slot in self.slots: + self.DIP_state[slot] = self.convert_slot_b(belief, slot) + + # create vector and masks + self.get_DIP_vector() + self.beliefStateVec = None #for compatibility with GP sarsa implementation + + def get_DIP_vector(self): + """ + convert the state into a numpy vector and a set of masks per slot + :return: + """ + pad_v = np.zeros(len(self.DIP_state[self.slots[0]])) + slot_len = len(pad_v) + general_len = len(self.DIP_state['general']) + len(self.DIP_state['joint']) + + self.DIP_vector = [] + self.DIP_masks = {} + mask_template = [False] * (slot_len * (len(self.slots))) + [True] * general_len + i = 0 + for slot in self.slots: + self.DIP_vector.append(self.DIP_state[slot]) + self.DIP_masks[slot] = np.array(mask_template) + self.DIP_masks[slot][slot_len*i:slot_len*(i+1)] = True + i += 1 + self.DIP_vector.append(self.DIP_state['general']) + self.DIP_vector.append(self.DIP_state['joint']) + self.DIP_masks['general'] = np.array(mask_template) + + self.DIP_vector = np.concatenate(self.DIP_vector) + + def get_beliefStateVec(self, slot): + return self.DIP_vector[self.DIP_masks[slot]] + + def get_DIP_state(self, slot): + return np.array([self.DIP_state['general'] + self.DIP_state['joint'] + self.DIP_state[slot]]) + + def get_full_DIP_state(self): + full_slot_bstate = [] + for slot in self.slots: + full_slot_bstate += self.DIP_state[slot] + full_DIP_state = np.array([full_slot_bstate + self.DIP_state['general'] + self.DIP_state['joint']]) + DIP_mask = [True]*(len(self.DIP_state['general']) + len(self.DIP_state['joint'])) + [False] * len(full_slot_bstate) + return full_DIP_state, DIP_mask + + def convert_general_b(self, belief): + """ + Extracts from the belief state the vector corresponding to the general features (e.g. method, user act...) + :param belief: The full belief state + :return: The general vector + """ + if type(belief) == DialogueState.DialogueState: + belief = belief.domainStates[belief.currentdomain] + + #print("BELIEF: ", belief['features']) + + dial_act = list(belief['beliefs']['discourseAct'].values()) + + requested = self._get_requested_vector(belief) + method = list(belief['beliefs']['method'].values()) + features = [int(belief['features']['offerHappened']), int(belief['features']['lastActionInformNone']), + int(bool(belief['features']['lastInformedVenue']))] + discriminable = [int(x) for x in belief['features']['inform_info']] + + return dial_act + requested + method + features + discriminable + [1.0/len(self.slots)] + + def _get_requested_vector(self, belief): + n_requested = sum([x>0.5 for x in list(belief['beliefs']['requested'].values())]) + ret_vec = [0] * 5 + if n_requested > 4: + n_requested = 4 + ret_vec[n_requested] = 1. + return ret_vec + + def convert_joint_slot_b(self, belief): + """ + Extracts the features for the joint vector of all the slots + :param belief: The full belief state + :return: The joint slot vector + """ + #ic340 note: this should probably be done with an rnn encoder + if type(belief) == DialogueState.DialogueState: + belief = belief.domainStates[belief.currentdomain] + + joint_beliefs = [] + joint_none = 1. + informable_beliefs = [copy.deepcopy(belief['beliefs'][x]) for x in list(belief['beliefs'].keys()) if + x in self.slots] # this might be inneficent + for i, b in enumerate(informable_beliefs): + joint_none *= b['**NONE**'] + del b['**NONE**'] # should I put **NONE** prob mass to dontcare? 
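+            # keep only the two largest non-zero beliefs per slot; their cross-products below form the joint belief features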
+ informable_beliefs[i] = sorted([x for x in list(b.values()) if x != 0], reverse=True)[:2] + while len(informable_beliefs[i]) < 2: + informable_beliefs[i].append(0.) + for probs in product(*informable_beliefs): + joint_beliefs.append(np.prod(probs)) + first_joint_beliefs = np.zeros(20) + joint_beliefs = joint_beliefs[:20] + len_joint_beliefs = len(joint_beliefs) + first_joint_beliefs[:len_joint_beliefs] = joint_beliefs + + if sum(first_joint_beliefs) == 0: + first_joint_beliefs = list(np.ones(len(first_joint_beliefs)) / len(first_joint_beliefs)) + else: + first_joint_beliefs = list(np.array(first_joint_beliefs) / sum(first_joint_beliefs)) # why normalise? + + # number of slots which are not **NONE** + n = 0 + for key in belief['beliefs']: + if key in self.slots: + none_val = belief['beliefs'][key]['**NONE**'] + top_val = np.max( + [belief['beliefs'][key][value] for value in list(belief['beliefs'][key].keys()) if value != '**NONE**']) + if top_val > none_val: + n += 1 + not_none = [0.] * 5 + if n > 4: + n = 4 + not_none[n] = 1. + + return [joint_none] + first_joint_beliefs + not_none + + def convert_slot_b(self, belief, slot): + """ + Extracts the slot features by padding the distribution vector with 0s. + :param belief: The full belief state + :return: The slot DIP vector + """ + if type(belief) == DialogueState.DialogueState: + belief = belief.domainStates[belief.currentdomain] + if self.sortbelief is True: + b = [belief['beliefs'][slot]['**NONE**']] + sorted( + [belief['beliefs'][slot][value] for value in list(belief['beliefs'][slot].keys()) if value != '**NONE**'], + reverse=True) # sorted values + b = b[:self.probability_max] + else: + b = [belief['beliefs'][slot]['**NONE**']] + \ + [belief['beliefs'][slot][value] for value in list(belief['beliefs'][slot].keys()) if value != '**NONE**'] # unsorted values + + padded_b = np.zeros(self.probability_max) + padded_b[0:len(b)] = b + return np.array(padded_b) + + def _get_val_dist_in_DB(self, slot): + # The entropy of the normalised histogram (|DB(s=v)|/|DB|) \forall v \in V_s + values = Ontology.global_ontology.get_informable_slot_values(self.domainString, slot) + entities = Ontology.global_ontology.entity_by_features(self.domainString, {}) + val_dist = np.zeros(len(values)) + n = 0 + for ent in entities: + if ent[slot] != 'not available': + val_dist[values.index(ent[slot])] += 1 + n += 1 + return entropy(val_dist/n) + + +def get_test_beliefs(): + b1 = {'beliefs': {'allowedforkids': {'**NONE**': 0.0, + '0': 0.0, + '1': 0.0, + 'dontcare': 1.0}, + 'area': {'**NONE**': 1.0, + 'alamo square': 0.0, + 'amanico ergina village': 0.0, + 'anza vista': 0.0, + 'ashbury heights': 0.0, + 'balboa terrace': 0.0, + 'bayview district': 0.0, + 'bayview heights': 0.0, + 'bernal heights': 0.0, + 'bernal heights north': 0.0, + 'bernal heights south': 0.0, + 'buena vista park': 0.0, + 'castro': 0.0, + 'cathedral hill': 0.0, + 'cayuga terrace': 0.0, + 'central richmond': 0.0, + 'central sunset': 0.0, + 'central waterfront': 0.0, + 'chinatown': 0.0, + 'civic center': 0.0, + 'clarendon heights': 0.0, + 'cole valley': 0.0, + 'corona heights': 0.0, + 'cow hollow': 0.0, + 'crocker amazon': 0.0, + 'diamond heights': 0.0, + 'doelger city': 0.0, + 'dogpatch': 0.0, + 'dolores heights': 0.0, + 'dontcare': 0.0, + 'downtown': 0.0, + 'duboce triangle': 0.0, + 'embarcadero': 0.0, + 'eureka valley': 0.0, + 'eureka valley dolores heights': 0.0, + 'excelsior': 0.0, + 'financial district': 0.0, + 'financial district south': 0.0, + 'fishermans wharf': 0.0, + 'forest hill': 0.0, + 
'forest hill extension': 0.0, + 'forest knolls': 0.0, + 'fort mason': 0.0, + 'fort winfield scott': 0.0, + 'frederick douglass haynes gardens': 0.0, + 'friendship village': 0.0, + 'glen park': 0.0, + 'glenridge': 0.0, + 'golden gate heights': 0.0, + 'golden gate park': 0.0, + 'haight ashbury': 0.0, + 'hayes valley': 0.0, + 'hunters point': 0.0, + 'india basin': 0.0, + 'ingleside': 0.0, + 'ingleside heights': 0.0, + 'ingleside terrace': 0.0, + 'inner mission': 0.0, + 'inner parkside': 0.0, + 'inner richmond': 0.0, + 'inner sunset': 0.0, + 'inset': 0.0, + 'jordan park': 0.0, + 'laguna honda': 0.0, + 'lake': 0.0, + 'lake shore': 0.0, + 'lakeside': 0.0, + 'laurel heights': 0.0, + 'lincoln park': 0.0, + 'lincoln park lobos': 0.0, + 'little hollywood': 0.0, + 'little italy': 0.0, + 'little osaka': 0.0, + 'little russia': 0.0, + 'lone mountain': 0.0, + 'lower haight': 0.0, + 'lower nob hill': 0.0, + 'lower pacific heights': 0.0, + 'malcolm x square': 0.0, + 'marcus garvey square': 0.0, + 'marina district': 0.0, + 'martin luther king square': 0.0, + 'mastro': 0.0, + 'merced heights': 0.0, + 'merced manor': 0.0, + 'midtown terrace': 0.0, + 'miraloma park': 0.0, + 'mission bay': 0.0, + 'mission district': 0.0, + 'mission dolores': 0.0, + 'mission terrace': 0.0, + 'monterey heights': 0.0, + 'mount davidson manor': 0.0, + 'nob hill': 0.0, + 'noe valley': 0.0, + 'noma': 0.0, + 'north beach': 0.0, + 'north panhandle': 0.0, + 'north park': 0.0, + 'north waterfront': 0.0, + 'oceanview': 0.0, + 'opera plaza': 0.0, + 'outer mission': 0.0, + 'outer parkside': 0.0, + 'outer richmond': 0.0, + 'outer sunset': 0.0, + 'outset': 0.0, + 'pacific heights': 0.0, + 'panhandle': 0.0, + 'park merced': 0.0, + 'parkmerced': 0.0, + 'parkside': 0.0, + 'pine lake park': 0.0, + 'portola': 0.0, + 'potrero flats': 0.0, + 'potrero hill': 0.0, + 'presidio': 0.0, + 'presidio heights': 0.0, + 'richmond district': 0.0, + 'russian hill': 0.0, + 'saint francis wood': 0.0, + 'san francisco airport': 0.0, + 'san francisco state university': 0.0, + 'sea cliff': 0.0, + 'sherwood forest': 0.0, + 'showplace square': 0.0, + 'silver terrace': 0.0, + 'somisspo': 0.0, + 'south basin': 0.0, + 'south beach': 0.0, + 'south of market': 0.0, + 'st francis square': 0.0, + 'st francis wood': 0.0, + 'stonestown': 0.0, + 'sunnydale': 0.0, + 'sunnyside': 0.0, + 'sunset district': 0.0, + 'telegraph hill': 0.0, + 'tenderloin': 0.0, + 'thomas paine square': 0.0, + 'transmission': 0.0, + 'treasure island': 0.0, + 'twin peaks': 0.0, + 'twin peaks west': 0.0, + 'upper market': 0.0, + 'van ness': 0.0, + 'victoria mews': 0.0, + 'visitacion valley': 0.0, + 'vista del monte': 0.0, + 'west of twin peaks': 0.0, + 'west portal': 0.0, + 'western addition': 0.0, + 'westlake and olympic': 0.0, + 'westwood highlands': 0.0, + 'westwood park': 0.0, + 'yerba buena island': 0.0, + 'zion district': 0.0}, + 'discourseAct': {'ack': 0.0, + 'bye': 0.0, + 'hello': 0.0, + 'none': 1.0, + 'repeat': 0.0, + 'silence': 0.0, + 'thankyou': 0.0}, + 'food': {'**NONE**': 0.0, + 'afghan': 0.0, + 'arabian': 0.0, + 'asian': 0.0, + 'basque': 0.0, + 'brasseries': 0.0, + 'brazilian': 0.0, + 'buffets': 0.0, + 'burgers': 0.0, + 'burmese': 0.0, + 'cafes': 0.0, + 'cambodian': 0.0, + 'cantonese': 1.0, + 'chinese': 0.0, + 'comfort food': 0.0, + 'creperies': 0.0, + 'dim sum': 0.0, + 'dontcare': 0.0, + 'ethiopian': 0.0, + 'ethnic food': 0.0, + 'french': 0.0, + 'gluten free': 0.0, + 'himalayan': 0.0, + 'indian': 0.0, + 'indonesian': 0.0, + 'indpak': 0.0, + 'italian': 0.0, + 'japanese': 0.0, + 'korean': 
0.0, + 'kosher': 0.0, + 'latin': 0.0, + 'lebanese': 0.0, + 'lounges': 0.0, + 'malaysian': 0.0, + 'mediterranean': 0.0, + 'mexican': 0.0, + 'middle eastern': 0.0, + 'modern european': 0.0, + 'moroccan': 0.0, + 'new american': 0.0, + 'pakistani': 0.0, + 'persian': 0.0, + 'peruvian': 0.0, + 'pizza': 0.0, + 'raw food': 0.0, + 'russian': 0.0, + 'sandwiches': 0.0, + 'sea food': 0.0, + 'shanghainese': 0.0, + 'singaporean': 0.0, + 'soul food': 0.0, + 'spanish': 0.0, + 'steak': 0.0, + 'sushi': 0.0, + 'taiwanese': 0.0, + 'tapas': 0.0, + 'thai': 0.0, + 'traditionnal american': 0.0, + 'turkish': 0.0, + 'vegetarian': 0.0, + 'vietnamese': 0.0}, + 'goodformeal': {'**NONE**': 0.0, + 'breakfast': 0.0, + 'brunch': 0.0, + 'dinner': 0.0, + 'dontcare': 1.0, + 'lunch': 0.0}, + 'method': {'byalternatives': 0.0, + 'byconstraints': 0.0, + 'byname': 0.9285714285714286, + 'finished': 0.0, + 'none': 0.0714285714285714, + 'restart': 0.0}, + 'name': {'**NONE**': 0.0, + 'a 16': 0.0, + 'a la turca restaurant': 0.0, + 'abacus': 0.0, + 'alamo square seafood grill': 0.0, + 'albona ristorante istriano': 0.0, + 'alborz persian cuisine': 0.0, + 'allegro romano': 0.0, + 'amarena': 0.0, + 'amber india': 0.0, + 'ame': 0.0, + 'ananda fuara': 0.0, + 'anchor oyster bar': 0.0, + 'angkor borei restaurant': 0.0, + 'aperto restaurant': 0.0, + 'ar roi restaurant': 0.0, + 'arabian nights restaurant': 0.0, + 'assab eritrean restaurant': 0.0, + 'atelier crenn': 0.0, + 'aux delices restaurant': 0.0, + 'aziza': 0.0, + 'b star bar': 0.0, + 'bar crudo': 0.0, + 'beijing restaurant': 0.0, + 'bella trattoria': 0.0, + 'benu': 0.0, + 'betelnut': 0.0, + 'bistro central parc': 0.0, + 'bix': 0.0, + 'borgo': 0.0, + 'borobudur restaurant': 0.0, + 'bouche': 0.0, + 'boulevard': 0.0, + 'brothers restaurant': 0.0, + 'bund shanghai restaurant': 0.0, + 'burma superstar': 0.0, + 'butterfly': 0.0, + 'cafe claude': 0.0, + 'cafe jacqueline': 0.0, + 'campton place restaurant': 0.0, + 'canteen': 0.0, + 'canto do brasil restaurant': 0.0, + 'capannina': 0.0, + 'capital restaurant': 0.0, + 'chai yo thai restaurant': 0.0, + 'chaya brasserie': 0.0, + 'chenery park': 0.0, + 'chez maman': 0.0, + 'chez papa bistrot': 0.0, + 'chez spencer': 0.0, + 'chiaroscuro': 0.0, + 'chouchou': 0.0, + 'chow': 0.0, + 'city view restaurant': 0.0, + 'claudine': 0.0, + 'coi': 0.0, + 'colibri mexican bistro': 0.0, + 'coqueta': 0.0, + 'crustacean restaurant': 0.0, + 'da flora a venetian osteria': 0.0, + 'darbar restaurant': 0.0, + 'delancey street restaurant': 0.0, + 'delfina': 0.0, + 'dong baek restaurant': 0.0, + 'dontcare': 0.0, + 'dosa on fillmore': 0.0, + 'dosa on valencia': 0.0, + 'eiji': 0.0, + 'enjoy vegetarian restaurant': 0.0, + 'espetus churrascaria': 0.0, + 'fang': 0.0, + 'farallon': 0.0, + 'fattoush restaurant': 0.0, + 'fifth floor': 0.0, + 'fino restaurant': 0.0, + 'firefly': 0.0, + 'firenze by night ristorante': 0.0, + 'fleur de lys': 0.0, + 'fog harbor fish house': 0.0, + 'forbes island': 0.0, + 'foreign cinema': 0.0, + 'frances': 0.0, + 'franchino': 0.0, + 'franciscan crab restaurant': 0.0, + 'frascati': 0.0, + 'fresca': 0.0, + 'fringale': 0.0, + 'fujiyama ya japanese restaurant': 0.0, + 'gajalee': 0.0, + 'gamine': 0.0, + 'garcon restaurant': 0.0, + 'gary danko': 0.0, + 'gitane': 0.0, + 'golden era restaurant': 0.0, + 'gracias madre': 0.0, + 'great eastern restaurant': 1.0, + 'hakka restaurant': 0.0, + 'hakkasan': 0.0, + 'han second kwan': 0.0, + 'heirloom cafe': 0.0, + 'helmand palace': 0.0, + 'hi dive': 0.0, + 'hillside supper club': 0.0, + 'hillstone': 0.0, + 'hong kong clay 
pot restaurant': 0.0, + 'house of nanking': 0.0, + 'house of prime rib': 0.0, + 'hunan homes restaurant': 0.0, + 'incanto': 0.0, + 'isa': 0.0, + 'jannah': 0.0, + 'jasmine garden': 0.0, + 'jitlada thai cuisine': 0.0, + 'kappa japanese restaurant': 0.0, + 'kim thanh restaurant': 0.0, + 'kirin chinese restaurant': 0.0, + 'kiss seafood': 0.0, + 'kokkari estiatorio': 0.0, + 'la briciola': 0.0, + 'la ciccia': 0.0, + 'la folie': 0.0, + 'la mediterranee': 0.0, + 'la traviata': 0.0, + 'lahore karahi': 0.0, + 'lavash': 0.0, + 'le charm': 0.0, + 'le colonial': 0.0, + 'le soleil': 0.0, + 'lime tree southeast asian kitchen': 0.0, + 'little delhi': 0.0, + 'little nepal': 0.0, + 'luce': 0.0, + 'lucky creation restaurant': 0.0, + 'luella': 0.0, + 'lupa': 0.0, + 'm y china': 0.0, + 'maki restaurant': 0.0, + 'mangia tutti ristorante': 0.0, + 'manna': 0.0, + 'marlowe': 0.0, + 'marnee thai': 0.0, + 'maverick': 0.0, + 'mela tandoori kitchen': 0.0, + 'mescolanza': 0.0, + 'mezes': 0.0, + 'michael mina restaurant': 0.0, + 'millennium': 0.0, + 'minako organic japanese restaurant': 0.0, + 'minami restaurant': 0.0, + 'mission chinese food': 0.0, + 'mochica': 0.0, + 'modern thai': 0.0, + 'mona lisa restaurant': 0.0, + 'mozzeria': 0.0, + 'muguboka restaurant': 0.0, + 'my tofu house': 0.0, + 'nicaragua restaurant': 0.0, + 'nob hill cafe': 0.0, + 'nopa': 0.0, + 'old jerusalem restaurant': 0.0, + 'old skool cafe': 0.0, + 'one market restaurant': 0.0, + 'orexi': 0.0, + 'original us restaurant': 0.0, + 'osha thai': 0.0, + 'oyaji restaurant': 0.0, + 'ozumo': 0.0, + 'pad thai restaurant': 0.0, + 'panta rei restaurant': 0.0, + 'park tavern': 0.0, + 'pera': 0.0, + 'piperade': 0.0, + 'ploy 2': 0.0, + 'poc chuc': 0.0, + 'poesia': 0.0, + 'prospect': 0.0, + 'quince': 0.0, + 'radius san francisco': 0.0, + 'range': 0.0, + 'red door cafe': 0.0, + 'restaurant ducroix': 0.0, + 'ristorante bacco': 0.0, + 'ristorante ideale': 0.0, + 'ristorante milano': 0.0, + 'ristorante parma': 0.0, + 'rn74': 0.0, + 'rue lepic': 0.0, + 'saha': 0.0, + 'sai jai thai restaurant': 0.0, + 'salt house': 0.0, + 'san tung chinese restaurant': 0.0, + 'san wang restaurant': 0.0, + 'sanjalisco': 0.0, + 'sanraku': 0.0, + 'seasons': 0.0, + 'seoul garden': 0.0, + 'seven hills': 0.0, + 'shangri la vegetarian restaurant': 0.0, + 'singapore malaysian restaurant': 0.0, + 'skool': 0.0, + 'so': 0.0, + 'sotto mare': 0.0, + 'source': 0.0, + 'specchio ristorante': 0.0, + 'spruce': 0.0, + 'straits restaurant': 0.0, + 'stroganoff restaurant': 0.0, + 'sunflower potrero hill': 0.0, + 'sushi bistro': 0.0, + 'taiwan restaurant': 0.0, + 'tanuki restaurant': 0.0, + 'tataki': 0.0, + 'tekka japanese restaurant': 0.0, + 'thai cottage restaurant': 0.0, + 'thai house express': 0.0, + 'thai idea vegetarian': 0.0, + 'thai time restaurant': 0.0, + 'thanh long': 0.0, + 'the big 4 restaurant': 0.0, + 'the blue plate': 0.0, + 'the house': 0.0, + 'the richmond': 0.0, + 'the slanted door': 0.0, + 'the stinking rose': 0.0, + 'thep phanom thai restaurant': 0.0, + 'tommys joynt': 0.0, + 'toraya japanese restaurant': 0.0, + 'town hall': 0.0, + 'trattoria contadina': 0.0, + 'tu lan': 0.0, + 'tuba restaurant': 0.0, + 'u lee restaurant': 0.0, + 'udupi palace': 0.0, + 'venticello ristorante': 0.0, + 'vicoletto': 0.0, + 'yank sing': 0.0, + 'yummy yummy': 0.0, + 'z and y restaurant': 0.0, + 'zadin': 0.0, + 'zare at fly trap': 0.0, + 'zarzuela': 0.0, + 'zen yai thai restaurant': 0.0, + 'zuni cafe': 0.0, + 'zushi puzzle': 0.0}, + 'near': {'**NONE**': 0.0, + 'bayview hunters point': 0.0, + 'dontcare': 1.0, + 
'haight': 0.0, + 'japantown': 0.0, + 'marina cow hollow': 0.0, + 'mission': 0.0, + 'nopa': 0.0, + 'north beach telegraph hill': 0.0, + 'soma': 0.0, + 'union square': 0.0}, + 'price': {'**NONE**': 1.0, + '10 dollar': 0.0, + '10 euro': 0.0, + '11 euro': 0.0, + '15 euro': 0.0, + '18 euro': 0.0, + '20 euro': 0.0, + '22 euro': 0.0, + '25 euro': 0.0, + '26 euro': 0.0, + '29 euro': 0.0, + '37 euro': 0.0, + '6': 0.0, + '7': 0.0, + '9': 0.0, + 'between 0 and 15 euro': 0.0, + 'between 10 and 13 euro': 0.0, + 'between 10 and 15 euro': 0.0, + 'between 10 and 18 euro': 0.0, + 'between 10 and 20 euro': 0.0, + 'between 10 and 23 euro': 0.0, + 'between 10 and 30 euro': 0.0, + 'between 11 and 15 euro': 0.0, + 'between 11 and 18 euro': 0.0, + 'between 11 and 22 euro': 0.0, + 'between 11 and 25 euro': 0.0, + 'between 11 and 29 euro': 0.0, + 'between 11 and 35 euro': 0.0, + 'between 13 and 15 euro': 0.0, + 'between 13 and 18 euro': 0.0, + 'between 13 and 24 euro': 0.0, + 'between 15 and 18 euro': 0.0, + 'between 15 and 22 euro': 0.0, + 'between 15 and 26 euro': 0.0, + 'between 15 and 29 euro': 0.0, + 'between 15 and 33 euro': 0.0, + 'between 15 and 44 euro': 0.0, + 'between 15 and 58 euro': 0.0, + 'between 18 and 26 euro': 0.0, + 'between 18 and 29 euro': 0.0, + 'between 18 and 44 euro': 0.0, + 'between 18 and 55 euro': 0.0, + 'between 18 and 58 euro': 0.0, + 'between 18 and 73 euro': 0.0, + 'between 18 and 78 euro': 0.0, + 'between 2 and 15 euro': 0.0, + 'between 20 and 30 euro': 0.0, + 'between 21 and 23 euro': 0.0, + 'between 22 and 29 euro': 0.0, + 'between 22 and 30 dollar': 0.0, + 'between 22 and 37 euro': 0.0, + 'between 22 and 58 euro': 0.0, + 'between 22 and 73 euro': 0.0, + 'between 23 and 29': 0.0, + 'between 23 and 29 euro': 0.0, + 'between 23 and 37 euro': 0.0, + 'between 23 and 58': 0.0, + 'between 23 and 58 euro': 0.0, + 'between 26 and 33 euro': 0.0, + 'between 26 and 34 euro': 0.0, + 'between 26 and 37 euro': 0.0, + 'between 29 and 37 euro': 0.0, + 'between 29 and 44 euro': 0.0, + 'between 29 and 58 euro': 0.0, + 'between 29 and 73 euro': 0.0, + 'between 30 and 58': 0.0, + 'between 30 and 58 euro': 0.0, + 'between 31 and 50 euro': 0.0, + 'between 37 and 110 euro': 0.0, + 'between 37 and 44 euro': 0.0, + 'between 37 and 58 euro': 0.0, + 'between 4 and 22 euro': 0.0, + 'between 4 and 58 euro': 0.0, + 'between 5 an 30 euro': 0.0, + 'between 5 and 10 euro': 0.0, + 'between 5 and 11 euro': 0.0, + 'between 5 and 15 dollar': 0.0, + 'between 5 and 20 euro': 0.0, + 'between 5 and 25 euro': 0.0, + 'between 6 and 10 euro': 0.0, + 'between 6 and 11 euro': 0.0, + 'between 6 and 15 euro': 0.0, + 'between 6 and 29 euro': 0.0, + 'between 7 and 11 euro': 0.0, + 'between 7 and 13 euro': 0.0, + 'between 7 and 15 euro': 0.0, + 'between 7 and 37 euro': 0.0, + 'between 8 and 22 euro': 0.0, + 'between 9 and 13 dolllar': 0.0, + 'between 9 and 15 euro': 0.0, + 'between 9 and 58 euro': 0.0, + 'bteween 11 and 15 euro': 0.0, + 'bteween 15 and 22 euro': 0.0, + 'bteween 22 and 37': 0.0, + 'bteween 30 and 58 euro': 0.0, + 'bteween 51 and 73 euro': 0.0, + 'netween 20 and 30 euro': 0.0}, + 'pricerange': {'**NONE**': 1.0, + 'cheap': 0.0, + 'dontcare': 0.0, + 'expensive': 0.0, + 'moderate': 0.0}, + 'requested': {'addr': 1.0, + 'allowedforkids': 0.0, + 'area': 0.0, + 'food': 0.0, + 'goodformeal': 0.0, + 'name': 0.0, + 'near': 0.0, + 'phone': 1, + 'postcode': 0.0, + 'price': 0.0, + 'pricerange': 0.0}}, + 'features': {'inform_info': [False, + False, + True, + False, + True, + False, + False, + True, + False, + True, + False, + 
False, + True, + False, + True, + False, + False, + True, + False, + True, + False, + False, + True, + False, + True], + 'informedVenueSinceNone': ['great eastern restaurant', + 'great eastern restaurant'], + 'lastActionInformNone': False, + 'lastInformedVenue': 'great eastern restaurant', + 'offerHappened': False}, + 'userActs': [('request(name="great eastern restaurant",phone)', 1.0)]} + b2 = {'beliefs': {'allowedforkids': {'**NONE**': 0.014367834316388661, + '0': 0.009175995595522114, + '1': 0.9579333306577846, + 'dontcare': 0.01852283943030468}, + 'area': {'**NONE**': 0.9753165718480455, + 'alamo square': 0.0, + 'amanico ergina village': 0.0, + 'anza vista': 0.0, + 'ashbury heights': 0.0, + 'balboa terrace': 0.0, + 'bayview district': 0.0, + 'bayview heights': 0.0, + 'bernal heights': 0.0, + 'bernal heights north': 0.0, + 'bernal heights south': 0.0, + 'buena vista park': 0.0, + 'castro': 0.0, + 'cathedral hill': 0.0, + 'cayuga terrace': 0.0, + 'central richmond': 0.0, + 'central sunset': 0.0, + 'central waterfront': 0.0, + 'chinatown': 0.0, + 'civic center': 0.0, + 'clarendon heights': 0.0, + 'cole valley': 0.0, + 'corona heights': 0.0, + 'cow hollow': 0.0, + 'crocker amazon': 0.0, + 'diamond heights': 0.0, + 'doelger city': 0.0, + 'dogpatch': 0.0, + 'dolores heights': 0.0, + 'dontcare': 0.0, + 'downtown': 0.0, + 'duboce triangle': 0.0, + 'embarcadero': 0.0, + 'eureka valley': 0.0, + 'eureka valley dolores heights': 0.0, + 'excelsior': 0.0, + 'financial district': 0.0, + 'financial district south': 0.0, + 'fishermans wharf': 0.0, + 'forest hill': 0.0, + 'forest hill extension': 0.0, + 'forest knolls': 0.0, + 'fort mason': 0.0, + 'fort winfield scott': 0.0, + 'frederick douglass haynes gardens': 0.0, + 'friendship village': 0.0, + 'glen park': 0.0, + 'glenridge': 0.0, + 'golden gate heights': 0.0, + 'golden gate park': 0.0, + 'haight ashbury': 0.0, + 'hayes valley': 0.0, + 'hunters point': 0.0, + 'india basin': 0.0, + 'ingleside': 0.0, + 'ingleside heights': 0.0, + 'ingleside terrace': 0.0, + 'inner mission': 0.0, + 'inner parkside': 0.0, + 'inner richmond': 0.0, + 'inner sunset': 0.0, + 'inset': 0.0, + 'jordan park': 0.0, + 'laguna honda': 0.0, + 'lake': 0.0, + 'lake shore': 0.0, + 'lakeside': 0.0, + 'laurel heights': 0.0, + 'lincoln park': 0.0, + 'lincoln park lobos': 0.0, + 'little hollywood': 0.0, + 'little italy': 0.0, + 'little osaka': 0.0, + 'little russia': 0.0, + 'lone mountain': 0.0, + 'lower haight': 0.0, + 'lower nob hill': 0.0, + 'lower pacific heights': 0.0, + 'malcolm x square': 0.0, + 'marcus garvey square': 0.0, + 'marina district': 0.0, + 'martin luther king square': 0.0, + 'mastro': 0.0, + 'merced heights': 0.0, + 'merced manor': 0.0, + 'midtown terrace': 0.0, + 'miraloma park': 0.0, + 'mission bay': 0.0, + 'mission district': 0.0, + 'mission dolores': 0.0, + 'mission terrace': 0.0, + 'monterey heights': 0.0, + 'mount davidson manor': 0.0, + 'nob hill': 0.0, + 'noe valley': 0.0, + 'noma': 0.0, + 'north beach': 0.0, + 'north panhandle': 0.0, + 'north park': 0.0, + 'north waterfront': 0.0, + 'oceanview': 0.0, + 'opera plaza': 0.0, + 'outer mission': 0.0, + 'outer parkside': 0.0, + 'outer richmond': 0.0, + 'outer sunset': 0.0, + 'outset': 0.0, + 'pacific heights': 0.0, + 'panhandle': 0.0, + 'park merced': 0.0, + 'parkmerced': 0.0, + 'parkside': 0.0, + 'pine lake park': 0.0, + 'portola': 0.0, + 'potrero flats': 0.0, + 'potrero hill': 0.0, + 'presidio': 0.0, + 'presidio heights': 0.0, + 'richmond district': 0.0, + 'russian hill': 0.0, + 'saint francis wood': 0.0, + 'san 
francisco airport': 0.0, + 'san francisco state university': 0.0, + 'sea cliff': 0.0, + 'sherwood forest': 0.0, + 'showplace square': 0.0, + 'silver terrace': 0.0, + 'somisspo': 0.0, + 'south basin': 0.0, + 'south beach': 0.0, + 'south of market': 0.0, + 'st francis square': 0.0, + 'st francis wood': 0.0, + 'stonestown': 0.024683428151954484, + 'sunnydale': 0.0, + 'sunnyside': 0.0, + 'sunset district': 0.0, + 'telegraph hill': 0.0, + 'tenderloin': 0.0, + 'thomas paine square': 0.0, + 'transmission': 0.0, + 'treasure island': 0.0, + 'twin peaks': 0.0, + 'twin peaks west': 0.0, + 'upper market': 0.0, + 'van ness': 0.0, + 'victoria mews': 0.0, + 'visitacion valley': 0.0, + 'vista del monte': 0.0, + 'west of twin peaks': 0.0, + 'west portal': 0.0, + 'western addition': 0.0, + 'westlake and olympic': 0.0, + 'westwood highlands': 0.0, + 'westwood park': 0.0, + 'yerba buena island': 0.0, + 'zion district': 0.0}, + 'discourseAct': {'ack': 0.0, + 'bye': 0.0, + 'hello': 0.0, + 'none': 0.9999999999999998, + 'repeat': 0.0, + 'silence': 0.0, + 'thankyou': 0.0}, + 'food': {'**NONE**': 1.0, + 'afghan': 0.0, + 'arabian': 0.0, + 'asian': 0.0, + 'basque': 0.0, + 'brasseries': 0.0, + 'brazilian': 0.0, + 'buffets': 0.0, + 'burgers': 0.0, + 'burmese': 0.0, + 'cafes': 0.0, + 'cambodian': 0.0, + 'cantonese': 0.0, + 'chinese': 0.0, + 'comfort food': 0.0, + 'creperies': 0.0, + 'dim sum': 0.0, + 'dontcare': 0.0, + 'ethiopian': 0.0, + 'ethnic food': 0.0, + 'french': 0.0, + 'gluten free': 0.0, + 'himalayan': 0.0, + 'indian': 0.0, + 'indonesian': 0.0, + 'indpak': 0.0, + 'italian': 0.0, + 'japanese': 0.0, + 'korean': 0.0, + 'kosher': 0.0, + 'latin': 0.0, + 'lebanese': 0.0, + 'lounges': 0.0, + 'malaysian': 0.0, + 'mediterranean': 0.0, + 'mexican': 0.0, + 'middle eastern': 0.0, + 'modern european': 0.0, + 'moroccan': 0.0, + 'new american': 0.0, + 'pakistani': 0.0, + 'persian': 0.0, + 'peruvian': 0.0, + 'pizza': 0.0, + 'raw food': 0.0, + 'russian': 0.0, + 'sandwiches': 0.0, + 'sea food': 0.0, + 'shanghainese': 0.0, + 'singaporean': 0.0, + 'soul food': 0.0, + 'spanish': 0.0, + 'steak': 0.0, + 'sushi': 0.0, + 'taiwanese': 0.0, + 'tapas': 0.0, + 'thai': 0.0, + 'traditionnal american': 0.0, + 'turkish': 0.0, + 'vegetarian': 0.0, + 'vietnamese': 0.0}, + 'goodformeal': {'**NONE**': 1.0, + 'breakfast': 0.0, + 'brunch': 0.0, + 'dinner': 0.0, + 'dontcare': 0.0, + 'lunch': 0.0}, + 'method': {'byalternatives': 0.0, + 'byconstraints': 0.7725475751076113, + 'byname': 0.0, + 'finished': 0.0, + 'none': 0.0, + 'restart': 0.0}, + 'name': {'**NONE**': 1.0, + 'a 16': 0.0, + 'a la turca restaurant': 0.0, + 'abacus': 0.0, + 'alamo square seafood grill': 0.0, + 'albona ristorante istriano': 0.0, + 'alborz persian cuisine': 0.0, + 'allegro romano': 0.0, + 'amarena': 0.0, + 'amber india': 0.0, + 'ame': 0.0, + 'ananda fuara': 0.0, + 'anchor oyster bar': 0.0, + 'angkor borei restaurant': 0.0, + 'aperto restaurant': 0.0, + 'ar roi restaurant': 0.0, + 'arabian nights restaurant': 0.0, + 'assab eritrean restaurant': 0.0, + 'atelier crenn': 0.0, + 'aux delices restaurant': 0.0, + 'aziza': 0.0, + 'b star bar': 0.0, + 'bar crudo': 0.0, + 'beijing restaurant': 0.0, + 'bella trattoria': 0.0, + 'benu': 0.0, + 'betelnut': 0.0, + 'bistro central parc': 0.0, + 'bix': 0.0, + 'borgo': 0.0, + 'borobudur restaurant': 0.0, + 'bouche': 0.0, + 'boulevard': 0.0, + 'brothers restaurant': 0.0, + 'bund shanghai restaurant': 0.0, + 'burma superstar': 0.0, + 'butterfly': 0.0, + 'cafe claude': 0.0, + 'cafe jacqueline': 0.0, + 'campton place restaurant': 0.0, + 'canteen': 
0.0, + 'canto do brasil restaurant': 0.0, + 'capannina': 0.0, + 'capital restaurant': 0.0, + 'chai yo thai restaurant': 0.0, + 'chaya brasserie': 0.0, + 'chenery park': 0.0, + 'chez maman': 0.0, + 'chez papa bistrot': 0.0, + 'chez spencer': 0.0, + 'chiaroscuro': 0.0, + 'chouchou': 0.0, + 'chow': 0.0, + 'city view restaurant': 0.0, + 'claudine': 0.0, + 'coi': 0.0, + 'colibri mexican bistro': 0.0, + 'coqueta': 0.0, + 'crustacean restaurant': 0.0, + 'da flora a venetian osteria': 0.0, + 'darbar restaurant': 0.0, + 'delancey street restaurant': 0.0, + 'delfina': 0.0, + 'dong baek restaurant': 0.0, + 'dosa on fillmore': 0.0, + 'dosa on valencia': 0.0, + 'eiji': 0.0, + 'enjoy vegetarian restaurant': 0.0, + 'espetus churrascaria': 0.0, + 'fang': 0.0, + 'farallon': 0.0, + 'fattoush restaurant': 0.0, + 'fifth floor': 0.0, + 'fino restaurant': 0.0, + 'firefly': 0.0, + 'firenze by night ristorante': 0.0, + 'fleur de lys': 0.0, + 'fog harbor fish house': 0.0, + 'forbes island': 0.0, + 'foreign cinema': 0.0, + 'frances': 0.0, + 'franchino': 0.0, + 'franciscan crab restaurant': 0.0, + 'frascati': 0.0, + 'fresca': 0.0, + 'fringale': 0.0, + 'fujiyama ya japanese restaurant': 0.0, + 'gajalee': 0.0, + 'gamine': 0.0, + 'garcon restaurant': 0.0, + 'gary danko': 0.0, + 'gitane': 0.0, + 'golden era restaurant': 0.0, + 'gracias madre': 0.0, + 'great eastern restaurant': 0.0, + 'hakka restaurant': 0.0, + 'hakkasan': 0.0, + 'han second kwan': 0.0, + 'heirloom cafe': 0.0, + 'helmand palace': 0.0, + 'hi dive': 0.0, + 'hillside supper club': 0.0, + 'hillstone': 0.0, + 'hong kong clay pot restaurant': 0.0, + 'house of nanking': 0.0, + 'house of prime rib': 0.0, + 'hunan homes restaurant': 0.0, + 'incanto': 0.0, + 'isa': 0.0, + 'jannah': 0.0, + 'jasmine garden': 0.0, + 'jitlada thai cuisine': 0.0, + 'kappa japanese restaurant': 0.0, + 'kim thanh restaurant': 0.0, + 'kirin chinese restaurant': 0.0, + 'kiss seafood': 0.0, + 'kokkari estiatorio': 0.0, + 'la briciola': 0.0, + 'la ciccia': 0.0, + 'la folie': 0.0, + 'la mediterranee': 0.0, + 'la traviata': 0.0, + 'lahore karahi': 0.0, + 'lavash': 0.0, + 'le charm': 0.0, + 'le colonial': 0.0, + 'le soleil': 0.0, + 'lime tree southeast asian kitchen': 0.0, + 'little delhi': 0.0, + 'little nepal': 0.0, + 'luce': 0.0, + 'lucky creation restaurant': 0.0, + 'luella': 0.0, + 'lupa': 0.0, + 'm y china': 0.0, + 'maki restaurant': 0.0, + 'mangia tutti ristorante': 0.0, + 'manna': 0.0, + 'marlowe': 0.0, + 'marnee thai': 0.0, + 'maverick': 0.0, + 'mela tandoori kitchen': 0.0, + 'mescolanza': 0.0, + 'mezes': 0.0, + 'michael mina restaurant': 0.0, + 'millennium': 0.0, + 'minako organic japanese restaurant': 0.0, + 'minami restaurant': 0.0, + 'mission chinese food': 0.0, + 'mochica': 0.0, + 'modern thai': 0.0, + 'mona lisa restaurant': 0.0, + 'mozzeria': 0.0, + 'muguboka restaurant': 0.0, + 'my tofu house': 0.0, + 'nicaragua restaurant': 0.0, + 'nob hill cafe': 0.0, + 'nopa': 0.0, + 'old jerusalem restaurant': 0.0, + 'old skool cafe': 0.0, + 'one market restaurant': 0.0, + 'orexi': 0.0, + 'original us restaurant': 0.0, + 'osha thai': 0.0, + 'oyaji restaurant': 0.0, + 'ozumo': 0.0, + 'pad thai restaurant': 0.0, + 'panta rei restaurant': 0.0, + 'park tavern': 0.0, + 'pera': 0.0, + 'piperade': 0.0, + 'ploy 2': 0.0, + 'poc chuc': 0.0, + 'poesia': 0.0, + 'prospect': 0.0, + 'quince': 0.0, + 'radius san francisco': 0.0, + 'range': 0.0, + 'red door cafe': 0.0, + 'restaurant ducroix': 0.0, + 'ristorante bacco': 0.0, + 'ristorante ideale': 0.0, + 'ristorante milano': 0.0, + 'ristorante parma': 0.0, 
+ 'rn74': 0.0, + 'rue lepic': 0.0, + 'saha': 0.0, + 'sai jai thai restaurant': 0.0, + 'salt house': 0.0, + 'san tung chinese restaurant': 0.0, + 'san wang restaurant': 0.0, + 'sanjalisco': 0.0, + 'sanraku': 0.0, + 'seasons': 0.0, + 'seoul garden': 0.0, + 'seven hills': 0.0, + 'shangri la vegetarian restaurant': 0.0, + 'singapore malaysian restaurant': 0.0, + 'skool': 0.0, + 'so': 0.0, + 'sotto mare': 0.0, + 'source': 0.0, + 'specchio ristorante': 0.0, + 'spruce': 0.0, + 'straits restaurant': 0.0, + 'stroganoff restaurant': 0.0, + 'sunflower potrero hill': 0.0, + 'sushi bistro': 0.0, + 'taiwan restaurant': 0.0, + 'tanuki restaurant': 0.0, + 'tataki': 0.0, + 'tekka japanese restaurant': 0.0, + 'thai cottage restaurant': 0.0, + 'thai house express': 0.0, + 'thai idea vegetarian': 0.0, + 'thai time restaurant': 0.0, + 'thanh long': 0.0, + 'the big 4 restaurant': 0.0, + 'the blue plate': 0.0, + 'the house': 0.0, + 'the richmond': 0.0, + 'the slanted door': 0.0, + 'the stinking rose': 0.0, + 'thep phanom thai restaurant': 0.0, + 'tommys joynt': 0.0, + 'toraya japanese restaurant': 0.0, + 'town hall': 0.0, + 'trattoria contadina': 0.0, + 'tu lan': 0.0, + 'tuba restaurant': 0.0, + 'u lee restaurant': 0.0, + 'udupi palace': 0.0, + 'venticello ristorante': 0.0, + 'vicoletto': 0.0, + 'yank sing': 0.0, + 'yummy yummy': 0.0, + 'z and y restaurant': 0.0, + 'zadin': 0.0, + 'zare at fly trap': 0.0, + 'zarzuela': 0.0, + 'zen yai thai restaurant': 0.0, + 'zuni cafe': 0.0, + 'zushi puzzle': 0.0}, + 'near': {'**NONE**': 0.13300733496332517, + 'bayview hunters point': 0.0, + 'dontcare': 0.15859820700896493, + 'haight': 0.0, + 'japantown': 0.038712306438467806, + 'marina cow hollow': 0.0, + 'mission': 0.0, + 'nopa': 0.669682151589242, + 'north beach telegraph hill': 0.0, + 'soma': 0.0, + 'union square': 0.0}, + 'price': {'**NONE**': 1.0, + '10 dollar': 0.0, + '10 euro': 0.0, + '11 euro': 0.0, + '15 euro': 0.0, + '18 euro': 0.0, + '20 euro': 0.0, + '22 euro': 0.0, + '25 euro': 0.0, + '26 euro': 0.0, + '29 euro': 0.0, + '37 euro': 0.0, + '6': 0.0, + '7': 0.0, + '9': 0.0, + 'between 0 and 15 euro': 0.0, + 'between 10 and 13 euro': 0.0, + 'between 10 and 15 euro': 0.0, + 'between 10 and 18 euro': 0.0, + 'between 10 and 20 euro': 0.0, + 'between 10 and 23 euro': 0.0, + 'between 10 and 30 euro': 0.0, + 'between 11 and 15 euro': 0.0, + 'between 11 and 18 euro': 0.0, + 'between 11 and 22 euro': 0.0, + 'between 11 and 25 euro': 0.0, + 'between 11 and 29 euro': 0.0, + 'between 11 and 35 euro': 0.0, + 'between 13 and 15 euro': 0.0, + 'between 13 and 18 euro': 0.0, + 'between 13 and 24 euro': 0.0, + 'between 15 and 18 euro': 0.0, + 'between 15 and 22 euro': 0.0, + 'between 15 and 26 euro': 0.0, + 'between 15 and 29 euro': 0.0, + 'between 15 and 33 euro': 0.0, + 'between 15 and 44 euro': 0.0, + 'between 15 and 58 euro': 0.0, + 'between 18 and 26 euro': 0.0, + 'between 18 and 29 euro': 0.0, + 'between 18 and 44 euro': 0.0, + 'between 18 and 55 euro': 0.0, + 'between 18 and 58 euro': 0.0, + 'between 18 and 73 euro': 0.0, + 'between 18 and 78 euro': 0.0, + 'between 2 and 15 euro': 0.0, + 'between 20 and 30 euro': 0.0, + 'between 21 and 23 euro': 0.0, + 'between 22 and 29 euro': 0.0, + 'between 22 and 30 dollar': 0.0, + 'between 22 and 37 euro': 0.0, + 'between 22 and 58 euro': 0.0, + 'between 22 and 73 euro': 0.0, + 'between 23 and 29': 0.0, + 'between 23 and 29 euro': 0.0, + 'between 23 and 37 euro': 0.0, + 'between 23 and 58': 0.0, + 'between 23 and 58 euro': 0.0, + 'between 26 and 33 euro': 0.0, + 'between 26 and 34 euro': 
0.0, + 'between 26 and 37 euro': 0.0, + 'between 29 and 37 euro': 0.0, + 'between 29 and 44 euro': 0.0, + 'between 29 and 58 euro': 0.0, + 'between 29 and 73 euro': 0.0, + 'between 30 and 58': 0.0, + 'between 30 and 58 euro': 0.0, + 'between 31 and 50 euro': 0.0, + 'between 37 and 110 euro': 0.0, + 'between 37 and 44 euro': 0.0, + 'between 37 and 58 euro': 0.0, + 'between 4 and 22 euro': 0.0, + 'between 4 and 58 euro': 0.0, + 'between 5 an 30 euro': 0.0, + 'between 5 and 10 euro': 0.0, + 'between 5 and 11 euro': 0.0, + 'between 5 and 15 dollar': 0.0, + 'between 5 and 20 euro': 0.0, + 'between 5 and 25 euro': 0.0, + 'between 6 and 10 euro': 0.0, + 'between 6 and 11 euro': 0.0, + 'between 6 and 15 euro': 0.0, + 'between 6 and 29 euro': 0.0, + 'between 7 and 11 euro': 0.0, + 'between 7 and 13 euro': 0.0, + 'between 7 and 15 euro': 0.0, + 'between 7 and 37 euro': 0.0, + 'between 8 and 22 euro': 0.0, + 'between 9 and 13 dolllar': 0.0, + 'between 9 and 15 euro': 0.0, + 'between 9 and 58 euro': 0.0, + 'bteween 11 and 15 euro': 0.0, + 'bteween 15 and 22 euro': 0.0, + 'bteween 22 and 37': 0.0, + 'bteween 30 and 58 euro': 0.0, + 'bteween 51 and 73 euro': 0.0, + 'netween 20 and 30 euro': 0.0}, + 'pricerange': {'**NONE**': 0.22571148184494605, + 'cheap': 0.0, + 'dontcare': 0.774288518155054, + 'expensive': 0.0, + 'moderate': 0.0}, + 'requested': {'addr': 0.0, + 'allowedforkids': 0.0, + 'area': 0.0, + 'food': 0.0, + 'goodformeal': 0.0, + 'name': 0.0, + 'near': 0.0, + 'phone': 0.0, + 'postcode': 0.0, + 'price': 0.0, + 'pricerange': 0.0}}, + 'features': {'inform_info': [False, + False, + False, + True, + True, + False, + False, + False, + True, + True, + False, + True, + False, + False, + False, + False, + True, + False, + False, + False, + False, + True, + False, + False, + False], + 'informedVenueSinceNone': [], + 'lastActionInformNone': False, + 'lastInformedVenue': '', + 'offerHappened': False}, + 'userActs': [('inform(allowedforkids="1")', 0.90842356395668944), + ('inform(allowedforkids="dontcare")', 0.0091759955955221153), + ('inform(allowedforkids="0")', 0.0091759955955221153), + ('inform(postcode)', 0.025509267755551478), + ('inform(area="stonestown")', 0.024683428151954491), + ('null()', 0.023031748944760511)]} + + b3 = {'beliefs': {'area': {'**NONE**': 0.12910550615265692, + 'centre': 0.8338099777773861, + 'dontcare': 0.0, + 'east': 0.03708451606995696, + 'north': 0.0, + 'south': 0.0, + 'west': 0.0}, + 'discourseAct': {'ack': 0.0, + 'bye': 0.0, + 'hello': 0.0, + 'none': 1.0, + 'repeat': 0.0, + 'silence': 0.0, + 'thankyou': 0.0}, + 'food': {'**NONE**': 0.020895546925810415, + 'afghan': 0.0, + 'african': 0.0, + 'afternoon tea': 0.0, + 'asian oriental': 0.0, + 'australasian': 0.0, + 'australian': 0.0, + 'austrian': 0.0, + 'barbeque': 0.0, + 'basque': 0.0, + 'belgian': 0.0, + 'bistro': 0.0, + 'brazilian': 0.0, + 'british': 0.0, + 'canapes': 0.0, + 'cantonese': 0.0, + 'caribbean': 0.0, + 'catalan': 0.0, + 'chinese': 0.0, + 'christmas': 0.0, + 'corsica': 0.0, + 'creative': 0.0, + 'crossover': 0.0, + 'cuban': 0.0, + 'danish': 0.0, + 'dontcare': 0.0, + 'eastern european': 0.0, + 'english': 0.0, + 'eritrean': 0.0, + 'european': 0.0, + 'french': 0.0, + 'fusion': 0.0, + 'gastropub': 0.0, + 'german': 0.0, + 'greek': 0.0, + 'halal': 0.0, + 'hungarian': 0.0, + 'indian': 0.0, + 'indonesian': 0.0, + 'international': 0.0, + 'irish': 0.0, + 'italian': 0.0, + 'jamaican': 0.0, + 'japanese': 0.0, + 'korean': 0.0, + 'kosher': 0.0, + 'latin american': 0.0, + 'lebanese': 0.0, + 'light bites': 0.0, + 'malaysian': 0.0, 
+ 'mediterranean': 0.9791044530741896, + 'mexican': 0.0, + 'middle eastern': 0.0, + 'modern american': 0.0, + 'modern eclectic': 0.0, + 'modern european': 0.0, + 'modern global': 0.0, + 'molecular gastronomy': 0.0, + 'moroccan': 0.0, + 'new zealand': 0.0, + 'north african': 0.0, + 'north american': 0.0, + 'north indian': 0.0, + 'northern european': 0.0, + 'panasian': 0.0, + 'persian': 0.0, + 'polish': 0.0, + 'polynesian': 0.0, + 'portuguese': 0.0, + 'romanian': 0.0, + 'russian': 0.0, + 'scandinavian': 0.0, + 'scottish': 0.0, + 'seafood': 0.0, + 'singaporean': 0.0, + 'south african': 0.0, + 'south indian': 0.0, + 'spanish': 0.0, + 'sri lankan': 0.0, + 'steakhouse': 0.0, + 'swedish': 0.0, + 'swiss': 0.0, + 'thai': 0.0, + 'the americas': 0.0, + 'traditional': 0.0, + 'turkish': 0.0, + 'tuscan': 0.0, + 'unusual': 0.0, + 'vegetarian': 0.0, + 'venetian': 0.0, + 'vietnamese': 0.0, + 'welsh': 0.0, + 'world': 0.0}, + 'method': {'byalternatives': 0.0, + 'byconstraints': 0.6359877465366015, + 'byname': 0.0, + 'finished': 0.0, + 'none': 0.0, + 'restart': 0.0}, + 'name': {'**NONE**': 1.0, + 'ali baba': 0.0, + 'anatolia': 0.0, + 'ask': 0.0, + 'backstreet bistro': 0.0, + 'bangkok city': 0.0, + 'bedouin': 0.0, + 'bloomsbury restaurant': 0.0, + 'caffe uno': 0.0, + 'cambridge lodge restaurant': 0.0, + 'charlie chan': 0.0, + 'chiquito restaurant bar': 0.0, + 'city stop restaurant': 0.0, + 'clowns cafe': 0.0, + 'cocum': 0.0, + 'cote': 0.0, + 'cotto': 0.0, + 'curry garden': 0.0, + 'curry king': 0.0, + 'curry prince': 0.0, + 'curry queen': 0.0, + 'da vince pizzeria': 0.0, + 'da vinci pizzeria': 0.0, + 'darrys cookhouse and wine shop': 0.0, + 'de luca cucina and bar': 0.0, + 'dojo noodle bar': 0.0, + 'don pasquale pizzeria': 0.0, + 'efes restaurant': 0.0, + 'eraina': 0.0, + 'fitzbillies restaurant': 0.0, + 'frankie and bennys': 0.0, + 'galleria': 0.0, + 'golden house': 0.0, + 'golden wok': 0.0, + 'gourmet burger kitchen': 0.0, + 'graffiti': 0.0, + 'grafton hotel restaurant': 0.0, + 'hakka': 0.0, + 'hk fusion': 0.0, + 'hotel du vin and bistro': 0.0, + 'india house': 0.0, + 'j restaurant': 0.0, + 'jinling noodle bar': 0.0, + 'kohinoor': 0.0, + 'kymmoy': 0.0, + 'la margherita': 0.0, + 'la mimosa': 0.0, + 'la raza': 0.0, + 'la tasca': 0.0, + 'lan hong house': 0.0, + 'little seoul': 0.0, + 'loch fyne': 0.0, + 'mahal of cambridge': 0.0, + 'maharajah tandoori restaurant': 0.0, + 'meghna': 0.0, + 'meze bar restaurant': 0.0, + 'michaelhouse cafe': 0.0, + 'midsummer house restaurant': 0.0, + 'nandos': 0.0, + 'nandos city centre': 0.0, + 'panahar': 0.0, + 'peking restaurant': 0.0, + 'pipasha restaurant': 0.0, + 'pizza express': 0.0, + 'pizza express fen ditton': 0.0, + 'pizza hut': 0.0, + 'pizza hut cherry hinton': 0.0, + 'pizza hut city centre': 0.0, + 'pizza hut fen ditton': 0.0, + 'prezzo': 0.0, + 'rajmahal': 0.0, + 'restaurant alimentum': 0.0, + 'restaurant one seven': 0.0, + 'restaurant two two': 0.0, + 'rice boat': 0.0, + 'rice house': 0.0, + 'riverside brasserie': 0.0, + 'royal spice': 0.0, + 'royal standard': 0.0, + 'saffron brasserie': 0.0, + 'saigon city': 0.0, + 'saint johns chop house': 0.0, + 'sala thong': 0.0, + 'sesame restaurant and bar': 0.0, + 'shanghai family restaurant': 0.0, + 'shiraz restaurant': 0.0, + 'sitar tandoori': 0.0, + 'stazione restaurant and coffee bar': 0.0, + 'taj tandoori': 0.0, + 'tandoori palace': 0.0, + 'tang chinese': 0.0, + 'thanh binh': 0.0, + 'the cambridge chop house': 0.0, + 'the copper kettle': 0.0, + 'the cow pizza kitchen and bar': 0.0, + 'the gandhi': 0.0, + 'the gardenia': 
0.0, + 'the golden curry': 0.0, + 'the good luck chinese food takeaway': 0.0, + 'the hotpot': 0.0, + 'the lucky star': 0.0, + 'the missing sock': 0.0, + 'the nirala': 0.0, + 'the oak bistro': 0.0, + 'the river bar steakhouse and grill': 0.0, + 'the slug and lettuce': 0.0, + 'the varsity restaurant': 0.0, + 'travellers rest': 0.0, + 'ugly duckling': 0.0, + 'venue': 0.0, + 'wagamama': 0.0, + 'yippee noodle bar': 0.0, + 'yu garden': 0.0, + 'zizzi cambridge': 0.0}, + 'pricerange': {'**NONE**': 0.1340777132648503, + 'cheap': 0.0, + 'dontcare': 0.8659222867351497, + 'expensive': 0.0, + 'moderate': 0.0}, + 'requested': {'addr': 0.0, + 'area': 0.0, + 'description': 0.0, + 'food': 0.0, + 'name': 0.0, + 'phone': 0.0, + 'postcode': 0.0, + 'pricerange': 0.0, + 'signature': 0.0}}, + 'features': {'inform_info': [False, + False, + True, + False, + True, + False, + False, + True, + False, + False, + False, + False, + True, + False, + False, + False, + False, + True, + False, + False, + False, + False, + True, + False, + False], + 'informedVenueSinceNone': [], + 'lastActionInformNone': False, + 'lastInformedVenue': '', + 'offerHappened': False}, + 'userActs': [('inform(food="mediterranean")', 0.84415346579983519), + ('inform(area="east")', 0.037084516069956962), + ('null()', 0.048530354363153554), + ('reqmore()', 0.04541708634740408), + ('confirm(phone)', 0.024814577419650211)]} + + return b1, b2, b3 + + +def main(): + """ + unit test + :return: + """ + + Settings.init('config/Tut-gp-Multidomain.cfg', 12345) + Ontology.init_global_ontology() + + b1, b2, b3 = get_test_beliefs() + '''state1 = DIP_state(b1, domainString='SFRestaurants') + state2 = DIP_state(b2, domainString='SFRestaurants') + state3 = DIP_state(b3, domainString='CamRestaurants')''' + state1 = padded_state(b1, domainString='SFRestaurants') + state2 = padded_state(b2, domainString='SFRestaurants') + state3 = padded_state(b3, domainString='CamRestaurants') + print(state1.get_beliefStateVec('area')[:state1.max_v]) + print(len(state2.get_beliefStateVec('near'))-state2.max_v) + print(len(state3.get_beliefStateVec('pricerange'))-state3.max_v) + #print len(state3.get_beliefStateVec('general')) + s2 = state2.get_beliefStateVec('food') + s3 = state3.get_beliefStateVec('food') + a=1 + #print state3.get_beliefStateVec('general')[:state2.max_v] + #print state2.max_v + #print state3.max_v + + +if __name__ == '__main__': + main() + diff --git a/policy/feudalgainRL/DQNPolicy_latest.py b/policy/feudalgainRL/DQNPolicy_latest.py new file mode 100644 index 0000000000000000000000000000000000000000..559e5bc4f55174efffac683a31cd9c9a6481f198 --- /dev/null +++ b/policy/feudalgainRL/DQNPolicy_latest.py @@ -0,0 +1,789 @@ +############################################################################### +# PyDial: Multi-domain Statistical Spoken Dialogue System Software +############################################################################### +# +# Copyright 2015 - 2019 +# Cambridge University Engineering Department Dialogue Systems Group +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# +############################################################################### + +''' +DQNPolicy.py - deep Q network policy +================================================== + +Author: Pei-Hao (Eddy) Su (Copyright CUED Dialogue Systems Group 2016) + +.. seealso:: CUED Imports/Dependencies: + + import :class:`Policy` + import :class:`utils.ContextLogger` + +.. warning:: + Documentation not done. + + +************************ + +''' + +import copy +import sys +import os +import json +import numpy as np +import pickle as pickle +import random +import utils +from utils.Settings import config as cfg +from utils import ContextLogger + +import ontology.FlatOntologyManager as FlatOnt +#from theano_dialogue.util.tool import * + +import tensorflow as tf +from policy.DRL.replay_buffer import ReplayBuffer +from policy.DRL.replay_prioritised import ReplayPrioritised +import policy.DRL.utils as drlutils +import policy.DRL.dqn as dqn +import policy.Policy +import policy.SummaryAction +from policy.Policy import TerminalAction, TerminalState +import policy.GPPolicy + +logger = utils.ContextLogger.getLogger('') + +# --- for flattening the belief --- # +domainUtil = FlatOnt.FlatDomainOntology('CamRestaurants') + +""" +def flatten_belief(gpstate): + ''' + Flatten the GP-dictionary-typed belief state to a one-dim vector + ''' + + if isinstance(gpstate, TerminalState): + return [0] * 304 #260 #264 + + flat_belief = [] + for key, value in gpstate._bstate.items(): + flat_belief += value + + return flat_belief +""" + +def flatten_belief(belief,domainUtil=FlatOnt.FlatDomainOntology('CamRestaurants'), merge=False): + if isinstance(belief, TerminalState): + return [0] * 260 #264 + + #for key, value in belief.items(): + # print key, value + + #policyfeatures = ['full','method','discourseAct','requested'] + policyfeatures = ['full','method','discourseAct','requested',\ + 'lastActionInformNone','offerHappened','inform_info'] + + flat_belief = [] + for feat in policyfeatures: + add_feature = [] + if feat == 'kbest': + for slot in self.domainUtil.sorted_system_requestable_slots: + # print slot, 'belief', belief['beliefs'][slot] + temp = [belief['beliefs'][slot][value] for value in domainUtil.ontology['informable'][slot]] + temp = sorted(temp, key=lambda b: -b) + #temp = [belief['beliefs'][slot]['dontcare']] + [belief['beliefs'][slot]['**NONE**']] + temp + temp = temp + [belief['beliefs'][slot]['dontcare']] + [belief['beliefs'][slot]['**NONE**']] + temp = temp[0:self.max_k] + add_feature += temp + elif feat == 'full': + #for slot in self.sorted_slots: + for slot in domainUtil.ontology['informable']: + for value in domainUtil.ontology['informable'][slot]:# + ['**NONE**']: + #for value in domainUtil.ontology['informable'][slot] + ['**NONE**']: + #for value in domainUtil.ontology['informable'][slot] + ['dontcare'] + ['**NONE**']: + add_feature.append(belief['beliefs'][slot][value]) + elif feat == 'method': + add_feature = [belief['beliefs']['method'][method] for method in domainUtil.ontology['method']] + elif feat == 'discourseAct': + add_feature = [belief['beliefs']['discourseAct'][discourseAct] + for discourseAct in domainUtil.ontology['discourseAct']] + elif feat == 'requested': + add_feature = [belief['beliefs']['requested'][slot] \ + for slot in domainUtil.ontology['requestable']] + elif feat == 'lastActionInformNone': + add_feature.append(float(belief['features']['lastActionInformNone'])) + elif feat == 
'offerHappened': + add_feature.append(float(belief['features']['offerHappened'])) + elif feat == 'inform_info': + add_feature += belief['features']['inform_info'] + else: + logger.error('Invalid feature name in config: ' + feat) + + flat_belief += add_feature + + return flat_belief + + + + """ + flat_belief = [] + for feat in policyfeatures: + add_feature = [] + if feat == 'full': + #for slot in self.sorted_slots: + for slot in domainUtil.ontology['informable']: + if slot == 'name': + continue + accumProb = 0.0 + for value in domainUtil.ontology['informable'][slot] + ['**NONE**']: + if value not in ('dontcare', '**NONE**'): + accumProb += float(belief['beliefs'][slot][value]) + add_feature.append(accumProb) + add_feature.append(belief['beliefs'][slot]['dontcare']) + add_feature.append(belief['beliefs'][slot]['**NONE**']) + + #add_feature.append(belief['beliefs'][slot][value]) + elif feat == 'method': + add_feature = [belief['beliefs']['method'][method] \ + for method in domainUtil.ontology['method']] + elif feat == 'discourseAct': + add_feature = [belief['beliefs']['discourseAct'][discourseAct] + for discourseAct in domainUtil.ontology['discourseAct']] + elif feat == 'requested': + add_feature = [belief['beliefs']['requested'][slot] \ + for slot in domainUtil.ontology['requestable']] + elif feat == 'lastActionInformNone': + add_feature.append(float(belief['features']['lastActionInformNone'])) + elif feat == 'offerHappened': + add_feature.append(float(belief['features']['offerHappened'])) + elif feat == 'inform_info': + add_feature += (belief['features']['inform_info']) + else: + logger.error('Invalid feature name in config: ' + feat) + + flat_belief += add_feature + return flat_belief + """ + + +class DQNPolicy(Policy.Policy): + '''Derived from :class:`Policy` + ''' + def __init__(self, in_policy_file, out_policy_file, domainString='CamRestaurants', is_training=False): + super(DQNPolicy, self).__init__(domainString, is_training) + + self.in_policy_file = in_policy_file + self.out_policy_file = out_policy_file + self.is_training = is_training + self.accum_belief = [] + self.stats = [0 for ii in range(14)] + + self.prev_state_check = None + + # parameter settings + self.n_in= 260 + if cfg.has_option('dqnpolicy_'+domainString, 'n_in'): + self.n_in = cfg.getint('dqnpolicy_'+domainString, 'n_in') + + self.actor_lr = 0.0001 + if cfg.has_option('dqnpolicy_'+domainString, 'actor_lr'): + self.actor_lr = cfg.getfloat('dqnpolicy_'+domainString, 'actor_lr') + + self.critic_lr = 0.001 + if cfg.has_option('dqnpolicy_'+domainString, 'critic_lr'): + self.critic_lr = cfg.getfloat('dqnpolicy_'+domainString, 'critic_lr') + + self.tau = 0.001 + if cfg.has_option('dqnpolicy_'+domainString, 'tau'): + self.tau = cfg.getfloat('dqnpolicy_'+domainString, 'tau') + + self.randomseed = 1234 + if cfg.has_option('GENERAL', 'seed'): + self.randomseed = cfg.getint('GENERAL', 'seed') + + self.gamma = 1.0 + if cfg.has_option('dqnpolicy_'+domainString, 'gamma'): + self.gamma = cfg.getfloat('dqnpolicy_'+domainString, 'gamma') + + self.regularisation = 'l2' + if cfg.has_option('dqnpolicy_'+domainString, 'regularisation'): + self.regularisation = cfg.get('dqnpolicy_'+domainString, 'regulariser') + + self.learning_rate = 0.001 + if cfg.has_option('dqnpolicy_'+domainString, 'learning_rate'): + self.learning_rate = cfg.getfloat('dqnpolicy_'+domainString, 'learning_rate') + + self.exploration_type = 'e-greedy' # Boltzman + if cfg.has_option('dqnpolicy_'+domainString, 'exploration_type'): + self.exploration_type = 
cfg.get('dqnpolicy_'+domainString, 'exploration_type') + + self.episodeNum = 1000 + if cfg.has_option('dqnpolicy_'+domainString, 'episodeNum'): + self.episodeNum = cfg.getfloat('dqnpolicy_'+domainString, 'episodeNum') + + self.maxiter = 5000 + if cfg.has_option('dqnpolicy_'+domainString, 'maxiter'): + self.maxiter = cfg.getfloat('dqnpolicy_'+domainString, 'maxiter') + + self.epsilon = 1 + if cfg.has_option('dqnpolicy_'+domainString, 'epsilon'): + self.epsilon = cfg.getfloat('dqnpolicy_'+domainString, 'epsilon') + + self.epsilon_start = 1 + if cfg.has_option('dqnpolicy_'+domainString, 'epsilon_start'): + self.epsilon_start = cfg.getfloat('dqnpolicy_'+domainString, 'epsilon_start') + + self.epsilon_end = 1 + if cfg.has_option('dqnpolicy_'+domainString, 'epsilon_end'): + self.epsilon_end = cfg.getfloat('dqnpolicy_'+domainString, 'epsilon_end') + + self.priorProbStart = 1.0 + if cfg.has_option('dqnpolicy_'+domainString, 'prior_sample_prob_start'): + self.priorProbStart = cfg.getfloat('dqnpolicy_'+domainString, 'prior_sample_prob_start') + + self.priorProbEnd = 0.1 + if cfg.has_option('dqnpolicy_'+domainString, 'prior_sample_prob_end'): + self.priorProbEnd = cfg.getfloat('dqnpolicy_'+domainString, 'prior_sample_prob_end') + + self.policyfeatures = [] + if cfg.has_option('dqnpolicy_'+domainString, 'features'): + logger.info('Features: ' + str(cfg.get('dqnpolicy_'+domainString, 'features'))) + self.policyfeatures = json.loads(cfg.get('dqnpolicy_'+domainString, 'features')) + + self.max_k = 5 + if cfg.has_option('dqnpolicy_'+domainString, 'max_k'): + self.max_k = cfg.getint('dqnpolicy_'+domainString, 'max_k') + + self.learning_algorithm = 'drl' + if cfg.has_option('dqnpolicy_'+domainString, 'learning_algorithm'): + self.learning_algorithm = cfg.get('dqnpolicy_'+domainString, 'learning_algorithm') + logger.info('Learning algorithm: ' + self.learning_algorithm) + + self.minibatch_size = 32 + if cfg.has_option('dqnpolicy_'+domainString, 'minibatch_size'): + self.minibatch_size = cfg.getint('dqnpolicy_'+domainString, 'minibatch_size') + + self.capacity = 1000#max(self.minibatch_size, 2000) + if cfg.has_option('dqnpolicy_'+domainString, 'capacity'): + self.capacity = max(cfg.getint('dqnpolicy_'+domainString,'capacity'), 2000) + + self.replay_type = 'vanilla' + if cfg.has_option('dqnpolicy_'+domainString, 'replay_type'): + self.replay_type = cfg.get('dqnpolicy_'+domainString, 'replay_type') + + self.architecture = 'vanilla' + if cfg.has_option('dqnpolicy_'+domainString, 'architecture'): + self.architecture = cfg.get('dqnpolicy_'+domainString, 'architecture') + + self.q_update = 'single' + if cfg.has_option('dqnpolicy_'+domainString, 'q_update'): + self.q_update = cfg.get('dqnpolicy_'+domainString, 'q_update') + + self.h1_size = 130 + if cfg.has_option('dqnpolicy_'+domainString, 'h1_size'): + self.h1_size = cfg.getint('dqnpolicy_'+domainString, 'h1_size') + + self.h2_size = 130 + if cfg.has_option('dqnpolicy_'+domainString, 'h2_size'): + self.h2_size = cfg.getint('dqnpolicy_'+domainString, 'h2_size') + + """ + self.shuffle = False + if cfg.has_option('dqnpolicy_'+domainString, 'experience_replay'): + self.shuffle = cfg.getboolean('dqnpolicy_'+domainString, 'experience_replay') + if not self.shuffle: + # If we don't use experience replay, we don't need to maintain + # sliding window of experiences with maximum capacity. 
+ # We only need to maintain the data of minibatch_size + self.capacity = self.minibatch_size + """ + + self.episode_ave_max_q = [] + + os.environ["CUDA_VISIBLE_DEVICES"]="" + + # init session + self.sess = tf.Session() + with tf.device("/cpu:0"): + + np.random.seed(self.randomseed) + tf.set_random_seed(self.randomseed) + + # initialise an replay buffer + if self.replay_type == 'vanilla': + self.episodes[self.domainString] = ReplayBuffer(self.capacity, self.minibatch_size, self.randomseed) + elif self.replay_type == 'prioritized': + self.episodes[self.domainString] = ReplayPrioritised(self.capacity, self.minibatch_size, self.randomseed) + #replay_buffer = ReplayBuffer(self.capacity, self.randomseed) + #self.episodes = [] + self.samplecount = 0 + self.episodecount = 0 + + # construct the models + self.state_dim = self.n_in + self.summaryaction = SummaryAction.SummaryAction(domainString) + self.action_dim = len(self.summaryaction.action_names) + action_bound = len(self.summaryaction.action_names) + + self.dqn = dqn.DeepQNetwork(self.sess, self.state_dim, self.action_dim, \ + self.critic_lr, self.tau, action_bound, self.architecture, self.h1_size, self.h2_size) + + # when all models are defined, init all variables + init_op = tf.initialize_all_variables() + self.sess.run(init_op) + + self.loadPolicy(self.in_policy_file) + print('loaded replay size: ', self.episodes[self.domainString].size()) + + self.dqn.update_target_network() + + # def record() has been handled... + + def act_on(self, beliefstate, hyps=None): + if self.lastSystemAction is None and self.startwithhello: + systemAct, nextaIdex = 'hello()', -1 + else: + systemAct, nextaIdex = self.nextAction(beliefstate, hyps) + self.lastSystemAction = systemAct + self.summaryAct = nextaIdex + self.prevbelief = beliefstate + return systemAct + + def record(self, reward, domainInControl=None, weight=None, state=None, action=None): + if domainInControl is None: + domainInControl = self.domainString + if self.episodes[domainInControl] is None: + self.episodes[domainInControl] = Episode(dstring=domainInControl) + if self.actToBeRecorded is None: + #self.actToBeRecorded = self.lastSystemAction + self.actToBeRecorded = self.summaryAct + + if state is None: + state = self.prevbelief + if action is None: + action = self.actToBeRecorded + + cState, cAction = self.convertStateAction(state, action) + + # normalising total return to -1~1 + #reward /= 40.0 + reward /= 20.0 + """ + reward = float(reward+10.0)/40.0 + """ + + if weight == None: + if self.replay_type == 'vanilla': + self.episodes[domainInControl].record(state=cState, \ + state_ori=state, action=cAction, reward=reward) + elif self.replay_type == 'prioritized': + + ##### calculate Q_s_t_a_t_ and gamma_Q_s_tplu1_maxa_ for PER ### + ################################################################ + cur_cState = np.vstack([np.expand_dims(x, 0) for x in [cState]]) + cur_cAction_one_hot = np.eye(self.action_dim, self.action_dim)[[cAction]] + + cur_action_q = self.dqn.predict(cur_cState, cur_cAction_one_hot) + execMask = self.summaryaction.getExecutableMask(state, cAction) + + if self.q_update == 'single': + Qs = [] + for idx, v in enumerate(execMask): + if v > -sys.maxsize: + Action_idx = np.eye(self.action_dim, self.action_dim)[[idx]] + Qidx = self.dqn.predict_target(cur_cState, Action_idx) + Qs.append(Qidx[0]) + #Qs.append(Qidx[0]) + + Q_s_t_a_t_ = cur_action_q[0] + gamma_Q_s_tplu1_maxa_ = self.gamma * np.max(Qs) + elif self.q_update == 'double': + Qs = [] + for idx, v in enumerate(execMask): + if v 
> -sys.maxsize: + Action_idx = np.eye(self.action_dim, self.action_dim)[[idx]] + Qidx = self.dqn.predict(cur_cState, Action_idx) + Qs.append(Qidx[0]) + else: + Qs.append(-sys.maxsize) + + policyQ_argmax_a = np.argmax(Qs) + policyQ_argmax_a_one_hot = np.eye(self.action_dim, self.action_dim)[[policyQ_argmax_a]] + target_value_Q = self.dqn.predict_target(cur_cState, policyQ_argmax_a_one_hot) + + Q_s_t_a_t_ = cur_action_q[0] + gamma_Q_s_tplu1_maxa_ = self.gamma * target_value_Q + + print('Q_s_t_a_t_', Q_s_t_a_t_) + print('gamma_Q_s_tplu1_maxa_', gamma_Q_s_tplu1_maxa_) + ################################################################ + + # heuristically assign 0.0 to Q_s_t_a_t_ and Q_s_tplu1_maxa_, doesn't matter as it is not used + #if self.samplecount >= self.capacity: + if True: + self.episodes[domainInControl].record(state=cState, \ + state_ori=state, action=cAction, reward=reward, \ + Q_s_t_a_t_ = Q_s_t_a_t_, gamma_Q_s_tplu1_maxa_= gamma_Q_s_tplu1_maxa_, uniform=False) + else: + self.episodes[domainInControl].record(state=cState, \ + state_ori=state, action=cAction, reward=reward, \ + Q_s_t_a_t_ = Q_s_t_a_t_, gamma_Q_s_tplu1_maxa_= gamma_Q_s_tplu1_maxa_, uniform=True) + + else: + self.episodes[domainInControl].record(state=cState, state_ori=state, action=cAction, reward=reward, ma_weight=weight) + + self.actToBeRecorded = None + self.samplecount += 1 + return + + def finalizeRecord(self, reward, domainInControl=None): + if domainInControl is None: + domainInControl = self.domainString + if self.episodes[domainInControl] is None: + logger.warning("record attempted to be finalized for domain where nothing has been recorded before") + return + + #print 'Episode Avg_Max_Q', float(self.episode_ave_max_q)/float(self.episodes[domainInControl].size()) + print('Episode Avg_Max_Q', np.mean(self.episode_ave_max_q)) + + print('saving statics') + self.saveStats() + print(self.stats) + + # normalising total return to -1~1 + #if reward == 0: + # reward = -20.0 + reward /= 20.0 + """ + if reward == 20.0: + reward = 1.0 + else: + reward = -0.5 + """ + #reward = float(reward+10.0)/40.0 + + terminal_state, terminal_action = self.convertStateAction(TerminalState(), TerminalAction()) + + if self.replay_type == 'vanilla': + self.episodes[domainInControl].record(state=terminal_state, \ + state_ori=TerminalState(), action=terminal_action, reward=reward, terminal=True) + elif self.replay_type == 'prioritized': + # heuristically assign 0.0 to Q_s_t_a_t_ and Q_s_tplu1_maxa_, doesn't matter as it is not used + #if self.samplecount >= self.capacity: + if True: + self.episodes[domainInControl].record(state=terminal_state, \ + state_ori=TerminalState(), action=terminal_action, reward=reward, \ + Q_s_t_a_t_ = 0.0, gamma_Q_s_tplu1_maxa_= 0.0, uniform=False, terminal=True) + else: + self.episodes[domainInControl].record(state=terminal_state, \ + state_ori=TerminalState(), action=terminal_action, reward=reward, \ + Q_s_t_a_t_ = 0.0, gamma_Q_s_tplu1_maxa_= 0.0, uniform=True, terminal=True) + return + + def convertStateAction(self, state, action): + ''' + nnType = 'dnn' + #nnType = 'rnn' + # expand one dimension to match the batch size of 1 at axis 0 + if nnType == 'rnn': + belief = np.expand_dims(belief,axis=0) + ''' + + if isinstance(state, TerminalState): + return [0] * 260, action #260 #264 + else: + flat_belief = flatten_belief(state) + + if flat_belief == self.prev_state_check: + print('same state') + else: + print('diff state') + self.prev_state_check = flat_belief + + return flat_belief, action + + def 
nextAction(self, beliefstate, hyps): + ''' + select next action + + :param beliefstate: + :param hyps: + :returns: (int) next summary action + ''' + #beliefVec = flatten_belief(beliefstate, domainUtil) + beliefVec = flatten_belief(beliefstate) + + execMask = self.summaryaction.getExecutableMask(beliefstate, self.lastSystemAction) + #print sum([ 1 for i in execMask if i==0.0 ]) + if self.exploration_type == 'e-greedy': + # epsilon greedy + if self.is_training and utils.Settings.random.rand() < self.epsilon: + admissible = [i for i, x in enumerate(execMask) if x == 0.0] + random.shuffle(admissible) + nextaIdex = admissible[0] + else: + admissible = [] + for idx, v in enumerate(execMask): + if v > -sys.maxsize: + Action_idx = np.eye(self.action_dim, self.action_dim)[[idx]] + Qidx = self.dqn.predict(np.reshape(beliefVec, (1, len(beliefVec))), Action_idx) + admissible.append(Qidx[0]) + else: + admissible.append(-sys.maxsize) + #action_Q = self.dqn.predict(np.reshape(beliefVec, (1, len(beliefVec))))# + (1. / (1. + i + j)) + #admissible = np.add(action_Q, np.array(execMask)) + logger.info('action Q...') + print(admissible) + nextaIdex = np.argmax(admissible) + + # add current max Q to self.episode_ave_max_q + print('current maxQ', np.max(admissible)) + self.episode_ave_max_q.append(np.max(admissible)) + + elif self.exploration_type == 'Boltzman': + # randomly assign, not complete + admissible = [i for i, x in enumerate(execMask) if x == 0.0] + random.shuffle(admissible) + nextaIdex = admissible[0] + + self.stats[nextaIdex] += 1 + summaryAct = self.summaryaction.action_names[nextaIdex] + masterAct = self.summaryaction.Convert(beliefstate, summaryAct, self.lastSystemAction) + return masterAct, nextaIdex + + def train(self): + ''' + call this function when the episode ends + ''' + + if not self.is_training: + logger.info("Not in training mode") + return + else: + logger.info("Update dqn policy parameters.") + + self.episodecount += 1 + logger.info("Sample Num so far: %s" %(self.samplecount)) + logger.info("Episode Num so far: %s" %(self.episodecount)) + #if True: + if self.samplecount >= self.minibatch_size * 3 and self.episodecount % 4 == 0: + #if self.samplecount >= self.capacity and self.episodecount % 5 == 0: + #if self.samplecount > self.minibatch_size: + #if self.samplecount > self.capacity: + logger.info('start traninig...') + + + ################################################# + ################################################# + # update TD error for all experience in PER # + ################################################# + ################################################# + """ + #s_batch, s_ori_batch, a_batch, r_batch, s2_batch, s2_ori_batch, t_batch, idx_batch, _ = \ + # self.episodes[self.domainString].all_batch() + experience, idx_batch = self.episodes[self.domainString].all_batch() + #self.episodes[self.domainString].sample_batch_vanilla_PER() + + #s_batch = np.vstack([np.expand_dims(x, 0) for x in s_batch]) + #s2_batch = np.vstack([np.expand_dims(x, 0) for x in s2_batch]) + + # self.s_prev, self.s_ori_prev, self.a_prev, self.r_prev, state, state_ori, termina + + for k in xrange(len(idx_batch)): + Q_bootstrap_label = 0 + if experience[k][-1]: # terminal + Q_bootstrap_label = experience[k][3] # reward + else: + execMask = self.summaryaction.getExecutableMask(experience[k][-2], experience[k][2]) # s_ori, a + if self.q_update == 'single': + admissible = [] + for idx, v in enumerate(execMask): + if v > -sys.maxint: + Action_idx = np.eye(self.action_dim, self.action_dim)[[idx]] + 
s2_idx = np.vstack([ np.expand_dims(x, 0) for x in [experience[k][0]] ]) # s + Qidx = self.dqn.predict_target(s2_idx, Action_idx) + admissible.append(Qidx[0]) + Q_bootstrap_label = experience[k][3] + self.gamma * np.max(admissible) # reward + elif self.q_update == 'double': + Qs = [] + for idx, v in enumerate(execMask): + if v > -sys.maxint: + Action_idx = np.eye(self.action_dim, self.action_dim)[[idx]] + s2_idx = np.vstack([ np.expand_dims(x, 0) for x in [experience[k][-3]] ]) + Qidx = self.dqn.predict(s2_idx, Action_idx) + Qs.append(Qidx[0]) + else: + Qs.append(-sys.maxint) + + policyQ_argmax_a = np.argmax(Qs) + policyQ_argmax_a_one_hot = np.eye(self.action_dim, self.action_dim)[[policyQ_argmax_a]] + s2_idx = np.vstack([ np.expand_dims(x, 0) for x in [experience[k][-3]] ]) + target_value_Q = self.dqn.predict_target(s2_idx, policyQ_argmax_a_one_hot) + + Q_bootstrap_label = experience[k][3] + self.gamma * target_value_Q + + if self.replay_type == 'prioritized': + # update the sum-tree + # update the TD error of the samples in the minibatch + current_a = np.eye(self.action_dim, self.action_dim)[[experience[k][2]]] + current_s = np.vstack([ np.expand_dims(x, 0) for x in [experience[k][0]] ]) + currentQ_s_a_ = self.dqn.predict(current_s, current_a) + currentQ_s_a_ = currentQ_s_a_[0] + error = abs(currentQ_s_a_ - Q_bootstrap_label) + self.episodes[self.domainString].update(idx_batch[k], error) + + """ + + s_batch, s_ori_batch, a_batch, r_batch, s2_batch, s2_ori_batch, t_batch, idx_batch, _ = \ + self.episodes[self.domainString].sample_batch() + #self.episodes[self.domainString].sample_batch_vanilla_PER() + + #s_batch = np.vstack([np.expand_dims(x, 0) for x in s_batch]) + #s2_batch = np.vstack([np.expand_dims(x, 0) for x in s2_batch]) + + y_i = [] + for k in range(min(self.minibatch_size, self.episodes[self.domainString].size())): + Q_bootstrap_label = 0 + if t_batch[k]: + Q_bootstrap_label = r_batch[k] + else: + execMask = self.summaryaction.getExecutableMask(s2_ori_batch[k], a_batch[k]) + if self.q_update == 'single': + admissible = [] + for idx, v in enumerate(execMask): + if v > -sys.maxsize: + Action_idx = np.eye(self.action_dim, self.action_dim)[[idx]] + s2_idx = np.vstack([ np.expand_dims(x, 0) for x in [s2_batch[k]] ]) + Qidx = self.dqn.predict_target(s2_idx, Action_idx) + admissible.append(Qidx[0]) + Q_bootstrap_label = r_batch[k] + self.gamma * np.max(admissible) + elif self.q_update == 'double': + Qs = [] + for idx, v in enumerate(execMask): + if v > -sys.maxsize: + Action_idx = np.eye(self.action_dim, self.action_dim)[[idx]] + s2_idx = np.vstack([ np.expand_dims(x, 0) for x in [s2_batch[k]] ]) + Qidx = self.dqn.predict(s2_idx, Action_idx) + Qs.append(Qidx[0]) + else: + Qs.append(-sys.maxsize) + + policyQ_argmax_a = np.argmax(Qs) + policyQ_argmax_a_one_hot = np.eye(self.action_dim, self.action_dim)[[policyQ_argmax_a]] + s2_idx = np.vstack([ np.expand_dims(x, 0) for x in [s2_batch[k]] ]) + target_value_Q = self.dqn.predict_target(s2_idx, policyQ_argmax_a_one_hot) + + Q_bootstrap_label = r_batch[k] + self.gamma * target_value_Q + y_i.append(Q_bootstrap_label) + + if self.replay_type == 'prioritized': + # update the sum-tree + # update the TD error of the samples in the minibatch + current_a = np.eye(self.action_dim, self.action_dim)[[a_batch[k]]] + current_s = np.vstack([ np.expand_dims(x, 0) for x in [s_batch[k]] ]) + currentQ_s_a_ = self.dqn.predict(current_s, current_a) + currentQ_s_a_ = currentQ_s_a_[0] + error = abs(currentQ_s_a_ - Q_bootstrap_label) + 
self.episodes[self.domainString].update(idx_batch[k], error) + + # change index-based a_batch to one-hot-based a_batch + a_batch_one_hot = np.eye(self.action_dim, self.action_dim)[a_batch] + + # Update the critic given the targets + reshaped_yi = np.vstack([np.expand_dims(x, 0) for x in y_i]) + + s_batch_expand = np.vstack([np.expand_dims(x, 0) for x in s_batch]) + """ + print s_batch_expand + print 'a_batch', a_batch + print a_batch_one_hot + print len(a_batch) + print len(y_i) + """ + #reshaped_yi = np.reshape(y_i, (min(self.minibatch_size, self.episodes[self.domainString].size()), 1)) + predicted_q_value, _, currentLoss = self.dqn.train(s_batch, a_batch_one_hot, reshaped_yi) + #predicted_q_value, _, currentLoss = self.dqn.train(s_batch_expand, a_batch_one_hot, reshaped_yi) + + print('y_i') + print(y_i) + print('currentLoss', currentLoss) + print('predict Q') + print(predicted_q_value) + + if self.episodecount % 1 == 0: + #if self.episodecount % 50 == 0: + # Update target networks + self.dqn.update_target_network() + + self.savePolicyInc() # self.out_policy_file) + + def savePolicy(self, FORCE_SAVE=False): + """ + Does not use this, cause it will be called from agent after every episode. + we want to save the policy only periodically. + """ + pass + + def savePolicyInc(self, FORCE_SAVE=False): + """ + save model and replay buffer + """ + #save_path = self.saver.save(self.sess, self.out_policy_file+'.ckpt') + self.dqn.save_network(self.out_policy_file+'.dqn.ckpt') + + f = open(self.out_policy_file+'.episode', 'wb') + for obj in [self.samplecount, self.episodes[self.domainString]]: + pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL) + f.close() + #logger.info("Saving model to %s and replay buffer..." % save_path) + + def saveStats(self, FORCE_SAVE=False): + f = open(self.out_policy_file + '.stats', 'wb') + pickle.dump(self.stats, f, protocol=pickle.HIGHEST_PROTOCOL) + f.close() + + def loadPolicy(self, filename): + """ + load model and replay buffer + """ + # load models + self.dqn.load_network(filename+'.dqn.ckpt') + + # load replay buffer + try: + print('laod from: ', filename) + f = open(filename+'.episode', 'rb') + loaded_objects = [] + for i in range(2): # load nn params and collected data + loaded_objects.append(pickle.load(f)) + self.samplecount = int(loaded_objects[0]) + self.episodes[self.domainString] = copy.deepcopy(loaded_objects[1]) + logger.info("Loading both model from %s and replay buffer..." 
% filename) + f.close() + except: + logger.info("Loading only models...") + + def restart(self): + self.summaryAct = None + self.lastSystemAction = None + self.prevbelief = None + self.actToBeRecorded = None + self.epsilon = self.epsilon_start - (self.epsilon_start - self.epsilon_end) * float(self.episodeNum+self.episodecount) / float(self.maxiter) + print('current eps', self.epsilon) + #self.episodes = dict.fromkeys(OntologyUtils.available_domains, None) + #self.episodes[self.domainString] = ReplayBuffer(self.capacity, self.randomseed) + self.episode_ave_max_q = [] + +#END OF FILE diff --git a/policy/feudalgainRL/FeudalACERPolicy.py b/policy/feudalgainRL/FeudalACERPolicy.py new file mode 100644 index 0000000000000000000000000000000000000000..19d9fccb719b8588567e8e397a9a6f2441230de2 --- /dev/null +++ b/policy/feudalgainRL/FeudalACERPolicy.py @@ -0,0 +1,457 @@ +############################################################################### +# PyDial: Multi-domain Statistical Spoken Dialogue System Software +############################################################################### +# +# Copyright 2015 - 2019 +# Cambridge University Engineering Department Dialogue Systems Group +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +############################################################################### + +''' +ACERPolicy.py - ACER - Actor Critic with Experience Replay +================================================== + +Copyright CUED Dialogue Systems Group 2015 - 2017 + +.. seealso:: CUED Imports/Dependencies: + + import :class:`Policy` + import :class:`utils.ContextLogger` + +.. warning:: + Documentation not done. + + +************************ + +''' +import copy +import os +import json +import numpy as np +import scipy +import scipy.signal +import pickle as pickle +import random +import utils +from utils.Settings import config as cfg +from utils import ContextLogger, DiaAct + +import ontology.FlatOntologyManager as FlatOnt +import tensorflow as tf +from policy.DRL.replay_buffer_episode_acer import ReplayBufferEpisode +from policy.DRL.replay_prioritised_episode import ReplayPrioritisedEpisode +import policy.DRL.utils as drlutils +from policy.ACERPolicy import ACERPolicy +import policy.DRL.acer as acer +import policy.Policy +import policy.SummaryAction +from policy.Policy import TerminalAction, TerminalState +from policy.feudalRL.DIP_parametrisation import DIP_state, padded_state + +logger = utils.ContextLogger.getLogger('') + +# Discounting function used to calculate discounted returns. 
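+# lfilter([1], [1, -gamma], x) realises the recursion y[n] = x[n] + gamma*y[n-1]; applying it to the +# reversed sequence and reversing the output yields the discounted returns R_t = x_t + gamma*R_{t+1}, +# e.g. discount([1.0, 0.0, 2.0], 0.9) gives approximately [2.62, 1.8, 2.0].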
+def discount(x, gamma): + return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1] + + +class FeudalACERPolicy(ACERPolicy): + '''Derived from :class:`Policy` + ''' + def __init__(self, in_policy_file, out_policy_file, domainString='CamRestaurants', is_training=False, + action_names=None, slot=None, sd_state_dim=50): + super(FeudalACERPolicy, self).__init__(in_policy_file, out_policy_file, domainString, is_training) + + tf.reset_default_graph() + + self.in_policy_file = in_policy_file + self.out_policy_file = out_policy_file + self.is_training = is_training + self.accum_belief = [] + self.prev_state_check = None + self.sd_state_dim = sd_state_dim + + self.domainString = domainString + self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString) + + self.features = 'dip' + self.sd_enc_size = 80 + self.si_enc_size = 40 + self.dropout_rate = 0. + if cfg.has_option('feudalpolicy', 'features'): + self.features = cfg.get('feudalpolicy', 'features') + if cfg.has_option('feudalpolicy', 'sd_enc_size'): + self.sd_enc_size = cfg.getint('feudalpolicy', 'sd_enc_size') + if cfg.has_option('feudalpolicy', 'si_enc_size'): + self.si_enc_size = cfg.getint('feudalpolicy', 'si_enc_size') + if cfg.has_option('dqnpolicy', 'dropout_rate') and self.is_training: + self.dropout_rate = cfg.getfloat('feudalpolicy', 'dropout_rate') + if cfg.has_option('dqnpolicy', 'dropout_rate') and self.is_training: + self.dropout_rate = cfg.getfloat('feudalpolicy', 'dropout_rate') + self.actfreq_ds = False + if cfg.has_option('feudalpolicy', 'actfreq_ds'): + self.actfreq_ds = cfg.getboolean('feudalpolicy', 'actfreq_ds') + + # init session + self.sess = tf.Session() + with tf.device("/cpu:0"): + + np.random.seed(self.randomseed) + tf.set_random_seed(self.randomseed) + + # initialise an replay buffer + if self.replay_type == 'vanilla': + self.episodes[self.domainString] = ReplayBufferEpisode(self.capacity, self.minibatch_size, self.randomseed) + elif self.replay_type == 'prioritized': + self.episodes[self.domainString] = ReplayPrioritisedEpisode(self.capacity, self.minibatch_size, self.randomseed) + #replay_buffer = ReplayBuffer(self.capacity, self.randomseed) + #self.episodes = [] + self.samplecount = 0 + self.episodecount = 0 + + # construct the models + self.state_dim = 89 # current DIP state dim + self.summaryaction = policy.SummaryAction.SummaryAction(domainString) + self.action_names = action_names + self.action_dim = len(self.action_names) + action_bound = len(self.action_names) + self.stats = [0 for _ in range(self.action_dim)] + + self.global_mu = [0. 
for _ in range(self.action_dim)] + + if self.features == 'dip': + if self.actfreq_ds: + if self.domainString == 'CamRestaurants': + self.state_dim += 9#16 + elif self.domainString == 'SFRestaurants': + self.state_dim += 9#25 + elif self.domainString == 'Laptops11': + self.state_dim += 9#40 + self.acer = acer.ACERNetwork(self.sess, self.state_dim, self.action_dim, self.critic_lr, self.delta, + self.c, self.alpha, self.h1_size, self.h2_size, self.is_training) + elif self.features == 'learned' or self.features == 'rnn': + si_state_dim = 73 + if self.actfreq_ds: + if self.domainString == 'CamRestaurants': + si_state_dim += 9#16 + elif self.domainString == 'SFRestaurants': + si_state_dim += 9#25 + elif self.domainString == 'Laptops11': + si_state_dim += 9#40 + + if 0:#self.features == 'rnn': + self.acer = acer.RNNACERNetwork(self.sess, si_state_dim, sd_state_dim, self.action_dim, self.critic_lr, + self.delta, self.c, self.alpha, self.h1_size, self.h2_size, self.is_training, + sd_enc_size=25, si_enc_size=25, dropout_rate=0., tn='normal', slot='si') + else: + self.state_dim = si_state_dim + self.acer = acer.ACERNetwork(self.sess, self.state_dim, self.action_dim, + self.critic_lr, self.delta, self.c, self.alpha, self.h1_size, + self.h2_size, self.is_training) + + else: + logger.error('features "{}" not implemented'.format(self.features)) + + + # when all models are defined, init all variables + init_op = tf.global_variables_initializer() + self.sess.run(init_op) + + self.loadPolicy(self.in_policy_file) + print('loaded replay size: ', self.episodes[self.domainString].size()) + + #self.acer.update_target_network() + + # def record() has been handled... + + def convertStateAction(self, state, action): + ''' + + ''' + if isinstance(state, TerminalState): + return [0] * 89, action + + else: + if self.features == 'learned' or self.features == 'rnn': + dip_state = padded_state(state.domainStates[state.currentdomain], self.domainString) + else: + dip_state = DIP_state(state.domainStates[state.currentdomain], self.domainString) + action_name = self.actions.action_names[action] + act_slot = 'general' + for slot in dip_state.slots: + if slot in action_name: + act_slot = slot + flat_belief = dip_state.get_beliefStateVec(act_slot) + self.prev_state_check = flat_belief + + return flat_belief, action + + def record(self, reward, domainInControl=None, weight=None, state=None, action=None): + if domainInControl is None: + domainInControl = self.domainString + if self.actToBeRecorded is None: + self.actToBeRecorded = self.summaryAct + + if state is None: + state = self.prevbelief + if action is None: + action = self.actToBeRecorded + mu_weight = self.prev_mu + mask = self.prev_mask + if action == self.action_dim-1: # pass action was taken + mask = np.zeros(self.action_dim) + mu_weight = np.ones(self.action_dim)/self.action_dim + + cState, cAction = state, action + + reward /= 20.0 + + value = self.acer.predict_value([cState], [mask]) + + if self.replay_type == 'vanilla': + self.episodes[domainInControl].record(state=cState, \ + state_ori=state, action=cAction, reward=reward, value=value[0], distribution=mu_weight, mask=mask) + elif self.replay_type == 'prioritized': + self.episodes[domainInControl].record(state=cState, \ + state_ori=state, action=cAction, reward=reward, value=value[0], distribution=mu_weight, mask=mask) + + self.actToBeRecorded = None + self.samplecount += 1 + return + + def finalizeRecord(self, reward, domainInControl=None): + if domainInControl is None: + domainInControl = self.domainString + if 
self.episodes[domainInControl] is None: + logger.warning("record attempted to be finalized for domain where nothing has been recorded before") + return + + #print 'Episode Avg_Max_Q', float(self.episode_ave_max_q)/float(self.episodes[domainInControl].size()) + #print 'Episode Avg_Max_Q', np.mean(self.episode_ave_max_q) + #print self.stats + + # normalising total return to -1~1 + reward /= 20.0 + + terminal_state, terminal_action = self.convertStateAction(TerminalState(), TerminalAction()) + value = 0.0 # not effect on experience replay + + def calculate_discountR_advantage(r_episode, v_episode): + ######################################################################### + # Here we take the rewards and values from the rollout, and use them to + # generate the advantage and discounted returns. + # The advantage function uses "Generalized Advantage Estimation" + bootstrap_value = 0.0 + self.r_episode_plus = np.asarray(r_episode + [bootstrap_value]) + discounted_r_episode = discount(self.r_episode_plus,self.gamma)[:-1] + self.v_episode_plus = np.asarray(v_episode + [bootstrap_value]) + advantage = r_episode + self.gamma * self.v_episode_plus[1:] - self.v_episode_plus[:-1] + advantage = discount(advantage,self.gamma) + ######################################################################### + return discounted_r_episode, advantage + + if self.replay_type == 'vanilla': + self.episodes[domainInControl].record(state=terminal_state, \ + state_ori=TerminalState(), action=terminal_action, reward=reward, value=value, terminal=True, distribution=None) + elif self.replay_type == 'prioritized': + episode_r, episode_v = self.episodes[domainInControl].record_final_and_get_episode(state=terminal_state, \ + state_ori=TerminalState(), + action=terminal_action, + reward=reward, + value=value) + + # TD_error is a list of td error in the current episode + _, TD_error = calculate_discountR_advantage(episode_r, episode_v) + episodic_TD = np.mean(np.absolute(TD_error)) + print('episodic_TD') + print(episodic_TD) + self.episodes[domainInControl].insertPriority(episodic_TD) + + return + + def nextAction(self, beliefstate): + ''' + select next action + + :param beliefstate: + :param hyps: + :returns: (int) next summarye action + ''' + + #execMask = self.summaryaction.getExecutableMask(beliefstate, self.lastSystemAction) + execMask = np.zeros(self.action_dim) + + def apply_mask(prob, maskval, baseline=9.99999975e-06): + return prob if maskval == 0.0 else baseline # not quite 0.0 to avoid division by zero + + if self.exploration_type == 'e-greedy' or not self.is_training: + if self.is_training and utils.Settings.random.rand() < self.epsilon: + action_prob = np.random.rand(len(self.action_names)) + else: + action_prob = self.acer.predict_policy(np.reshape(beliefstate, (1, len(beliefstate))), + np.reshape(execMask, (1, len(execMask))))[0] + mu = action_prob / sum(action_prob) + self.prev_mu = mu + self.prev_mask = execMask + return action_prob + + def train(self): + ''' + call this function when the episode ends + ''' + USE_GLOBAL_MU = False + self.episode_ct += 1 + + if not self.is_training: + logger.info("Not in training mode") + return + else: + logger.info("Update acer policy parameters.") + + self.episodecount += 1 + logger.info("Sample Num so far: %s" % (self.samplecount)) + logger.info("Episode Num so far: %s" % (self.episodecount)) + if self.samplecount >= self.minibatch_size * 3 and self.episodecount % self.training_frequency == 0: + #if self.episodecount % self.training_frequency == 0: + logger.info('start 
trainig...') + + for _ in range(self.train_iters_per_episode): + + if self.replay_type == 'vanilla' or self.replay_type == 'prioritized': + s_batch, s_ori_batch, a_batch, r_batch, s2_batch, s2_ori_batch, t_batch, idx_batch, v_batch, mu_policy, mask_batch = \ + self.episodes[self.domainString].sample_batch() + if USE_GLOBAL_MU: + mu_sum = sum(self.global_mu) + mu_normalised = np.array([c / mu_sum for c in self.global_mu]) + mu_policy = [[mu_normalised for _ in range(len(mu_policy[i]))] for i in range(len(mu_policy))] + else: + assert False # not implemented yet + + discounted_r_batch = [] + advantage_batch = [] + def calculate_discountR_advantage(r_episode, v_episode): + ######################################################################### + # Here we take the rewards and values from the rolloutv, and use them to + # generate the advantage and discounted returns. + # The advantage function uses "Generalized Advantage Estimation" + bootstrap_value = 0.0 + # r_episode rescale by rhos? + self.r_episode_plus = np.asarray(r_episode + [bootstrap_value]) + discounted_r_episode = discount(self.r_episode_plus, self.gamma)[:-1] + self.v_episode_plus = np.asarray(v_episode + [bootstrap_value]) + # change sth here + advantage = r_episode + self.gamma * self.v_episode_plus[1:] - self.v_episode_plus[:-1] + advantage = discount(advantage, self.gamma) + ######################################################################### + return discounted_r_episode, advantage + + if self.replay_type == 'prioritized': + for item_r, item_v, item_idx in zip(r_batch, v_batch, idx_batch): + # r, a = calculate_discountR_advantage(item_r, np.concatenate(item_v).ravel().tolist()) + r, a = calculate_discountR_advantage(item_r, item_v) + + # flatten nested numpy array and turn it into list + discounted_r_batch += r.tolist() + advantage_batch += a.tolist() + + # update the sum-tree + # update the TD error of the samples (episode) in the minibatch + episodic_TD_error = np.mean(np.absolute(a)) + self.episodes[self.domainString].update(item_idx, episodic_TD_error) + else: + for item_r, item_v in zip(r_batch, v_batch): + # r, a = calculate_discountR_advantage(item_r, np.concatenate(item_v).ravel().tolist()) + r, a = calculate_discountR_advantage(item_r, item_v) + + # flatten nested numpy array and turn it into list + discounted_r_batch += r.tolist() + advantage_batch += a.tolist() + + batch_size = len(s_batch) + + a_batch_one_hot = np.eye(self.action_dim)[np.concatenate(a_batch, axis=0).tolist()] + + loss, entropy, optimize = \ + self.acer.train(np.concatenate(np.array(s_batch), axis=0).tolist(), a_batch_one_hot, + np.concatenate(np.array(mask_batch), axis=0).tolist(), + np.concatenate(np.array(r_batch), axis=0).tolist(), s_batch, r_batch, self.gamma, + np.concatenate(np.array(mu_policy), axis=0), + discounted_r_batch, advantage_batch) + + ent, norm_loss = entropy/float(batch_size), loss/float(batch_size) + + + self.savePolicyInc() # self.out_policy_file) + + + def savePolicy(self, FORCE_SAVE=False): + """ + Does not use this, cause it will be called from agent after every episode. + we want to save the policy only periodically. 
+ """ + pass + + def savePolicyInc(self, FORCE_SAVE=False): + """ + save model and replay buffer + """ + if self.episodecount % self.save_step == 0: + #save_path = self.saver.save(self.sess, self.out_policy_file+'.ckpt') + self.acer.save_network(self.out_policy_file+'.acer.ckpt') + + f = open(self.out_policy_file+'.episode', 'wb') + for obj in [self.samplecount, self.episodes[self.domainString], self.global_mu]: + pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL) + f.close() + #logger.info("Saving model to %s and replay buffer..." % save_path) + + def loadPolicy(self, filename): + """ + load model and replay buffer + """ + # load models + self.acer.load_network(filename+'.acer.ckpt') + + # load replay buffer + if self.load_buffer: + try: + print('load from: ', filename) + f = open(filename+'.episode', 'rb') + loaded_objects = [] + for i in range(2): # load nn params and collected data + loaded_objects.append(pickle.load(f)) + self.samplecount = int(loaded_objects[0]) + self.episodes[self.domainString] = copy.deepcopy(loaded_objects[1]) + self.global_mu = loaded_objects[2] + logger.info("Loading both model from %s and replay buffer..." % filename) + f.close() + except: + logger.info("Loading only models...") + else: + print("We do not load the buffer!") + + def restart(self): + self.summaryAct = None + self.lastSystemAction = None + self.prevbelief = None + self.prev_mu = None + self.prev_mask = None + self.actToBeRecorded = None + self.epsilon = self.epsilon_start - (self.epsilon_start - self.epsilon_end) * float(self.episodeNum+self.episodecount) / float(self.maxiter) + self.episode_ave_max_q = [] + +#END OF FILE diff --git a/policy/feudalgainRL/FeudalBBQNPolicy.py b/policy/feudalgainRL/FeudalBBQNPolicy.py new file mode 100644 index 0000000000000000000000000000000000000000..01a6275ac7468b716beaa01e76656a7babf15ddf --- /dev/null +++ b/policy/feudalgainRL/FeudalBBQNPolicy.py @@ -0,0 +1,407 @@ +############################################################################### +# PyDial: Multi-domain Statistical Spoken Dialogue System Software +############################################################################### +# +# Copyright 2015 - 2019 +# Cambridge University Engineering Department Dialogue Systems Group +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +############################################################################### + +''' +DQNPolicy.py - deep Q network policy +================================================== + +Author: Chris Tegho and Pei-Hao (Eddy) Su (Copyright CUED Dialogue Systems Group 2016) + +.. seealso:: CUED Imports/Dependencies: + + import :class:`Policy` + import :class:`utils.ContextLogger` + +.. warning:: + Documentation not done. 
+ + +************************ + +''' + +import copy +import os +import json +import numpy as np +import pickle as pickle +import random +import sys +import utils +from utils.Settings import config as cfg +from utils import ContextLogger, DiaAct, DialogueState + +import ontology.FlatOntologyManager as FlatOnt +# from theano_dialogue.util.tool import * + +import tensorflow as tf +from policy.DRL.replay_bufferVanilla import ReplayBuffer +from policy.DRL.replay_prioritisedVanilla import ReplayPrioritised +import policy.DRL.utils as drlutils +from policy.DRL import bdqn as bbqn +import policy.Policy +import policy.SummaryAction +import policy.BBQNPolicy +from policy.Policy import TerminalAction, TerminalState +from policy.feudalRL.DIP_parametrisation import DIP_state, padded_state + +logger = utils.ContextLogger.getLogger('') + +# --- for flattening the belief --- # +domainUtil = FlatOnt.FlatDomainOntology('CamRestaurants') + + +class FeudalBBQNPolicy(policy.BBQNPolicy.BBQNPolicy): + '''Derived from :class:`BBQNPolicy` + ''' + def __init__(self, in_policy_file, out_policy_file, domainString='CamRestaurants', is_training=False, + action_names=None, slot=None): + super(FeudalBBQNPolicy, self).__init__(in_policy_file, out_policy_file, domainString, is_training) + + tf.reset_default_graph() + + self.domainString = domainString + self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString) + self.in_policy_file = in_policy_file + self.out_policy_file = out_policy_file + self.is_training = is_training + self.accum_belief = [] + + self.prev_state_check = None + + self.episode_ave_max_q = [] + + self.capacity *= 4 #set the capacity for episode methods, multiply it to adjust to turn based methods + self.slot = slot + + # init session + self.sess = tf.Session() + with tf.device("/cpu:0"): + + np.random.seed(self.randomseed) + tf.set_random_seed(self.randomseed) + + # initialise an replay buffer + if self.replay_type == 'vanilla': + self.episodes[self.domainString] = ReplayBuffer(self.capacity, self.minibatch_size, self.randomseed) + elif self.replay_type == 'prioritized': + self.episodes[self.domainString] = ReplayPrioritised(self.capacity, self.minibatch_size, + self.randomseed) + # replay_buffer = ReplayBuffer(self.capacity, self.randomseed) + # self.episodes = [] + self.samplecount = 0 + self.episodecount = 0 + + # construct the models + self.state_dim = 89 # current DIP state dim + self.summaryaction = policy.SummaryAction.SummaryAction(domainString) + self.action_names = action_names + self.action_dim = len(self.action_names) + action_bound = len(self.action_names) + self.stats = [0 for _ in range(self.action_dim)] + self.stdVar = [] + self.meanVar = [] + self.stdMean = [] + self.meanMean = [] + self.td_error = [] + self.td_errorVar = [] + + self.target_update_freq = 1 + if cfg.has_option('bbqnpolicy', 'target_update_freq'): + self.target_update_freq = cfg.get('bbqnpolicy', 'target_update_freq') + + #feudal params + self.features = 'dip' + self.sd_enc_size = 25 + self.si_enc_size = 50 + self.dropout_rate = 0. 
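+            # Note: the feudal-specific options below are read from the [feudalpolicy]
+            # config section and fall back to the defaults assigned above; dropout_rate
+            # is only picked up while training, and actfreq_ds adds action-frequency
+            # features that enlarge the state dimension further down.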
+ if cfg.has_option('feudalpolicy', 'features'): + self.features = cfg.get('feudalpolicy', 'features') + if cfg.has_option('feudalpolicy', 'sd_enc_size'): + self.sd_enc_size = cfg.getint('feudalpolicy', 'sd_enc_size') + if cfg.has_option('feudalpolicy', 'si_enc_size'): + self.si_enc_size = cfg.getint('feudalpolicy', 'si_enc_size') + if cfg.has_option('feudalpolicy', 'dropout_rate') and self.is_training: + self.dropout_rate = cfg.getfloat('feudalpolicy', 'dropout_rate') + self.actfreq_ds = False + if cfg.has_option('feudalpolicy', 'actfreq_ds'): + self.actfreq_ds = cfg.getboolean('feudalpolicy', 'actfreq_ds') + + if self.features == 'dip': + if self.actfreq_ds: + if self.domainString == 'CamRestaurants': + self.state_dim += 16 + elif self.domainString == 'SFRestaurants': + self.state_dim += 25 + elif self.domainString == 'Laptops11': + self.state_dim += 40 + + self.bbqn = bbqn.DeepQNetwork(self.sess, self.state_dim, self.action_dim, self.learning_rate, self.tau, + action_bound, self.architecture, self.h1_size, self.h2_size, + self.n_samples, + self.minibatch_size, self.sigma_prior, self.n_batches, self.stddev_var_mu, + self.stddev_var_logsigma, self.mean_log_sigma, self.importance_sampling, + self.alpha_divergence, self.alpha, self.sigma_eps) + elif self.features == 'learned' or self.features == 'rnn': + si_state_dim = 72 + if self.actfreq_ds: + if self.domainString == 'CamRestaurants': + si_state_dim += 16 + elif self.domainString == 'SFRestaurants': + si_state_dim += 25 + elif self.domainString == 'Laptops11': + si_state_dim += 40 + if self.domainString == 'CamRestaurants': + sd_state_dim = 94 + elif self.domainString == 'SFRestaurants': + sd_state_dim = 158 + elif self.domainString == 'Laptops11': + sd_state_dim = 13 + else: + logger.error( + 'Domain {} not implemented in feudal-DQN yet') # just find out the size of sd_state_dim for the new domain + if self.features == 'rnn': + arch = 'rnn' + self.state_dim = si_state_dim + sd_state_dim + self.bbqn = bbqn.RNNBBQNetwork(self.sess, si_state_dim, sd_state_dim, self.action_dim, self.learning_rate, + self.tau, action_bound, arch, self.h1_size, self.h2_size, self.n_samples, + self.minibatch_size, self.sigma_prior, self.n_batches, self.stddev_var_mu, + self.stddev_var_logsigma, self.mean_log_sigma, self.importance_sampling, + self.alpha_divergence, self.alpha, self.sigma_eps, sd_enc_size=self.sd_enc_size, + si_enc_size=self.sd_enc_size, dropout_rate=self.dropout_rate, slot=slot) + else: + arch = 'vanilla' + self.state_dim = si_state_dim + sd_state_dim + self.bbqn = bbqn.NNBBQNetwork(self.sess, si_state_dim, sd_state_dim, self.action_dim, self.learning_rate, + self.tau, action_bound, arch, self.h1_size, self.h2_size, self.n_samples, + self.minibatch_size, self.sigma_prior, self.n_batches, self.stddev_var_mu, + self.stddev_var_logsigma, self.mean_log_sigma, self.importance_sampling, + self.alpha_divergence, self.alpha, self.sigma_eps, sd_enc_size=self.sd_enc_size, + si_enc_size=self.sd_enc_size, dropout_rate=self.dropout_rate, slot=slot) + else: + logger.error('features "{}" not implemented'.format(self.features)) + + + + # when all models are defined, init all variables + init_op = tf.global_variables_initializer() + self.sess.run(init_op) + + self.loadPolicy(self.in_policy_file) + print('loaded replay size: ', self.episodes[self.domainString].size()) + + self.bbqn.update_target_network() + + def record(self, reward, domainInControl=None, weight=None, state=None, action=None, exec_mask=None): + if domainInControl is None: + domainInControl = 
self.domainString + if self.actToBeRecorded is None: + # self.actToBeRecorded = self.lastSystemAction + self.actToBeRecorded = self.summaryAct + + if state is None: + state = self.prevbelief + if action is None: + action = self.actToBeRecorded + + cState, cAction = state, action + + reward /= 20.0 + + cur_cState = np.vstack([np.expand_dims(x, 0) for x in [cState]]) + cur_action_q = self.bbqn.predict(cur_cState) + cur_target_q = self.bbqn.predict_target(cur_cState) + + if exec_mask is not None: + admissible = np.add(cur_target_q, np.array(exec_mask)) + else: + admissible = cur_target_q + + Q_s_t_a_t_ = cur_action_q[0][cAction] + gamma_Q_s_tplu1_maxa_ = self.gamma * np.max(admissible) + + if weight == None: + if self.replay_type == 'vanilla': + self.episodes[domainInControl].record(state=cState, \ + state_ori=state, action=cAction, reward=reward) + elif self.replay_type == 'prioritized': + # heuristically assign 0.0 to Q_s_t_a_t_ and Q_s_tplu1_maxa_, doesn't matter as it is not used + if True: + # if self.samplecount >= self.capacity: + self.episodes[domainInControl].record(state=cState, \ + state_ori=state, action=cAction, reward=reward, \ + Q_s_t_a_t_=Q_s_t_a_t_, + gamma_Q_s_tplu1_maxa_=gamma_Q_s_tplu1_maxa_, uniform=False) + else: + self.episodes[domainInControl].record(state=cState, \ + state_ori=state, action=cAction, reward=reward, \ + Q_s_t_a_t_=Q_s_t_a_t_, + gamma_Q_s_tplu1_maxa_=gamma_Q_s_tplu1_maxa_, uniform=True) + + else: + self.episodes[domainInControl].record(state=cState, state_ori=state, action=cAction, reward=reward, + ma_weight=weight) + + self.actToBeRecorded = None + self.samplecount += 1 + return + + def finalizeRecord(self, reward, domainInControl=None): + if domainInControl is None: + domainInControl = self.domainString + if self.episodes[domainInControl] is None: + logger.warning("record attempted to be finalized for domain where nothing has been recorded before") + return + + # normalising total return to -1~1 + # if reward == 0: + # reward = -20.0 + reward /= 20.0 + """ + if reward == 20.0: + reward = 1.0 + else: + reward = -0.5 + """ + # reward = float(reward+10.0)/40.0 + + terminal_state, terminal_action = self.convertStateAction(TerminalState(), TerminalAction()) + + if self.replay_type == 'vanilla': + self.episodes[domainInControl].record(state=terminal_state, \ + state_ori=TerminalState(), action=terminal_action, reward=reward, + terminal=True) + elif self.replay_type == 'prioritized': + # heuristically assign 0.0 to Q_s_t_a_t_ and Q_s_tplu1_maxa_, doesn't matter as it is not used + if True: + # if self.samplecount >= self.capacity: + self.episodes[domainInControl].record(state=terminal_state, \ + state_ori=TerminalState(), action=terminal_action, reward=reward, \ + Q_s_t_a_t_=0.0, gamma_Q_s_tplu1_maxa_=0.0, uniform=False, + terminal=True) + else: + self.episodes[domainInControl].record(state=terminal_state, \ + state_ori=TerminalState(), action=terminal_action, reward=reward, \ + Q_s_t_a_t_=0.0, gamma_Q_s_tplu1_maxa_=0.0, uniform=True, + terminal=True) + + def convertStateAction(self, state, action): + ''' + + ''' + if isinstance(state, TerminalState): + return [0] * 89, action + + else: + if self.features == 'learned' or self.features == 'rnn': + dip_state = padded_state(state.domainStates[state.currentdomain], self.domainString) + else: + dip_state = DIP_state(state.domainStates[state.currentdomain], self.domainString) + action_name = self.actions.action_names[action] + act_slot = 'general' + for slot in dip_state.slots: + if slot in action_name: + act_slot 
= slot + flat_belief = dip_state.get_beliefStateVec(act_slot) + self.prev_state_check = flat_belief + + return flat_belief, action + + def nextAction(self, beliefstate): + ''' + select next action + + :param beliefstate: + :param hyps: + :returns: (int) next summary action + ''' + + if self.exploration_type == 'e-greedy': + # epsilon greedy + if self.is_training and utils.Settings.random.rand() < self.epsilon: + action_Q = np.random.rand(len(self.action_names)) + else: + action_Q = self.bbqn.predict(np.reshape(beliefstate, (1, len(beliefstate)))) # + (1. / (1. + i + j)) + + self.episode_ave_max_q.append(np.max(action_Q)) + + # return the Q vect, the action will be converted in the feudal policy + return action_Q + + + def train(self): + ''' + call this function when the episode ends + ''' + + if not self.is_training: + logger.info("Not in training mode") + return + else: + logger.info("Update dqn policy parameters.") + + self.episodecount += 1 + logger.info("Sample Num so far: %s" % (self.samplecount)) + logger.info("Episode Num so far: %s" % (self.episodecount)) + + if self.samplecount >= self.minibatch_size * 10 and self.episodecount % self.training_frequency == 0: + logger.info('start training...') + + s_batch, s_ori_batch, a_batch, r_batch, s2_batch, s2_ori_batch, t_batch, idx_batch, _ = \ + self.episodes[self.domainString].sample_batch() + + s_batch = np.vstack([np.expand_dims(x, 0) for x in s_batch]) + s2_batch = np.vstack([np.expand_dims(x, 0) for x in s2_batch]) + + a_batch_one_hot = np.eye(self.action_dim, self.action_dim)[a_batch] + action_q = self.bbqn.predict_dip(s2_batch, a_batch_one_hot) + target_q = self.bbqn.predict_target_dip(s2_batch, a_batch_one_hot) + # print 'action Q and target Q:', action_q, target_q + + y_i = [] + for k in range(min(self.minibatch_size, self.episodes[self.domainString].size())): + Q_bootstrap_label = 0 + if t_batch[k]: + Q_bootstrap_label = r_batch[k] + else: + if self.q_update == 'single': + belief = s2_ori_batch[k] + execMask = [0.0] * len(self.action_names) # TODO: find out how to compute the mask here, or save it when recording the state + execMask[-1] = -sys.maxsize + action_Q = target_q[k] + admissible = np.add(action_Q, np.array(execMask)) + Q_bootstrap_label = r_batch[k] + self.gamma * np.max(admissible) + + y_i.append(Q_bootstrap_label) + + # Update the critic given the targets + reshaped_yi = np.vstack([np.expand_dims(x, 0) for x in y_i]) + + predicted_q_value, _, currentLoss, logLikelihood, varFC2, meanFC2, td_error, KL_div = self.bbqn.train(s_batch, a_batch_one_hot, reshaped_yi, self.episodecount) + + if self.episodecount % self.target_update_freq == 0: + self.bbqn.update_target_network() + if self.episodecount % self.save_step == 0: + self.savePolicyInc() # self.out_policy_file) + + +# END OF FILE diff --git a/policy/feudalgainRL/FeudalBBQNPolicyNew.py b/policy/feudalgainRL/FeudalBBQNPolicyNew.py new file mode 100644 index 0000000000000000000000000000000000000000..6d35709818b1fdeed16592c3bcc0cafb9a21c727 --- /dev/null +++ b/policy/feudalgainRL/FeudalBBQNPolicyNew.py @@ -0,0 +1,416 @@ +############################################################################### +# PyDial: Multi-domain Statistical Spoken Dialogue System Software +############################################################################### +# +# Copyright 2015 - 2019 +# Cambridge University Engineering Department Dialogue Systems Group +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the 
License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +############################################################################### + +''' +DQNPolicy.py - deep Q network policy +================================================== + +Copyright CUED Dialogue Systems Group 2015 - 2017 + +.. seealso:: CUED Imports/Dependencies: + + import :class:`Policy` + import :class:`utils.ContextLogger` + +.. warning:: + Documentation not done. + + +************************ + +''' + +import copy +import os +import sys +import json +import numpy as np +import pickle as pickle +from itertools import product +from scipy.stats import entropy +import utils +from utils.Settings import config as cfg +from utils import ContextLogger, DiaAct, DialogueState + +import ontology.FlatOntologyManager as FlatOnt +import tensorflow as tf +from policy.DRL.replay_buffer import ReplayBuffer +from policy.DRL.replay_prioritised import ReplayPrioritised +import policy.DRL.utils as drlutils +import policy.DRL.dqn as dqn +import policy.Policy +import policy.DQNPolicy +import policy.SummaryAction +from policy.Policy import TerminalAction, TerminalState +from policy.feudalRL.DIP_parametrisation import DIP_state, padded_state +from policy.feudalRL.feudalUtils import get_feudal_masks +from policy.DRL import bdqn as bbqn + + +logger = utils.ContextLogger.getLogger('') + + +class FeudalDQNPolicy(policy.DQNPolicy.DQNPolicy): + '''Derived from :class:`DQNPolicy` + ''' + + def __init__(self, in_policy_file, out_policy_file, domainString='CamRestaurants', is_training=False, + action_names=None, slot=None, sd_state_dim=50, js_threshold=0, info_reward=0.0, jsd_reward=False, + jsd_function=None): + super(FeudalDQNPolicy, self).__init__(in_policy_file, out_policy_file, domainString, is_training) + + tf.reset_default_graph() + + self.domainString = domainString + self.sd_state_dim = sd_state_dim + self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString) + self.in_policy_file = in_policy_file + self.out_policy_file = out_policy_file + self.is_training = is_training + self.accum_belief = [] + self.info_reward = info_reward + self.js_threshold = js_threshold + self.jsd_reward = jsd_reward + self.jsd_function = jsd_function + if self.jsd_function is not None: + print("We use the JSD-function", self.jsd_function) + if self.js_threshold != 1.0 and not self.jsd_reward: + print("We use JS-divergence, threshold =", self.js_threshold) + if self.jsd_reward: + print("We train with raw JSD reward.") + self.slots = slot + self.features = 'dip' + if cfg.has_option('feudalpolicy', 'features'): + self.features = cfg.get('feudalpolicy', 'features') + self.actfreq_ds = False + if cfg.has_option('feudalpolicy', 'actfreq_ds'): + self.actfreq_ds = cfg.getboolean('feudalpolicy', 'actfreq_ds') + + self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString) + self.prev_state_check = None + + self.max_k = 5 + if cfg.has_option('dqnpolicy', 'max_k'): + self.max_k = cfg.getint('dqnpolicy', 'max_k') + + self.capacity *= 5 # capacity for episode methods, multiply it to adjust to turn based methods + + # init session + self.sess = tf.Session() + with tf.device("/cpu:0"): + + 
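+            # Pin the graph to the CPU and seed both NumPy and TensorFlow from the
+            # shared randomseed so that repeated runs of this policy are reproducible.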
np.random.seed(self.randomseed) + tf.set_random_seed(self.randomseed) + + # initialise a replay buffer + if self.replay_type == 'vanilla': + self.episodes[self.domainString] = ReplayBuffer(self.capacity, self.minibatch_size*4, self.randomseed) + elif self.replay_type == 'prioritized': + self.episodes[self.domainString] = ReplayPrioritised(self.capacity, self.minibatch_size, + self.randomseed) + self.samplecount = 0 + self.episodecount = 0 + + # construct the models + self.summaryaction = policy.SummaryAction.SummaryAction(domainString) + self.action_names = action_names + self.action_dim = len(self.action_names) + action_bound = len(self.action_names) + self.stats = [0 for _ in range(self.action_dim)] + + if self.features == 'learned' or self.features == 'rnn': + si_state_dim = 73 + if self.actfreq_ds: + if self.domainString == 'CamRestaurants': + si_state_dim += 9#16 + elif self.domainString == 'SFRestaurants': + si_state_dim += 9#25 + elif self.domainString == 'Laptops11': + si_state_dim += 9#40 + self.sd_enc_size = 50 + self.si_enc_size = 25 + self.dropout_rate = 0. + if cfg.has_option('feudalpolicy', 'sd_enc_size'): + self.sd_enc_size = cfg.getint('feudalpolicy', 'sd_enc_size') + if cfg.has_option('feudalpolicy', 'si_enc_size'): + self.si_enc_size = cfg.getint('feudalpolicy', 'si_enc_size') + if cfg.has_option('dqnpolicy', 'dropout_rate') and self.is_training: + self.dropout_rate = cfg.getfloat('feudalpolicy', 'dropout_rate') + if cfg.has_option('dqnpolicy', 'dropout_rate') and self.is_training: + self.dropout_rate = cfg.getfloat('feudalpolicy', 'dropout_rate') + + self.state_dim = si_state_dim + sd_state_dim + if self.features == 'learned': + + self.dqn = bbqn.DeepQNetwork(self.sess, self.state_dim, self.action_dim, self.learning_rate, + self.tau, + action_bound, self.architecture, self.h1_size, self.h2_size, + self.n_samples, + self.minibatch_size) + + elif self.features == 'rnn': + self.dqn = dqn.RNNFDeepQNetwork(self.sess, si_state_dim, sd_state_dim, self.action_dim, + self.learning_rate, self.tau, action_bound, self.minibatch_size, + self.architecture, self.h1_size, self.h2_size, + sd_enc_size=self.sd_enc_size, si_enc_size=self.si_enc_size, + dropout_rate=self.dropout_rate, slot=self.slot) + else: # self.features = 'dip' + if self.actfreq_ds: + if self.domainString == 'CamRestaurants': + self.state_dim += 9#16 + elif self.domainString == 'SFRestaurants': + self.state_dim += 9#25 + elif self.domainString == 'Laptops11': + self.state_dim += 9#40 + self.dqn = dqn.DeepQNetwork(self.sess, self.state_dim, self.action_dim, + self.learning_rate, self.tau, action_bound, self.minibatch_size, + self.architecture, self.h1_size, + self.h2_size, dropout_rate=self.dropout_rate) + + # when all models are defined, init all variables (this might to be sent to the main policy too) + init_op = tf.global_variables_initializer() + self.sess.run(init_op) + + self.loadPolicy(self.in_policy_file) + print('loaded replay size: ', self.episodes[self.domainString].size()) + + self.dqn.update_target_network() + + def record(self, reward, domainInControl=None, weight=None, state=None, action=None, exec_mask=None): + if domainInControl is None: + domainInControl = self.domainString + if self.actToBeRecorded is None: + self.actToBeRecorded = self.summaryAct + + if state is None: + state = self.prevbelief + if action is None: + action = self.actToBeRecorded + + cState, cAction = state, action + # normalising total return to -1~1 + reward /= 20.0 + + if self.replay_type == 'vanilla': + 
self.episodes[domainInControl].record(state=cState, \ + state_ori=state, action=cAction, reward=reward) + + self.actToBeRecorded = None + self.samplecount += 1 + + def finalizeRecord(self, reward, domainInControl=None): + if domainInControl is None: + domainInControl = self.domainString + if self.episodes[domainInControl] is None: + logger.warning("record attempted to be finalized for domain where nothing has been recorded before") + return + + reward /= 20.0 + + terminal_state, terminal_action = self.convertStateAction(TerminalState(), TerminalAction()) + + if self.replay_type == 'vanilla': + self.episodes[domainInControl].record(state=terminal_state, \ + state_ori=TerminalState(), action=terminal_action, reward=reward, + terminal=True) + elif self.replay_type == 'prioritized': + self.episodes[domainInControl].record(state=terminal_state, \ + state_ori=TerminalState(), action=terminal_action, reward=reward, \ + Q_s_t_a_t_=0.0, gamma_Q_s_tplu1_maxa_=0.0, uniform=False, + terminal=True) + print('total TD', self.episodes[self.domainString].tree.total()) + + def convertStateAction(self, state, action): + ''' + + ''' + if isinstance(state, TerminalState): + return [0] * 89, action + else: + if self.features == 'learned' or self.features == 'rnn': + dip_state = padded_state(state.domainStates[state.currentdomain], self.domainString) + else: + dip_state = DIP_state(state.domainStates[state.currentdomain], self.domainString) + action_name = self.actions.action_names[action] + act_slot = 'general' + for slot in dip_state.slots: + if slot in action_name: + act_slot = slot + flat_belief = dip_state.get_beliefStateVec(act_slot) + self.prev_state_check = flat_belief + + return flat_belief, action + + def nextAction(self, beliefstate): + ''' + select next action + + :param beliefstate: already converted to dipstatevec of the specific slot (or general) + :returns: (int) next summary action + ''' + + if self.exploration_type == 'e-greedy': + # epsilon greedy + if self.is_training and utils.Settings.random.rand() < self.epsilon: + action_Q = np.random.rand(len(self.action_names)) + else: + if len(beliefstate.shape) == 1: + action_Q = self.dqn.predict(np.reshape(beliefstate, (1, -1))) + else: + action_Q = self.dqn.predict(beliefstate) + # add current max Q to self.episode_ave_max_q + self.episode_ave_max_q.append(np.max(action_Q)) + + #return the Q vect, the action will be converted in the feudal policy + return action_Q + + def train(self): + ''' + call this function when the episode ends + ''' + + if not self.is_training: + logger.info("Not in training mode") + return + else: + logger.info("Update dqn policy parameters.") + + self.episodecount += 1 + logger.info("Sample Num so far: %s" % (self.samplecount)) + logger.info("Episode Num so far: %s" % (self.episodecount)) + + s_batch_new, s_batch_beliefstate, s_batch_chosen_slot, s2_batch_dipstate, s2_batch_beliefstate, t_batch_new, r_batch_new = \ + [], [], [], [], [], [], [] + + if self.samplecount >= self.minibatch_size * 8 and self.episodecount % self.training_frequency == 0: + logger.info('start training...') + + a_batch_one_hot_new = None + #updating only states where the action is not "pass()" complicates things :/ + #since in a batch we can take only non-pass() actions, we have to loop a bit until we get enough samples + + while len(s_batch_new) < self.minibatch_size: + + s_batch, s_ori_batch, a_batch, r_batch, s2_batch, s2_ori_batch, t_batch, idx_batch, _ = \ + self.episodes[self.domainString].sample_batch() + + a_batch_one_hot = 
np.eye(self.action_dim, self.action_dim)[a_batch] + #we only wanna update state-action pairs, where action != pass() + valid_steps = [action[-1] != 1 for action in a_batch_one_hot] + a_batch_one_hot = a_batch_one_hot[valid_steps] + + s_batch_new += [s[0] for i, s in enumerate(s_batch) if valid_steps[i]] + s_batch_beliefstate += [s[1] for i, s in enumerate(s_batch) if valid_steps[i]] + s_batch_chosen_slot += [s[2] for i, s in enumerate(s_batch) if valid_steps[i]] + + s2_batch_dipstate += [s[3] for s, valid in zip(s2_batch, valid_steps) if valid] + s2_batch_beliefstate += [s[1] for s, valid in zip(s2_batch, valid_steps) if valid] + + r_batch_new += [r for r, valid in zip(r_batch, valid_steps) if valid] + t_batch_new += [t for t, valid in zip(t_batch, valid_steps) if valid] + + if a_batch_one_hot_new is None: + a_batch_one_hot_new = a_batch_one_hot + else: + a_batch_one_hot_new = np.vstack((a_batch_one_hot_new, a_batch_one_hot)) + + s_batch_new = np.vstack(s_batch_new) + s2_batch_dipstate = np.vstack(s2_batch_dipstate) + + if self.js_threshold < 1.0 or self.jsd_reward: + #TODO: This is highly inefficient + js_divergence_batch = [] + for belief, belief2, slot in zip(s_batch_beliefstate, s2_batch_beliefstate, s_batch_chosen_slot): + if slot != "None": + keys = belief['beliefs'][slot].keys() + + b = [belief['beliefs'][slot]['**NONE**']] + \ + [belief['beliefs'][slot][value] for value in list(keys) if value != '**NONE**'] + + b_2 = [belief2['beliefs'][slot]['**NONE**']] + \ + [belief2['beliefs'][slot][value] for value in list(keys) if value != '**NONE**'] + + js_divergence = self.compute_js_divergence(b, b_2) + js_divergence_batch.append(js_divergence) + else: + js_divergence_batch.append(0.0) + else: + js_divergence_batch = [0] * len(r_batch_new) + + tanh_n = np.tanh(1) + if self.jsd_reward: + if self.jsd_function == 'tanh': + js_divergence_batch = np.tanh(np.array(js_divergence_batch)) / tanh_n + #normalize jsd between -1 and 1 + js_divergence_batch = (-1 + 2 * np.array(js_divergence_batch)).tolist() + elif self.js_threshold < 1.0: + # normalizing bound to [0, 2] and then /20 + js_divergence_batch = [2/20 * int(x > self.js_threshold) for x in js_divergence_batch] + + action_q = self.dqn.predict_dip(s2_batch_dipstate, a_batch_one_hot_new) + target_q = self.dqn.predict_target_dip(s2_batch_dipstate, a_batch_one_hot_new) + + action_q = np.reshape(action_q, (s_batch_new.shape[0], -1, self.action_dim)) + target_q = np.reshape(target_q, (s_batch_new.shape[0], -1, self.action_dim)) + + y_i = [] + for k in range(min(s_batch_new.shape[0], self.episodes[self.domainString].size())): + Q_bootstrap_label = 0 + if t_batch_new[k]: + Q_bootstrap_label = r_batch_new[k] + else: + if self.q_update == 'single': + action_Q = target_q[k] + if self.jsd_reward: + Q_bootstrap_label = js_divergence_batch[k] + self.gamma * np.max(action_Q) + else: + Q_bootstrap_label = r_batch_new[k] + js_divergence_batch[k] + self.gamma * np.max(action_Q) + elif self.q_update == 'double': + action_Q = action_q[k] + argmax_tuple = np.unravel_index(np.argmax(action_Q, axis=None), action_Q.shape) + value_Q = target_q[k][argmax_tuple] + Q_bootstrap_label = r_batch_new[k] + js_divergence_batch[k] + self.gamma * value_Q + y_i.append(Q_bootstrap_label) + + if self.replay_type == 'prioritized': + # update the sum-tree + # update the TD error of the samples in the minibatch + currentQ_s_a_ = action_q[k][a_batch[k]] + error = abs(currentQ_s_a_ - Q_bootstrap_label) + self.episodes[self.domainString].update(idx_batch[k], error) + + reshaped_yi = 
np.vstack([np.expand_dims(x, 0) for x in y_i]) + + predicted_q_value, _, currentLoss = self.dqn.train(s_batch_new, a_batch_one_hot_new, reshaped_yi) + + if self.episodecount % 1 == 0: + # Update target networks + self.dqn.update_target_network() + + self.savePolicyInc() + + def compute_js_divergence(self, P, Q): + + M = [p + q for p, q in zip(P, Q)] + return 0.5 * (entropy(P, M, base=2) + entropy(Q, M, base=2)) + +# END OF FILE diff --git a/policy/feudalgainRL/FeudalENACPolicy.py b/policy/feudalgainRL/FeudalENACPolicy.py new file mode 100644 index 0000000000000000000000000000000000000000..216c90e3120f66aa13e49ca2f3db4204711b442a --- /dev/null +++ b/policy/feudalgainRL/FeudalENACPolicy.py @@ -0,0 +1,514 @@ +############################################################################### +# PyDial: Multi-domain Statistical Spoken Dialogue System Software +############################################################################### +# +# Copyright 2015 - 2019 +# Cambridge University Engineering Department Dialogue Systems Group +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +############################################################################### + +''' +ENACPolicy.py - Advantage Actor-Critic policy +================================================== + +Copyright CUED Dialogue Systems Group 2015 - 2017 + +.. seealso:: CUED Imports/Dependencies: + + import :class:`Policy` + import :class:`utils.ContextLogger` + +.. warning:: + Documentation not done. + + +************************ + +''' + +import copy +import os +import json +import numpy as np +import scipy +import scipy.signal +import pickle as pickle +import random +import utils +from utils.Settings import config as cfg +from utils import ContextLogger, DiaAct + +import ontology.FlatOntologyManager as FlatOnt +#from theano_dialogue.util.tool import * + +import tensorflow as tf +from policy.DRL.replay_buffer_episode_enac import ReplayBufferEpisode +from policy.DRL.replay_prioritised_episode import ReplayPrioritisedEpisode +import policy.DRL.utils as drlutils +import policy.DRL.enac as enac +import policy.Policy +from policy.ENACPolicy import ENACPolicy +import policy.SummaryAction +from policy.Policy import TerminalAction, TerminalState +from policy.feudalRL.DIP_parametrisation import DIP_state, padded_state +from policy.feudalRL.feudalUtils import get_feudal_masks + +logger = utils.ContextLogger.getLogger('') + + +# Discounting function used to calculate discounted returns. 
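+# For example, discount([1., 0., 2.], 0.9) == [2.62, 1.8, 2.0]: entry t becomes
+# r_t + gamma * r_{t+1} + gamma^2 * r_{t+2} + ..., because lfilter applies the
+# recursion y[n] = x[n] + gamma * y[n-1] to the reversed reward sequence.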
+def discount(x, gamma): + return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1] + +class FeudalENACPolicy(ENACPolicy): + '''Derived from :class:`Policy` + ''' + def __init__(self, in_policy_file, out_policy_file, domainString='CamRestaurants', is_training=False, action_names=None, slot=None): + super(FeudalENACPolicy, self).__init__(in_policy_file, out_policy_file, domainString=domainString, is_training=is_training) + + tf.reset_default_graph() + + self.in_policy_file = in_policy_file + self.out_policy_file = out_policy_file + self.is_training = is_training + self.accum_belief = [] + self.prev_state_check = None + + self.domainString = domainString + self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString) + + self.features = 'dip' + self.sd_enc_size = 80 + self.si_enc_size = 40 + self.dropout_rate = 0. + if cfg.has_option('feudalpolicy', 'features'): + self.features = cfg.get('feudalpolicy', 'features') + if cfg.has_option('feudalpolicy', 'sd_enc_size'): + self.sd_enc_size = cfg.getint('feudalpolicy', 'sd_enc_size') + if cfg.has_option('feudalpolicy', 'si_enc_size'): + self.si_enc_size = cfg.getint('feudalpolicy', 'si_enc_size') + if cfg.has_option('dqnpolicy', 'dropout_rate') and self.is_training: + self.dropout_rate = cfg.getfloat('feudalpolicy', 'dropout_rate') + if cfg.has_option('dqnpolicy', 'dropout_rate') and self.is_training: + self.dropout_rate = cfg.getfloat('feudalpolicy', 'dropout_rate') + + + # init session + self.sess = tf.Session() + with tf.device("/cpu:0"): + + np.random.seed(self.randomseed) + tf.set_random_seed(self.randomseed) + + # initialise an replay buffer + if self.replay_type == 'vanilla': + self.episodes[self.domainString] = ReplayBufferEpisode(self.capacity, self.minibatch_size, self.randomseed) + elif self.replay_type == 'prioritized': + self.episodes[self.domainString] = ReplayPrioritisedEpisode(self.capacity, self.minibatch_size, self.randomseed) + #replay_buffer = ReplayBuffer(self.capacity, self.randomseed) + #self.episodes = [] + self.samplecount = 0 + self.episodecount = 0 + + # construct the models + self.state_dim = 89 # current DIP state dim + self.summaryaction = policy.SummaryAction.SummaryAction(domainString) + self.action_names = action_names + self.action_dim = len(self.action_names) + action_bound = len(self.action_names) + self.stats = [0 for _ in range(self.action_dim)] + + if self.features == 'dip': + self.enac = enac.ENACNetwork(self.sess, self.state_dim, self.action_dim, self.critic_lr, self.tau, + action_bound, self.architecture, self.h1_size, self.h2_size, self.is_training) + elif self.features == 'learned' or self.features == 'rnn': + si_state_dim = 72 + if self.domainString == 'CamRestaurants': + sd_state_dim = 94 + elif self.domainString == 'SFRestaurants': + sd_state_dim = 158 + elif self.domainString == 'Laptops11': + sd_state_dim = 13 + else: + logger.error( + 'Domain {} not implemented in feudal-DQN yet') # just find out the size of sd_state_dim for the new domain + if self.features == 'rnn': + arch = 'rnn' + else: + arch = 'vanilla' + self.state_dim = si_state_dim + sd_state_dim + self.enac = enac.ENACNetwork(self.sess, self.state_dim, self.action_dim, self.critic_lr, self.tau, + action_bound, self.architecture, self.h1_size, self.h2_size, self.is_training) + else: + logger.error('features "{}" not implemented'.format(self.features)) + + # when all models are defined, init all variables + init_op = tf.global_variables_initializer() + self.sess.run(init_op) + + self.loadPolicy(self.in_policy_file) + 
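+            # loadPolicy restores the eNAC network weights and, when present, the pickled
+            # sample count and replay buffer saved alongside the checkpoint; if that fails
+            # it falls back to loading only the model.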
print('loaded replay size: ', self.episodes[self.domainString].size()) + + + def convertStateAction(self, state, action): + ''' + + ''' + if isinstance(state, TerminalState): + return [0] * 89, action + + else: + if self.features == 'learned' or self.features == 'rnn': + dip_state = padded_state(state.domainStates[state.currentdomain], self.domainString) + else: + dip_state = DIP_state(state.domainStates[state.currentdomain], self.domainString) + action_name = self.actions.action_names[action] + act_slot = 'general' + for slot in dip_state.slots: + if slot in action_name: + act_slot = slot + flat_belief = dip_state.get_beliefStateVec(act_slot) + self.prev_state_check = flat_belief + + return flat_belief, action + + def record(self, reward, domainInControl=None, weight=None, state=None, action=None): + if domainInControl is None: + domainInControl = self.domainString + if self.actToBeRecorded is None: + #self.actToBeRecorded = self.lastSystemAction + self.actToBeRecorded = self.summaryAct + + if state is None: + state = self.prevbelief + if action is None: + action = self.actToBeRecorded + + cState, cAction = state, action + + # normalising total return to -1~1 + reward /= 20.0 + + #value = self.a2c.predict_value([cState]) + value = np.array([[0.0]]) + policy_mu = self.mu_prob + + if weight == None: + if self.replay_type == 'vanilla': + self.episodes[domainInControl].record(state=cState, \ + state_ori=state, action=cAction, reward=reward, value=value[0][0], distribution=policy_mu) + elif self.replay_type == 'prioritized': + self.episodes[domainInControl].record(state=cState, \ + state_ori=state, action=cAction, reward=reward, value=value[0][0], distribution=policy_mu) + else: + self.episodes[domainInControl].record(state=cState, state_ori=state, action=cAction, reward=reward, ma_weight=weight) + + self.actToBeRecorded = None + self.samplecount += 1 + return + + def nextAction(self, beliefstate): + ''' + select next action + + :param beliefstate: + :returns: (int) next summary action + ''' + + if self.exploration_type == 'e-greedy': + + # epsilon greedy + if self.is_training and utils.Settings.random.rand() < self.epsilon: + action_prob = np.random.rand(len(self.action_names)) + + # Importance sampling (should be turned off) + #if nextaIdex == greedyNextaIdex: + # self.mu_prob = self.epsilon / float(self.action_dim) + 1 - self.epsilon + #else: + # self.mu_prob = self.epsilon / float(self.action_dim) + else: + action_prob = self.enac.predict_policy(np.reshape(beliefstate, (1, len(beliefstate)))) + + # add current max Q to self.episode_ave_max_q + #print 'current maxQ', np.max(admissible) + #self.episode_ave_max_q.append(np.max(admissible)) + + # Importance sampling + #self.mu_prob = self.epsilon / float(self.action_dim) + 1 - self.epsilon + + return action_prob + + def train(self): + ''' + call this function when the episode ends + ''' + + if not self.is_training: + logger.info("Not in training mode") + return + else: + logger.info("Update enac policy parameters.") + + self.episodecount += 1 + logger.info("Sample Num so far: %s" %(self.samplecount)) + logger.info("Episode Num so far: %s" %(self.episodecount)) + + if self.samplecount >= self.minibatch_size and self.episodecount % self.training_frequency == 0: + logger.info('start training...') + + s_batch, s_ori_batch, a_batch, r_batch, s2_batch, s2_ori_batch, t_batch, idx_batch, v_batch, mu_policy = \ + self.episodes[self.domainString].sample_batch() + + discounted_return_batch = [] + + + def weightsImportanceSampling(mu_policy, r_batch): + 
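+                # In the 'soft' importance-sampling branch below, this helper returns:
+                # the one-step weights pi/mu clipped to [0.5, 1], their forward cumulative
+                # products per dialogue (rho_forward), the full-dialogue products repeated
+                # for every step (rho_whole), and r_new, the rewards re-weighted by the
+                # cumulative ratios. mu is the behaviour policy stored at recording time,
+                # pi the current policy.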
mu_policy = np.asarray(mu_policy) + mu_cum = [] + lenghts = [] # to properly divide on dialogues pi_policy later on + for mu in mu_policy: + lenghts.append(len(mu)) + mu = np.asarray(mu).astype(np.longdouble) + mu_cum.append(np.cumprod(mu[::-1])[::-1]) # going forward with cumulative product + # mu_cum = np.concatenate(np.array(mu_cum), axis=0).tolist() + mu_policy = np.concatenate(np.array(mu_policy), axis=0).tolist() # concatenate all behavioral probs + lengths = np.cumsum(lenghts) # time steps for ends of dialogues + lengths = np.concatenate((np.array([0]), lengths), axis=0) # add first dialogue + + if self.importance_sampling == 'max': + pass + elif self.importance_sampling == "soft": + # get the probabilities of actions taken from the batch + pi_policy = self.enac.getPolicy(np.concatenate(np.array(s_batch), axis=0).tolist())[0] # policy given s_t + columns = np.asarray([np.concatenate(a_batch, axis=0).tolist()]).astype(int) # actions taken at s_t + rows = np.asarray([ii for ii in range(len(pi_policy))]) + pi_policy = pi_policy[rows, columns][0].astype(np.longdouble) # getting probabilities for current policy + + ##################################### + # Weights for importance sampling + # it goes through each dialogue and computes in reverse order cumulative prod: + # rho_n = pi_n / mu_n + # ... + # rho_1 = pi_1 / mu_1 * ... * pi_n / mu_n + # using dialogue and weight_cum lists + ##################################### + + rho_forward = [] # rho_forward from eq. 3.3 (the first one) + rho_whole = [] # product across the whole dialogue from eq. 3.3 (the second one) + #pi_cum2 = [] # stats to compare + #mu_cum2 = [] # stats to compare + #pi_cum = [] # stats to compare + + # Precup version + r_vector = np.concatenate(np.array(r_batch), axis=0).tolist() + r_weighted = [] + + for ii in range(len(lengths) - 1): # over dialogues + weight_cum = 1. + dialogue = [] + + for pi, mu in zip(pi_policy[lengths[ii]:lengths[ii + 1]], mu_policy[lengths[ii]:lengths[ii + 1]]): + weight_cum *= pi / mu + dialogue.append(weight_cum) + + dialogue = np.array(dialogue) + dialogue = np.clip(dialogue, 0.5, 1) # clipping the weights + dialogue = dialogue.tolist() + + rho_forward.extend(dialogue) + #rho_whole.append(dialogue[-1]) + rho_whole.extend(np.ones(len(dialogue)) * dialogue[-1]) + r_weighted.extend(r_vector[lengths[ii]: lengths[ii + 1]] * np.asarray(dialogue)) + + # go back to original form: + ind = 0 + r_new = copy.deepcopy(r_batch) + for id, batch in enumerate(r_new): + for id2, _ in enumerate(batch): + r_new[id][id2] = r_weighted[ind] + ind += 1 + + # ONE STEP WEIGHTS + weights = np.asarray(pi_policy) / np.asarray(mu_policy) + weights = np.clip(weights, 0.5, 1) # clipping the weights + + return weights, rho_forward, rho_whole, r_new + + weights, rho_forward, rho_whole, r_new = weightsImportanceSampling(mu_policy, r_batch) + + weights = np.nan_to_num(weights) + rho_forward = np.nan_to_num(rho_forward) + rho_whole = np.nan_to_num(rho_whole) + """ + print 'w',weights + print 'rho_for',rho_forward + print 'rho_who',rho_whole + """ + + def calculate_discountR(r_episode, idx): + ######################################################################### + # Here we take the rewards and values from the rollouts, and use them to + # generate the advantage and discounted returns. + # The advantage function uses "Generalized Advantage Estimation" + bootstrap_value = 0.0 + # r_episode rescale by rhos? 
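+                # As implemented here, this returns the discounted Monte-Carlo return from
+                # step idx to the end of the dialogue (bootstrap value 0.0); when importance
+                # sampling is disabled the tail rewards are first divided by rho_forward[idx].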
+ self.r_episode_plus = np.asarray(r_episode[idx:] + [bootstrap_value]) + if self.importance_sampling: + self.r_episode_plus = self.r_episode_plus + else: + self.r_episode_plus = self.r_episode_plus/rho_forward[idx] + discounted_r_episode = discount(self.r_episode_plus, self.gamma)[:-1] + ######################################################################### + return discounted_r_episode[0] + + if self.replay_type == 'prioritized': + for item_r, item_v, item_idx in zip(r_new, v_batch, idx_batch): + rlist = [] + for idx in range(len(item_r)): + r = calculate_discountR(item_r, idx) + rlist.append(r) + + discounted_return_batch.append(rlist[-1]) + else: + for item_r, item_v in zip(r_new, v_batch): + rlist = [] + for idx in range(len(item_r)): + r = calculate_discountR(item_r, idx) + rlist.append(r) + + discounted_return_batch.append(rlist[-1]) + + batch_size = len(s_batch) + + if self.importance_sampling: + discounted_return_batch = np.clip(discounted_return_batch, -1, 1) + + # get gradient info and create matrix + gradient_matrix = [] + for item_s, item_a in zip(s_batch, a_batch): + item_a_one_hot = np.eye(self.action_dim)[item_a] + policy_gradient = self.enac.get_policy_gradient(item_s, item_a_one_hot) + policy_gradient = [(policy_gradient_idv.flatten()).tolist() for policy_gradient_idv in policy_gradient] + policy_gradient_flatten = np.hstack(policy_gradient) + policy_gradient_flatten = np.append(policy_gradient_flatten, [1.0]) + gradient_matrix.append(policy_gradient_flatten.tolist()) + + gradient_matrix = np.matrix(gradient_matrix) + return_matrix = np.matrix(discounted_return_batch) + + logger.info("Updating eNAC policy parameters, before calculate eNac matrix") + try: + natural_gradient = np.dot(np.linalg.pinv(gradient_matrix), return_matrix.T) + # convert a matrix to list-like array + natural_gradient = np.array(natural_gradient.flatten()).ravel() + natural_gradient = natural_gradient[:-1] # discard the last element + except: + natural_gradient = self.natural_gradient_prev + print('SVD problem') + + logger.info("Updating eNAC policy parameters, after calculate eNac matrix") + + self.natural_gradient_prev = natural_gradient + + all_params = self.enac.get_params() + + cnt = 0 + modelW = [] + modelB = [] + for variable in all_params: + + shape = variable.shape + # weight matrix + if np.array(variable).ndim == 1: + until = np.array(variable).shape[0] + subNG = np.reshape(natural_gradient[cnt:cnt+until],shape) + cnt += until + modelB.append(subNG) + # bias vector + elif np.array(variable).ndim == 2: + until = np.array(variable).shape[0]*np.array(variable).shape[1] + subNG = np.reshape(natural_gradient[cnt:cnt+until],shape) + cnt += until + modelW.append(subNG) + + a_batch_one_hot = np.eye(self.action_dim)[np.concatenate(a_batch, axis=0).tolist()] + + policy_loss, entropy, all_loss, optimise = self.enac.train( \ + np.concatenate(np.array(s_batch), axis=0).tolist(), a_batch_one_hot, \ + modelW[0], modelB[0], modelW[1], modelB[1], modelW[2], modelB[2] \ + ) + + norm_p_l, ent, norm_all_l = \ + policy_loss/float(batch_size), \ + entropy/float(batch_size), all_loss/float(batch_size) + + self.savePolicyInc() # self.out_policy_file) + + def savePolicy(self, FORCE_SAVE=False): + """ + Does not use this, cause it will be called from agent after every episode. + we want to save the policy only periodically. 
+ """ + pass + + def savePolicyInc(self, FORCE_SAVE=False): + """ + save model and replay buffer + """ + if self.episodecount % self.save_step == 0: + self.enac.save_network(self.out_policy_file+'.enac.ckpt') + + f = open(self.out_policy_file+'.episode', 'wb') + for obj in [self.samplecount, self.episodes[self.domainString]]: + pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL) + f.close() + #logger.info("Saving model to %s and replay buffer..." % save_path) + + def saveStats(self, FORCE_SAVE=False): + f = open(self.out_policy_file + '.stats', 'wb') + pickle.dump(self.stats, f, protocol=pickle.HIGHEST_PROTOCOL) + f.close() + + def loadPolicy(self, filename): + """ + load model and replay buffer + """ + # load models + self.enac.load_network(filename+'.enac.ckpt') + + # load replay buffer + try: + print('load from: ', filename) + f = open(filename+'.episode', 'rb') + loaded_objects = [] + for i in range(2): # load nn params and collected data + loaded_objects.append(pickle.load(f)) + self.samplecount = int(loaded_objects[0]) + self.episodes[self.domainString] = copy.deepcopy(loaded_objects[1]) + logger.info("Loading both model from %s and replay buffer..." % filename) + f.close() + except: + logger.info("Loading only models...") + + def restart(self): + self.summaryAct = None + self.lastSystemAction = None + self.prevbelief = None + self.actToBeRecorded = None + self.epsilon = self.epsilon_start - (self.epsilon_start - self.epsilon_end) * float(self.episodeNum+self.episodecount) / float(self.maxiter) + self.episode_ave_max_q = [] + +#END OF FILE diff --git a/policy/feudalgainRL/FeudalNoisyACERPolicy.py b/policy/feudalgainRL/FeudalNoisyACERPolicy.py new file mode 100644 index 0000000000000000000000000000000000000000..732ee8a0d2528e5773a271c3e915db312cbfd6d2 --- /dev/null +++ b/policy/feudalgainRL/FeudalNoisyACERPolicy.py @@ -0,0 +1,561 @@ +############################################################################### +# PyDial: Multi-domain Statistical Spoken Dialogue System Software +############################################################################### +# +# Copyright 2015 - 2019 +# Cambridge University Engineering Department Dialogue Systems Group +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +############################################################################### + +''' +ACERPolicy.py - ACER - Actor Critic with Experience Replay +================================================== + +Copyright CUED Dialogue Systems Group 2015 - 2017 + +.. seealso:: CUED Imports/Dependencies: + + import :class:`Policy` + import :class:`utils.ContextLogger` + +.. warning:: + Documentation not done. 
+ + +************************ + +''' +import copy +import os +import json +import numpy as np +import scipy +import scipy.signal +from scipy.stats import entropy +import pickle as pickle +import random +import utils +from policy.feudalgainRL.NoisyACERPolicy import NoisyACERPolicy +from utils.Settings import config as cfg +from utils import ContextLogger, DiaAct + +import ontology.FlatOntologyManager as FlatOnt +import tensorflow as tf +from policy.DRL.replay_buffer_episode_acer import ReplayBufferEpisode +from policy.DRL.replay_prioritised_episode import ReplayPrioritisedEpisode +import policy.DRL.utils as drlutils +#from policy.SACERPolicy import SACERPolicy +import policy.feudalgainRL.noisyacer as noisy_acer +import policy.Policy +import policy.SummaryAction +from policy.Policy import TerminalAction, TerminalState +from policy.feudalgainRL.DIP_parametrisation import DIP_state, padded_state + +logger = utils.ContextLogger.getLogger('') + +# Discounting function used to calculate discounted returns. +def discount(x, gamma): + return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1] + + +class FeudalNoisyACERPolicy(NoisyACERPolicy): + '''Derived from :class:`Policy` + ''' + def __init__(self, in_policy_file, out_policy_file, domainString='CamRestaurants', is_training=False, + action_names=None, slot=None, sd_state_dim=50, js_threshold=1.0, info_reward=0.0, load_policy=True, + critic_regularizer_weight=0): + super(FeudalNoisyACERPolicy, self).__init__(in_policy_file, out_policy_file, domainString, is_training) + + tf.reset_default_graph() + + self.in_policy_file = in_policy_file + self.out_policy_file = out_policy_file + self.is_training = is_training + self.accum_belief = [] + self.prev_state_check = None + self.sd_state_dim = sd_state_dim + self.info_reward = info_reward + self.js_threshold = js_threshold + if self.js_threshold != 1.0: + print("We train with JS-divergence, threshold =", self.js_threshold) + + self.domainString = domainString + self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString) + self.critic_regularizer_weight = critic_regularizer_weight + + self.features = 'dip' + self.sd_enc_size = 80 + self.si_enc_size = 40 + self.dropout_rate = 0. 
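+        # The values above are only fallback defaults; the [feudalpolicy] and
+        # [policy] config sections read below override them when set.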
+ if cfg.has_option('feudalpolicy', 'features'): + self.features = cfg.get('feudalpolicy', 'features') + if cfg.has_option('feudalpolicy', 'sd_enc_size'): + self.sd_enc_size = cfg.getint('feudalpolicy', 'sd_enc_size') + if cfg.has_option('feudalpolicy', 'si_enc_size'): + self.si_enc_size = cfg.getint('feudalpolicy', 'si_enc_size') + if cfg.has_option('dqnpolicy', 'dropout_rate') and self.is_training: + self.dropout_rate = cfg.getfloat('feudalpolicy', 'dropout_rate') + if cfg.has_option('dqnpolicy', 'dropout_rate') and self.is_training: + self.dropout_rate = cfg.getfloat('feudalpolicy', 'dropout_rate') + self.actfreq_ds = False + if cfg.has_option('feudalpolicy', 'actfreq_ds'): + self.actfreq_ds = cfg.getboolean('feudalpolicy', 'actfreq_ds') + self.noisy_acer = False + if cfg.has_option('policy', 'noisy_acer'): + self.noisy_acer = cfg.getboolean('policy', 'noisy_acer') + + self.sample_argmax = False + if cfg.has_option('policy', 'sample_argmax'): + self.sample_argmax = cfg.getboolean('policy', 'sample_argmax') + + if self.sample_argmax: + print("We sample argmax") + + #self.log_path = cfg.get('exec_config', 'logfiledir') + #self.log_path = self.log_path + f"/{in_policy_file.split('/')[-1].split('.')[0]}-seed{self.randomseed}.npy" + + self.load_policy = load_policy + + # init session + self.sess = tf.Session() + with tf.device("/cpu:0"): + + np.random.seed(self.randomseed) + tf.set_random_seed(self.randomseed) + + # initialise an replay buffer + if self.replay_type == 'vanilla': + self.episodes[self.domainString] = ReplayBufferEpisode(self.capacity, self.minibatch_size, self.randomseed) + elif self.replay_type == 'prioritized': + self.episodes[self.domainString] = ReplayPrioritisedEpisode(self.capacity, self.minibatch_size, self.randomseed) + #replay_buffer = ReplayBuffer(self.capacity, self.randomseed) + #self.episodes = [] + self.samplecount = 0 + self.episodecount = 0 + + # construct the models + self.state_dim = 89 # current DIP state dim + self.summaryaction = policy.SummaryAction.SummaryAction(domainString) + self.action_names = action_names + self.action_dim = len(self.action_names) + action_bound = len(self.action_names) + self.stats = [0 for _ in range(self.action_dim)] + + self.global_mu = [0. 
for _ in range(self.action_dim)] + + if self.features == 'dip': + if self.actfreq_ds: + if self.domainString == 'CamRestaurants': + self.state_dim += 9#16 + elif self.domainString == 'SFRestaurants': + self.state_dim += 9#25 + elif self.domainString == 'Laptops11': + self.state_dim += 9#40 + self.sacer = noisy_acer.NoisyACERNetwork(self.sess, self.state_dim, self.action_dim, self.critic_lr, self.delta, + self.c, self.alpha, self.h1_size, self.h2_size, self.is_training) + elif self.features == 'learned' or self.features == 'rnn': + si_state_dim = 73 + if self.actfreq_ds: + if self.domainString == 'CamRestaurants': + si_state_dim += 9#16 + elif self.domainString == 'SFRestaurants': + si_state_dim += 9#25 + elif self.domainString == 'Laptops11': + si_state_dim += 9#40 + + self.state_dim = si_state_dim + self.sacer = noisy_acer.NoisyACERNetwork(self.sess, self.state_dim, self.action_dim, + self.critic_lr, self.delta, self.c, self.alpha, self.h1_size, + self.h2_size, self.is_training, temperature=self.temperature, + critic_regularizer_weight=self.critic_regularizer_weight, + noisy_acer=self.noisy_acer) + + else: + logger.error('features "{}" not implemented'.format(self.features)) + + # when all models are defined, init all variables + init_op = tf.global_variables_initializer() + self.sess.run(init_op) + + if self.load_policy: + self.loadPolicy(self.in_policy_file) + print('loaded replay size: ', self.episodes[self.domainString].size()) + else: + print("We do not load a previous policy.") + + #self.acer.update_target_network() + + # def record() has been handled... + + def convertStateAction(self, state, action): + ''' + + ''' + if isinstance(state, TerminalState): + return [0] * 89, action + + else: + if self.features == 'learned' or self.features == 'rnn': + dip_state = padded_state(state.domainStates[state.currentdomain], self.domainString) + else: + dip_state = DIP_state(state.domainStates[state.currentdomain], self.domainString) + action_name = self.actions.action_names[action] + act_slot = 'general' + for slot in dip_state.slots: + if slot in action_name: + act_slot = slot + flat_belief = dip_state.get_beliefStateVec(act_slot) + self.prev_state_check = flat_belief + + return flat_belief, action + + def record(self, reward, domainInControl=None, weight=None, state=None, action=None): + if domainInControl is None: + domainInControl = self.domainString + if self.actToBeRecorded is None: + self.actToBeRecorded = self.summaryAct + + if state is None: + state = self.prevbelief + if action is None: + action = self.actToBeRecorded + mu_weight = self.prev_mu + mask = self.prev_mask + if action == self.action_dim-1: # pass action was taken + mask = np.zeros(self.action_dim) + mu_weight = np.ones(self.action_dim)/self.action_dim + + cState, cAction = state, action + + reward /= 20.0 + + value = self.sacer.predict_value([cState[0]], [mask]) + + if self.replay_type == 'vanilla': + self.episodes[domainInControl].record(state=cState, \ + state_ori=state, action=cAction, reward=reward, value=value[0], distribution=mu_weight, mask=mask) + elif self.replay_type == 'prioritized': + self.episodes[domainInControl].record(state=cState, \ + state_ori=state, action=cAction, reward=reward, value=value[0], distribution=mu_weight, mask=mask) + + self.actToBeRecorded = None + self.samplecount += 1 + return + + def finalizeRecord(self, reward, domainInControl=None): + if domainInControl is None: + domainInControl = self.domainString + if self.episodes[domainInControl] is None: + logger.warning("record attempted to 
be finalized for domain where nothing has been recorded before") + return + + #print 'Episode Avg_Max_Q', float(self.episode_ave_max_q)/float(self.episodes[domainInControl].size()) + #print 'Episode Avg_Max_Q', np.mean(self.episode_ave_max_q) + #print self.stats + + # normalising total return to -1~1 + reward /= 20.0 + + terminal_state, terminal_action = self.convertStateAction(TerminalState(), TerminalAction()) + value = 0.0 # not effect on experience replay + + def calculate_discountR_advantage(r_episode, v_episode): + ######################################################################### + # Here we take the rewards and values from the rollout, and use them to + # generate the advantage and discounted returns. + # The advantage function uses "Generalized Advantage Estimation" + bootstrap_value = 0.0 + self.r_episode_plus = np.asarray(r_episode + [bootstrap_value]) + discounted_r_episode = discount(self.r_episode_plus,self.gamma)[:-1] + self.v_episode_plus = np.asarray(v_episode + [bootstrap_value]) + advantage = r_episode + self.gamma * self.v_episode_plus[1:] - self.v_episode_plus[:-1] + advantage = discount(advantage,self.gamma) + ######################################################################### + return discounted_r_episode, advantage + + if self.replay_type == 'vanilla': + self.episodes[domainInControl].record(state=terminal_state, \ + state_ori=TerminalState(), action=terminal_action, reward=reward, value=value, terminal=True, distribution=None) + elif self.replay_type == 'prioritized': + episode_r, episode_v = self.episodes[domainInControl].record_final_and_get_episode(state=terminal_state, \ + state_ori=TerminalState(), + action=terminal_action, + reward=reward, + value=value) + + # TD_error is a list of td error in the current episode + _, TD_error = calculate_discountR_advantage(episode_r, episode_v) + episodic_TD = np.mean(np.absolute(TD_error)) + print('episodic_TD') + print(episodic_TD) + self.episodes[domainInControl].insertPriority(episodic_TD) + + return + + def compute_responsible_q(self, inputs, actions, mask): + return self.sacer.compute_responsible_q(inputs, actions, mask) + + def nextAction(self, beliefstate, execMask): + ''' + select next action + + :param beliefstate: + :param hyps: + :returns: (int) next summarye action + ''' + + action_prob = self.sacer.predict_policy(np.reshape(beliefstate, (1, len(beliefstate))), + np.reshape(execMask, (1, len(execMask))))[0] + + if (self.exploration_type == 'e-greedy' or not self.is_training) and not self.noisy_acer: + + if not self.sample_argmax: + epsilon = self.epsilon if self.is_training else 0. + eps_prob = np.ones(len(action_prob)) / len(action_prob) + + best_index = np.argmax(action_prob) + best_prob = [1. if i == best_index else 0. for i in range(len(action_prob))] + + # we sample a random action with probability epsilon and sample from the policy distribution with probability 1-epsilon + action_prob = epsilon * np.array(eps_prob) + (1. - epsilon) * action_prob + + if not self.is_training: + # take the greedy action during evaluation + action_prob = np.array(best_prob) + else: + if self.is_training and utils.Settings.random.rand() < self.epsilon: + action_prob = np.random.rand(len(self.action_names)) + + if not self.is_training: + # take the greedy action during evaluation + best_index = np.argmax(action_prob) + best_prob = [1. if i == best_index else 0. 
for i in range(len(action_prob))] + action_prob = np.array(best_prob) + + if not self.sample_argmax: + nextaIdex = np.random.choice(len(action_prob), p=action_prob / sum(action_prob)) + else: + nextaIdex = np.argmax(action_prob) + mu = action_prob / sum(action_prob) + + self.prev_mu = mu + self.prev_mask = execMask + + return np.array([1. if i == nextaIdex else 0. for i in range(len(action_prob))]) + + def train(self, critic_regularizer=None): + ''' + call this function when the episode ends + ''' + USE_GLOBAL_MU = False + self.episode_ct += 1 + + # new_noise_man_array = np.expand_dims(np.array(self.sacer.compute_mean_noisy()), axis=0) + # if os.path.exists(self.log_path): + # noise_mean_array = np.load(self.log_path) + # new_noise_man_array = np.concatenate((noise_mean_array, new_noise_man_array), axis=0) + # np.save(self.log_path, new_noise_man_array) + + if not self.is_training: + logger.info("Not in training mode") + return + else: + logger.info("Update acer policy parameters.") + + self.episodecount += 1 + logger.info("Sample Num so far: %s" % (self.samplecount)) + logger.info("Episode Num so far: %s" % (self.episodecount)) + if self.samplecount >= self.minibatch_size * 3 and self.episodecount % self.training_frequency == 0: + #if self.episodecount % self.training_frequency == 0: + logger.info('start trainig...') + + for _ in range(self.train_iters_per_episode): + + if self.replay_type == 'vanilla' or self.replay_type == 'prioritized': + s_batch_full, s_ori_batch, a_batch, r_batch, s2_batch_full, s2_ori_batch, t_batch, idx_batch, v_batch, mu_policy, mask_batch = \ + self.episodes[self.domainString].sample_batch() + if USE_GLOBAL_MU: + mu_sum = sum(self.global_mu) + mu_normalised = np.array([c / mu_sum for c in self.global_mu]) + mu_policy = [[mu_normalised for _ in range(len(mu_policy[i]))] for i in range(len(mu_policy))] + else: + assert False # not implemented yet + + s_batch = [[state_tuple[0] for state_tuple in epi] for epi in s_batch_full] + s_batch_beliefstate = [[state_tuple[1] for state_tuple in epi] for epi in s_batch_full] + s_batch_chosen_slot = [[state_tuple[2] for state_tuple in epi] for epi in s_batch_full] + + s2_batch_beliefstate = [[state_tuple[1] for state_tuple in epi] for epi in s2_batch_full] + + js_divergence_batch = [] + + if self.js_threshold < 1.0: + #TODO: This is probably highly inefficient + for epi_s, epi_s2, epi_slot in zip(s_batch_beliefstate, s2_batch_beliefstate, s_batch_chosen_slot): + for belief, belief2, slot in zip(epi_s, epi_s2, epi_slot): + if slot != "None": + keys = belief['beliefs'][slot].keys() + + b = [belief['beliefs'][slot]['**NONE**']] + \ + [belief['beliefs'][slot][value] for value in list(keys) if value != '**NONE**'] + + b_2 = [belief2['beliefs'][slot]['**NONE**']] + \ + [belief2['beliefs'][slot][value] for value in list(keys) if value != '**NONE**'] + + js_divergence = self.compute_js_divergence(b, b_2) + js_divergence_batch.append(js_divergence) + else: + js_divergence_batch.append(0.0) + + js_divergence_batch = [int(x > self.js_threshold) for x in js_divergence_batch] + js_divergence_batch = 2/20 * np.array(js_divergence_batch) #normalizing bound to [0, 2] and then /20 + + discounted_r_batch = [] + advantage_batch = [] + def calculate_discountR_advantage(r_episode, v_episode): + ######################################################################### + # Here we take the rewards and values from the rolloutv, and use them to + # generate the advantage and discounted returns. 
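+            # Concretely, R_t = sum_k gamma^k * r_{t+k}, and the advantage is
+            # the gamma-discounted sum of the TD errors
+            # delta_t = r_t + gamma*V(s_{t+1}) - V(s_t).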
+ # The advantage function uses "Generalized Advantage Estimation" + bootstrap_value = 0.0 + # r_episode rescale by rhos? + self.r_episode_plus = np.asarray(r_episode + [bootstrap_value]) + discounted_r_episode = discount(self.r_episode_plus, self.gamma)[:-1] + self.v_episode_plus = np.asarray(v_episode + [bootstrap_value]) + # change sth here + advantage = r_episode + self.gamma * self.v_episode_plus[1:] - self.v_episode_plus[:-1] + advantage = discount(advantage, self.gamma) + ######################################################################### + return discounted_r_episode, advantage + + if self.replay_type == 'prioritized': + for item_r, item_v, item_idx in zip(r_batch, v_batch, idx_batch): + # r, a = calculate_discountR_advantage(item_r, np.concatenate(item_v).ravel().tolist()) + r, a = calculate_discountR_advantage(item_r, item_v) + + # flatten nested numpy array and turn it into list + discounted_r_batch += r.tolist() + advantage_batch += a.tolist() + + # update the sum-tree + # update the TD error of the samples (episode) in the minibatch + episodic_TD_error = np.mean(np.absolute(a)) + self.episodes[self.domainString].update(item_idx, episodic_TD_error) + else: + for item_r, item_v in zip(r_batch, v_batch): + # r, a = calculate_discountR_advantage(item_r, np.concatenate(item_v).ravel().tolist()) + r, a = calculate_discountR_advantage(item_r, item_v) + + # flatten nested numpy array and turn it into list + discounted_r_batch += r.tolist() + advantage_batch += a.tolist() + + batch_size = len(s_batch) + + a_batch_one_hot = np.eye(self.action_dim)[np.concatenate(a_batch, axis=0).tolist()] + + if self.js_threshold < 1.0: + r_batch_concatenated = np.concatenate(np.array(r_batch), axis=0) + js_divergence_batch + else: + r_batch_concatenated = np.concatenate(np.array(r_batch), axis=0) + + if critic_regularizer is not None: + critic_regularizer_q = critic_regularizer.compute_responsible_q( + np.concatenate(np.array(s_batch), axis=0).tolist(), a_batch_one_hot, + np.concatenate(np.array(mask_batch), axis=0).tolist()) + + loss, entropy, optimize = \ + self.sacer.train(np.concatenate(np.array(s_batch), axis=0).tolist(), a_batch_one_hot, + np.concatenate(np.array(mask_batch), axis=0).tolist(), + r_batch_concatenated, s_batch, r_batch, self.gamma, + np.concatenate(np.array(mu_policy), axis=0), + discounted_r_batch, advantage_batch, + critic_regularizer_output=critic_regularizer_q) + else: + loss, entropy, optimize = \ + self.sacer.train(np.concatenate(np.array(s_batch), axis=0).tolist(), a_batch_one_hot, + np.concatenate(np.array(mask_batch), axis=0).tolist(), + r_batch_concatenated, s_batch, r_batch, self.gamma, + np.concatenate(np.array(mu_policy), axis=0), + discounted_r_batch, advantage_batch) + + ent, norm_loss = entropy/float(batch_size), loss/float(batch_size) + + self.savePolicyInc() + + def savePolicy(self, FORCE_SAVE=False): + """ + Does not use this, cause it will be called from agent after every episode. + we want to save the policy only periodically. 
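+        savePolicyInc() below performs the periodic save (ACER network, replay
+        buffer and global_mu) every self.save_step episodes.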
+ """ + pass + + def compute_js_divergence(self, P, Q): + + M = [p + q for p, q in zip(P, Q)] + return 0.5 * (entropy(P, M, base=2) + entropy(Q, M, base=2)) + + def savePolicyInc(self, FORCE_SAVE=False): + """ + save model and replay buffer + """ + if self.episodecount % self.save_step == 0: + #save_path = self.saver.save(self.sess, self.out_policy_file+'.ckpt') + self.sacer.save_network(self.out_policy_file+'.acer.ckpt') + + f = open(self.out_policy_file+'.episode', 'wb') + for obj in [self.samplecount, self.episodes[self.domainString], self.global_mu]: + pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL) + f.close() + #logger.info("Saving model to %s and replay buffer..." % save_path) + + def loadPolicy(self, filename): + """ + load model and replay buffer + """ + # load models + self.sacer.load_network(filename+'.acer.ckpt') + + # load replay buffer + if self.load_buffer: + try: + print('load from: ', filename) + f = open(filename+'.episode', 'rb') + loaded_objects = [] + for i in range(2): # load nn params and collected data + loaded_objects.append(pickle.load(f)) + self.samplecount = int(loaded_objects[0]) + self.episodes[self.domainString] = copy.deepcopy(loaded_objects[1]) + self.global_mu = loaded_objects[2] + logger.info("Loading both model from %s and replay buffer..." % filename) + f.close() + except: + logger.info("Loading only models...") + else: + print("We do not load the buffer!") + + def restart(self): + self.summaryAct = None + self.lastSystemAction = None + self.prevbelief = None + self.prev_mu = None + self.prev_mask = None + self.actToBeRecorded = None + self.epsilon = self.epsilon_start - (self.epsilon_start - self.epsilon_end) * float(self.episodeNum+self.episodecount) / float(self.maxiter) + self.episode_ave_max_q = [] + + +#END OF FILE diff --git a/policy/feudalgainRL/FeudalNoisyDQNPolicy.py b/policy/feudalgainRL/FeudalNoisyDQNPolicy.py new file mode 100644 index 0000000000000000000000000000000000000000..5ada20841ad370836f976327ba8e6c2c4422a4f2 --- /dev/null +++ b/policy/feudalgainRL/FeudalNoisyDQNPolicy.py @@ -0,0 +1,554 @@ +############################################################################### +# PyDial: Multi-domain Statistical Spoken Dialogue System Software +############################################################################### +# +# Copyright 2015 - 2019 +# Cambridge University Engineering Department Dialogue Systems Group +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +############################################################################### + +''' +DQNPolicy.py - deep Q network policy +================================================== + +Copyright CUED Dialogue Systems Group 2015 - 2017 + +.. seealso:: CUED Imports/Dependencies: + + import :class:`Policy` + import :class:`utils.ContextLogger` + +.. warning:: + Documentation not done. 
+ + +************************ + +''' + +import copy +import os +import sys +import json +import numpy as np +import pickle as pickle +from itertools import product +from scipy.stats import entropy +import utils +#from pydial import log_dir +from utils.Settings import config as cfg +from utils import ContextLogger, DiaAct, DialogueState + +import ontology.FlatOntologyManager as FlatOnt +import tensorflow as tf +from policy.DRL.replay_buffer import ReplayBuffer +from policy.DRL.replay_prioritised import ReplayPrioritised +import policy.feudalgainRL.noisydqn as dqn +import policy.Policy +import policy.DQNPolicy +import policy.SummaryAction +from policy.Policy import TerminalAction, TerminalState +from policy.feudalgainRL.DIP_parametrisation import DIP_state, padded_state + + +logger = utils.ContextLogger.getLogger('') + + +class FeudalDQNPolicy(policy.DQNPolicy.DQNPolicy): + '''Derived from :class:`DQNPolicy` + ''' + + def __init__(self, in_policy_file, out_policy_file, domainString='CamRestaurants', is_training=False, + action_names=None, slot=None, sd_state_dim=50, js_threshold=0, info_reward=0.0, jsd_reward=False, + jsd_function=None): + super(FeudalDQNPolicy, self).__init__(in_policy_file, out_policy_file, domainString, is_training) + + tf.reset_default_graph() + + self.domainString = domainString + self.sd_state_dim = sd_state_dim + self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString) + self.in_policy_file = in_policy_file + self.out_policy_file = out_policy_file + self.is_training = is_training + self.accum_belief = [] + self.info_reward = info_reward + self.js_threshold = js_threshold + self.jsd_reward = jsd_reward + self.jsd_function = jsd_function + self.log_path = cfg.get('exec_config', 'logfiledir') + self.log_path = self.log_path + f"/{in_policy_file.split('/')[-1].split('.')[0]}-seed{self.randomseed}.txt" + + if self.jsd_function is not None: + print("We use the JSD-function", self.jsd_function) + if self.js_threshold != 1.0 and not self.jsd_reward: + print("We use JS-divergence, threshold =", self.js_threshold) + if self.jsd_reward: + print("We train with raw JSD reward.") + self.slots = slot + self.features = 'dip' + if cfg.has_option('feudalpolicy', 'features'): + self.features = cfg.get('feudalpolicy', 'features') + self.actfreq_ds = False + if cfg.has_option('feudalpolicy', 'actfreq_ds'): + self.actfreq_ds = cfg.getboolean('feudalpolicy', 'actfreq_ds') + + self.use_pass = True + if cfg.has_option('feudalpolicy', 'use_pass'): + self.use_pass = cfg.getboolean('feudalpolicy', 'use_pass') + + if self.use_pass: + print("We work with pass action in DQN training") + else: + print("We work without pass action in DQN training") + + self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString) + self.prev_state_check = None + + self.max_k = 5 + if cfg.has_option('dqnpolicy', 'max_k'): + self.max_k = cfg.getint('dqnpolicy', 'max_k') + + self.capacity *= 5 # capacity for episode methods, multiply it to adjust to turn based methods + + # init session + self.sess = tf.Session() + with tf.device("/cpu:0"): + + np.random.seed(self.randomseed) + tf.set_random_seed(self.randomseed) + + # initialise a replay buffer + if self.replay_type == 'vanilla': + self.episodes[self.domainString] = ReplayBuffer(self.capacity, self.minibatch_size*4, self.randomseed) + elif self.replay_type == 'prioritized': + self.episodes[self.domainString] = ReplayPrioritised(self.capacity, self.minibatch_size, + self.randomseed) + self.samplecount = 0 + self.episodecount = 0 + + # construct the models + 
self.summaryaction = policy.SummaryAction.SummaryAction(domainString) + self.action_names = action_names + self.action_dim = len(self.action_names) + action_bound = len(self.action_names) + self.stats = [0 for _ in range(self.action_dim)] + + if self.features == 'learned' or self.features == 'rnn': + si_state_dim = 73 + if self.actfreq_ds: + if self.domainString == 'CamRestaurants': + si_state_dim += 9#16 + elif self.domainString == 'SFRestaurants': + si_state_dim += 9#25 + elif self.domainString == 'Laptops11': + si_state_dim += 9#40 + self.sd_enc_size = 50 + self.si_enc_size = 25 + self.dropout_rate = 0. + if cfg.has_option('feudalpolicy', 'sd_enc_size'): + self.sd_enc_size = cfg.getint('feudalpolicy', 'sd_enc_size') + if cfg.has_option('feudalpolicy', 'si_enc_size'): + self.si_enc_size = cfg.getint('feudalpolicy', 'si_enc_size') + if cfg.has_option('dqnpolicy', 'dropout_rate') and self.is_training: + self.dropout_rate = cfg.getfloat('feudalpolicy', 'dropout_rate') + if cfg.has_option('dqnpolicy', 'dropout_rate') and self.is_training: + self.dropout_rate = cfg.getfloat('feudalpolicy', 'dropout_rate') + + self.state_dim = si_state_dim + sd_state_dim + if self.features == 'learned': + self.dqn = dqn.NNFDeepQNetwork(self.sess, si_state_dim, sd_state_dim, self.action_dim, + self.learning_rate, self.tau, action_bound, self.minibatch_size, + self.architecture, self.h1_size, self.h2_size, sd_enc_size=self.sd_enc_size, + si_enc_size=self.si_enc_size, dropout_rate=self.dropout_rate) + + elif self.features == 'rnn': + self.dqn = dqn.RNNFDeepQNetwork(self.sess, si_state_dim, sd_state_dim, self.action_dim, + self.learning_rate, self.tau, action_bound, self.minibatch_size, + self.architecture, self.h1_size, self.h2_size, + sd_enc_size=self.sd_enc_size, si_enc_size=self.si_enc_size, + dropout_rate=self.dropout_rate, slot=self.slot) + else: # self.features = 'dip' + if self.actfreq_ds: + if self.domainString == 'CamRestaurants': + self.state_dim += 9#16 + elif self.domainString == 'SFRestaurants': + self.state_dim += 9#25 + elif self.domainString == 'Laptops11': + self.state_dim += 9#40 + self.dqn = dqn.DeepQNetwork(self.sess, self.state_dim, self.action_dim, + self.learning_rate, self.tau, action_bound, self.minibatch_size, + self.architecture, self.h1_size, + self.h2_size, dropout_rate=self.dropout_rate) + + # when all models are defined, init all variables (this might to be sent to the main policy too) + init_op = tf.global_variables_initializer() + self.sess.run(init_op) + + self.loadPolicy(self.in_policy_file) + print('loaded replay size: ', self.episodes[self.domainString].size()) + + self.dqn.update_target_network() + + def record(self, reward, domainInControl=None, weight=None, state=None, action=None, exec_mask=None): + if domainInControl is None: + domainInControl = self.domainString + if self.actToBeRecorded is None: + self.actToBeRecorded = self.summaryAct + + if state is None: + state = self.prevbelief + if action is None: + action = self.actToBeRecorded + + cState, cAction = state, action + # normalising total return to -1~1 + reward /= 20.0 + + if self.replay_type == 'vanilla': + self.episodes[domainInControl].record(state=cState, \ + state_ori=state, action=cAction, reward=reward) + + self.actToBeRecorded = None + self.samplecount += 1 + + def finalizeRecord(self, reward, domainInControl=None): + if domainInControl is None: + domainInControl = self.domainString + if self.episodes[domainInControl] is None: + logger.warning("record attempted to be finalized for domain where nothing has 
been recorded before") + return + + reward /= 20.0 + + terminal_state, terminal_action = self.convertStateAction(TerminalState(), TerminalAction()) + + if self.replay_type == 'vanilla': + self.episodes[domainInControl].record(state=terminal_state, \ + state_ori=TerminalState(), action=terminal_action, reward=reward, + terminal=True) + elif self.replay_type == 'prioritized': + self.episodes[domainInControl].record(state=terminal_state, \ + state_ori=TerminalState(), action=terminal_action, reward=reward, \ + Q_s_t_a_t_=0.0, gamma_Q_s_tplu1_maxa_=0.0, uniform=False, + terminal=True) + print('total TD', self.episodes[self.domainString].tree.total()) + + def convertStateAction(self, state, action): + ''' + + ''' + if isinstance(state, TerminalState): + return [0] * 89, action + else: + if self.features == 'learned' or self.features == 'rnn': + dip_state = padded_state(state.domainStates[state.currentdomain], self.domainString) + else: + dip_state = DIP_state(state.domainStates[state.currentdomain], self.domainString) + action_name = self.actions.action_names[action] + act_slot = 'general' + for slot in dip_state.slots: + if slot in action_name: + act_slot = slot + flat_belief = dip_state.get_beliefStateVec(act_slot) + self.prev_state_check = flat_belief + + return flat_belief, action + + def nextAction(self, beliefstate): + ''' + select next action + + :param beliefstate: already converted to dipstatevec of the specific slot (or general) + :returns: (int) next summary action + ''' + + if self.exploration_type == 'e-greedy' and self.architecture != 'noisy_duel': + # epsilon greedy + if self.is_training and utils.Settings.random.rand() < self.epsilon: + action_Q = np.random.rand(len(self.action_names)) + else: + if len(beliefstate.shape) == 1: + action_Q = self.dqn.predict(np.reshape(beliefstate, (1, -1))) + else: + action_Q = self.dqn.predict(beliefstate) + # add current max Q to self.episode_ave_max_q + self.episode_ave_max_q.append(np.max(action_Q)) + elif self.architecture == 'noisy_duel': + if len(beliefstate.shape) == 1: + action_Q = self.dqn.predict(np.reshape(beliefstate, (1, -1))) + else: + action_Q = self.dqn.predict(beliefstate) + # add current max Q to self.episode_ave_max_q + self.episode_ave_max_q.append(np.max(action_Q)) + + #return the Q vect, the action will be converted in the feudal policy + return action_Q + + def train(self): + ''' + call this function when the episode ends + ''' + + if not self.is_training: + logger.info("Not in training mode") + return + else: + logger.info("Update dqn policy parameters.") + + self.episodecount += 1 + logger.info("Sample Num so far: %s" % (self.samplecount)) + logger.info("Episode Num so far: %s" % (self.episodecount)) + + s_batch_new, s_batch_beliefstate, s_batch_chosen_slot, s2_batch_dipstate, s2_batch_beliefstate, t_batch_new, r_batch_new = \ + [], [], [], [], [], [], [] + + if self.samplecount >= self.minibatch_size * 10 and self.episodecount % self.training_frequency == 0: + logger.info('start training...') + + a_batch_one_hot_new = None + #updating only states where the action is not "pass()" complicates things :/ + #since in a batch we can take only non-pass() actions, we have to loop a bit until we get enough samples + + if self.js_threshold < 1.0 or not self.use_pass: + while len(s_batch_new) < self.minibatch_size: + + s_batch, s_ori_batch, a_batch, r_batch, s2_batch, s2_ori_batch, t_batch, idx_batch, _ = \ + self.episodes[self.domainString].sample_batch() + + a_batch_one_hot = np.eye(self.action_dim, self.action_dim)[a_batch] + 
#we only wanna update state-action pairs, where action != pass() + valid_steps = [action[-1] != 1 for action in a_batch_one_hot] + a_batch_one_hot = a_batch_one_hot[valid_steps] + + s_batch_new += [s[0] for i, s in enumerate(s_batch) if valid_steps[i]] + s_batch_beliefstate += [s[1] for i, s in enumerate(s_batch) if valid_steps[i]] + s_batch_chosen_slot += [s[2] for i, s in enumerate(s_batch) if valid_steps[i]] + + s2_batch_dipstate += [s[3] for s, valid in zip(s2_batch, valid_steps) if valid] + s2_batch_beliefstate += [s[1] for s, valid in zip(s2_batch, valid_steps) if valid] + + r_batch_new += [r for r, valid in zip(r_batch, valid_steps) if valid] + t_batch_new += [t for t, valid in zip(t_batch, valid_steps) if valid] + + if a_batch_one_hot_new is None: + a_batch_one_hot_new = a_batch_one_hot + else: + a_batch_one_hot_new = np.vstack((a_batch_one_hot_new, a_batch_one_hot)) + + s_batch_new = np.vstack(s_batch_new) + s2_batch_dipstate = np.vstack(s2_batch_dipstate) + + else: + s_batch, s_ori_batch, a_batch, r_batch, s2_batch, s2_ori_batch, t_batch, idx_batch, _ = \ + self.episodes[self.domainString].sample_batch() + + a_batch_one_hot_new = np.eye(self.action_dim, self.action_dim)[a_batch] + s_batch_new = np.vstack([s[0] for s in s_batch]) + r_batch_new = r_batch + s2_batch_dipstate = np.vstack([s[3] for s in s2_batch]) + t_batch_new = t_batch + + if self.js_threshold < 1.0 or self.jsd_reward: + #TODO: This is highly inefficient + js_divergence_batch = [] + for belief, belief2, slot in zip(s_batch_beliefstate, s2_batch_beliefstate, s_batch_chosen_slot): + if slot != "None": + keys = belief['beliefs'][slot].keys() + + b = [belief['beliefs'][slot]['**NONE**']] + \ + [belief['beliefs'][slot][value] for value in list(keys) if value != '**NONE**'] + + b_2 = [belief2['beliefs'][slot]['**NONE**']] + \ + [belief2['beliefs'][slot][value] for value in list(keys) if value != '**NONE**'] + + js_divergence = self.compute_js_divergence(b, b_2) + js_divergence_batch.append(js_divergence) + else: + js_divergence_batch.append(0.0) + else: + js_divergence_batch = [0] * len(r_batch_new) + + tanh_n = np.tanh(1) + if self.jsd_reward: + if self.jsd_function == 'tanh': + js_divergence_batch = np.tanh(np.array(js_divergence_batch)) / tanh_n + #normalize jsd between -1 and 1 + js_divergence_batch = (-1 + 2 * np.array(js_divergence_batch)).tolist() + elif self.js_threshold < 1.0: + # normalizing bound to [0, 2] and then /20 + js_divergence_batch = [2/20 * int(x > self.js_threshold) for x in js_divergence_batch] + + action_q = self.dqn.predict_dip(s2_batch_dipstate, a_batch_one_hot_new) + target_q = self.dqn.predict_target_dip(s2_batch_dipstate, a_batch_one_hot_new) + + action_q = np.reshape(action_q, (s_batch_new.shape[0], -1, self.action_dim)) + target_q = np.reshape(target_q, (s_batch_new.shape[0], -1, self.action_dim)) + + y_i = [] + for k in range(min(s_batch_new.shape[0], self.episodes[self.domainString].size())): + Q_bootstrap_label = 0 + if t_batch_new[k]: + Q_bootstrap_label = r_batch_new[k] + else: + if self.q_update == 'single': + action_Q = target_q[k] + if self.jsd_reward: + Q_bootstrap_label = js_divergence_batch[k] + self.gamma * np.max(action_Q) + else: + Q_bootstrap_label = r_batch_new[k] + js_divergence_batch[k] + self.gamma * np.max(action_Q) + elif self.q_update == 'double': + action_Q = action_q[k] + argmax_tuple = np.unravel_index(np.argmax(action_Q, axis=None), action_Q.shape) + value_Q = target_q[k][argmax_tuple] + if not self.jsd_reward: + Q_bootstrap_label = r_batch_new[k] + 
js_divergence_batch[k] + self.gamma * value_Q + else: + Q_bootstrap_label = js_divergence_batch[k] + self.gamma * value_Q + + y_i.append(Q_bootstrap_label) + + if self.replay_type == 'prioritized': + # update the sum-tree + # update the TD error of the samples in the minibatch + currentQ_s_a_ = action_q[k][a_batch[k]] + error = abs(currentQ_s_a_ - Q_bootstrap_label) + self.episodes[self.domainString].update(idx_batch[k], error) + + reshaped_yi = np.vstack([np.expand_dims(x, 0) for x in y_i]) + + predicted_q_value, _, currentLoss = self.dqn.train(s_batch_new, a_batch_one_hot_new, reshaped_yi) + + self.log_loss() + + if self.episodecount % 1 == 0: + # Update target networks + self.dqn.update_target_network() + + self.savePolicyInc() + + def log_loss(self): + + s_batch_new, s_batch_beliefstate, s_batch_chosen_slot, s2_batch_dipstate, s2_batch_beliefstate, t_batch_new, r_batch_new = \ + [], [], [], [], [], [], [] + + if self.samplecount >= self.minibatch_size * 8 and self.episodecount % self.training_frequency == 0: + logger.info('start training...') + + a_batch_one_hot_new = None + #updating only states where the action is not "pass()" complicates things :/ + #since in a batch we can take only non-pass() actions, we have to loop a bit until we get enough samples + + while len(s_batch_new) < 512: + + s_batch, s_ori_batch, a_batch, r_batch, s2_batch, s2_ori_batch, t_batch, idx_batch, _ = \ + self.episodes[self.domainString].sample_batch() + + a_batch_one_hot = np.eye(self.action_dim, self.action_dim)[a_batch] + #we only wanna update state-action pairs, where action != pass() + valid_steps = [action[-1] != 1 for action in a_batch_one_hot] + a_batch_one_hot = a_batch_one_hot[valid_steps] + + s_batch_new += [s[0] for i, s in enumerate(s_batch) if valid_steps[i]] + s_batch_beliefstate += [s[1] for i, s in enumerate(s_batch) if valid_steps[i]] + s_batch_chosen_slot += [s[2] for i, s in enumerate(s_batch) if valid_steps[i]] + + s2_batch_dipstate += [s[3] for s, valid in zip(s2_batch, valid_steps) if valid] + s2_batch_beliefstate += [s[1] for s, valid in zip(s2_batch, valid_steps) if valid] + + r_batch_new += [r for r, valid in zip(r_batch, valid_steps) if valid] + t_batch_new += [t for t, valid in zip(t_batch, valid_steps) if valid] + + if a_batch_one_hot_new is None: + a_batch_one_hot_new = a_batch_one_hot + else: + a_batch_one_hot_new = np.vstack((a_batch_one_hot_new, a_batch_one_hot)) + + s_batch_new = np.vstack(s_batch_new) + s2_batch_dipstate = np.vstack(s2_batch_dipstate) + + if self.js_threshold < 1.0 or self.jsd_reward: + #TODO: This is highly inefficient + js_divergence_batch = [] + for belief, belief2, slot in zip(s_batch_beliefstate, s2_batch_beliefstate, s_batch_chosen_slot): + if slot != "None": + keys = belief['beliefs'][slot].keys() + + b = [belief['beliefs'][slot]['**NONE**']] + \ + [belief['beliefs'][slot][value] for value in list(keys) if value != '**NONE**'] + + b_2 = [belief2['beliefs'][slot]['**NONE**']] + \ + [belief2['beliefs'][slot][value] for value in list(keys) if value != '**NONE**'] + + js_divergence = self.compute_js_divergence(b, b_2) + js_divergence_batch.append(js_divergence) + else: + js_divergence_batch.append(0.0) + else: + js_divergence_batch = [0] * len(r_batch_new) + + tanh_n = np.tanh(1) + if self.jsd_reward: + if self.jsd_function == 'tanh': + js_divergence_batch = np.tanh(np.array(js_divergence_batch)) / tanh_n + #normalize jsd between -1 and 1 + js_divergence_batch = (-1 + 2 * np.array(js_divergence_batch)).tolist() + elif self.js_threshold < 1.0: + # 
normalizing bound to [0, 2] and then /20 + js_divergence_batch = [2/20 * int(x > self.js_threshold) for x in js_divergence_batch] + + action_q = self.dqn.predict_dip(s2_batch_dipstate, a_batch_one_hot_new) + target_q = self.dqn.predict_target_dip(s2_batch_dipstate, a_batch_one_hot_new) + + action_q = np.reshape(action_q, (s_batch_new.shape[0], -1, self.action_dim)) + target_q = np.reshape(target_q, (s_batch_new.shape[0], -1, self.action_dim)) + + y_i = [] + for k in range(s_batch_new.shape[0]): + Q_bootstrap_label = 0 + if t_batch_new[k]: + Q_bootstrap_label = r_batch_new[k] + else: + if self.q_update == 'single': + action_Q = target_q[k] + if self.jsd_reward: + Q_bootstrap_label = js_divergence_batch[k] + self.gamma * np.max(action_Q) + else: + Q_bootstrap_label = r_batch_new[k] + js_divergence_batch[k] + self.gamma * np.max(action_Q) + elif self.q_update == 'double': + action_Q = action_q[k] + argmax_tuple = np.unravel_index(np.argmax(action_Q, axis=None), action_Q.shape) + value_Q = target_q[k][argmax_tuple] + if not self.jsd_reward: + Q_bootstrap_label = r_batch_new[k] + js_divergence_batch[k] + self.gamma * value_Q + else: + Q_bootstrap_label = js_divergence_batch[k] + self.gamma * value_Q + + y_i.append(Q_bootstrap_label) + + reshaped_yi = np.vstack([np.expand_dims(x, 0) for x in y_i]) + + currentLoss = self.dqn.compute_loss(s_batch_new, a_batch_one_hot_new, reshaped_yi) + + with open(self.log_path, 'a') as file: + file.write(str(currentLoss) + "\n") + + def compute_js_divergence(self, P, Q): + + M = [p + q for p, q in zip(P, Q)] + return 0.5 * (entropy(P, M, base=2) + entropy(Q, M, base=2)) + +# END OF FILE diff --git a/policy/feudalgainRL/NoisyACERPolicy.py b/policy/feudalgainRL/NoisyACERPolicy.py new file mode 100644 index 0000000000000000000000000000000000000000..5854756136445216cb3f58ce3ffb1569d576f1f4 --- /dev/null +++ b/policy/feudalgainRL/NoisyACERPolicy.py @@ -0,0 +1,963 @@ +############################################################################### +# PyDial: Multi-domain Statistical Spoken Dialogue System Software +############################################################################### +# +# Copyright 2015 - 2019 +# Cambridge University Engineering Department Dialogue Systems Group +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +############################################################################### + +''' +ACERPolicy.py - Sample Efficient Actor Critic with Experience Replay +================================================== + +Copyright CUED Dialogue Systems Group 2015 - 2017 + +The implementation of the sample efficient actor critic with truncated importance sampling with bias correction, +the trust region policy optimization method and RETRACE-like multi-step estimation of the value function. +The parameters ACERPolicy.c, ACERPolicy.alpha, ACERPolicy. +The details of the implementation can be found here: https://arxiv.org/abs/1802.03753 + +See also: +https://arxiv.org/abs/1611.01224 +https://arxiv.org/abs/1606.02647 + +.. 
seealso:: CUED Imports/Dependencies: + + import :class:`Policy` + import :class:`utils.ContextLogger` + + + +************************ + +''' +import pickle as pickle +import copy +import json +import numpy as np +import os +import random +import scipy +import scipy.signal +import tensorflow as tf + +import policy.feudalgainRL.noisyacer as noisy_acer +#from policy.DRL import replay_policy as replay_policy +from policy.DRL import utils as drlutils +from policy import Policy +from policy import SummaryAction +import ontology.FlatOntologyManager as FlatOnt +import utils +from policy.DRL.replay_buffer_episode_acer import ReplayBufferEpisode +from policy.DRL.replay_prioritised_episode import ReplayPrioritisedEpisode +from policy.Policy import TerminalAction, TerminalState +from curiosity.curiosity_module import Curious +from utils import ContextLogger, DiaAct +from utils.Settings import config as cfg + +logger = utils.ContextLogger.getLogger('') + + +# --- for flattening the belief --- # +def flatten_belief(belief, domainUtil, merge=False): + belief = belief.getDomainState(domainUtil.domainString) + if isinstance(belief, TerminalState): + if domainUtil.domainString == 'CamRestaurants': + return [0] * 268 + elif domainUtil.domainString == 'CamHotels': + return [0] * 111 + elif domainUtil.domainString == 'SFRestaurants': + return [0] * 633 + elif domainUtil.domainString == 'SFHotels': + return [0] * 438 + elif domainUtil.domainString == 'Laptops11': + return [0] * 257 + elif domainUtil.domainString == 'TV': + return [0] * 188 + + policyfeatures = ['full', 'method', 'discourseAct', 'requested', \ + 'lastActionInformNone', 'offerHappened', 'inform_info'] + + flat_belief = [] + for feat in policyfeatures: + add_feature = [] + if feat == 'full': + # for slot in self.sorted_slots: + for slot in domainUtil.ontology['informable']: + for value in domainUtil.ontology['informable'][slot]: # + ['**NONE**']: + add_feature.append(belief['beliefs'][slot][value]) + + # pfb30 11.03.2017 + try: + add_feature.append(belief['beliefs'][slot]['**NONE**']) + except: + add_feature.append(0.) # for NONE + try: + add_feature.append(belief['beliefs'][slot]['dontcare']) + except: + add_feature.append(0.) # for dontcare + + elif feat == 'method': + add_feature = [belief['beliefs']['method'][method] for method in domainUtil.ontology['method']] + elif feat == 'discourseAct': + add_feature = [belief['beliefs']['discourseAct'][discourseAct] + for discourseAct in domainUtil.ontology['discourseAct']] + elif feat == 'requested': + add_feature = [belief['beliefs']['requested'][slot] \ + for slot in domainUtil.ontology['requestable']] + elif feat == 'lastActionInformNone': + add_feature.append(float(belief['features']['lastActionInformNone'])) + elif feat == 'offerHappened': + add_feature.append(float(belief['features']['offerHappened'])) + elif feat == 'inform_info': + add_feature += belief['features']['inform_info'] + else: + logger.error('Invalid feature name in config: ' + feat) + + flat_belief += add_feature + + return flat_belief + + +# Discounting function used to calculate discounted returns. 
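+# lfilter([1], [1, -gamma], x[::-1])[::-1] implements y_t = x_t + gamma * y_{t+1},
+# i.e. y_t = sum_k gamma^k * x_{t+k}.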
+def discount(x, gamma): + return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1] + + +class NoisyACERPolicy(Policy.Policy): + ''' + Derived from :class:`Policy` + ''' + def __init__(self, in_policy_file, out_policy_file, domainString='CamRestaurants', is_training=False): + super(NoisyACERPolicy, self).__init__(domainString, is_training) + + tf.reset_default_graph() + + self.file = in_policy_file + self.in_policy_file = self.file + self.out_policy_file = out_policy_file + self.is_training = is_training + self.accum_belief = [] + self.prev_state_check = None + + self.domainString = domainString + self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString) + + self.load_buffer = True + if cfg.has_option('policy', 'bootstrap_buffer'): + self.load_buffer = cfg.getboolean('policy', 'bootstrap_buffer') + print("SACER: BOOTSTRAP BUFFER: ", self.load_buffer) + + self.load_policy = True + if cfg.has_option('policy', 'bootstrap_master_policy'): + self.load_policy = cfg.getboolean('policy', 'bootstrap_master_policy') + print("SACER: BOOTSTRAP Policy: ", self.load_policy) + + # parameter settings + + if 0: # cfg.has_option('dqnpolicy', 'n_in'): #ic304: this was giving me a weird error, disabled it until i can check it deeper + self.n_in = cfg.getint('dqnpolicy', 'n_in') + else: + self.n_in = self.get_n_in(domainString) + + self.actor_lr = 0.0001 + if cfg.has_option('dqnpolicy', 'actor_lr'): + self.actor_lr = cfg.getfloat('dqnpolicy', 'actor_lr') + + self.critic_lr = 0.001 + if cfg.has_option('dqnpolicy', 'critic_lr'): + self.critic_lr = cfg.getfloat('dqnpolicy', 'critic_lr') + + self.delta = 1. + if cfg.has_option('dqnpolicy', 'delta'): + self.delta = cfg.getfloat('dqnpolicy', 'delta') + + self.alpha = 0.99 + if cfg.has_option('dqnpolicy', 'beta'): + self.alpha = cfg.getfloat('dqnpolicy', 'beta') + + self.c = 10. + if cfg.has_option('dqnpolicy', 'is_threshold'): + self.c = cfg.getfloat('dqnpolicy', 'is_threshold') + + self.randomseed = 1234 + if cfg.has_option('GENERAL', 'seed'): + self.randomseed = cfg.getint('GENERAL', 'seed') + + self.gamma = 0.99 + if cfg.has_option('dqnpolicy', 'gamma'): + self.gamma = cfg.getfloat('dqnpolicy', 'gamma') + + self.regularisation = 'l2' + if cfg.has_option('dqnpolicy', 'regularisation'): + self.regularisation = cfg.get('dqnpolicy', 'regularisation') + + self.learning_rate = 0.001 + if cfg.has_option('dqnpolicy', 'learning_rate'): + self.learning_rate = cfg.getfloat('dqnpolicy', 'learning_rate') + + self.exploration_type = 'e-greedy' # Boltzman + if cfg.has_option('dqnpolicy', 'exploration_type'): + self.exploration_type = cfg.get('dqnpolicy', 'exploration_type') + + self.episodeNum = 1000 + if cfg.has_option('dqnpolicy', 'episodeNum'): + self.episodeNum = cfg.getfloat('dqnpolicy', 'episodeNum') + + self.maxiter = 4000 + if cfg.has_option('dqnpolicy', 'maxiter'): + self.maxiter = cfg.getfloat('dqnpolicy', 'maxiter') + + self.curiosityreward = False + if cfg.has_option('eval', 'curiosityreward'): + self.curiosityreward = cfg.getboolean('eval', 'curiosityreward') + + self.epsilon = 1 + if cfg.has_option('dqnpolicy', 'epsilon'): + self.epsilon = cfg.getfloat('dqnpolicy', 'epsilon') + + if not self.curiosityreward: # no eps-greedy exploration when curious expl. 
is used + self.epsilon_start = 1 + if cfg.has_option('dqnpolicy', 'epsilon_start'): + self.epsilon_start = cfg.getfloat('dqnpolicy', 'epsilon_start') + else: + self.epsilon_start = 0 + + self.epsilon_end = 1 + if cfg.has_option('dqnpolicy', 'epsilon_end'): + self.epsilon_end = cfg.getfloat('dqnpolicy', 'epsilon_end') + + self.priorProbStart = 1.0 + if cfg.has_option('dqnpolicy', 'prior_sample_prob_start'): + self.priorProbStart = cfg.getfloat('dqnpolicy', 'prior_sample_prob_start') + + self.priorProbEnd = 0.1 + if cfg.has_option('dqnpolicy', 'prior_sample_prob_end'): + self.priorProbEnd = cfg.getfloat('dqnpolicy', 'prior_sample_prob_end') + + self.policyfeatures = [] + if cfg.has_option('dqnpolicy', 'features'): + logger.info('Features: ' + str(cfg.get('dqnpolicy', 'features'))) + self.policyfeatures = json.loads(cfg.get('dqnpolicy', 'features')) + + self.max_k = 5 + if cfg.has_option('dqnpolicy', 'max_k'): + self.max_k = cfg.getint('dqnpolicy', 'max_k') + + self.learning_algorithm = 'drl' + if cfg.has_option('dqnpolicy', 'learning_algorithm'): + self.learning_algorithm = cfg.get('dqnpolicy', 'learning_algorithm') + logger.info('Learning algorithm: ' + self.learning_algorithm) + + self.minibatch_size = 32 + if cfg.has_option('dqnpolicy', 'minibatch_size'): + self.minibatch_size = cfg.getint('dqnpolicy', 'minibatch_size') + + self.capacity = 1000 + if cfg.has_option('dqnpolicy', 'capacity'): + self.capacity = cfg.getint('dqnpolicy','capacity') + + self.replay_type = 'vanilla' + if cfg.has_option('dqnpolicy', 'replay_type'): + self.replay_type = cfg.get('dqnpolicy', 'replay_type') + + self.architecture = 'vanilla' + if cfg.has_option('dqnpolicy', 'architecture'): + self.architecture = cfg.get('dqnpolicy', 'architecture') + + self.q_update = 'single' + if cfg.has_option('dqnpolicy', 'q_update'): + self.q_update = cfg.get('dqnpolicy', 'q_update') + + self.h1_size = 130 + if cfg.has_option('dqnpolicy', 'h1_size'): + self.h1_size = cfg.getint('dqnpolicy', 'h1_size') + + self.h2_size = 50 + if cfg.has_option('dqnpolicy', 'h2_size'): + self.h2_size = cfg.getint('dqnpolicy', 'h2_size') + + self.save_step = 200 + if cfg.has_option('policy', 'save_step'): + self.save_step = cfg.getint('policy', 'save_step') + + self.temperature = 0.0 + if cfg.has_option('policy', 'temperature'): + self.temperature = cfg.getfloat('policy', 'temperature') + + self.behaviour_cloning = False + if cfg.has_option('policy', 'behaviour_cloning'): + self.behaviour_cloning = cfg.getboolean('policy', 'behaviour_cloning') + if self.behaviour_cloning: + print("We use behaviour cloning in addition.") + + self.combined_ER = False + if cfg.has_option('policy', 'combined_ER'): + self.combined_ER = cfg.getboolean('policy', 'combined_ER') + + self.master_space = False + if cfg.has_option('policy', 'master_space'): + self.master_space = cfg.getboolean('policy', 'master_space') + + self.optimize_ER = False + if cfg.has_option('policy', 'optimize_ER'): + self.optimize_ER = cfg.getboolean('policy', 'optimize_ER') + + self.importance_sampling = 'soft' + if cfg.has_option('dqnpolicy', 'importance_sampling'): + self.importance_sampling = cfg.get('dqnpolicy', 'importance_sampling') + + self.train_iters_per_episode = 1 + if cfg.has_option('dqnpolicy', 'train_iters_per_episode'): + self.train_iters_per_episode = cfg.getint('dqnpolicy', 'train_iters_per_episode') + + self.training_frequency = 2 + if cfg.has_option('dqnpolicy', 'training_frequency'): + self.training_frequency = cfg.getint('dqnpolicy', 'training_frequency') + + # domain specific 
parameter settings (overrides general policy parameter settings) + if cfg.has_option('dqnpolicy_'+domainString, 'n_in'): + self.n_in = cfg.getint('dqnpolicy_'+domainString, 'n_in') + + if cfg.has_option('dqnpolicy_'+domainString, 'actor_lr'): + self.actor_lr = cfg.getfloat('dqnpolicy_'+domainString, 'actor_lr') + + if cfg.has_option('dqnpolicy_'+domainString, 'critic_lr'): + self.critic_lr = cfg.getfloat('dqnpolicy_'+domainString, 'critic_lr') + + if cfg.has_option('dqnpolicy_'+domainString, 'delta'): + self.delta = cfg.getfloat('dqnpolicy_'+domainString, 'delta') + + if cfg.has_option('dqnpolicy_' + domainString, 'beta'): + self.alpha = cfg.getfloat('dqnpolicy_' + domainString, 'beta') + + if cfg.has_option('dqnpolicy_' + domainString, 'is_threshold'): + self.c = cfg.getfloat('dqnpolicy_' + domainString, 'is_threshold') + + if cfg.has_option('dqnpolicy_'+domainString, 'gamma'): + self.gamma = cfg.getfloat('dqnpolicy_'+domainString, 'gamma') + + if cfg.has_option('dqnpolicy_'+domainString, 'regularisation'): + self.regularisation = cfg.get('dqnpolicy_'+domainString, 'regulariser') + + if cfg.has_option('dqnpolicy_'+domainString, 'learning_rate'): + self.learning_rate = cfg.getfloat('dqnpolicy_'+domainString, 'learning_rate') + + if cfg.has_option('dqnpolicy_'+domainString, 'exploration_type'): + self.exploration_type = cfg.get('dqnpolicy_'+domainString, 'exploration_type') + + if cfg.has_option('dqnpolicy_'+domainString, 'episodeNum'): + self.episodeNum = cfg.getfloat('dqnpolicy_'+domainString, 'episodeNum') + + if cfg.has_option('dqnpolicy_'+domainString, 'maxiter'): + self.maxiter = cfg.getfloat('dqnpolicy_'+domainString, 'maxiter') + + if cfg.has_option('dqnpolicy_'+domainString, 'epsilon'): + self.epsilon = cfg.getfloat('dqnpolicy_'+domainString, 'epsilon') + + if cfg.has_option('dqnpolicy_'+domainString, 'epsilon_start'): + self.epsilon_start = cfg.getfloat('dqnpolicy_'+domainString, 'epsilon_start') + + if cfg.has_option('dqnpolicy_'+domainString, 'epsilon_end'): + self.epsilon_end = cfg.getfloat('dqnpolicy_'+domainString, 'epsilon_end') + + if cfg.has_option('dqnpolicy_'+domainString, 'prior_sample_prob_start'): + self.priorProbStart = cfg.getfloat('dqnpolicy_'+domainString, 'prior_sample_prob_start') + + if cfg.has_option('dqnpolicy_'+domainString, 'prior_sample_prob_end'): + self.priorProbEnd = cfg.getfloat('dqnpolicy_'+domainString, 'prior_sample_prob_end') + + if cfg.has_option('dqnpolicy_'+domainString, 'features'): + logger.info('Features: ' + str(cfg.get('dqnpolicy_'+domainString, 'features'))) + self.policyfeatures = json.loads(cfg.get('dqnpolicy_'+domainString, 'features')) + + if cfg.has_option('dqnpolicy_'+domainString, 'max_k'): + self.max_k = cfg.getint('dqnpolicy_'+domainString, 'max_k') + + if cfg.has_option('dqnpolicy_'+domainString, 'learning_algorithm'): + self.learning_algorithm = cfg.get('dqnpolicy_'+domainString, 'learning_algorithm') + logger.info('Learning algorithm: ' + self.learning_algorithm) + + if cfg.has_option('dqnpolicy_'+domainString, 'minibatch_size'): + self.minibatch_size = cfg.getint('dqnpolicy_'+domainString, 'minibatch_size') + + if cfg.has_option('dqnpolicy_'+domainString, 'capacity'): + self.capacity = cfg.getint('dqnpolicy_'+domainString,'capacity') + + if cfg.has_option('dqnpolicy_'+domainString, 'replay_type'): + self.replay_type = cfg.get('dqnpolicy_'+domainString, 'replay_type') + + if cfg.has_option('dqnpolicy_'+domainString, 'architecture'): + self.architecture = cfg.get('dqnpolicy_'+domainString, 'architecture') + + if 
cfg.has_option('dqnpolicy_'+domainString, 'q_update'): + self.q_update = cfg.get('dqnpolicy_'+domainString, 'q_update') + + if cfg.has_option('dqnpolicy_'+domainString, 'h1_size'): + self.h1_size = cfg.getint('dqnpolicy_'+domainString, 'h1_size') + + if cfg.has_option('dqnpolicy_'+domainString, 'h2_size'): + self.h2_size = cfg.getint('dqnpolicy_'+domainString, 'h2_size') + + if cfg.has_option('policy_' + domainString, 'save_step'): + self.save_step = cfg.getint('policy_' + domainString, 'save_step') + + if cfg.has_option('dqnpolicy_'+domainString, 'importance_sampling'): + self.importance_sampling = cfg.get('dqnpolicy_'+domainString, 'importance_sampling') + + if cfg.has_option('dqnpolicy_' + domainString, 'train_iters_per_episode'): + self.train_iters_per_episode = cfg.getint('dqnpolicy_' + domainString, 'train_iters_per_episode') + + if cfg.has_option('dqnpolicy_'+domainString, 'training_frequency'): + self.training_frequency = cfg.getint('dqnpolicy_'+domainString, 'training_frequency') + + self.episode_ct = 0 + + self.episode_ave_max_q = [] + self.mu_prob = 0. # behavioral policy + + # os.environ["CUDA_VISIBLE_DEVICES"]="" + + # init session + self.sess = tf.Session() + + with tf.device("/cpu:0"): + + np.random.seed(self.randomseed) + tf.set_random_seed(self.randomseed) + random.seed(self.randomseed) + + # initialise an replay buffer + if self.replay_type == 'vanilla': + self.episodes[self.domainString] = ReplayBufferEpisode(self.capacity, self.minibatch_size, + self.randomseed) + elif self.replay_type == 'prioritized': + self.episodes[self.domainString] = ReplayPrioritisedEpisode(self.capacity, self.minibatch_size, self.randomseed) + #replay_buffer = ReplayBuffer(self.capacity, self.randomseed) + #self.episodes = [] + self.samplecount = 0 + self.episodecount = 0 + + # construct the models + self.state_dim = self.n_in + if self.master_space: + self.masteraction = MasterAction.MasterAction(domainString) + self.inform_ways = len(self.masteraction.inform_ways) + self.summary_action_dim = len(self.masteraction.summary_action_names) + self.payload_dim = len(self.masteraction.inform_names) + self.action_dim = [self.summary_action_dim, self.payload_dim, self.inform_ways] + self.global_mu = [0. for _ in range(self.action_dim[0])] + #dimension of master space should then be: + # summary_action_dim - inform_ways + inform_ways * payload_dim + else: + self.summaryaction = SummaryAction.SummaryAction(domainString) + self.action_dim = len(self.summaryaction.action_names) + action_bound = len(self.summaryaction.action_names) + #self.stats = [0 for _ in range(self.action_dim)] + self.global_mu = [0. 
for _ in range(self.action_dim)] + + self.sacer = noisy_acer.NoisyACERNetwork(self.sess, self.state_dim, self.action_dim, self.critic_lr, self.delta, + self.c, self.alpha, self.h1_size, self.h2_size, self.is_training, + temperature=self.temperature) + + #if self.optimize_ER: + # self.replay_policy = replay_policy.ReplayPolicy(self.sess, seed=self.randomseed) + + # when all models are defined, init all variables + init_op = tf.global_variables_initializer() + self.sess.run(init_op) + + if self.load_policy: + self.loadPolicy(self.in_policy_file) + print('loaded replay size: ', self.episodes[self.domainString].size()) + else: + print("We do not load a previous policy.") + + if self.curiosityreward: + self.curiosityFunctions = Curious() + #self.acer.update_target_network() + + def get_n_in(self, domain_string): + if domain_string == 'CamRestaurants': + return 268 + elif domain_string == 'CamHotels': + return 111 + elif domain_string == 'SFRestaurants': + return 636 + elif domain_string == 'SFHotels': + return 438 + elif domain_string == 'Laptops6': + return 268 # ic340: this is wrong + elif domain_string == 'Laptops11': + return 257 + elif domain_string is 'TV': + return 188 + else: + print('DOMAIN {} SIZE NOT SPECIFIED, PLEASE DEFINE n_in'.format(domain_string)) + + def act_on(self, state, hyps=None): + if self.lastSystemAction is None and self.startwithhello: + systemAct, nextaIdex, mu, mask = 'hello()', -1, None, None + else: + systemAct, nextaIdex, mu, mask = self.tion(state) + self.lastSystemAction = systemAct + self.summaryAct = nextaIdex + self.prev_mu = mu + self.prev_mask = mask + self.prevbelief = state + + systemAct = DiaAct.DiaAct(systemAct) + return systemAct + + def record(self, reward, domainInControl=None, weight=None, state=None, action=None): + if domainInControl is None: + domainInControl = self.domainString + if self.actToBeRecorded is None: + #self.actToBeRecorded = self.lastSystemAction + self.actToBeRecorded = self.summaryAct + + if state is None: + state = self.prevbelief + if action is None: + action = self.actToBeRecorded + mu_weight = self.prev_mu + mask = self.prev_mask + + cState, cAction = self.convertStateAction(state, action) + + # normalising total return to -1~1 + #reward /= 40.0 + reward /= 20.0 + """ + reward = float(reward+10.0)/40.0 + """ + value = self.sacer.predict_value([cState], [mask]) + + if self.replay_type == 'vanilla': + self.episodes[domainInControl].record(state=cState, \ + state_ori=state, action=cAction, reward=reward, value=value[0], distribution=mu_weight, mask=mask) + elif self.replay_type == 'prioritized': + self.episodes[domainInControl].record(state=cState, \ + state_ori=state, action=cAction, reward=reward, value=value[0], distribution=mu_weight, mask=mask) + + self.actToBeRecorded = None + self.samplecount += 1 + return + + def finalizeRecord(self, reward, domainInControl=None): + if domainInControl is None: + domainInControl = self.domainString + if self.episodes[domainInControl] is None: + logger.warning("record attempted to be finalized for domain where nothing has been recorded before") + return + + #print 'Episode Avg_Max_Q', float(self.episode_ave_max_q)/float(self.episodes[domainInControl].size()) + #print 'Episode Avg_Max_Q', np.mean(self.episode_ave_max_q) + #print self.stats + + # normalising total return to -1~1 + reward /= 20.0 + + terminal_state, terminal_action = self.convertStateAction(TerminalState(), TerminalAction()) + value = 0.0 # not effect on experience replay + + def calculate_discountR_advantage(r_episode, 
v_episode): + ######################################################################### + # Here we take the rewards and values from the rollout, and use them to + # generate the advantage and discounted returns. + # The advantage function uses "Generalized Advantage Estimation" + bootstrap_value = 0.0 + self.r_episode_plus = np.asarray(r_episode + [bootstrap_value]) + discounted_r_episode = discount(self.r_episode_plus,self.gamma)[:-1] + self.v_episode_plus = np.asarray(v_episode + [bootstrap_value]) + advantage = r_episode + self.gamma * self.v_episode_plus[1:] - self.v_episode_plus[:-1] + advantage = discount(advantage,self.gamma) + ######################################################################### + return discounted_r_episode, advantage + + if self.replay_type == 'vanilla': + self.episodes[domainInControl].record(state=terminal_state, \ + state_ori=TerminalState(), action=terminal_action, reward=reward, value=value, terminal=True, distribution=None) + elif self.replay_type == 'prioritized': + episode_r, episode_v = self.episodes[domainInControl].record_final_and_get_episode(state=terminal_state, \ + state_ori=TerminalState(), + action=terminal_action, + reward=reward, + value=value) + + # TD_error is a list of td error in the current episode + _, TD_error = calculate_discountR_advantage(episode_r, episode_v) + episodic_TD = np.mean(np.absolute(TD_error)) + print('episodic_TD') + print(episodic_TD) + self.episodes[domainInControl].insertPriority(episodic_TD) + + return + + def convertStateAction(self, state, action): + if isinstance(state, TerminalState): + if self.domainUtil.domainString == 'CamRestaurants': + return [0] * 268, action + elif self.domainUtil.domainString == 'CamHotels': + return [0] * 111, action + elif self.domainUtil.domainString == 'SFRestaurants': + return [0] * 633, action + elif self.domainUtil.domainString == 'SFHotels': + return [0] * 438, action + elif self.domainUtil.domainString == 'Laptops11': + return [0] * 257, action + elif self.domainUtil.domainString == 'TV': + return [0] * 188, action + else: + flat_belief = flatten_belief(state, self.domainUtil) + self.prev_state_check = flat_belief + + return flat_belief, action + + def tion(self, beliefstate): + ''' + select next action + + :param beliefstate: + :param hyps: + :returns: (int) next summarye action + ''' + beliefVec = flatten_belief(beliefstate, self.domainUtil) + if self.master_space: + execMask = self.masteraction.getExecutableMask() + else: + execMask = self.summaryaction.getExecutableMask(beliefstate, self.lastSystemAction) + #execMask = np.zeros(self.action_dim) + + def apply_mask(prob, maskval, baseline=9.99999975e-06): + return prob if maskval == 0.0 else baseline # not quite 0.0 to avoid division by zero + + action_prob = self.sacer.predict_policy(np.reshape(beliefVec, (1, len(beliefVec))), + np.reshape(execMask, (1, len(execMask))))[0] + + if self.exploration_type == 'e-greedy' or not self.is_training: + # epsilon greedy + epsilon = self.epsilon if self.is_training else 0. + if not self.master_space: + # a bit hacky here because execMask has a different shape than action_prob + eps_prob = [apply_mask(prob, admissible) for prob, admissible in zip(np.ones(len(action_prob)), execMask)] + else: + #this is fine because we have no execMask for master space at the moment + eps_prob = np.ones(len(action_prob)) + eps_prob /= sum(eps_prob) + + #action_prob = [apply_mask(prob, admissible) for prob, admissible in zip(action_prob, execMask)] + best_index = np.argmax(action_prob) + best_prob = [1. 
if i == best_index else 0. for i in range(len(action_prob))] + + #we sample a random action with probability epsilon and sample from the policy distribution with probability 1-epsilon + action_prob = epsilon * np.array(eps_prob) + (1. - epsilon) * action_prob + + #take the greedy action during evaluation + if not self.is_training: + action_prob = np.array(best_prob) + + elif self.exploration_type == 'standard': + #action_prob = [apply_mask(prob, admissible) for prob, admissible in zip(action_prob, execMask)] + print(action_prob) + + if not self.is_training: + best_index = np.argmax(action_prob) + best_prob = [1. if i == best_index else 0. for i in range(len(action_prob))] + action_prob = np.array(best_prob) + + nextaIdex = np.random.choice(len(action_prob), p=action_prob / sum(action_prob)) + mu = action_prob / sum(action_prob) + + if self.master_space: + beliefstate = beliefstate.getDomainState(self.domainUtil.domainString) + print("MASTER ACTION: ", self.masteraction.action_names[nextaIdex]) + masterAct = self.masteraction.Convert(beliefstate, self.masteraction.action_names[nextaIdex], self.lastSystemAction) + print("MASTER ACT: ", masterAct) + else: + summaryAct = self.summaryaction.action_names[nextaIdex] + beliefstate = beliefstate.getDomainState(self.domainUtil.domainString) + masterAct = self.summaryaction.Convert(beliefstate, summaryAct, self.lastSystemAction) + return masterAct, nextaIdex, mu, execMask + + def train(self): + ''' + call this function when the episode ends + ''' + USE_GLOBAL_MU = False + self.episode_ct += 1 + + if not self.is_training: + logger.info("Not in training mode") + return + else: + logger.info("Update acer policy parameters.") + + self.episodecount += 1 + logger.info("Sample Num so far: %s" % (self.samplecount)) + logger.info("Episode Num so far: %s" % (self.episodecount)) + #if True: + if self.samplecount >= self.minibatch_size * 3 and self.episodecount % self.training_frequency == 0: + # if self.episodecount >= self.minibatch_size and self.episodecount % 2 == 0: + # if self.episodecount >= self.minibatch_size * 3 and self.episodecount % 2 == 0: + # if self.samplecount >= self.capacity and self.episodecount % 5 == 0: + logger.info('start training...') + + for _ in range(self.train_iters_per_episode): + + if self.optimize_ER: + episode_features = self.compute_episode_features() + sub_buffer = self.replay_policy.sample_buffer(episode_features, self.episodes[self.domainString].buffer) + else: + sub_buffer = [] + + if self.replay_type == 'vanilla' or self.replay_type == 'prioritized': + s_batch, s_ori_batch, a_batch, r_batch, s2_batch, s2_ori_batch, t_batch, idx_batch, v_batch, mu_policy, mask_batch = \ + self.episodes[self.domainString].sample_batch(sub_buffer) + if USE_GLOBAL_MU: + mu_sum = sum(self.global_mu) + mu_normalised = np.array([c / mu_sum for c in self.global_mu]) + #print >> sys.stderr, len(mu_policy), len(mu_policy[0]), mu_policy[0][0] + mu_policy = [[mu_normalised for _ in range(len(mu_policy[i]))] for i in range(len(mu_policy))] + else: + assert False # not implemented yet + + discounted_r_batch = [] + advantage_batch = [] + + def calculate_discountR_advantage(r_episode, v_episode): + ######################################################################### + # Here we take the rewards and values from the rolloutv, and use them to + # generate the advantage and discounted returns. + # The advantage function uses "Generalized Advantage Estimation" + bootstrap_value = 0.0 + # r_episode rescale by rhos? 
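+                # NOTE: with bootstrap_value = 0 the terminal state is treated as having zero
+                # value, so discounted_r_episode[t] is the plain Monte-Carlo return from step t,
+                # and advantage[t] is the discounted sum of one-step TD errors
+                # delta_t = r_t + gamma * V(s_{t+1}) - V(s_t), i.e. GAE with lambda = 1.
+                # Illustrative numbers only (assuming gamma = 0.99; rewards already scaled by /20):
+                #   r_episode = [-0.05, -0.05, 1.0], v_episode = [0.3, 0.5, 0.8]
+                #   -> discounted returns ~ [0.881, 0.940, 1.000]
+                #   -> TD errors          ~ [0.145, 0.242, 0.200]
+                #   -> advantages         ~ [0.581, 0.440, 0.200]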
+ self.r_episode_plus = np.asarray(r_episode + [bootstrap_value]) + discounted_r_episode = discount(self.r_episode_plus, self.gamma)[:-1] + self.v_episode_plus = np.asarray(v_episode + [bootstrap_value]) + # change sth here + advantage = r_episode + self.gamma * self.v_episode_plus[1:] - self.v_episode_plus[:-1] + advantage = discount(advantage, self.gamma) + ######################################################################### + return discounted_r_episode, advantage + + if self.replay_type == 'prioritized': + for item_r, item_v, item_idx in zip(r_batch, v_batch, idx_batch): + # r, a = calculate_discountR_advantage(item_r, np.concatenate(item_v).ravel().tolist()) + r, a = calculate_discountR_advantage(item_r, item_v) + + # flatten nested numpy array and turn it into list + discounted_r_batch += r.tolist() + advantage_batch += a.tolist() + + # update the sum-tree + # update the TD error of the samples (episode) in the minibatch + episodic_TD_error = np.mean(np.absolute(a)) + self.episodes[self.domainString].update(item_idx, episodic_TD_error) + else: + for item_r, item_v in zip(r_batch, v_batch): + # r, a = calculate_discountR_advantage(item_r, np.concatenate(item_v).ravel().tolist()) + r, a = calculate_discountR_advantage(item_r, item_v) + + # flatten nested numpy array and turn it into list + discounted_r_batch += r.tolist() + advantage_batch += a.tolist() + + batch_size = len(s_batch) + + if self.optimize_ER: + + if self.episodes[self.domainString].count < self.minibatch_size: + random_indices = list(range(len(self.episodes[self.domainString].buffer))) + else: + random_indices = random.sample(range(len(self.episodes[self.domainString].buffer)), self.minibatch_size) + if len(self.replay_policy.sampled_indices) < self.minibatch_size: + random_sampled_indices = self.replay_policy.sampled_indices + else: + random_sampled_indices = random.sample(self.replay_policy.sampled_indices, self.minibatch_size) + sampled_indices = random_indices + random_sampled_indices + + start_states = [self.episodes[self.domainString].buffer[i][0][0] for i in sampled_indices] + start_masks = [self.episodes[self.domainString].buffer[i][0][9] for i in sampled_indices] + + start_values = self.sacer.predict_value(start_states, start_masks) + average_start_value = np.mean(start_values) + + if self.master_space: + a_dim = self.action_dim[0] - self.action_dim[2] + self.action_dim[2] * self.action_dim[1] + else: + a_dim = self.action_dim + a_batch_one_hot = np.eye(a_dim)[np.concatenate(a_batch, axis=0).tolist()] + + if self.behaviour_cloning: + behaviour_mask = [] + for r in r_batch: + if r[-1] > 0: + #episode was successful + behaviour_mask = behaviour_mask + [1] * len(r) + else: + behaviour_mask = behaviour_mask + [0] * len(r) + behaviour_mask = np.array(behaviour_mask, dtype=np.float32) + else: + behaviour_mask = np.zeros(shape=[sum([len(l) for l in s_batch])], dtype=np.float32) + + # train curiosity model (Paula) + if self.curiosityreward: + self.curiosityFunctions.training(np.concatenate(np.array(s2_batch), axis=0).tolist(), + np.concatenate(np.array(s_batch), axis=0).tolist(), + a_batch_one_hot) + + loss, entropy, optimize = \ + self.sacer.train(np.concatenate(np.array(s_batch), axis=0).tolist(), a_batch_one_hot, + np.concatenate(np.array(mask_batch), axis=0).tolist(), + np.concatenate(np.array(r_batch), axis=0).tolist(), s_batch, r_batch, self.gamma, + np.concatenate(np.array(mu_policy), axis=0), + discounted_r_batch, advantage_batch, + mu_values=np.concatenate(np.array(v_batch), axis=0), + 
behaviour_mask=behaviour_mask) + + ent, norm_loss = entropy/float(batch_size), loss/float(batch_size) + + if self.optimize_ER: + start_values = self.sacer.predict_value(start_states, start_masks) + new_average_start_value = np.mean(start_values) + + for number, index in enumerate(random_indices): + self.episodes[self.domainString].buffer[index][0][6] = start_values[number] + for number, index in enumerate(random_sampled_indices): + self.episodes[self.domainString].buffer[index][0][6] = start_values[number + len(random_indices)] + + if self.minibatch_size < self.episodecount: + print("REWARD SIGNAL ER ACTOR: ", new_average_start_value - average_start_value) + self.replay_policy.train_ER_actor(new_average_start_value - average_start_value) + + self.savePolicyInc() # self.out_policy_file) + + def savePolicy(self, FORCE_SAVE=False): + """ + Does not use this, cause it will be called from agent after every episode. + we want to save the policy only periodically. + """ + pass + + def savePolicyInc(self, FORCE_SAVE=False): + """ + save model and replay buffer + """ + if self.episodecount % self.save_step == 0: + # save_path = self.saver.save(self.sess, self.out_policy_file+'.ckpt') + self.sacer.save_network(self.out_policy_file+'.acer.ckpt') + if self.curiosityreward: + self.curiosityFunctions.save_ICM('_curiosity_model/ckpt-curiosity') + + f = open(self.out_policy_file+'.episode', 'wb') + for obj in [self.episodecount, self.episodes[self.domainString], self.global_mu]: + pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL) + f.close() + # logger.info("Saving model to %s and replay buffer..." % save_path) + + def loadPolicy(self, filename): + """ + load model and replay buffer + """ + # load models + self.sacer.load_network(filename+'.acer.ckpt') + # load replay buffer + if self.load_buffer: + try: + print('load from: ', filename) + f = open(filename+'.episode', 'rb') + loaded_objects = [] + for i in range(2): # load nn params and collected data + loaded_objects.append(pickle.load(f)) + self.episodecount = int(loaded_objects[0]) + self.episodes[self.domainString] = copy.deepcopy(loaded_objects[1]) + self.global_mu = loaded_objects[2] + logger.info("Loading both model from %s and replay buffer..." 
% filename) + f.close() + except: + logger.info("Loading only models...") + else: + print("SACER: We do not load the buffer.") + + def restart(self): + self.summaryAct = None + self.lastSystemAction = None + self.prevbelief = None + self.prev_mu = None + self.prev_mask = None + self.actToBeRecorded = None + self.epsilon = self.epsilon_start - (self.epsilon_start - self.epsilon_end) * float(self.episodeNum+self.episodecount) / float(self.maxiter) + # print 'current eps', self.epsilon + # self.episodes = dict.fromkeys(OntologyUtils.available_domains, None) + # self.episodes[self.domainString] = ReplayBuffer(self.capacity, self.randomseed) + self.episode_ave_max_q = [] + + def compute_episode_features(self): + + episode_features = [] + + self.update_buffer_divergence() + + for index, episode in enumerate(self.episodes[self.domainString].buffer): + + success = 1 if episode[-1][3] > 0 else 0 + total_return = len(episode) + if success == 1: + total_return += 1 + + timestep = index / len(self.episodes[self.domainString].buffer) + + episode_features.append([success, total_return, timestep, episode[0][6], episode[-1][10]]) + + successful_dialogs = [1 for epi in episode_features if epi[0]==1] + print("NUMBER OF SUCCESSFUL DIALOGS: ", len(successful_dialogs)) + + return episode_features + + def update_buffer_divergence(self): + if self.episodes[self.domainString].count < self.minibatch_size: + random_indices = list(range(len(self.episodes[self.domainString].buffer))) + else: + random_indices = random.sample(range(len(self.episodes[self.domainString].buffer)), self.minibatch_size) + + episodes = [self.episodes[self.domainString].buffer[i] for i in random_indices] + + s_batch = [timestep[0] for epi in episodes for timestep in epi] + a_batch = [timestep[2] for epi in episodes for timestep in epi] + mu_policy = [timestep[8] for epi in episodes for timestep in epi] + mask_batch = [timestep[9] for epi in episodes for timestep in epi] + + a_batch_one_hot = np.eye(self.action_dim)[a_batch] + + rho = self.sacer.compute_rho(s_batch, a_batch_one_hot, mu_policy, mask_batch) + + #pi_prob = self.sacer.compute_responsible_output(s_batch, a_batch_one_hot, mask_batch) + #product = rho * pi_prob + + #TODO: normalize by c? + product = np.minimum(self.c, rho) + + offset = 0 + for index in random_indices: + length = len(self.episodes[self.domainString].buffer[index]) + episode_divergence = product[offset:length+offset] + episode_divergence = sum(episode_divergence) / length + + offset = offset + length + + self.episodes[self.domainString].buffer[index][-1][10] = episode_divergence diff --git a/policy/feudalgainRL/__init__.py b/policy/feudalgainRL/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/policy/feudalgainRL/dqn_latest.py b/policy/feudalgainRL/dqn_latest.py new file mode 100644 index 0000000000000000000000000000000000000000..f945067231ef7176b671fd6c5d35dea2599586e4 --- /dev/null +++ b/policy/feudalgainRL/dqn_latest.py @@ -0,0 +1,197 @@ +############################################################################### +# PyDial: Multi-domain Statistical Spoken Dialogue System Software +############################################################################### +# +# Copyright 2015 - 2019 +# Cambridge University Engineering Department Dialogue Systems Group +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
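+# NOTE (summary of this module, descriptive only): the DeepQNetwork below takes the
+# state vector and a one-hot action vector as separate inputs, embeds each with one
+# hidden layer, and scores the pair as the inner product of the two embeddings,
+# Qout = reduce_sum(h_state * h_action, 1).  The dueling branch is disabled
+# ("if False:"), so only this inner-product head is built.  A second copy of the
+# network acts as the target and is moved towards the online weights by Polyak
+# averaging with rate tau via update_target_network().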
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +############################################################################### + +""" +Implementation of DQAN - Deep Q Action Network + +The algorithm is developed with tflearn + Tensorflow + +Author: Pei-Hao Su +""" +import tensorflow as tf +import numpy as np +import tflearn + +from policy.DRL.replay_buffer import ReplayBuffer + +# =========================== +# Deep Q Action Network +# =========================== +class DeepQNetwork(object): + """ + Input to the network is the state and action, output is Q(s,a). + """ + def __init__(self, sess, state_dim, action_dim, learning_rate, tau, \ + num_actor_vars, architecture = 'duel', h1_size = 130, h2_size = 50): + self.sess = sess + self.s_dim = state_dim + self.a_dim = action_dim + self.learning_rate = learning_rate + self.tau = tau + self.architecture = architecture + self.h1_size = h1_size + self.h2_size = h2_size + + # Create the deep Q network + self.inputs, self.action, self.Qout = \ + self.create_ddq_network(self.architecture, self.h1_size, self.h2_size) + self.network_params = tf.trainable_variables() + + # Target Network + self.target_inputs, self.target_action, self.target_Qout = \ + self.create_ddq_network(self.architecture, self.h1_size, self.h2_size) + self.target_network_params = tf.trainable_variables()[len(self.network_params):] + + # Op for periodically updating target network + self.update_target_network_params = \ + [self.target_network_params[i].assign(\ + tf.mul(self.network_params[i], self.tau) + tf.mul(self.target_network_params[i], 1. 
- self.tau)) + for i in range(len(self.target_network_params))] + + # Network target (y_i) + self.sampled_q = tf.placeholder(tf.float32, [None, 1]) + + # Predicted Q given state and chosed action + #actions_one_hot = self.action + #self.pred_q = tf.reduce_sum(self.Qout * actions_one_hot, reduction_indices=1, name='q_acted') + self.pred_q = self.Qout + + self.diff = self.sampled_q - self.pred_q + + self.loss = tf.reduce_mean(self.clipped_error(self.diff), name='loss') + + self.optimizer = tf.train.AdamOptimizer(self.learning_rate) + self.optimize = self.optimizer.minimize(self.loss) + + def create_ddq_network(self, architecture = 'duel', h1_size = 130, h2_size = 50): + inputs = tf.placeholder(tf.float32, [None, self.s_dim]) + action = tf.placeholder(tf.float32, [None, self.a_dim]) + + # state network + W_fc1_s = tf.Variable(tf.truncated_normal([self.s_dim, h1_size], stddev=0.01)) + b_fc1_s = tf.Variable(tf.zeros([h1_size])) + h_fc1_s = tf.nn.relu(tf.matmul(inputs, W_fc1_s) + b_fc1_s) + + # action network + W_fc1_a = tf.Variable(tf.truncated_normal([self.a_dim, h1_size], stddev=0.01)) + b_fc1_a = tf.Variable(tf.zeros([h1_size])) + h_fc1_a = tf.nn.relu(tf.matmul(action, W_fc1_a) + b_fc1_a) + + + #h_fc1 = tf.nn.tanh(tf.matmul(inputs, W_fc1) + b_fc1) + #if architecture == 'duel': + if False: + + """ + W_fc2_s = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01)) + b_fc2_s = tf.Variable(tf.zeros([h2_size])) + h_fc2_s = tf.nn.relu(tf.matmul(h_fc1_s, W_fc2_s) + b_fc2_s) + + W_value = tf.Variable(tf.truncated_normal([h2_size, 1], stddev=0.01)) + b_value = tf.Variable(tf.zeros([1])) + value_out = tf.matmul(h_fc2_s, W_value) + b_value + + + + W_fc2_a = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01)) + b_fc2_a = tf.Variable(tf.zeros([h2_size])) + h_fc2_a = tf.nn.relu(tf.matmul(h_fc1_a, W_fc2_a) + b_fc2_a) + + Qout = tf.reduce_sum( tf.mul( h_fc2_s,h_fc2_a ), 1) + """ + + + # value function + W_value = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01)) + b_value = tf.Variable(tf.zeros([h2_size])) + h_value = tf.nn.relu(tf.matmul(h_fc1, W_value) + b_value) + + W_value = tf.Variable(tf.truncated_normal([h2_size, 1], stddev=0.01)) + b_value = tf.Variable(tf.zeros([1])) + value_out = tf.matmul(h_value, W_value) + b_value + + # advantage function + W_advantage = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01)) + b_advantage = tf.Variable(tf.zeros([h2_size])) + h_advantage = tf.nn.relu(tf.matmul(h_fc1, W_advantage) + b_advantage) + + W_advantage = tf.Variable(tf.truncated_normal([h2_size, self.a_dim], stddev=0.01)) + b_advantage = tf.Variable(tf.zeros([self.a_dim])) + Advantage_out = tf.matmul(h_advantage, W_advantage) + b_advantage + + Qout = value_out + (Advantage_out - tf.reduce_mean(Advantage_out, reduction_indices=1, keep_dims=True)) + + else: + W_fc2_s = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01)) + b_fc2_s = tf.Variable(tf.zeros([h2_size])) + h_fc2_s = tf.nn.relu(tf.matmul(h_fc1_s, W_fc2_s) + b_fc2_s) + + W_fc2_a = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01)) + b_fc2_a = tf.Variable(tf.zeros([h2_size])) + h_fc2_a = tf.nn.relu(tf.matmul(h_fc1_a, W_fc2_a) + b_fc2_a) + + # inner product of state s and action a + #Qout = tf.mul(h_fc2_s,h_fc2_a) + Qout = tf.reduce_sum( tf.mul( h_fc2_s,h_fc2_a ), 1) + #Qout = tf.reduce_sum( tf.mul( h_fc2_s,h_fc2_a ), 1, keep_dims=True ) + #Qout = tf.reduce_sum(tf.mul(h_fc2_s,h_fc2_a)) + + return inputs, action, Qout + + def train(self, inputs, action, sampled_q): + 
return self.sess.run([self.pred_q, self.optimize, self.loss], feed_dict={ + self.inputs: inputs, + self.action: action, + self.sampled_q: sampled_q + }) + + def predict(self, inputs, action): + #return self.sess.run(self.pred_q, feed_dict={ + return self.sess.run(self.Qout, feed_dict={ + self.inputs: inputs, + self.action: action + }) + + def predict_target(self, inputs, action): + #return self.sess.run(self.pred_q, feed_dict={ + return self.sess.run(self.target_Qout, feed_dict={ + self.target_inputs: inputs, + self.target_action: action + }) + + def update_target_network(self): + self.sess.run(self.update_target_network_params) + + def load_network(self, load_filename): + self.saver = tf.train.Saver() + try: + self.saver.restore(self.sess, load_filename) + print("Successfully loaded:", load_filename) + except: + print("Could not find old network weights") + + def save_network(self, save_filename): + print('Saving deepq-network...') + self.saver.save(self.sess, save_filename) + + def clipped_error(self, x): + return tf.select(tf.abs(x) < 1.0, 0.5 * tf.square(x), tf.abs(x) - 0.5) # condition, true, false diff --git a/policy/feudalgainRL/feudalUtils.py b/policy/feudalgainRL/feudalUtils.py new file mode 100644 index 0000000000000000000000000000000000000000..27d7b625013bc256fe24ad22102f733e2ae57132 --- /dev/null +++ b/policy/feudalgainRL/feudalUtils.py @@ -0,0 +1,128 @@ +############################################################################### +# PyDial: Multi-domain Statistical Spoken Dialogue System Software +############################################################################### +# +# Copyright 2015 - 2019 +# Cambridge University Engineering Department Dialogue Systems Group +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
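+# NOTE (summary of this module, descriptive only): the helpers below build additive
+# action masks for the feudal policy.  A mask entry is 0 for an executable action and
+# -sys.maxsize for a forbidden one, so adding the mask to the pre-softmax logits (as
+# done in noisyacer.py) pushes masked actions to ~zero probability.  The returned
+# structure is
+#   {'master': <array>, 'give_info': <array over slot_independent_actions>,
+#    'req_info': {slot: <array over slot_specific_actions>}}
+# For example (action names made up for illustration), with
+# slot_specific_actions = ['request', 'confirm', 'pass'] and 'confirm_area' listed in
+# non_exec, req_info['area'] comes out as [0, -maxsize, -maxsize]; the last entry of
+# every sub-mask is initialised to -sys.maxsize before the non_exec checks.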
+# +############################################################################### + +import sys +import numpy as np + +import ontology.FlatOntologyManager as FlatOnt + + +def get_feudal_masks(non_exec, slots, slot_independent_actions, slot_specific_actions): + + feudal_masks = {'req_info': {}, 'give_info': None, 'master': None} + give_info_masks = np.zeros(len(slot_independent_actions)) + give_info_masks[-1] = -sys.maxsize + for i, action in enumerate(slot_independent_actions): + if action in non_exec: + give_info_masks[i] = -sys.maxsize + feudal_masks['give_info'] = give_info_masks + for slot in slots: + feudal_masks['req_info'][slot] = np.zeros(len(slot_specific_actions)) + feudal_masks['req_info'][slot][-1] = -sys.maxsize + for i, action in enumerate(slot_specific_actions): + if action == 'reqmore': + if action in non_exec: + feudal_masks['req_info'][slot][i] = -sys.maxsize + elif action + '_' + slot in non_exec: + feudal_masks['req_info'][slot][i] = -sys.maxsize + master_masks = np.zeros(3) + master_masks[:] = -sys.maxsize + if 0 in give_info_masks: + master_masks[0] = 0 + for slot in slots: + if 0 in feudal_masks['req_info'][slot]: + master_masks[1] = 0 + feudal_masks['master'] = master_masks + # print(non_exec) + # print(feudal_masks) + return feudal_masks + +def get_feudalAC_masks(non_exec, slots, slot_independent_actions, slot_specific_actions, only_master=True): + + if only_master: + + feudal_masks = {'req_info': {}, 'give_info': None, 'master': None} + give_info_masks = np.zeros(len(slot_independent_actions)) + give_info_masks[-1] = -sys.maxsize + for i, action in enumerate(slot_independent_actions): + if action in non_exec: + give_info_masks[i] = -sys.maxsize + feudal_masks['give_info'] = give_info_masks + for slot in slots: + feudal_masks['req_info'][slot] = np.zeros(len(slot_specific_actions)) + feudal_masks['req_info'][slot][-1] = -sys.maxsize + for i, action in enumerate(slot_specific_actions): + if action + '_' + slot in non_exec: + feudal_masks['req_info'][slot][i] = -sys.maxsize + #master_masks = np.zeros(len(slot_independent_actions)) + #master_masks[:] = -sys.maxsize + #if 0 in give_info_masks: + # master_masks[-2] = 0 + for i, slot in enumerate(slots): + if 0 in feudal_masks['req_info'][slot]: + give_info_masks[-1] = 0 + feudal_masks['master'] = give_info_masks + # print(non_exec) + # print(feudal_masks) + return feudal_masks + + else: + feudal_masks = {'req_info': {}, 'give_info': None, 'master': None} + give_info_masks = np.zeros(len(slot_independent_actions)) + give_info_masks[-1] = -sys.maxsize + for i, action in enumerate(slot_independent_actions): + if action in non_exec: + give_info_masks[i] = -sys.maxsize + feudal_masks['give_info'] = give_info_masks + for slot in slots: + feudal_masks['req_info'][slot] = np.zeros(len(slot_specific_actions)) + feudal_masks['req_info'][slot][-1] = -sys.maxsize + for i, action in enumerate(slot_specific_actions): + if action + '_' + slot in non_exec: + feudal_masks['req_info'][slot][i] = -sys.maxsize + master_masks = np.zeros(2) * -sys.maxsize + #master_masks[:] = -sys.maxsize + if 0 in give_info_masks: + master_masks[0] = 0 + for i, slot in enumerate(slots): + if 0 in feudal_masks['req_info'][slot]: + master_masks[1] = 0 + feudal_masks['master'] = master_masks + # print(non_exec) + # print(feudal_masks) + return feudal_masks + + + +def get_feudal_slot_mask(non_exec, slot, slot_actions): + slot_masks = np.zeros(len(slot_actions)) + slot_masks[-1] = -sys.maxsize + if slot == 'master' or slot == 'give_info': + for i, action in 
enumerate(slot_actions): + if action in non_exec: + slot_masks[i] = -sys.maxsize + else: + for i, action in enumerate(slot_actions): + action = action+'_'+slot + if action in non_exec: + slot_masks[i] = -sys.maxsize + return slot_masks \ No newline at end of file diff --git a/policy/feudalgainRL/noisyacer.py b/policy/feudalgainRL/noisyacer.py new file mode 100644 index 0000000000000000000000000000000000000000..da52ad6ad9af57907094797589ee5cb9b954ab00 --- /dev/null +++ b/policy/feudalgainRL/noisyacer.py @@ -0,0 +1,588 @@ +############################################################################### +# PyDial: Multi-domain Statistical Spoken Dialogue System Software +############################################################################### +# +# Copyright 2015 - 2019 +# Cambridge University Engineering Department Dialogue Systems Group +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +############################################################################### + +""" +Implementation of ACER + +The algorithm is developed with Tensorflow + +Author: Gellert Weisz +""" + + +import numpy as np +import tensorflow as tf + +from random import choice +from time import sleep +from time import time + +import sys # todo remove later + +# =========================== +# Soft Actor Critic with Experience Replay +# =========================== + + +class NoisyACERNetwork(object): + def __init__(self, sess, state_dim, action_dim, learning_rate, delta, c, alpha, h1_size=130, h2_size=50, + is_training = True, actfreq_loss=None, temperature=0, critic_regularizer_weight=0, noisy_acer=False): + self.sess = sess + self.s_dim = state_dim + self.a_dim = action_dim + if isinstance(action_dim, list): + self.master_space = True + self.master_space_dim = self.a_dim[0] - self.a_dim[2] + self.a_dim[2] * self.a_dim[1] + else: + self.master_space = False + self.learning_rate = learning_rate + self.critic_regularizer_weight = critic_regularizer_weight + if self.critic_regularizer_weight != 0: + print(f"We use a regularizer for the critic with weight {self.critic_regularizer_weight}.") + self.delta = delta + self.c = c + self.noisy_acer = noisy_acer + self.alpha = alpha + self.h1_size = h1_size + self.h2_size = h2_size + self.is_training = is_training + self.temperature = temperature + if self.temperature != 0: + print("Using soft ACER, temperature set to: ", self.temperature) + else: + print("Temperature of Maximum Entropy set to 0, using ACER.") + + #Input and hidden layers + self.inputs = tf.placeholder(tf.float32, [None, self.s_dim]) + self.actions = tf.placeholder(tf.float32, [None, self.a_dim]) + self.execMask = tf.placeholder(tf.float32, [None, self.a_dim]) + self.behaviour_mask = tf.placeholder_with_default(tf.zeros(tf.shape(self.actions)[0], dtype=tf.float32), shape=[None]) + self.mu_values = tf.placeholder(tf.float32, [None]) + self.mu = tf.placeholder(tf.float32, [None, self.a_dim]) + + if self.noisy_acer: + print("WE USE NOISY ACER") + self.policy, self.q = self.construct_noisy_network() + 
self.network_params = tf.trainable_variables() + self.avg_policy, _ = self.construct_noisy_network() + self.target_network_params = tf.trainable_variables()[len(self.network_params):] + else: + self.policy, self.q = self.construct_network() + self.network_params = tf.trainable_variables() + self.avg_policy, _ = self.construct_network() + self.target_network_params = tf.trainable_variables()[len(self.network_params):] + + self.avg_policy = tf.stop_gradient(self.avg_policy) + + # weighted average over q-values according to current policy gives the value of the state + self.value = tf.reduce_sum((self.q - self.temperature * tf.log(self.policy)) * self.policy, 1) + + self.actions_onehot = self.actions + self.responsible_outputs = tf.reduce_sum(self.policy * self.actions_onehot, [1]) + self.responsible_q = tf.reduce_sum(self.q * self.actions_onehot, [1]) + + # IS weights + self.responsible_mu = tf.reduce_sum(self.mu * self.actions_onehot, [1]) + self.rho = self.responsible_outputs / self.responsible_mu + self.rho_all = self.policy / self.mu + self.rho_bar = tf.minimum(1., self.rho) + self.rho_bar_c = tf.minimum(self.c, self.rho) + + self.q_ret = tf.placeholder(tf.float32, [None]) + + + # step 1 from pawel + self.advantages_qret = self.q_ret - self.value + self.wrt_theta_step1 = -tf.reduce_sum(tf.log(self.responsible_outputs) * tf.stop_gradient(self.rho * self.advantages_qret)) + + # step 2 from pawel + self.wrt_theta = tf.reduce_sum( + tf.log(self.responsible_outputs) * + tf.stop_gradient(self.rho_bar_c * (self.advantages_qret - self.temperature * (1 + tf.log(self.responsible_outputs)))) + + tf.reduce_sum(tf.log(self.policy) * + tf.stop_gradient(tf.maximum(0., 1. - self.c / self.rho_all) * + self.policy * + (self.q - tf.reshape(self.value, [-1, 1]) - self.temperature * (1 + tf.log(self.policy)))), [1])) + + self.q_regularizer = tf.placeholder(tf.float32, [None]) + if self.critic_regularizer_weight != 0: + self.wrt_theta_v = tf.reduce_sum(tf.square(self.q_ret - self.responsible_q)) + \ + self.critic_regularizer_weight * tf.reduce_sum(tf.square(self.q_regularizer - self.responsible_q)) + else: + self.wrt_theta_v = tf.reduce_sum(tf.square(self.q_ret - self.responsible_q)) + + self.entropy = -tf.reduce_sum(self.policy * tf.log(self.policy)) + #self.loss = self.wrt_theta_v + self.wrt_theta - self.entropy * 0.01 + + self.target_v = tf.placeholder(tf.float32, [None]) + self.advantages = tf.placeholder(tf.float32, [None]) + self.advantage_qret_diff = tf.reduce_mean(tf.square(self.advantages - self. 
advantages_qret)) + + self.q_loss = 0.5 * self.wrt_theta_v + self.policy_loss = -self.wrt_theta + self.entropy = - tf.reduce_sum(self.policy * tf.log(self.policy)) + self.loss = self.q_loss + self.policy_loss - 0.01 * self.entropy + + self.optimizer = tf.train.AdamOptimizer(self.learning_rate) + self.optimize = self.optimizer.minimize(self.loss) + + # TRPO in theta-space + use_trpo = True # can switch off TRPO here + self.value_gradients = self.optimizer.compute_gradients(self.q_loss) + self.entropy_gradients = self.optimizer.compute_gradients(-0.01 * self.entropy) + #self.behaviour_gradients = self.optimizer.compute_gradients(self.behaviour_loss) + self.g = self.optimizer.compute_gradients(-self.policy_loss) + self.kl = tf.reduce_sum(tf.reduce_sum(self.avg_policy * tf.log(self.avg_policy / self.policy), [1])) # this is total KL divergence, per batch + self.k = self.optimizer.compute_gradients(self.kl) + self.g = [(grad, var) for grad, var in self.g if grad is not None] + self.k = [(grad, var) for grad, var in self.k if grad is not None] + assert len(self.g) == len(self.k) + self.klprod = tf.reduce_sum([tf.reduce_sum(tf.reshape(k[0], [-1]) * tf.reshape(g[0], [-1])) for k, g in zip(self.k, self.g)]) + self.klen = tf.reduce_sum([tf.reduce_sum(tf.reshape(k[0], [-1]) * tf.reshape(k[0], [-1])) for k, g in zip(self.k, self.g)]) + self.trpo_scale = tf.maximum(0., (self.klprod - self.delta) / self.klen) + self.final_gradients = [] + for i in range(len(self.g)): + if use_trpo: + self.final_gradients.append((-(self.g[i][0] - self.trpo_scale * self.k[i][0]), self.g[i][1])) # negative because this is loss + else: + self.final_gradients.append((-self.g[i][0], self.g[i][1])) # negative because this is loss + + if self.temperature == 0 and not self.noisy_acer: + self.optimize = [self.optimizer.apply_gradients(self.final_gradients), + self.optimizer.apply_gradients(self.entropy_gradients), + self.optimizer.apply_gradients(self.value_gradients) + ] + else: + self.optimize = [self.optimizer.apply_gradients(self.final_gradients), + self.optimizer.apply_gradients(self.value_gradients) + ] + + self.update_avg_theta = \ + [self.target_network_params[i].assign(tf.multiply(self.network_params[i], 1. 
- self.alpha) + + tf.multiply(self.target_network_params[i], self.alpha)) + for i in range(len(self.target_network_params))] + + self.saver = tf.train.Saver() + + def construct_network(self): + W_fc1 = tf.Variable(tf.truncated_normal([self.s_dim, self.h1_size], stddev=0.01)) + b_fc1 = tf.Variable(tf.zeros([self.h1_size])) + h_fc1 = tf.nn.relu(tf.matmul(self.inputs, W_fc1) + b_fc1) + + W_h2 = tf.Variable(tf.truncated_normal([self.h1_size, self.h2_size], stddev=0.01)) + b_h2 = tf.Variable(tf.zeros([self.h2_size])) + h_h2 = tf.nn.relu(tf.matmul(h_fc1, W_h2) + b_h2) + + W_q = tf.Variable(tf.truncated_normal([self.h2_size, self.a_dim], stddev=0.01)) + b_q = tf.Variable(tf.zeros([self.a_dim])) + q = tf.matmul(h_h2, W_q) + b_q + + W_policy = tf.Variable(tf.truncated_normal([self.h2_size, self.a_dim], stddev=0.01)) + b_policy = tf.Variable(tf.zeros([self.a_dim])) + policy = tf.nn.softmax(tf.matmul(h_h2, W_policy) + b_policy + self.execMask) + 0.00001 + + return policy, q + + def construct_noisy_network(self): + self.mean_noisy_w = [] + self.mean_noisy_b = [] + + h_fc1 = self.noisy_dense_layer(self.inputs, self.s_dim, self.h1_size, activation=tf.nn.relu) + + h_h2 = self.noisy_dense_layer(h_fc1, self.h1_size, self.h2_size, activation=tf.nn.relu) + # Q function + q = self.noisy_dense_layer(h_h2, self.h2_size, self.a_dim) + + policy = self.noisy_dense_layer(h_h2, self.h2_size, self.a_dim) + # prevent problem when calling log(self.policy) + policy = tf.nn.softmax(policy + self.execMask) + 0.00001 + + return policy, q + + def getPolicy(self, inputs, execMask): + return self.sess.run([self.policy], feed_dict={ + self.inputs: inputs, + self.execMask: execMask, + }) + + def train(self, inputs, actions, execMask, rewards, unflattened_inputs, unflattened_rewards, gamma, mu, + discounted_rewards, advantages, mu_values=None, behaviour_mask=None, critic_regularizer_output=None): + value, responsible_q, rho_bar, responsible_outputs = self.sess.run( + [self.value, self.responsible_q, self.rho_bar, self.responsible_outputs], feed_dict={ + self.inputs: inputs, + self.actions: actions, + self.execMask: execMask, + self.mu: mu, + }) + + q_rets, offset = [], 0 + #print >> sys.stderr, rho_bar[0], value[0], responsible_q[0] + for j in range(0, len(unflattened_inputs)): # todo implement retrace for lambda other than one + q_ret, new_q_ret = [], 0 + for i in range(len(unflattened_inputs[j])-1, -1, -1): + new_q_ret = rewards[offset+i] + gamma * new_q_ret + q_ret.append(new_q_ret) + new_q_ret = rho_bar[offset+i] * (new_q_ret - responsible_q[offset+i]) + value[offset+i] + #new_q_ret = value[offset+i] # debug + q_ret = list(reversed(q_ret)) + q_rets.append(q_ret) + offset += len(unflattened_inputs[j]) + + q_ret_flat = np.concatenate(np.array(q_rets), axis=0).tolist() + + feed_dict = { + self.inputs: inputs, + self.actions: actions, + self.execMask: execMask, + self.mu: mu, + self.q_ret: q_ret_flat, + self.target_v: discounted_rewards, + self.advantages: advantages, + #self.mu_values: mu_values, + #self.behaviour_mask: behaviour_mask + } + + if self.critic_regularizer_weight != 0: + feed_dict[self.q_regularizer] = critic_regularizer_output + + trpo_scale, klprod, kl, diff, entropy, loss, optimize = self.sess.run([self.trpo_scale, self.klprod, self.kl, self.advantage_qret_diff, self.entropy, self.loss, self.optimize], feed_dict=feed_dict) + update_avg_theta = self.sess.run([self.update_avg_theta], feed_dict=feed_dict) + + return loss, entropy, optimize + + def predict_policy(self, inputs, execMask): + return 
self.sess.run(self.policy, feed_dict={ + self.inputs: inputs, + self.execMask: execMask, + }) + + def compute_rho(self, inputs, actions, mu, mask): + return self.sess.run(self.rho, + feed_dict={self.inputs: inputs, self.actions: actions, self.mu: mu, self.execMask: mask}) + + def compute_responsible_output(self, inputs, actions, mask): + return self.sess.run(self.responsible_outputs, + feed_dict={self.inputs: inputs, self.actions: actions, self.execMask: mask}) + + def compute_responsible_q(self, inputs, actions, mask): + return self.sess.run(self.responsible_q, + feed_dict={self.inputs: inputs, self.actions: actions, self.execMask: mask}) + + def predict_value(self, inputs, execMask): + return self.sess.run(self.value, feed_dict={ + self.inputs: inputs, + self.execMask: execMask, + }) + + def predict_action_value(self, inputs, execMask): + return self.sess.run([self.policy, self.value], feed_dict={ + self.inputs: inputs, + self.execMask: execMask, + }) + + def compute_mean_noisy(self): + return self.sess.run([self.mean_noisy_w, self.mean_noisy_b]) + + def noisy_dense_layer(self, input, input_neurons, output_neurons, activation=tf.identity): + + W_mu = tf.Variable(tf.truncated_normal([input_neurons, output_neurons], stddev=0.01)) + W_sigma = tf.Variable(tf.truncated_normal([input_neurons, output_neurons], stddev=0.01)) + W_eps = tf.random_normal(shape=[input_neurons, output_neurons]) + W = W_mu + tf.multiply(W_sigma, W_eps) + + b_mu = tf.Variable(tf.zeros([output_neurons])) + b_sigma = tf.Variable(tf.zeros([output_neurons])) + b_eps = tf.random_normal(shape=[output_neurons]) + b = b_mu + tf.multiply(b_sigma, b_eps) + + self.mean_noisy_w.append(tf.reduce_mean(tf.abs(W_sigma))) + self.mean_noisy_b.append(tf.reduce_mean(tf.abs(b_sigma))) + + return activation(tf.matmul(input, W) + b) + + def load_network(self, load_filename): + self.saver = tf.train.Saver() + if load_filename.split('.')[-3] != '0': + try: + self.saver.restore(self.sess, load_filename) + print("Successfully loaded:", load_filename) + except: + print("Could not find old network weights") + else: + print('nothing loaded in first iteration') + + def save_network(self, save_filename): + print('Saving acer-network...') + self.saver.save(self.sess, save_filename) + + +class RNNACERNetwork(object): + def __init__(self, sess, si_state_dim, sd_state_dim, action_dim, learning_rate, delta, c, alpha, h1_size = 130, h2_size = 50, is_training = True, sd_enc_size=25, + si_enc_size=25, dropout_rate=0., tn='normal', slot='si'): + self.sess = sess + self.s_dim = si_state_dim + sd_state_dim + self.a_dim = action_dim + self.learning_rate = learning_rate + self.delta = delta + self.c = c + self.alpha = alpha + self.h1_size = h1_size + self.h2_size = h2_size + self.is_training = is_training + self.sd_dim = sd_state_dim + self.si_dim = si_state_dim + self.sd_enc_size = sd_enc_size + + #Input and hidden layers + self.inputs = tf.placeholder(tf.float32, [None, self.s_dim]) + self.actions = tf.placeholder(tf.float32, [None, self.a_dim]) + self.execMask = tf.placeholder(tf.float32, [None, self.a_dim]) + + keep_prob = 1 - dropout_rate + sd_inputs, si_inputs = tf.split(self.inputs, [self.sd_dim, self.si_dim], 1) + + if slot == 'sd': + sd_inputs = tf.reshape(sd_inputs, (tf.shape(sd_inputs)[0], 1, self.sd_dim)) + + # slots encoder + with tf.variable_scope(tn): + # try: + lstm_cell = tf.nn.rnn_cell.GRUCell(self.sd_enc_size) + if keep_prob < 1: + lstm_cell = tf.nn.rnn_cell.DropoutWrapper(lstm_cell, output_keep_prob=keep_prob) + hidden_state = 
lstm_cell.zero_state(tf.shape(sd_inputs)[0], tf.float32) + _, h_sdfe = tf.nn.dynamic_rnn(lstm_cell, sd_inputs, initial_state=hidden_state) + # except: + # lstm_cell = tf.contrib.rnn.GRUCell(self.sd_enc_size) + # hidden_state = lstm_cell.zero_state(tf.shape(sd_inputs)[0], tf.float32) + # _, h_sdfe = tf.contrib.rnn.dynamic_rnn(lstm_cell, sd_inputs, initial_state=hidden_state) + h1_inputs = tf.concat((si_inputs, h_sdfe), 1) + else: + '''W_sdfe = tf.Variable(tf.truncated_normal([self.sd_dim, sd_enc_size], stddev=0.01)) + b_sdfe = tf.Variable(tf.zeros([sd_enc_size])) + h_sdfe = tf.nn.relu(tf.matmul(sd_inputs, W_sdfe) + b_sdfe) + if keep_prob < 1: + h_sdfe = tf.nn.dropout(h_sdfe, keep_prob)''' + h1_inputs = self.inputs + + def construct_theta(): + W_fc1 = tf.Variable(tf.truncated_normal([self.s_dim, self.h1_size], stddev=0.01)) + b_fc1 = tf.Variable(0.0 * tf.ones([self.h1_size])) + if self.h2_size > 0: # todo layer 2 should be shared between policy and q-function? + W_h2 = tf.Variable(tf.truncated_normal([self.h1_size, self.h2_size], stddev=0.01)) + b_h2 = tf.Variable(0.0 * tf.ones([self.h2_size])) + + W_q = tf.Variable(tf.truncated_normal([self.h2_size, self.a_dim], stddev=0.01)) + b_q = tf.Variable(0.0 * tf.ones([self.a_dim])) + W_policy = tf.Variable(tf.truncated_normal([self.h2_size, self.a_dim], stddev=0.01)) + b_policy = tf.Variable(0.0 * tf.ones([self.a_dim])) + + theta = [W_fc1, b_fc1, W_h2, b_h2, W_q, b_q, W_policy, b_policy] + else: + W_q = tf.Variable(tf.truncated_normal([self.h1_size, self.a_dim], stddev=0.01)) + b_q = tf.Variable(0.0 * tf.ones([self.a_dim])) + W_policy = tf.Variable(tf.truncated_normal([self.h1_size, self.a_dim], stddev=0.01)) + b_policy = tf.Variable(0.0 * tf.ones([self.a_dim])) + + theta = [W_fc1, b_fc1, W_q, b_q, W_policy, b_policy] + return theta + + self.theta = construct_theta() + self.avg_theta = construct_theta() + + def construct_network(theta): + if self.h2_size > 0: + W_fc1, b_fc1, W_h2, b_h2, W_q, b_q, W_policy, b_policy = theta + else: + W_fc1, b_fc1, W_q, b_q, W_policy, b_policy = theta + + h_fc1 = tf.nn.relu(tf.matmul(h1_inputs, W_fc1) + b_fc1) + + if self.h2_size > 0: + h_h2 = tf.nn.relu(tf.matmul(h_fc1, W_h2) + b_h2) + # Q function + q = tf.matmul(h_h2, W_q) + b_q + # prevent problem when calling log(self.policy) + policy = tf.nn.softmax(tf.matmul(h_h2, W_policy) + b_policy + self.execMask) + 0.00001 + else: # 1 hidden layer + # value function + q = tf.matmul(h_fc1, W_q) + b_q + # policy function + policy = tf.nn.softmax(tf.matmul(h_fc1, W_policy) + b_policy + self.execMask) + 0.00001 + return policy, q + + self.policy, self.q = construct_network(self.theta) + self.avg_policy, _ = construct_network(self.avg_theta) + self.avg_policy = tf.stop_gradient(self.avg_policy) + + # weighted average over q-values according to current policy gives the value of the state + self.value = tf.reduce_sum(self.q * self.policy, 1) + + self.actions_onehot = self.actions + self.responsible_outputs = tf.reduce_sum(self.policy * self.actions_onehot, [1]) + self.responsible_q = tf.reduce_sum(self.q * self.actions_onehot, [1]) + + # IS weights + self.mu = tf.placeholder(tf.float32, [None, self.a_dim]) + self.responsible_mu = tf.reduce_sum(self.mu * self.actions_onehot, [1]) + self.rho = self.responsible_outputs / self.responsible_mu + self.rho_all = self.policy / self.mu + self.rho_bar = tf.minimum(1., self.rho) + self.rho_bar_c = tf.minimum(self.c, self.rho) + + self.q_ret = tf.placeholder(tf.float32, [None]) + + # step 1 from pawel + self.advantages_qret = self.q_ret - 
self.value + self.wrt_theta_step1 = -tf.reduce_sum(tf.log(self.responsible_outputs) * tf.stop_gradient(self.rho * self.advantages_qret)) + + # step 2 from pawel + self.wrt_theta = tf.reduce_sum( + tf.log(self.responsible_outputs) * tf.stop_gradient(self.rho_bar_c * self.advantages_qret) + + tf.reduce_sum(tf.log(self.policy) * + tf.stop_gradient(tf.maximum(0., 1. - self.c / self.rho_all) * + self.policy * (self.q - tf.reshape(self.value, [-1, 1]))), [1])) + + self.wrt_theta_v = tf.reduce_sum(tf.square(self.q_ret - self.responsible_q)) + self.entropy = -tf.reduce_sum(self.policy * tf.log(self.policy)) + #self.loss = self.wrt_theta_v + self.wrt_theta - self.entropy * 0.01 + + self.target_v = tf.placeholder(tf.float32, [None]) + self.advantages = tf.placeholder(tf.float32, [None]) + self.advantage_qret_diff = tf.reduce_mean(tf.square(self.advantages - self. advantages_qret)) + + # DEBUG (A2C) + #self.value_loss = 0.5 * tf.reduce_sum(tf.square(self.target_v - tf.reshape(self.value, [-1]))) # original a2c + self.q_loss = 0.5 * self.wrt_theta_v + self.policy_loss = -self.wrt_theta + self.entropy = - tf.reduce_sum(self.policy * tf.log(self.policy)) + self.loss = self.q_loss + self.policy_loss - 0.01 * self.entropy + + self.optimizer = tf.train.AdamOptimizer(self.learning_rate) + self.optimize = self.optimizer.minimize(self.loss) + + # TRPO in theta-space + use_trpo = True # can switch off TRPO here + self.value_gradients = self.optimizer.compute_gradients(self.q_loss) + self.entropy_gradients = self.optimizer.compute_gradients(-0.01 * self.entropy) + self.g = self.optimizer.compute_gradients(-self.policy_loss) + self.kl = tf.reduce_sum(tf.reduce_sum(self.avg_policy * tf.log(self.avg_policy / self.policy), [1])) # this is total KL divergence, per batch + self.k = self.optimizer.compute_gradients(self.kl) + self.g = [(grad, var) for grad, var in self.g if grad is not None] + self.k = [(grad, var) for grad, var in self.k if grad is not None] + assert len(self.g) == len(self.k) + self.klprod = tf.reduce_sum([tf.reduce_sum(tf.reshape(k[0], [-1]) * tf.reshape(g[0], [-1])) for k, g in zip(self.k, self.g)]) + self.klen = tf.reduce_sum([tf.reduce_sum(tf.reshape(k[0], [-1]) * tf.reshape(k[0], [-1])) for k, g in zip(self.k, self.g)]) + self.trpo_scale = tf.maximum(0., (self.klprod - self.delta) / self.klen) + self.final_gradients = [] + for i in range(len(self.g)): + if use_trpo: + self.final_gradients.append((-(self.g[i][0] - self.trpo_scale * self.k[i][0]), self.g[i][1])) # negative because this is loss + else: + self.final_gradients.append((-self.g[i][0], self.g[i][1])) # negative because this is loss + + self.optimize = [self.optimizer.apply_gradients(self.final_gradients), + self.optimizer.apply_gradients(self.entropy_gradients), + self.optimizer.apply_gradients(self.value_gradients)] + + self.update_avg_theta = [avg_w.assign(self.alpha * avg_w + (1. 
- self.alpha) * w) + for avg_w, w in zip(self.avg_theta, self.theta)] + + + def getPolicy(self, inputs, execMask): + return self.sess.run([self.policy], feed_dict={ + self.inputs: inputs, + self.execMask: execMask, + }) + + def train(self, inputs, actions, execMask, rewards, unflattened_inputs, unflattened_rewards, gamma, mu, discounted_rewards, advantages): + value, responsible_q, rho_bar, responsible_outputs = self.sess.run( + [self.value, self.responsible_q, self.rho_bar, self.responsible_outputs], feed_dict={ + self.inputs: inputs, + self.actions: actions, + self.execMask: execMask, + self.mu: mu, + }) + + q_rets, offset = [], 0 + #print >> sys.stderr, rho_bar[0], value[0], responsible_q[0] + for j in range(0, len(unflattened_inputs)): # todo implement retrace for lambda other than one + q_ret, new_q_ret = [], 0 + for i in range(len(unflattened_inputs[j])-1, -1, -1): + new_q_ret = rewards[offset+i] + gamma * new_q_ret + q_ret.append(new_q_ret) + new_q_ret = rho_bar[offset+i] * (new_q_ret - responsible_q[offset+i]) + value[offset+i] + #new_q_ret = value[offset+i] # debug + q_ret = list(reversed(q_ret)) + q_rets.append(q_ret) + offset += len(unflattened_inputs[j]) + + q_ret_flat = np.concatenate(np.array(q_rets), axis=0).tolist() + + feed_dict = { + self.inputs: inputs, + self.actions: actions, + self.execMask: execMask, + self.mu: mu, + self.q_ret: q_ret_flat, + self.target_v: discounted_rewards, + self.advantages: advantages, + } + + trpo_scale, klprod, kl, diff, entropy, loss, optimize = self.sess.run([self.trpo_scale, self.klprod, self.kl, self.advantage_qret_diff, self.entropy, self.loss, self.optimize], feed_dict=feed_dict) + update_avg_theta = self.sess.run([self.update_avg_theta], feed_dict=feed_dict) + + return loss, entropy, optimize + + def predict_policy(self, inputs, execMask): + return self.sess.run(self.policy, feed_dict={ + self.inputs: inputs, + self.execMask: execMask, + }) + + def predict_value(self, inputs, execMask): + return self.sess.run(self.value, feed_dict={ + self.inputs: inputs, + self.execMask: execMask, + }) + + def predict_action_value(self, inputs, execMask): + return self.sess.run([self.policy, self.value], feed_dict={ + self.inputs: inputs, + self.execMask: execMask, + }) + + def load_network(self, load_filename): + self.saver = tf.train.Saver() + if load_filename.split('.')[-3] != '0': + try: + self.saver.restore(self.sess, load_filename) + print("Successfully loaded:", load_filename) + except: + print("Could not find old network weights") + else: + print('nothing loaded in first iteration') + + def save_network(self, save_filename): + print('Saving sacer-network...') + #self.saver = tf.train.Saver() + self.saver.save(self.sess, save_filename) diff --git a/policy/feudalgainRL/noisydqn.py b/policy/feudalgainRL/noisydqn.py new file mode 100644 index 0000000000000000000000000000000000000000..03bdb48dd6116ddbe3992390dcf0165dc4732da9 --- /dev/null +++ b/policy/feudalgainRL/noisydqn.py @@ -0,0 +1,632 @@ +############################################################################### +# PyDial: Multi-domain Statistical Spoken Dialogue System Software +############################################################################### +# +# Copyright 2015 - 2019 +# Cambridge University Engineering Department Dialogue Systems Group +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
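+# NOTE (summary of this module, descriptive only): this module ships the DQN variant
+# for feudalgainRL.  The 'duel' and 'noisy_duel' architectures combine a state-value
+# and an advantage head as Q(s, .) = V(s) + (A(s, .) - mean_a A(s, a)); the noisy
+# variant additionally builds its dense layers as NoisyNet-style layers whose weights
+# are sampled per forward pass as W = W_mu + W_sigma * eps with eps ~ N(0, 1)
+# (independent Gaussian noise per weight, as in noisy_dense_layer of noisyacer.py;
+# the same pattern is presumably used here), so that exploration can be driven by the
+# learned noise scale rather than by epsilon-greedy alone.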
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +############################################################################### + +""" +Implementation of DQN - Deep Q Network + +The algorithm is developed with tflearn + Tensorflow + +Author: Pei-Hao Su +""" +import tensorflow as tf + +# =========================== +# Deep Q Network +# =========================== +class DeepQNetwork(object): + """ + Input to the network is the state and action, output is Q(s,a). + """ + def __init__(self, sess, state_dim, action_dim, learning_rate, tau, num_actor_vars, minibatch_size=64, + architecture='duel', h1_size=130, h2_size=50, dropout_rate=0.): + self.sess = sess + self.s_dim = state_dim + self.a_dim = action_dim + self.learning_rate = learning_rate + self.tau = tau + self.architecture = architecture + self.h1_size = h1_size + self.h2_size = h2_size + self.minibatch_size = minibatch_size + + # Create the deep Q network + self.inputs, self.action, self.Qout = \ + self.create_ddq_network(self.architecture, self.h1_size, self.h2_size, dropout_rate=dropout_rate) + self.network_params = tf.trainable_variables() + + # Target Network + self.target_inputs, self.target_action, self.target_Qout = \ + self.create_ddq_network(self.architecture, self.h1_size, self.h2_size, dropout_rate=dropout_rate) + self.target_network_params = tf.trainable_variables()[len(self.network_params):] + + # Op for periodically updating target network + self.update_target_network_params = \ + [self.target_network_params[i].assign(\ + tf.multiply(self.network_params[i], self.tau) + tf.multiply(self.target_network_params[i], 1. 
- self.tau)) + for i in range(len(self.target_network_params))] + + # Network target (y_i) + self.sampled_q = tf.placeholder(tf.float32, [None, 1]) + #self.temperature = tf.placeholder(shape=None,dtype=tf.float32) + + # for Boltzman exploration + #self.softmax_Q = tf.nn.softmax(self.self.Qout/self.temperature) + + # Predicted Q given state and chosed action + #actions_one_hot = tf.one_hot(self.action, self.a_dim, 1.0, 0.0, name='action_one_hot') + actions_one_hot = self.action + + if architecture!= 'dip': + self.pred_q = tf.reshape(tf.reduce_sum(self.Qout * actions_one_hot, axis=1, name='q_acted'), + [self.minibatch_size, 1]) + else: + self.pred_q = self.Qout #DIP case, not sure if will work + + #self.pred_q = tf.reduce_sum(self.Qout * actions_one_hot, reduction_indices=1, name='q_acted_target') + + #self.a_maxQ = tf.argmax(self.Qout, 1) + #action_maxQ_one_hot = tf.one_hot(self.a_maxQ, self.a_dim, 1.0, 0.0, name='action_maxQ_one_hot') + #self.action_maxQ_target = tf.reduce_sum(self.target_Qout * action_maxQ_one_hot, reduction_indices=1, name='a_maxQ_target') + + # Define loss and optimization Op + self.diff = self.sampled_q - self.pred_q + self.loss = tf.reduce_mean(self.clipped_error(self.diff), name='loss') + + self.optimizer = tf.train.AdamOptimizer(self.learning_rate) + self.optimize = self.optimizer.minimize(self.loss) + + # gs = tf.gradients(self.loss, self.network_params) + # capped_gvs = [(tf.clip_by_value(grad, -3., 3.), var) for grad, var in zip(gs, self.network_params)] + # + # self.optimize = self.optimizer.apply_gradients(capped_gvs) + + def create_ddq_network(self, architecture='duel', h1_size=130, h2_size=50, dropout_rate=0.): + keep_prob = 1 - dropout_rate + inputs = tf.placeholder(tf.float32, [None, self.s_dim]) + action = tf.placeholder(tf.float32, [None, self.a_dim]) + + if architecture == 'duel': + W_fc1 = tf.Variable(tf.truncated_normal([self.s_dim, h1_size], stddev=0.01)) + b_fc1 = tf.Variable(tf.zeros([h1_size])) + h_fc1 = tf.nn.relu(tf.matmul(inputs, W_fc1) + b_fc1) + + # value function + W_value = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01)) + b_value = tf.Variable(tf.zeros([h2_size])) + h_value = tf.nn.relu(tf.matmul(h_fc1, W_value) + b_value) + + W_value = tf.Variable(tf.truncated_normal([h2_size, 1], stddev=0.01)) + b_value = tf.Variable(tf.zeros([1])) + value_out = tf.matmul(h_value, W_value) + b_value + + # advantage function + W_advantage = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01)) + b_advantage = tf.Variable(tf.zeros([h2_size])) + h_advantage = tf.nn.relu(tf.matmul(h_fc1, W_advantage) + b_advantage) + + W_advantage = tf.Variable(tf.truncated_normal([h2_size, self.a_dim], stddev=0.01)) + b_advantage = tf.Variable(tf.zeros([self.a_dim])) + Advantage_out = tf.matmul(h_advantage, W_advantage) + b_advantage + + Qout = value_out + (Advantage_out - tf.reduce_mean(Advantage_out, axis=1, keep_dims=True)) + + elif architecture == 'noisy_duel': + print("WE USE DUEL NOISY ARCHITECTURE") + h_fc1 = self.noisy_dense_layer(inputs, self.s_dim, h1_size, activation=tf.nn.relu) + # value function + h_value = self.noisy_dense_layer(h_fc1, h1_size, h2_size, activation=tf.nn.relu) + value_out = self.noisy_dense_layer(h_value, h2_size, 1) + + # advantage function + h_advantage = self.noisy_dense_layer(h_fc1, h1_size, h2_size, activation=tf.nn.relu) + Advantage_out = self.noisy_dense_layer(h_advantage, h2_size, self.a_dim) + + Qout = value_out + (Advantage_out - tf.reduce_mean(Advantage_out, axis=1, keep_dims=True)) + + elif architecture 
== 'dip': + + # state network + W_fc1_s = tf.Variable(tf.truncated_normal([self.s_dim, h1_size], stddev=0.01)) + b_fc1_s = tf.Variable(tf.zeros([h1_size])) + h_fc1_s = tf.nn.relu(tf.matmul(inputs, W_fc1_s) + b_fc1_s) + + # action network + W_fc1_a = tf.Variable(tf.truncated_normal([self.a_dim, h1_size], stddev=0.01)) + b_fc1_a = tf.Variable(tf.zeros([h1_size])) + h_fc1_a = tf.nn.relu(tf.matmul(action, W_fc1_a) + b_fc1_a) + + W_fc2_s = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01)) + b_fc2_s = tf.Variable(tf.zeros([h2_size])) + h_fc2_s = tf.nn.relu(tf.matmul(h_fc1_s, W_fc2_s) + b_fc2_s) + + W_fc2_a = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01)) + b_fc2_a = tf.Variable(tf.zeros([h2_size])) + h_fc2_a = tf.nn.relu(tf.matmul(h_fc1_a, W_fc2_a) + b_fc2_a) + + Qout = tf.reduce_sum(tf.multiply(h_fc2_s, h_fc2_a), 1) + + else: + W_fc1 = tf.Variable(tf.truncated_normal([self.s_dim, h1_size], stddev=0.01)) + b_fc1 = tf.Variable(tf.zeros([h1_size])) + h_fc1 = tf.nn.relu(tf.matmul(inputs, W_fc1) + b_fc1) + if keep_prob < 1: + h_fc1 = tf.nn.dropout(h_fc1, keep_prob) + + W_fc2 = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01)) + b_fc2 = tf.Variable(tf.zeros([h2_size])) + h_fc2 = tf.nn.relu(tf.matmul(h_fc1, W_fc2) + b_fc2) + if keep_prob < 1: + h_fc2 = tf.nn.dropout(h_fc2, keep_prob) + + W_out = tf.Variable(tf.truncated_normal([h2_size, self.a_dim], stddev=0.01)) + b_out = tf.Variable(tf.zeros([self.a_dim])) + Qout = tf.matmul(h_fc2, W_out) + b_out + + return inputs, action, Qout + + def noisy_dense_layer(self, input, input_neurons, output_neurons, activation=tf.identity): + + W_mu = tf.Variable(tf.truncated_normal([input_neurons, output_neurons], stddev=0.01)) + W_sigma = tf.Variable(tf.truncated_normal([input_neurons, output_neurons], stddev=0.01)) + W_eps = tf.random_normal(shape=[input_neurons, output_neurons]) + W = W_mu + tf.multiply(W_sigma, W_eps) + + b_mu = tf.Variable(tf.zeros([output_neurons])) + b_sigma = tf.Variable(tf.zeros([output_neurons])) + b_eps = tf.random_normal(shape=[output_neurons]) + b = b_mu + tf.multiply(b_sigma, b_eps) + + return activation(tf.matmul(input, W) + b) + + def train(self, inputs, action, sampled_q): + return self.sess.run([self.pred_q, self.optimize, self.loss], feed_dict={ #yes, needs to be changed too + self.inputs: inputs, + self.action: action, + self.sampled_q: sampled_q + }) + + + def predict(self, inputs): + return self.sess.run(self.Qout, feed_dict={ + self.inputs: inputs + }) + + def predict_dip(self, inputs, action): + return self.sess.run(self.Qout, feed_dict={ + self.inputs: inputs, + self.action: action + }) + + def predict_action(self, inputs): + return self.sess.run(self.pred_q, feed_dict={ + self.inputs: inputs + }) + + def predict_target(self, inputs): + return self.sess.run(self.target_Qout, feed_dict={ + self.target_inputs: inputs + }) + + def predict_target_dip(self, inputs, action): + return self.sess.run(self.target_Qout, feed_dict={ + self.target_inputs: inputs, + self.target_action: action + }) + + def predict_target_with_action_maxQ(self, inputs): + return self.sess.run(self.action_maxQ_target, feed_dict={ + self.target_inputs: inputs, + self.inputs: inputs + }) + + def update_target_network(self): + self.sess.run(self.update_target_network_params) #yes, but no need to change + + def load_network(self, load_filename): + self.saver = tf.train.Saver() + if load_filename.split('.')[-3] != '0': + try: + self.saver.restore(self.sess, './' + load_filename) + print("Successfully loaded:", 
load_filename) + except: + print("Could not find old network weights") + else: + print('nothing loaded in first iteration') + + def save_network(self, save_filename): + print('Saving deepq-network...') + self.saver.save(self.sess, './' +save_filename) # yes but no need to change + + def clipped_error(self, x): + return tf.where(tf.abs(x) < 1.0, 0.5 * tf.square(x), tf.abs(x) - 0.5) # condition, true, false + + +class NNFDeepQNetwork(object): + """ + Input to the network is the state and action, output is Q(s,a). + """ + def __init__(self, sess, si_state_dim, sd_state_dim, action_dim, learning_rate, tau, num_actor_vars, minibatch_size=64, + architecture='duel', h1_size=130, h2_size=50, sd_enc_size=40, si_enc_size=80, dropout_rate=0.): + #super(NNFDeepQNetwork, self).__init__(sess, si_state_dim + sd_state_dim, action_dim, learning_rate, tau, num_actor_vars, + # minibatch_size=64, architecture='duel', h1_size=130, h2_size=50) + self.sess = sess + self.si_dim = si_state_dim + self.sd_dim = sd_state_dim + self.s_dim = self.si_dim + self.sd_dim + self.a_dim = action_dim + self.learning_rate = learning_rate + self.tau = tau + self.architecture = architecture + self.h1_size = h1_size + self.h2_size = h2_size + self.minibatch_size = minibatch_size + self.sd_enc_size = sd_enc_size + self.si_enc_size = si_enc_size + self.dropout_rate = dropout_rate + + # Create the deep Q network + self.inputs, self.action, self.Qout = \ + self.create_nnfdq_network(self.h1_size, self.h2_size, self.sd_enc_size, self.si_enc_size, self.dropout_rate) + self.network_params = tf.trainable_variables() + + # Target Network + self.target_inputs, self.target_action, self.target_Qout = \ + self.create_nnfdq_network(self.h1_size, self.h2_size, self.sd_enc_size, self.si_enc_size, self.dropout_rate) + self.target_network_params = tf.trainable_variables()[len(self.network_params):] + + # Op for periodically updating target network + self.update_target_network_params = \ + [self.target_network_params[i].assign(tf.multiply(self.network_params[i], self.tau) + + tf.multiply(self.target_network_params[i], 1. 
- self.tau)) + for i in range(len(self.target_network_params))] + + # Network target (y_i) + self.sampled_q = tf.placeholder(tf.float32, [None, 1]) + + # Predicted Q given state and chosed action + actions_one_hot = self.action + + if architecture != 'dip': + self.pred_q = tf.reshape(tf.reduce_sum(self.Qout * actions_one_hot, axis=1, name='q_acted'), + [-1, 1]) + else: + self.pred_q = self.Qout + + # Define loss and optimization Op + self.diff = self.sampled_q - self.pred_q + self.loss = tf.reduce_mean(self.clipped_error(self.diff), name='loss') + + self.optimizer = tf.train.AdamOptimizer(self.learning_rate) + self.optimize = self.optimizer.minimize(self.loss) + + def create_nnfdq_network(self, h1_size=130, h2_size=50, sd_enc_size=40, si_enc_size=80, dropout_rate=0.): + + keep_prob = 1 - dropout_rate + inputs = tf.placeholder(tf.float32, [None, self.s_dim]) + action = tf.placeholder(tf.float32, [None, self.a_dim]) + + if self.architecture == 'duel': + print("WE USE THE DUELING ARCHITECTURE!") + W_fc1 = tf.Variable(tf.truncated_normal([self.s_dim, h1_size], stddev=0.01)) + b_fc1 = tf.Variable(tf.zeros([h1_size])) + h_fc1 = tf.nn.relu(tf.matmul(inputs, W_fc1) + b_fc1) + + # value function + W_value = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01)) + b_value = tf.Variable(tf.zeros([h2_size])) + h_value = tf.nn.relu(tf.matmul(h_fc1, W_value) + b_value) + + W_value = tf.Variable(tf.truncated_normal([h2_size, 1], stddev=0.01)) + b_value = tf.Variable(tf.zeros([1])) + value_out = tf.matmul(h_value, W_value) + b_value + + # advantage function + W_advantage = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01)) + b_advantage = tf.Variable(tf.zeros([h2_size])) + h_advantage = tf.nn.relu(tf.matmul(h_fc1, W_advantage) + b_advantage) + + W_advantage = tf.Variable(tf.truncated_normal([h2_size, self.a_dim], stddev=0.01)) + b_advantage = tf.Variable(tf.zeros([self.a_dim])) + Advantage_out = tf.matmul(h_advantage, W_advantage) + b_advantage + + Qout = value_out + (Advantage_out - tf.reduce_mean(Advantage_out, axis=1, keep_dims=True)) + + elif self.architecture == 'noisy_duel': + print("WE USE THE NOISY DUELING ARCHITECTURE!") + self.mean_noisy_w = [] + self.mean_noisy_b = [] + h_fc1 = self.noisy_dense_layer(inputs, self.s_dim, h1_size, activation=tf.nn.relu) + # value function + h_value = self.noisy_dense_layer(h_fc1, h1_size, h2_size, activation=tf.nn.relu) + value_out = self.noisy_dense_layer(h_value, h2_size, 1) + + # advantage function + h_advantage = self.noisy_dense_layer(h_fc1, h1_size, h2_size, activation=tf.nn.relu) + Advantage_out = self.noisy_dense_layer(h_advantage, h2_size, self.a_dim) + + Qout = value_out + (Advantage_out - tf.reduce_mean(Advantage_out, axis=1, keep_dims=True)) + + else: + inputs = tf.placeholder(tf.float32, [None, self.sd_dim + self.si_dim]) + keep_prob = 1 - dropout_rate + sd_inputs, si_inputs = tf.split(inputs, [self.sd_dim, self.si_dim], 1) + action = tf.placeholder(tf.float32, [None, self.a_dim]) + + W_sdfe = tf.Variable(tf.truncated_normal([self.sd_dim, sd_enc_size], stddev=0.01)) + b_sdfe = tf.Variable(tf.zeros([sd_enc_size])) + h_sdfe = tf.nn.relu(tf.matmul(sd_inputs, W_sdfe) + b_sdfe) + if keep_prob < 1: + h_sdfe = tf.nn.dropout(h_sdfe, keep_prob) + + W_sife = tf.Variable(tf.truncated_normal([self.si_dim, si_enc_size], stddev=0.01)) + b_sife = tf.Variable(tf.zeros([si_enc_size])) + h_sife = tf.nn.relu(tf.matmul(si_inputs, W_sife) + b_sife) + if keep_prob < 1: + h_sife = tf.nn.dropout(h_sife, keep_prob) + + W_fc1 = 
tf.Variable(tf.truncated_normal([sd_enc_size+si_enc_size, h1_size], stddev=0.01)) + b_fc1 = tf.Variable(tf.zeros([h1_size])) + h_fc1 = tf.nn.relu(tf.matmul(tf.concat((h_sdfe, h_sife), 1), W_fc1) + b_fc1) + + W_fc2 = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01)) + b_fc2 = tf.Variable(tf.zeros([h2_size])) + h_fc2 = tf.nn.relu(tf.matmul(h_fc1, W_fc2) + b_fc2) + + W_out = tf.Variable(tf.truncated_normal([h2_size, self.a_dim], stddev=0.01)) + b_out = tf.Variable(tf.zeros([self.a_dim])) + Qout = tf.matmul(h_fc2, W_out) + b_out + + return inputs, action, Qout + + def predict(self, inputs): + return self.sess.run(self.Qout, feed_dict={ #inputs where a single flat_bstate + self.inputs: inputs + }) + + def predict_dip(self, inputs, action): + return self.sess.run(self.Qout, feed_dict={ #inputs and action where array of 64 (batch size) + self.inputs: inputs, + self.action: action + }) + + def predict_target(self, inputs): + return self.sess.run(self.target_Qout, feed_dict={ #inputs where a single flat_bstate + self.target_inputs: inputs + }) + + def predict_target_dip(self, inputs, action): + return self.sess.run(self.target_Qout, feed_dict={ #inputs and action where array of 64 (batch size) + self.target_inputs: inputs, + self.target_action: action + }) + + def train(self, inputs, action, sampled_q): + return self.sess.run([self.pred_q, self.optimize, self.loss], feed_dict={ #all the inputs are arrays of 64 + self.inputs: inputs, + self.action: action, + self.sampled_q: sampled_q + }) + + def compute_loss(self, inputs, action, sampled_q): + + return self.sess.run(self.loss, feed_dict={ # yes, needs to be changed too + self.inputs: inputs, + self.action: action, + self.sampled_q: sampled_q + }) + + def clipped_error(self, x): + return tf.where(tf.abs(x) < 1.0, 0.5 * tf.square(x), tf.abs(x) - 0.5) # condition, true, false + + def save_network(self, save_filename): + print('Saving deepq-network...') + self.saver.save(self.sess, './' + save_filename) + + def update_target_network(self): + self.sess.run(self.update_target_network_params) + + def load_network(self, load_filename): + self.saver = tf.train.Saver() + if load_filename.split('.')[-3] != '0': + try: + self.saver.restore(self.sess, './' + load_filename) + print("Successfully loaded:", load_filename) + except: + print("Could not find old network weights") + else: + print('nothing loaded in first iteration') + + def compute_mean_noisy(self): + return self.sess.run([self.mean_noisy_w, self.mean_noisy_b]) + + def noisy_dense_layer(self, input, input_neurons, output_neurons, activation=tf.identity): + + W_mu = tf.Variable(tf.truncated_normal([input_neurons, output_neurons], stddev=0.01)) + W_sigma = tf.Variable(tf.truncated_normal([input_neurons, output_neurons], stddev=0.01)) + W_eps = tf.random_normal(shape=[input_neurons, output_neurons]) + W = W_mu + tf.multiply(W_sigma, W_eps) + + b_mu = tf.Variable(tf.zeros([output_neurons])) + b_sigma = tf.Variable(tf.zeros([output_neurons])) + b_eps = tf.random_normal(shape=[output_neurons]) + b = b_mu + tf.multiply(b_sigma, b_eps) + + self.mean_noisy_w.append(tf.reduce_mean(tf.abs(W_sigma))) + self.mean_noisy_b.append(tf.reduce_mean(tf.abs(b_sigma))) + + return activation(tf.matmul(input, W) + b) + +class RNNFDeepQNetwork(object): + """ + Input to the network is the state and action, output is Q(s,a). 
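+    Slot-dependent features are encoded with a GRU when slot == 'sd' (a dense layer otherwise); slot-independent features always use a dense encoder.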
+ """ + def __init__(self, sess, si_state_dim, sd_state_dim, action_dim, learning_rate, tau, num_actor_vars, minibatch_size=64, + architecture='duel', h1_size=130, h2_size=50, sd_enc_size=40, si_enc_size=80, dropout_rate=0., slot='si'): + #super(NNFDeepQNetwork, self).__init__(sess, si_state_dim + sd_state_dim, action_dim, learning_rate, tau, num_actor_vars, + # minibatch_size=64, architecture='duel', h1_size=130, h2_size=50) + self.sess = sess + self.si_dim = si_state_dim + self.sd_dim = sd_state_dim + self.a_dim = action_dim + self.learning_rate = learning_rate + self.tau = tau + self.architecture = architecture + self.h1_size = h1_size + self.h2_size = h2_size + self.minibatch_size = minibatch_size + self.sd_enc_size = sd_enc_size + self.si_enc_size = si_enc_size + self.dropout_rate = dropout_rate + + # Create the deep Q network + self.inputs, self.action, self.Qout = \ + self.create_rnnfdq_network(self.h1_size, self.h2_size, self.sd_enc_size, self.si_enc_size, self.dropout_rate, slot=slot) + self.network_params = tf.trainable_variables() + + # Target Network + self.target_inputs, self.target_action, self.target_Qout = \ + self.create_rnnfdq_network(self.h1_size, self.h2_size, self.sd_enc_size, self.si_enc_size, self.dropout_rate, tn='target', slot=slot) + self.target_network_params = tf.trainable_variables()[len(self.network_params):] + + # Op for periodically updating target network + self.update_target_network_params = \ + [self.target_network_params[i].assign(tf.multiply(self.network_params[i], self.tau) + + tf.multiply(self.target_network_params[i], 1. - self.tau)) + for i in range(len(self.target_network_params))] + + # Network target (y_i) + self.sampled_q = tf.placeholder(tf.float32, [None, 1]) + + # Predicted Q given state and chosed action + actions_one_hot = self.action + + if architecture!= 'dip': + self.pred_q = tf.reshape(tf.reduce_sum(self.Qout * actions_one_hot, axis=1, name='q_acted'), + [self.minibatch_size, 1]) + else: + self.pred_q = self.Qout + + # Define loss and optimization Op + self.diff = self.sampled_q - self.pred_q + self.loss = tf.reduce_mean(self.clipped_error(self.diff), name='loss') + + self.optimizer = tf.train.AdamOptimizer(self.learning_rate) + self.optimize = self.optimizer.minimize(self.loss) + + #def create_slot_encoder(self): + + + def create_rnnfdq_network(self, h1_size=130, h2_size=50, sd_enc_size=40, si_enc_size=80, dropout_rate=0., + tn='normal', slot='si'): + inputs = tf.placeholder(tf.float32, [None, self.sd_dim + self.si_dim]) + keep_prob = 1 - dropout_rate + sd_inputs, si_inputs = tf.split(inputs, [self.sd_dim, self.si_dim], 1) + action = tf.placeholder(tf.float32, [None, self.a_dim]) + if slot == 'sd': + sd_inputs = tf.reshape(sd_inputs, (tf.shape(sd_inputs)[0], 1, self.sd_dim)) + + #slots encoder + with tf.variable_scope(tn): + #try: + lstm_cell = tf.nn.rnn_cell.GRUCell(self.sd_enc_size) + hidden_state = lstm_cell.zero_state(tf.shape(sd_inputs)[0], tf.float32) + _, h_sdfe = tf.nn.dynamic_rnn(lstm_cell, sd_inputs, initial_state=hidden_state) + #except: + # lstm_cell = tf.contrib.rnn.GRUCell(self.sd_enc_size) + # hidden_state = lstm_cell.zero_state(tf.shape(sd_inputs)[0], tf.float32) + # _, h_sdfe = tf.contrib.rnn.dynamic_rnn(lstm_cell, sd_inputs, initial_state=hidden_state) + else: + W_sdfe = tf.Variable(tf.truncated_normal([self.sd_dim, sd_enc_size], stddev=0.01)) + b_sdfe = tf.Variable(tf.zeros([sd_enc_size])) + h_sdfe = tf.nn.relu(tf.matmul(sd_inputs, W_sdfe) + b_sdfe) + if keep_prob < 1: + h_sdfe = tf.nn.dropout(h_sdfe, keep_prob) + + 
W_sife = tf.Variable(tf.truncated_normal([self.si_dim, si_enc_size], stddev=0.01)) + b_sife = tf.Variable(tf.zeros([si_enc_size])) + h_sife = tf.nn.relu(tf.matmul(si_inputs, W_sife) + b_sife) + if keep_prob < 1: + h_sife = tf.nn.dropout(h_sife, keep_prob) + + W_fc1 = tf.Variable(tf.truncated_normal([sd_enc_size+si_enc_size, h1_size], stddev=0.01)) + b_fc1 = tf.Variable(tf.zeros([h1_size])) + h_fc1 = tf.nn.relu(tf.matmul(tf.concat((h_sdfe, h_sife), 1), W_fc1) + b_fc1) + + W_fc2 = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01)) + b_fc2 = tf.Variable(tf.zeros([h2_size])) + h_fc2 = tf.nn.relu(tf.matmul(h_fc1, W_fc2) + b_fc2) + + W_out = tf.Variable(tf.truncated_normal([h2_size, self.a_dim], stddev=0.01)) + b_out = tf.Variable(tf.zeros([self.a_dim])) + Qout = tf.matmul(h_fc2, W_out) + b_out + + return inputs, action, Qout + + def predict(self, inputs): + return self.sess.run(self.Qout, feed_dict={ #inputs where a single flat_bstate + self.inputs: inputs + }) + + def predict_dip(self, inputs, action): + return self.sess.run(self.Qout, feed_dict={ #inputs and action where array of 64 (batch size) + self.inputs: inputs, + self.action: action + }) + + def predict_target(self, inputs): + return self.sess.run(self.target_Qout, feed_dict={ #inputs where a single flat_bstate + self.target_inputs: inputs + }) + + def predict_target_dip(self, inputs, action): + return self.sess.run(self.target_Qout, feed_dict={ #inputs and action where array of 64 (batch size) + self.target_inputs: inputs, + self.target_action: action + }) + + def train(self, inputs, action, sampled_q): + return self.sess.run([self.pred_q, self.optimize, self.loss], feed_dict={ #all the inputs are arrays of 64 + self.inputs: inputs, + self.action: action, + self.sampled_q: sampled_q + }) + + def clipped_error(self, x): + return tf.where(tf.abs(x) < 1.0, 0.5 * tf.square(x), tf.abs(x) - 0.5) # condition, true, false + + def save_network(self, save_filename): + print('Saving deepq-network...') + self.saver.save(self.sess, save_filename) + + def update_target_network(self): + self.sess.run(self.update_target_network_params) + + def load_network(self, load_filename): + self.saver = tf.train.Saver() + if load_filename.split('.')[-3] != '0': + try: + self.saver.restore(self.sess, load_filename) + print("Successfully loaded:", load_filename) + except: + print("Could not find old network weights") + else: + print('nothing loaded in first iteration')
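For reference, below is a minimal usage sketch of the DeepQNetwork class defined above in policy/feudalgainRL/noisydqn.py. It assumes TensorFlow 1.x graph mode (as used throughout the file) and that the repository root is on PYTHONPATH; the dimensions, hyperparameters and random arrays are placeholders for illustration only. Under the 'noisy_duel' architecture the outputs are stochastic, because fresh Gaussian noise is drawn for the noisy layers on every session run.

import numpy as np
import tensorflow as tf

from policy.feudalgainRL.noisydqn import DeepQNetwork

state_dim, action_dim, batch = 100, 16, 64   # placeholder sizes; batch must equal minibatch_size for train()

sess = tf.Session()
dqn = DeepQNetwork(sess, state_dim, action_dim,
                   learning_rate=0.001, tau=0.02, num_actor_vars=0,
                   minibatch_size=batch, architecture='noisy_duel',
                   h1_size=300, h2_size=100)
sess.run(tf.global_variables_initializer())

states = np.random.rand(batch, state_dim)
actions = np.eye(action_dim)[np.random.randint(action_dim, size=batch)]  # one-hot executed actions
returns = np.random.rand(batch, 1)                                       # stand-in for the sampled_q targets

q_values = dqn.predict(states)             # Q(s, .) from the online (noisy) network
pred_q, _, loss = dqn.train(states, actions, returns)
dqn.update_target_network()                # soft update: target <- tau * online + (1 - tau) * target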