Commit 5fa29847 authored by Christian

First running version of FeudalGain

parent 8f6a7ef6
Showing 8178 additions and 0 deletions.
### Error model parameters ###
### set1 ###
correctNBLenDist = [0.264, 0.215, 0.138, 0.099, 0.08]
correctMean = [0.871, 0.103, 0.045, 0.032, 0.027, 0.049]
correctVar = [0.029, 0.01, 0.002, 0.001, 0.0, 0.001]
incorrectNBLenDist = [0.3, 0.068, 0.085, 0.072, 0.079]
incorrectNBPosDist = [0.0, 0.179, 0.063, 0.027, 0.017]
incorrectMean = [0.757, 0.179, 0.077, 0.048, 0.037, 0.065]
incorrectVar = [0.052, 0.014, 0.003, 0.001, 0.0, 0.002]
### Error model parameters ###
### set2 ###
correctNBLenDist = [0.322, 0.455, 0.183, 0.033, 0.006]
correctMean = [0.922, 0.105, 0.024, 0.021, 0.018, 0.022]
correctVar = [0.014, 0.013, 0.002, 0.001, 0.001, 0.001]
incorrectNBLenDist = [0.248, 0.462, 0.189, 0.073, 0.02]
incorrectNBPosDist = [0.0, 0.297, 0.045, 0.008, 0.002]
incorrectMean = [0.819, 0.201, 0.084, 0.052, 0.036, 0.042]
incorrectVar = [0.034, 0.02, 0.005, 0.002, 0.001, 0.004]
### Error model parameters ###
### set3 ###
correctNBLenDist = [0.147, 0.43, 0.212, 0.147, 0.048]
correctMean = [0.905, 0.101, 0.017, 0.008, 0.005, 0.01]
correctVar = [0.016, 0.013, 0.002, 0.001, 0.0, 0.001]
incorrectNBLenDist = [0.351, 0.388, 0.158, 0.071, 0.02]
incorrectNBPosDist = [0.0, 0.116, 0.027, 0.007, 0.002]
incorrectMean = [0.871, 0.164, 0.065, 0.043, 0.03, 0.054]
incorrectVar = [0.029, 0.02, 0.005, 0.002, 0.001, 0.003]
### Error model parameters ###
### set4 ###
correctNBLenDist = [0.143, 0.405, 0.24, 0.155, 0.043]
correctMean = [0.9, 0.104, 0.02, 0.01, 0.004, 0.004]
correctVar = [0.016, 0.012, 0.002, 0.001, 0.0, 0.0]
incorrectNBLenDist = [0.315, 0.387, 0.175, 0.089, 0.024]
incorrectNBPosDist = [0.0, 0.128, 0.042, 0.01, 0.0]
incorrectMean = [0.868, 0.155, 0.064, 0.043, 0.029, 0.038]
incorrectVar = [0.03, 0.02, 0.004, 0.002, 0.001, 0.001]
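Taken together, each set above specifies, for one simulated error condition, the distribution over N-best list lengths (correctNBLenDist / incorrectNBLenDist), the position distribution of incorrect hypotheses (incorrectNBPosDist), and per-position Gaussian confidence-score parameters (mean/var). As a rough, hypothetical sketch of how such parameters could drive a confidence-score sampler (this is not PyDial's actual error-model code; the function and variable names below are assumptions):

import numpy as np

# Hypothetical sampler sketch using the set1 parameters above.
correctNBLenDist = [0.264, 0.215, 0.138, 0.099, 0.08]     # relative weights for N-best length 1..5
correctMean = [0.871, 0.103, 0.045, 0.032, 0.027, 0.049]  # mean confidence per N-best position
correctVar = [0.029, 0.01, 0.002, 0.001, 0.0, 0.001]      # variance of confidence per position

rng = np.random.default_rng(1)

def sample_confidences(len_dist, means, variances):
    p = np.array(len_dist) / np.sum(len_dist)              # renormalise (weights need not sum to 1)
    n = int(rng.choice(len(p), p=p)) + 1                   # sampled N-best length
    conf = rng.normal(means[:n], np.sqrt(variances[:n]))   # Gaussian confidence per position
    conf = np.clip(conf, 1e-6, 1.0)
    return conf / conf.sum()                               # renormalise to a distribution

print(sample_confidences(correctNBLenDist, correctMean, correctVar))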
# Error model: 15% error rate, DSTC2 confscorer, DSTC2 nbestgenerator
# User model: standard sampled params, sampled patience
# Masks: off
###### General parameters ######
[GENERAL]
domains = CamRestaurants
singledomain = True
tracedialog = 0
seed = 1
[exec_config]
configdir = _benchmarkpolicies/env3-feudal
logfiledir = _benchmarklogs/env3-feudal
numtrainbatches = 20
traindialogsperbatch = 200
numbatchtestdialogs = 500
trainsourceiteration = 0
numtestdialogs = 500
trainerrorrate = 15
testerrorrate = 15
testeverybatch = True
deleteprevpolicy = True
[logging]
usecolor = False
screen_level = results
file_level = results
file = auto
###### Environment parameters ######
[agent]
maxturns = 25
[usermodel]
usenewgoalscenarios = True
oldstylepatience = False
patience = 4,6
configfile = config/sampledUM.cfg
[errormodel]
nbestsize = 5
confusionmodel = LevenshteinConfusions
nbestgeneratormodel = DSTC2NBestGenerator
confscorer = DSTC2
configfile = config/set1-ErrorModel.cfg
[summaryacts]
maxinformslots = 5
informmask = True
requestmask = True
informcountaccepted = 4
byemask = True
###### Dialogue Manager parameters ######
[policy]
policydir = _benchmarkpolicies/env3-feudal
belieftype = focus
useconfreq = False
learning = True
policytype = feudalgain
startwithhello = False
inpolicyfile = auto
outpolicyfile = auto
temperature = 0.0
noisy_acer = True
sample_argmax = False
[feudalpolicy]
features = learned
si_policy_type = acer
only_master = True
jsd_reward = True
#jsd_function = tanh
js_threshold = 0.2
js_threshold_master = 1
[i2a]
is_imaging = False
deepmind = False
load_pretrain_data = False
improve_env = False
share_layer = 2
new_q_loss = False
device = cpu
env_model_path = env_model/env1_acer_200.pkl
[dqnpolicy]
q_update = double
architecture = duel
#architecture = duel
h1_size = 300
h2_size = 100
capacity = 2000
beta = 0.95
epsilon_start = 0.3
maxiter = 4000
minibatch_size = 64
is_threshold = 5.0
episodeNum = 0.0
epsilon_end = 0.0
n_in = 268
features = ["discourseAct", "method", "requested", "full", "lastActionInformNone", "offerHappened", "inform_info"]
###### Evaluation parameters ######
[eval]
rewardvenuerecommended = 0
penaliseallturns = True
wrongvenuepenalty = 0
notmentionedvaluepenalty = 0
successmeasure = objective
successreward = 20
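For orientation, the [policy] and [feudalpolicy] blocks above are what activate FeudalGain: policytype = feudalgain routes to FeudalGainPolicy (see the PolicyManager diff below), features = learned selects the learned state encoders, and jsd_reward / js_threshold control the information-gain signal. A minimal, illustrative sketch of reading these options with Python's standard configparser (PyDial itself goes through utils.Settings; the filename used here is an assumption):

import configparser

cfg = configparser.ConfigParser()
cfg.read('env3-feudal-CamRestaurants.cfg')  # hypothetical name for the config file above

policy_type = cfg.get('policy', 'policytype')               # 'feudalgain'
features = cfg.get('feudalpolicy', 'features')              # 'learned'
jsd_reward = cfg.getboolean('feudalpolicy', 'jsd_reward')   # True
js_threshold = cfg.getfloat('feudalpolicy', 'js_threshold') # 0.2
train_err = cfg.getint('exec_config', 'trainerrorrate')     # 15

print(policy_type, features, jsd_reward, js_threshold, train_err)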
@@ -303,6 +303,9 @@ class PolicyManager(object):
        elif policy_type == 'feudalAC':
            from policy import FeudalACPolicy
            self.domainPolicies[domainString] = FeudalACPolicy.FeudalACPolicy(in_policy_file, out_policy_file, domainString, learning)
+       elif policy_type == 'feudalgain':
+           from policy import FeudalGainPolicy
+           self.domainPolicies[domainString] = FeudalGainPolicy.FeudalGainPolicy(in_policy_file, out_policy_file, domainString, learning)
        else:
            try:
                # try to view the config string as a complete module path to the class to be instantiated
...
###############################################################################
# PyDial: Multi-domain Statistical Spoken Dialogue System Software
###############################################################################
#
# Copyright 2015 - 2019
# Cambridge University Engineering Department Dialogue Systems Group
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
###############################################################################
'''
FeudalBBQNPolicy.py - feudal Bayes-by-Backprop deep Q network policy
==================================================
Author: Chris Tegho and Pei-Hao (Eddy) Su (Copyright CUED Dialogue Systems Group 2016)
.. seealso:: CUED Imports/Dependencies:
import :class:`Policy`
import :class:`utils.ContextLogger`
.. warning::
Documentation not done.
************************
'''
import copy
import os
import json
import numpy as np
import pickle as pickle
import random
import sys
import utils
from utils.Settings import config as cfg
from utils import ContextLogger, DiaAct, DialogueState
import ontology.FlatOntologyManager as FlatOnt
# from theano_dialogue.util.tool import *
import tensorflow as tf
from policy.DRL.replay_bufferVanilla import ReplayBuffer
from policy.DRL.replay_prioritisedVanilla import ReplayPrioritised
import policy.DRL.utils as drlutils
from policy.DRL import bdqn as bbqn
import policy.Policy
import policy.SummaryAction
import policy.BBQNPolicy
from policy.Policy import TerminalAction, TerminalState
from policy.feudalRL.DIP_parametrisation import DIP_state, padded_state
logger = utils.ContextLogger.getLogger('')
# --- for flattening the belief --- #
domainUtil = FlatOnt.FlatDomainOntology('CamRestaurants')
class FeudalBBQNPolicy(policy.BBQNPolicy.BBQNPolicy):
'''Derived from :class:`BBQNPolicy`
'''
def __init__(self, in_policy_file, out_policy_file, domainString='CamRestaurants', is_training=False,
action_names=None, slot=None):
super(FeudalBBQNPolicy, self).__init__(in_policy_file, out_policy_file, domainString, is_training)
tf.reset_default_graph()
self.domainString = domainString
self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString)
self.in_policy_file = in_policy_file
self.out_policy_file = out_policy_file
self.is_training = is_training
self.accum_belief = []
self.prev_state_check = None
self.episode_ave_max_q = []
self.capacity *= 4 #set the capacity for episode methods, multiply it to adjust to turn based methods
self.slot = slot
# init session
self.sess = tf.Session()
with tf.device("/cpu:0"):
np.random.seed(self.randomseed)
tf.set_random_seed(self.randomseed)
# initialise a replay buffer
if self.replay_type == 'vanilla':
self.episodes[self.domainString] = ReplayBuffer(self.capacity, self.minibatch_size, self.randomseed)
elif self.replay_type == 'prioritized':
self.episodes[self.domainString] = ReplayPrioritised(self.capacity, self.minibatch_size,
self.randomseed)
# replay_buffer = ReplayBuffer(self.capacity, self.randomseed)
# self.episodes = []
self.samplecount = 0
self.episodecount = 0
# construct the models
self.state_dim = 89 # current DIP state dim
self.summaryaction = policy.SummaryAction.SummaryAction(domainString)
self.action_names = action_names
self.action_dim = len(self.action_names)
action_bound = len(self.action_names)
self.stats = [0 for _ in range(self.action_dim)]
self.stdVar = []
self.meanVar = []
self.stdMean = []
self.meanMean = []
self.td_error = []
self.td_errorVar = []
self.target_update_freq = 1
if cfg.has_option('bbqnpolicy', 'target_update_freq'):
self.target_update_freq = cfg.getint('bbqnpolicy', 'target_update_freq')
#feudal params
self.features = 'dip'
self.sd_enc_size = 25
self.si_enc_size = 50
self.dropout_rate = 0.
if cfg.has_option('feudalpolicy', 'features'):
self.features = cfg.get('feudalpolicy', 'features')
if cfg.has_option('feudalpolicy', 'sd_enc_size'):
self.sd_enc_size = cfg.getint('feudalpolicy', 'sd_enc_size')
if cfg.has_option('feudalpolicy', 'si_enc_size'):
self.si_enc_size = cfg.getint('feudalpolicy', 'si_enc_size')
if cfg.has_option('feudalpolicy', 'dropout_rate') and self.is_training:
self.dropout_rate = cfg.getfloat('feudalpolicy', 'dropout_rate')
self.actfreq_ds = False
if cfg.has_option('feudalpolicy', 'actfreq_ds'):
self.actfreq_ds = cfg.getboolean('feudalpolicy', 'actfreq_ds')
if self.features == 'dip':
if self.actfreq_ds:
if self.domainString == 'CamRestaurants':
self.state_dim += 16
elif self.domainString == 'SFRestaurants':
self.state_dim += 25
elif self.domainString == 'Laptops11':
self.state_dim += 40
self.bbqn = bbqn.DeepQNetwork(self.sess, self.state_dim, self.action_dim, self.learning_rate, self.tau,
action_bound, self.architecture, self.h1_size, self.h2_size,
self.n_samples,
self.minibatch_size, self.sigma_prior, self.n_batches, self.stddev_var_mu,
self.stddev_var_logsigma, self.mean_log_sigma, self.importance_sampling,
self.alpha_divergence, self.alpha, self.sigma_eps)
elif self.features == 'learned' or self.features == 'rnn':
si_state_dim = 72
if self.actfreq_ds:
if self.domainString == 'CamRestaurants':
si_state_dim += 16
elif self.domainString == 'SFRestaurants':
si_state_dim += 25
elif self.domainString == 'Laptops11':
si_state_dim += 40
if self.domainString == 'CamRestaurants':
sd_state_dim = 94
elif self.domainString == 'SFRestaurants':
sd_state_dim = 158
elif self.domainString == 'Laptops11':
sd_state_dim = 13
else:
logger.error('Domain {} not implemented in feudal-DQN yet'.format(self.domainString)) # just find out the size of sd_state_dim for the new domain
if self.features == 'rnn':
arch = 'rnn'
self.state_dim = si_state_dim + sd_state_dim
self.bbqn = bbqn.RNNBBQNetwork(self.sess, si_state_dim, sd_state_dim, self.action_dim, self.learning_rate,
self.tau, action_bound, arch, self.h1_size, self.h2_size, self.n_samples,
self.minibatch_size, self.sigma_prior, self.n_batches, self.stddev_var_mu,
self.stddev_var_logsigma, self.mean_log_sigma, self.importance_sampling,
self.alpha_divergence, self.alpha, self.sigma_eps, sd_enc_size=self.sd_enc_size,
si_enc_size=self.sd_enc_size, dropout_rate=self.dropout_rate, slot=slot)
else:
arch = 'vanilla'
self.state_dim = si_state_dim + sd_state_dim
self.bbqn = bbqn.NNBBQNetwork(self.sess, si_state_dim, sd_state_dim, self.action_dim, self.learning_rate,
self.tau, action_bound, arch, self.h1_size, self.h2_size, self.n_samples,
self.minibatch_size, self.sigma_prior, self.n_batches, self.stddev_var_mu,
self.stddev_var_logsigma, self.mean_log_sigma, self.importance_sampling,
self.alpha_divergence, self.alpha, self.sigma_eps, sd_enc_size=self.sd_enc_size,
si_enc_size=self.sd_enc_size, dropout_rate=self.dropout_rate, slot=slot)
else:
logger.error('features "{}" not implemented'.format(self.features))
# when all models are defined, init all variables
init_op = tf.global_variables_initializer()
self.sess.run(init_op)
self.loadPolicy(self.in_policy_file)
print('loaded replay size: ', self.episodes[self.domainString].size())
self.bbqn.update_target_network()
def record(self, reward, domainInControl=None, weight=None, state=None, action=None, exec_mask=None):
if domainInControl is None:
domainInControl = self.domainString
if self.actToBeRecorded is None:
# self.actToBeRecorded = self.lastSystemAction
self.actToBeRecorded = self.summaryAct
if state is None:
state = self.prevbelief
if action is None:
action = self.actToBeRecorded
cState, cAction = state, action
reward /= 20.0
cur_cState = np.vstack([np.expand_dims(x, 0) for x in [cState]])
cur_action_q = self.bbqn.predict(cur_cState)
cur_target_q = self.bbqn.predict_target(cur_cState)
if exec_mask is not None:
admissible = np.add(cur_target_q, np.array(exec_mask))
else:
admissible = cur_target_q
Q_s_t_a_t_ = cur_action_q[0][cAction]
gamma_Q_s_tplu1_maxa_ = self.gamma * np.max(admissible)
if weight is None:
if self.replay_type == 'vanilla':
self.episodes[domainInControl].record(state=cState, \
state_ori=state, action=cAction, reward=reward)
elif self.replay_type == 'prioritized':
# heuristically assign 0.0 to Q_s_t_a_t_ and Q_s_tplu1_maxa_, doesn't matter as it is not used
if True:
# if self.samplecount >= self.capacity:
self.episodes[domainInControl].record(state=cState, \
state_ori=state, action=cAction, reward=reward, \
Q_s_t_a_t_=Q_s_t_a_t_,
gamma_Q_s_tplu1_maxa_=gamma_Q_s_tplu1_maxa_, uniform=False)
else:
self.episodes[domainInControl].record(state=cState, \
state_ori=state, action=cAction, reward=reward, \
Q_s_t_a_t_=Q_s_t_a_t_,
gamma_Q_s_tplu1_maxa_=gamma_Q_s_tplu1_maxa_, uniform=True)
else:
self.episodes[domainInControl].record(state=cState, state_ori=state, action=cAction, reward=reward,
ma_weight=weight)
self.actToBeRecorded = None
self.samplecount += 1
return
def finalizeRecord(self, reward, domainInControl=None):
if domainInControl is None:
domainInControl = self.domainString
if self.episodes[domainInControl] is None:
logger.warning("record attempted to be finalized for domain where nothing has been recorded before")
return
# normalising total return to -1~1
# if reward == 0:
# reward = -20.0
reward /= 20.0
"""
if reward == 20.0:
reward = 1.0
else:
reward = -0.5
"""
# reward = float(reward+10.0)/40.0
terminal_state, terminal_action = self.convertStateAction(TerminalState(), TerminalAction())
if self.replay_type == 'vanilla':
self.episodes[domainInControl].record(state=terminal_state, \
state_ori=TerminalState(), action=terminal_action, reward=reward,
terminal=True)
elif self.replay_type == 'prioritized':
# heuristically assign 0.0 to Q_s_t_a_t_ and Q_s_tplu1_maxa_, doesn't matter as it is not used
if True:
# if self.samplecount >= self.capacity:
self.episodes[domainInControl].record(state=terminal_state, \
state_ori=TerminalState(), action=terminal_action, reward=reward, \
Q_s_t_a_t_=0.0, gamma_Q_s_tplu1_maxa_=0.0, uniform=False,
terminal=True)
else:
self.episodes[domainInControl].record(state=terminal_state, \
state_ori=TerminalState(), action=terminal_action, reward=reward, \
Q_s_t_a_t_=0.0, gamma_Q_s_tplu1_maxa_=0.0, uniform=True,
terminal=True)
def convertStateAction(self, state, action):
'''
'''
if isinstance(state, TerminalState):
return [0] * 89, action
else:
if self.features == 'learned' or self.features == 'rnn':
dip_state = padded_state(state.domainStates[state.currentdomain], self.domainString)
else:
dip_state = DIP_state(state.domainStates[state.currentdomain], self.domainString)
action_name = self.actions.action_names[action]
act_slot = 'general'
for slot in dip_state.slots:
if slot in action_name:
act_slot = slot
flat_belief = dip_state.get_beliefStateVec(act_slot)
self.prev_state_check = flat_belief
return flat_belief, action
def nextAction(self, beliefstate):
'''
select next action
:param beliefstate:
:param hyps:
:returns: (int) next summary action
'''
if self.exploration_type == 'e-greedy':
# epsilon greedy
if self.is_training and utils.Settings.random.rand() < self.epsilon:
action_Q = np.random.rand(len(self.action_names))
else:
action_Q = self.bbqn.predict(np.reshape(beliefstate, (1, len(beliefstate)))) # + (1. / (1. + i + j))
self.episode_ave_max_q.append(np.max(action_Q))
# return the Q vector; the action will be converted in the feudal policy
return action_Q
def train(self):
'''
call this function when the episode ends
'''
if not self.is_training:
logger.info("Not in training mode")
return
else:
logger.info("Update dqn policy parameters.")
self.episodecount += 1
logger.info("Sample Num so far: %s" % (self.samplecount))
logger.info("Episode Num so far: %s" % (self.episodecount))
if self.samplecount >= self.minibatch_size * 10 and self.episodecount % self.training_frequency == 0:
logger.info('start training...')
s_batch, s_ori_batch, a_batch, r_batch, s2_batch, s2_ori_batch, t_batch, idx_batch, _ = \
self.episodes[self.domainString].sample_batch()
s_batch = np.vstack([np.expand_dims(x, 0) for x in s_batch])
s2_batch = np.vstack([np.expand_dims(x, 0) for x in s2_batch])
a_batch_one_hot = np.eye(self.action_dim, self.action_dim)[a_batch]
action_q = self.bbqn.predict_dip(s2_batch, a_batch_one_hot)
target_q = self.bbqn.predict_target_dip(s2_batch, a_batch_one_hot)
# print 'action Q and target Q:', action_q, target_q
y_i = []
for k in range(min(self.minibatch_size, self.episodes[self.domainString].size())):
Q_bootstrap_label = 0
if t_batch[k]:
Q_bootstrap_label = r_batch[k]
else:
if self.q_update == 'single':
belief = s2_ori_batch[k]
execMask = [0.0] * len(self.action_names) # TODO: find out how to compute the mask here, or save it when recording the state
execMask[-1] = -sys.maxsize
action_Q = target_q[k]
admissible = np.add(action_Q, np.array(execMask))
Q_bootstrap_label = r_batch[k] + self.gamma * np.max(admissible)
y_i.append(Q_bootstrap_label)
# Update the critic given the targets
reshaped_yi = np.vstack([np.expand_dims(x, 0) for x in y_i])
predicted_q_value, _, currentLoss, logLikelihood, varFC2, meanFC2, td_error, KL_div = self.bbqn.train(s_batch, a_batch_one_hot, reshaped_yi, self.episodecount)
if self.episodecount % self.target_update_freq == 0:
self.bbqn.update_target_network()
if self.episodecount % self.save_step == 0:
self.savePolicyInc() # self.out_policy_file)
# END OF FILE
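As the comment in nextAction() notes, this policy returns the raw Q vector and leaves action selection to the feudal policy that called it. Purely as a hedged illustration (not the actual FeudalGain code), a caller could combine that Q vector with an execution mask like this:

import numpy as np

def select_masked_action(action_q, exec_mask):
    # exec_mask holds 0.0 for admissible summary actions and a large negative value otherwise
    admissible = np.add(np.asarray(action_q).flatten(), np.asarray(exec_mask))
    return int(np.argmax(admissible))

q_vector = np.random.rand(1, 5)    # e.g. the array returned by nextAction()
mask = [0.0, 0.0, -1e9, 0.0, 0.0]  # third action masked out (illustrative)
print(select_masked_action(q_vector, mask))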
###############################################################################
# PyDial: Multi-domain Statistical Spoken Dialogue System Software
###############################################################################
#
# Copyright 2015 - 2019
# Cambridge University Engineering Department Dialogue Systems Group
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
###############################################################################
'''
FeudalDQNPolicy.py - feudal deep Q network policy
==================================================
Copyright CUED Dialogue Systems Group 2015 - 2017
.. seealso:: CUED Imports/Dependencies:
import :class:`Policy`
import :class:`utils.ContextLogger`
.. warning::
Documentation not done.
************************
'''
import copy
import os
import sys
import json
import numpy as np
import pickle as pickle
from itertools import product
from scipy.stats import entropy
import utils
from utils.Settings import config as cfg
from utils import ContextLogger, DiaAct, DialogueState
import ontology.FlatOntologyManager as FlatOnt
import tensorflow as tf
from policy.DRL.replay_buffer import ReplayBuffer
from policy.DRL.replay_prioritised import ReplayPrioritised
import policy.DRL.utils as drlutils
import policy.DRL.dqn as dqn
import policy.Policy
import policy.DQNPolicy
import policy.SummaryAction
from policy.Policy import TerminalAction, TerminalState
from policy.feudalRL.DIP_parametrisation import DIP_state, padded_state
from policy.feudalRL.feudalUtils import get_feudal_masks
from policy.DRL import bdqn as bbqn
logger = utils.ContextLogger.getLogger('')
class FeudalDQNPolicy(policy.DQNPolicy.DQNPolicy):
'''Derived from :class:`DQNPolicy`
'''
def __init__(self, in_policy_file, out_policy_file, domainString='CamRestaurants', is_training=False,
action_names=None, slot=None, sd_state_dim=50, js_threshold=0, info_reward=0.0, jsd_reward=False,
jsd_function=None):
super(FeudalDQNPolicy, self).__init__(in_policy_file, out_policy_file, domainString, is_training)
tf.reset_default_graph()
self.domainString = domainString
self.sd_state_dim = sd_state_dim
self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString)
self.in_policy_file = in_policy_file
self.out_policy_file = out_policy_file
self.is_training = is_training
self.accum_belief = []
self.info_reward = info_reward
self.js_threshold = js_threshold
self.jsd_reward = jsd_reward
self.jsd_function = jsd_function
if self.jsd_function is not None:
print("We use the JSD-function", self.jsd_function)
if self.js_threshold != 1.0 and not self.jsd_reward:
print("We use JS-divergence, threshold =", self.js_threshold)
if self.jsd_reward:
print("We train with raw JSD reward.")
self.slots = slot
self.features = 'dip'
if cfg.has_option('feudalpolicy', 'features'):
self.features = cfg.get('feudalpolicy', 'features')
self.actfreq_ds = False
if cfg.has_option('feudalpolicy', 'actfreq_ds'):
self.actfreq_ds = cfg.getboolean('feudalpolicy', 'actfreq_ds')
self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString)
self.prev_state_check = None
self.max_k = 5
if cfg.has_option('dqnpolicy', 'max_k'):
self.max_k = cfg.getint('dqnpolicy', 'max_k')
self.capacity *= 5 # capacity for episode methods, multiply it to adjust to turn based methods
# init session
self.sess = tf.Session()
with tf.device("/cpu:0"):
np.random.seed(self.randomseed)
tf.set_random_seed(self.randomseed)
# initialise a replay buffer
if self.replay_type == 'vanilla':
self.episodes[self.domainString] = ReplayBuffer(self.capacity, self.minibatch_size*4, self.randomseed)
elif self.replay_type == 'prioritized':
self.episodes[self.domainString] = ReplayPrioritised(self.capacity, self.minibatch_size,
self.randomseed)
self.samplecount = 0
self.episodecount = 0
# construct the models
self.summaryaction = policy.SummaryAction.SummaryAction(domainString)
self.action_names = action_names
self.action_dim = len(self.action_names)
action_bound = len(self.action_names)
self.stats = [0 for _ in range(self.action_dim)]
if self.features == 'learned' or self.features == 'rnn':
si_state_dim = 73
if self.actfreq_ds:
if self.domainString == 'CamRestaurants':
si_state_dim += 9#16
elif self.domainString == 'SFRestaurants':
si_state_dim += 9#25
elif self.domainString == 'Laptops11':
si_state_dim += 9#40
self.sd_enc_size = 50
self.si_enc_size = 25
self.dropout_rate = 0.
if cfg.has_option('feudalpolicy', 'sd_enc_size'):
self.sd_enc_size = cfg.getint('feudalpolicy', 'sd_enc_size')
if cfg.has_option('feudalpolicy', 'si_enc_size'):
self.si_enc_size = cfg.getint('feudalpolicy', 'si_enc_size')
if cfg.has_option('feudalpolicy', 'dropout_rate') and self.is_training:
self.dropout_rate = cfg.getfloat('feudalpolicy', 'dropout_rate')
self.state_dim = si_state_dim + sd_state_dim
if self.features == 'learned':
self.dqn = bbqn.DeepQNetwork(self.sess, self.state_dim, self.action_dim, self.learning_rate,
self.tau,
action_bound, self.architecture, self.h1_size, self.h2_size,
self.n_samples,
self.minibatch_size)
elif self.features == 'rnn':
self.dqn = dqn.RNNFDeepQNetwork(self.sess, si_state_dim, sd_state_dim, self.action_dim,
self.learning_rate, self.tau, action_bound, self.minibatch_size,
self.architecture, self.h1_size, self.h2_size,
sd_enc_size=self.sd_enc_size, si_enc_size=self.si_enc_size,
dropout_rate=self.dropout_rate, slot=self.slots)
else: # self.features = 'dip'
if self.actfreq_ds:
if self.domainString == 'CamRestaurants':
self.state_dim += 9#16
elif self.domainString == 'SFRestaurants':
self.state_dim += 9#25
elif self.domainString == 'Laptops11':
self.state_dim += 9#40
self.dqn = dqn.DeepQNetwork(self.sess, self.state_dim, self.action_dim,
self.learning_rate, self.tau, action_bound, self.minibatch_size,
self.architecture, self.h1_size,
self.h2_size, dropout_rate=self.dropout_rate)
# when all models are defined, init all variables (this might need to be moved to the main policy too)
init_op = tf.global_variables_initializer()
self.sess.run(init_op)
self.loadPolicy(self.in_policy_file)
print('loaded replay size: ', self.episodes[self.domainString].size())
self.dqn.update_target_network()
def record(self, reward, domainInControl=None, weight=None, state=None, action=None, exec_mask=None):
if domainInControl is None:
domainInControl = self.domainString
if self.actToBeRecorded is None:
self.actToBeRecorded = self.summaryAct
if state is None:
state = self.prevbelief
if action is None:
action = self.actToBeRecorded
cState, cAction = state, action
# normalising total return to -1~1
reward /= 20.0
if self.replay_type == 'vanilla':
self.episodes[domainInControl].record(state=cState, \
state_ori=state, action=cAction, reward=reward)
self.actToBeRecorded = None
self.samplecount += 1
def finalizeRecord(self, reward, domainInControl=None):
if domainInControl is None:
domainInControl = self.domainString
if self.episodes[domainInControl] is None:
logger.warning("record attempted to be finalized for domain where nothing has been recorded before")
return
reward /= 20.0
terminal_state, terminal_action = self.convertStateAction(TerminalState(), TerminalAction())
if self.replay_type == 'vanilla':
self.episodes[domainInControl].record(state=terminal_state, \
state_ori=TerminalState(), action=terminal_action, reward=reward,
terminal=True)
elif self.replay_type == 'prioritized':
self.episodes[domainInControl].record(state=terminal_state, \
state_ori=TerminalState(), action=terminal_action, reward=reward, \
Q_s_t_a_t_=0.0, gamma_Q_s_tplu1_maxa_=0.0, uniform=False,
terminal=True)
print('total TD', self.episodes[self.domainString].tree.total())
def convertStateAction(self, state, action):
'''
'''
if isinstance(state, TerminalState):
return [0] * 89, action
else:
if self.features == 'learned' or self.features == 'rnn':
dip_state = padded_state(state.domainStates[state.currentdomain], self.domainString)
else:
dip_state = DIP_state(state.domainStates[state.currentdomain], self.domainString)
action_name = self.actions.action_names[action]
act_slot = 'general'
for slot in dip_state.slots:
if slot in action_name:
act_slot = slot
flat_belief = dip_state.get_beliefStateVec(act_slot)
self.prev_state_check = flat_belief
return flat_belief, action
def nextAction(self, beliefstate):
'''
select next action
:param beliefstate: already converted to dipstatevec of the specific slot (or general)
:returns: (int) next summary action
'''
if self.exploration_type == 'e-greedy':
# epsilon greedy
if self.is_training and utils.Settings.random.rand() < self.epsilon:
action_Q = np.random.rand(len(self.action_names))
else:
if len(beliefstate.shape) == 1:
action_Q = self.dqn.predict(np.reshape(beliefstate, (1, -1)))
else:
action_Q = self.dqn.predict(beliefstate)
# add current max Q to self.episode_ave_max_q
self.episode_ave_max_q.append(np.max(action_Q))
# return the Q vector; the action will be converted in the feudal policy
return action_Q
def train(self):
'''
call this function when the episode ends
'''
if not self.is_training:
logger.info("Not in training mode")
return
else:
logger.info("Update dqn policy parameters.")
self.episodecount += 1
logger.info("Sample Num so far: %s" % (self.samplecount))
logger.info("Episode Num so far: %s" % (self.episodecount))
s_batch_new, s_batch_beliefstate, s_batch_chosen_slot, s2_batch_dipstate, s2_batch_beliefstate, t_batch_new, r_batch_new = \
[], [], [], [], [], [], []
if self.samplecount >= self.minibatch_size * 8 and self.episodecount % self.training_frequency == 0:
logger.info('start training...')
a_batch_one_hot_new = None
#updating only states where the action is not "pass()" complicates things :/
#since in a batch we can take only non-pass() actions, we have to loop a bit until we get enough samples
while len(s_batch_new) < self.minibatch_size:
s_batch, s_ori_batch, a_batch, r_batch, s2_batch, s2_ori_batch, t_batch, idx_batch, _ = \
self.episodes[self.domainString].sample_batch()
a_batch_one_hot = np.eye(self.action_dim, self.action_dim)[a_batch]
#we only want to update state-action pairs where action != pass()
valid_steps = [action[-1] != 1 for action in a_batch_one_hot]
a_batch_one_hot = a_batch_one_hot[valid_steps]
s_batch_new += [s[0] for i, s in enumerate(s_batch) if valid_steps[i]]
s_batch_beliefstate += [s[1] for i, s in enumerate(s_batch) if valid_steps[i]]
s_batch_chosen_slot += [s[2] for i, s in enumerate(s_batch) if valid_steps[i]]
s2_batch_dipstate += [s[3] for s, valid in zip(s2_batch, valid_steps) if valid]
s2_batch_beliefstate += [s[1] for s, valid in zip(s2_batch, valid_steps) if valid]
r_batch_new += [r for r, valid in zip(r_batch, valid_steps) if valid]
t_batch_new += [t for t, valid in zip(t_batch, valid_steps) if valid]
if a_batch_one_hot_new is None:
a_batch_one_hot_new = a_batch_one_hot
else:
a_batch_one_hot_new = np.vstack((a_batch_one_hot_new, a_batch_one_hot))
s_batch_new = np.vstack(s_batch_new)
s2_batch_dipstate = np.vstack(s2_batch_dipstate)
if self.js_threshold < 1.0 or self.jsd_reward:
#TODO: This is highly inefficient
js_divergence_batch = []
for belief, belief2, slot in zip(s_batch_beliefstate, s2_batch_beliefstate, s_batch_chosen_slot):
if slot != "None":
keys = belief['beliefs'][slot].keys()
b = [belief['beliefs'][slot]['**NONE**']] + \
[belief['beliefs'][slot][value] for value in list(keys) if value != '**NONE**']
b_2 = [belief2['beliefs'][slot]['**NONE**']] + \
[belief2['beliefs'][slot][value] for value in list(keys) if value != '**NONE**']
js_divergence = self.compute_js_divergence(b, b_2)
js_divergence_batch.append(js_divergence)
else:
js_divergence_batch.append(0.0)
else:
js_divergence_batch = [0] * len(r_batch_new)
tanh_n = np.tanh(1)
if self.jsd_reward:
if self.jsd_function == 'tanh':
js_divergence_batch = np.tanh(np.array(js_divergence_batch)) / tanh_n
#normalize jsd between -1 and 1
js_divergence_batch = (-1 + 2 * np.array(js_divergence_batch)).tolist()
elif self.js_threshold < 1.0:
# normalizing bound to [0, 2] and then /20
js_divergence_batch = [2/20 * int(x > self.js_threshold) for x in js_divergence_batch]
action_q = self.dqn.predict_dip(s2_batch_dipstate, a_batch_one_hot_new)
target_q = self.dqn.predict_target_dip(s2_batch_dipstate, a_batch_one_hot_new)
action_q = np.reshape(action_q, (s_batch_new.shape[0], -1, self.action_dim))
target_q = np.reshape(target_q, (s_batch_new.shape[0], -1, self.action_dim))
y_i = []
for k in range(min(s_batch_new.shape[0], self.episodes[self.domainString].size())):
Q_bootstrap_label = 0
if t_batch_new[k]:
Q_bootstrap_label = r_batch_new[k]
else:
if self.q_update == 'single':
action_Q = target_q[k]
if self.jsd_reward:
Q_bootstrap_label = js_divergence_batch[k] + self.gamma * np.max(action_Q)
else:
Q_bootstrap_label = r_batch_new[k] + js_divergence_batch[k] + self.gamma * np.max(action_Q)
elif self.q_update == 'double':
action_Q = action_q[k]
argmax_tuple = np.unravel_index(np.argmax(action_Q, axis=None), action_Q.shape)
value_Q = target_q[k][argmax_tuple]
Q_bootstrap_label = r_batch_new[k] + js_divergence_batch[k] + self.gamma * value_Q
y_i.append(Q_bootstrap_label)
if self.replay_type == 'prioritized':
# update the sum-tree
# update the TD error of the samples in the minibatch
currentQ_s_a_ = action_q[k][a_batch[k]]
error = abs(currentQ_s_a_ - Q_bootstrap_label)
self.episodes[self.domainString].update(idx_batch[k], error)
reshaped_yi = np.vstack([np.expand_dims(x, 0) for x in y_i])
predicted_q_value, _, currentLoss = self.dqn.train(s_batch_new, a_batch_one_hot_new, reshaped_yi)
if self.episodecount % 1 == 0:
# Update target networks
self.dqn.update_target_network()
self.savePolicyInc()
def compute_js_divergence(self, P, Q):
# scipy's entropy() renormalises its arguments, so the unscaled sum P+Q acts as the mixture (P+Q)/2
M = [p + q for p, q in zip(P, Q)]
return 0.5 * (entropy(P, M, base=2) + entropy(Q, M, base=2))
# END OF FILE
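The information-gain signal used in train() above comes from compute_js_divergence(): the Jensen-Shannon divergence between the belief over the chosen slot before and after the turn. Since scipy.stats.entropy renormalises its arguments, passing the unscaled sum P+Q as M still yields the mixture (P+Q)/2. A small standalone check with made-up belief values:

from scipy.stats import entropy

def compute_js_divergence(P, Q):
    # same computation as in FeudalDQNPolicy; entropy() renormalises M internally
    M = [p + q for p, q in zip(P, Q)]
    return 0.5 * (entropy(P, M, base=2) + entropy(Q, M, base=2))

before = [0.70, 0.20, 0.10]  # belief over {'**NONE**', 'italian', 'chinese'} before the turn (illustrative)
after = [0.05, 0.90, 0.05]   # belief after the user gives the slot value

jsd = compute_js_divergence(before, after)
print(round(jsd, 3))         # lies in [0, 1]; with js_threshold = 0.2 this turn would trigger the bonus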
###############################################################################
# PyDial: Multi-domain Statistical Spoken Dialogue System Software
###############################################################################
#
# Copyright 2015 - 2019
# Cambridge University Engineering Department Dialogue Systems Group
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
###############################################################################
"""
Implementation of DQAN - Deep Q Action Network
The algorithm is developed with tflearn + Tensorflow
Author: Pei-Hao Su
"""
import tensorflow as tf
import numpy as np
import tflearn
from policy.DRL.replay_buffer import ReplayBuffer
# ===========================
# Deep Q Action Network
# ===========================
class DeepQNetwork(object):
"""
Input to the network is the state and action, output is Q(s,a).
"""
def __init__(self, sess, state_dim, action_dim, learning_rate, tau, \
num_actor_vars, architecture = 'duel', h1_size = 130, h2_size = 50):
self.sess = sess
self.s_dim = state_dim
self.a_dim = action_dim
self.learning_rate = learning_rate
self.tau = tau
self.architecture = architecture
self.h1_size = h1_size
self.h2_size = h2_size
# Create the deep Q network
self.inputs, self.action, self.Qout = \
self.create_ddq_network(self.architecture, self.h1_size, self.h2_size)
self.network_params = tf.trainable_variables()
# Target Network
self.target_inputs, self.target_action, self.target_Qout = \
self.create_ddq_network(self.architecture, self.h1_size, self.h2_size)
self.target_network_params = tf.trainable_variables()[len(self.network_params):]
# Op for periodically updating target network
self.update_target_network_params = \
[self.target_network_params[i].assign(\
tf.multiply(self.network_params[i], self.tau) + tf.multiply(self.target_network_params[i], 1. - self.tau))
for i in range(len(self.target_network_params))]
# Network target (y_i)
self.sampled_q = tf.placeholder(tf.float32, [None, 1])
# Predicted Q given state and chosen action
#actions_one_hot = self.action
#self.pred_q = tf.reduce_sum(self.Qout * actions_one_hot, reduction_indices=1, name='q_acted')
self.pred_q = self.Qout
self.diff = self.sampled_q - self.pred_q
self.loss = tf.reduce_mean(self.clipped_error(self.diff), name='loss')
self.optimizer = tf.train.AdamOptimizer(self.learning_rate)
self.optimize = self.optimizer.minimize(self.loss)
def create_ddq_network(self, architecture = 'duel', h1_size = 130, h2_size = 50):
inputs = tf.placeholder(tf.float32, [None, self.s_dim])
action = tf.placeholder(tf.float32, [None, self.a_dim])
# state network
W_fc1_s = tf.Variable(tf.truncated_normal([self.s_dim, h1_size], stddev=0.01))
b_fc1_s = tf.Variable(tf.zeros([h1_size]))
h_fc1_s = tf.nn.relu(tf.matmul(inputs, W_fc1_s) + b_fc1_s)
# action network
W_fc1_a = tf.Variable(tf.truncated_normal([self.a_dim, h1_size], stddev=0.01))
b_fc1_a = tf.Variable(tf.zeros([h1_size]))
h_fc1_a = tf.nn.relu(tf.matmul(action, W_fc1_a) + b_fc1_a)
#h_fc1 = tf.nn.tanh(tf.matmul(inputs, W_fc1) + b_fc1)
#if architecture == 'duel':
if False:
"""
W_fc2_s = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01))
b_fc2_s = tf.Variable(tf.zeros([h2_size]))
h_fc2_s = tf.nn.relu(tf.matmul(h_fc1_s, W_fc2_s) + b_fc2_s)
W_value = tf.Variable(tf.truncated_normal([h2_size, 1], stddev=0.01))
b_value = tf.Variable(tf.zeros([1]))
value_out = tf.matmul(h_fc2_s, W_value) + b_value
W_fc2_a = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01))
b_fc2_a = tf.Variable(tf.zeros([h2_size]))
h_fc2_a = tf.nn.relu(tf.matmul(h_fc1_a, W_fc2_a) + b_fc2_a)
Qout = tf.reduce_sum( tf.mul( h_fc2_s,h_fc2_a ), 1)
"""
# value function
W_value = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01))
b_value = tf.Variable(tf.zeros([h2_size]))
h_value = tf.nn.relu(tf.matmul(h_fc1, W_value) + b_value)
W_value = tf.Variable(tf.truncated_normal([h2_size, 1], stddev=0.01))
b_value = tf.Variable(tf.zeros([1]))
value_out = tf.matmul(h_value, W_value) + b_value
# advantage function
W_advantage = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01))
b_advantage = tf.Variable(tf.zeros([h2_size]))
h_advantage = tf.nn.relu(tf.matmul(h_fc1, W_advantage) + b_advantage)
W_advantage = tf.Variable(tf.truncated_normal([h2_size, self.a_dim], stddev=0.01))
b_advantage = tf.Variable(tf.zeros([self.a_dim]))
Advantage_out = tf.matmul(h_advantage, W_advantage) + b_advantage
Qout = value_out + (Advantage_out - tf.reduce_mean(Advantage_out, reduction_indices=1, keep_dims=True))
else:
W_fc2_s = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01))
b_fc2_s = tf.Variable(tf.zeros([h2_size]))
h_fc2_s = tf.nn.relu(tf.matmul(h_fc1_s, W_fc2_s) + b_fc2_s)
W_fc2_a = tf.Variable(tf.truncated_normal([h1_size, h2_size], stddev=0.01))
b_fc2_a = tf.Variable(tf.zeros([h2_size]))
h_fc2_a = tf.nn.relu(tf.matmul(h_fc1_a, W_fc2_a) + b_fc2_a)
# inner product of state s and action a
#Qout = tf.mul(h_fc2_s,h_fc2_a)
Qout = tf.reduce_sum(tf.multiply(h_fc2_s, h_fc2_a), 1)
#Qout = tf.reduce_sum( tf.mul( h_fc2_s,h_fc2_a ), 1, keep_dims=True )
#Qout = tf.reduce_sum(tf.mul(h_fc2_s,h_fc2_a))
return inputs, action, Qout
def train(self, inputs, action, sampled_q):
return self.sess.run([self.pred_q, self.optimize, self.loss], feed_dict={
self.inputs: inputs,
self.action: action,
self.sampled_q: sampled_q
})
def predict(self, inputs, action):
#return self.sess.run(self.pred_q, feed_dict={
return self.sess.run(self.Qout, feed_dict={
self.inputs: inputs,
self.action: action
})
def predict_target(self, inputs, action):
#return self.sess.run(self.pred_q, feed_dict={
return self.sess.run(self.target_Qout, feed_dict={
self.target_inputs: inputs,
self.target_action: action
})
def update_target_network(self):
self.sess.run(self.update_target_network_params)
def load_network(self, load_filename):
self.saver = tf.train.Saver()
try:
self.saver.restore(self.sess, load_filename)
print("Successfully loaded:", load_filename)
except:
print("Could not find old network weights")
def save_network(self, save_filename):
print('Saving deepq-network...')
self.saver.save(self.sess, save_filename)
def clipped_error(self, x):
return tf.where(tf.abs(x) < 1.0, 0.5 * tf.square(x), tf.abs(x) - 0.5) # condition, true, false
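For reference, a minimal sketch of exercising this DQAN network with random data under TensorFlow 1.x (the shapes, hyperparameters, and import path are assumptions; this snippet is not part of the commit):

import numpy as np
import tensorflow as tf
from policy.DRL.dqan import DeepQNetwork  # assumed module path for the class defined above

sess = tf.Session()
net = DeepQNetwork(sess, state_dim=89, action_dim=16, learning_rate=1e-3, tau=0.01,
                   num_actor_vars=0, architecture='vanilla', h1_size=130, h2_size=50)
sess.run(tf.global_variables_initializer())

states = np.random.rand(32, 89).astype(np.float32)    # batch of belief-state vectors
actions = np.eye(16)[np.random.randint(16, size=32)]  # one-hot summary actions
targets = np.random.rand(32, 1).astype(np.float32)    # bootstrapped Q targets (y_i)

pred_q, _, loss = net.train(states, actions, targets)
net.update_target_network()
print(loss)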