diff --git a/Dockerfile b/Dockerfile
index 5bc422ac0eb174e5ecf6fa19527919ace751fdde..49bcaad8bb0420937c43c06f3ab1427093b33faf 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -29,7 +29,6 @@ RUN pip install scipy
 RUN pip install scikit-learn==0.20.3
 RUN pip install pytorch-pretrained-bert==0.6.1
 RUN pip install transformers==2.3.0
-RUN pip install tensorflow==1.14
 RUN pip install tensorboard==1.14.0
 RUN pip install tensorboardX==1.7
 RUN pip install tokenizers==0.8.0
diff --git a/README.md b/README.md
index 0a91208556b698ae3d2e05d087ec171e0fcbe39d..ffd1e93141ee77ba61ee6ec404e7b0049f621e05 100755
--- a/README.md
+++ b/README.md
@@ -46,7 +46,7 @@ Our documents are on https://thu-coai.github.io/ConvLab-2_docs/convlab2.html.
 We provide the following models:
 
 - NLU: SVMNLU, MILU, BERTNLU
-- DST: rule, MDBT, TRADE, SUMBT
+- DST: rule, TRADE, SUMBT
 - Policy: rule, Imitation, REINFORCE, PPO, GDPL, MDRG, HDSA, LaRL
 - Simulator policy: Agenda, VHUS
 - NLG: Template, SCLSTM
diff --git a/convlab2/dst/mdbt/README.md b/convlab2/dst/mdbt/README.md
deleted file mode 100755
index f240af6d30f17e2f364ec9decbfbfbf9aedd4e9e..0000000000000000000000000000000000000000
--- a/convlab2/dst/mdbt/README.md
+++ /dev/null
@@ -1,21 +0,0 @@
-# Multi-domain Belief DST
-The multi-domain belief tracker (MDBT) is a belief tracking model that
-fully utilizes semantic similarity between dialogue utterances and the
-ontology terms, proposed by [Ramadan et al., 2018](https://www.aclweb.org/anthology/P18-2069).
-
-## Package Structure
-We adapted the original code into a flexible module that can be easily
-imported into a pipeline dialog framework. The dataset-independent
-implementation of MDBT is in ```convlab2/dst/mdbt```, and the Multiwoz
-implementation is in ```convlab2/dst/mdbt/multiwoz```.
-
-## Run the Code
-The framework will automatically download the pre-trained models and data
-before running. If the automatic download fails, download them manually
-from [here](https://drive.google.com/open?id=1k6wbabIlYju7kR0Zr4aVXwE_fsGBOtdw)
-and put the ```word-vectors```, ```models``` and ```data``` directories under
-```convlab2/dst/mdbt/multiwoz/configs```.
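-
-As a rough usage sketch (a hypothetical snippet: the ```MultiWozMDBT``` wrapper
-name and its default constructor are assumptions, not an official example):
-
-```python
-from convlab2.dst.mdbt.multiwoz import MultiWozMDBT  # assumed wrapper class
-
-dst = MultiWozMDBT()  # loads the ontology, word vectors and TF model
-dst.init_session()
-dst.state['history'].append(['sys', ''])
-dst.state['history'].append(['user', 'I want a cheap restaurant in the centre'])
-state = dst.update('I want a cheap restaurant in the centre')
-```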
-## Performance
-The joint accuracy of our pre-trained MDBT model is 13.9%.
-You can train the model yourself for better performance.
diff --git a/convlab2/dst/mdbt/__init__.py b/convlab2/dst/mdbt/__init__.py
deleted file mode 100755
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/convlab2/dst/mdbt/mdbt.py b/convlab2/dst/mdbt/mdbt.py
deleted file mode 100755
index ee99a7f6d7a5b296f7dbf90e496988209efc20f3..0000000000000000000000000000000000000000
--- a/convlab2/dst/mdbt/mdbt.py
+++ /dev/null
@@ -1,218 +0,0 @@
-import copy
-import json
-import os
-
-import tensorflow as tf
-
-from convlab2.dst.mdbt.mdbt_util import model_definition, \
-    track_dialogue, generate_batch, process_history
-from convlab2.dst.rule.multiwoz import normalize_value
-from convlab2.util.multiwoz.state import default_state
-from convlab2.dst.dst import DST
-from convlab2.util.multiwoz.multiwoz_slot_trans import REF_SYS_DA, REF_USR_DA
-
-from os.path import dirname
-
-train_batch_size = 1
-batches_per_eval = 10
-no_epochs = 600
-device = "gpu"
-start_batch = 0
-
-
-class MDBT(DST):
-    """
-    A multi-domain belief tracker, adapted from https://github.com/osmanio2/multi-domain-belief-tracking.
-    """
-    def __init__(self, ontology_vectors, ontology, slots, data_dir):
-        DST.__init__(self)
-        # data profile
-        self.data_dir = data_dir
-        self.validation_url = os.path.join(self.data_dir, 'data/validate.json')
-        self.word_vectors_url = os.path.join(self.data_dir, 'word-vectors/paragram_300_sl999.txt')
-        self.training_url = os.path.join(self.data_dir, 'data/train.json')
-        self.ontology_url = os.path.join(self.data_dir, 'data/ontology.json')
-        self.testing_url = os.path.join(self.data_dir, 'data/test.json')
-        self.model_url = os.path.join(self.data_dir, 'models/model-1')
-        self.graph_url = os.path.join(self.data_dir, 'graphs/graph-1')
-        self.results_url = os.path.join(self.data_dir, 'results/log-1.txt')
-        self.kb_url = os.path.join(self.data_dir, 'data/')  # not used
-        self.train_model_url = os.path.join(self.data_dir, 'train_models/model-1')
-        self.train_graph_url = os.path.join(self.data_dir, 'train_graph/graph-1')
-
-        self.model_variables = model_definition(ontology_vectors, len(ontology), slots, num_hidden=None,
-                                                bidir=True, net_type=None, test=True, dev='cpu')
-        self.state = default_state()
-        _config = tf.ConfigProto()
-        _config.gpu_options.allow_growth = True
-        _config.allow_soft_placement = True
-        self.sess = tf.Session(config=_config)
-        self.param_restored = False
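-        # det_dic maps lower-cased slot names and their surface forms to a
-        # 'Slot-Domain' key; detect_requestable_slots() scans user utterances
-        # for these substrings to fill the request state.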
-        self.det_dic = {}
-        for domain, dic in REF_USR_DA.items():
-            for key, value in dic.items():
-                assert '-' not in key
-                self.det_dic[key.lower()] = key + '-' + domain
-                self.det_dic[value.lower()] = key + '-' + domain
-
-        def parent_dir(path, time=1):
-            for _ in range(time):
-                path = os.path.dirname(path)
-            return path
-        root_dir = parent_dir(os.path.abspath(__file__), 4)
-        self.value_dict = json.load(open(os.path.join(root_dir, 'data/multiwoz/value_dict.json')))
-
-    def init_session(self):
-        self.state = default_state()
-        if not self.param_restored:
-            self.restore()
-
-    def restore(self):
-        self.__restore_model(self.sess, tf.train.Saver())
-
-    def update_batch(self, batch_action):
-        pass
-
-    def update(self, user_act=None):
-        """Update the dialog state."""
-        if not isinstance(user_act, str):
-            raise Exception('Expected user_act to be <class \'str\'> type, but got {}.'.format(type(user_act)))
-        prev_state = copy.deepcopy(self.state)
-        if not os.path.exists(os.path.join(self.data_dir, "results")):
-            os.makedirs(os.path.join(self.data_dir, "results"))
-
-        global train_batch_size
-
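-        # unpack the graph nodes built by model_definition(); at inference
-        # time only the input placeholders, predictions and y are used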
-        model_variables = self.model_variables
-        (user, sys_res, no_turns, user_uttr_len, sys_uttr_len, labels, domain_labels, domain_accuracy,
-         slot_accuracy, value_accuracy, value_f1, train_step, keep_prob, predictions,
-         true_predictions, [y, _]) = model_variables
-
-        # Note: the following line is commented out since the first history
-        # node is already present (see the assertion below)
-        # prev_state['history'] = [['sys', 'null']] if len(prev_state['history']) == 0 else prev_state['history']
-        assert len(prev_state['history']) > 0
-        first_turn = prev_state['history'][0]
-        if first_turn[0] != 'sys':
-            prev_state['history'] = [['sys', '']] + prev_state['history']
-        actual_history = []
-        assert len(prev_state['history']) % 2 == 0
-        for name, utt in prev_state['history']:
-            if not utt:
-                utt = 'null'
-            if len(actual_history) == 0 or len(actual_history[-1]) == 2:
-                actual_history.append([utt])
-            else:
-                actual_history[-1].append(utt)
-        # actual_history[-1].append(user_act)
-        # actual_history = self.normalize_history(actual_history)
-        # if len(actual_history) == 0:
-        #     actual_history = [['', user_act if len(user_act)>0 else 'fake user act']]
-        fake_dialogue = {}
-        turn_no = 0
-        for _sys, _user in actual_history:
-            turn = {}
-            turn['system'] = _sys
-            fake_user = {}
-            fake_user['text'] = _user
-            fake_user['belief_state'] = default_state()['belief_state']
-            turn['user'] = fake_user
-            key = str(turn_no)
-            fake_dialogue[key] = turn
-            turn_no += 1
-        context, actual_context = process_history([fake_dialogue], self.word_vectors, self.ontology)
-        batch_user, batch_sys, batch_labels, batch_domain_labels, batch_user_uttr_len, batch_sys_uttr_len, \
-                batch_no_turns = generate_batch(context, 0, 1, len(self.ontology))  # old feature
-
-        # run model
-        [pred, y_pred] = self.sess.run(
-            [predictions, y],
-            feed_dict={user: batch_user, sys_res: batch_sys,
-                       labels: batch_labels,
-                       domain_labels: batch_domain_labels,
-                       user_uttr_len: batch_user_uttr_len,
-                       sys_uttr_len: batch_sys_uttr_len,
-                       no_turns: batch_no_turns,
-                       keep_prob: 1.0})
-
-        # convert to str output
-        dialgs, _, _ = track_dialogue(actual_context, self.ontology, pred, y_pred)
-        assert len(dialgs) >= 1
-        last_turn = dialgs[0][-1]
-        predictions = last_turn['prediction']
-        new_belief_state = copy.deepcopy(prev_state['belief_state'])
-
-        # update belief state
-        for item in predictions:
-            item = item.lower()
-            domain, slot, value = item.strip().split('-')
-            value = value[::-1].split(':', 1)[1][::-1]
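-            # e.g. 'restaurant-price range-cheap: 0.95' splits into domain
-            # 'restaurant', slot 'price range' and value 'cheap: 0.95'; the
-            # reverse/split/reverse above drops the trailing ': score'.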
-            if slot == 'price range':
-                slot = 'pricerange'
-            if slot not in ['name', 'book']:
-                if domain not in new_belief_state:
-                    raise Exception('Error: domain <{}> not in belief state'.format(domain))
-                slot = REF_SYS_DA[domain.capitalize()].get(slot, slot)
-                assert 'semi' in new_belief_state[domain]
-                assert 'book' in new_belief_state[domain]
-                if 'book' in slot:
-                    assert slot.startswith('book ')
-                    slot = slot.strip().split()[1]
-                if slot == 'arriveby':
-                    slot = 'arriveBy'
-                elif slot == 'leaveat':
-                    slot = 'leaveAt'
-                domain_dic = new_belief_state[domain]
-                if slot in domain_dic['semi']:
-                    new_belief_state[domain]['semi'][slot] = normalize_value(self.value_dict, domain, slot, value)
-                elif slot in domain_dic['book']:
-                    new_belief_state[domain]['book'][slot] = value
-                elif slot.lower() in domain_dic['book']:
-                    new_belief_state[domain]['book'][slot.lower()] = value
-                else:
-                    with open('mdbt_unknown_slot.log', 'a+') as f:
-                        f.write('unknown slot name <{}> with value <{}> of domain <{}>\nitem: {}\n\n'.format(slot, value,
-                                domain, item))
-        new_request_state = copy.deepcopy(prev_state['request_state'])
-        # update request_state
-        user_request_slot = self.detect_requestable_slots(user_act)
-        for domain in user_request_slot:
-            for key in user_request_slot[domain]:
-                if domain not in new_request_state:
-                    new_request_state[domain] = {}
-                if key not in new_request_state[domain]:
-                    new_request_state[domain][key] = user_request_slot[domain][key]
-        # update state
-        new_state = copy.deepcopy(dict(prev_state))
-        new_state['belief_state'] = new_belief_state
-        new_state['request_state'] = new_request_state
-        self.state = new_state
-        return self.state
-
-    def normalize_history(self, history):
-        """Replace zero-length history."""
-        for i in range(len(history)):
-            a, b = history[i]
-            if len(a) == 0:
-                history[i][0] = 'sys'
-            if len(b) == 0:
-                history[i][1] = 'user'
-        return history
-
-    def detect_requestable_slots(self, observation):
-        result = {}
-        observation = observation.lower()
-        _observation = ' {} '.format(observation)
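-        # pad both strings with spaces so matches are on whole tokens rather
-        # than substrings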
-        for value in self.det_dic.keys():
-            _value = ' {} '.format(value.strip())
-            if _value in _observation:
-                key, domain = self.det_dic[value].split('-')
-                if domain not in result:
-                    result[domain] = {}
-                result[domain][key] = 0
-        return result
-
-    def __restore_model(self, sess, saver):
-        saver.restore(sess, self.model_url)
-        print('Loading trained MDBT model from ', self.model_url)
-        self.param_restored = True
-
diff --git a/convlab2/dst/mdbt/mdbt_util.py b/convlab2/dst/mdbt/mdbt_util.py
deleted file mode 100755
index b693ad473728bf43c8bda51e9338c22960ee0f6a..0000000000000000000000000000000000000000
--- a/convlab2/dst/mdbt/mdbt_util.py
+++ /dev/null
@@ -1,1058 +0,0 @@
-# -*- coding: utf-8 -*-
-
-import json
-import math
-import os
-import time
-from collections import OrderedDict
-from copy import deepcopy
-
-import numpy as np
-import tensorflow as tf
-from tensorflow.python.client import device_lib
-
-DATA_PATH = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))), 'data/mdbt')
-VALIDATION_URL = os.path.join(DATA_PATH, "data/validate.json")
-WORD_VECTORS_URL = os.path.join(DATA_PATH, "word-vectors/paragram_300_sl999.txt")
-TRAINING_URL = os.path.join(DATA_PATH, "data/train.json")
-ONTOLOGY_URL = os.path.join(DATA_PATH, "data/ontology.json")
-TESTING_URL = os.path.join(DATA_PATH, "data/test.json")
-MODEL_URL = os.path.join(DATA_PATH, "models/model-1")
-GRAPH_URL = os.path.join(DATA_PATH, "graphs/graph-1")
-RESULTS_URL = os.path.join(DATA_PATH, "results/log-1.txt")
-
-#ROOT_URL = '../../data/mdbt'
-
-#VALIDATION_URL = "./data/mdbt/data/validate.json"
-#WORD_VECTORS_URL = "./data/mdbt/word-vectors/paragram_300_sl999.txt"
-#TRAINING_URL = "./data/mdbt/data/train.json"
-#ONTOLOGY_URL = "./data/mdbt/data/ontology.json"
-#TESTING_URL = "./data/mdbt/data/test.json"
-#MODEL_URL = "./data/mdbt/models/model-1"
-#GRAPH_URL = "./data/mdbt/graphs/graph-1"
-#RESULTS_URL = "./data/mdbt/results/log-1.txt"
-
-
-domains = ['restaurant', 'hotel', 'attraction', 'train', 'taxi']
-
-train_batch_size = 64
-batches_per_eval = 10
-no_epochs = 600
-device = "gpu"
-start_batch = 0
-
-num_slots = 0
-
-booking_slots = {}
-
-network = "lstm"
-bidirect = True
-lstm_num_hidden = 50
-max_utterance_length = 50
-vector_dimension = 300
-max_no_turns = 22
-
-
-# rnnrollout.py
-def get_available_devs():
-    local_device_protos = device_lib.list_local_devices()
-    return [x.name for x in local_device_protos if x.device_type == 'GPU']
-
-
-class GRU(tf.nn.rnn_cell.RNNCell):
-    '''
-    A GRU-style recurrent cell that unrolls the network through time,
-    combining the current and previous belief states.
-    '''
-
-    def __init__(self, W_h, U_h, M_h, W_m, U_m, label_size, reuse=None, binary_output=False):
-        super(GRU, self).__init__(_reuse=reuse)
-        self.label_size = label_size
-        self.M_h = M_h
-        self.W_m = W_m
-        self.U_m = U_m
-        self.U_h = U_h
-        self.W_h = W_h
-        self.binary_output = binary_output
-
-    def __call__(self, inputs, state, scope=None):
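-        # the carried state packs [previous output (label_size) | memory]:
-        # split them, compute the new memory, and emit logits combining the
-        # input with the previous output and the memory.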
-        state_only = tf.slice(state, [0, self.label_size], [-1, -1])
-        output_only = tf.slice(state, [0, 0], [-1, self.label_size])
-        new_state = tf.tanh(tf.matmul(inputs, self.U_m) + tf.matmul(state_only, self.W_m))
-        output = tf.matmul(inputs, self.U_h) + tf.matmul(output_only, self.W_h) + tf.matmul(state_only, self.M_h)
-        if self.binary_output:
-            output_ = tf.sigmoid(output)
-        else:
-            output_ = tf.nn.softmax(output)
-        state = tf.concat([output_, new_state], 1)
-        return output, state
-
-    @property
-    def state_size(self):
-        return tf.shape(self.W_m)[0] + self.label_size
-
-    @property
-    def output_size(self):
-        return tf.shape(self.W_h)[0]
-
-
-def define_CNN_model(utter, num_filters=300, name="r"):
-    """
-    Define a CNN encoder over the utterances: parallel convolutions with
-    filter sizes 1, 2 and 3, each followed by max-pooling over time.
-    """
-    filter_sizes = [1, 2, 3]
-    W = []
-    b = []
-    for i, filter_size in enumerate(filter_sizes):
-        filter_shape = [filter_size, vector_dimension, 1, num_filters]
-        W.append(tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="F_W"))
-        b.append(tf.Variable(tf.constant(0.1, shape=[num_filters]), name="F_b"))
-
-    utter = tf.reshape(utter, [-1, max_utterance_length, vector_dimension])
-
-    hidden_representation = tf.zeros([num_filters], tf.float32)
-
-    pooled_outputs = []
-    for i, filter_size in enumerate(filter_sizes):
-        # with tf.name_scope("conv-maxpool-%s" % filter_size):
-        # Convolution Layer
-        conv = tf.nn.conv2d(
-            tf.expand_dims(utter, -1),
-            W[i],
-            strides=[1, 1, 1, 1],
-            padding="VALID",
-            name="conv_R")
-        # Apply nonlinearity
-        h = tf.nn.relu(tf.nn.bias_add(conv, b[i]), name="relu")
-        # Maxpooling over the outputs
-        pooled = tf.nn.max_pool(
-            h,
-            ksize=[1, max_utterance_length - filter_size + 1, 1, 1],
-            strides=[1, 1, 1, 1],
-            padding='VALID',
-            name="r_")
-        pooled_outputs.append(pooled)
-
-        hidden_representation += tf.reshape(tf.concat(pooled, 3), [-1, num_filters])
-
-    hidden_representation = tf.reshape(hidden_representation, [-1, max_no_turns, num_filters], name=name)
-
-    return hidden_representation
-
-
-def lstm_model(text_input, utterance_length, num_hidden, name, net_type, bidir):
-    '''Define an LSTM model that runs across the user input and system act
-
-    :param text_input: [batch_size, max_num_turns, max_utterance_size, vector_dimension]
-    :param utterance_length: number of words in each utterance [batch_size, max_num_turns, 1]
-    :param num_hidden: -- int --
-    :param name: The name of lstm network
-    :param net_type: type of the network ("lstm" or "gru" or "rnn")
-    :param bidir: use a bidirectional network -- bool --
-    :return: output at each state [batch_size, max_num_turns, max_utterance_size, num_hidden],
-     output of the final state [batch_size, max_num_turns, num_hidden]
-    '''
-    with tf.variable_scope(name):
-
-        text_input = tf.reshape(text_input, [-1, max_utterance_length, vector_dimension])
-        utterance_length = tf.reshape(utterance_length, [-1])
-
-        def rnn(net_typ, num_units):
-            if net_typ == "lstm":
-                return tf.nn.rnn_cell.LSTMCell(num_units)
-            elif net_typ == "gru":
-                return tf.nn.rnn_cell.GRUCell(num_units)
-            else:
-                return tf.nn.rnn_cell.BasicRNNCell(num_units)
-
-        if bidir:
-            assert num_hidden % 2 == 0
-            rev_cell = rnn(net_type, num_hidden // 2)
-            cell = rnn(net_type, num_hidden // 2)
-            _, lspd = tf.nn.bidirectional_dynamic_rnn(cell, rev_cell, text_input, dtype=tf.float32,
-                                                      sequence_length=utterance_length)
-            if net_type == "lstm":
-                lspd = (lspd[0].h, lspd[1].h)
-
-            last_state = tf.concat(lspd, 1)
-        else:
-            cell = rnn(net_type, num_hidden)
-            _, last_state = tf.nn.dynamic_rnn(cell, text_input, dtype=tf.float32, sequence_length=utterance_length)
-            if net_type == "lstm":
-                last_state = last_state.h
-
-        last_state = tf.reshape(last_state, [-1, max_no_turns, num_hidden])
-
-        return last_state
-
-
-def model_definition(ontology, num_slots, slots, num_hidden=None, net_type=None, bidir=None, test=False, dev=None):
-    '''Create the neural belief tracker model. It encodes the user and system input, then uses the ontology to
-    decode the encodings, detecting whether each domain-slot-value class is mentioned
-    
-    :param ontology: numpy array of the embedded vectors of the ontology [num_slots, 3*vector_dimension]
-    :param num_slots: number of ontology classes --int--
-    :param slots: indices of the values of each slot list of lists of ints
-    :param num_hidden: Number of hidden units or dimension of the hidden space
-    :param net_type: The type of the encoder network (cnn, lstm, gru, rnn, etc.)
-    :param bidir: Whether recurrent networks should be bidirectional
-    :param test: This is testing mode (no back-propagation)
-    :param dev: Device to run the model on (cpu or gpu)
-    :return: All input placeholders, output metrics (precision, recall, f1-score) and the train step
-    '''
-    # print('model definition')
-    # print(ontology, num_slots, slots, num_hidden, net_type, bidir, test, dev)
-    global lstm_num_hidden
-
-    if not net_type:
-        net_type = network
-    else:
-        print("\tMDBT: Setting up the type of the network to {}..............................".format(net_type))
-    if bidir is None:
-        bidir = bidirect
-    if num_hidden:
-        lstm_num_hidden = num_hidden
-        print("\tMDBT: Setting up type of the dimension of the hidden space to {}.........................".format(num_hidden))
-
-    ontology = tf.constant(ontology, dtype=tf.float32)
-
-    # ----------------------------------- Define the input variables --------------------------------------------------
-    user_input = tf.placeholder(tf.float32, [None, max_no_turns, max_utterance_length, vector_dimension], name="user")
-    system_input = tf.placeholder(tf.float32, [None, max_no_turns, max_utterance_length, vector_dimension], name="sys")
-    num_turns = tf.placeholder(tf.int32, [None], name="num_turns")
-    user_utterance_lengths = tf.placeholder(tf.int32, [None, max_no_turns], name="user_sen_len")
-    sys_utterance_lengths = tf.placeholder(tf.int32, [None, max_no_turns], name="sys_sen_len")
-    labels = tf.placeholder(tf.float32, [None, max_no_turns, num_slots], name="labels")
-    domain_labels = tf.placeholder(tf.float32, [None, max_no_turns, num_slots], name="domain_labels")
-    # dropout placeholder, 0.5 for training, 1.0 for validation/testing:
-    keep_prob = tf.placeholder("float")
-
-    # ------------------------------------ Create the Encoder networks ------------------------------------------------
-    devs = ['/device:CPU:0']
-    if dev == 'gpu':
-        devs = get_available_devs()
-
-    if net_type == "cnn":
-        with tf.device(devs[1 % len(devs)]):
-            # Encode the domain of the user input using a CNN network
-            usr_dom_en = define_CNN_model(user_input, num_filters=lstm_num_hidden, name="h_u_d")
-            # Encode the domain of the system act using a CNN network
-            sys_dom_en = define_CNN_model(system_input, num_filters=lstm_num_hidden, name="h_s_d")
-
-        with tf.device(devs[2 % len(devs)]):
-            # Encode the slot of the user input using a CNN network
-            usr_slot_en = define_CNN_model(user_input, num_filters=lstm_num_hidden, name="h_u_s")
-            # Encode the slot of the system act using a CNN network
-            sys_slot_en = define_CNN_model(system_input, num_filters=lstm_num_hidden, name="h_s_s")
-            # Encode the value of the user input using a CNN network
-            usr_val_en = define_CNN_model(user_input, num_filters=lstm_num_hidden, name="h_u_v")
-            # Encode the value of the system act using a CNN network
-            sys_val_en = define_CNN_model(system_input, num_filters=lstm_num_hidden, name="h_s_v")
-            # Encode the user using a CNN network
-            usr_en = define_CNN_model(user_input, num_filters=lstm_num_hidden // 5, name="h_u")
-
-    else:
-
-        with tf.device(devs[1 % len(devs)]):
-            # Encode the domain of the user input using a LSTM network
-            usr_dom_en = lstm_model(user_input, user_utterance_lengths, lstm_num_hidden, "h_u_d", net_type, bidir)
-            usr_dom_en = tf.nn.dropout(usr_dom_en, keep_prob, name="h_u_d_out")
-            # Encode the domain of the system act using a LSTM network
-            sys_dom_en = lstm_model(system_input, sys_utterance_lengths, lstm_num_hidden, "h_s_d", net_type, bidir)
-            sys_dom_en = tf.nn.dropout(sys_dom_en, keep_prob, name="h_s_d_out")
-
-        with tf.device(devs[2 % len(devs)]):
-            # Encode the slot of the user input using a LSTM network
-            usr_slot_en = lstm_model(user_input, user_utterance_lengths, lstm_num_hidden, "h_u_s", net_type, bidir)
-            usr_slot_en = tf.nn.dropout(usr_slot_en, keep_prob, name="h_u_s_out")
-            # Encode the slot of the system act using a LSTM network
-            sys_slot_en = lstm_model(system_input, sys_utterance_lengths, lstm_num_hidden, "h_s_s", net_type, bidir)
-            sys_slot_en = tf.nn.dropout(sys_slot_en, keep_prob, name="h_s_s_out")
-            # Encode the value of the user input using a LSTM network
-            usr_val_en = lstm_model(user_input, user_utterance_lengths, lstm_num_hidden, "h_u_v", net_type, bidir)
-            usr_val_en = tf.nn.dropout(usr_val_en, keep_prob, name="h_u_v_out")
-            # Encode the value of the system act using a LSTM network
-            sys_val_en = lstm_model(system_input, sys_utterance_lengths, lstm_num_hidden, "h_s_v", net_type, bidir)
-            sys_val_en = tf.nn.dropout(sys_val_en, keep_prob, name="h_s_v_out")
-            # Encode the user using a LSTM network
-            usr_en = lstm_model(user_input, user_utterance_lengths, lstm_num_hidden // 5, "h_u", net_type, bidir)
-            usr_en = tf.nn.dropout(usr_en, keep_prob, name="h_u_out")
-
-    with tf.device(devs[1 % len(devs)]):
-        usr_dom_en = tf.tile(tf.expand_dims(usr_dom_en, axis=2), [1, 1, num_slots, 1], name="h_u_d")
-        sys_dom_en = tf.tile(tf.expand_dims(sys_dom_en, axis=2), [1, 1, num_slots, 1], name="h_s_d")
-    with tf.device(devs[2 % len(devs)]):
-        usr_slot_en = tf.tile(tf.expand_dims(usr_slot_en, axis=2), [1, 1, num_slots, 1], name="h_u_s")
-        sys_slot_en = tf.tile(tf.expand_dims(sys_slot_en, axis=2), [1, 1, num_slots, 1], name="h_s_s")
-        usr_val_en = tf.tile(tf.expand_dims(usr_val_en, axis=2), [1, 1, num_slots, 1], name="h_u_v")
-        sys_val_en = tf.tile(tf.expand_dims(sys_val_en, axis=2), [1, 1, num_slots, 1], name="h_s_v")
-        usr_en = tf.tile(tf.expand_dims(usr_en, axis=2), [1, 1, num_slots, 1], name="h_u")
-
-    # All encoding vectors have size [batch_size, max_turns, num_slots, num_hidden]
-
-    # Matrix that transforms the ontology from the embedding space to the hidden representation
-    with tf.device(devs[1 % len(devs)]):
-        W_onto_domain = tf.Variable(tf.random_normal([vector_dimension, lstm_num_hidden]), name="W_onto_domain")
-        W_onto_slot = tf.Variable(tf.random_normal([vector_dimension, lstm_num_hidden]), name="W_onto_slot")
-        W_onto_value = tf.Variable(tf.random_normal([vector_dimension, lstm_num_hidden]), name="W_onto_value")
-
-        # And biases
-        b_onto_domain = tf.Variable(tf.zeros([lstm_num_hidden]), name="b_onto_domain")
-        b_onto_slot = tf.Variable(tf.zeros([lstm_num_hidden]), name="b_onto_slot")
-        b_onto_value = tf.Variable(tf.zeros([lstm_num_hidden]), name="b_onto_value")
-
-        # Apply the transformation from the embedding space of the ontology to the hidden space
-        domain_vec = tf.slice(ontology, begin=[0, 0], size=[-1, vector_dimension])
-        slot_vec = tf.slice(ontology, begin=[0, vector_dimension], size=[-1, vector_dimension])
-        value_vec = tf.slice(ontology, begin=[0, 2 * vector_dimension], size=[-1, vector_dimension])
-        # Each [num_slots, vector_dimension]
-        d = tf.nn.dropout(tf.tanh(tf.matmul(domain_vec, W_onto_domain) + b_onto_domain), keep_prob, name="d")
-        s = tf.nn.dropout(tf.tanh(tf.matmul(slot_vec, W_onto_slot) + b_onto_slot), keep_prob, name="s")
-        v = tf.nn.dropout(tf.tanh(tf.matmul(value_vec, W_onto_value) + b_onto_value), keep_prob, name="v")
-        # Each [num_slots, num_hidden]
-
-        # Apply the comparison mechanism for all the user and system utterances and ontology values
-        domain_user = tf.multiply(usr_dom_en, d, name="domain_user")
-        domain_sys = tf.multiply(sys_dom_en, d, name="domain_sys")
-        slot_user = tf.multiply(usr_slot_en, s, name="slot_user")
-        slot_sys = tf.multiply(sys_slot_en, s, name="slot_sys")
-        value_user = tf.multiply(usr_val_en, v, name="value_user")
-        value_sys = tf.multiply(sys_val_en, v, name="value_sys")
-        # All of size [batch_size, max_turns, num_slots, num_hidden]
-
-        # -------------- Domain Detection -------------------------------------------------------------------------
-        W_domain = tf.Variable(tf.random_normal([2 * lstm_num_hidden]), name="W_domain")
-        b_domain = tf.Variable(tf.zeros([1]), name="b_domain")
-        y_d = tf.sigmoid(tf.reduce_sum(tf.multiply(tf.concat([domain_user, domain_sys], axis=3), W_domain), axis=3)
-                         + b_domain)  # [batch_size, max_turns, num_slots]
-
-    # -------- Run through each of the 3 cases (inform, request, confirm) and decode the inferred state ---------
-    # 1 Inform (User is informing the system about the goal, e.g. "I am looking for a place to stay in the centre")
-    W_inform = tf.Variable(tf.random_normal([2 * lstm_num_hidden]), name="W_inform")
-    b_inform = tf.Variable(tf.random_normal([1]), name="b_inform")
-    inform = tf.add(tf.reduce_sum(tf.multiply(tf.concat([slot_user, value_user], axis=3), W_inform), axis=3), b_inform,
-                    name="inform")  # [batch_size, max_turns, num_slots]
-
-    # 2 Request (The system is requesting information from the user, e.g. "what type of food would you like?")
-    with tf.device(devs[2 % len(devs)]):
-        W_request = tf.Variable(tf.random_normal([2 * lstm_num_hidden]), name="W_request")
-        b_request = tf.Variable(tf.random_normal([1]), name="b_request")
-        request = tf.add(tf.reduce_sum(tf.multiply(tf.concat([slot_sys, value_user], axis=3), W_request), axis=3),
-                         b_request, name="request")  # [batch_size, max_turns, num_slots]
-
-    # 3 Confirm (The system is confirming values given by the user, e.g. "How about Turkish food?")
-    with tf.device(devs[3 % len(devs)]):
-        size = 2 * lstm_num_hidden + lstm_num_hidden // 5
-        W_confirm = tf.Variable(tf.random_normal([size]), name="W_confirm")
-        b_confirm = tf.Variable(tf.random_normal([1]), name="b_confirm")
-        confirm = tf.add(
-            tf.reduce_sum(tf.multiply(tf.concat([slot_sys, value_sys, usr_en], axis=3), W_confirm), axis=3),
-            b_confirm, name="confirm")  # [batch_size, max_turns, num_slots]
-
-    output = inform + request + confirm
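-    # Each head scores every ontology entry; their sum gives the per-value
-    # logits that the belief-update GRU below unrolls over the turns.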
-
-    # -------------------- Adding the belief update RNN with memory cell (Taken from previous model) -------------------
-    with tf.device(devs[2 % len(devs)]):
-        domain_memory = tf.Variable(tf.random_normal([1, 1]), name="domain_memory")
-        domain_current = tf.Variable(tf.random_normal([1, 1]), name="domain_current")
-        domain_M_h = tf.Variable(tf.random_normal([1, 1]), name="domain_M_h")
-        domain_W_m = tf.Variable(tf.random_normal([1, 1], name="domain_W_m"))
-        domain_U_m = tf.Variable(tf.random_normal([1, 1]), name="domain_U_m")
-    a_memory = tf.Variable(tf.random_normal([1, 1]), name="a_memory")
-    b_memory = tf.Variable(tf.random_normal([1, 1]), name="b_memory")
-    a_current = tf.Variable(tf.random_normal([1, 1]), name="a_current")
-    b_current = tf.Variable(tf.random_normal([1, 1]), name="b_current")
-    M_h_a = tf.Variable(tf.random_normal([1, 1]), name="M_h_a")
-    M_h_b = tf.Variable(tf.random_normal([1, 1]), name="M_h_b")
-    W_m_a = tf.Variable(tf.random_normal([1, 1]), name="W_m_a")
-    W_m_b = tf.Variable(tf.random_normal([1, 1]), name="W_m_b")
-    U_m_a = tf.Variable(tf.random_normal([1, 1]), name="U_m_a")
-    U_m_b = tf.Variable(tf.random_normal([1, 1]), name="U_m_b")
-
-    # ---------------------------------- Unroll the domain over time --------------------------------------------------
-    with tf.device(devs[1 % len(devs)]):
-        cell = GRU(domain_memory * tf.diag(tf.ones(num_slots)), domain_current * tf.diag(tf.ones(num_slots)),
-                   domain_M_h * tf.diag(tf.ones(num_slots)), domain_W_m * tf.diag(tf.ones(num_slots)),
-                   domain_U_m * tf.diag(tf.ones(num_slots)), num_slots,
-                   binary_output=True)
-
-        y_d, _ = tf.nn.dynamic_rnn(cell, y_d, sequence_length=num_turns, dtype=tf.float32)
-
-        domain_loss = tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(labels=domain_labels, logits=y_d), axis=2,
-                                    name="domain_loss") / (num_slots / len(slots))
-
-        y_d = tf.sigmoid(y_d)
-
-    with tf.device(devs[0 % len(devs)]):
-
-        loss = [None for _ in range(len(slots))]
-        slot_pred = [None for _ in range(len(slots))]
-        slot_label = [None for _ in range(len(slots))]
-        val_pred = [None for _ in range(len(slots))]
-        val_label = [None for _ in range(len(slots))]
-        y = [None for _ in range(len(slots))]
-        y_pred = [None for _ in range(len(slots))]
-        for i in range(len(slots)):
-
-            num_values = slots[i] + 1  # For the none case
-            size = sum(slots[:i + 1]) - slots[i]
-            if test:
-                domain_output = tf.slice(tf.round(y_d), begin=[0, 0, size], size=[-1, -1, slots[i]])
-            else:
-                domain_output = tf.slice(domain_labels, begin=[0, 0, size], size=[-1, -1, slots[i]])
-            max_val = tf.expand_dims(tf.reduce_max(domain_output, axis=2), axis=2)
-            # tf.assert_less_equal(max_val, 1.0)
-            # tf.assert_equal(tf.round(max_val), max_val)
-            domain_output = tf.concat([tf.zeros(tf.shape(domain_output)), 1 - max_val], axis=2)
-
-            slot_output = tf.slice(output, begin=[0, 0, size], size=[-1, -1, slots[i]])
-            slot_output = tf.concat([slot_output, tf.zeros([tf.shape(output)[0], max_no_turns, 1])], axis=2)
-
-            labels_output = tf.slice(labels, begin=[0, 0, size], size=[-1, -1, slots[i]])
-            max_val = tf.expand_dims(tf.reduce_max(labels_output, axis=2), axis=2)
-            # tf.assert_less_equal(max_val, 1.0)
-            # tf.assert_equal(tf.round(max_val), max_val)
-            slot_label[i] = max_val
-            # [Batch_size, max_turns, 1]
-            labels_output = tf.argmax(tf.concat([labels_output, 1 - max_val], axis=2), axis=2)
-            # [Batch_size, max_turns]
-            val_label[i] = tf.cast(tf.expand_dims(labels_output, axis=2), dtype="float")
-            # [Batch_size, max_turns, 1]
-
-            diag_memory = a_memory * tf.diag(tf.ones(num_values))
-            non_diag_memory = tf.matrix_set_diag(b_memory * tf.ones([num_values, num_values]), tf.zeros(num_values))
-            W_memory = diag_memory + non_diag_memory
-
-            diag_current = a_current * tf.diag(tf.ones(num_values))
-            non_diag_current = tf.matrix_set_diag(b_current * tf.ones([num_values, num_values]), tf.zeros(num_values))
-            W_current = diag_current + non_diag_current
-
-            diag_M_h = M_h_a * tf.diag(tf.ones(num_values))
-            non_diag_M_h = tf.matrix_set_diag(M_h_b * tf.ones([num_values, num_values]), tf.zeros(num_values))
-            M_h = diag_M_h + non_diag_M_h
-
-            diag_U_m = U_m_a * tf.diag(tf.ones(num_values))
-            non_diag_U_m = tf.matrix_set_diag(U_m_b * tf.ones([num_values, num_values]), tf.zeros(num_values))
-            U_m = diag_U_m + non_diag_U_m
-
-            diag_W_m = W_m_a * tf.diag(tf.ones(num_values))
-            non_diag_W_m = tf.matrix_set_diag(W_m_b * tf.ones([num_values, num_values]), tf.zeros(num_values))
-            W_m = diag_W_m + non_diag_W_m
-
-            cell = GRU(W_memory, W_current, M_h, W_m, U_m, num_values)
-            y_predict, _ = tf.nn.dynamic_rnn(cell, slot_output, sequence_length=num_turns, dtype=tf.float32)
-
-            y_predict = y_predict + 1000000.0 * domain_output
-            # [Batch_size, max_turns, num_values]
-
-            y[i] = tf.nn.softmax(y_predict)
-            val_pred[i] = tf.cast(tf.expand_dims(tf.argmax(y[i], axis=2), axis=2), dtype="float32")
-            # [Batch_size, max_turns, 1]
-            y_pred[i] = tf.slice(tf.one_hot(tf.argmax(y[i], axis=2), dtype=tf.float32, depth=num_values),
-                                 begin=[0, 0, 0], size=[-1, -1, num_values - 1])
-            y[i] = tf.slice(y[i], begin=[0, 0, 0], size=[-1, -1, num_values - 1])
-            slot_pred[i] = tf.cast(tf.reduce_max(y_pred[i], axis=2, keep_dims=True), dtype="float32")
-            # [Batch_size, max_turns, 1]
-            loss[i] = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels_output, logits=y_predict)
-            # [Batch_size, max_turns]
-
-    # ---------------- Compute the output and the loss function (cross_entropy) and add to optimizer--------------------
-    cross_entropy = tf.add_n(loss, name="cross_entropy")
-    # Add the error from the domains
-    cross_entropy = tf.add(cross_entropy, domain_loss, name="total_loss")
-
-    y = tf.concat(y, axis=2, name="y")
-
-    mask = tf.cast(tf.sequence_mask(num_turns, maxlen=max_no_turns), dtype=tf.float32)
-    mask_extended = tf.tile(tf.expand_dims(mask, axis=2), [1, 1, num_slots])
-    cross_entropy = tf.reduce_sum(mask * cross_entropy, axis=1) / tf.cast(num_turns, dtype=tf.float32)
-
-    optimizer = tf.train.AdamOptimizer(0.001)
-    train_step = optimizer.minimize(cross_entropy, colocate_gradients_with_ops=True)
-
-    # ----------------- Get the precision, recall f1-score and accuracy -----------------------------------------------
-
-    # Domain accuracy
-    true_predictions = tf.reshape(domain_labels, [-1, num_slots])
-    predictions = tf.reshape(tf.round(y_d) * mask_extended, [-1, num_slots])
-
-    y_d = tf.reshape(y_d * mask_extended, [-1, num_slots])
-
-    _, _, _, domain_accuracy = get_metrics(predictions, true_predictions, num_turns, mask_extended, num_slots)
-
-    mask_extended_2 = tf.tile(tf.expand_dims(mask, axis=2), [1, 1, len(slots)])
-
-    # Slot accuracy
-    true_predictions = tf.reshape(tf.concat(slot_label, axis=2), [-1, len(slots)])
-    predictions = tf.reshape(tf.concat(slot_pred, axis=2) * mask_extended_2, [-1, len(slots)])
-
-    _, _, _, slot_accuracy = get_metrics(predictions, true_predictions, num_turns, mask_extended_2, len(slots))
-
-    # Value accuracy
-    if test:
-        value_accuracy = []
-        mask_extended_3 = tf.expand_dims(mask, axis=2)
-        for i in range(len(slots)):
-            true_predictions = tf.reshape(val_label[i] * mask_extended_3, [-1, 1])
-            predictions = tf.reshape(val_pred[i] * mask_extended_3, [-1, 1])
-
-            _, _, _, value_acc = get_metrics(predictions, true_predictions, num_turns, mask_extended_3, 1)
-            value_accuracy.append(value_acc)
-
-        value_accuracy = tf.stack(value_accuracy)
-    else:
-        true_predictions = tf.reshape(tf.concat(val_label, axis=2) * mask_extended_2, [-1, len(slots)])
-        predictions = tf.reshape(tf.concat(val_pred, axis=2) * mask_extended_2, [-1, len(slots)])
-
-        _, _, _, value_accuracy = get_metrics(predictions, true_predictions, num_turns, mask_extended_2, len(slots))
-
-    # Value F1 score
-    true_predictions = tf.reshape(labels, [-1, num_slots])
-    predictions = tf.reshape(tf.concat(y_pred, axis=2) * mask_extended, [-1, num_slots])
-
-    precision, recall, value_f1_score, _ = get_metrics(predictions, true_predictions, num_turns,
-                                                       mask_extended, num_slots)
-
-    y_ = tf.reshape(y, [-1, num_slots])
-
-    # -------------------- Summarise the statistics of training to be viewed in tensorboard-----------------------------
-    tf.summary.scalar("domain_accuracy", domain_accuracy)
-    tf.summary.scalar("slot_accuracy", slot_accuracy)
-    tf.summary.scalar("value_accuracy", value_accuracy)
-    tf.summary.scalar("value_f1_score", value_f1_score)
-    tf.summary.scalar("cross_entropy", tf.reduce_mean(cross_entropy))
-
-    value_f1_score = [precision, recall, value_f1_score]
-
-    return user_input, system_input, num_turns, user_utterance_lengths, sys_utterance_lengths, labels, domain_labels, \
-           domain_accuracy, slot_accuracy, value_accuracy, value_f1_score, train_step, keep_prob, predictions, \
-           true_predictions, [y_, y_d]
-
-
-def get_metrics(predictions, true_predictions, no_turns, mask, num_slots):
-    mask = tf.reshape(mask, [-1, num_slots])
-    correct_prediction = tf.cast(tf.equal(predictions, true_predictions), "float32") * mask
-
-    num_positives = tf.reduce_sum(true_predictions)
-    classified_positives = tf.reduce_sum(predictions)
-
-    true_positives = tf.multiply(predictions, true_predictions)
-    num_true_positives = tf.reduce_sum(true_positives)
-
-    recall = num_true_positives / num_positives
-    precision = num_true_positives / classified_positives
-    f_score = (2 * recall * precision) / (recall + precision)
-    accuracy = tf.reduce_sum(correct_prediction) / (tf.cast(tf.reduce_sum(no_turns), dtype="float32") * num_slots)
-
-    return precision, recall, f_score, accuracy
-
-
-
-# main.py
-def normalise_word_vectors(word_vectors, norm=1.0):
-    """
-    This method normalises the collection of word vectors provided in the word_vectors dictionary.
-    """
-    for word in word_vectors:
-        word_vectors[word] /= math.sqrt(sum(word_vectors[word]**2) + 1e-6)
-        word_vectors[word] *= norm
-    return word_vectors
-
-
-def xavier_vector(word, D=300):
-    """
-    Returns a D-dimensional vector for the word.
-
-    We hash the word so the same word maps to the same vector. Note that
-    Python 3 salts str hashes per process, so this is reproducible across
-    runs only if PYTHONHASHSEED is fixed.
-    """
-    def hash_string(_s):
-        return abs(hash(_s)) % (10 ** 8)
-    seed_value = hash_string(word)
-    np.random.seed(seed_value)
-
-    neg_value = - math.sqrt(6)/math.sqrt(D)
-    pos_value = math.sqrt(6)/math.sqrt(D)
-
-    rsample = np.random.uniform(low=neg_value, high=pos_value, size=(D,))
-    norm = np.linalg.norm(rsample)
-    rsample_normed = rsample/norm
-
-    return rsample_normed
-
-
-def load_ontology(url, word_vectors):
-    '''Load the ontology from a file
-
-    :param url: path to the ontology file
-    :param word_vectors: dictionary of the word embeddings [words, vector_dimension]
-    :return: list([domain-slot-value]), ontology embeddings [num_values, 3*vector_dimension], list of value counts per slot
-    '''
-    global num_slots
-    # print("\tMDBT: Loading the ontology....................")
-    data = json.load(open(url, mode='r', encoding='utf8'), object_pairs_hook=OrderedDict)
-    slot_values = []
-    ontology = []
-    slots_values = []
-    ontology_vectors = []
-    for slots in data:
-        [domain, slot] = slots.split('-')
-        if domain not in domains or slot == 'name':
-            continue
-        values = data[slots]
-        if "book" in slot:
-            [slot, value] = slot.split(" ")
-            booking_slots[domain+'-'+value] = values
-            values = [value]
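-            # 'book X' slots are collapsed: the tracked value becomes the
-            # sub-slot name (e.g. 'stay'), while the real values are kept in
-            # booking_slots for process_booking() to resolve later.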
-        elif slot == "departure" or slot == "destination":
-            values = ["place"]
-        domain_vec = np.sum(process_text(domain, word_vectors), axis=0)
-        if domain not in word_vectors:
-            word_vectors[domain.replace(" ", "")] = domain_vec
-        slot_vec = np.sum(process_text(slot, word_vectors), axis=0)
-        if domain+'-'+slot not in slots_values:
-            slots_values.append(domain+'-'+slot)
-        if slot not in word_vectors:
-            word_vectors[slot.replace(" ", "")] = slot_vec
-        slot_values.append(len(values))
-        for value in values:
-            ontology.append(domain + '-' + slot + '-' + value)
-            value_vec = np.sum(process_text(value, word_vectors, print_mode=True), axis=0)
-            if value not in word_vectors:
-                word_vectors[value.replace(" ", "")] = value_vec
-            ontology_vectors.append(np.concatenate((domain_vec, slot_vec, value_vec)))
-
-    num_slots = len(slots_values)
-    # print("\tMDBT: We have about {} values".format(len(ontology)))
-    # print("\tMDBT: The Full Ontology is:")
-    # print(ontology)
-    # print("\tMDBT: The slots in this ontology:")
-    # print(slots_values)
-    return ontology, np.asarray(ontology_vectors, dtype='float32'), slot_values
-
-
-def load_word_vectors(url):
-    '''Load the word embeddings from the url
-
-    :param url: path to the word vectors file
-    :return: dict of word and vector values
-    '''
-    word_vectors = {}
-    # print("Loading the word embeddings....................")
-    # print('abs path: ', os.path.abspath(url))
-    with open(url, mode='r', encoding='utf8') as f:
-        for line in f:
-            line = line.split(" ", 1)
-            key = line[0]
-            word_vectors[key] = np.fromstring(line[1], dtype="float32", sep=" ")
-    # print("\tMDBT: The vocabulary contains about {} word embeddings".format(len(word_vectors)))
-    return normalise_word_vectors(word_vectors)
-
-
-def track_dialogue(data, ontology, predictions, y):
-    overall_accuracy_total = 0
-    overall_accuracy_corr = 0
-    joint_accuracy_total = 0
-    joint_accuracy_corr = 0
-    global num_slots
-    dialogues = []
-    idx = 0
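-    # overall accuracy counts correct slots; joint accuracy counts turns
-    # whose full predicted state matches the gold state exactly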
-    for dialogue in data:
-        turn_ids = []
-        for key in dialogue.keys():
-            if key.isdigit():
-                turn_ids.append(int(key))
-            elif dialogue[key] and key not in domains:
-                continue
-        turn_ids.sort()
-        turns = []
-        previous_terms = []
-        for key in turn_ids:
-            turn = dialogue[str(key)]
-            user_input = turn['user']['text']
-            sys_res = turn['system']
-            state = turn['user']['belief_state']
-            turn_obj = dict()
-            turn_obj['user'] = user_input
-            turn_obj['system'] = sys_res
-            prediction = predictions[idx, :]
-            indices = np.argsort(prediction)[:-(int(np.sum(prediction)) + 1):-1]
-            predicted_terms = [process_booking(ontology[i], user_input, previous_terms) for i in indices]
-            previous_terms = deepcopy(predicted_terms)
-            turn_obj['prediction'] = ["{}: {}".format(predicted_terms[x], y[idx, i]) for x, i in enumerate(indices)]
-            turn_obj['True state'] = []
-            idx += 1
-            unpredicted_labels = 0
-            for domain in state:
-                if domain not in domains:
-                    continue
-                slots = state[domain]['semi']
-                for slot in slots:
-                    if slot == 'name':
-                        continue
-                    value = slots[slot]
-                    if value != '':
-                        label = domain + '-' + slot + '-' + value
-                        turn_obj['True state'].append(label)
-                        if label in predicted_terms:
-                            predicted_terms.remove(label)
-                        else:
-                            unpredicted_labels += 1
-
-            turns.append(turn_obj)
-            overall_accuracy_total += num_slots
-            overall_accuracy_corr += (num_slots - unpredicted_labels - len(predicted_terms))
-            if unpredicted_labels + len(predicted_terms) == 0:
-                joint_accuracy_corr += 1
-            joint_accuracy_total += 1
-
-        dialogues.append(turns)
-    return dialogues, overall_accuracy_corr/overall_accuracy_total, joint_accuracy_corr/joint_accuracy_total
-
-
-def process_booking(ontolog_term, usr_input, previous_terms):
-    usr_input = usr_input.lower().split()
-    domain, slot, value = ontolog_term.split('-')
-    if slot == 'book':
-        for term in previous_terms:
-            if domain+'-book '+value in term:
-                ontolog_term = term
-                break
-        else:
-            if value == 'stay' or value == 'people':
-                numbers = [int(s) for s in usr_input if s.isdigit()]
-                if len(numbers) == 1:
-                    ontolog_term = domain + '-' + slot + ' ' + value + '-' + str(numbers[0])
-                elif len(numbers) == 2:
-                    vals = {}
-                    if usr_input[usr_input.index(str(numbers[0]))+1] in ['people', 'person']:
-                        vals['people'] = str(numbers[0])
-                        vals['stay'] = str(numbers[1])
-                    else:
-                        vals['people'] = str(numbers[1])
-                        vals['stay'] = str(numbers[0])
-                    ontolog_term = domain + '-' + slot + ' ' + value + '-' + vals[value]
-            else:
-                for val in booking_slots[domain+'-'+value]:
-                    if val in ' '.join(usr_input):
-                        ontolog_term = domain + '-' + slot + ' ' + value + '-' + val
-                        break
-    return ontolog_term
-
-
-def process_history(sessions, word_vectors, ontology):
-    '''Extract feature vectors from a list of dialogue sessions
-
-    :param sessions: the dialogue sessions to process
-    :param word_vectors: word embeddings
-    :param ontology: list of domain-slot-value
-    :return: list(num of turns, user_input vectors, system_response vectors, labels)
-    '''
-    dialogues = []
-    actual_dialogues = []
-    for dialogue in sessions:
-        turn_ids = []
-        for key in dialogue.keys():
-            if key.isdigit():
-                turn_ids.append(int(key))
-            elif dialogue[key] and key not in domains:
-                continue
-        turn_ids.sort()
-        num_turns = len(turn_ids)
-        user_vecs = []
-        sys_vecs = []
-        turn_labels = []
-        turn_domain_labels = []
-        add = False
-        good = True
-        pre_sys = np.zeros([max_utterance_length, vector_dimension], dtype="float32")
-        for key in turn_ids:
-            turn = dialogue[str(key)]
-            user_v, sys_v, labels, domain_labels = process_turn(turn, word_vectors, ontology)
-            if good and (user_v.shape[0] > max_utterance_length or pre_sys.shape[0] > max_utterance_length):
-                # truncate overlength utterances instead of discarding them
-                if user_v.shape[0] > max_utterance_length:
-                    user_v = user_v[:max_utterance_length]
-                if pre_sys.shape[0] > max_utterance_length:
-                    pre_sys = pre_sys[:max_utterance_length]
-                # good = False
-                # break
-            user_vecs.append(user_v)
-            sys_vecs.append(pre_sys)
-            turn_labels.append(labels)
-            turn_domain_labels.append(domain_labels)
-            if not add and sum(labels) > -1:
-                add = True
-            pre_sys = sys_v
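-            # a turn's system text is the reply that follows its user
-            # utterance, so it is paired with the next user turn via pre_sys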
-        if add and good:
-            dialogues.append((num_turns, user_vecs, sys_vecs, turn_labels, turn_domain_labels))
-            actual_dialogues.append(dialogue)
-    # print("\tMDBT: The data contains about {} dialogues".format(len(dialogues)))
-    return dialogues, actual_dialogues
-
-def load_woz_data_new(data, word_vectors, ontology, url=False):
-    '''Ported from load_woz_data, using convlab2.util.dataloader pkg
-
-    :param data: the data to load
-    :param word_vectors: word embeddings
-    :param ontology: list of domain-slot-value
-    :param url: optional path to load the data from (default False: use the given data directly)
-    :return: list(num of turns, user_input vectors, system_response vectors, labels)
-    '''
-    if url:
-        data = json.load(open(url, mode='r', encoding='utf8'))
-    dialogues = []
-    actual_dialogues = []
-    for dialogue in data:
-        turn_ids = []
-        for key in dialogue.keys():
-            if key.isdigit():
-                turn_ids.append(int(key))
-            elif dialogue[key] and key not in domains:
-                continue
-        turn_ids.sort()
-        num_turns = len(turn_ids)
-        user_vecs = []
-        sys_vecs = []
-        turn_labels = []
-        turn_domain_labels = []
-        add = False
-        good = True
-        pre_sys = np.zeros([max_utterance_length, vector_dimension], dtype="float32")
-        for key in turn_ids:
-            turn = dialogue[str(key)]
-            user_v, sys_v, labels, domain_labels = process_turn(turn, word_vectors, ontology)
-            if good and (user_v.shape[0] > max_utterance_length or pre_sys.shape[0] > max_utterance_length):
-                good = False
-                break
-            user_vecs.append(user_v)
-            sys_vecs.append(pre_sys)
-            turn_labels.append(labels)
-            turn_domain_labels.append(domain_labels)
-            if not add and sum(labels) > 0:
-                add = True
-            pre_sys = sys_v
-        if add and good:
-            dialogues.append((num_turns, user_vecs, sys_vecs, turn_labels, turn_domain_labels))
-            actual_dialogues.append(dialogue)
-    # print("\tMDBT: The data contains about {} dialogues".format(len(dialogues)))
-    return dialogues, actual_dialogues
-
-def load_woz_data(data, word_vectors, ontology, url=True):
-    '''Load the woz3 data and extract feature vectors
-
-    :param data: the data to load
-    :param word_vectors: word embeddings
-    :param ontology: list of domain-slot-value
-    :param url: whether data is a path to a JSON file to load (default True)
-    :return: list(num of turns, user_input vectors, system_response vectors, labels)
-    '''
-    if url:
-        # print("Loading data from url {} ....................".format(data))
-        data = json.load(open(data, mode='r', encoding='utf8'))
-
-    dialogues = []
-    actual_dialogues = []
-    for dialogue in data:
-        turn_ids = []
-        for key in dialogue.keys():
-            if key.isdigit():
-                turn_ids.append(int(key))
-            elif dialogue[key] and key not in domains:
-                continue
-        turn_ids.sort()
-        num_turns = len(turn_ids)
-        user_vecs = []
-        sys_vecs = []
-        turn_labels = []
-        turn_domain_labels = []
-        add = False
-        good = True
-        pre_sys = np.zeros([max_utterance_length, vector_dimension], dtype="float32")
-        for key in turn_ids:
-            turn = dialogue[str(key)]
-            user_v, sys_v, labels, domain_labels = process_turn(turn, word_vectors, ontology)
-            if good and (user_v.shape[0] > max_utterance_length or pre_sys.shape[0] > max_utterance_length):
-                good = False
-                break
-            user_vecs.append(user_v)
-            sys_vecs.append(pre_sys)
-            turn_labels.append(labels)
-            turn_domain_labels.append(domain_labels)
-            if not add and sum(labels) > 0:
-                add = True
-            pre_sys = sys_v
-        if add and good:
-            dialogues.append((num_turns, user_vecs, sys_vecs, turn_labels, turn_domain_labels))
-            actual_dialogues.append(dialogue)
-    # print("\tMDBT: The data contains about {} dialogues".format(len(dialogues)))
-    return dialogues, actual_dialogues
-
-
-def process_turn(turn, word_vectors, ontology):
-    '''Process a single turn extracting and processing user text, system response and labels
-
-    :param turn: dict
-    :param word_vectors: word embeddings
-    :param ontology: list(domain-slot-value)
-    :return: ([utterance length, 300], [utterance length, 300], [no_slots] labels, [no_slots] domain labels)
-    '''
-    user_input = turn['user']['text']
-    sys_res = turn['system']
-    state = turn['user']['belief_state']
-    user_v = process_text(user_input, word_vectors, ontology)
-    sys_v = process_text(sys_res, word_vectors, ontology)
-    labels = np.zeros(len(ontology), dtype='float32')
-    domain_labels = np.zeros(len(ontology), dtype='float32')
-    for domain in state:
-        if domain not in domains:
-            continue
-        slots = state[domain]['semi']
-        domain_mention = False
-        for slot in slots:
-
-            if slot == 'name':
-                continue
-            value = slots[slot]
-            if "book" in slot:
-                [slot, value] = slot.split(" ")
-            if value != '' and value != 'corsican':
-                if slot == "destination" or slot == "departure":
-                    value = "place"
-                elif value == '09;45':
-                    value = '09:45'
-                elif 'alpha-milton' in value:
-                    value = value.replace('alpha-milton', 'alpha milton')
-                elif value == 'east side':
-                    value = 'east'
-                elif value == ' expensive':
-                    value = 'expensive'
-                labels[ontology.index(domain + '-' + slot + '-' + value)] = 1
-                domain_mention = True
-        if domain_mention:
-            for idx, slot in enumerate(ontology):
-                if domain in slot:
-                    domain_labels[idx] = 1
-
-    return user_v, sys_v, labels, domain_labels
-
-
-def process_text(text, word_vectors, ontology=None, print_mode=False):
-    '''Process a line/sentence converting words to feature vectors
-
-    :param text: sentence
-    :param word_vectors: word embeddings
-    :param ontology: The ontology to do exact matching
-    :param print_mode: Log the cases where the word is not in the pre-trained word vectors
-    :return: [length of sentence, 300]
-    '''
-    text = text.replace("(", "").replace(")", "").replace('"', "").replace(u"’", "'").replace(u"‘", "'")
-    text = text.replace("\t", "").replace("\n", "").replace("\r", "").strip().lower()
-    text = text.replace(',', ' ').replace('.', ' ').replace('?', ' ').replace('-', ' ').replace('/', ' / ')\
-        .replace(':', ' ')
-    if ontology:
-        for entry in ontology:
-            [domain, slot, value] = entry.split('-')
-            # str.replace returns a new string; assign it back or the
-            # normalisation has no effect
-            text = text.replace(domain, domain.replace(" ", ""))\
-                .replace(slot, slot.replace(" ", ""))\
-                .replace(value, value.replace(" ", ""))
-
-    words = text.split()
-
-    vectors = []
-    for word in words:
-        word = word.replace("'", "").replace("!", "")
-        if word == "":
-            continue
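-        # out-of-vocabulary handling: try to split the word into two
-        # in-vocabulary parts (longest prefix first) and sum their vectors;
-        # otherwise fall back to xavier_vector and cache the result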
-        if word not in word_vectors:
-            length = len(word)
-            for i in range(1, length)[::-1]:
-                if word[:i] in word_vectors and word[i:] in word_vectors:
-                    vec = word_vectors[word[:i]] + word_vectors[word[i:]]
-                    break
-            else:
-                vec = xavier_vector(word)
-                word_vectors[word] = vec
-                if print_mode:
-                    print("\tMDBT: Adding new word: {}".format(word))
-        else:
-            vec = word_vectors[word]
-        vectors.append(vec)
-    return np.asarray(vectors, dtype='float32')
-
-
-def generate_batch(dialogues, batch_no, batch_size, ontology_size):
-    '''Generate examples for minibatch training
-
-    :param dialogues: list(num of turns, user_input vectors, system_response vectors, labels)
-    :param batch_no: where we are in the training data
-    :param batch_size: number of dialogues to generate
-    :param ontology_size: no_slots
-    :return: (user_input, system_response, labels, domain_labels, user_sentence_length, system_sentence_length, number of turns)
-    '''
-    user = np.zeros((batch_size, max_no_turns, max_utterance_length, vector_dimension), dtype='float32')
-    sys_res = np.zeros((batch_size, max_no_turns, max_utterance_length, vector_dimension), dtype='float32')
-    labels = np.zeros((batch_size, max_no_turns, ontology_size), dtype='float32')
-    domain_labels = np.zeros((batch_size, max_no_turns, ontology_size), dtype='float32')
-    user_uttr_len = np.zeros((batch_size, max_no_turns), dtype='int32')
-    sys_uttr_len = np.zeros((batch_size, max_no_turns), dtype='int32')
-    no_turns = np.zeros(batch_size, dtype='int32')
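-    # turns beyond a dialogue's length stay zero-padded; note the batch start
-    # index is scaled by the module-level train_batch_size, which presumes
-    # batch_size == train_batch_size during training iteration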
-    idx = 0
-    for i in range(batch_no*train_batch_size, batch_no*train_batch_size + batch_size):
-        (num_turns, user_vecs, sys_vecs, turn_labels, turn_domain_labels) = dialogues[i]
-        no_turns[idx] = num_turns
-        for j in range(num_turns):
-            user_uttr_len[idx, j] = user_vecs[j].shape[0]
-            sys_uttr_len[idx, j] = sys_vecs[j].shape[0]
-            user[idx, j, :user_uttr_len[idx, j], :] = user_vecs[j]
-            sys_res[idx, j, :sys_uttr_len[idx, j], :] = sys_vecs[j]
-            labels[idx, j, :] = turn_labels[j]
-            domain_labels[idx, j, :] = turn_domain_labels[j]
-        idx += 1
-    return user, sys_res, labels, domain_labels, user_uttr_len, sys_uttr_len, no_turns
-
-
-def evaluate_model(sess, model_variables, val_data, summary, batch_id, i):
-    '''Evaluate the model against the validation set
-
-    :param sess: training session
-    :param model_variables: all model input variables
-    :param val_data: validation data
-    :param summary: For tensorboard
-    :param batch_id: where we are in the training data
-    :param i: the index of the validation data to load
-    :return: evaluation accuracy and the summary
-    '''
-
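-    # model_variables is the tuple of graph handles built during model
-    # definition; the trailing underscores discard ops not needed here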
-    (user, sys_res, no_turns, user_uttr_len, sys_uttr_len, labels, domain_labels, domain_accuracy,
-     slot_accuracy, value_accuracy, value_f1, train_step, keep_prob, _, _, _) = model_variables
-
-    batch_user, batch_sys, batch_labels, batch_domain_labels, batch_user_uttr_len, batch_sys_uttr_len, \
-        batch_no_turns = val_data
-
-    start_time = time.time()
-
-    b_z = train_batch_size
-    [precision, recall, value_f1] = value_f1
-    [d_acc, s_acc, v_acc, f1_score, pr, re, sm1, sm2] = sess.run([domain_accuracy, slot_accuracy, value_accuracy,
-                                                                  value_f1, precision, recall] + summary,
-                                                           feed_dict={user: batch_user[i:i+b_z, :, :, :],
-                                                                      sys_res: batch_sys[i:i+b_z, :, :, :],
-                                                                      labels: batch_labels[i:i+b_z, :, :],
-                                                                      domain_labels: batch_domain_labels[i:i+b_z, :, :],
-                                                                      user_uttr_len: batch_user_uttr_len[i:i+b_z, :],
-                                                                      sys_uttr_len: batch_sys_uttr_len[i:i+b_z, :],
-                                                                      no_turns: batch_no_turns[i:i+b_z],
-                                                                      keep_prob: 1.0})
-
-    print("Batch", batch_id, "[Domain Accuracy] = ", d_acc, "[Slot Accuracy] = ", s_acc, "[Value Accuracy] = ",
-          v_acc, "[F1 Score] = ", f1_score, "[Precision] = ", pr, "[Recall] = ", re,
-          " ----- ", round(time.time() - start_time, 3),
-          "seconds. ---")
-
-    return d_acc, s_acc, v_acc, f1_score, sm1, sm2
diff --git a/convlab2/dst/mdbt/multiwoz/__init__.py b/convlab2/dst/mdbt/multiwoz/__init__.py
deleted file mode 100755
index 08491e683a0747e1717f4327c68646ea71cc4740..0000000000000000000000000000000000000000
--- a/convlab2/dst/mdbt/multiwoz/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from convlab2.dst.mdbt.multiwoz.dst import MultiWozMDBT as MDBT
diff --git a/convlab2/dst/mdbt/multiwoz/dst.py b/convlab2/dst/mdbt/multiwoz/dst.py
deleted file mode 100755
index 305419cd2fe40d3493649c76e4d179440355f179..0000000000000000000000000000000000000000
--- a/convlab2/dst/mdbt/multiwoz/dst.py
+++ /dev/null
@@ -1,126 +0,0 @@
-import json
-import os
-import time
-import tensorflow as tf
-import shutil
-import zipfile
-
-from convlab2.dst.mdbt.mdbt import MDBT
-from convlab2.dst.mdbt.mdbt_util import load_word_vectors, load_ontology, load_woz_data_new
-from convlab2.util.dataloader.module_dataloader import AgentDSTDataloader
-from convlab2.util.dataloader.dataset_dataloader import MultiWOZDataloader
-from convlab2.util.file_util import cached_path
-from pprint import pprint
-
-train_batch_size = 1
-batches_per_eval = 10
-no_epochs = 600
-device = "gpu"
-start_batch = 0
-
-
-class MultiWozMDBT(MDBT):
-    def __init__(self, data_dir='configs', data=None):
-        """Constructor of MultiWOzMDBT class.
-        Args:
-            data_dir (str): The path of data dir, where the root path is convlab2/dst/mdbt/multiwoz.
-        """
-        if data is None:
-            loader = AgentDSTDataloader(MultiWOZDataloader())
-            data = loader.load_data()
-        self.file_url = 'https://convlab.blob.core.windows.net/convlab-2/mdbt_multiwoz_sys.zip'
-        local_path = os.path.dirname(os.path.abspath(__file__))
-        self.data_dir = os.path.join(local_path, data_dir)  # absolute data path
-
-        self.validation_url = os.path.join(self.data_dir, 'data/validate.json')
-        self.training_url = os.path.join(self.data_dir, 'data/train.json')
-        self.testing_url = os.path.join(self.data_dir, 'data/test.json')
-
-        self.word_vectors_url = os.path.join(self.data_dir, 'word-vectors/paragram_300_sl999.txt')
-        self.ontology_url = os.path.join(self.data_dir, 'data/ontology.json')
-        self.model_url = os.path.join(self.data_dir, 'models/model-1')
-        self.graph_url = os.path.join(self.data_dir, 'graphs/graph-1')
-        self.results_url = os.path.join(self.data_dir, 'results/log-1.txt')
-        self.kb_url = os.path.join(self.data_dir, 'data/')  # not used
-        self.train_model_url = os.path.join(self.data_dir, 'train_models/model-1')
-        self.train_graph_url = os.path.join(self.data_dir, 'train_graph/graph-1')
-
-        self.auto_download()
-
-        print('Configuring MDBT model...')
-        self.word_vectors = load_word_vectors(self.word_vectors_url)
-
-        # Load the ontology and extract the feature vectors
-        self.ontology, self.ontology_vectors, self.slots = load_ontology(self.ontology_url, self.word_vectors)
-
-        # Load and process the training data
-        self.test_dialogues, self.actual_dialogues = load_woz_data_new(data['test'], self.word_vectors,
-                                                                       self.ontology, url=self.testing_url)
-        self.no_dialogues = len(self.test_dialogues)
-
-        super(MultiWozMDBT, self).__init__(self.ontology_vectors, self.ontology, self.slots, self.data_dir)
-
-    def auto_download(self):
-        """Automatically download the pretrained model and necessary data."""
-        if os.path.exists(os.path.join(self.data_dir, 'models')) and \
-            os.path.exists(os.path.join(self.data_dir, 'data')) and \
-            os.path.exists(os.path.join(self.data_dir, 'word-vectors')):
-            return
-        cached_path(self.file_url, self.data_dir)
-        files = os.listdir(self.data_dir)
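-        # cached_path (from allennlp) stores the archive under a hashed file
-        # name with a companion <name>.json metadata file; strip the .json
-        # extension to recover the archive's file name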
-        target_file = ''
-        for name in files:
-            if name.endswith('.json'):
-                target_file = name[:-5]
-        try:
-            assert target_file in files
-        except AssertionError as e:
-            print('allennlp cached_path error: MDBT MultiWOZ data download failed.')
-            raise e
-        zip_file_path = os.path.join(self.data_dir, target_file+'.zip')
-        shutil.copyfile(os.path.join(self.data_dir, target_file), zip_file_path)
-        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
-            zip_ref.extractall(self.data_dir)
-
-
-def test_update():
-    # input utterances are expected to be lower-cased and tokenized
-    os.environ['CUDA_VISIBLE_DEVICES'] = '0'
-    tracker = MultiWozMDBT()
-    tracker.init_session()
-    # original usage in Convlab
-    # tracker.state['history'] = [
-    #     ["null", "am looking for a place to to stay that has cheap price range it should be in a type of hotel"],
-    #     ["Okay, do you have a specific area you want to stay in?", "no, i just need to make sure it's cheap. oh, and i need parking"],
-    #     ["I found 1 cheap hotel for you that includes parking. Do you like me to book it?", "Yes, please. 6 people 3 nights starting on tuesday."],
-    #     ["I am sorry but I wasn't able to book that for you for Tuesday. Is there another day you would like to stay or perhaps a shorter stay?", "how about only 2 nights."],
-    #     ["Booking was successful.\nReference number is : 7GAWK763. Anything else I can do for you?"]
-    # ]
-
-    # current usage in Convlab2
-    tracker.state['history'] = [
-        ['sys', ''],
-        ['user', 'Could you book a 4 stars hotel for one night, 1 person?'],
-        ['sys', 'If you\'d like something cheap, I recommend the Allenbell']
-    ]
-    tracker.state['history'].append(['user', 'Friday and Can you book it for me and get a reference number ?'])
-
-    user_utt = 'Friday and Can you book it for me and get a reference number ?'
-    from timeit import default_timer as timer
-    start = timer()
-    pprint(tracker.update(user_utt))
-    end = timer()
-    print(end - start)
-
-    start = timer()
-    tracker.update(user_utt)
-    end = timer()
-    print(end - start)
-
-    start = timer()
-    tracker.update(user_utt)
-    end = timer()
-    print(end - start)
-
-
-if __name__ == '__main__':
-    test_update()
diff --git a/setup.py b/setup.py
index 7170fb1c41fa6c77be3abf89cadb204e049e49b1..703178d7e37d2ef83b79609db4032dbc015938de 100755
--- a/setup.py
+++ b/setup.py
@@ -44,21 +44,21 @@ setup(
         'scikit-learn==0.20.3',
         'pytorch-pretrained-bert>=0.6.1',
         'transformers>=2.3.0,<3.0.0',
-        'tensorflow==1.14',
         'tensorboard>=1.14.0',
         'tensorboardX==1.7',
         'tokenizers>=0.8.0',
         'allennlp==0.9.0',
         'requests',
         'simplejson',
-        'spacy',
+        'spacy==2.1.9',
         'unidecode',
         'jieba',
         'embeddings',
         'quadprog',
         'pyyaml',
         'fuzzywuzzy',
-        'python-Levenshtein'
+        'python-Levenshtein',
+        'json_lines'
     ],
     extras_require={
         'develop': [