diff --git a/convlab/dst/setsumbt/README.md b/convlab/dst/setsumbt/README.md
index c081dedbde4e2e2a6aa706f7495456f735cef37a..96701aef8ced46dc0f08d50f3e6ce72b8f04673d 100644
--- a/convlab/dst/setsumbt/README.md
+++ b/convlab/dst/setsumbt/README.md
@@ -1,63 +1,183 @@
-# Our paper
-[Uncertainty Measures in Neural Belief Tracking and the Effects on Dialogue Policy Performance](https://todo.pdf)
+# SetSUMBT & SUMBT
+## Dialogue State Tracking and Language Understanding
 
-## Structure
-![SetSUMBT Architecture](https://gitlab.cs.uni-duesseldorf.de/dsml/convlab-2/-/raw/develop/convlab/dst/setsumbt/setSUMBT.png?inline=false)
+SUMBT and SetSUMBT are a family of dialogue state tracking models. They include
+natural language understanding prediction heads which provide information, such as
+the user request actions, that is required to incorporate the model into a pipeline
+dialogue system. [SUMBT](https://arxiv.org/pdf/1907.07421.pdf)
+uses a Slot-Utterance Matching (SUM) attention mechanism for information extraction,
+a recurrent module for tracking latent information across turns and a picklist state
+prediction head based on similarity matching. [SetSUMBT](https://aclanthology.org/2021.emnlp-main.623/)
+extends SUMBT with a set based Slot-Utterance Matching module and a set based
+similarity matching prediction head, and adds the language understanding prediction
+heads needed to predict this additional information. The code also supports training
+an ensemble of models and distilling the ensemble into a single belief tracker which
+predicts well calibrated belief states.
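+
+As a toy illustration of the set based similarity matching (a minimal sketch with random
+tensors, using mean pooling in place of the CNN set pooler; this is not the code in this
+repository), each slot's SUM-attended utterance representation is matched against the
+pooled candidate value description embeddings:
+
+```
+import torch
+import torch.nn.functional as F
+
+# Toy dimensions: hidden size 768, 7 candidate values, 12 description tokens per value.
+utterance_vec = torch.randn(1, 768)          # SUM-attended utterance representation for one slot
+value_desc_tokens = torch.randn(7, 12, 768)  # token embeddings of each candidate value description
+
+# Set pooling of each candidate description (mean pooling stands in for the CNN set pooler).
+value_vecs = value_desc_tokens.mean(dim=1)   # [num_candidates, hidden]
+
+# Similarity based state prediction: cosine matching followed by a softmax over the candidates.
+scores = F.cosine_similarity(utterance_vec, value_vecs, dim=-1)  # [num_candidates]
+belief_state = F.softmax(scores, dim=0)
+```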
+
+
+## Our paper
+[Uncertainty Measures in Neural Belief Tracking and the Effects on Dialogue Policy Performance](https://aclanthology.org/2021.emnlp-main.623/)
+
+## SetSUMBT Model Architecture
+![SetSUMBT Architecture](https://github.com/ConvLab/ConvLab-3/blob/master/convlab/dst/setsumbt/setSUMBT.png?raw=true)
+The default configurations of the models are as follows:
+
+| Hyperparameter              |                      SetSUMBT                       |                             SUMBT                             |
+|:----------------------------|:---------------------------------------------------:|:-------------------------------------------------------------:|
+| Max Turns                   |                         12                          |                              12                               |
+| Max Turn Length             |                         64                          |                              64                               |
+| Max Candidate Desc. Length  |                         12                          |                              12                               |
+| Encoder model               | [roberta-base](https://huggingface.co/roberta-base) | [bert-base-uncased](https://huggingface.co/bert-base-uncased) |
+| Hidden Size                 |                         768                         |                              768                              |
+| SUM Attention Heads         |                         12                          |                              12                               |
+| Dropout rate                |                         0.3                         |                              0.3                              |
+| Tracker type                |                         GRU                         |                              GRU                              |
+| Tracker Hidden Size         |                         300                         |                              300                              |
+| Tracker RNN Layers          |                          1                          |                               1                               |
+| Set Pooler type             |                         CNN                         |                         No Set Pooler                         |
+| Candidate Desc. Pooler type |                      No Pooler                      |                           CLS Token                           |
+| Loss Function               |                   Label smoothing                   |                        Label smoothing                        |
+| Epochs                      |                         50                          |                              50                               |
+| Early stopping criteria     |                      20 Epochs                      |                           20 Epochs                           |
+| Learning rate               |                        5e-5                         |                             5e-5                              |
+| LR Scheduler                |                     Linear(0.2)                     |                          Linear(0.2)                          |
 
 ## Usages
-### Data preprocessing
+### Datasets
 We conduct experiments on the following datasets:
 
-* MultiWOZ 2.1 [Download](https://github.com/budzianowski/multiwoz/raw/master/data/MultiWOZ_2.1.zip) to get `MULTIWOZ2.1.zip`
+* [MultiWOZ 2.1](https://huggingface.co/datasets/ConvLab/multiwoz21)
+* [Schema-Guided Dialogue (SGD)](https://huggingface.co/datasets/ConvLab/sgd)
+* [Taskmaster 1](https://huggingface.co/datasets/ConvLab/tm1)
+* [Taskmaster 2](https://huggingface.co/datasets/ConvLab/tm2)
+* [Taskmaster 3](https://huggingface.co/datasets/ConvLab/tm3)
+
+### Model checkpoints available on Hugging Face
+
+The following pre-trained model checkpoints are available on the Hugging Face Hub:
+
+| Model    | Dataset      | Training Setup                   | Checkpoint                                                                        |
+|:---------|:-------------|:---------------------------------|:----------------------------------------------------------------------------------|
+| SetSUMBT | MultiWOZ 2.1 | Full dataset                     | [setsumbt-dst-multiwoz21](https://huggingface.co/ConvLab/setsumbt-dst-multiwoz21) |
+| SetSUMBT | SGD          | Full dataset                     | [setsumbt-dst-sgd](https://huggingface.co/ConvLab/setsumbt-dst-sgd)               |
+| SetSUMBT | TM1+TM2+TM3  | Full dataset                     | [setsumbt-dst-tm123](https://huggingface.co/ConvLab/setsumbt-dst-tm123)           |
+| SetSUMBT | MultiWOZ 2.1 | DST+NLU tasks + Uncertainty Est. | [setsumbt-dst_nlu-multiwoz21-EnD2](https://huggingface.co/ConvLab/setsumbt-dst_nlu-multiwoz21-EnD2)           |
 
 ### Train
 **Train baseline single instance SetSUMBT**
+
+The command below trains the model on the MultiWOZ 2.1 dataset. To train on other
+datasets/setups, or to train the SUMBT model, set the relevant `starting_config_name`
+from the table below. To fine-tune a pre-trained model, set `model_name_or_path` to the
+path of the pre-trained model (a sketch is given after the baseline training command).
+The available configurations are:
+
+| Model    | Dataset              | Training Setup                     | Starting Config Name                                                                                                                            |
+|:---------|:---------------------|:-----------------------------------|:------------------------------------------------------------------------------------------------------------------------------------------------|
+| SetSUMBT | MultiWOZ21           | Full dataset                       | [setsumbt_multiwoz21](https://github.com/ConvLab/ConvLab-3/blob/master/convlab/dst/setsumbt/configs/setsumbt_multiwoz21.json)                   |
+| SetSUMBT | MultiWOZ21           | DST and NLU Tasks                  | [setsumbt_nlu_multiwoz21](https://github.com/ConvLab/ConvLab-3/blob/master/convlab/dst/setsumbt/configs/setsumbt_nlu_multiwoz21.json)           |
+| SetSUMBT | MultiWOZ21           | Ensemble Distillation              | [setsumbt_nlu_multiwoz21_end](https://github.com/ConvLab/ConvLab-3/blob/master/convlab/dst/setsumbt/configs/setsumbt_nlu_multiwoz21_end.json)   |
+| SetSUMBT | MultiWOZ21           | Ensemble Distribution Distillation | [setsumbt_nlu_multiwoz21_end2](https://github.com/ConvLab/ConvLab-3/blob/master/convlab/dst/setsumbt/configs/setsumbt_nlu_multiwoz21_end2.json) |
+| SetSUMBT | MultiWOZ21           | 10% of the training data           | [setsumbt_multiwoz21_10p](https://github.com/ConvLab/ConvLab-3/blob/master/convlab/dst/setsumbt/configs/setsumbt_multiwoz21_10p.json)           |
+| SetSUMBT | MultiWOZ21           | 1% of the training data            | [setsumbt_multiwoz21_1p](https://github.com/ConvLab/ConvLab-3/blob/master/convlab/dst/setsumbt/configs/setsumbt_multiwoz21_1p.json)             |
+| SetSUMBT | TM1+TM2+TM3          | Full dataset                       | [setsumbt_tm](https://github.com/ConvLab/ConvLab-3/blob/master/convlab/dst/setsumbt/configs/setsumbt_tm.json)                                   |
+| SetSUMBT | SGD                  | Full dataset                       | [setsumbt_sgd](https://github.com/ConvLab/ConvLab-3/blob/master/convlab/dst/setsumbt/configs/setsumbt_sgd.json)                                 |
+| SetSUMBT | MW21+SGD+TM1+TM2+TM3 | Joint training                     | [setsumbt_joint](https://github.com/ConvLab/ConvLab-3/blob/master/convlab/dst/setsumbt/configs/setsumbt_joint.json)                             |
+| SetSUMBT | SGD+TM1+TM2+TM3      | Pre-training                       | [setsumbt_pretrain](https://github.com/ConvLab/ConvLab-3/blob/master/convlab/dst/setsumbt/configs/setsumbt_pretrain.json)                       |
+| SUMBT    | MultiWOZ21           | Full dataset                       | [sumbt_multiwoz21](https://github.com/ConvLab/ConvLab-3/blob/master/convlab/dst/setsumbt/configs/sumbt_multiwoz21.json)                         |
+| SUMBT    | MultiWOZ21           | 10% of the training data           | [sumbt_multiwoz21_10p](https://github.com/ConvLab/ConvLab-3/blob/master/convlab/dst/setsumbt/configs/sumbt_multiwoz21_10p.json)                 |
+| SUMBT    | MultiWOZ21           | 1% of the training data            | [sumbt_multiwoz21_1p](https://github.com/ConvLab/ConvLab-3/blob/master/convlab/dst/setsumbt/configs/sumbt_multiwoz21_1p.json)                   |
+| SUMBT    | TM1+TM2+TM3          | Full dataset                       | [sumbt_tm](https://github.com/ConvLab/ConvLab-3/blob/master/convlab/dst/setsumbt/configs/sumbt_tm.json)                                         |
+| SUMBT    | SGD                  | Full dataset                       | [sumbt_sgd](https://github.com/ConvLab/ConvLab-3/blob/master/convlab/dst/setsumbt/configs/sumbt_sgd.json)                                       |
+| SUMBT    | MW21+SGD+TM1+TM2+TM3 | Joint training                     | [sumbt_joint](https://github.com/ConvLab/ConvLab-3/blob/master/convlab/dst/setsumbt/configs/sumbt_joint.json)                                   |
+| SUMBT    | SGD+TM1+TM2+TM3      | Pre-training                       | [sumbt_pretrain](https://github.com/ConvLab/ConvLab-3/blob/master/convlab/dst/setsumbt/configs/sumbt_pretrain.json)                             |
+
 ```
-python run.py --run_nbt \
-    --use_descriptions --set_similarity \
-    --do_train --do_eval \
-    --seed 20211202
+python3 run.py \
+    --starting_config_name setsumbt_multiwoz21 \
+    --seed 0 \
+    --do_train
 ```
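+
+For example, to fine-tune from a pre-trained checkpoint (the `models/setsumbt_pretrained`
+path below is a placeholder, and it is assumed that `model_name_or_path` can be passed on
+the command line like the other options), something along these lines could be used:
+
+```
+python3 run.py \
+    --starting_config_name setsumbt_multiwoz21 \
+    --model_name_or_path models/setsumbt_pretrained \
+    --seed 0 \
+    --do_train
+```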
 
 **Train ensemble SetSUMBT**
 ```
 SEED=20211202
 MODEL_PATH="models/SetSUMBT-CE-roberta-gru-cosine-labelsmoothing-Seed$SEED-$(date +'%d-%m-%Y')"
-./configure_ensemble.sh $SEED $MODEL_PATH
-./train_ensemble.sh $SEED $MODEL_PATH
+ENSEMBLE_SIZE=10
+DATA_SIZE=7500
+
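+# Set up the ensemble: sample a training data subset for each of the ENSEMBLE_SIZE members.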
+python3 run.py \
+    --starting_config_name setsumbt_nlu_multiwoz21 \
+    --output_dir $MODEL_PATH \
+    --ensemble_size $ENSEMBLE_SIZE \
+    --data_sampling_size $DATA_SIZE \
+    --seed $SEED
+
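+# Train each ensemble member on its sampled data subset.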
+ENSEMBLE_SIZE=$(($ENSEMBLE_SIZE-1))
+for e in $(seq 0 $ENSEMBLE_SIZE);do
+    python3 run.py \
+        --starting_config_name setsumbt_nlu_multiwoz21 \
+        --output_dir "$MODEL_PATH/ensemble-$e" \
+        --do_train \
+        --seed $SEED
+done
 ```
 
 **Distill Ensemble SetSUMBT**
 ```
 SEED=20211202
 MODEL_PATH="models/SetSUMBT-CE-roberta-gru-cosine-labelsmoothing-Seed$SEED-$(date +'%d-%m-%Y')"
-./distill_end.sh $SEED $MODEL_PATH
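+# Collect the ensemble predictions for each data split, averaging the members'
+# distributions (mean reduction) to use as targets for ensemble distillation.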
+for SUBSET in train dev test;do
+    python3 distillation_setup.py \
+        --model_path $MODEL_PATH \
+        --set_type $SUBSET \
+        --reduction mean \
+        --get_ensemble_distributions \
+        --convert_distributions_to_predictions
+done
+python3 run.py \
+    --starting_config_name setsumbt_nlu_multiwoz21_end \
+    --seed $SEED \
+    --output_dir $MODEL_PATH \
+    --do_train
 ```
 
 **Distribution Distill Ensemble SetSUMBT**
 ```
 SEED=20211202
 MODEL_PATH="models/SetSUMBT-CE-roberta-gru-cosine-labelsmoothing-Seed$SEED-$(date +'%d-%m-%Y')"
-./distill_end2.sh $SEED $MODEL_PATH
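+# Keep each member's distribution separately (no reduction) to use as targets for
+# ensemble distribution distillation (EnD2).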
+for SUBSET in train dev test;do
+    python3 distillation_setup.py \
+        --model_path $MODEL_PATH \
+        --set_type $SUBSET \
+        --reduction none \
+        --get_ensemble_distributions \
+        --convert_distributions_to_predictions
+done
+python3 run.py \
+    --starting_config_name setsumbt_nlu_multiwoz21_end2 \
+    --seed $SEED \
+    --output_dir $MODEL_PATH \
+    --do_train
 ```
 
 ### Evaluation
 
-```
-SEED=20211202
-MODEL_PATH="models/SetSUMBT-CE-roberta-gru-cosine-labelsmoothing-Seed$SEED-$(date +'%d-%m-%Y')"
-python run.py --run_calibration \
-    --seed $SEED \
-    --output_dir $MODEL_PATH
-```
-
-### Convert training setup to convlab model
+To evaluate a model, set `$MODEL_PATH` to the path or URL of that model. The URL is the
+download URL of the pre-trained model archive; for example, for `setsumbt-dst-multiwoz21`
+the URL is
+https://huggingface.co/ConvLab/setsumbt-dst-multiwoz21/resolve/main/SetSUMBT-multiwoz21-roberta-gru-cosine-labelsmoothing-Seed0.zip.
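+
+For example, reusing the variables from the training commands above, or pointing directly
+at a pre-trained checkpoint archive:
+
+```
+SEED=20211202
+MODEL_PATH="models/SetSUMBT-CE-roberta-gru-cosine-labelsmoothing-Seed$SEED-$(date +'%d-%m-%Y')"
+# Or use the download URL of a pre-trained checkpoint:
+# MODEL_PATH="https://huggingface.co/ConvLab/setsumbt-dst-multiwoz21/resolve/main/SetSUMBT-multiwoz21-roberta-gru-cosine-labelsmoothing-Seed0.zip"
+```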
 
 ```
-SEED=20211202
-MODEL_PATH="models/SetSUMBT-CE-roberta-gru-cosine-labelsmoothing-Seed$SEED-$(date +'%d-%m-%Y')"
-OUT_PATH="models/labelsmoothing"
-./configure_model.sh $MODEL_PATH data $OUT_PATH
+python3 run.py \
+    --starting_config_name setsumbt_multiwoz21 \
+    --output_dir $MODEL_PATH \
+    --do_test
+python3 get_golden_labels.py \
+    --dataset_name multiwoz21 \
+    --model_path $MODEL_PATH
+python3 ../evaluate_unified_datasets.py \
+    -p "$MODEL_PATH/predictions/test_multiwoz21.json"
 ```
 
 ### Training PPO policy using SetSUMBT tracker and uncertainty
@@ -68,5 +188,5 @@ cd ../../policy/ppo
 ```
 In this directory, run the relevant training command; for example, to train the policy using END-SetSUMBT without uncertainty metrics run:
 ```
-./train_setsumbt_end_baseline.sh
+python3 train.py --path setsumbt_config.json
 ```
diff --git a/convlab/dst/setsumbt/configs/setsumbt_multitask.json b/convlab/dst/setsumbt/configs/setsumbt_joint.json
similarity index 61%
rename from convlab/dst/setsumbt/configs/setsumbt_multitask.json
rename to convlab/dst/setsumbt/configs/setsumbt_joint.json
index c076a557cb3e1d567784c70559fb1922fe05c545..b97920f28538ebfd8f026e5c966434334e7257b4 100644
--- a/convlab/dst/setsumbt/configs/setsumbt_multitask.json
+++ b/convlab/dst/setsumbt/configs/setsumbt_joint.json
@@ -2,8 +2,8 @@
   "model_type": "SetSUMBT",
   "dataset": "multiwoz21+sgd+tm1+tm2+tm3",
   "no_action_prediction": true,
-  "model_name_or_path": "/gpfs/project/niekerk/models/transformers/roberta-base",
-  "transformers_local_files_only": true,
+  "model_name_or_path": "roberta-base",
+  "transformers_local_files_only": false,
   "train_batch_size": 3,
   "dev_batch_size": 8,
   "test_batch_size": 8,
diff --git a/convlab/dst/setsumbt/configs/setsumbt_multiwoz21.json b/convlab/dst/setsumbt/configs/setsumbt_multiwoz21.json
index 0bff751c16f0bdcdf61f04ce33d616370c0d32d8..57a245518aae0a111f6220b1a088943b8b64ee4c 100644
--- a/convlab/dst/setsumbt/configs/setsumbt_multiwoz21.json
+++ b/convlab/dst/setsumbt/configs/setsumbt_multiwoz21.json
@@ -2,8 +2,9 @@
   "model_type": "SetSUMBT",
   "dataset": "multiwoz21",
   "no_action_prediction": true,
-  "model_name_or_path": "/gpfs/project/niekerk/models/transformers/roberta-base",
-  "transformers_local_files_only": true,
+  "model_name_or_path": "roberta-base",
+  "candidate_embedding_model_name": "roberta-base",
+  "transformers_local_files_only": false,
   "train_batch_size": 3,
   "dev_batch_size": 16,
   "test_batch_size": 16,
diff --git a/convlab/dst/setsumbt/configs/setsumbt_multiwoz21_10p.json b/convlab/dst/setsumbt/configs/setsumbt_multiwoz21_10p.json
new file mode 100644
index 0000000000000000000000000000000000000000..e4b54276f657930e46d14a20e677685153217e6a
--- /dev/null
+++ b/convlab/dst/setsumbt/configs/setsumbt_multiwoz21_10p.json
@@ -0,0 +1,16 @@
+{
+  "model_type": "SetSUMBT",
+  "dataset": "multiwoz21",
+  "dataset_train_ratio": 0.1,
+  "no_action_prediction": true,
+  "model_name_or_path": "roberta-base",
+  "candidate_embedding_model_name": "roberta-base",
+  "transformers_local_files_only": false,
+  "num_train_epochs": 500,
+  "patience": 50,
+  "warmup_proportion": 0.02,
+  "train_batch_size": 3,
+  "dev_batch_size": 16,
+  "test_batch_size": 16,
+  "run_nbt": true
+}
\ No newline at end of file
diff --git a/convlab/dst/setsumbt/configs/setsumbt_multiwoz21_1p.json b/convlab/dst/setsumbt/configs/setsumbt_multiwoz21_1p.json
new file mode 100644
index 0000000000000000000000000000000000000000..c902d0f7a456f453ff208f258594a4d090376628
--- /dev/null
+++ b/convlab/dst/setsumbt/configs/setsumbt_multiwoz21_1p.json
@@ -0,0 +1,16 @@
+{
+  "model_type": "SetSUMBT",
+  "dataset": "multiwoz21",
+  "dataset_train_ratio": 0.01,
+  "no_action_prediction": true,
+  "model_name_or_path": "roberta-base",
+  "candidate_embedding_model_name": "roberta-base",
+  "transformers_local_files_only": false,
+  "num_train_epochs": 1000,
+  "patience": 100,
+  "warmup_proportion": 0.01,
+  "train_batch_size": 3,
+  "dev_batch_size": 16,
+  "test_batch_size": 16,
+  "run_nbt": true
+}
\ No newline at end of file
diff --git a/convlab/dst/setsumbt/configs/setsumbt_nlu_multiwoz21.json b/convlab/dst/setsumbt/configs/setsumbt_nlu_multiwoz21.json
new file mode 100644
index 0000000000000000000000000000000000000000..59c272c171d7e0b6bc1a2e87b49b18a863464681
--- /dev/null
+++ b/convlab/dst/setsumbt/configs/setsumbt_nlu_multiwoz21.json
@@ -0,0 +1,12 @@
+{
+  "model_type": "SetSUMBT",
+  "dataset": "multiwoz21",
+  "no_action_prediction": false,
+  "model_name_or_path": "roberta-base",
+  "candidate_embedding_model_name": "roberta-base",
+  "transformers_local_files_only": false,
+  "train_batch_size": 3,
+  "dev_batch_size": 16,
+  "test_batch_size": 16,
+  "run_nbt": true
+}
\ No newline at end of file
diff --git a/convlab/dst/setsumbt/configs/setsumbt_nlu_multiwoz21_end.json b/convlab/dst/setsumbt/configs/setsumbt_nlu_multiwoz21_end.json
new file mode 100644
index 0000000000000000000000000000000000000000..7ae2f58885d07aa057d73ba475e48b24dc6e6861
--- /dev/null
+++ b/convlab/dst/setsumbt/configs/setsumbt_nlu_multiwoz21_end.json
@@ -0,0 +1,13 @@
+{
+  "model_type": "SetSUMBT",
+  "dataset": "multiwoz21",
+  "no_action_prediction": false,
+  "model_name_or_path": "roberta-base",
+  "candidate_embedding_model_name": "roberta-base",
+  "transformers_local_files_only": false,
+  "loss_function": "distillation",
+  "train_batch_size": 3,
+  "dev_batch_size": 16,
+  "test_batch_size": 16,
+  "run_nbt": true
+}
\ No newline at end of file
diff --git a/convlab/dst/setsumbt/configs/setsumbt_nlu_multiwoz21_end2.json b/convlab/dst/setsumbt/configs/setsumbt_nlu_multiwoz21_end2.json
new file mode 100644
index 0000000000000000000000000000000000000000..41f20072f3fa4706c5872c08e6b4b3f8b7bb1c47
--- /dev/null
+++ b/convlab/dst/setsumbt/configs/setsumbt_nlu_multiwoz21_end2.json
@@ -0,0 +1,13 @@
+{
+  "model_type": "SetSUMBT",
+  "dataset": "multiwoz21",
+  "no_action_prediction": false,
+  "model_name_or_path": "roberta-base",
+  "candidate_embedding_model_name": "roberta-base",
+  "transformers_local_files_only": false,
+  "loss_function": "distribution_distillation",
+  "train_batch_size": 3,
+  "dev_batch_size": 16,
+  "test_batch_size": 16,
+  "run_nbt": true
+}
\ No newline at end of file
diff --git a/convlab/dst/setsumbt/configs/setsumbt_pretrain.json b/convlab/dst/setsumbt/configs/setsumbt_pretrain.json
index fdc22d157840e7494b0266d0bd99f8a99d242969..3fa4104ceeca964986199a366a024bbdcf03847a 100644
--- a/convlab/dst/setsumbt/configs/setsumbt_pretrain.json
+++ b/convlab/dst/setsumbt/configs/setsumbt_pretrain.json
@@ -2,8 +2,8 @@
   "model_type": "SetSUMBT",
   "dataset": "sgd+tm1+tm2+tm3",
   "no_action_prediction": true,
-  "model_name_or_path": "/gpfs/project/niekerk/models/transformers/roberta-base",
-  "transformers_local_files_only": true,
+  "model_name_or_path": "roberta-base",
+  "transformers_local_files_only": false,
   "train_batch_size": 3,
   "dev_batch_size": 12,
   "test_batch_size": 12,
diff --git a/convlab/dst/setsumbt/configs/setsumbt_sgd.json b/convlab/dst/setsumbt/configs/setsumbt_sgd.json
index 97f5818334af4c7984ec24448861b627315820e3..7e6946126d6595e903bd03dfb346a9d1ba1068cd 100644
--- a/convlab/dst/setsumbt/configs/setsumbt_sgd.json
+++ b/convlab/dst/setsumbt/configs/setsumbt_sgd.json
@@ -2,8 +2,8 @@
   "model_type": "SetSUMBT",
   "dataset": "sgd",
   "no_action_prediction": true,
-  "model_name_or_path": "/gpfs/project/niekerk/models/transformers/roberta-base",
-  "transformers_local_files_only": true,
+  "model_name_or_path": "roberta-base",
+  "transformers_local_files_only": false,
   "train_batch_size": 3,
   "dev_batch_size": 6,
   "test_batch_size": 3,
diff --git a/convlab/dst/setsumbt/configs/setsumbt_tm.json b/convlab/dst/setsumbt/configs/setsumbt_tm.json
index 138f84c358067389d5f7b478ae94c3eb2aa90ea3..1cf54f443ee444a2035ecfb7e6ecb9ee7a9084bd 100644
--- a/convlab/dst/setsumbt/configs/setsumbt_tm.json
+++ b/convlab/dst/setsumbt/configs/setsumbt_tm.json
@@ -2,8 +2,8 @@
   "model_type": "SetSUMBT",
   "dataset": "tm1+tm2+tm3",
   "no_action_prediction": true,
-  "model_name_or_path": "/gpfs/project/niekerk/models/transformers/roberta-base",
-  "transformers_local_files_only": true,
+  "model_name_or_path": "roberta-base",
+  "transformers_local_files_only": false,
   "train_batch_size": 3,
   "dev_batch_size": 8,
   "test_batch_size": 8,
diff --git a/convlab/dst/setsumbt/configs/sumbt_joint.json b/convlab/dst/setsumbt/configs/sumbt_joint.json
new file mode 100644
index 0000000000000000000000000000000000000000..958a4261cfeaf5858112d225f994c8ccc6075c91
--- /dev/null
+++ b/convlab/dst/setsumbt/configs/sumbt_joint.json
@@ -0,0 +1,14 @@
+{
+  "model_type": "SetSUMBT",
+  "dataset": "multiwoz21+sgd+tm1+tm2+tm3",
+  "no_action_prediction": true,
+  "model_type": "bert",
+  "model_name_or_path": "bert-base-uncased",
+  "transformers_local_files_only": false,
+  "no_set_similarity": false,
+  "candidate_pooling": "cls",
+  "train_batch_size": 3,
+  "dev_batch_size": 8,
+  "test_batch_size": 8,
+  "run_nbt": true
+}
\ No newline at end of file
diff --git a/convlab/dst/setsumbt/configs/sumbt_multiwoz21.json b/convlab/dst/setsumbt/configs/sumbt_multiwoz21.json
new file mode 100644
index 0000000000000000000000000000000000000000..ebebe8a6631a03aff85a08ee6608b0d757f2a33d
--- /dev/null
+++ b/convlab/dst/setsumbt/configs/sumbt_multiwoz21.json
@@ -0,0 +1,15 @@
+{
+  "model_type": "SetSUMBT",
+  "dataset": "multiwoz21",
+  "no_action_prediction": true,
+  "model_type": "bert",
+  "model_name_or_path": "bert-base-uncased",
+  "candidate_embedding_model_name": "bert-base-uncased",
+  "transformers_local_files_only": false,
+  "no_set_similarity": false,
+  "candidate_pooling": "cls",
+  "train_batch_size": 3,
+  "dev_batch_size": 16,
+  "test_batch_size": 16,
+  "run_nbt": true
+}
\ No newline at end of file
diff --git a/convlab/dst/setsumbt/configs/sumbt_multiwoz21_10p.json b/convlab/dst/setsumbt/configs/sumbt_multiwoz21_10p.json
new file mode 100644
index 0000000000000000000000000000000000000000..f3e5454e34b4001b7dcab50975a15a75f58a44ba
--- /dev/null
+++ b/convlab/dst/setsumbt/configs/sumbt_multiwoz21_10p.json
@@ -0,0 +1,19 @@
+{
+  "model_type": "SetSUMBT",
+  "dataset": "multiwoz21",
+  "dataset_train_ratio": 0.1,
+  "no_action_prediction": true,
+  "model_type": "bert",
+  "model_name_or_path": "bert-base-uncased",
+  "candidate_embedding_model_name": "bert-base-uncased",
+  "transformers_local_files_only": false,
+  "no_set_similarity": false,
+  "candidate_pooling": "cls",
+  "num_train_epochs": 500,
+  "patience": 50,
+  "warmup_proportion": 0.02,
+  "train_batch_size": 3,
+  "dev_batch_size": 16,
+  "test_batch_size": 16,
+  "run_nbt": true
+}
\ No newline at end of file
diff --git a/convlab/dst/setsumbt/configs/sumbt_multiwoz21_1p.json b/convlab/dst/setsumbt/configs/sumbt_multiwoz21_1p.json
new file mode 100644
index 0000000000000000000000000000000000000000..45f684db422b695019c173ad670206129fee793c
--- /dev/null
+++ b/convlab/dst/setsumbt/configs/sumbt_multiwoz21_1p.json
@@ -0,0 +1,19 @@
+{
+  "model_type": "SetSUMBT",
+  "dataset": "multiwoz21",
+  "dataset_train_ratio": 0.01,
+  "no_action_prediction": true,
+  "model_type": "bert",
+  "model_name_or_path": "bert-base-uncased",
+  "candidate_embedding_model_name": "bert-base-uncased",
+  "transformers_local_files_only": false,
+  "no_set_similarity": false,
+  "candidate_pooling": "cls",
+  "num_train_epochs": 1000,
+  "patience": 100,
+  "warmup_proportion": 0.01,
+  "train_batch_size": 3,
+  "dev_batch_size": 16,
+  "test_batch_size": 16,
+  "run_nbt": true
+}
\ No newline at end of file
diff --git a/convlab/dst/setsumbt/configs/sumbt_pretrain.json b/convlab/dst/setsumbt/configs/sumbt_pretrain.json
new file mode 100644
index 0000000000000000000000000000000000000000..bf8a22d6a93be24daf63c4931c1323fcfefe7d12
--- /dev/null
+++ b/convlab/dst/setsumbt/configs/sumbt_pretrain.json
@@ -0,0 +1,14 @@
+{
+  "model_type": "SetSUMBT",
+  "dataset": "sgd+tm1+tm2+tm3",
+  "no_action_prediction": true,
+  "model_type": "bert",
+  "model_name_or_path": "bert-base-uncased",
+  "transformers_local_files_only": false,
+  "no_set_similarity": false,
+  "candidate_pooling": "cls",
+  "train_batch_size": 3,
+  "dev_batch_size": 12,
+  "test_batch_size": 12,
+  "run_nbt": true
+}
\ No newline at end of file
diff --git a/convlab/dst/setsumbt/configs/sumbt_sgd.json b/convlab/dst/setsumbt/configs/sumbt_sgd.json
new file mode 100644
index 0000000000000000000000000000000000000000..307bca67f2996e1ecf5a29eaf231804367964934
--- /dev/null
+++ b/convlab/dst/setsumbt/configs/sumbt_sgd.json
@@ -0,0 +1,14 @@
+{
+  "model_type": "SetSUMBT",
+  "dataset": "sgd",
+  "no_action_prediction": true,
+  "model_type": "bert",
+  "model_name_or_path": "bert-base-uncased",
+  "transformers_local_files_only": false,
+  "no_set_similarity": false,
+  "candidate_pooling": "cls",
+  "train_batch_size": 3,
+  "dev_batch_size": 6,
+  "test_batch_size": 3,
+  "run_nbt": true
+}
\ No newline at end of file
diff --git a/convlab/dst/setsumbt/configs/sumbt_tm.json b/convlab/dst/setsumbt/configs/sumbt_tm.json
new file mode 100644
index 0000000000000000000000000000000000000000..c1263fd384e37c0b708a8771e0412b2c8fbc9cd4
--- /dev/null
+++ b/convlab/dst/setsumbt/configs/sumbt_tm.json
@@ -0,0 +1,14 @@
+{
+  "model_type": "SetSUMBT",
+  "dataset": "tm1+tm2+tm3",
+  "no_action_prediction": true,
+  "model_type": "bert",
+  "model_name_or_path": "bert-base-uncased",
+  "transformers_local_files_only": false,
+  "no_set_similarity": false,
+  "candidate_pooling": "cls",
+  "train_batch_size": 3,
+  "dev_batch_size": 8,
+  "test_batch_size": 8,
+  "run_nbt": true
+}
\ No newline at end of file
diff --git a/convlab/dst/setsumbt/configure_ensemble.sh b/convlab/dst/setsumbt/configure_ensemble.sh
deleted file mode 100755
index fc33df5a6f7e0bdbfae103b5999d212a73f7f4a3..0000000000000000000000000000000000000000
--- a/convlab/dst/setsumbt/configure_ensemble.sh
+++ /dev/null
@@ -1,23 +0,0 @@
-#!/bin/bash
-
-ENSEMBLE_SIZE=10
-DATA_SIZE=7500
-SEED=$1
-OUT=$2
-
-python run.py --run_nbt \
-    --output_dir $OUT \
-    --use_descriptions --set_similarity \
-    --ensemble_size $ENSEMBLE_SIZE \
-    --data_sampling_size $DATA_SIZE \
-    --seed $SEED
-
-ENSEMBLE_SIZE=$(($ENSEMBLE_SIZE-1))
-for e in $(seq 0 $ENSEMBLE_SIZE);do
-    mkdir -p "$OUT/ensemble-$e/dataloaders"
-
-    mv "$OUT/ensemble-$e/train.dataloader" "$OUT/ensemble-$e/dataloaders/"
-    cp "$OUT/dataloaders/dev.dataloader" "$OUT/ensemble-$e/dataloaders/"
-    cp "$OUT/dataloaders/test.dataloader" "$OUT/ensemble-$e/dataloaders/"
-    cp -r $OUT/database "$OUT/ensemble-$e/"
-done
diff --git a/convlab/dst/setsumbt/configure_model.sh b/convlab/dst/setsumbt/configure_model.sh
deleted file mode 100755
index cea833bf9df741409f9c7fb28454493acc08cc49..0000000000000000000000000000000000000000
--- a/convlab/dst/setsumbt/configure_model.sh
+++ /dev/null
@@ -1,10 +0,0 @@
-#!/bin/bash
-IN=$1
-IN_DATA=$2
-OUT=$3
-
-mkdir -p $OUT
-cp "$IN/database/test.db" "$OUT/ontology.db"
-cp "$IN_DATA/ontology_test.db" "$OUT/ontology.json"
-cp "$IN/pytorch_model.bin" "$OUT/pytorch_model.bin"
-cp "$IN/config.json" "$OUT/config.json"
diff --git a/convlab/dst/setsumbt/distill_end.sh b/convlab/dst/setsumbt/distill_end.sh
deleted file mode 100755
index 76db8dfdb64204938c1e657b87562a4731cdf3a2..0000000000000000000000000000000000000000
--- a/convlab/dst/setsumbt/distill_end.sh
+++ /dev/null
@@ -1,39 +0,0 @@
-#!/bin/bash
-
-ENSEMBLE_SIZE=10
-SEED=$1
-OUT=$2
-
-ENSEMBLE_SIZE=$(($ENSEMBLE_SIZE-1))
-for e in $(seq 0 $ENSEMBLE_SIZE);do
-    cp "$OUT/ensemble-$e/pytorch_model.bin" "$OUT/pytorch_model_$e.bin"
-done
-cp "$OUT/ensemble-0/config.json" "$OUT/config.json"
-
-for SET in "train" "dev" "test";do
-    python distillation_setup.py --get_ensemble_distributions \
-        --model_path $OUT \
-        --model_type roberta \
-        --set_type $SET \
-        --ensemble_size $ENSEMBLE_SIZE \
-        --reduction mean
-done
-
-python distillation_setup.py --build_dataloaders \
-    --model_path $OUT \
-    --set_type train \
-    --batch_size 3
-
-for SET in "dev" "test";do
-    python distillation_setup.py --build_dataloaders \
-        --model_path $OUT \
-        --set_type $SET \
-        --batch_size 16
-done
-
-python run.py --run_nbt \
-    --output_dir $OUT \
-    --loss_function distillation \
-    --use_descriptions --set_similarity \
-    --do_train --do_eval \
-    --seed $SEED
diff --git a/convlab/dst/setsumbt/distill_end2.sh b/convlab/dst/setsumbt/distill_end2.sh
deleted file mode 100755
index 434375a2363a79d98f99a7eecd20d1dc44c4dbec..0000000000000000000000000000000000000000
--- a/convlab/dst/setsumbt/distill_end2.sh
+++ /dev/null
@@ -1,39 +0,0 @@
-#!/bin/bash
-
-ENSEMBLE_SIZE=10
-SEED=$1
-OUT=$2
-
-ENSEMBLE_SIZE=$(($ENSEMBLE_SIZE-1))
-for e in $(seq 0 $ENSEMBLE_SIZE);do
-    cp "$OUT/ensemble-$e/pytorch_model.bin" "$OUT/pytorch_model_$e.bin"
-done
-cp "$OUT/ensemble-0/config.json" "$OUT/config.json"
-
-for SET in "train" "dev" "test";do
-    python distillation_setup.py --get_ensemble_distributions \
-        --model_path $OUT \
-        --model_type roberta \
-        --set_type $SET \
-        --ensemble_size $ENSEMBLE_SIZE \
-        --reduction none
-done
-
-python distillation_setup.py --build_dataloaders \
-    --model_path $OUT \
-    --set_type train \
-    --batch_size 3
-
-for SET in "dev" "test";do
-    python distillation_setup.py --build_dataloaders \
-        --model_path $OUT \
-        --set_type $SET \
-        --batch_size 16
-done
-
-python run.py --run_nbt \
-    --output_dir $OUT \
-    --loss_function "distribution_distillation" \
-    --use_descriptions --set_similarity \
-    --do_train --do_eval \
-    --seed $SEED
diff --git a/convlab/dst/setsumbt/train_ensemble.sh b/convlab/dst/setsumbt/train_ensemble.sh
deleted file mode 100755
index 911f8baa93966a1c3030655de97a263f492e7a0d..0000000000000000000000000000000000000000
--- a/convlab/dst/setsumbt/train_ensemble.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-#!/bin/bash
-
-ENSEMBLE_SIZE=10
-SEED=$1
-OUT=$2
-
-ENSEMBLE_SIZE=$(($ENSEMBLE_SIZE-1))
-for e in $(seq 0 $ENSEMBLE_SIZE);do
-    python run.py --run_nbt \
-        --output_dir "$OUT/ensemble-$e" \
-        --use_descriptions --set_similarity \
-        --do_train --do_eval \
-        --seed $SEED
-done
diff --git a/convlab/dst/sumbt/.gitignore b/convlab/dst/sumbt/.gitignore
deleted file mode 100644
index 4629906dfe3c88b813e109039739c38a0a890228..0000000000000000000000000000000000000000
--- a/convlab/dst/sumbt/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-*/model_output/
diff --git a/convlab/dst/sumbt/BeliefTrackerSlotQueryMultiSlot.py b/convlab/dst/sumbt/BeliefTrackerSlotQueryMultiSlot.py
deleted file mode 100755
index 6897678bc5dbf8760738cd118b836d0c0bb4c355..0000000000000000000000000000000000000000
--- a/convlab/dst/sumbt/BeliefTrackerSlotQueryMultiSlot.py
+++ /dev/null
@@ -1,300 +0,0 @@
-import math
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-from torch.nn import CrossEntropyLoss
-from transformers import BertModel
-from transformers import BertPreTrainedModel
-
-
-class BertForUtteranceEncoding(BertPreTrainedModel):
-    def __init__(self, config):
-        super(BertForUtteranceEncoding, self).__init__(config)
-
-        self.config = config
-        self.bert = BertModel(config)
-
-    def forward(self, input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False):
-
-        return self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, encoder_hidden_states=output_all_encoded_layers)
-
-
-class MultiHeadAttention(nn.Module):
-    def __init__(self, heads, d_model, dropout=0.1):
-        super().__init__()
-
-        self.d_model = d_model
-        self.d_k = d_model // heads
-        self.h = heads
-
-        self.q_linear = nn.Linear(d_model, d_model)
-        self.v_linear = nn.Linear(d_model, d_model)
-        self.k_linear = nn.Linear(d_model, d_model)
-        self.dropout = nn.Dropout(dropout)
-        self.out = nn.Linear(d_model, d_model)
-
-        self.scores = None
-
-    def attention(self, q, k, v, d_k, mask=None, dropout=None):
-
-        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)
-
-        if mask is not None:
-            mask = mask.unsqueeze(1)
-            scores = scores.masked_fill(mask == 0, -1e9)
-        scores = F.softmax(scores, dim=-1)
-
-        if dropout is not None:
-            scores = dropout(scores)
-
-        self.scores = scores
-        output = torch.matmul(scores, v)
-        return output
-
-    def forward(self, q, k, v, mask=None):
-        bs = q.size(0)
-
-        # perform linear operation and split into h heads
-        k = self.k_linear(k).view(bs, -1, self.h, self.d_k)
-        q = self.q_linear(q).view(bs, -1, self.h, self.d_k)
-        v = self.v_linear(v).view(bs, -1, self.h, self.d_k)
-
-        # transpose to get dimensions bs * h * sl * d_model
-        k = k.transpose(1, 2)
-        q = q.transpose(1, 2)
-        v = v.transpose(1, 2)
-
-        scores = self.attention(q, k, v, self.d_k, mask, self.dropout)
-
-        # concatenate heads and put through final linear layer
-        concat = scores.transpose(1, 2).contiguous().view(bs, -1, self.d_model)
-        output = self.out(concat)
-        return output
-
-    def get_scores(self):
-        return self.scores
-
-
-class BeliefTracker(nn.Module):
-    def __init__(self, args, num_labels, device="cuda"):
-        super(BeliefTracker, self).__init__()
-
-        self.hidden_dim = args.hidden_dim
-        self.rnn_num_layers = args.num_rnn_layers
-        self.zero_init_rnn = args.zero_init_rnn
-        self.max_seq_length = args.max_seq_length
-        self.max_label_length = args.max_label_length
-        self.num_labels = num_labels
-        self.num_slots = len(num_labels)
-        self.attn_head = args.attn_head
-        self.device = device
-
-        ### Utterance Encoder
-        self.utterance_encoder = BertForUtteranceEncoding.from_pretrained(args.bert_model_name, cache_dir=args.bert_model_cache_dir)
-        self.utterance_encoder.train()
-        self.bert_output_dim = self.utterance_encoder.config.hidden_size
-        self.hidden_dropout_prob = self.utterance_encoder.config.hidden_dropout_prob
-        if args.fix_utterance_encoder:
-            for p in self.utterance_encoder.bert.pooler.parameters():
-                p.requires_grad = False
-
-        ### slot, slot-value Encoder (not trainable)
-        self.sv_encoder = BertForUtteranceEncoding.from_pretrained(args.bert_model_name, cache_dir=args.bert_model_cache_dir)
-        self.sv_encoder.train()
-        for p in self.sv_encoder.bert.parameters():
-            p.requires_grad = False
-
-        self.slot_lookup = nn.Embedding(self.num_slots, self.bert_output_dim)
-        self.value_lookup = nn.ModuleList([nn.Embedding(num_label, self.bert_output_dim) for num_label in num_labels])
-
-        ### Attention layer
-        self.attn = MultiHeadAttention(self.attn_head, self.bert_output_dim, dropout=0)
-
-        ### RNN Belief Tracker
-        self.nbt = None
-        if args.task_name.find("gru") != -1:
-            self.nbt = nn.GRU(input_size=self.bert_output_dim,
-                              hidden_size=self.hidden_dim,
-                              num_layers=self.rnn_num_layers,
-                              dropout=self.hidden_dropout_prob,
-                              batch_first=True)
-            self.init_parameter(self.nbt)
-        elif args.task_name.find("lstm") != -1:
-            self.nbt = nn.LSTM(input_size=self.bert_output_dim,
-                               hidden_size=self.hidden_dim,
-                               num_layers=self.rnn_num_layers,
-                               dropout=self.hidden_dropout_prob,
-                               batch_first=True)
-            self.init_parameter(self.nbt)
-        if not self.zero_init_rnn:
-            self.rnn_init_linear = nn.Sequential(
-                nn.Linear(self.bert_output_dim, self.hidden_dim),
-                nn.ReLU(),
-                nn.Dropout(self.hidden_dropout_prob)
-            )
-
-        self.linear = nn.Linear(self.hidden_dim, self.bert_output_dim)
-        self.layer_norm = nn.LayerNorm(self.bert_output_dim)
-
-        ### Measure
-        self.distance_metric = args.distance_metric
-        if self.distance_metric == "cosine":
-            self.metric = torch.nn.CosineSimilarity(dim=-1, eps=1e-08)
-        elif self.distance_metric == "euclidean":
-            self.metric = torch.nn.PairwiseDistance(p=2.0, eps=1e-06, keepdim=False)
-
-        ### Classifier
-        self.nll = CrossEntropyLoss(ignore_index=-1)
-
-        ### Etc.
-        self.dropout = nn.Dropout(self.hidden_dropout_prob)
-
-        # default evaluation mode
-        self.eval()
-
-    def initialize_slot_value_lookup(self, label_ids, slot_ids):
-
-        self.sv_encoder.eval()
-
-        # Slot encoding
-        slot_type_ids = torch.zeros(slot_ids.size(), dtype=torch.long).to(self.device)
-        slot_mask = slot_ids > 0
-
-        hid_slot, _ = self.sv_encoder(slot_ids.view(-1, self.max_label_length),
-                                      slot_type_ids.view(-1, self.max_label_length),
-                                      slot_mask.view(-1, self.max_label_length),
-                                      output_all_encoded_layers=False)
-        hid_slot = hid_slot[:, 0, :]
-        hid_slot = hid_slot.detach()
-        self.slot_lookup = nn.Embedding.from_pretrained(hid_slot, freeze=True)
-
-        for s, label_id in enumerate(label_ids):
-            label_type_ids = torch.zeros(label_id.size(), dtype=torch.long).to(self.device)
-            label_mask = label_id > 0
-            hid_label, _ = self.sv_encoder(label_id.view(-1, self.max_label_length),
-                                           label_type_ids.view(-1, self.max_label_length),
-                                           label_mask.view(-1, self.max_label_length),
-                                           output_all_encoded_layers=False)
-            hid_label = hid_label[:, 0, :]
-            hid_label = hid_label.detach()
-            self.value_lookup[s] = nn.Embedding.from_pretrained(hid_label, freeze=True)
-            self.value_lookup[s].padding_idx = -1
-
-        print("Complete initialization of slot and value lookup")
-
-    def _make_aux_tensors(self, ids, len):
-        token_type_ids = torch.zeros(ids.size(), dtype=torch.long).to(self.device)
-        for i in range(len.size(0)):
-            for j in range(len.size(1)):
-                if len[i, j, 0] == 0:  # padding
-                    break
-                elif len[i, j, 1] > 0:  # escape only text_a case
-                    start = len[i, j, 0]
-                    ending = len[i, j, 0] + len[i, j, 1]
-                    token_type_ids[i, j, start:ending] = 1
-        attention_mask = ids > 0
-        return token_type_ids, attention_mask
-
-    def forward(self, input_ids, input_len, labels, n_gpu=1, target_slot=None):
-
-        # if target_slot is not specified, output values corresponding all slot-types
-        if target_slot is None:
-            target_slot = list(range(0, self.num_slots))
-
-        ds = input_ids.size(0)  # dialog size
-        ts = input_ids.size(1)  # turn size
-        bs = ds * ts
-        slot_dim = len(target_slot)
-
-        # Utterance encoding
-        token_type_ids, attention_mask = self._make_aux_tensors(input_ids, input_len)
-
-        hidden, _ = self.utterance_encoder(input_ids.view(-1, self.max_seq_length),
-                                           token_type_ids.view(-1, self.max_seq_length),
-                                           attention_mask.view(-1, self.max_seq_length),
-                                           output_all_encoded_layers=False)
-        hidden = torch.mul(hidden, attention_mask.view(-1, self.max_seq_length, 1).expand(hidden.size()).float())
-        hidden = hidden.repeat(slot_dim, 1, 1)  # [(slot_dim*ds*ts), bert_seq, hid_size]
-
-        hid_slot = self.slot_lookup.weight[target_slot, :]  # Select target slot embedding
-        hid_slot = hid_slot.repeat(1, bs).view(bs * slot_dim, -1)  # [(slot_dim*ds*ts), bert_seq, hid_size]
-
-        # Attended utterance vector
-        hidden = self.attn(hid_slot, hidden, hidden,
-                           mask=attention_mask.view(-1, 1, self.max_seq_length).repeat(slot_dim, 1, 1))
-        hidden = hidden.squeeze()  # [slot_dim*ds*ts, bert_dim]
-        hidden = hidden.view(slot_dim, ds, ts, -1).view(-1, ts, self.bert_output_dim)
-
-        # NBT
-        if self.zero_init_rnn:
-            h = torch.zeros(self.rnn_num_layers, input_ids.shape[0] * slot_dim, self.hidden_dim).to(
-                self.device)  # [1, slot_dim*ds, hidden]
-        else:
-            h = hidden[:, 0, :].unsqueeze(0).repeat(self.rnn_num_layers, 1, 1)
-            h = self.rnn_init_linear(h)
-
-        if isinstance(self.nbt, nn.GRU):
-            rnn_out, _ = self.nbt(hidden, h)  # [slot_dim*ds, turn, hidden]
-        elif isinstance(self.nbt, nn.LSTM):
-            c = torch.zeros(self.rnn_num_layers, input_ids.shape[0] * slot_dim, self.hidden_dim).to(
-                self.device)  # [1, slot_dim*ds, hidden]
-            rnn_out, _ = self.nbt(hidden, (h, c))  # [slot_dim*ds, turn, hidden]
-        rnn_out = self.layer_norm(self.linear(self.dropout(rnn_out)))
-
-        hidden = rnn_out.view(slot_dim, ds, ts, -1)
-
-        # Label (slot-value) encoding
-        loss = 0
-        loss_slot = []
-        pred_slot = []
-        output = []
-        for s, slot_id in enumerate(target_slot):  ## note: target_slots are successive
-            # loss calculation
-            hid_label = self.value_lookup[slot_id].weight
-            num_slot_labels = hid_label.size(0)
-
-            _hid_label = hid_label.unsqueeze(0).unsqueeze(0).repeat(ds, ts, 1, 1).view(ds * ts * num_slot_labels, -1)
-            _hidden = hidden[s, :, :, :].unsqueeze(2).repeat(1, 1, num_slot_labels, 1).view(ds * ts * num_slot_labels,
-                                                                                            -1)
-            _dist = self.metric(_hid_label, _hidden).view(ds, ts, num_slot_labels)
-
-            if self.distance_metric == "euclidean":
-                _dist = -_dist
-            _, pred = torch.max(_dist, -1)
-            pred_slot.append(pred.view(ds, ts, 1))
-            output.append(_dist)
-
-            if labels is not None:
-                _loss = self.nll(_dist.view(ds * ts, -1), labels[:, :, s].view(-1))
-                loss_slot.append(_loss.item())
-                loss += _loss
-
-        if labels is None:
-            return output, torch.cat(pred_slot, 2)
-
-        # calculate joint accuracy
-        pred_slot = torch.cat(pred_slot, 2)
-        # print('pred slot:', pred_slot[0][0])
-        # print('labels:', labels[0][0])
-        accuracy = (pred_slot == labels).view(-1, slot_dim)
-        acc_slot = torch.sum(accuracy, 0).float() \
-                   / torch.sum(labels.view(-1, slot_dim) > -1, 0).float()
-        acc = sum(torch.sum(accuracy, 1) / slot_dim).float() \
-              / torch.sum(labels[:, :, 0].view(-1) > -1, 0).float()  # joint accuracy
-
-        if n_gpu == 1:
-            return loss, loss_slot, acc, acc_slot, pred_slot
-        else:
-            return loss.unsqueeze(0), None, acc.unsqueeze(0), acc_slot.unsqueeze(0), pred_slot.unsqueeze(0)
-
-    @staticmethod
-    def init_parameter(module):
-        if isinstance(module, nn.Linear):
-            torch.nn.init.xavier_normal_(module.weight)
-            torch.nn.init.constant_(module.bias, 0.0)
-        elif isinstance(module, nn.GRU) or isinstance(module, nn.LSTM):
-            torch.nn.init.xavier_normal_(module.weight_ih_l0)
-            torch.nn.init.xavier_normal_(module.weight_hh_l0)
-            torch.nn.init.constant_(module.bias_ih_l0, 0.0)
-            torch.nn.init.constant_(module.bias_hh_l0, 0.0)
diff --git a/convlab/dst/sumbt/README.md b/convlab/dst/sumbt/README.md
index 67d0a80ae67f417e93306d2808233bb967ddfcd2..7509ae2eed889bbc85f7eca3716d7dbfbc0beb6f 100755
--- a/convlab/dst/sumbt/README.md
+++ b/convlab/dst/sumbt/README.md
@@ -1,73 +1 @@
-# SUMBT on Multiwoz
-
-SUMBT (Slot-Utterance Matching Belief Tracker) is a belief tracking model that
-utilizes semantic similarity between dialogue utterances and slot-values
-, which is proposed by [Hwaran Lee et al., 2019](https://www.aclweb.org/anthology/P19-1546.pdf).
-
-The code derives from [github](https://github.com/SKTBrain/SUMBT). We modify it to support user DST. 
-
-## Usage
-
-
-### Train & Evaluate
-
-from Convlab root directory
-```python
-from convlab.dst.sumbt.multiwoz.sumbt import *
-m = SUMBTTracker()
-m.train()  # will train and output the model checkpoint in the output_path defined in 'sumbt_config.py' file
-# m.test(mode, model_path)  # where testset in ['dev', 'test'], respectively run evaluation on dev/test set of MultiWoz, model_path specify the model you want to evaluate with. will create 2 files containing evaluation metrics in the output_path defined in config file.
-
-```
-
-
-### Track
-from Convlab root directory
-```python
-from convlab.dst.sumbt.multiwoz.sumbt import *
-test_update() 
-```
-
-At the first run, the SumbtTracker will download a pre-trained model and save it into 'downloaded_model/' directory.
-
-## Data
-
-We use the multiwoz data.
-
-## Performance on Multiwoz
-
-`mode` determines the data we use: if mode=`usr`, use user utterances to train; if mode=`sys`, use system utterances to train.
-
-We evaluate the Joint accuracy and Slot accuracy on Multiwoz 2.0 validation and test set. 
-The accuracy on validation set are slightly higher than the results reported in the paper,
-because in the evaluation code all undefined values in ontology are set `none` but predictions 
-will always be wrong for all undefined domain-slots.  
-
-|   | Joint acc  | Slot acc    | Joint acc (Restaurant)  |  Slot acc (Restaurant)|
-| ----- | ----- | ------ | ------ | ----    |
-| dev     | 0.47 | 0.97 | 0.83 | 0.97  |
-| test    | 0.51 | 0.97 | 0.84 | 0.97
-
-## Model Structure
-
-SUMBT considers a domain-slot type (e.g., 'restaurant-food') as a query and finds the corresponding 
-slot-value in a pair of system-user utterances, under the assumption that the answer appear in the utterances.
-
-The model encodes domain-slot with a fixed BERT model and encodes utterances with another BERT 
-of which parameters are fine-tuned during training. A MultiHead attention layer is
-employed to capture slot-specific information, and the attention context vector is fed
-into an RNN to model the flow of dialogues.
-
-
-## Reference
-
-```
-@inproceedings{lee2019sumbt,
-  title={SUMBT: Slot-Utterance Matching for Universal and Scalable Belief Tracking},
-  author={Lee, Hwaran and Lee, Jinsik and Kim, Tae-Yoon},
-  booktitle={Proceedings of the 57th Conference of the Association for Computational Linguistics},
-  pages={5478--5483},
-  year={2019}
-}
-```
-
+See the [SetSUMBT code](https://github.com/ConvLab/ConvLab-3/blob/master/convlab/dst/setsumbt) for the updated SUMBT implementation.
\ No newline at end of file
diff --git a/convlab/dst/sumbt/__init__.py b/convlab/dst/sumbt/__init__.py
deleted file mode 100755
index 71cad5770be9eb7bf253646f060d84fbc024270a..0000000000000000000000000000000000000000
--- a/convlab/dst/sumbt/__init__.py
+++ /dev/null
@@ -1,2 +0,0 @@
-from convlab.dst.sumbt.BeliefTrackerSlotQueryMultiSlot import BeliefTracker
-
diff --git a/convlab/dst/sumbt/crosswoz_en/.gitignore b/convlab/dst/sumbt/crosswoz_en/.gitignore
deleted file mode 100644
index eb23bddd0762d03a8d0a3d25aaf033ef22948b43..0000000000000000000000000000000000000000
--- a/convlab/dst/sumbt/crosswoz_en/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-model_output/
-pre-trained/
diff --git a/convlab/dst/sumbt/crosswoz_en/__init__.py b/convlab/dst/sumbt/crosswoz_en/__init__.py
deleted file mode 100644
index 91fa08da403f338d95db032b621c74904e56f986..0000000000000000000000000000000000000000
--- a/convlab/dst/sumbt/crosswoz_en/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from convlab.dst.sumbt.crosswoz_en.sumbt import SUMBTTracker as SUMBT
diff --git a/convlab/dst/sumbt/crosswoz_en/convert_to_glue_format.py b/convlab/dst/sumbt/crosswoz_en/convert_to_glue_format.py
deleted file mode 100644
index bb56b27334cdec6ffe98a3ad21efdb29d93e64e9..0000000000000000000000000000000000000000
--- a/convlab/dst/sumbt/crosswoz_en/convert_to_glue_format.py
+++ /dev/null
@@ -1,139 +0,0 @@
-import json
-import zipfile
-from convlab.dst.sumbt.crosswoz_en.sumbt_config import *
-
-null = 'none'
-
-def trans_value(value):
-    trans = {
-        '': 'none',
-    }
-    value = value.strip()
-    value = trans.get(value, value)
-    value = value.replace('’', "'")
-    value = value.replace('‘', "'")
-    return value
-
-def convert_to_glue_format(data_dir, sumbt_dir):
-
-    if not os.path.isdir(os.path.join(sumbt_dir, args.tmp_data_dir)):
-        os.mkdir(os.path.join(sumbt_dir, args.tmp_data_dir))
-
-    ### Read ontology file
-    with open(os.path.join(data_dir, "ontology.json"), "r") as fp_ont:
-        data_ont = json.load(fp_ont)
-    ontology = {}
-    facilities = []
-    for domain_slot in data_ont:
-        domain, slot = domain_slot.split('-', 1)
-        if domain not in ontology:
-            ontology[domain] = {}
-        if slot.startswith('Hotel Facilities'):
-            facilities.append(slot.split(' - ')[1])
-        ontology[domain][slot] = set(map(str.lower, data_ont[domain_slot]))
-
-    ### Read woz logs and write to tsv files
-    tsv_filename = os.path.join(sumbt_dir, args.tmp_data_dir, "train.tsv")
-    print('tsv file: ', os.path.join(sumbt_dir, args.tmp_data_dir, "train.tsv"))
-    if os.path.exists(os.path.join(sumbt_dir, args.tmp_data_dir, "train.tsv")):
-        print('data has been processed!')
-        return 0
-    else:
-        print('processing data')
-
-    with open(os.path.join(sumbt_dir, args.tmp_data_dir, "train.tsv"), "w") as fp_train, \
-        open(os.path.join(sumbt_dir, args.tmp_data_dir, "dev.tsv"), "w") as fp_dev,      \
-        open(os.path.join(sumbt_dir, args.tmp_data_dir, "test.tsv"), "w") as fp_test:
-
-        fp_train.write('# Dialogue ID\tTurn Index\tUser Utterance\tSystem Response\t')
-        fp_dev.write('# Dialogue ID\tTurn Index\tUser Utterance\tSystem Response\t')
-        fp_test.write('# Dialogue ID\tTurn Index\tUser Utterance\tSystem Response\t')
-
-        for domain in sorted(ontology.keys()):
-            for slot in sorted(ontology[domain].keys()):
-                fp_train.write(f'{str(domain)}-{str(slot)}\t')
-                fp_dev.write(f'{str(domain)}-{str(slot)}\t')
-                fp_test.write(f'{str(domain)}-{str(slot)}\t')
-
-        fp_train.write('\n')
-        fp_dev.write('\n')
-        fp_test.write('\n')
-
-        # fp_data = open(os.path.join(SELF_DATA_DIR, "data.json"), "r")
-        # data = json.load(fp_data)
-
-        file_split = ['train', 'val', 'test']
-        fp = [fp_train, fp_dev, fp_test]
-
-        for split_type, split_fp in zip(file_split, fp):
-
-            zipfile_name = "{}.json.zip".format(split_type)
-            zip_fp = zipfile.ZipFile(os.path.join(data_dir, zipfile_name))
-            data = json.loads(str(zip_fp.read(zip_fp.namelist()[0]), 'utf-8'))
-
-            for file_id in data:
-                user_utterance = ''
-                system_response = ''
-                turn_idx = 0
-                messages = data[file_id]['messages']
-                for idx, turn in enumerate(messages):
-                    if idx % 2 == 0:        # user turn
-                        user_utterance = turn['content']
-                    else:                   # system turn
-                        user_utterance = user_utterance.replace('\t', ' ')
-                        user_utterance = user_utterance.replace('\n', ' ')
-                        user_utterance = user_utterance.replace('  ', ' ')
-
-                        system_response = system_response.replace('\t', ' ')
-                        system_response = system_response.replace('\n', ' ')
-                        system_response = system_response.replace('  ', ' ')
-
-                        split_fp.write(str(file_id))                   # 0: dialogue ID
-                        split_fp.write('\t' + str(turn_idx))           # 1: turn index
-                        split_fp.write('\t' + str(user_utterance))     # 2: user utterance
-                        split_fp.write('\t' + str(system_response))    # 3: system response
-
-                        # hardcode the value of facilities as 'yes' and 'no'
-                        belief = {f'Hotel-Hotel Facilities - {str(facility)}': null for facility in facilities}
-                        sys_state_init = turn['sys_state_init']
-                        for domain, slots in sys_state_init.items():
-                            for slot, value in slots.items():
-                                # skip selected results
-                                if isinstance(value, list):
-                                    continue
-                                if domain not in ontology:
-                                    print("domain (%s) is not defined" % domain)
-                                    continue
-
-                                if slot == 'Hotel Facilities':
-                                    for facility in value.split(','):
-                                        belief[f'{str(domain)}-Hotel Facilities - {str(facility)}'] = 'yes'
-                                else:
-                                    if slot not in ontology[domain]:
-                                        print("slot (%s) in domain (%s) is not defined" % (slot, domain))   # bus-arriveBy not defined
-                                        continue
-
-                                    value = trans_value(value).lower()
-
-                                    if value not in ontology[domain][slot] and value != null:
-                                        print("%s: value (%s) in domain (%s) slot (%s) is not defined in ontology" %
-                                            (file_id, value, domain, slot))
-                                        value = null
-
-                                    belief[f'{str(domain)}-{str(slot)}'] = value
-
-                        for domain in sorted(ontology.keys()):
-                            for slot in sorted(ontology[domain].keys()):
-                                key = str(domain) + '-' + str(slot)
-                                if key in belief:
-                                    val = belief[key]
-                                    split_fp.write('\t' + val)
-                                else:
-                                    split_fp.write(f'\t{null}')
-
-                        split_fp.write('\n')
-                        split_fp.flush()
-
-                        system_response = turn['content']
-                        turn_idx += 1
-    print('data has been processed!')
diff --git a/convlab/dst/sumbt/crosswoz_en/sumbt.py b/convlab/dst/sumbt/crosswoz_en/sumbt.py
deleted file mode 100644
index 8b0b5cc82c5588f591d97f1292fc6eb61f22d98d..0000000000000000000000000000000000000000
--- a/convlab/dst/sumbt/crosswoz_en/sumbt.py
+++ /dev/null
@@ -1,683 +0,0 @@
-import copy
-from pprint import pprint
-import random
-from itertools import chain
-import numpy as np
-import zipfile
-import os
-
-import torch
-import convlab
-
-from matplotlib import pyplot as plt
-
-from tensorboardX.writer import SummaryWriter
-from tqdm import trange, tqdm
-
-from convlab.util.file_util import cached_path
-
-from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
-
-from transformers import BertTokenizer
-from transformers import get_linear_schedule_with_warmup, AdamW
-
-from convlab.dst.dst import DST
-from convlab.dst.sumbt.crosswoz_en.convert_to_glue_format import convert_to_glue_format, trans_value
-from convlab.util.crosswoz_en.state import default_state
-
-from convlab.dst.sumbt.BeliefTrackerSlotQueryMultiSlot import BeliefTracker
-from convlab.dst.sumbt.crosswoz_en.sumbt_utils import *
-from convlab.dst.sumbt.crosswoz_en.sumbt_config import *
-
-from convlab.dst.sumbt.crosswoz_en.convert_to_glue_format import null
-
-USE_CUDA = torch.cuda.is_available()
-N_GPU = torch.cuda.device_count() if USE_CUDA else 1
-DEVICE = "cuda" if USE_CUDA else "cpu"
-ROOT_PATH = convlab.get_root_path()
-SUMBT_PATH = os.path.dirname(os.path.abspath(__file__))
-DATA_PATH = os.path.join(ROOT_PATH, 'data/crosswoz_en')
-DOWNLOAD_DIRECTORY = os.path.join(SUMBT_PATH, "pre-trained/")
-crosswoz_en_slot_list = ['Attraction-duration', 'Attraction-fee', 'Attraction-name', 'Attraction-nearby attract.', 'Attraction-nearby hotels', 'Attraction-nearby rest.', 'Attraction-rating', 'Hotel-Hotel Facilities - 24-hour Hot Water', 'Hotel-Hotel Facilities - Bar', 'Hotel-Hotel Facilities - Breakfast Service', 'Hotel-Hotel Facilities - Broadband Internet', 'Hotel-Hotel Facilities - Business Center', 'Hotel-Hotel Facilities - Car Rental', 'Hotel-Hotel Facilities - Chess-Poker Room', 'Hotel-Hotel Facilities - Childcare Services', 'Hotel-Hotel Facilities - Chinese Restaurant', 'Hotel-Hotel Facilities - Disabled Facilities', 'Hotel-Hotel Facilities - Foreign Guests Reception', 'Hotel-Hotel Facilities - Free Breakfast Service', 'Hotel-Hotel Facilities - Free Domestic Long Distance Call', 'Hotel-Hotel Facilities - Free Local Calls', 'Hotel-Hotel Facilities - Gym', 'Hotel-Hotel Facilities - Hair Dryer', 'Hotel-Hotel Facilities - Heating', 'Hotel-Hotel Facilities - Hot Spring', 'Hotel-Hotel Facilities - Indoor Swimming Pool', 'Hotel-Hotel Facilities - International Call', 'Hotel-Hotel Facilities - Laundry Service', 'Hotel-Hotel Facilities - Luggage Storage', 'Hotel-Hotel Facilities - Meeting Room', 'Hotel-Hotel Facilities - Non-smoking Room', 'Hotel-Hotel Facilities - Outdoor Swimming Pool', 'Hotel-Hotel Facilities - Pay Parking', 'Hotel-Hotel Facilities - Pick-up Service', 'Hotel-Hotel Facilities - SPA', 'Hotel-Hotel Facilities - Sauna', 'Hotel-Hotel Facilities - Wake Up Service', 'Hotel-Hotel Facilities - Western Restaurant', 'Hotel-Hotel Facilities - WiFi in All Rooms', 'Hotel-Hotel Facilities - WiFi in Public Areas', 'Hotel-Hotel Facilities - WiFi in Public Areas and Some Rooms', 'Hotel-Hotel Facilities - WiFi in Some Rooms', 'Hotel-Hotel Facilities - WiFi throughout the Hotel', 'Hotel-name', 'Hotel-nearby attract.', 'Hotel-nearby hotels', 'Hotel-nearby rest.', 'Hotel-price', 'Hotel-rating', 'Hotel-type', 'Metro-from', 'Metro-to', 'Restaurant-cost', 'Restaurant-dishes', 'Restaurant-name', 'Restaurant-nearby attract.', 'Restaurant-nearby hotels', 'Restaurant-nearby rest.', 'Restaurant-rating', 'Taxi-from', 'Taxi-to']
-
-def plot(x, y):
-    a, b = [], []
-    for x, y in sorted(zip(x, y)):
-        a.append(x)
-        b.append(y)
-    plt.plot(a, b)
-
-# def get_label_embedding(labels, max_seq_length, tokenizer, device):
-#     features = []
-#     for label in labels:
-#         label_tokens = ["[CLS]"] + tokenizer.tokenize(label) + ["[SEP]"]
-#         label_token_ids = tokenizer.convert_tokens_to_ids(label_tokens)
-#         label_len = len(label_token_ids)
-
-#         label_padding = [0] * (max_seq_length - len(label_token_ids))
-#         label_token_ids += label_padding
-#         assert len(label_token_ids) == max_seq_length
-
-#         features.append((label_token_ids, label_len))
-
-#     all_label_token_ids = torch.tensor([f[0] for f in features], dtype=torch.long).to(device)
-#     all_label_len = torch.tensor([f[1] for f in features], dtype=torch.long).to(device)
-
-#     return all_label_token_ids, all_label_len
-
-
-def _truncate_seq_pair(tokens_a, tokens_b, max_length):
-    """Truncates a sequence pair in place to the maximum length."""
-
-    # This is a simple heuristic which will always truncate the longer sequence
-    # one token at a time. This makes more sense than truncating an equal percent
-    # of tokens from each, since if one sequence is very short then each token
-    # that's truncated likely contains more information than a longer sequence.
-    while True:
-        total_length = len(tokens_a) + len(tokens_b)
-        if total_length <= max_length:
-            break
-        if len(tokens_a) > len(tokens_b):
-            tokens_a.pop()
-        else:
-            tokens_b.pop()
-
-
-class SUMBTTracker(DST):
-    """
-    Transferable multi-domain dialogue state tracker, adopted from https://github.com/SKTBrain/SUMBT
-    """
-
-    @staticmethod
-    def init_data():
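-        # Unpack the raw data archive once, then repackage each split as its own zip so the processor can read them.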
-        if not os.path.exists(os.path.join(DATA_PATH, 'train.json.zip')):
-            with zipfile.ZipFile(os.path.join(DATA_PATH, 'mt.zip')) as f:
-                f.extractall(DATA_PATH)
-
-        for split in ['train', 'test', 'val']:
-            with zipfile.ZipFile(os.path.join(DATA_PATH, f'{split}.json.zip'), 'w') as f:
-                f.write(os.path.join(DATA_PATH, f'{split}.json'), f'{split}.json')
-
-    def __init__(self, data_dir=DATA_PATH):
-
-        DST.__init__(self)
-
-        # if not os.path.exists(data_dir):
-        #     if model_file == '':
-        #         raise Exception(
-        #             'Please provide remote model file path in config')
-        #     resp = urllib.request.urlretrieve(model_file)[0]
-        #     temp_file = tarfile.open(resp)
-        #     temp_file.extractall('data')
-        #     assert os.path.exists(data_dir)
-
-        processor = Processor(args)
-        self.processor = processor
-        # values of each slot e.g. values_list
-        label_list = processor.get_labels()
-        num_labels = [len(labels) for labels in label_list]  # number of slot-values in each slot-type
-
-        # tokenizer
-        self.tokenizer = BertTokenizer.from_pretrained(args.bert_model_name, cache_dir=args.bert_model_cache_dir)
-        random.seed(args.seed)
-        np.random.seed(args.seed)
-        torch.manual_seed(args.seed)
-
-        self.device = torch.device("cuda" if USE_CUDA else "cpu")
-
-        self.sumbt_model = BeliefTracker(args, num_labels, self.device)
-        if USE_CUDA and N_GPU > 1:
-            self.sumbt_model = torch.nn.DataParallel(self.sumbt_model)
-        if args.fp16:
-            self.sumbt_model.half()
-        self.sumbt_model.to(self.device)
-
-        ## Get slot-value embeddings
-        self.label_token_ids, self.label_len = [], []
-        for labels in label_list:
-            # encoding values
-            token_ids, lens = get_label_embedding(labels, args.max_label_length, self.tokenizer, self.device)
-            self.label_token_ids.append(token_ids)
-            self.label_len.append(lens)
-        self.label_map = [{label: i for i, label in enumerate(labels)} for labels in label_list]
-        self.label_map_inv = [{i: label for i, label in enumerate(labels)} for labels in label_list]
-        self.label_list = label_list
-        self.target_slot = processor.target_slot
-        ## Get domain-slot-type embeddings
-        self.slot_token_ids, self.slot_len = \
-            get_label_embedding(processor.target_slot, args.max_label_length, self.tokenizer, self.device)
-
-        self.args = args
-        self.state = default_state()
-        self.param_restored = False
-        if USE_CUDA and N_GPU == 1:
-            self.sumbt_model.initialize_slot_value_lookup(self.label_token_ids, self.slot_token_ids)
-        elif USE_CUDA and N_GPU > 1:
-            self.sumbt_model.module.initialize_slot_value_lookup(self.label_token_ids, self.slot_token_ids)
-
-        self.cached_res = {}
-        convert_to_glue_format(DATA_PATH, SUMBT_PATH)
-        if not os.path.isdir(os.path.join(SUMBT_PATH, args.output_dir)):
-            os.makedirs(os.path.join(SUMBT_PATH, args.output_dir))
-        self.train_examples = processor.get_train_examples(os.path.join(SUMBT_PATH, args.tmp_data_dir), accumulation=False)
-        self.dev_examples = processor.get_dev_examples(os.path.join(SUMBT_PATH, args.tmp_data_dir), accumulation=False)
-        self.test_examples = processor.get_test_examples(os.path.join(SUMBT_PATH, args.tmp_data_dir), accumulation=False)
-
-    def load_weights(self, model_path=None):
-        if model_path is None:
-            model_ckpt = os.path.join(SUMBT_PATH, 'pre-trained/pytorch_model.bin')
-        else:
-            model_ckpt = model_path
-        model = self.sumbt_model
-        # handles the case where the slots and values differ between training and evaluation
-        if not USE_CUDA:
-            ptr_model = torch.load(model_ckpt, map_location=torch.device('cpu'))
-        else:
-            ptr_model = torch.load(model_ckpt)
-            print('loading pretrained weights')
-
-        if not USE_CUDA or N_GPU == 1:
-            state = model.state_dict()
-            state.update(ptr_model)
-            model.load_state_dict(state)
-        else:
-            # print("Evaluate using only one device!")
-            model.module.load_state_dict(ptr_model)
-
-        if USE_CUDA:
-            model.to("cuda")
-
-    def init_session(self):
-        self.state = default_state()
-        if not self.param_restored:
-            if os.path.isfile(os.path.join(DOWNLOAD_DIRECTORY, 'pytorch_model.bin')):
-                print('loading weights from downloaded model')
-                self.load_weights(model_path=os.path.join(DOWNLOAD_DIRECTORY, 'pytorch_model.bin'))
-            elif os.path.isfile(os.path.join(SUMBT_PATH, args.output_dir, 'pytorch_model.bin')):
-                print('loading weights from trained model')
-                self.load_weights(model_path=os.path.join(SUMBT_PATH, args.output_dir, 'pytorch_model.bin'))
-            else:
-                raise ValueError('no available weights found.')
-            self.param_restored = True
-
-    def construct_query(self, context):
-        '''Construct query from context'''
-        ids = []
-        lens = []
-        context_len = len(context)
-        if context[0][0] != 'sys':
-            context = [['sys', '']] + context
-        for i in range(0, context_len, 2):
-            # utt_user = ''
-            # utt_sys = ''
-            # for evaluation
-            utt_sys = context[i][1]
-            utt_user = context[i + 1][1]
-
-            tokens_user = [x if x != '#' else '[SEP]' for x in self.tokenizer.tokenize(utt_user)]
-            tokens_sys = [x if x != '#' else '[SEP]' for x in self.tokenizer.tokenize(utt_sys)]
-
-            _truncate_seq_pair(tokens_user, tokens_sys, self.args.max_seq_length - 3)
-            tokens = ["[CLS]"] + tokens_user + ["[SEP]"] + tokens_sys + ["[SEP]"]
-            input_len = [len(tokens_user) + 2, len(tokens_sys) + 1]
-
-            input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
-            padding = [0] * (self.args.max_seq_length - len(input_ids))
-            input_ids += padding
-            assert len(input_ids) == self.args.max_seq_length
-            ids.append(input_ids)
-            lens.append(input_len)
-
-        return (ids, lens)
-
-
-    def update(self, user_act=None):
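-        '''Update self.state['belief_state'] from the dialogue history tracked in self.state.'''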
-        if not isinstance(user_act, str):
-            raise Exception(
-                'Expected user_act is str but found {}'.format(type(user_act))
-            )
-        prev_state = self.state
-
-        actual_history = copy.deepcopy(prev_state['history'])
-
-        # if actual_history[-1][0] == 'user':
-        #     actual_history[-1][1] += user_act
-        # else:
-        #     actual_history.append(['user', user_act])
-        query = self.construct_query(actual_history)
-        pred_states = self.predict(query)
-
-        new_belief_state = copy.deepcopy(prev_state['belief_state'])
-        for domain_slot, value in pred_states:
-            domain, slot = domain_slot.split('-', 1)
-            value = trans_value(value)
-
-            # print(domain, slot, value)
-
-            if domain not in new_belief_state:
-                raise Exception(
-                    'Error: domain <{}> not in belief state'.format(domain))
-
-            domain_dic = new_belief_state[domain]
-            if slot in domain_dic:
-                domain_dic[slot] = value
-            else:
-                with open('sumbt_tracker_unknown_slot.log', 'a+') as f:
-                    f.write(
-                        'unknown slot name <{}> with value <{}> of domain <{}>\nitem: {}\n\n'.format(slot, value, domain, self.state)
-                    )
-
-        new_state = copy.deepcopy(dict(prev_state))
-        new_state['belief_state'] = new_belief_state
-        self.state = new_state
-        return self.state
-
-    def predict(self, query):
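-        # results are cached per flattened input-id sequence so identical dialogue contexts are not re-encoded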
-        cache_query_key = ''.join(str(list(chain.from_iterable(query[0]))))
-        if cache_query_key in self.cached_res.keys():
-            return self.cached_res[cache_query_key]
-
-        input_ids, input_len = query
-        input_ids = torch.tensor(input_ids).to(self.device).unsqueeze(0)
-        input_len = torch.tensor(input_len).to(self.device).unsqueeze(0)
-        labels = None
-        _, pred_slot = self.sumbt_model(input_ids, input_len, labels)
-        pred_slot_t = pred_slot[0][-1].tolist()
-        predict_belief = []
-        for idx, i in enumerate(pred_slot_t):
-            predict_belief.append((self.target_slot[idx], self.label_map_inv[idx][i]))
-            # predict_belief.append('{}-{}'.format(self.target_slot[idx], self.label_map_inv[idx][i]))
-        self.cached_res[cache_query_key] = predict_belief
-
-        return predict_belief
-
-    def train(self, load_model=False, model_path=None):
-        if load_model:
-            if model_path is not None:
-                self.load_weights(model_path)
-        ## Training utterances
-        all_input_ids, all_input_len, all_label_ids = convert_examples_to_features(
-            self.train_examples, self.label_list, args.max_seq_length, self.tokenizer, args.max_turn_length)
-
-        print('all input ids size: ', all_input_ids.size())
-        num_train_batches = all_input_ids.size(0)
-        num_train_steps = int(
-            num_train_batches / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)
-
-        logger.info("***** training *****")
-        logger.info("  Num examples = %d", len(self.train_examples))
-        logger.info("  Batch size = %d", args.train_batch_size)
-        logger.info("  Num steps = %d", num_train_steps)
-
-        all_input_ids, all_input_len, all_label_ids = all_input_ids.to(DEVICE), all_input_len.to(
-            DEVICE), all_label_ids.to(DEVICE)
-
-        train_data = TensorDataset(all_input_ids, all_input_len, all_label_ids)
-        train_sampler = RandomSampler(train_data)
-        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
-
-        all_input_ids_dev, all_input_len_dev, all_label_ids_dev = convert_examples_to_features(
-            self.dev_examples, self.label_list, args.max_seq_length, self.tokenizer, args.max_turn_length)
-
-        logger.info("***** validation *****")
-        logger.info("  Num examples = %d", len(self.dev_examples))
-        logger.info("  Batch size = %d", args.dev_batch_size)
-
-        all_input_ids_dev, all_input_len_dev, all_label_ids_dev = \
-            all_input_ids_dev.to(DEVICE), all_input_len_dev.to(DEVICE), all_label_ids_dev.to(DEVICE)
-
-        dev_data = TensorDataset(all_input_ids_dev, all_input_len_dev, all_label_ids_dev)
-        dev_sampler = SequentialSampler(dev_data)
-        dev_dataloader = DataLoader(dev_data, sampler=dev_sampler, batch_size=args.dev_batch_size)
-
-        logger.info("Loaded data!")
-
-        if args.fp16:
-            self.sumbt_model.half()
-        self.sumbt_model.to(DEVICE)
-
-        # ## Get domain-slot-type embeddings
-        # slot_token_ids, slot_len = \
-        #     get_label_embedding(self.processor.target_slot, args.max_label_length, self.tokenizer, DEVICE)
-
-        # # for slot_idx, slot_str in zip(slot_token_ids, self.processor.target_slot):
-        # #     self.idx2slot[slot_idx] = slot_str
-
-        # ## Get slot-value embeddings
-        # label_token_ids, label_len = [], []
-        # for slot_idx, labels in zip(slot_token_ids, self.label_list):
-        #     # self.idx2value[slot_idx] = {}
-        #     token_ids, lens = get_label_embedding(labels, args.max_label_length, self.tokenizer, DEVICE)
-        #     label_token_ids.append(token_ids)
-        #     label_len.append(lens)
-        #     # for label, token_id in zip(labels, token_ids):
-        #     #     self.idx2value[slot_idx][token_id] = label
-
-        # logger.info('embeddings prepared')
-
-        # if USE_CUDA and N_GPU > 1:
-        #     self.sumbt_model.module.initialize_slot_value_lookup(label_token_ids, slot_token_ids)
-        # else:
-        #     self.sumbt_model.initialize_slot_value_lookup(label_token_ids, slot_token_ids)
-
-        def get_optimizer_grouped_parameters(model):
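-            # exclude bias and LayerNorm parameters from weight decay, as is standard for BERT fine-tuning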
-            param_optimizer = [(n, p) for n, p in model.named_parameters() if p.requires_grad]
-            no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
-            optimizer_grouped_parameters = [
-                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01,
-                 'lr': args.learning_rate},
-                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0,
-                 'lr': args.learning_rate},
-            ]
-            return optimizer_grouped_parameters
-
-        if not USE_CUDA or N_GPU == 1:
-            optimizer_grouped_parameters = get_optimizer_grouped_parameters(self.sumbt_model)
-        else:
-            optimizer_grouped_parameters = get_optimizer_grouped_parameters(self.sumbt_model.module)
-
-        t_total = num_train_steps
-
-        if args.fp16:
-            try:
-                from apex.optimizers import FP16_Optimizer
-                from apex.optimizers import FusedAdam
-            except ImportError:
-                raise ImportError(
-                    "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
-
-            optimizer = FusedAdam(optimizer_grouped_parameters,
-                                  lr=args.learning_rate,
-                                  bias_correction=False,
-                                  max_grad_norm=1.0)
-            if args.fp16_loss_scale == 0:
-                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
-            else:
-                optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.fp16_loss_scale)
-            # no LR scheduler is used in the fp16 branch; define it so the checks in the training loop below do not fail
-            scheduler = None
-
-        else:
-            optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, correct_bias=False)
-            scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_proportion*t_total, num_training_steps=t_total)
-        logger.info(optimizer)
-
-        # Training code
-        ###############################################################################
-
-        print(torch.cuda.memory_allocated())
-
-        logger.info("Training...")
-
-        global_step = 0
-        last_update = None
-        best_loss = None
-        model = self.sumbt_model
-        if args.do_not_use_tensorboard:
-            summary_writer = None
-        else:
-            summary_writer = SummaryWriter("./tensorboard_summary/logs_1214/")
-
-        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
-            # Train
-            model.train()
-            tr_loss = 0
-            nb_tr_examples = 0
-            nb_tr_steps = 0
-
-            for step, batch in enumerate(tqdm(train_dataloader)):
-                batch = tuple(t.to(DEVICE) for t in batch)
-                input_ids, input_len, label_ids = batch
-                # print(input_ids.size())
-
-                # Forward
-                if N_GPU == 1:
-                    loss, loss_slot, acc, acc_slot, _ = model(input_ids, input_len, label_ids, N_GPU)
-                else:
-                    loss, _, acc, acc_slot, _ = model(input_ids, input_len, label_ids, N_GPU)
-
-                    # average to multi-gpus
-                    loss = loss.mean()
-                    acc = acc.mean()
-                    acc_slot = acc_slot.mean(0)
-
-                if args.gradient_accumulation_steps > 1:
-                    loss = loss / args.gradient_accumulation_steps
-
-                # Backward
-                if args.fp16:
-                    optimizer.backward(loss)
-                else:
-                    loss.backward()
-
-                # tensorboard logging
-                if summary_writer is not None:
-                    summary_writer.add_scalar("Epoch", epoch, global_step)
-                    summary_writer.add_scalar("Train/Loss", loss, global_step)
-                    summary_writer.add_scalar("Train/JointAcc", acc, global_step)
-                    if N_GPU == 1:
-                        for i, slot in enumerate(self.processor.target_slot):
-                            summary_writer.add_scalar("Train/Loss_%s" % slot.replace(' ', '_'), loss_slot[i],
-                                                      global_step)
-                            summary_writer.add_scalar("Train/Acc_%s" % slot.replace(' ', '_'), acc_slot[i], global_step)
-
-                tr_loss += loss.item()
-                nb_tr_examples += input_ids.size(0)
-                nb_tr_steps += 1
-                if (step + 1) % args.gradient_accumulation_steps == 0:
-                    # modify learning rate with special warm up BERT uses
-                    lr_this_step = args.learning_rate * warmup_linear(global_step / t_total, args.warmup_proportion)
-                    if summary_writer is not None:
-                        summary_writer.add_scalar("Train/LearningRate", lr_this_step, global_step)
-                    for param_group in optimizer.param_groups:
-                        param_group['lr'] = lr_this_step
-                    if scheduler is not None:
-                        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
-                    optimizer.step()
-                    if scheduler is not None:
-                        scheduler.step()
-                    optimizer.zero_grad()
-                    global_step += 1
-
-
-            # Perform evaluation on validation dataset
-            model.eval()
-            dev_loss = 0
-            dev_acc = 0
-            dev_loss_slot, dev_acc_slot = None, None
-            nb_dev_examples, nb_dev_steps = 0, 0
-
-            for step, batch in enumerate(tqdm(dev_dataloader, desc="Validation")):
-                batch = tuple(t.to(DEVICE) for t in batch)
-                input_ids, input_len, label_ids = batch
-                if input_ids.dim() == 2:
-                    input_ids = input_ids.unsqueeze(0)
-                    input_len = input_len.unsqueeze(0)
-                    label_ids = label_ids.unsqueeze(0)
-
-                with torch.no_grad():
-                    if N_GPU == 1:
-                        loss, loss_slot, acc, acc_slot, _ = model(input_ids, input_len, label_ids, N_GPU)
-                    else:
-                        loss, _, acc, acc_slot, _ = model(input_ids, input_len, label_ids, N_GPU)
-
-                        # average to multi-gpus
-                        loss = loss.mean()
-                        acc = acc.mean()
-                        acc_slot = acc_slot.mean(0)
-
-                num_valid_turn = torch.sum(label_ids[:, :, 0].view(-1) > -1, 0).item()
-                dev_loss += loss.item() * num_valid_turn
-                dev_acc += acc.item() * num_valid_turn
-
-                if N_GPU == 1:
-                    if dev_loss_slot is None:
-                        dev_loss_slot = [l * num_valid_turn for l in loss_slot]
-                        dev_acc_slot = acc_slot * num_valid_turn
-                    else:
-                        for i, l in enumerate(loss_slot):
-                            dev_loss_slot[i] = dev_loss_slot[i] + l * num_valid_turn
-                        dev_acc_slot += acc_slot * num_valid_turn
-
-                nb_dev_examples += num_valid_turn
-
-
-            dev_loss = dev_loss / nb_dev_examples
-            dev_acc = dev_acc / nb_dev_examples
-
-            if N_GPU == 1:
-                dev_acc_slot = dev_acc_slot / nb_dev_examples
-
-            # tensorboard logging
-            if summary_writer is not None:
-                summary_writer.add_scalar("Validate/Loss", dev_loss, global_step)
-                summary_writer.add_scalar("Validate/Acc", dev_acc, global_step)
-                if N_GPU == 1:
-                    for i, slot in enumerate(self.processor.target_slot):
-                        summary_writer.add_scalar("Validate/Loss_%s" % slot.replace(' ', '_'),
-                                                  dev_loss_slot[i] / nb_dev_examples, global_step)
-                        summary_writer.add_scalar("Validate/Acc_%s" % slot.replace(' ', '_'), dev_acc_slot[i],
-                                                  global_step)
-
-            dev_loss = round(dev_loss, 6)
-
-            output_model_file = os.path.join(os.path.join(SUMBT_PATH, args.output_dir), "pytorch_model.bin")
-
-            if last_update is None or dev_loss < best_loss:
-                last_update = epoch
-                best_loss = dev_loss
-                best_acc = dev_acc
-                if not USE_CUDA or N_GPU == 1:
-                    torch.save(model.state_dict(), output_model_file)
-                else:
-                    torch.save(model.module.state_dict(), output_model_file)
-
-                logger.info(
-                    "*** Model Updated: Epoch=%d, Validation Loss=%.6f, Validation Acc=%.6f, global_step=%d ***" % (
-                        last_update, best_loss, best_acc, global_step))
-            else:
-                logger.info(
-                    "*** Model NOT Updated: Epoch=%d, Validation Loss=%.6f, Validation Acc=%.6f, global_step=%d  ***" % (
-                        epoch, dev_loss, dev_acc, global_step))
-
-            if last_update + args.patience <= epoch:
-                break
-
-    def test(self, mode='dev', model_path=os.path.join(os.path.join(SUMBT_PATH, args.output_dir), "pytorch_model.bin")):
-        '''Evaluate the trained SUMBT model on the dev or test set.'''
-        # Evaluation
-        self.load_weights(model_path)
-
-        if mode == 'test':
-            eval_examples = self.test_examples
-        elif mode == 'dev':
-            eval_examples = self.dev_examples
-
-        all_input_ids, all_input_len, all_label_ids = convert_examples_to_features(
-            eval_examples, self.label_list, args.max_seq_length, self.tokenizer, args.max_turn_length)
-        all_input_ids, all_input_len, all_label_ids = all_input_ids.to(DEVICE), all_input_len.to(
-            DEVICE), all_label_ids.to(DEVICE)
-        logger.info("***** Running evaluation *****")
-        logger.info("  Num examples = %d", len(eval_examples))
-        logger.info("  Batch size = %d", args.dev_batch_size)
-
-        eval_data = TensorDataset(all_input_ids, all_input_len, all_label_ids)
-
-        # Run prediction for full data
-        eval_sampler = SequentialSampler(eval_data)
-        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.dev_batch_size)
-
-        model = self.sumbt_model
-        eval_loss, eval_accuracy = 0, 0
-        eval_loss_slot, eval_acc_slot = None, None
-        nb_eval_steps, nb_eval_examples = 0, 0
-
-        accuracies = {'joint7': 0, 'slot7': 0, 'joint5': 0, 'slot5': 0, 'joint_rest': 0, 'slot_rest': 0,
-                      'num_turn': 0, 'num_slot7': 0, 'num_slot5': 0, 'num_slot_rest': 0}
-
-        for input_ids, input_len, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
-            # if input_ids.dim() == 2:
-            #     input_ids = input_ids.unsqueeze(0)
-            #     input_len = input_len.unsqueeze(0)
-            #     label_ids = label_ids.unsuqeeze(0)
-
-            with torch.no_grad():
-                if not USE_CUDA or N_GPU == 1:
-                    loss, loss_slot, acc, acc_slot, pred_slot = model(input_ids, input_len, label_ids, 1)
-                else:
-                    loss, _, acc, acc_slot, pred_slot = model(input_ids, input_len, label_ids, N_GPU)
-                    nbatch = label_ids.size(0)
-                    nslot = pred_slot.size(3)
-                    pred_slot = pred_slot.view(nbatch, -1, nslot)
-
-            accuracies = eval_all_accs(pred_slot, label_ids, accuracies)
-
-            nb_eval_ex = (label_ids[:, :, 0].view(-1) != -1).sum().item()
-            nb_eval_examples += nb_eval_ex
-            nb_eval_steps += 1
-
-            if not USE_CUDA or N_GPU == 1:
-                eval_loss += loss.item() * nb_eval_ex
-                eval_accuracy += acc.item() * nb_eval_ex
-                if eval_loss_slot is None:
-                    eval_loss_slot = [l * nb_eval_ex for l in loss_slot]
-                    eval_acc_slot = acc_slot * nb_eval_ex
-                else:
-                    for i, l in enumerate(loss_slot):
-                        eval_loss_slot[i] = eval_loss_slot[i] + l * nb_eval_ex
-                    eval_acc_slot += acc_slot * nb_eval_ex
-            else:
-                eval_loss += sum(loss) * nb_eval_ex
-                eval_accuracy += sum(acc) * nb_eval_ex
-
-            # exit(1)
-
-        eval_loss = eval_loss / nb_eval_examples
-        eval_accuracy = eval_accuracy / nb_eval_examples
-        if not USE_CUDA or N_GPU == 1:
-            eval_acc_slot = eval_acc_slot / nb_eval_examples
-
-        loss = None
-
-        if not USE_CUDA or N_GPU == 1:
-            result = {
-                # 'num': '\t'.join([str(x) for x in model.num_labels]),
-                'eval_loss': eval_loss,
-                'eval_accuracy': eval_accuracy,
-                'loss': loss,
-                'eval_loss_slot': '\t'.join([str(val / nb_eval_examples) for val in eval_loss_slot]),
-                'eval_acc_slot': '\t'.join([str((val).item()) for val in eval_acc_slot]),
-            }
-        else:
-            result = {'eval_loss': eval_loss,
-                      'eval_accuracy': eval_accuracy,
-                      'loss': loss
-                      }
-
-        out_file_name = 'eval_results'
-        # if TARGET_SLOT == 'all':
-        #     out_file_name += '_all'
-        output_eval_file = os.path.join(os.path.join(SUMBT_PATH, args.output_dir), "%s.txt" % out_file_name)
-
-        if not USE_CUDA or N_GPU == 1:
-            with open(output_eval_file, "w") as writer:
-                logger.info("***** Eval results *****")
-                for key in sorted(result.keys()):
-                    logger.info("  %s = %s", key, str(result[key]))
-                    writer.write("%s = %s\n" % (key, str(result[key])))
-
-        out_file_name = 'eval_all_accuracies'
-        with open(os.path.join(os.path.join(SUMBT_PATH, args.output_dir), "%s.txt" % out_file_name), 'w') as f:
-            s = '{:^22s}:{:^22s}:{:^22s}:{:^22s}:{:^22s}:{:^22s}'.format(
-                'joint acc (7 domain)', 
-                'slot acc (7 domain)', 
-                'joint acc (5 domain)', 
-                'slot acc (5 domain)', 
-                'joint restaurant', 
-                'slot acc restaurant')
-            f.write(s + '\n')
-            print(s)
-            s = '{:^22.5f}:{:^22.5f}:{:^22.5f}:{:^22.5f}:{:^22.5f}:{:^22.5f}'.format(
-                (accuracies['joint7'] / accuracies['num_turn']).item(),
-                (accuracies['slot7'] / accuracies['num_slot7']).item(),
-                (accuracies['joint5'] / accuracies['num_turn']).item(),
-                (accuracies['slot5'] / accuracies['num_slot5']).item(),
-                (accuracies['joint_rest'] / accuracies['num_turn']).item(),
-                (accuracies['slot_rest'] / accuracies['num_slot_rest']).item()
-            )
-            f.write(s + '\n')
-            print(s)
diff --git a/convlab/dst/sumbt/crosswoz_en/sumbt_config.py b/convlab/dst/sumbt/crosswoz_en/sumbt_config.py
deleted file mode 100644
index a31551177b947697bb751799191af2ed6a25a703..0000000000000000000000000000000000000000
--- a/convlab/dst/sumbt/crosswoz_en/sumbt_config.py
+++ /dev/null
@@ -1,46 +0,0 @@
-import os
-import convlab
-class DotMap():
-    def __init__(self):
-        self.max_label_length = 35
-        self.num_rnn_layers = 1
-        self.zero_init_rnn = False
-        self.attn_head = 4
-        self.do_eval = True
-        self.do_train = False
-        self.train_batch_size = 3
-        self.dev_batch_size = 1
-        self.eval_batch_size = 16
-        self.learning_rate = 5e-5
-        self.warmup_proportion = 0.1
-        self.local_rank = -1
-        self.seed = 42
-        self.gradient_accumulation_steps = 1
-        self.fp16 = False
-        self.loss_scale = 0
-        self.do_not_use_tensorboard = False
-        self.fix_utterance_encoder = False
-        self.num_train_epochs = 300
-
-        self.bert_model = os.path.join(convlab.get_root_path(), "pre-trained-models/bert-base-uncased")
-        self.bert_model_cache_dir = os.path.join(convlab.get_root_path(), "pre-trained-models/")
-        self.bert_model_name = "bert-base-uncased"
-        self.do_lower_case = True
-        self.task_name = 'bert-gru-sumbt'
-        self.nbt = 'rnn'
-        self.target_slot = 'all'
-        self.distance_metric = 'euclidean'
-        self.patience = 15
-
-        self.hidden_dim = 300
-        self.max_seq_length = 35
-        self.max_turn_length = 23
-
-        self.fp16_loss_scale = 0.0
-        self.data_dir = 'data/crosswoz_en/'
-        self.tf_dir = 'tensorboard'
-        self.tmp_data_dir = 'processed_data/'
-        self.output_dir = 'model_output/'
-
-args = DotMap()
\ No newline at end of file
diff --git a/convlab/dst/sumbt/crosswoz_en/sumbt_utils.py b/convlab/dst/sumbt/crosswoz_en/sumbt_utils.py
deleted file mode 100644
index a67a291a4f8d9ef4d1cd6762742faaa151f109e3..0000000000000000000000000000000000000000
--- a/convlab/dst/sumbt/crosswoz_en/sumbt_utils.py
+++ /dev/null
@@ -1,449 +0,0 @@
-import csv
-import os
-import json
-import collections
-import logging
-import re
-import torch
-
-from convlab.dst.sumbt.crosswoz_en.convert_to_glue_format import null
-
-logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
-                    datefmt='%m/%d/%Y %H:%M:%S',
-                    level=logging.INFO)
-logger = logging.getLogger(__name__)
-
-
-class DataProcessor(object):
-    """Base class for data converters for sequence classification data sets."""
-
-    def get_train_examples(self, data_dir):
-        """Gets a collection of `InputExample`s for the train set."""
-        raise NotImplementedError()
-
-    def get_dev_examples(self, data_dir):
-        """Gets a collection of `InputExample`s for the dev set."""
-        raise NotImplementedError()
-
-    def get_labels(self):
-        """Gets the list of labels for this data set."""
-        raise NotImplementedError()
-
-    @classmethod
-    def _read_tsv(cls, input_file, quotechar=None):
-        """Reads a tab separated value file."""
-        with open(input_file, "r", encoding='utf-8') as f:
-            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
-            lines = []
-            for line in reader:
-                if len(line) > 0 and line[0][0] == '#':  # ignore comments (starting with '#')
-                    continue
-                lines.append(line)
-            return lines
-
-
-class Processor(DataProcessor):
-    """Processor for the belief tracking dataset (GLUE version)."""
-
-    def __init__(self, config):
-        super(Processor, self).__init__()
-
-        # crosswoz dataset
-        root_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))))
-        with open(os.path.join(root_dir, config.data_dir, "ontology.json"), "r") as fp_ontology:
-            ontology = json.load(fp_ontology)
-            for slot in ontology.keys():
-                ontology[slot].append(null)
-
-        assert config.target_slot == 'all'
-        # if not config.target_slot == 'all':
-        #     slot_idx = {'attraction': '0:1:2', 'bus': '3:4:5:6', 'hospital': '7',
-        #                 'hotel': '8:9:10:11:12:13:14:15:16:17', \
-        #                 'restaurant': '18:19:20:21:22:23:24', 'taxi': '25:26:27:28', 'train': '29:30:31:32:33:34'}
-        #     target_slot = []
-        #     for key, value in slot_idx.items():
-        #         if key != config.target_slot:
-        #             target_slot.append(value)
-        #     config.target_slot = ':'.join(target_slot)
-
-        # sorting the ontology according to the alphabetic order of the slots
-        ontology = collections.OrderedDict(sorted(ontology.items()))
-
-        # select slots to train
-        nslots = len(ontology.keys())
-        target_slot = list(ontology.keys())
-        if config.target_slot == 'all':
-            self.target_slot_idx = [*range(0, nslots)]
-        else:
-            self.target_slot_idx = sorted([int(x) for x in config.target_slot.split(':')])
-
-        for idx in range(0, nslots):
-            if idx not in self.target_slot_idx:
-                del ontology[target_slot[idx]]
-
-        self.ontology = ontology
-        self.target_slot = list(self.ontology.keys())
-        # for i, slot in enumerate(self.target_slot):
-        #     if slot == "pricerange":
-        #         self.target_slot[i] = "price range"
-        logger.info('Processor: target_slot')
-        logger.info(self.target_slot)
-
-    def get_train_examples(self, data_dir, accumulation=False):
-        """See base class."""
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train", accumulation)
-
-    def get_dev_examples(self, data_dir, accumulation=False):
-        """See base class."""
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev", accumulation)
-
-    def get_test_examples(self, data_dir, accumulation=False):
-        """See base class."""
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "test.tsv")), "test", accumulation)
-
-    def get_labels(self):
-        """See base class."""
-        return [list(map(str.lower, self.ontology[slot])) for slot in self.target_slot]
-
-    def _create_examples(self, lines, set_type, accumulation=False):
-        """Creates examples for the training and dev sets."""
-        prev_dialogue_index = None
-        examples = []
-        for (i, line) in enumerate(lines):
-            guid = "%s-%s-%s" % (set_type, line[0], line[1])  # line[0]: dialogue index, line[1]: turn index
-            if accumulation:
-                if prev_dialogue_index is None or prev_dialogue_index != line[0]:
-                    text_a = line[2]
-                    text_b = line[3]
-                    prev_dialogue_index = line[0]
-                else:
-                    # The symbol '#' will be replaced with '[SEP]' after tokenization.
-                    text_a = line[2] + " # " + text_a
-                    text_b = line[3] + " # " + text_b
-            else:
-                text_a = line[2]  # line[2]: user utterance
-                text_b = line[3]  # line[3]: system response
-
-            label = [line[4 + idx] for idx in self.target_slot_idx]
-
-            examples.append(
-                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
-        return examples
-
-
-def normalize_text(text):
-    global replacements
-    # lower case every word
-    text = text.lower()
-    # replace white spaces in front and end
-    text = re.sub(r'^\s*|\s*$', '', text)
-
-    # hotel domain pfb30
-    text = re.sub(r"b&b", "bed and breakfast", text)
-    text = re.sub(r"b and b", "bed and breakfast", text)
-
-    # replace st.
-    text = text.replace(';', ',')
-    text = re.sub('$\/', '', text)
-    text = text.replace('/', ' and ')
-
-    # replace other special characters
-    text = text.replace('-', ' ')
-    text = re.sub('[\"\<>@\(\)]', '', text)  # remove
-
-    # insert white space before and after tokens:
-    for token in ['?', '.', ',', '!']:
-        text = insertSpace(token, text)
-
-    # insert white space for 's
-    text = insertSpace('\'s', text)
-
-    # replace it's, does't, you'd ... etc
-    text = re.sub('^\'', '', text)
-    text = re.sub('\'$', '', text)
-    text = re.sub('\'\s', ' ', text)
-    text = re.sub('\s\'', ' ', text)
-    for fromx, tox in replacements:
-        text = ' ' + text + ' '
-        text = text.replace(fromx, tox)[1:-1]
-
-    # remove multiple spaces
-    text = re.sub(' +', ' ', text)
-
-    # concatenate numbers
-    tmp = text
-    tokens = text.split()
-    i = 1
-    while i < len(tokens):
-        if re.match(u'^\d+$', tokens[i]) and \
-                re.match(u'\d+$', tokens[i - 1]):
-            tokens[i - 1] += tokens[i]
-            del tokens[i]
-        else:
-            i += 1
-    text = ' '.join(tokens)
-
-    return text
-
-
-def insertSpace(token, text):
-    sidx = 0
-    while True:
-        sidx = text.find(token, sidx)
-        if sidx == -1:
-            break
-        if sidx + 1 < len(text) and re.match('[0-9]', text[sidx - 1]) and \
-                re.match('[0-9]', text[sidx + 1]):
-            sidx += 1
-            continue
-        if text[sidx - 1] != ' ':
-            text = text[:sidx] + ' ' + text[sidx:]
-            sidx += 1
-        if sidx + len(token) < len(text) and text[sidx + len(token)] != ' ':
-            text = text[:sidx + 1] + ' ' + text[sidx + 1:]
-        sidx += 1
-    return text
-
-# convert tokens in labels to the identifier in vocabulary
-def get_label_embedding(labels, max_seq_length, tokenizer, device):
-    features = []
-    for label in labels:
-        label_tokens = ["[CLS]"] + tokenizer.tokenize(label) + ["[SEP]"]
-        # just truncate; some names are unreasonably long
-        label_token_ids = tokenizer.convert_tokens_to_ids(label_tokens)[:max_seq_length]    
-        label_len = len(label_token_ids)
-
-        label_padding = [0] * (max_seq_length - len(label_token_ids))
-        label_token_ids += label_padding
-        assert len(label_token_ids) == max_seq_length
-
-        features.append((label_token_ids, label_len))
-
-    all_label_token_ids = torch.tensor([f[0] for f in features], dtype=torch.long).to(device)
-    all_label_len = torch.tensor([f[1] for f in features], dtype=torch.long).to(device)
-
-    return all_label_token_ids, all_label_len
-
-
-def warmup_linear(x, warmup=0.002):
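-    # linear warm-up from 0 to 1 over the first `warmup` fraction of training, then linear decay towards 0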
-    if x < warmup:
-        return x / warmup
-    return 1.0 - x
-
-
-def _truncate_seq_pair(tokens_a, tokens_b, max_length):
-    """Truncates a sequence pair in place to the maximum length."""
-
-    # This is a simple heuristic which will always truncate the longer sequence
-    # one token at a time. This makes more sense than truncating an equal percent
-    # of tokens from each, since if one sequence is very short then each token
-    # that's truncated likely contains more information than a longer sequence.
-    while True:
-        total_length = len(tokens_a) + len(tokens_b)
-        if total_length <= max_length:
-            break
-        if len(tokens_a) > len(tokens_b):
-            tokens_a.pop()
-        else:
-            tokens_b.pop()
-
-
-class InputExample(object):
-    """A single training/test example for simple sequence classification."""
-
-    def __init__(self, guid, text_a, text_b=None, label=None):
-        """Constructs a InputExample.
-
-        Args:
-            guid: Unique id for the example.
-            text_a: string. The untokenized text of the first sequence. For single
-            sequence tasks, only this sequence must be specified.
-            text_b: (Optional) string. The untokenized text of the second sequence.
-            Only must be specified for sequence pair tasks.
-            label: (Optional) string. The label of the example. This should be
-            specified for train and dev examples, but not for test examples.
-        """
-        self.guid = guid
-        self.text_a = text_a
-        self.text_b = text_b
-        self.label = label
-
-
-class InputFeatures(object):
-    """A single set of features of data."""
-
-    def __init__(self, input_ids, input_len, label_id):
-        self.input_ids = input_ids
-        self.input_len = input_len
-        self.label_id = label_id
-
-
-def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer, max_turn_length):
-    """Loads a data file into a list of `InputBatch`s."""
-
-    label_map = [{label: i for i, label in enumerate(labels)} for labels in label_list]
-    slot_dim = len(label_list)
-
-    features = []
-    prev_dialogue_idx = None
-    all_padding = [0] * max_seq_length
-    all_padding_len = [0, 0]
-
-    max_turn = 0
-    for (ex_index, example) in enumerate(examples):
-        if max_turn < int(example.guid.split('-')[2]):
-            max_turn = int(example.guid.split('-')[2])
-    max_turn_length = min(max_turn + 1, max_turn_length)
-    logger.info("max_turn_length = %d" % max_turn)
-
-    for (ex_index, example) in enumerate(examples):
-        tokens_a = [x if x != '#' else '[SEP]' for x in tokenizer.tokenize(example.text_a)]
-        tokens_b = None
-        if example.text_b:
-            tokens_b = [x if x != '#' else '[SEP]' for x in tokenizer.tokenize(example.text_b)]
-            # Modifies `tokens_a` and `tokens_b` in place so that the total
-            # length is less than the specified length.
-            # Account for [CLS], [SEP], [SEP] with "- 3"
-            _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
-        else:
-            # Account for [CLS] and [SEP] with "- 2"
-            if len(tokens_a) > max_seq_length - 2:
-                tokens_a = tokens_a[:(max_seq_length - 2)]
-
-        # The convention in BERT is:
-        # (a) For sequence pairs:
-        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
-        #  type_ids: 0   0  0    0    0     0       0 0    1  1  1  1   1 1
-        # (b) For single sequences:
-        #  tokens:   [CLS] the dog is hairy . [SEP]
-        #  type_ids: 0   0   0   0  0     0 0
-        #
-        # Where "type_ids" are used to indicate whether this is the first
-        # sequence or the second sequence. The embedding vectors for `type=0` and
-        # `type=1` were learned during pre-training and are added to the wordpiece
-        # embedding vector (and position vector). This is not *strictly* necessary
-        # since the [SEP] token unambiguously separates the sequences, but it makes
-        # it easier for the model to learn the concept of sequences.
-        #
-        # For classification tasks, the first vector (corresponding to [CLS]) is
-        # used as the "sentence vector". Note that this only makes sense because
-        # the entire model is fine-tuned.
-
-        tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
-        input_len = [len(tokens), 0]
-
-        if tokens_b:
-            tokens += tokens_b + ["[SEP]"]
-            input_len[1] = len(tokens_b) + 1
-
-        input_ids = tokenizer.convert_tokens_to_ids(tokens)
-
-        # Zero-pad up to the sequence length.
-        padding = [0] * (max_seq_length - len(input_ids))
-        input_ids += padding
-        assert len(input_ids) == max_seq_length
-
-        FLAG_TEST = False
-        if example.label is not None:
-            label_id = []
-            label_info = 'label: '
-            for i, label in enumerate(example.label):
-                if label == 'dontcare':
-                    label = 'do not care'
-                label_id.append(label_map[i][label])
-                label_info += '%s (id = %d) ' % (label, label_map[i][label])
-
-            if ex_index < 5:
-                logger.info("*** Example ***")
-                logger.info("guid: %s" % example.guid)
-                logger.info("tokens: %s" % " ".join(
-                    [str(x) for x in tokens]))
-                logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
-                logger.info("input_len: %s" % " ".join([str(x) for x in input_len]))
-                logger.info("label: " + label_info)
-        else:
-            FLAG_TEST = True
-            label_id = None
-
-        curr_dialogue_idx = example.guid.split('-')[1]
-        curr_turn_idx = int(example.guid.split('-')[2])
-
-        if prev_dialogue_idx is not None and prev_dialogue_idx != curr_dialogue_idx:
-            if prev_turn_idx < max_turn_length:
-                features += [InputFeatures(input_ids=all_padding,
-                                           input_len=all_padding_len,
-                                           label_id=[-1] * slot_dim)] \
-                            * (max_turn_length - prev_turn_idx - 1)
-            # print(len(features), max_turn_length)
-            assert len(features) % max_turn_length == 0
-
-        if prev_dialogue_idx is None or prev_turn_idx < max_turn_length:
-            features.append(
-                InputFeatures(input_ids=input_ids,
-                              input_len=input_len,
-                              label_id=label_id))
-
-        prev_dialogue_idx = curr_dialogue_idx
-        prev_turn_idx = curr_turn_idx
-
-    if prev_turn_idx < max_turn_length:
-        features += [InputFeatures(input_ids=all_padding,
-                                   input_len=all_padding_len,
-                                   label_id=[-1] * slot_dim)] \
-                    * (max_turn_length - prev_turn_idx - 1)
-    assert len(features) % max_turn_length == 0
-
-    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
-    all_input_len = torch.tensor([f.input_len for f in features], dtype=torch.long)
-    if not FLAG_TEST:
-        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
-
-    # reshape tensors to [#batch, #max_turn_length, #max_seq_length]
-    all_input_ids = all_input_ids.view(-1, max_turn_length, max_seq_length)
-    all_input_len = all_input_len.view(-1, max_turn_length, 2)
-    if not FLAG_TEST:
-        all_label_ids = all_label_ids.view(-1, max_turn_length, slot_dim)
-    else:
-        all_label_ids = None
-
-    return all_input_ids, all_input_len, all_label_ids
-
-
-def eval_all_accs(pred_slot, labels, accuracies):
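-    # accumulate joint and per-slot accuracy counts over all slots, over the restaurant slots only, and over the 5-domain subset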
-
-    def _eval_acc(_pred_slot, _labels):
-        slot_dim = _labels.size(-1)
-        accuracy = (_pred_slot == _labels).view(-1, slot_dim)
-        num_turn = torch.sum(_labels[:, :, 0].view(-1) > -1, 0).float()
-        num_data = torch.sum(_labels > -1).float()
-        # joint accuracy
-        joint_acc = sum(torch.sum(accuracy, 1) / slot_dim).float()
-        # slot accuracy
-        slot_acc = torch.sum(accuracy).float()
-        return joint_acc, slot_acc, num_turn, num_data
-
-    # 7 domains
-    joint_acc, slot_acc, num_turn, num_data = _eval_acc(pred_slot, labels)
-    accuracies['joint7'] += joint_acc
-    accuracies['slot7'] += slot_acc
-    accuracies['num_turn'] += num_turn
-    accuracies['num_slot7'] += num_data
-
-    # restaurant domain
-    joint_acc, slot_acc, num_turn, num_data = _eval_acc(pred_slot[:,:,18:25], labels[:,:,18:25])
-    accuracies['joint_rest'] += joint_acc
-    accuracies['slot_rest'] += slot_acc
-    accuracies['num_slot_rest'] += num_data
-
-    pred_slot5 = torch.cat((pred_slot[:,:,0:3], pred_slot[:,:,8:]), 2)
-    label_slot5 = torch.cat((labels[:,:,0:3], labels[:,:,8:]), 2)
-
-    # 5 domains (excluding bus and hotel domain)
-    joint_acc, slot_acc, num_turn, num_data = _eval_acc(pred_slot5, label_slot5)
-    accuracies['joint5'] += joint_acc
-    accuracies['slot5'] += slot_acc
-    accuracies['num_slot5'] += num_data
-
-    return accuracies
diff --git a/convlab/dst/sumbt/multiwoz/__init__.py b/convlab/dst/sumbt/multiwoz/__init__.py
deleted file mode 100755
index 2072ea48e1cb4f7c682c4d5dbf9be152db9ad4d0..0000000000000000000000000000000000000000
--- a/convlab/dst/sumbt/multiwoz/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from convlab.dst.sumbt.multiwoz.sumbt import SUMBTTracker as SUMBT
diff --git a/convlab/dst/sumbt/multiwoz/convert_to_glue_format.py b/convlab/dst/sumbt/multiwoz/convert_to_glue_format.py
deleted file mode 100755
index f6645d11ccc564bb13bb251d6b1b01291ef04b34..0000000000000000000000000000000000000000
--- a/convlab/dst/sumbt/multiwoz/convert_to_glue_format.py
+++ /dev/null
@@ -1,167 +0,0 @@
-import json
-import zipfile
-from convlab.dst.sumbt.multiwoz.sumbt_config import *
-
-
-def convert_to_glue_format(data_dir, sumbt_dir):
-
-    if not os.path.isdir(os.path.join(sumbt_dir, args.tmp_data_dir)):
-        os.mkdir(os.path.join(sumbt_dir, args.tmp_data_dir))
-
-    ### Read ontology file
-    fp_ont = open(os.path.join(data_dir, "ontology_sumbt.json"), "r")
-    data_ont = json.load(fp_ont)
-    ontology = {}
-    for domain_slot in data_ont:
-        domain, slot = domain_slot.split('-')
-        if domain not in ontology:
-            ontology[domain] = {}
-        ontology[domain][slot] = {}
-        for value in data_ont[domain_slot]:
-            ontology[domain][slot][value] = 1
-    fp_ont.close()
-
-    ### Read woz logs and write to tsv files
-    if os.path.exists(os.path.join(sumbt_dir, args.tmp_data_dir, "train.tsv")):
-        print('data has been processed!')
-        return 0
-
-    fp_train = open(os.path.join(sumbt_dir, args.tmp_data_dir, "train.tsv"), "w")
-    fp_dev = open(os.path.join(sumbt_dir, args.tmp_data_dir, "dev.tsv"), "w")
-    fp_test = open(os.path.join(sumbt_dir, args.tmp_data_dir, "test.tsv"), "w")
-
-    fp_train.write('# Dialogue ID\tTurn Index\tUser Utterance\tSystem Response\t')
-    fp_dev.write('# Dialogue ID\tTurn Index\tUser Utterance\tSystem Response\t')
-    fp_test.write('# Dialogue ID\tTurn Index\tUser Utterance\tSystem Response\t')
-
-    for domain in sorted(ontology.keys()):
-        for slot in sorted(ontology[domain].keys()):
-            fp_train.write(str(domain) + '-' + str(slot) + '\t')
-            fp_dev.write(str(domain) + '-' + str(slot) + '\t')
-            fp_test.write(str(domain) + '-' + str(slot) + '\t')
-
-    fp_train.write('\n')
-    fp_dev.write('\n')
-    fp_test.write('\n')
-
-    # fp_data = open(os.path.join(SELF_DATA_DIR, "data.json"), "r")
-    # data = json.load(fp_data)
-
-    file_split = ['train', 'val', 'test']
-    fp = [fp_train, fp_dev, fp_test]
-
-    for split_type, split_fp in zip(file_split, fp):
-
-        zipfile_name = "{}.json.zip".format(split_type)
-        zip_fp = zipfile.ZipFile(os.path.join(data_dir, zipfile_name))
-        data = json.loads(str(zip_fp.read(zip_fp.namelist()[0]), 'utf-8'))
-
-        for file_id in data:
-            user_utterance = ''
-            system_response = ''
-            turn_idx = 0
-            for idx, turn in enumerate(data[file_id]['log']):
-                if idx % 2 == 0:        # user turn
-                    user_utterance = data[file_id]['log'][idx]['text']
-                else:                   # system turn
-                    user_utterance = user_utterance.replace('\t', ' ')
-                    user_utterance = user_utterance.replace('\n', ' ')
-                    user_utterance = user_utterance.replace('  ', ' ')
-
-                    system_response = system_response.replace('\t', ' ')
-                    system_response = system_response.replace('\n', ' ')
-                    system_response = system_response.replace('  ', ' ')
-
-                    split_fp.write(str(file_id))                   # 0: dialogue ID
-                    split_fp.write('\t' + str(turn_idx))           # 1: turn index
-                    split_fp.write('\t' + str(user_utterance))     # 2: user utterance
-                    split_fp.write('\t' + str(system_response))    # 3: system response
-
-                    belief = {}
-                    for domain in data[file_id]['log'][idx]['metadata'].keys():
-                        for slot in data[file_id]['log'][idx]['metadata'][domain]['semi'].keys():
-                            value = data[file_id]['log'][idx]['metadata'][domain]['semi'][slot].strip()
-                            value = value.lower()
-                            if value == '' or value == 'not mentioned' or value == 'not given':
-                                value = 'none'
-
-                            if slot == "leaveAt" and domain != "bus":
-                                slot = "leave at"
-                            elif slot == "arriveBy" and domain != "bus":
-                                slot = "arrive by"
-                            elif slot == "pricerange":
-                                slot = "price range"
-
-                            if value == "doesn't care" or value == "don't care" or value == "dont care" or value == "does not care" or value == 'dontcare':
-                                value = "do not care"
-                            elif value == "guesthouse" or value == "guesthouses":
-                                value = "guest house"
-                            elif value == "city center" or value == "town centre" or value == "town center" or \
-                                    value == "centre of town" or value == "center" or value == "center of town":
-                                value = "centre"
-                            elif value == "west part of town":
-                                value = "west"
-                            elif value == "mutliple sports":
-                                value = "multiple sports"
-                            elif value == "swimmingpool":
-                                value = "swimming pool"
-                            elif value == "concerthall":
-                                value = "concert hall"
-
-                            if domain not in ontology:
-                                # print("domain (%s) is not defined" % domain)
-                                continue
-
-                            if slot not in ontology[domain]:
-                                # print("slot (%s) in domain (%s) is not defined" % (slot, domain))   # bus-arriveBy not defined
-                                continue
-
-                            if value not in ontology[domain][slot] and value != 'none':
-                                # print("%s: value (%s) in domain (%s) slot (%s) is not defined in ontology" %
-                                #       (file_id, value, domain, slot))
-                                value = 'none'
-
-                            belief[str(domain) + '-' + str(slot)] = value
-
-                        for slot in data[file_id]['log'][idx]['metadata'][domain]['book'].keys():
-                            if slot == 'booked':
-                                continue
-                            if domain == 'bus' and slot == 'people':
-                                continue    # not defined in ontology
-
-                            value = data[file_id]['log'][idx]['metadata'][domain]['book'][slot].strip()
-                            value = value.lower()
-
-                            if value == '' or value == 'not mentioned' or value == 'not given':
-                                value = 'none'
-                            elif value == "doesn't care" or value == "don't care" or value == "dont care" or value == "does not care" or value == 'dontcare':
-                                value = "do not care"
-
-                            if str('book ' + slot) not in ontology[domain]:
-                                # print("book %s is not defined in domain %s" % (slot, domain))
-                                continue
-
-                            if value not in ontology[domain]['book ' + slot] and value != 'none':
-                                # print("%s: value (%s) in domain (%s) slot (book %s) is not defined in ontology" %
-                                #       (file_id, value, domain, slot))
-                                value = 'none'
-
-                            belief[str(domain) + '-book ' + str(slot)] = value
-
-                    for domain in sorted(ontology.keys()):
-                        for slot in sorted(ontology[domain].keys()):
-                            key = str(domain) + '-' + str(slot)
-                            if key in belief:
-                                split_fp.write('\t' + belief[key])
-                            else:
-                                split_fp.write('\tnone')
-
-                    split_fp.write('\n')
-                    split_fp.flush()
-
-                    system_response = data[file_id]['log'][idx]['text']
-                    turn_idx += 1
-
-    fp_train.close()
-    fp_dev.close()
-    fp_test.close()
\ No newline at end of file
diff --git a/convlab/dst/sumbt/multiwoz/sumbt.py b/convlab/dst/sumbt/multiwoz/sumbt.py
deleted file mode 100755
index 506ea358018d451f6a76d003f50272c0b89fab5d..0000000000000000000000000000000000000000
--- a/convlab/dst/sumbt/multiwoz/sumbt.py
+++ /dev/null
@@ -1,818 +0,0 @@
-import copy
-from pprint import pprint
-import random
-from itertools import chain
-import numpy as np
-import zipfile
-
-from tensorboardX.writer import SummaryWriter
-from tqdm._tqdm import trange, tqdm
-
-from convlab.util.file_util import cached_path
-
-from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
-
-from transformers import BertTokenizer
-from transformers import get_linear_schedule_with_warmup, AdamW
-
-from convlab.dst.dst import DST
-from convlab.dst.sumbt.multiwoz.convert_to_glue_format import convert_to_glue_format
-from convlab.util.multiwoz.state import default_state
-from convlab.dst.sumbt.BeliefTrackerSlotQueryMultiSlot import BeliefTracker
-from convlab.dst.sumbt.multiwoz.sumbt_utils import *
-from convlab.dst.sumbt.multiwoz.sumbt_config import *
-from convlab.util.multiwoz.multiwoz_slot_trans import REF_SYS_DA, REF_USR_DA
-
-USE_CUDA = torch.cuda.is_available()
-N_GPU = torch.cuda.device_count() if USE_CUDA else 1
-DEVICE = "cuda" if USE_CUDA else "cpu"
-ROOT_PATH = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))))
-SUMBT_PATH = os.path.dirname(os.path.abspath(__file__))
-DATA_PATH = os.path.join(ROOT_PATH, 'data/multiwoz')
-DOWNLOAD_DIRECTORY = os.path.join(SUMBT_PATH, 'downloaded_model')
-multiwoz_slot_list = ['attraction-area', 'attraction-name', 'attraction-type', 'hotel-day', 'hotel-people', 'hotel-stay', 'hotel-area', 'hotel-internet', 'hotel-name', 'hotel-parking', 'hotel-pricerange', 'hotel-stars', 'hotel-type', 'restaurant-day', 'restaurant-people', 'restaurant-time', 'restaurant-area', 'restaurant-food', 'restaurant-name', 'restaurant-pricerange', 'taxi-arriveby', 'taxi-departure', 'taxi-destination', 'taxi-leaveat', 'train-people', 'train-arriveby', 'train-day', 'train-departure', 'train-destination', 'train-leaveat']
-
-
-def get_label_embedding(labels, max_seq_length, tokenizer, device):
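-    """Tokenise each candidate label, pad the token ids to max_seq_length and return (token_ids, lengths) tensors on the given device."""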
-    features = []
-    for label in labels:
-        label_tokens = ["[CLS]"] + tokenizer.tokenize(label) + ["[SEP]"]
-        label_token_ids = tokenizer.convert_tokens_to_ids(label_tokens)
-        label_len = len(label_token_ids)
-
-        label_padding = [0] * (max_seq_length - len(label_token_ids))
-        label_token_ids += label_padding
-        assert len(label_token_ids) == max_seq_length
-
-        features.append((label_token_ids, label_len))
-
-    all_label_token_ids = torch.tensor([f[0] for f in features], dtype=torch.long).to(device)
-    all_label_len = torch.tensor([f[1] for f in features], dtype=torch.long).to(device)
-
-    return all_label_token_ids, all_label_len
-
-
-def _truncate_seq_pair(tokens_a, tokens_b, max_length):
-    """Truncates a sequence pair in place to the maximum length."""
-
-    # This is a simple heuristic which will always truncate the longer sequence
-    # one token at a time. This makes more sense than truncating an equal percent
-    # of tokens from each, since if one sequence is very short then each token
-    # that's truncated likely contains more information than a longer sequence.
-    while True:
-        total_length = len(tokens_a) + len(tokens_b)
-        if total_length <= max_length:
-            break
-        if len(tokens_a) > len(tokens_b):
-            tokens_a.pop()
-        else:
-            tokens_b.pop()
-
-
-class SUMBTTracker(DST):
-    """
-    Transferable multi-domain dialogue state tracker, adapted from https://github.com/SKTBrain/SUMBT
-    """
-
-
-    def __init__(self, data_dir=DATA_PATH, model_file='https://huggingface.co/ConvLab/ConvLab-2_models/resolve/main/sumbt.tar.gz', eval_slots=multiwoz_slot_list):
-
-        DST.__init__(self)
-
-        # if not os.path.exists(data_dir):
-        #     if model_file == '':
-        #         raise Exception(
-        #             'Please provide remote model file path in config')
-        #     resp = urllib.request.urlretrieve(model_file)[0]
-        #     temp_file = tarfile.open(resp)
-        #     temp_file.extractall('data')
-        #     assert os.path.exists(data_dir)
-
-        processor = Processor(args)
-        self.processor = processor
-        label_list = processor.get_labels()
-        num_labels = [len(labels) for labels in label_list]  # number of slot-values in each slot-type
-
-        # tokenizer
-        self.tokenizer = BertTokenizer.from_pretrained(args.bert_model_name, cache_dir=args.bert_model_cache_dir)
-        random.seed(args.seed)
-        np.random.seed(args.seed)
-        torch.manual_seed(args.seed)
-
-        self.device = torch.device("cuda" if USE_CUDA else "cpu")
-
-        self.sumbt_model = BeliefTracker(args, num_labels, self.device)
-        if USE_CUDA and N_GPU > 1:
-            self.sumbt_model = torch.nn.DataParallel(self.sumbt_model)
-        if args.fp16:
-            self.sumbt_model.half()
-        self.sumbt_model.to(self.device)
-
-        ## Get slot-value embeddings
-        self.label_token_ids, self.label_len = [], []
-        for labels in label_list:
-            token_ids, lens = get_label_embedding(labels, args.max_label_length, self.tokenizer, self.device)
-            self.label_token_ids.append(token_ids)
-            self.label_len.append(lens)
-        self.label_map = [{label: i for i, label in enumerate(labels)} for labels in label_list]
-        self.label_map_inv = [{i: label for i, label in enumerate(labels)} for labels in label_list]
-        self.label_list = label_list
-        self.target_slot = processor.target_slot
-        ## Get domain-slot-type embeddings
-        self.slot_token_ids, self.slot_len = \
-            get_label_embedding(processor.target_slot, args.max_label_length, self.tokenizer, self.device)
-
-        self.args = args
-        self.state = default_state()
-        self.param_restored = False
-        if USE_CUDA and N_GPU == 1:
-            self.sumbt_model.initialize_slot_value_lookup(self.label_token_ids, self.slot_token_ids)
-        elif USE_CUDA and N_GPU > 1:
-            self.sumbt_model.module.initialize_slot_value_lookup(self.label_token_ids, self.slot_token_ids)
-
-        self.det_dic = {}
-        for domain, dic in REF_USR_DA.items():
-            for key, value in dic.items():
-                assert '-' not in key
-                self.det_dic[key.lower()] = key + '-' + domain
-                self.det_dic[value.lower()] = key + '-' + domain
-
-        self.cached_res = {}
-        convert_to_glue_format(DATA_PATH, SUMBT_PATH)
-        if not os.path.isdir(os.path.join(SUMBT_PATH, args.output_dir)):
-            os.makedirs(os.path.join(SUMBT_PATH, args.output_dir))
-        self.train_examples = processor.get_train_examples(os.path.join(SUMBT_PATH, args.tmp_data_dir), accumulation=False)
-        self.dev_examples = processor.get_dev_examples(os.path.join(SUMBT_PATH, args.tmp_data_dir), accumulation=False)
-        self.test_examples = processor.get_test_examples(os.path.join(SUMBT_PATH, args.tmp_data_dir), accumulation=False)
-        self.eval_slots = eval_slots
-        self.download_model()
-
-    def download_model(self):
-        if not os.path.isdir(DOWNLOAD_DIRECTORY):
-            os.mkdir(DOWNLOAD_DIRECTORY)
-        # model_file = os.path.join(DOWNLOAD_DIRECTORY, 'pytorch_model.zip')
-
-        # if not os.path.isfile(model_file):
-        model_file = 'https://huggingface.co/ConvLab/ConvLab-2_models/resolve/main/sumbt.tar.gz'
-
-        import tarfile
-        if not os.path.isfile(os.path.join(DOWNLOAD_DIRECTORY, 'pytorch_model.bin')):
-            archive_file = cached_path(model_file)
-            # archive = zipfile.ZipFile(archive_file, 'r')
-            t = tarfile.open(archive_file)
-            t.extractall(path=DOWNLOAD_DIRECTORY)
-            # archive.extractall(DOWNLOAD_DIRECTORY)
-
-    def load_weights(self, model_path=None):
-        if model_path is None:
-            model_ckpt = os.path.join(os.path.join(SUMBT_PATH, args.output_dir), 'pytorch_model.bin')
-        else:
-            model_ckpt = model_path
-        model = self.sumbt_model
-        # in the case that slot and values are different between the training and evaluation
-        if not USE_CUDA:
-            ptr_model = torch.load(model_ckpt, map_location=torch.device('cpu'))
-        else:
-            ptr_model = torch.load(model_ckpt)
-        print('loading pretrained weights')
-
-        if not USE_CUDA or N_GPU == 1:
-            state = model.state_dict()
-            state.update(ptr_model)
-            model.load_state_dict(state)
-        else:
-            # print("Evaluate using only one device!")
-            model.module.load_state_dict(ptr_model)
-
-        if USE_CUDA:
-            model.to("cuda")
-
-    def init_session(self):
-        self.state = default_state()
-        if not self.param_restored:
-            if os.path.isfile(os.path.join(DOWNLOAD_DIRECTORY, 'pytorch_model.bin')):
-                print('loading weights from downloaded model')
-                self.load_weights(model_path=os.path.join(DOWNLOAD_DIRECTORY, 'pytorch_model.bin'))
-            elif os.path.isfile(os.path.join(SUMBT_PATH, args.output_dir, 'pytorch_model.bin')):
-                print('loading weights from trained model')
-                self.load_weights(model_path=os.path.join(SUMBT_PATH, args.output_dir, 'pytorch_model.bin'))
-            else:
-                raise ValueError('no available weights found.')
-            self.param_restored = True
-
-    def update(self, user_act=None):
-        """Update the dialogue state with the generated tokens from TRADE"""
-        if not user_act:
-            user_act = ""
-        if not isinstance(user_act, str):
-            raise Exception(
-                'Expected user_act is str but found {}'.format(type(user_act))
-            )
-        prev_state = self.state
-
-        actual_history = copy.deepcopy(prev_state['history'])
-        # if actual_history[-1][0] == 'user':
-        #     actual_history[-1][1] += user_act
-        # else:
-        #     actual_history.append(['user', user_act])
-        query = self.construct_query(actual_history)
-        pred_states = self.predict(query)
-
-        new_belief_state = copy.deepcopy(prev_state['belief_state'])
-        for state in pred_states:
-            domain, slot, value = state.split('-', 2)
-            value = '' if value == 'none' else value
-            value = 'dontcare' if value == 'do not care' else value
-            value = 'guesthouse' if value == 'guest house' else value
-            if slot not in ['name', 'book']:
-                if domain not in new_belief_state:
-                    if domain == 'bus':
-                        continue
-                    else:
-                        raise Exception(
-                            'Error: domain <{}> not in belief state'.format(domain))
-            slot = REF_SYS_DA[domain.capitalize()].get(slot, slot)
-            assert 'semi' in new_belief_state[domain]
-            assert 'book' in new_belief_state[domain]
-            if 'book' in slot:
-                assert slot.startswith('book ')
-                slot = slot.strip().split()[1]
-            if slot == 'arrive by':
-                slot = 'arriveBy'
-            elif slot == 'leave at':
-                slot = 'leaveAt'
-            elif slot == 'price range':
-                slot = 'pricerange'
-            domain_dic = new_belief_state[domain]
-            if slot in domain_dic['semi']:
-                new_belief_state[domain]['semi'][slot] = value
-                # normalize_value(self.value_dict, domain, slot, value)
-            elif slot in domain_dic['book']:
-                new_belief_state[domain]['book'][slot] = value
-            elif slot.lower() in domain_dic['book']:
-                new_belief_state[domain]['book'][slot.lower()] = value
-            else:
-                with open('trade_tracker_unknown_slot.log', 'a+') as f:
-                    f.write(
-                        'unknown slot name <{}> with value <{}> of domain <{}>\nitem: {}\n\n'.format(slot, value, domain, state)
-                    )
-        new_request_state = copy.deepcopy(prev_state['request_state'])
-        # update request_state
-        user_request_slot = self.detect_requestable_slots(user_act)
-        for domain in user_request_slot:
-            for key in user_request_slot[domain]:
-                if domain not in new_request_state:
-                    new_request_state[domain] = {}
-                if key not in new_request_state[domain]:
-                    new_request_state[domain][key] = user_request_slot[domain][key]
-
-        new_state = copy.deepcopy(dict(prev_state))
-        new_state['belief_state'] = new_belief_state
-        new_state['request_state'] = new_request_state
-        self.state = new_state
-        # print((pred_states, query))
-        return self.state
-
-    def predict(self, query):
-        cache_query_key = ''.join(str(list(chain.from_iterable(query[0]))))
-        if cache_query_key in self.cached_res.keys():
-            return self.cached_res[cache_query_key]
-
-        input_ids, input_len = query
-        input_ids = torch.tensor(input_ids).to(self.device).unsqueeze(0)
-        input_len = torch.tensor(input_len).to(self.device).unsqueeze(0)
-        labels = None
-        _, pred_slot = self.sumbt_model(input_ids, input_len, labels)
-        pred_slot_t = pred_slot[0][-1].tolist()
-        predict_belief = []
-        for idx, i in enumerate(pred_slot_t):
-            predict_belief.append('{}-{}'.format(self.target_slot[idx], self.label_map_inv[idx][i]))
-        self.cached_res[cache_query_key] = predict_belief
-
-        # print(predict_belief)
-        fixed_belief = []
-        for sv in predict_belief:
-            d, s, v = sv.split('-', 2)
-            if s == 'book day':
-                s = 'day'
-            elif s == 'book people':
-                s = 'people'
-            elif s == 'book stay':
-                s = 'stay'
-            elif s == 'price range':
-                s = 'pricerange'
-            elif s == 'book time':
-                s = 'time'
-            elif s == 'arrive by':
-                s = 'arriveby'
-            elif s == 'leave at':
-                s = 'leaveat'
-            _fixed_slot = d + '-' + s
-            if _fixed_slot in self.eval_slots:
-                fixed_belief.append(_fixed_slot+'-'+v)
-        return predict_belief
-
-    def train(self, load_model=False, model_path=None):
-
-        if load_model:
-            if model_path is not None:
-                self.load_weights(model_path)
-        ## Training utterances
-        all_input_ids, all_input_len, all_label_ids = convert_examples_to_features(
-            self.train_examples, self.label_list, args.max_seq_length, self.tokenizer, args.max_turn_length)
-
-        num_train_batches = all_input_ids.size(0)
-        num_train_steps = int(
-            num_train_batches / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)
-
-        logger.info("***** training *****")
-        logger.info("  Num examples = %d", len(self.train_examples))
-        logger.info("  Batch size = %d", args.train_batch_size)
-        logger.info("  Num steps = %d", num_train_steps)
-
-        all_input_ids, all_input_len, all_label_ids = all_input_ids.to(DEVICE), all_input_len.to(
-            DEVICE), all_label_ids.to(DEVICE)
-
-        train_data = TensorDataset(all_input_ids, all_input_len, all_label_ids)
-        train_sampler = RandomSampler(train_data)
-        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
-
-        all_input_ids_dev, all_input_len_dev, all_label_ids_dev = convert_examples_to_features(
-            self.dev_examples, self.label_list, args.max_seq_length, self.tokenizer, args.max_turn_length)
-
-        logger.info("***** validation *****")
-        logger.info("  Num examples = %d", len(self.dev_examples))
-        logger.info("  Batch size = %d", args.dev_batch_size)
-
-        all_input_ids_dev, all_input_len_dev, all_label_ids_dev = \
-            all_input_ids_dev.to(DEVICE), all_input_len_dev.to(DEVICE), all_label_ids_dev.to(DEVICE)
-
-        dev_data = TensorDataset(all_input_ids_dev, all_input_len_dev, all_label_ids_dev)
-        dev_sampler = SequentialSampler(dev_data)
-        dev_dataloader = DataLoader(dev_data, sampler=dev_sampler, batch_size=args.dev_batch_size)
-
-        logger.info("Loaded data!")
-
-        if args.fp16:
-            self.sumbt_model.half()
-        self.sumbt_model.to(DEVICE)
-
-        ## Get domain-slot-type embeddings
-        slot_token_ids, slot_len = \
-            get_label_embedding(self.processor.target_slot, args.max_label_length, self.tokenizer, DEVICE)
-
-        # for slot_idx, slot_str in zip(slot_token_ids, self.processor.target_slot):
-        #     self.idx2slot[slot_idx] = slot_str
-
-        ## Get slot-value embeddings
-        label_token_ids, label_len = [], []
-        for slot_idx, labels in zip(slot_token_ids, self.label_list):
-            # self.idx2value[slot_idx] = {}
-            token_ids, lens = get_label_embedding(labels, args.max_label_length, self.tokenizer, DEVICE)
-            label_token_ids.append(token_ids)
-            label_len.append(lens)
-            # for label, token_id in zip(labels, token_ids):
-            #     self.idx2value[slot_idx][token_id] = label
-
-        logger.info('embeddings prepared')
-
-        if USE_CUDA and N_GPU > 1:
-            self.sumbt_model.module.initialize_slot_value_lookup(label_token_ids, slot_token_ids)
-        else:
-            self.sumbt_model.initialize_slot_value_lookup(label_token_ids, slot_token_ids)
-
-        def get_optimizer_grouped_parameters(model):
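-            # Exclude biases and LayerNorm parameters from weight decay; both groups use the configured learning rate.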
-            param_optimizer = [(n, p) for n, p in model.named_parameters() if p.requires_grad]
-            no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
-            optimizer_grouped_parameters = [
-                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01,
-                 'lr': args.learning_rate},
-                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0,
-                 'lr': args.learning_rate},
-            ]
-            return optimizer_grouped_parameters
-
-        if not USE_CUDA or N_GPU == 1:
-            optimizer_grouped_parameters = get_optimizer_grouped_parameters(self.sumbt_model)
-        else:
-            optimizer_grouped_parameters = get_optimizer_grouped_parameters(self.sumbt_model.module)
-
-        t_total = num_train_steps
-
-        scheduler = None
-        if args.fp16:
-            try:
-                from apex.optimizers import FP16_Optimizer
-                from apex.optimizers import FusedAdam
-            except ImportError:
-                raise ImportError(
-                    "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
-
-            optimizer = FusedAdam(optimizer_grouped_parameters,
-                                  lr=args.learning_rate,
-                                  bias_correction=False,
-                                  max_grad_norm=1.0)
-            if args.fp16_loss_scale == 0:
-                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
-            else:
-                optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.fp16_loss_scale)
-
-        else:
-            optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, correct_bias=False)
-            scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_proportion*t_total, num_training_steps=t_total)
-        logger.info(optimizer)
-
-        # Training code
-        ###############################################################################
-
-        logger.info("Training...")
-
-        global_step = 0
-        last_update = None
-        best_loss = None
-        model = self.sumbt_model
-        if args.do_not_use_tensorboard:
-            summary_writer = None
-        else:
-            summary_writer = SummaryWriter("./tensorboard_summary/logs_1214/")
-
-        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
-            # Train
-            model.train()
-            tr_loss = 0
-            nb_tr_examples = 0
-            nb_tr_steps = 0
-
-            for step, batch in enumerate(tqdm(train_dataloader)):
-                batch = tuple(t.to(DEVICE) for t in batch)
-                input_ids, input_len, label_ids = batch
-
-                # Forward
-                if N_GPU == 1:
-                    loss, loss_slot, acc, acc_slot, _ = model(input_ids, input_len, label_ids, N_GPU)
-                else:
-                    loss, _, acc, acc_slot, _ = model(input_ids, input_len, label_ids, N_GPU)
-
-                    # average to multi-gpus
-                    loss = loss.mean()
-                    acc = acc.mean()
-                    acc_slot = acc_slot.mean(0)
-
-                if args.gradient_accumulation_steps > 1:
-                    loss = loss / args.gradient_accumulation_steps
-
-                # Backward
-                if args.fp16:
-                    optimizer.backward(loss)
-                else:
-                    loss.backward()
-
-                # tensorboard logging
-                if summary_writer is not None:
-                    summary_writer.add_scalar("Epoch", epoch, global_step)
-                    summary_writer.add_scalar("Train/Loss", loss, global_step)
-                    summary_writer.add_scalar("Train/JointAcc", acc, global_step)
-                    if N_GPU == 1:
-                        for i, slot in enumerate(self.processor.target_slot):
-                            summary_writer.add_scalar("Train/Loss_%s" % slot.replace(' ', '_'), loss_slot[i],
-                                                      global_step)
-                            summary_writer.add_scalar("Train/Acc_%s" % slot.replace(' ', '_'), acc_slot[i], global_step)
-
-                tr_loss += loss.item()
-                nb_tr_examples += input_ids.size(0)
-                nb_tr_steps += 1
-                if (step + 1) % args.gradient_accumulation_steps == 0:
-                    # modify learning rate with special warm up BERT uses
-                    lr_this_step = args.learning_rate * warmup_linear(global_step / t_total, args.warmup_proportion)
-                    if summary_writer is not None:
-                        summary_writer.add_scalar("Train/LearningRate", lr_this_step, global_step)
-                    for param_group in optimizer.param_groups:
-                        param_group['lr'] = lr_this_step
-                    if scheduler is not None:
-                        torch.nn.utils.clip_grad_norm_(optimizer_grouped_parameters, 1.0)
-                    optimizer.step()
-                    if scheduler is not None:
-                        scheduler.step()
-                    optimizer.zero_grad()
-                    global_step += 1
-
-
-            # Perform evaluation on validation dataset
-            model.eval()
-            dev_loss = 0
-            dev_acc = 0
-            dev_loss_slot, dev_acc_slot = None, None
-            nb_dev_examples, nb_dev_steps = 0, 0
-
-            for step, batch in enumerate(tqdm(dev_dataloader, desc="Validation")):
-                batch = tuple(t.to(DEVICE) for t in batch)
-                input_ids, input_len, label_ids = batch
-                if input_ids.dim() == 2:
-                    input_ids = input_ids.unsqueeze(0)
-                    input_len = input_len.unsqueeze(0)
-                    label_ids = label_ids.unsqueeze(0)
-
-                with torch.no_grad():
-                    if N_GPU == 1:
-                        loss, loss_slot, acc, acc_slot, _ = model(input_ids, input_len, label_ids, N_GPU)
-                    else:
-                        loss, _, acc, acc_slot, _ = model(input_ids, input_len, label_ids, N_GPU)
-
-                        # average to multi-gpus
-                        loss = loss.mean()
-                        acc = acc.mean()
-                        acc_slot = acc_slot.mean(0)
-
-                num_valid_turn = torch.sum(label_ids[:, :, 0].view(-1) > -1, 0).item()
-                dev_loss += loss.item() * num_valid_turn
-                dev_acc += acc.item() * num_valid_turn
-
-                if N_GPU == 1:
-                    if dev_loss_slot is None:
-                        dev_loss_slot = [l * num_valid_turn for l in loss_slot]
-                        dev_acc_slot = acc_slot * num_valid_turn
-                    else:
-                        for i, l in enumerate(loss_slot):
-                            dev_loss_slot[i] = dev_loss_slot[i] + l * num_valid_turn
-                        dev_acc_slot += acc_slot * num_valid_turn
-
-                nb_dev_examples += num_valid_turn
-
-
-            dev_loss = dev_loss / nb_dev_examples
-            dev_acc = dev_acc / nb_dev_examples
-
-            if N_GPU == 1:
-                dev_acc_slot = dev_acc_slot / nb_dev_examples
-
-            # tensorboard logging
-            if summary_writer is not None:
-                summary_writer.add_scalar("Validate/Loss", dev_loss, global_step)
-                summary_writer.add_scalar("Validate/Acc", dev_acc, global_step)
-                if N_GPU == 1:
-                    for i, slot in enumerate(self.processor.target_slot):
-                        summary_writer.add_scalar("Validate/Loss_%s" % slot.replace(' ', '_'),
-                                                  dev_loss_slot[i] / nb_dev_examples, global_step)
-                        summary_writer.add_scalar("Validate/Acc_%s" % slot.replace(' ', '_'), dev_acc_slot[i],
-                                                  global_step)
-
-            dev_loss = round(dev_loss, 6)
-
-            output_model_file = os.path.join(os.path.join(SUMBT_PATH, args.output_dir), "pytorch_model.bin")
-
-            if last_update is None or dev_loss < best_loss:
-
-                if not USE_CUDA or N_GPU == 1:
-                    torch.save(model.state_dict(), output_model_file)
-                else:
-                    torch.save(model.module.state_dict(), output_model_file)
-
-                last_update = epoch
-                best_loss = dev_loss
-                best_acc = dev_acc
-
-                logger.info(
-                    "*** Model Updated: Epoch=%d, Validation Loss=%.6f, Validation Acc=%.6f, global_step=%d ***" % (
-                        last_update, best_loss, best_acc, global_step))
-            else:
-                logger.info(
-                    "*** Model NOT Updated: Epoch=%d, Validation Loss=%.6f, Validation Acc=%.6f, global_step=%d  ***" % (
-                        epoch, dev_loss, dev_acc, global_step))
-
-            if last_update + args.patience <= epoch:
-                break
-
-
-
-    def test(self, mode='dev', model_path=None):
-        '''Evaluate the SUMBT model on the dev or test set.'''
-        # Evaluation
-        self.load_weights(model_path)
-
-        if mode == 'test':
-            eval_examples = self.test_examples
-        elif mode == 'dev':
-            eval_examples = self.dev_examples
-
-        all_input_ids, all_input_len, all_label_ids = convert_examples_to_features(
-            eval_examples, self.label_list, args.max_seq_length, self.tokenizer, args.max_turn_length)
-        all_input_ids, all_input_len, all_label_ids = all_input_ids.to(DEVICE), all_input_len.to(
-            DEVICE), all_label_ids.to(DEVICE)
-        logger.info("***** Running evaluation *****")
-        logger.info("  Num examples = %d", len(eval_examples))
-        logger.info("  Batch size = %d", args.dev_batch_size)
-
-        eval_data = TensorDataset(all_input_ids, all_input_len, all_label_ids)
-
-        # Run prediction for full data
-        eval_sampler = SequentialSampler(eval_data)
-        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.dev_batch_size)
-
-        model = self.sumbt_model
-        eval_loss, eval_accuracy = 0, 0
-        eval_loss_slot, eval_acc_slot = None, None
-        nb_eval_steps, nb_eval_examples = 0, 0
-
-        accuracies = {'joint7': 0, 'slot7': 0, 'joint5': 0, 'slot5': 0, 'joint_rest': 0, 'slot_rest': 0,
-                      'num_turn': 0, 'num_slot7': 0, 'num_slot5': 0, 'num_slot_rest': 0}
-
-        for input_ids, input_len, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
-            # if input_ids.dim() == 2:
-            #     input_ids = input_ids.unsqueeze(0)
-            #     input_len = input_len.unsqueeze(0)
-            #     label_ids = label_ids.unsuqeeze(0)
-
-            with torch.no_grad():
-                if not USE_CUDA or N_GPU == 1:
-                    loss, loss_slot, acc, acc_slot, pred_slot = model(input_ids, input_len, label_ids, 1)
-                else:
-                    loss, _, acc, acc_slot, pred_slot = model(input_ids, input_len, label_ids, N_GPU)
-                    nbatch = label_ids.size(0)
-                    nslot = pred_slot.size(3)
-                    pred_slot = pred_slot.view(nbatch, -1, nslot)
-
-            accuracies = eval_all_accs(pred_slot, label_ids, accuracies)
-
-            nb_eval_ex = (label_ids[:, :, 0].view(-1) != -1).sum().item()
-            nb_eval_examples += nb_eval_ex
-            nb_eval_steps += 1
-
-            if not USE_CUDA or N_GPU == 1:
-                eval_loss += loss.item() * nb_eval_ex
-                eval_accuracy += acc.item() * nb_eval_ex
-                if eval_loss_slot is None:
-                    eval_loss_slot = [l * nb_eval_ex for l in loss_slot]
-                    eval_acc_slot = acc_slot * nb_eval_ex
-                else:
-                    for i, l in enumerate(loss_slot):
-                        eval_loss_slot[i] = eval_loss_slot[i] + l * nb_eval_ex
-                    eval_acc_slot += acc_slot * nb_eval_ex
-            else:
-                eval_loss += sum(loss) * nb_eval_ex
-                eval_accuracy += sum(acc) * nb_eval_ex
-
-        eval_loss = eval_loss / nb_eval_examples
-        eval_accuracy = eval_accuracy / nb_eval_examples
-        if not USE_CUDA or N_GPU == 1:
-            eval_acc_slot = eval_acc_slot / nb_eval_examples
-
-        loss = None
-
-        if not USE_CUDA or N_GPU == 1:
-            result = {'eval_loss': eval_loss,
-                      'eval_accuracy': eval_accuracy,
-                      'loss': loss,
-                      'eval_loss_slot': '\t'.join([str(val / nb_eval_examples) for val in eval_loss_slot]),
-                      'eval_acc_slot': '\t'.join([str((val).item()) for val in eval_acc_slot])
-                      }
-        else:
-            result = {'eval_loss': eval_loss,
-                      'eval_accuracy': eval_accuracy,
-                      'loss': loss
-                      }
-
-        out_file_name = 'eval_results'
-        # if TARGET_SLOT == 'all':
-        #     out_file_name += '_all'
-        output_eval_file = os.path.join(os.path.join(SUMBT_PATH, args.output_dir), "%s.txt" % out_file_name)
-
-        if not USE_CUDA or N_GPU == 1:
-            with open(output_eval_file, "w") as writer:
-                logger.info("***** Eval results *****")
-                for key in sorted(result.keys()):
-                    logger.info("  %s = %s", key, str(result[key]))
-                    writer.write("%s = %s\n" % (key, str(result[key])))
-
-        out_file_name = 'eval_all_accuracies'
-        with open(os.path.join(os.path.join(SUMBT_PATH, args.output_dir), "%s.txt" % out_file_name), 'w') as f:
-            f.write(
-                'joint acc (7 domain) : slot acc (7 domain) : joint acc (5 domain): slot acc (5 domain): joint '
-                'restaurant : slot acc restaurant \n')
-            f.write('%.5f : %.5f : %.5f : %.5f : %.5f : %.5f \n' % (
-                (accuracies['joint7'] / accuracies['num_turn']).item(),
-                (accuracies['slot7'] / accuracies['num_slot7']).item(),
-                (accuracies['joint5'] / accuracies['num_turn']).item(),
-                (accuracies['slot5'] / accuracies['num_slot5']).item(),
-                (accuracies['joint_rest'] / accuracies['num_turn']).item(),
-                (accuracies['slot_rest'] / accuracies['num_slot_rest']).item()
-            ))
-
-    def construct_query(self, context):
-        '''Construct query from context'''
-        ids = []
-        lens = []
-        context_len = len(context)
-        if context[0][0] != 'usr':
-            context = [['usr', '']] + context
-        for i in range(0, context_len, 2):
-            # utt_user = ''
-            # utt_sys = ''
-            # for evaluation
-            utt_sys = context[i][1]
-            if context_len < 2:
-                utt_user = " "
-            else:
-                # print(context_len)
-                utt_user = context[i + 1][1]
-
-            tokens_user = [x if x != '#' else '[SEP]' for x in self.tokenizer.tokenize(utt_user)]
-            tokens_sys = [x if x != '#' else '[SEP]' for x in self.tokenizer.tokenize(utt_sys)]
-
-            _truncate_seq_pair(tokens_user, tokens_sys, self.args.max_seq_length - 3)
-            tokens = ["[CLS]"] + tokens_user + ["[SEP]"] + tokens_sys + ["[SEP]"]
-            input_len = [len(tokens_user) + 2, len(tokens_sys) + 1]
-
-            input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
-            padding = [0] * (self.args.max_seq_length - len(input_ids))
-            input_ids += padding
-            assert len(input_ids) == self.args.max_seq_length
-            ids.append(input_ids)
-            lens.append(input_len)
-
-        return (ids, lens)
-
-    def detect_requestable_slots(self, observation):
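-        """Detect requestable slots in the user utterance by keyword matching against the REF_USR_DA mappings."""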
-        result = {}
-        observation = observation.lower()
-        _observation = ' {} '.format(observation)
-        for value in self.det_dic.keys():
-            _value = ' {} '.format(value.strip())
-            if _value in _observation:
-                key, domain = self.det_dic[value].split('-')
-                if domain not in result:
-                    result[domain] = {}
-                result[domain][key] = 0
-        return result
-
-
-def test_update():
-    sumbt_tracker = SUMBTTracker()
-    sumbt_tracker.init_session()
-
-    sumbt_tracker.state['history'] = [
-        ['sys', ''],
-        ['user', 'Could you book a 4 stars hotel for one night, 1 person?'],
-        ['sys', 'If you\'d like something cheap, I recommend the Allenbell']
-    ]
-    sumbt_tracker.state['history'].append(['user', 'Friday and Can you book it for me and get a reference number ?'])
-    from timeit import default_timer as timer
-    start = timer()
-    pprint(sumbt_tracker.update('Friday and Can you book it for me and get a reference number ?'))
-    end = timer()
-    print(end - start)
-    #
-    start = timer()
-    sumbt_tracker.state['history'].append(['sys', 'what is the area'])
-    sumbt_tracker.state['history'].append(['user', "it doesn't matter. I don't care"])
-    pprint(sumbt_tracker.update('in the east area of cambridge'))
-    end = timer()
-    print(end - start)
-
-    start = timer()
-    # sumbt_tracker.state['history'].append(['what is the area'])
-    pprint(sumbt_tracker.update('in the east area of cambridge'))
-    end = timer()
-    print(end - start)
-
-
-def test_update_bak():
-
-    sumbt_tracker = SUMBTTracker()
-    sumbt_tracker.init_session()
-
-    sumbt_tracker.state['history'] = [
-        ['null', 'Could you book a 4 stars hotel for one night, 1 person?'],
-        ['If you\'d like something cheap, I recommend the Allenbell']
-    ]
-    from timeit import default_timer as timer
-    start = timer()
-    pprint(sumbt_tracker.update('Friday and Can you book it for me and get a reference number ?'))
-    sumbt_tracker.state['history'][-1].append('Friday and Can you book it for me and get a reference number ?')
-    end = timer()
-    print(end - start)
-    #
-    start = timer()
-    sumbt_tracker.state['history'].append(['what is the area'])
-    pprint(sumbt_tracker.update('i do not care'))
-    # pprint(sumbt_tracker.update('in the east area of cambridge'))
-    end = timer()
-    print(end - start)
-
-    start = timer()
-    # sumbt_tracker.state['history'].append(['what is the area'])
-    pprint(sumbt_tracker.update('in the east area of cambridge'))
-    end = timer()
-    print(end - start)
-
-
-import argparse
-parser = argparse.ArgumentParser()
-parser.add_argument('--train', action='store_true')
-parser.add_argument('--dev', action='store_true')
-parser.add_argument('--test', action='store_true')
-
-
-if __name__ == '__main__':
-    test_update()
diff --git a/convlab/dst/sumbt/multiwoz/sumbt_config.py b/convlab/dst/sumbt/multiwoz/sumbt_config.py
deleted file mode 100755
index df96eded9c7de28ebe16dd7df3b6f2b4d9fffc37..0000000000000000000000000000000000000000
--- a/convlab/dst/sumbt/multiwoz/sumbt_config.py
+++ /dev/null
@@ -1,57 +0,0 @@
-import os
-
-import convlab
-class DotMap():
-    def __init__(self):
-        self.max_label_length = 32
-        self.max_seq_length = 64
-        self.max_turn_length = 22
-        self.hidden_dim = 300
-        self.num_rnn_layers = 1
-        self.zero_init_rnn = False
-        self.attn_head = 4
-        self.do_eval = True
-        self.do_train = False
-        self.distance_metric = 'euclidean'
-        self.train_batch_size = 4
-        self.dev_batch_size = 1
-        self.eval_batch_size = 16
-        self.learning_rate = 5e-5
-        self.num_train_epochs = 300
-        self.patience = 15
-        self.warmup_proportion = 0.1
-        self.local_rank = -1
-        self.seed = 42
-        self.gradient_accumulation_steps = 1
-        self.fp16 = False
-        self.fp16_loss_scale = 0.0
-        self.loss_scale = 0
-        self.do_not_use_tensorboard = False
-        self.fix_utterance_encoder = False
-
-        self.bert_model = os.path.join(convlab.get_root_path(), "pre-trained-models/bert-base-uncased")
-        self.bert_model_cache_dir = os.path.join(convlab.get_root_path(), "pre-trained-models/")
-        self.bert_model_name = "bert-base-uncased"
-        self.do_lower_case = True
-        self.task_name = 'bert-gru-sumbt'
-        self.nbt = 'rnn'
-        self.target_slot = 'all'
-
-        self.data_dir = 'data/multiwoz/'
-        self.tf_dir = 'tensorboard'
-        self.tmp_data_dir = 'processed_data/'
-        # self.output_dir = os.path.join(path, 'ckpt/')
-        self.output_dir = 'model_output/'
-
-args = DotMap()
diff --git a/convlab/dst/sumbt/multiwoz/sumbt_utils.py b/convlab/dst/sumbt/multiwoz/sumbt_utils.py
deleted file mode 100755
index 489e5eca3535de4b951246a103b323f339082afa..0000000000000000000000000000000000000000
--- a/convlab/dst/sumbt/multiwoz/sumbt_utils.py
+++ /dev/null
@@ -1,450 +0,0 @@
-import csv
-import os
-import json
-import collections
-import logging
-import re
-import torch
-
-logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
-                    datefmt='%m/%d/%Y %H:%M:%S',
-                    level=logging.INFO)
-logger = logging.getLogger(__name__)
-
-
-class DataProcessor(object):
-    """Base class for data converters for sequence classification data sets."""
-
-    def get_train_examples(self, data_dir):
-        """Gets a collection of `InputExample`s for the train set."""
-        raise NotImplementedError()
-
-    def get_dev_examples(self, data_dir):
-        """Gets a collection of `InputExample`s for the dev set."""
-        raise NotImplementedError()
-
-    def get_labels(self):
-        """Gets the list of labels for this data set."""
-        raise NotImplementedError()
-
-    @classmethod
-    def _read_tsv(cls, input_file, quotechar=None):
-        """Reads a tab separated value file."""
-        with open(input_file, "r", encoding='utf-8') as f:
-            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
-            lines = []
-            for line in reader:
-                if len(line) > 0 and line[0][0] == '#':  # ignore comments (starting with '#')
-                    continue
-                lines.append(line)
-            return lines
-
-
-class Processor(DataProcessor):
-    """Processor for the belief tracking dataset (GLUE version)."""
-
-    def __init__(self, config):
-        super(Processor, self).__init__()
-
-        print(config)
-        # MultiWOZ dataset
-        fp_ontology = open(os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))), config.data_dir, "ontology_sumbt.json"), "r")
-
-        ontology = json.load(fp_ontology)
-        for slot in ontology.keys():
-            ontology[slot].append("none")
-        fp_ontology.close()
-
-        if not config.target_slot == 'all':
-            slot_idx = {'attraction': '0:1:2', 'bus': '3:4:5:6', 'hospital': '7',
-                        'hotel': '8:9:10:11:12:13:14:15:16:17', \
-                        'restaurant': '18:19:20:21:22:23:24', 'taxi': '25:26:27:28', 'train': '29:30:31:32:33:34'}
-            target_slot = []
-            for key, value in slot_idx.items():
-                if key != config.target_slot:
-                    target_slot.append(value)
-            config.target_slot = ':'.join(target_slot)
-
-        # sorting the ontology according to the alphabetic order of the slots
-        ontology = collections.OrderedDict(sorted(ontology.items()))
-
-        # select slots to train
-        nslots = len(ontology.keys())
-        target_slot = list(ontology.keys())
-        if config.target_slot == 'all':
-            self.target_slot_idx = [*range(0, nslots)]
-        else:
-            self.target_slot_idx = sorted([int(x) for x in config.target_slot.split(':')])
-
-        for idx in range(0, nslots):
-            if not idx in self.target_slot_idx:
-                del ontology[target_slot[idx]]
-
-        self.ontology = ontology
-        self.target_slot = list(self.ontology.keys())
-        for i, slot in enumerate(self.target_slot):
-            if slot == "pricerange":
-                self.target_slot[i] = "price range"
-
-        logger.info('Processor: target_slot')
-        logger.info(self.target_slot)
-
-    def get_train_examples(self, data_dir, accumulation=False):
-        """See base class."""
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train", accumulation)
-
-    def get_dev_examples(self, data_dir, accumulation=False):
-        """See base class."""
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev", accumulation)
-
-    def get_test_examples(self, data_dir, accumulation=False):
-        """See base class."""
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "test.tsv")), "test", accumulation)
-
-    def get_labels(self):
-        """See base class."""
-        return [self.ontology[slot] for slot in self.target_slot]
-
-    def _create_examples(self, lines, set_type, accumulation=False):
-        """Creates examples for the training and dev sets."""
-        prev_dialogue_index = None
-        examples = []
-        for (i, line) in enumerate(lines):
-            guid = "%s-%s-%s" % (set_type, line[0], line[1])  # line[0]: dialogue index, line[1]: turn index
-            if accumulation:
-                if prev_dialogue_index is None or prev_dialogue_index != line[0]:
-                    text_a = line[2]
-                    text_b = line[3]
-                    prev_dialogue_index = line[0]
-                else:
-                    # The symbol '#' will be replaced with '[SEP]' after tokenization.
-                    text_a = line[2] + " # " + text_a
-                    text_b = line[3] + " # " + text_b
-            else:
-                text_a = line[2]  # line[2]: user utterance
-                text_b = line[3]  # line[3]: system response
-
-            label = [line[4 + idx] for idx in self.target_slot_idx]
-
-            examples.append(
-                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
-        return examples
-
-
-def normalize_text(text):
-    global replacements
-    # lower case every word
-    text = text.lower()
-    # replace white spaces in front and end
-    text = re.sub(r'^\s*|\s*$', '', text)
-
-    # hotel domain pfb30
-    text = re.sub(r"b&b", "bed and breakfast", text)
-    text = re.sub(r"b and b", "bed and breakfast", text)
-
-    # normalise separators (';' and '/')
-    text = text.replace(';', ',')
-    text = re.sub('$\/', '', text)
-    text = text.replace('/', ' and ')
-
-    # replace other special characters
-    text = text.replace('-', ' ')
-    text = re.sub('[\"\<>@\(\)]', '', text)  # remove
-
-    # insert white space before and after tokens:
-    for token in ['?', '.', ',', '!']:
-        text = insertSpace(token, text)
-
-    # insert white space for 's
-    text = insertSpace('\'s', text)
-
-    # replace it's, does't, you'd ... etc
-    text = re.sub('^\'', '', text)
-    text = re.sub('\'$', '', text)
-    text = re.sub('\'\s', ' ', text)
-    text = re.sub('\s\'', ' ', text)
-    for fromx, tox in replacements:
-        text = ' ' + text + ' '
-        text = text.replace(fromx, tox)[1:-1]
-
-    # remove multiple spaces
-    text = re.sub(' +', ' ', text)
-
-    # concatenate numbers
-    tmp = text
-    tokens = text.split()
-    i = 1
-    while i < len(tokens):
-        if re.match(u'^\d+$', tokens[i]) and \
-                re.match(u'\d+$', tokens[i - 1]):
-            tokens[i - 1] += tokens[i]
-            del tokens[i]
-        else:
-            i += 1
-    text = ' '.join(tokens)
-
-    return text
-
-
-def insertSpace(token, text):
-    sidx = 0
-    while True:
-        sidx = text.find(token, sidx)
-        if sidx == -1:
-            break
-        if sidx + 1 < len(text) and re.match('[0-9]', text[sidx - 1]) and \
-                re.match('[0-9]', text[sidx + 1]):
-            sidx += 1
-            continue
-        if text[sidx - 1] != ' ':
-            text = text[:sidx] + ' ' + text[sidx:]
-            sidx += 1
-        if sidx + len(token) < len(text) and text[sidx + len(token)] != ' ':
-            text = text[:sidx + 1] + ' ' + text[sidx + 1:]
-        sidx += 1
-    return text
-
-
-def get_label_embedding(labels, max_seq_length, tokenizer, device):
-    features = []
-    for label in labels:
-        label_tokens = ["[CLS]"] + tokenizer.tokenize(label) + ["[SEP]"]
-        label_token_ids = tokenizer.convert_tokens_to_ids(label_tokens)
-        label_len = len(label_token_ids)
-
-        label_padding = [0] * (max_seq_length - len(label_token_ids))
-        label_token_ids += label_padding
-        assert len(label_token_ids) == max_seq_length
-
-        features.append((label_token_ids, label_len))
-
-    all_label_token_ids = torch.tensor([f[0] for f in features], dtype=torch.long).to(device)
-    all_label_len = torch.tensor([f[1] for f in features], dtype=torch.long).to(device)
-
-    return all_label_token_ids, all_label_len
-
-
-def warmup_linear(x, warmup=0.002):
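-    """Learning-rate multiplier: linear ramp-up while x < warmup, then linear decay as 1 - x."""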
-    if x < warmup:
-        return x / warmup
-    return 1.0 - x
-
-
-def _truncate_seq_pair(tokens_a, tokens_b, max_length):
-    """Truncates a sequence pair in place to the maximum length."""
-
-    # This is a simple heuristic which will always truncate the longer sequence
-    # one token at a time. This makes more sense than truncating an equal percent
-    # of tokens from each, since if one sequence is very short then each token
-    # that's truncated likely contains more information than a longer sequence.
-    while True:
-        total_length = len(tokens_a) + len(tokens_b)
-        if total_length <= max_length:
-            break
-        if len(tokens_a) > len(tokens_b):
-            tokens_a.pop()
-        else:
-            tokens_b.pop()
-
-
-class InputExample(object):
-    """A single training/test example for simple sequence classification."""
-
-    def __init__(self, guid, text_a, text_b=None, label=None):
-        """Constructs a InputExample.
-
-        Args:
-            guid: Unique id for the example.
-            text_a: string. The untokenized text of the first sequence. For single
-            sequence tasks, only this sequence must be specified.
-            text_b: (Optional) string. The untokenized text of the second sequence.
-            Only must be specified for sequence pair tasks.
-            label: (Optional) string. The label of the example. This should be
-            specified for train and dev examples, but not for test examples.
-        """
-        self.guid = guid
-        self.text_a = text_a
-        self.text_b = text_b
-        self.label = label
-
-
-class InputFeatures(object):
-    """A single set of features of data."""
-
-    def __init__(self, input_ids, input_len, label_id):
-        self.input_ids = input_ids
-        self.input_len = input_len
-        self.label_id = label_id
-
-
-def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer, max_turn_length):
-    """Loads a data file into a list of `InputBatch`s."""
-
-    label_map = [{label: i for i, label in enumerate(labels)} for labels in label_list]
-    slot_dim = len(label_list)
-
-    features = []
-    prev_dialogue_idx = None
-    all_padding = [0] * max_seq_length
-    all_padding_len = [0, 0]
-
-    max_turn = 0
-    for (ex_index, example) in enumerate(examples):
-        if max_turn < int(example.guid.split('-')[2]):
-            max_turn = int(example.guid.split('-')[2])
-    max_turn_length = min(max_turn + 1, max_turn_length)
-    logger.info("max_turn_length = %d" % max_turn)
-
-    for (ex_index, example) in enumerate(examples):
-        tokens_a = [x if x != '#' else '[SEP]' for x in tokenizer.tokenize(example.text_a)]
-        tokens_b = None
-        if example.text_b:
-            tokens_b = [x if x != '#' else '[SEP]' for x in tokenizer.tokenize(example.text_b)]
-            # Modifies `tokens_a` and `tokens_b` in place so that the total
-            # length is less than the specified length.
-            # Account for [CLS], [SEP], [SEP] with "- 3"
-            _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
-        else:
-            # Account for [CLS] and [SEP] with "- 2"
-            if len(tokens_a) > max_seq_length - 2:
-                tokens_a = tokens_a[:(max_seq_length - 2)]
-
-        # The convention in BERT is:
-        # (a) For sequence pairs:
-        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
-        #  type_ids: 0   0  0    0    0     0       0 0    1  1  1  1   1 1
-        # (b) For single sequences:
-        #  tokens:   [CLS] the dog is hairy . [SEP]
-        #  type_ids: 0   0   0   0  0     0 0
-        #
-        # Where "type_ids" are used to indicate whether this is the first
-        # sequence or the second sequence. The embedding vectors for `type=0` and
-        # `type=1` were learned during pre-training and are added to the wordpiece
-        # embedding vector (and position vector). This is not *strictly* necessary
-        # since the [SEP] token unambiguously separates the sequences, but it makes
-        # it easier for the model to learn the concept of sequences.
-        #
-        # For classification tasks, the first vector (corresponding to [CLS]) is
-        # used as the "sentence vector". Note that this only makes sense because
-        # the entire model is fine-tuned.
-
-        tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
-        input_len = [len(tokens), 0]
-
-        if tokens_b:
-            tokens += tokens_b + ["[SEP]"]
-            input_len[1] = len(tokens_b) + 1
-
-        input_ids = tokenizer.convert_tokens_to_ids(tokens)
-
-        # Zero-pad up to the sequence length.
-        padding = [0] * (max_seq_length - len(input_ids))
-        input_ids += padding
-        assert len(input_ids) == max_seq_length
-
-        FLAG_TEST = False
-        if example.label is not None:
-            label_id = []
-            label_info = 'label: '
-            for i, label in enumerate(example.label):
-                if label == 'dontcare':
-                    label = 'do not care'
-                label_id.append(label_map[i][label])
-                label_info += '%s (id = %d) ' % (label, label_map[i][label])
-
-            if ex_index < 5:
-                logger.info("*** Example ***")
-                logger.info("guid: %s" % example.guid)
-                logger.info("tokens: %s" % " ".join(
-                    [str(x) for x in tokens]))
-                logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
-                logger.info("input_len: %s" % " ".join([str(x) for x in input_len]))
-                logger.info("label: " + label_info)
-        else:
-            FLAG_TEST = True
-            label_id = None
-
-        curr_dialogue_idx = example.guid.split('-')[1]
-        curr_turn_idx = int(example.guid.split('-')[2])
-
-        if prev_dialogue_idx is not None and prev_dialogue_idx != curr_dialogue_idx:
-            if prev_turn_idx < max_turn_length:
-                features += [InputFeatures(input_ids=all_padding,
-                                           input_len=all_padding_len,
-                                           label_id=[-1] * slot_dim)] \
-                            * (max_turn_length - prev_turn_idx - 1)
-            assert len(features) % max_turn_length == 0
-
-        if prev_dialogue_idx is None or prev_turn_idx < max_turn_length:
-            features.append(
-                InputFeatures(input_ids=input_ids,
-                              input_len=input_len,
-                              label_id=label_id))
-
-        prev_dialogue_idx = curr_dialogue_idx
-        prev_turn_idx = curr_turn_idx
-
-    if prev_turn_idx < max_turn_length:
-        features += [InputFeatures(input_ids=all_padding,
-                                   input_len=all_padding_len,
-                                   label_id=[-1] * slot_dim)] \
-                    * (max_turn_length - prev_turn_idx - 1)
-    assert len(features) % max_turn_length == 0
-
-    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
-    all_input_len = torch.tensor([f.input_len for f in features], dtype=torch.long)
-    if not FLAG_TEST:
-        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
-
-    # reshape tensors to [#batch, #max_turn_length, #max_seq_length]
-    all_input_ids = all_input_ids.view(-1, max_turn_length, max_seq_length)
-    all_input_len = all_input_len.view(-1, max_turn_length, 2)
-    if not FLAG_TEST:
-        all_label_ids = all_label_ids.view(-1, max_turn_length, slot_dim)
-    else:
-        all_label_ids = None
-
-    return all_input_ids, all_input_len, all_label_ids
-
-
-def eval_all_accs(pred_slot, labels, accuracies):
-
-    def _eval_acc(_pred_slot, _labels):
-        slot_dim = _labels.size(-1)
-        accuracy = (_pred_slot == _labels).view(-1, slot_dim)
-        num_turn = torch.sum(_labels[:, :, 0].view(-1) > -1, 0).float()
-        num_data = torch.sum(_labels > -1).float()
-        # joint accuracy
-        # joint_acc = sum(torch.sum(accuracy, 1) / slot_dim).float()
-        num_slots = accuracy.shape[1]
-        joint_acc = sum(torch.sum(accuracy, 1) == num_slots)
-        # slot accuracy
-        slot_acc = torch.sum(accuracy).float()
-        return joint_acc, slot_acc, num_turn, num_data
-
-    # 7 domains
-    joint_acc, slot_acc, num_turn, num_data = _eval_acc(pred_slot, labels)
-    accuracies['joint7'] += joint_acc
-    accuracies['slot7'] += slot_acc
-    accuracies['num_turn'] += num_turn
-    accuracies['num_slot7'] += num_data
-
-    # restaurant domain
-    joint_acc, slot_acc, num_turn, num_data = _eval_acc(pred_slot[:,:,18:25], labels[:,:,18:25])
-    accuracies['joint_rest'] += joint_acc
-    accuracies['slot_rest'] += slot_acc
-    accuracies['num_slot_rest'] += num_data
-
-    pred_slot5 = torch.cat((pred_slot[:,:,0:3], pred_slot[:,:,8:]), 2)
-    label_slot5 = torch.cat((labels[:,:,0:3], labels[:,:,8:]), 2)
-
-    # 5 domains (excluding bus and hotel domain)
-    joint_acc, slot_acc, num_turn, num_data = _eval_acc(pred_slot5, label_slot5)
-    accuracies['joint5'] += joint_acc
-    accuracies['slot5'] += slot_acc
-    accuracies['num_slot5'] += num_data
-
-    return accuracies
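
For reference, a minimal sketch (not part of the patch) of how the `accuracies` dictionary filled by `eval_all_accs` above is reduced to final scores; it assumes `torch` and the function above are in scope, and the tensor shapes and slot count (36) are invented purely for illustration:

    import torch

    pred = torch.randint(0, 5, (2, 3, 36))      # [dialogues, turns, slots]
    gold = pred.clone()
    gold[1, 2, :] = -1                           # -1 marks a padded turn
    accs = {'joint7': 0, 'slot7': 0, 'joint5': 0, 'slot5': 0, 'joint_rest': 0,
            'slot_rest': 0, 'num_turn': 0, 'num_slot7': 0, 'num_slot5': 0, 'num_slot_rest': 0}
    accs = eval_all_accs(pred, gold, accs)
    print('joint acc (7 domains):', (accs['joint7'] / accs['num_turn']).item())
    print('slot acc  (7 domains):', (accs['slot7'] / accs['num_slot7']).item())
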
diff --git a/convlab/dst/sumbt/multiwoz_zh/.gitignore b/convlab/dst/sumbt/multiwoz_zh/.gitignore
deleted file mode 100644
index 5b558554b2dea1b954e9bbcacb01bda105f03f17..0000000000000000000000000000000000000000
--- a/convlab/dst/sumbt/multiwoz_zh/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-pre-trained/
-model_output/
diff --git a/convlab/dst/sumbt/multiwoz_zh/__init__.py b/convlab/dst/sumbt/multiwoz_zh/__init__.py
deleted file mode 100644
index 1550c99e34cef1733e369510958291e7d1f98998..0000000000000000000000000000000000000000
--- a/convlab/dst/sumbt/multiwoz_zh/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from convlab.dst.sumbt.multiwoz_zh.sumbt import SUMBTTracker as SUMBT
diff --git a/convlab/dst/sumbt/multiwoz_zh/convert_to_glue_format.py b/convlab/dst/sumbt/multiwoz_zh/convert_to_glue_format.py
deleted file mode 100644
index b8fa6c6b9f4dae0c20476a1334d23f4ad2f5576b..0000000000000000000000000000000000000000
--- a/convlab/dst/sumbt/multiwoz_zh/convert_to_glue_format.py
+++ /dev/null
@@ -1,158 +0,0 @@
-import json
-import zipfile
-from convlab.dst.sumbt.multiwoz_zh.sumbt_config import *
-
-def trans_value(value):
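-    # Map frequent Chinese paraphrases onto the canonical ontology values:
-    # '未提及' = "not mentioned", '不在意' = "don't care", '有' / '没有' = "yes" / "no".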
-    trans = {
-        '': '未提及',
-        '没有提到': '未提及',
-        '没有': '未提及',
-        '未提到': '未提及',
-        '一个也没有': '未提及',
-        '无': '未提及',
-        '是的': '有',
-        '不是': '没有',
-        '不关心': '不在意',
-        '不在乎': '不在意',
-    }
-
-    return trans.get(value, value)
-
-
-def convert_to_glue_format(data_dir, sumbt_dir):
-
-    if not os.path.isdir(os.path.join(sumbt_dir, args.tmp_data_dir)):
-        os.mkdir(os.path.join(sumbt_dir, args.tmp_data_dir))
-
-    ### Read ontology file
-    with open(os.path.join(data_dir, "ontology.json"), "r") as fp_ont:
-        data_ont = json.load(fp_ont)
-        ontology = {}
-        for domain_slot in data_ont:
-            domain, slot = domain_slot.split('-')
-            if domain not in ontology:
-                ontology[domain] = {}
-            ontology[domain][slot] = {}
-            for value in data_ont[domain_slot]:
-                ontology[domain][slot][value] = 1
-
-    ### Read woz logs and write to tsv files
-    if os.path.exists(os.path.join(sumbt_dir, args.tmp_data_dir, "train.tsv")):
-        print('data has been processed!')
-        return 0
-    print('begin processing data')
-
-    fp_train = open(os.path.join(sumbt_dir, args.tmp_data_dir, "train.tsv"), "w")
-    fp_dev = open(os.path.join(sumbt_dir, args.tmp_data_dir, "dev.tsv"), "w")
-    fp_test = open(os.path.join(sumbt_dir, args.tmp_data_dir, "test.tsv"), "w")
-
-    fp_train.write('# Dialogue ID\tTurn Index\tUser Utterance\tSystem Response\t')
-    fp_dev.write('# Dialogue ID\tTurn Index\tUser Utterance\tSystem Response\t')
-    fp_test.write('# Dialogue ID\tTurn Index\tUser Utterance\tSystem Response\t')
-
-    for domain in sorted(ontology.keys()):
-        for slot in sorted(ontology[domain].keys()):
-            fp_train.write(str(domain) + '-' + str(slot) + '\t')
-            fp_dev.write(str(domain) + '-' + str(slot) + '\t')
-            fp_test.write(str(domain) + '-' + str(slot) + '\t')
-
-    fp_train.write('\n')
-    fp_dev.write('\n')
-    fp_test.write('\n')
-
-    # fp_data = open(os.path.join(SELF_DATA_DIR, "data.json"), "r")
-    # data = json.load(fp_data)
-
-    file_split = ['train', 'val', 'test']
-    fp = [fp_train, fp_dev, fp_test]
-
-    for split_type, split_fp in zip(file_split, fp):
-
-        zipfile_name = "{}.json.zip".format(split_type)
-        zip_fp = zipfile.ZipFile(os.path.join(data_dir, zipfile_name))
-        data = json.loads(str(zip_fp.read(zip_fp.namelist()[0]), 'utf-8'))
-
-        for file_id in data:
-            user_utterance = ''
-            system_response = ''
-            turn_idx = 0
-            for idx, turn in enumerate(data[file_id]['log']):
-                if idx % 2 == 0:        # user turn
-                    user_utterance = data[file_id]['log'][idx]['text']
-                else:                   # system turn
-                    user_utterance = user_utterance.replace('\t', ' ')
-                    user_utterance = user_utterance.replace('\n', ' ')
-                    user_utterance = user_utterance.replace('  ', ' ')
-
-                    system_response = system_response.replace('\t', ' ')
-                    system_response = system_response.replace('\n', ' ')
-                    system_response = system_response.replace('  ', ' ')
-
-                    split_fp.write(str(file_id))                   # 0: dialogue ID
-                    split_fp.write('\t' + str(turn_idx))           # 1: turn index
-                    split_fp.write('\t' + str(user_utterance))     # 2: user utterance
-                    split_fp.write('\t' + str(system_response))    # 3: system response
-
-                    belief = {}
-
-                    for domain in data[file_id]['log'][idx]['metadata'].keys():
-                        for slot in data[file_id]['log'][idx]['metadata'][domain]['semi'].keys():
-                            value = data[file_id]['log'][idx]['metadata'][domain]['semi'][slot].strip()
-                            # value = value_trans.get(value, value)
-                            value = trans_value(value)
-
-                            if domain not in ontology:
-                                print("domain (%s) is not defined" % domain)
-                                continue
-
-                            if slot not in ontology[domain]:
-                                print("slot (%s) in domain (%s) is not defined" % (slot, domain))   # bus-arriveBy not defined
-                                continue
-
-                            if value not in ontology[domain][slot] and value != '未提及':
-                                print("%s: value (%s) in domain (%s) slot (%s) is not defined in ontology" %
-                                      (file_id, value, domain, slot))
-                                value = '未提及'
-
-                            belief[str(domain) + '-' + str(slot)] = value
-
-                        for slot in data[file_id]['log'][idx]['metadata'][domain]['book'].keys():
-                            if slot == 'booked':
-                                continue
-                            if domain == '公共汽车' and slot == '人数' or domain == '列车' and slot == '票价':
-                                continue    # not defined in ontology
-
-                            value = data[file_id]['log'][idx]['metadata'][domain]['book'][slot].strip()
-                            value = trans_value(value)
-
-                            if str('预订' + slot) not in ontology[domain]:
-                                print("预订%s is not defined in domain %s" % (slot, domain))
-                                continue
-
-                            if value not in ontology[domain]['预订' + slot] and value != '未提及':
-                                print("%s: value (%s) in domain (%s) slot (预订%s) is not defined in ontology" %
-                                      (file_id, value, domain, slot))
-                                value = '未提及'
-
-                            belief[str(domain) + '-预订' + str(slot)] = value
-
-                    for domain in sorted(ontology.keys()):
-                        for slot in sorted(ontology[domain].keys()):
-                            key = str(domain) + '-' + str(slot)
-                            if key in belief:
-                                split_fp.write('\t' + belief[key])
-                            else:
-                                split_fp.write('\t未提及')
-
-                    split_fp.write('\n')
-                    split_fp.flush()
-
-                    system_response = data[file_id]['log'][idx]['text']
-                    turn_idx += 1
-
-    fp_train.close()
-    fp_dev.close()
-    fp_test.close()
-
-    print('data has been processed!')
-    
\ No newline at end of file
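
Each `.tsv` row written above has the layout `dialogue_id<TAB>turn_idx<TAB>user_utterance<TAB>system_response<TAB>value_1 ... value_N`, with one value column per alphabetically sorted `domain-slot` pair and `未提及` ("not mentioned") as the default. A small sketch for reading such rows back (the helper name and the `slot_names` argument are illustrative, not taken from the code):

    import csv

    def read_glue_rows(path, slot_names):
        """Yield (dialogue_id, turn_idx, user_utt, sys_resp, belief) tuples from a SUMBT tsv file."""
        with open(path, encoding='utf-8') as f:
            for row in csv.reader(f, delimiter='\t'):
                if not row or row[0].startswith('#'):   # skip the header line
                    continue
                dial_id, turn_idx, user_utt, sys_resp = row[:4]
                yield dial_id, int(turn_idx), user_utt, sys_resp, dict(zip(slot_names, row[4:]))
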
diff --git a/convlab/dst/sumbt/multiwoz_zh/sumbt.py b/convlab/dst/sumbt/multiwoz_zh/sumbt.py
deleted file mode 100644
index 16dc674c0a3d2f1faa8c85194c610f7ef41b987b..0000000000000000000000000000000000000000
--- a/convlab/dst/sumbt/multiwoz_zh/sumbt.py
+++ /dev/null
@@ -1,682 +0,0 @@
-import copy
-import random
-from itertools import chain
-import numpy as np
-import zipfile
-
-from tensorboardX.writer import SummaryWriter
-from tqdm import trange, tqdm
-
-from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
-
-from transformers import BertTokenizer
-from transformers import get_linear_schedule_with_warmup, AdamW
-
-from convlab.dst.dst import DST
-from convlab.dst.sumbt.multiwoz_zh.convert_to_glue_format import convert_to_glue_format
-from convlab.util.multiwoz_zh.state import default_state
-
-from convlab.dst.sumbt.BeliefTrackerSlotQueryMultiSlot import BeliefTracker
-from convlab.dst.sumbt.multiwoz_zh.sumbt_utils import *
-from convlab.dst.sumbt.multiwoz_zh.sumbt_config import *
-
-USE_CUDA = torch.cuda.is_available()
-N_GPU = torch.cuda.device_count() if USE_CUDA else 1
-DEVICE = "cuda" if USE_CUDA else "cpu"
-ROOT_PATH = convlab.get_root_path()
-SUMBT_PATH = os.path.dirname(os.path.abspath(__file__))
-DATA_PATH = os.path.join(ROOT_PATH, 'data/multiwoz_zh')
-DOWNLOAD_DIRECTORY = os.path.join(SUMBT_PATH, 'pre-trained')
-multiwoz_zh_slot_list = ['公共汽车-出发地', '公共汽车-出发时间', '公共汽车-到达时间', '公共汽车-日期', '公共汽车-目的地', '出租车-出发地', '出租车-出发时间', '出租车-到达时间', '出租车-目的地', '列车-出发地', '列车-出发时间', '列车-到达时间', '列车-日期', '列车-目的地', '列车-预订人数', '医院-科室', '旅馆-互联网', '旅馆-价格范围', '旅馆-停车处', '旅馆-区域', '旅馆-名称', '旅馆-星级', '旅馆-类型', '旅馆-预订人数', '旅馆-预订停留天数', '旅馆-预订日期', '景点-区域', '景点-名称', '景点-类型', '餐厅-价格范围', '餐厅-区域', '餐厅-名称', '餐厅-预订人数', '餐厅-预订日期', '餐厅-预订时间', '餐厅-食物']
-
-
-def get_label_embedding(labels, max_seq_length, tokenizer, device):
-    features = []
-    for label in labels:
-        label_tokens = ["[CLS]"] + tokenizer.tokenize(label) + ["[SEP]"]
-        label_token_ids = tokenizer.convert_tokens_to_ids(label_tokens)
-        label_len = len(label_token_ids)
-
-        label_padding = [0] * (max_seq_length - len(label_token_ids))
-        label_token_ids += label_padding
-        assert len(label_token_ids) == max_seq_length
-
-        features.append((label_token_ids, label_len))
-
-    all_label_token_ids = torch.tensor([f[0] for f in features], dtype=torch.long).to(device)
-    all_label_len = torch.tensor([f[1] for f in features], dtype=torch.long).to(device)
-
-    return all_label_token_ids, all_label_len
-
-
-def _truncate_seq_pair(tokens_a, tokens_b, max_length):
-    """Truncates a sequence pair in place to the maximum length."""
-
-    # This is a simple heuristic which will always truncate the longer sequence
-    # one token at a time. This makes more sense than truncating an equal percent
-    # of tokens from each, since if one sequence is very short then each token
-    # that's truncated likely contains more information than a longer sequence.
-    while True:
-        total_length = len(tokens_a) + len(tokens_b)
-        if total_length <= max_length:
-            break
-        if len(tokens_a) > len(tokens_b):
-            tokens_a.pop()
-        else:
-            tokens_b.pop()
-
-
-class SUMBTTracker(DST):
-    """
-    Transferable multi-domain dialogue state tracker, adopted from https://github.com/SKTBrain/SUMBT
-    """
-
-    # adapt data provider
-    # unzip mt.zip, and zip each [train|val|test].json
-    @staticmethod
-    def init_data():
-        if not os.path.exists(os.path.join(DATA_PATH, 'train.json.zip')):
-            with zipfile.ZipFile(os.path.join(DATA_PATH, 'mt.zip')) as f:
-                f.extractall(DATA_PATH)
-
-        for split in ['train', 'test', 'val']:
-            with zipfile.ZipFile(os.path.join(DATA_PATH, f'{split}.json.zip'), 'w') as f:
-                f.write(os.path.join(DATA_PATH, f'{split}.json'), f'{split}.json')
-
-    def __init__(self, data_dir=DATA_PATH, eval_slots=multiwoz_zh_slot_list):
-        DST.__init__(self)
-
-        self.init_data()
-
-        processor = Processor(args)
-        self.processor = processor
-        label_list = processor.get_labels()
-        num_labels = [len(labels) for labels in label_list]  # number of slot-values in each slot-type
-
-        # tokenizer
-        self.tokenizer = BertTokenizer.from_pretrained(args.bert_model_name, cache_dir=args.bert_model_cache_dir)
-        random.seed(args.seed)
-        np.random.seed(args.seed)
-        torch.manual_seed(args.seed)
-
-        self.device = torch.device("cuda" if USE_CUDA else "cpu")
-
-        self.sumbt_model = BeliefTracker(args, num_labels, self.device)
-        if USE_CUDA and N_GPU > 1:
-            self.sumbt_model = torch.nn.DataParallel(self.sumbt_model)
-        if args.fp16:
-            self.sumbt_model.half()
-        self.sumbt_model.to(self.device)
-
-        ## Get slot-value embeddings
-        self.label_token_ids, self.label_len = [], []
-        for labels in label_list:
-            token_ids, lens = get_label_embedding(labels, args.max_label_length, self.tokenizer, self.device)
-            self.label_token_ids.append(token_ids)
-            self.label_len.append(lens)
-        self.label_map = [{label: i for i, label in enumerate(labels)} for labels in label_list]
-        self.label_map_inv = [{i: label for i, label in enumerate(labels)} for labels in label_list]
-        self.label_list = label_list
-        self.target_slot = processor.target_slot
-        ## Get domain-slot-type embeddings
-        self.slot_token_ids, self.slot_len = \
-            get_label_embedding(processor.target_slot, args.max_label_length, self.tokenizer, self.device)
-
-        self.args = args
-        self.state = default_state()
-        self.param_restored = False
-        if USE_CUDA and N_GPU == 1:
-            self.sumbt_model.initialize_slot_value_lookup(self.label_token_ids, self.slot_token_ids)
-        elif USE_CUDA and N_GPU > 1:
-            self.sumbt_model.module.initialize_slot_value_lookup(self.label_token_ids, self.slot_token_ids)
-
-        self.cached_res = {}
-        convert_to_glue_format(DATA_PATH, SUMBT_PATH)
-        if not os.path.isdir(os.path.join(SUMBT_PATH, args.output_dir)):
-            os.makedirs(os.path.join(SUMBT_PATH, args.output_dir))
-        self.train_examples = processor.get_train_examples(os.path.join(SUMBT_PATH, args.tmp_data_dir), accumulation=False)
-        self.dev_examples = processor.get_dev_examples(os.path.join(SUMBT_PATH, args.tmp_data_dir), accumulation=False)
-        self.test_examples = processor.get_test_examples(os.path.join(SUMBT_PATH, args.tmp_data_dir), accumulation=False)
-        self.eval_slots = eval_slots
-
-    def load_weights(self, model_path=None):
-        if model_path is None:
-            model_ckpt = os.path.join(SUMBT_PATH, 'pre-trained/pytorch_model.bin')
-        else:
-            model_ckpt = model_path
-        model = self.sumbt_model
-        # handles the case where the slots and values differ between training and evaluation
-        if not USE_CUDA:
-            ptr_model = torch.load(model_ckpt, map_location=torch.device('cpu'))
-        else:
-            ptr_model = torch.load(model_ckpt)
-        print('loading pretrained weights')
-
-        if not USE_CUDA or N_GPU == 1:
-            state = model.state_dict()
-            state.update(ptr_model)
-            model.load_state_dict(state)
-        else:
-            # print("Evaluate using only one device!")
-            model.module.load_state_dict(ptr_model)
-
-        if USE_CUDA:
-            model.to("cuda")
-
-    def train(self, load_model=False, model_path=None):
-        if load_model:
-            if model_path is not None:
-                self.load_weights(model_path)
-        ## Training utterances
-        all_input_ids, all_input_len, all_label_ids = convert_examples_to_features(
-            self.train_examples, self.label_list, args.max_seq_length, self.tokenizer, args.max_turn_length)
-
-        num_train_batches = all_input_ids.size(0)
-        num_train_steps = int(
-            num_train_batches / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)
-
-        logger.info("***** training *****")
-        logger.info("  Num examples = %d", len(self.train_examples))
-        logger.info("  Batch size = %d", args.train_batch_size)
-        logger.info("  Num steps = %d", num_train_steps)
-
-        all_input_ids, all_input_len, all_label_ids = all_input_ids.to(DEVICE), all_input_len.to(
-            DEVICE), all_label_ids.to(DEVICE)
-
-        train_data = TensorDataset(all_input_ids, all_input_len, all_label_ids)
-        train_sampler = RandomSampler(train_data)
-        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
-
-        all_input_ids_dev, all_input_len_dev, all_label_ids_dev = convert_examples_to_features(
-            self.dev_examples, self.label_list, args.max_seq_length, self.tokenizer, args.max_turn_length)
-
-        logger.info("***** validation *****")
-        logger.info("  Num examples = %d", len(self.dev_examples))
-        logger.info("  Batch size = %d", args.dev_batch_size)
-
-        all_input_ids_dev, all_input_len_dev, all_label_ids_dev = \
-            all_input_ids_dev.to(DEVICE), all_input_len_dev.to(DEVICE), all_label_ids_dev.to(DEVICE)
-
-        dev_data = TensorDataset(all_input_ids_dev, all_input_len_dev, all_label_ids_dev)
-        dev_sampler = SequentialSampler(dev_data)
-        dev_dataloader = DataLoader(dev_data, sampler=dev_sampler, batch_size=args.dev_batch_size)
-
-        logger.info("Loaded data!")
-
-        if args.fp16:
-            self.sumbt_model.half()
-        self.sumbt_model.to(DEVICE)
-
-        ## Get domain-slot-type embeddings
-        slot_token_ids, slot_len = \
-            get_label_embedding(self.processor.target_slot, args.max_label_length, self.tokenizer, DEVICE)
-
-        # for slot_idx, slot_str in zip(slot_token_ids, self.processor.target_slot):
-        #     self.idx2slot[slot_idx] = slot_str
-
-        ## Get slot-value embeddings
-        label_token_ids, label_len = [], []
-        for slot_idx, labels in zip(slot_token_ids, self.label_list):
-            # self.idx2value[slot_idx] = {}
-            token_ids, lens = get_label_embedding(labels, args.max_label_length, self.tokenizer, DEVICE)
-            label_token_ids.append(token_ids)
-            label_len.append(lens)
-            # for label, token_id in zip(labels, token_ids):
-            #     self.idx2value[slot_idx][token_id] = label
-
-        logger.info('embeddings prepared')
-
-        if USE_CUDA and N_GPU > 1:
-            self.sumbt_model.module.initialize_slot_value_lookup(label_token_ids, slot_token_ids)
-        else:
-            self.sumbt_model.initialize_slot_value_lookup(label_token_ids, slot_token_ids)
-
-        def get_optimizer_grouped_parameters(model):
-            param_optimizer = [(n, p) for n, p in model.named_parameters() if p.requires_grad]
-            no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
-            optimizer_grouped_parameters = [
-                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01,
-                 'lr': args.learning_rate},
-                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0,
-                 'lr': args.learning_rate},
-            ]
-            return optimizer_grouped_parameters
-
-        if not USE_CUDA or N_GPU == 1:
-            optimizer_grouped_parameters = get_optimizer_grouped_parameters(self.sumbt_model)
-        else:
-            optimizer_grouped_parameters = get_optimizer_grouped_parameters(self.sumbt_model.module)
-
-        t_total = num_train_steps
-
-        scheduler = None
-        if args.fp16:
-            try:
-                from apex.optimizers import FP16_Optimizer
-                from apex.optimizers import FusedAdam
-            except ImportError:
-                raise ImportError(
-                    "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
-
-            optimizer = FusedAdam(optimizer_grouped_parameters,
-                                  lr=args.learning_rate,
-                                  bias_correction=False,
-                                  max_grad_norm=1.0)
-            if args.fp16_loss_scale == 0:
-                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
-            else:
-                optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.fp16_loss_scale)
-
-        else:
-            optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, correct_bias=False)
-            scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_proportion*t_total, num_training_steps=t_total)
-        logger.info(optimizer)
-
-        # Training code
-        ###############################################################################
-
-        logger.info("Training...")
-
-        global_step = 0
-        last_update = None
-        best_loss = None
-        model = self.sumbt_model
-        if args.do_not_use_tensorboard:
-            summary_writer = None
-        else:
-            summary_writer = SummaryWriter("./tensorboard_summary/logs_1214/")
-
-        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
-            # Train
-            model.train()
-            tr_loss = 0
-            nb_tr_examples = 0
-            nb_tr_steps = 0
-
-            for step, batch in enumerate(tqdm(train_dataloader)):
-                batch = tuple(t.to(DEVICE) for t in batch)
-                input_ids, input_len, label_ids = batch
-
-                # Forward
-                if N_GPU == 1:
-                    loss, loss_slot, acc, acc_slot, _ = model(input_ids, input_len, label_ids, N_GPU)
-                else:
-                    loss, _, acc, acc_slot, _ = model(input_ids, input_len, label_ids, N_GPU)
-
-                    # average to multi-gpus
-                    loss = loss.mean()
-                    acc = acc.mean()
-                    acc_slot = acc_slot.mean(0)
-
-                if args.gradient_accumulation_steps > 1:
-                    loss = loss / args.gradient_accumulation_steps
-
-                # Backward
-                if args.fp16:
-                    optimizer.backward(loss)
-                else:
-                    loss.backward()
-
-                # tensorboard logging
-                if summary_writer is not None:
-                    summary_writer.add_scalar("Epoch", epoch, global_step)
-                    summary_writer.add_scalar("Train/Loss", loss, global_step)
-                    summary_writer.add_scalar("Train/JointAcc", acc, global_step)
-                    if N_GPU == 1:
-                        for i, slot in enumerate(self.processor.target_slot):
-                            summary_writer.add_scalar("Train/Loss_%s" % slot.replace(' ', '_'), loss_slot[i],
-                                                      global_step)
-                            summary_writer.add_scalar("Train/Acc_%s" % slot.replace(' ', '_'), acc_slot[i], global_step)
-
-                tr_loss += loss.item()
-                nb_tr_examples += input_ids.size(0)
-                nb_tr_steps += 1
-                if (step + 1) % args.gradient_accumulation_steps == 0:
-                    # modify learning rate with the special warm-up BERT uses
-                    lr_this_step = args.learning_rate * warmup_linear(global_step / t_total, args.warmup_proportion)
-                    if summary_writer is not None:
-                        summary_writer.add_scalar("Train/LearningRate", lr_this_step, global_step)
-                    for param_group in optimizer.param_groups:
-                        param_group['lr'] = lr_this_step
-                    if scheduler is not None:
-                        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
-                    optimizer.step()
-                    if scheduler is not None:
-                        scheduler.step()
-                    optimizer.zero_grad()
-                    global_step += 1
-
-
-            # Perform evaluation on validation dataset
-            model.eval()
-            dev_loss = 0
-            dev_acc = 0
-            dev_loss_slot, dev_acc_slot = None, None
-            nb_dev_examples, nb_dev_steps = 0, 0
-
-            for step, batch in enumerate(tqdm(dev_dataloader, desc="Validation")):
-                batch = tuple(t.to(DEVICE) for t in batch)
-                input_ids, input_len, label_ids = batch
-                if input_ids.dim() == 2:
-                    input_ids = input_ids.unsqueeze(0)
-                    input_len = input_len.unsqueeze(0)
-                    label_ids = label_ids.unsqueeze(0)
-
-                with torch.no_grad():
-                    if N_GPU == 1:
-                        loss, loss_slot, acc, acc_slot, _ = model(input_ids, input_len, label_ids, N_GPU)
-                    else:
-                        loss, _, acc, acc_slot, _ = model(input_ids, input_len, label_ids, N_GPU)
-
-                        # average to multi-gpus
-                        loss = loss.mean()
-                        acc = acc.mean()
-                        acc_slot = acc_slot.mean(0)
-
-                num_valid_turn = torch.sum(label_ids[:, :, 0].view(-1) > -1, 0).item()
-                dev_loss += loss.item() * num_valid_turn
-                dev_acc += acc.item() * num_valid_turn
-
-                if N_GPU == 1:
-                    if dev_loss_slot is None:
-                        dev_loss_slot = [l * num_valid_turn for l in loss_slot]
-                        dev_acc_slot = acc_slot * num_valid_turn
-                    else:
-                        for i, l in enumerate(loss_slot):
-                            dev_loss_slot[i] = dev_loss_slot[i] + l * num_valid_turn
-                        dev_acc_slot += acc_slot * num_valid_turn
-
-                nb_dev_examples += num_valid_turn
-
-
-            dev_loss = dev_loss / nb_dev_examples
-            dev_acc = dev_acc / nb_dev_examples
-
-            if N_GPU == 1:
-                dev_acc_slot = dev_acc_slot / nb_dev_examples
-
-            # tensorboard logging
-            if summary_writer is not None:
-                summary_writer.add_scalar("Validate/Loss", dev_loss, global_step)
-                summary_writer.add_scalar("Validate/Acc", dev_acc, global_step)
-                if N_GPU == 1:
-                    for i, slot in enumerate(self.processor.target_slot):
-                        summary_writer.add_scalar("Validate/Loss_%s" % slot.replace(' ', '_'),
-                                                  dev_loss_slot[i] / nb_dev_examples, global_step)
-                        summary_writer.add_scalar("Validate/Acc_%s" % slot.replace(' ', '_'), dev_acc_slot[i],
-                                                  global_step)
-
-            dev_loss = round(dev_loss, 6)
-
-            output_model_file = os.path.join(os.path.join(SUMBT_PATH, args.output_dir), "pytorch_model.bin")
-
-            if last_update is None or dev_loss < best_loss:
-
-                if not USE_CUDA or N_GPU == 1:
-                    torch.save(model.state_dict(), output_model_file)
-                else:
-                    torch.save(model.module.state_dict(), output_model_file)
-
-                last_update = epoch
-                best_loss = dev_loss
-                best_acc = dev_acc
-
-                logger.info(
-                    "*** Model Updated: Epoch=%d, Validation Loss=%.6f, Validation Acc=%.6f, global_step=%d ***" % (
-                        last_update, best_loss, best_acc, global_step))
-            else:
-                logger.info(
-                    "*** Model NOT Updated: Epoch=%d, Validation Loss=%.6f, Validation Acc=%.6f, global_step=%d  ***" % (
-                        epoch, dev_loss, dev_acc, global_step))
-
-            if last_update + args.patience <= epoch:
-                break
-
-    def test(self, mode='dev', model_path=None):
-        '''Testing function of SUMBT'''
-        # Evaluation
-        self.load_weights(model_path)
-
-        if mode == 'test':
-            eval_examples = self.test_examples
-        elif mode == 'dev':
-            eval_examples = self.dev_examples
-
-        all_input_ids, all_input_len, all_label_ids = convert_examples_to_features(
-            eval_examples, self.label_list, args.max_seq_length, self.tokenizer, args.max_turn_length)
-        all_input_ids, all_input_len, all_label_ids = all_input_ids.to(DEVICE), all_input_len.to(
-            DEVICE), all_label_ids.to(DEVICE)
-        logger.info("***** Running evaluation *****")
-        logger.info("  Num examples = %d", len(eval_examples))
-        logger.info("  Batch size = %d", args.eval_batch_size)
-
-        eval_data = TensorDataset(all_input_ids, all_input_len, all_label_ids)
-
-        # Run prediction for full data
-        eval_sampler = SequentialSampler(eval_data)
-        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
-
-        model = self.sumbt_model
-        eval_loss, eval_accuracy = 0, 0
-        eval_loss_slot, eval_acc_slot = None, None
-        nb_eval_steps, nb_eval_examples = 0, 0
-
-        accuracies = {'joint7': 0, 'slot7': 0, 'joint5': 0, 'slot5': 0, 'joint_rest': 0, 'slot_rest': 0,
-                      'num_turn': 0, 'num_slot7': 0, 'num_slot5': 0, 'num_slot_rest': 0}
-
-        for input_ids, input_len, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
-            # if input_ids.dim() == 2:
-            #     input_ids = input_ids.unsqueeze(0)
-            #     input_len = input_len.unsqueeze(0)
-            #     label_ids = label_ids.unsuqeeze(0)
-
-            with torch.no_grad():
-                if not USE_CUDA or N_GPU == 1:
-                    loss, loss_slot, acc, acc_slot, pred_slot = model(input_ids, input_len, label_ids, 1)
-                else:
-                    loss, _, acc, acc_slot, pred_slot = model(input_ids, input_len, label_ids, N_GPU)
-                    nbatch = label_ids.size(0)
-                    nslot = pred_slot.size(3)
-                    pred_slot = pred_slot.view(nbatch, -1, nslot)
-
-            accuracies = eval_all_accs(pred_slot, label_ids, accuracies)
-
-            nb_eval_ex = (label_ids[:, :, 0].view(-1) != -1).sum().item()
-            nb_eval_examples += nb_eval_ex
-            nb_eval_steps += 1
-
-            if not USE_CUDA or N_GPU == 1:
-                eval_loss += loss.item() * nb_eval_ex
-                eval_accuracy += acc.item() * nb_eval_ex
-                if eval_loss_slot is None:
-                    eval_loss_slot = [l * nb_eval_ex for l in loss_slot]
-                    eval_acc_slot = acc_slot * nb_eval_ex
-                else:
-                    for i, l in enumerate(loss_slot):
-                        eval_loss_slot[i] = eval_loss_slot[i] + l * nb_eval_ex
-                    eval_acc_slot += acc_slot * nb_eval_ex
-            else:
-                eval_loss += sum(loss) * nb_eval_ex
-                eval_accuracy += sum(acc) * nb_eval_ex
-
-        eval_loss = eval_loss / nb_eval_examples
-        eval_accuracy = eval_accuracy / nb_eval_examples
-        if not USE_CUDA or N_GPU == 1:
-            eval_acc_slot = eval_acc_slot / nb_eval_examples
-
-        loss = None
-
-        if not USE_CUDA or N_GPU == 1:
-            result = {'eval_loss': eval_loss,
-                      'eval_accuracy': eval_accuracy,
-                      'loss': loss,
-                      'eval_loss_slot': '\t'.join([str(val / nb_eval_examples) for val in eval_loss_slot]),
-                      'eval_acc_slot': '\t'.join([str((val).item()) for val in eval_acc_slot])
-                      }
-        else:
-            result = {'eval_loss': eval_loss,
-                      'eval_accuracy': eval_accuracy,
-                      'loss': loss
-                      }
-
-        out_file_name = 'eval_results'
-        # if TARGET_SLOT == 'all':
-        #     out_file_name += '_all'
-        output_eval_file = os.path.join(os.path.join(SUMBT_PATH, args.output_dir), "%s.txt" % out_file_name)
-
-        if not USE_CUDA or N_GPU == 1:
-            with open(output_eval_file, "w") as writer:
-                logger.info("***** Eval results *****")
-                for key in sorted(result.keys()):
-                    logger.info("  %s = %s", key, str(result[key]))
-                    writer.write("%s = %s\n" % (key, str(result[key])))
-
-        out_file_name = 'eval_all_accuracies'
-        with open(os.path.join(os.path.join(SUMBT_PATH, args.output_dir), "%s.txt" % out_file_name), 'w') as f:
-            s = '{:^22s}:{:^22s}:{:^22s}:{:^22s}:{:^22s}:{:^22s}'.format(
-                'joint acc (7 domain)', 
-                'slot acc (7 domain)', 
-                'joint acc (5 domain)', 
-                'slot acc (5 domain)', 
-                'joint restaurant', 
-                'slot acc restaurant')
-            f.write(s + '\n')
-            print(s)
-            s = '{:^22.5f}:{:^22.5f}:{:^22.5f}:{:^22.5f}:{:^22.5f}:{:^22.5f}'.format(
-                (accuracies['joint7'] / accuracies['num_turn']).item(),
-                (accuracies['slot7'] / accuracies['num_slot7']).item(),
-                (accuracies['joint5'] / accuracies['num_turn']).item(),
-                (accuracies['slot5'] / accuracies['num_slot5']).item(),
-                (accuracies['joint_rest'] / accuracies['num_turn']).item(),
-                (accuracies['slot_rest'] / accuracies['num_slot_rest']).item()
-            )
-            f.write(s + '\n')
-            print(s)
-
-    def init_session(self):
-        self.state = default_state()
-        if not self.param_restored:
-            if os.path.isfile(os.path.join(DOWNLOAD_DIRECTORY, 'pytorch_model.bin')):
-                print('loading weights from downloaded model')
-                self.load_weights(model_path=os.path.join(DOWNLOAD_DIRECTORY, 'pytorch_model.bin'))
-            elif os.path.isfile(os.path.join(SUMBT_PATH, args.output_dir, 'pytorch_model.bin')):
-                print('loading weights from trained model')
-                self.load_weights(model_path=os.path.join(SUMBT_PATH, args.output_dir, 'pytorch_model.bin'))
-            else:
-                raise ValueError('no available weights found.')
-            self.param_restored = True
-
-    def update(self, user_act=None):
-        """Update the dialogue state with the generated tokens from TRADE"""
-        if not isinstance(user_act, str):
-            raise Exception(
-                'Expected user_act is str but found {}'.format(type(user_act))
-            )
-        prev_state = self.state
-
-        actual_history = copy.deepcopy(prev_state['history'])
-
-        query = self.construct_query(actual_history)
-        pred_states = self.predict(query)
-
-        new_belief_state = copy.deepcopy(prev_state['belief_state'])
-        for state in pred_states:
-            domain, slot, value = state.split('-', 2)
-            # slot = REF_SYS_DA[domain.capitalize()].get(slot, slot)
-            assert 'semi' in new_belief_state[domain]
-            assert 'book' in new_belief_state[domain]
-            domain_dic = new_belief_state[domain]
-            if '预订' in slot:
-                assert slot.startswith('预订')
-                slot = slot[2:]
-                assert slot in domain_dic['book']
-
-            if slot in domain_dic['semi']:
-                new_belief_state[domain]['semi'][slot] = value
-                # normalize_value(self.value_dict, domain, slot, value)
-            elif slot in domain_dic['book']:
-                new_belief_state[domain]['book'][slot] = value
-            else:
-                with open('trade_tracker_unknown_slot.log', 'a+') as f:
-                    f.write(
-                        'unknown slot name <{}> with value <{}> of domain <{}>\nitem: {}\n\n'.format(slot, value, domain, state)
-                    )
-
-        # new_request_state = copy.deepcopy(prev_state['request_state'])
-        # # update request_state
-        # user_request_slot = self.detect_requestable_slots(user_act)
-        # for domain in user_request_slot:
-        #     for key in user_request_slot[domain]:
-        #         if domain not in new_request_state:
-        #             new_request_state[domain] = {}
-        #         if key not in new_request_state[domain]:
-        #             new_request_state[domain][key] = user_request_slot[domain][key]
-
-        new_state = copy.deepcopy(dict(prev_state))
-        new_state['belief_state'] = new_belief_state
-        # new_state['request_state'] = new_request_state
-        self.state = new_state
-        # print((pred_states, query))
-        return self.state
-
-    def predict(self, query):
-        cache_query_key = ''.join(str(list(chain.from_iterable(query[0]))))
-        if cache_query_key in self.cached_res.keys():
-            return self.cached_res[cache_query_key]
-
-        input_ids, input_len = query
-        input_ids = torch.tensor(input_ids).to(self.device).unsqueeze(0)
-        input_len = torch.tensor(input_len).to(self.device).unsqueeze(0)
-        labels = None
-        _, pred_slot = self.sumbt_model(input_ids, input_len, labels)
-        pred_slot_t = pred_slot[0][-1].tolist()
-        predict_belief = []
-        for idx, i in enumerate(pred_slot_t):
-            predict_belief.append('{}-{}'.format(self.target_slot[idx], self.label_map_inv[idx][i]))
-        self.cached_res[cache_query_key] = predict_belief
-
-        return predict_belief
-
-
-    def construct_query(self, context):
-        '''Construct query from context'''
-        ids = []
-        lens = []
-        context_len = len(context)
-        if context[0][0] != 'sys':
-            context = [['sys', '']] + context
-        for i in range(0, context_len, 2):
-            # utt_user = ''
-            # utt_sys = ''
-            # for evaluation
-            utt_sys = context[i][1]
-            utt_user = context[i + 1][1]
-
-            tokens_user = [x if x != '#' else '[SEP]' for x in self.tokenizer.tokenize(utt_user)]
-            tokens_sys = [x if x != '#' else '[SEP]' for x in self.tokenizer.tokenize(utt_sys)]
-
-            _truncate_seq_pair(tokens_user, tokens_sys, self.args.max_seq_length - 3)
-            tokens = ["[CLS]"] + tokens_user + ["[SEP]"] + tokens_sys + ["[SEP]"]
-            input_len = [len(tokens_user) + 2, len(tokens_sys) + 1]
-
-            input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
-            padding = [0] * (self.args.max_seq_length - len(input_ids))
-            input_ids += padding
-            assert len(input_ids) == self.args.max_seq_length
-            ids.append(input_ids)
-            lens.append(input_len)
-
-        return (ids, lens)
-
-    def detect_requestable_slots(self, observation):
-        result = {}
-        observation = observation.lower()
-        _observation = ' {} '.format(observation)
-        for value in self.det_dic.keys():
-            _value = ' {} '.format(value.strip())
-            if _value in _observation:
-                key, domain = self.det_dic[value].split('-')
-                if domain not in result:
-                    result[domain] = {}
-                result[domain][key] = 0
-        return result
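
For orientation, a rough usage sketch of the tracker removed above in a ConvLab pipeline; it assumes pre-trained weights exist under `pre-trained/` (or a trained model under `model_output/`) and that the Chinese MultiWOZ default state uses the ontology's domain keys (e.g. '餐厅'):

    from convlab.dst.sumbt.multiwoz_zh import SUMBT   # re-exported SUMBTTracker

    tracker = SUMBT()              # builds the model and prepares the GLUE-format data
    tracker.init_session()         # restores downloaded or locally trained weights
    utterance = '你好，我想找一家价格便宜的餐厅。'      # "Hi, I am looking for a cheap restaurant."
    tracker.state['history'] = [['sys', ''], ['user', utterance]]
    state = tracker.update(utterance)
    print(state['belief_state']['餐厅']['semi'])
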
diff --git a/convlab/dst/sumbt/multiwoz_zh/sumbt_config.py b/convlab/dst/sumbt/multiwoz_zh/sumbt_config.py
deleted file mode 100644
index f0bfd90ff474d903329c6956f0ade7a861caedfc..0000000000000000000000000000000000000000
--- a/convlab/dst/sumbt/multiwoz_zh/sumbt_config.py
+++ /dev/null
@@ -1,53 +0,0 @@
-import os
-import convlab
-
-class DotMap():
-    def __init__(self):
-        # sequence / turn limits
-        self.max_label_length = 32
-        self.max_seq_length = 64
-        self.max_turn_length = 22
-
-        # model
-        self.num_rnn_layers = 1
-        self.zero_init_rnn = False
-        self.attn_head = 4
-        self.hidden_dim = 300
-        self.task_name = 'bert-gru-sumbt'
-        self.nbt = 'rnn'
-        self.target_slot = 'all'
-        self.distance_metric = 'euclidean'
-        self.fix_utterance_encoder = False
-
-        # training / evaluation
-        self.do_eval = True
-        self.do_train = False
-        self.train_batch_size = 3
-        self.dev_batch_size = 1
-        self.eval_batch_size = 1
-        self.learning_rate = 5e-5
-        self.num_train_epochs = 300
-        self.patience = 15
-        self.warmup_proportion = 0.1
-        self.gradient_accumulation_steps = 1
-        self.local_rank = -1
-        self.seed = 42
-        self.fp16 = False
-        self.loss_scale = 0
-        self.fp16_loss_scale = 0.0
-        self.do_not_use_tensorboard = False
-
-        # encoder
-        self.bert_model = os.path.join(convlab.get_root_path(), "pre-trained-models/chinese-bert-wwm-ext")
-        self.bert_model_cache_dir = os.path.join(convlab.get_root_path(), "pre-trained-models/")
-        self.bert_model_name = "hfl/chinese-bert-wwm-ext"
-        self.do_lower_case = True
-
-        # paths
-        self.data_dir = 'data/multiwoz_zh/'
-        self.tf_dir = 'tensorboard'
-        self.tmp_data_dir = 'processed_data/'
-        self.output_dir = 'model_output/'
-
-args = DotMap()
\ No newline at end of file
diff --git a/convlab/dst/sumbt/multiwoz_zh/sumbt_utils.py b/convlab/dst/sumbt/multiwoz_zh/sumbt_utils.py
deleted file mode 100644
index 19b6864115e0b7129546097f033b2efa6ccba9a3..0000000000000000000000000000000000000000
--- a/convlab/dst/sumbt/multiwoz_zh/sumbt_utils.py
+++ /dev/null
@@ -1,440 +0,0 @@
-import csv
-import os
-import json
-import collections
-import logging
-import re
-import torch
-
-logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
-                    datefmt='%m/%d/%Y %H:%M:%S',
-                    level=logging.INFO)
-logger = logging.getLogger(__name__)
-
-
-class DataProcessor(object):
-    """Base class for data converters for sequence classification data sets."""
-
-    def get_train_examples(self, data_dir):
-        """Gets a collection of `InputExample`s for the train set."""
-        raise NotImplementedError()
-
-    def get_dev_examples(self, data_dir):
-        """Gets a collection of `InputExample`s for the dev set."""
-        raise NotImplementedError()
-
-    def get_labels(self):
-        """Gets the list of labels for this data set."""
-        raise NotImplementedError()
-
-    @classmethod
-    def _read_tsv(cls, input_file, quotechar=None):
-        """Reads a tab separated value file."""
-        with open(input_file, "r", encoding='utf-8') as f:
-            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
-            lines = []
-            for line in reader:
-                if len(line) > 0 and line[0][0] == '#':  # ignore comments (starting with '#')
-                    continue
-                lines.append(line)
-            return lines
-
-
-class Processor(DataProcessor):
-    """Processor for the belief tracking dataset (GLUE version)."""
-
-    def __init__(self, config):
-        super(Processor, self).__init__()
-
-        print(config)
-        # MultiWOZ dataset
-
-        with open(os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))), config.data_dir, "ontology.json"), "r") as fp_ontology:
-            ontology = json.load(fp_ontology)
-            for slot in ontology.keys():
-                ontology[slot].append("未提及")
-
-        if config.target_slot != 'all':
-            raise Exception('unsupported')
-
-        # sorting the ontology according to the alphabetic order of the slots
-        ontology = collections.OrderedDict(sorted(ontology.items()))
-
-        # select slots to train
-        nslots = len(ontology.keys())
-        target_slot = list(ontology.keys())
-
-        self.target_slot_idx = [*range(0, nslots)]
-
-        for idx in range(0, nslots):
-            if not idx in self.target_slot_idx:
-                del ontology[target_slot[idx]]
-
-        self.ontology = ontology
-        self.target_slot = list(self.ontology.keys())
-        for i, slot in enumerate(self.target_slot):
-            if slot == "pricerange":
-                self.target_slot[i] = "price range"
-
-        logger.info('Processor: target_slot')
-        logger.info(self.target_slot)
-
-    def get_train_examples(self, data_dir, accumulation=False):
-        """See base class."""
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train", accumulation)
-
-    def get_dev_examples(self, data_dir, accumulation=False):
-        """See base class."""
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev", accumulation)
-
-    def get_test_examples(self, data_dir, accumulation=False):
-        """See base class."""
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "test.tsv")), "test", accumulation)
-
-    def get_labels(self):
-        """See base class."""
-        return [self.ontology[slot] for slot in self.target_slot]
-
-    def _create_examples(self, lines, set_type, accumulation=False):
-        """Creates examples for the training and dev sets."""
-        prev_dialogue_index = None
-        examples = []
-        for (i, line) in enumerate(lines):
-            guid = "%s-%s-%s" % (set_type, line[0], line[1])  # line[0]: dialogue index, line[1]: turn index
-            if accumulation:
-                if prev_dialogue_index is None or prev_dialogue_index != line[0]:
-                    text_a = line[2]
-                    text_b = line[3]
-                    prev_dialogue_index = line[0]
-                else:
-                    # The symbol '#' will be replaced with '[SEP]' after tokenization.
-                    text_a = line[2] + " # " + text_a
-                    text_b = line[3] + " # " + text_b
-            else:
-                text_a = line[2]  # line[2]: user utterance
-                text_b = line[3]  # line[3]: system response
-
-            label = [line[4 + idx] for idx in self.target_slot_idx]
-
-            examples.append(
-                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
-        return examples
-
-
-def normalize_text(text):
-    global replacements
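-    # NOTE: `replacements` is expected to be a module-level list of (from, to) string
-    # pairs; it is not defined anywhere in this file.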
-    # lower case every word
-    text = text.lower()
-    # replace white spaces in front and end
-    text = re.sub(r'^\s*|\s*$', '', text)
-
-    # hotel domain pfb30
-    text = re.sub(r"b&b", "bed and breakfast", text)
-    text = re.sub(r"b and b", "bed and breakfast", text)
-
-    # normalise separator punctuation (';' and '/')
-    text = text.replace(';', ',')
-    text = re.sub('$\/', '', text)
-    text = text.replace('/', ' and ')
-
-    # replace other special characters
-    text = text.replace('-', ' ')
-    text = re.sub('[\"\<>@\(\)]', '', text)  # remove
-
-    # insert white space before and after tokens:
-    for token in ['?', '.', ',', '!']:
-        text = insertSpace(token, text)
-
-    # insert white space for 's
-    text = insertSpace('\'s', text)
-
-    # replace it's, does't, you'd ... etc
-    text = re.sub('^\'', '', text)
-    text = re.sub('\'$', '', text)
-    text = re.sub('\'\s', ' ', text)
-    text = re.sub('\s\'', ' ', text)
-    for fromx, tox in replacements:
-        text = ' ' + text + ' '
-        text = text.replace(fromx, tox)[1:-1]
-
-    # remove multiple spaces
-    text = re.sub(' +', ' ', text)
-
-    # concatenate numbers
-    tmp = text
-    tokens = text.split()
-    i = 1
-    while i < len(tokens):
-        if re.match(u'^\d+$', tokens[i]) and \
-                re.match(u'\d+$', tokens[i - 1]):
-            tokens[i - 1] += tokens[i]
-            del tokens[i]
-        else:
-            i += 1
-    text = ' '.join(tokens)
-
-    return text
-
-
-def insertSpace(token, text):
-    sidx = 0
-    while True:
-        sidx = text.find(token, sidx)
-        if sidx == -1:
-            break
-        if sidx + 1 < len(text) and re.match('[0-9]', text[sidx - 1]) and \
-                re.match('[0-9]', text[sidx + 1]):
-            sidx += 1
-            continue
-        if text[sidx - 1] != ' ':
-            text = text[:sidx] + ' ' + text[sidx:]
-            sidx += 1
-        if sidx + len(token) < len(text) and text[sidx + len(token)] != ' ':
-            text = text[:sidx + 1] + ' ' + text[sidx + 1:]
-        sidx += 1
-    return text
-
-
-def get_label_embedding(labels, max_seq_length, tokenizer, device):
-    features = []
-    for label in labels:
-        label_tokens = ["[CLS]"] + tokenizer.tokenize(label) + ["[SEP]"]
-        label_token_ids = tokenizer.convert_tokens_to_ids(label_tokens)
-        label_len = len(label_token_ids)
-
-        label_padding = [0] * (max_seq_length - len(label_token_ids))
-        label_token_ids += label_padding
-        assert len(label_token_ids) == max_seq_length
-
-        features.append((label_token_ids, label_len))
-
-    all_label_token_ids = torch.tensor([f[0] for f in features], dtype=torch.long).to(device)
-    all_label_len = torch.tensor([f[1] for f in features], dtype=torch.long).to(device)
-
-    return all_label_token_ids, all_label_len
-
-
-def warmup_linear(x, warmup=0.002):
-    if x < warmup:
-        return x / warmup
-    return 1.0 - x
-
-
-def _truncate_seq_pair(tokens_a, tokens_b, max_length):
-    """Truncates a sequence pair in place to the maximum length."""
-
-    # This is a simple heuristic which will always truncate the longer sequence
-    # one token at a time. This makes more sense than truncating an equal percent
-    # of tokens from each, since if one sequence is very short then each token
-    # that's truncated likely contains more information than a longer sequence.
-    while True:
-        total_length = len(tokens_a) + len(tokens_b)
-        if total_length <= max_length:
-            break
-        if len(tokens_a) > len(tokens_b):
-            tokens_a.pop()
-        else:
-            tokens_b.pop()
-
-
-class InputExample(object):
-    """A single training/test example for simple sequence classification."""
-
-    def __init__(self, guid, text_a, text_b=None, label=None):
-        """Constructs a InputExample.
-
-        Args:
-            guid: Unique id for the example.
-            text_a: string. The untokenized text of the first sequence. For single
-            sequence tasks, only this sequence must be specified.
-            text_b: (Optional) string. The untokenized text of the second sequence.
-            Must only be specified for sequence pair tasks.
-            label: (Optional) string. The label of the example. This should be
-            specified for train and dev examples, but not for test examples.
-        """
-        self.guid = guid
-        self.text_a = text_a
-        self.text_b = text_b
-        self.label = label
-
-
-class InputFeatures(object):
-    """A single set of features of data."""
-
-    def __init__(self, input_ids, input_len, label_id):
-        self.input_ids = input_ids
-        self.input_len = input_len
-        self.label_id = label_id
-
-
-def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer, max_turn_length):
-    """Loads a data file into a list of `InputBatch`s."""
-
-    label_map = [{label: i for i, label in enumerate(labels)} for labels in label_list]
-    slot_dim = len(label_list)
-
-    features = []
-    prev_dialogue_idx = None
-    all_padding = [0] * max_seq_length
-    all_padding_len = [0, 0]
-
-    max_turn = 0
-    for (ex_index, example) in enumerate(examples):
-        if max_turn < int(example.guid.split('-')[2]):
-            max_turn = int(example.guid.split('-')[2])
-    max_turn_length = min(max_turn + 1, max_turn_length)
-    logger.info("max_turn_length = %d" % max_turn)
-
-    for (ex_index, example) in enumerate(examples):
-        tokens_a = [x if x != '#' else '[SEP]' for x in tokenizer.tokenize(example.text_a)]
-        tokens_b = None
-        if example.text_b:
-            tokens_b = [x if x != '#' else '[SEP]' for x in tokenizer.tokenize(example.text_b)]
-            # Modifies `tokens_a` and `tokens_b` in place so that the total
-            # length is less than the specified length.
-            # Account for [CLS], [SEP], [SEP] with "- 3"
-            _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
-        else:
-            # Account for [CLS] and [SEP] with "- 2"
-            if len(tokens_a) > max_seq_length - 2:
-                tokens_a = tokens_a[:(max_seq_length - 2)]
-
-        # The convention in BERT is:
-        # (a) For sequence pairs:
-        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
-        #  type_ids: 0   0  0    0    0     0       0 0    1  1  1  1   1 1
-        # (b) For single sequences:
-        #  tokens:   [CLS] the dog is hairy . [SEP]
-        #  type_ids: 0   0   0   0  0     0 0
-        #
-        # Where "type_ids" are used to indicate whether this is the first
-        # sequence or the second sequence. The embedding vectors for `type=0` and
-        # `type=1` were learned during pre-training and are added to the wordpiece
-        # embedding vector (and position vector). This is not *strictly* necessary
-        # since the [SEP] token unambiguously separates the sequences, but it makes
-        # it easier for the model to learn the concept of sequences.
-        #
-        # For classification tasks, the first vector (corresponding to [CLS]) is
-        # used as the "sentence vector". Note that this only makes sense because
-        # the entire model is fine-tuned.
-
-        tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
-        input_len = [len(tokens), 0]
-
-        if tokens_b:
-            tokens += tokens_b + ["[SEP]"]
-            input_len[1] = len(tokens_b) + 1
-
-        input_ids = tokenizer.convert_tokens_to_ids(tokens)
-
-        # Zero-pad up to the sequence length.
-        padding = [0] * (max_seq_length - len(input_ids))
-        input_ids += padding
-        assert len(input_ids) == max_seq_length
-
-        FLAG_TEST = False
-        if example.label is not None:
-            label_id = []
-            label_info = 'label: '
-            for i, label in enumerate(example.label):
-                if label == 'dontcare':
-                    label = 'do not care'
-                label_id.append(label_map[i][label])
-                label_info += '%s (id = %d) ' % (label, label_map[i][label])
-
-            if ex_index < 5:
-                logger.info("*** Example ***")
-                logger.info("guid: %s" % example.guid)
-                logger.info("tokens: %s" % " ".join(
-                    [str(x) for x in tokens]))
-                logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
-                logger.info("input_len: %s" % " ".join([str(x) for x in input_len]))
-                logger.info("label: " + label_info)
-        else:
-            FLAG_TEST = True
-            label_id = None
-
-        curr_dialogue_idx = example.guid.split('-')[1]
-        curr_turn_idx = int(example.guid.split('-')[2])
-
-        if prev_dialogue_idx is not None and prev_dialogue_idx != curr_dialogue_idx:
-            if prev_turn_idx < max_turn_length:
-                features += [InputFeatures(input_ids=all_padding,
-                                           input_len=all_padding_len,
-                                           label_id=[-1] * slot_dim)] \
-                            * (max_turn_length - prev_turn_idx - 1)
-            assert len(features) % max_turn_length == 0
-
-        if prev_dialogue_idx is None or prev_turn_idx < max_turn_length:
-            features.append(
-                InputFeatures(input_ids=input_ids,
-                              input_len=input_len,
-                              label_id=label_id))
-
-        prev_dialogue_idx = curr_dialogue_idx
-        prev_turn_idx = curr_turn_idx
-
-    if prev_turn_idx < max_turn_length:
-        features += [InputFeatures(input_ids=all_padding,
-                                   input_len=all_padding_len,
-                                   label_id=[-1] * slot_dim)] \
-                    * (max_turn_length - prev_turn_idx - 1)
-    assert len(features) % max_turn_length == 0
-
-    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
-    all_input_len = torch.tensor([f.input_len for f in features], dtype=torch.long)
-    if not FLAG_TEST:
-        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
-
-    # reshape tensors to [#batch, #max_turn_length, #max_seq_length]
-    all_input_ids = all_input_ids.view(-1, max_turn_length, max_seq_length)
-    all_input_len = all_input_len.view(-1, max_turn_length, 2)
-    if not FLAG_TEST:
-        all_label_ids = all_label_ids.view(-1, max_turn_length, slot_dim)
-    else:
-        all_label_ids = None
-
-    return all_input_ids, all_input_len, all_label_ids
-
-
-def eval_all_accs(pred_slot, labels, accuracies):
-
-    def _eval_acc(_pred_slot, _labels):
-        slot_dim = _labels.size(-1)
-        accuracy = (_pred_slot == _labels).view(-1, slot_dim)
-        num_turn = torch.sum(_labels[:, :, 0].view(-1) > -1, 0).float()
-        num_data = torch.sum(_labels > -1).float()
-        # joint accuracy
-        # joint_acc = sum(torch.sum(accuracy, 1) / slot_dim).float()
-        num_slots = accuracy.shape[1]
-        joint_acc = sum(torch.sum(accuracy, 1) == num_slots)
-        # slot accuracy
-        slot_acc = torch.sum(accuracy).float()
-        return joint_acc, slot_acc, num_turn, num_data
-
-    # 7 domains
-    joint_acc, slot_acc, num_turn, num_data = _eval_acc(pred_slot, labels)
-    accuracies['joint7'] += joint_acc
-    accuracies['slot7'] += slot_acc
-    accuracies['num_turn'] += num_turn
-    accuracies['num_slot7'] += num_data
-
-    # restaurant domain
-    joint_acc, slot_acc, num_turn, num_data = _eval_acc(pred_slot[:,:,18:25], labels[:,:,18:25])
-    accuracies['joint_rest'] += joint_acc
-    accuracies['slot_rest'] += slot_acc
-    accuracies['num_slot_rest'] += num_data
-
-    pred_slot5 = torch.cat((pred_slot[:,:,0:3], pred_slot[:,:,8:]), 2)
-    label_slot5 = torch.cat((labels[:,:,0:3], labels[:,:,8:]), 2)
-
-    # 5 domains (excluding bus and hotel domain)
-    joint_acc, slot_acc, num_turn, num_data = _eval_acc(pred_slot5, label_slot5)
-    accuracies['joint5'] += joint_acc
-    accuracies['slot5'] += slot_acc
-    accuracies['num_slot5'] += num_data
-
-    return accuracies
diff --git a/convlab/policy/ppo/semantic_level_config.json b/convlab/policy/ppo/semantic_level_config.json
index 04b0626a10bc8d48add16732df26a7cc00a35088..b9908c9cb7717515775221227f3fba19636d20dc 100644
--- a/convlab/policy/ppo/semantic_level_config.json
+++ b/convlab/policy/ppo/semantic_level_config.json
@@ -6,7 +6,7 @@
 		"batchsz": 1000,
 		"seed": 0,
 		"epoch": 10,
-		"eval_frequency": 5,
+		"eval_frequency": 1,
 		"process_num": 4,
 		"sys_semantic_to_usr": false,
 		"num_eval_dialogues": 500
diff --git a/convlab/policy/ppo/setsumbt_config.json b/convlab/policy/ppo/setsumbt_config.json
index 31a8ac6d275166e4163e416e0dbef6f742cddb7f..b6a02adbf371bfea63e3e156a2d9e47f13456c78 100644
--- a/convlab/policy/ppo/setsumbt_config.json
+++ b/convlab/policy/ppo/setsumbt_config.json
@@ -26,7 +26,7 @@
 		"setsumbt-mul": {
 			"class_path": "convlab.dst.setsumbt.SetSUMBTTracker",
 			"ini_params": {
-				"model_path": "/gpfs/project/niekerk/models/setsumbt_models/SetSUMBT+ActPrediction-multiwoz21-roberta-gru-cosine-distribution_distillation-Seed0-30-08-22-15-00"
+				"model_path": "https://huggingface.co/ConvLab/setsumbt-dst_nlu-multiwoz21-EnD2/resolve/main/SetSUMBT-nlu-multiwoz21-roberta-gru-cosine-distribution_distillation-Seed0.zip"
 			}
 		}
 	},
diff --git a/convlab/policy/ppo/setsumbt_unc_config.json b/convlab/policy/ppo/setsumbt_unc_config.json
index 6b7d115aafa53a2bfc5c58672e67086f04d5884d..fafdb3fc9bd8f7fe09e3759d58a591cf964fb93b 100644
--- a/convlab/policy/ppo/setsumbt_unc_config.json
+++ b/convlab/policy/ppo/setsumbt_unc_config.json
@@ -1,6 +1,6 @@
 {
 	"model": {
-		"load_path": "/gpfs/project/niekerk/src/ConvLab3/convlab/policy/mle/experiments/experiment_2022-11-10-10-37-30/save/supervised",
+		"load_path": "",
 		"pretrained_load_path": "",
 		"use_pretrained_initialisation": false,
 		"batchsz": 1000,
@@ -28,7 +28,7 @@
 		"setsumbt-mul": {
 			"class_path": "convlab.dst.setsumbt.SetSUMBTTracker",
 			"ini_params": {
-				"model_path": "/gpfs/project/niekerk/models/setsumbt_models/SetSUMBT+ActPrediction-multiwoz21-roberta-gru-cosine-distribution_distillation-Seed0-30-08-22-15-00",
+				"model_path": "https://huggingface.co/ConvLab/setsumbt-dst_nlu-multiwoz21-EnD2/resolve/main/SetSUMBT-nlu-multiwoz21-roberta-gru-cosine-distribution_distillation-Seed0.zip",
 				"return_confidence_scores": true,
 				"return_belief_state_mutual_info": true
 			}