diff --git a/convlab2/base_models/t5/create_data.py b/convlab2/base_models/t5/create_data.py
index cc9e651291bcaf2284750901c8e3c5f386b0c43f..c2f3da96523266936e4f15ad19682de62e5da3da 100644
--- a/convlab2/base_models/t5/create_data.py
+++ b/convlab2/base_models/t5/create_data.py
@@ -2,6 +2,7 @@ import os
 import json
 from tqdm import tqdm
 import re
+from transformers import AutoTokenizer
 from convlab2.util import load_dataset, load_nlu_data, load_dst_data, load_policy_data, load_nlg_data, load_e2e_data, load_rg_data
 from convlab2.base_models.t5.nlu.serialization import serialize_dialogue_acts, deserialize_dialogue_acts, equal_da_seq
 from convlab2.base_models.t5.dst.serialization import serialize_dialogue_state, deserialize_dialogue_state, equal_state_seq
@@ -23,6 +24,8 @@ def create_rg_data(dataset, data_dir, args):
         file_name = os.path.join(data_dir, f"{data_split}.json")
         with open(file_name, "w", encoding='utf-8') as f:
             f.writelines(data)
+        data_by_split[data_split] = data
+    return data_by_split
 
 def create_nlu_data(dataset, data_dir, args):
     data_by_split = load_nlu_data(dataset, speaker=args.speaker, use_context=args.context_window_size>0, context_window_size=args.context_window_size)
@@ -45,6 +48,8 @@ def create_nlu_data(dataset, data_dir, args):
         file_name = os.path.join(data_dir, f"{data_split}.json")
         with open(file_name, "w", encoding='utf-8') as f:
             f.writelines(data)
+        data_by_split[data_split] = data
+    return data_by_split
 
 def create_dst_data(dataset, data_dir, args):
     data_by_split = load_dst_data(dataset, speaker=args.speaker, use_context=args.context_window_size>0, context_window_size=args.context_window_size)
@@ -67,6 +72,8 @@ def create_dst_data(dataset, data_dir, args):
         file_name = os.path.join(data_dir, f"{data_split}.json")
         with open(file_name, "w", encoding='utf-8') as f:
             f.writelines(data)
+        data_by_split[data_split] = data
+    return data_by_split
 
 def create_nlg_data(dataset, data_dir, args):
     data_by_split = load_nlu_data(dataset, speaker=args.speaker, use_context=args.context_window_size>0, context_window_size=args.context_window_size)
@@ -89,6 +96,8 @@ def create_nlg_data(dataset, data_dir, args):
         file_name = os.path.join(data_dir, f"{data_split}.json")
         with open(file_name, "w", encoding='utf-8') as f:
             f.writelines(data)
+        data_by_split[data_split] = data
+    return data_by_split
 
 def create_goal2dialogue_data(dataset, data_dir, args):
     data_by_split = dataset
@@ -105,6 +114,20 @@ def create_goal2dialogue_data(dataset, data_dir, args):
         file_name = os.path.join(data_dir, f"{data_split}.json")
         with open(file_name, "w", encoding='utf-8') as f:
             f.writelines(data)
+        data_by_split[data_split] = data
+    return data_by_split
+
+def get_max_len(data_by_split, tokenizer):
+    for data_split in data_by_split.keys():
+        seq_len = {}
+        for line in data_by_split[data_split]:
+            item = json.loads(line.strip())
+            for column, seq in item.items():
+                seq_len.setdefault(column, [])
+                seq_len[column].append(len(tokenizer.tokenize(seq)))
+        print(f"data split: {data_split}")
+        for column, lens in seq_len.items():
+            print(f'\t{column}\tmax_len: {max(lens)}\tmean_len: {round(sum(lens)/len(lens),2)}')
 
 
 if __name__ == '__main__':
@@ -114,10 +137,15 @@ if __name__ == '__main__':
     parser.add_argument('--datasets', '-d', metavar='dataset_name', nargs='*', help='names of unified datasets')
     parser.add_argument('--speaker', '-s', type=str, choices=['user', 'system', 'all'], help='speaker(s)')
     parser.add_argument('--context_window_size', '-c', type=int, default=0, help='how many contextual utterances are considered')
+    parser.add_argument('--len_tokenizer', '-l', type=str, default=None, help='name or path of the tokenizer used to compute sequence lengths')
     args = parser.parse_args()
     print(args)
+    if args.len_tokenizer:
+        tokenizer = AutoTokenizer.from_pretrained(args.len_tokenizer)
     for dataset_name in tqdm(args.datasets, desc='datasets'):
         dataset = load_dataset(dataset_name)
         for task_name in tqdm(args.tasks, desc='tasks', leave=False):
             data_dir = os.path.join('data', task_name, dataset_name)
-            eval(f"create_{task_name}_data")(dataset, data_dir, args)
+            data_by_split = globals()[f"create_{task_name}_data"](dataset, data_dir, args)
+            if args.len_tokenizer:
+                get_max_len(data_by_split, tokenizer)
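The new `--len_tokenizer` option reports per-column token-length statistics for each split, which helps pick `max_source_length`/`max_target_length` in the training scripts. A minimal standalone sketch of the same computation (the file path and tokenizer name are illustrative):

```python
# Sketch: reproduce the --len_tokenizer statistics for one generated split file.
# Assumes a JSON-lines file as written by create_data.py; the path is illustrative.
import json
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("t5-small")
seq_len = {}
with open("data/nlu/multiwoz21/user/context_0/train.json", encoding="utf-8") as f:
    for line in f:
        item = json.loads(line.strip())
        for column, seq in item.items():
            seq_len.setdefault(column, []).append(len(tokenizer.tokenize(seq)))
for column, lens in seq_len.items():
    print(f"{column}\tmax_len: {max(lens)}\tmean_len: {round(sum(lens) / len(lens), 2)}")
```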
diff --git a/convlab2/base_models/t5/dst/serialization.py b/convlab2/base_models/t5/dst/serialization.py
index 96cc0beb105b5c9ae22aeffb02881a28e0a443ae..f452ae2d8ef3f63844b7df0c5034c145080771d1 100644
--- a/convlab2/base_models/t5/dst/serialization.py
+++ b/convlab2/base_models/t5/dst/serialization.py
@@ -5,26 +5,32 @@ def serialize_dialogue_state(state):
             if len(value) > 0:
                 state_dict.setdefault(f'[{domain}]', [])
                 state_dict[f'[{domain}]'].append(f'[{slot}][{value}]')
-    return ';'.join([domain+'{'+','.join(slot_values)+'}' for domain, slot_values in state_dict.items()])
+    return ';'.join([domain+'('+','.join(slot_values)+')' for domain, slot_values in state_dict.items()])
 
 def deserialize_dialogue_state(state_seq):
     state = {}
     if len(state_seq) == 0:
         return state
-    state_seqs = state_seq.split(']};[')  # will consume "]}" and "["
+    state_seqs = state_seq.split(']);[')  # will consume "])" and "["
     for i, state_seq in enumerate(state_seqs):
-        if len(state_seq) == 0 or len(state_seq.split(']{[')) != 2:
+        if len(state_seq) == 0 or len(state_seq.split(']([')) != 2:
             continue
         if i == 0:
             if state_seq[0] == '[':
                 state_seq = state_seq[1:]
         if i == len(state_seqs) - 1:
-            if state_seq[-2:] == ']}':
+            if state_seq[-2:] == '])':
                 state_seq = state_seq[:-2]
         
-        domain, slot_values = state_seq.split(']{[')
+        try:
+            domain, slot_values = state_seq.split(']([')
+        except ValueError:  # malformed domain segment in the generated sequence; skip it
+            continue
         for slot_value in slot_values.split('],['):
-            slot, value = slot_value.split('][')
+            try:
+                slot, value = slot_value.split('][')
+            except ValueError:  # malformed slot-value pair; skip it
+                continue
             state.setdefault(domain, {})
             state[domain][slot] = value
     return state
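The state serialization swaps the `{...}` delimiters for `(...)`, presumably because curly braces are absent from T5's SentencePiece vocabulary and so could never be generated verbatim. A round-trip sketch of the new format, assuming the unified `{domain: {slot: value}}` state layout and that `convlab2` is on the Python path:

```python
# Round-trip sketch for the new "(...)"-delimited state serialization.
from convlab2.base_models.t5.dst.serialization import (
    serialize_dialogue_state,
    deserialize_dialogue_state,
)

state = {"taxi": {"leave at": "17:15", "destination": "cambridge"}}
seq = serialize_dialogue_state(state)
print(seq)  # [taxi]([leave at][17:15],[destination][cambridge])
assert deserialize_dialogue_state(seq) == state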
diff --git a/convlab2/base_models/t5/nlg/nlg_metric.py b/convlab2/base_models/t5/nlg/nlg_metric.py
new file mode 100644
index 0000000000000000000000000000000000000000..0c0155fff3fcd021a2d07c25528e75c4fd912d83
--- /dev/null
+++ b/convlab2/base_models/t5/nlg/nlg_metric.py
@@ -0,0 +1,79 @@
+# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""NLG Metric"""
+
+import datasets
+import sacrebleu
+
+_CITATION = """\
+@inproceedings{post-2018-call,
+    title = "A Call for Clarity in Reporting {BLEU} Scores",
+    author = "Post, Matt",
+    booktitle = "Proceedings of the Third Conference on Machine Translation: Research Papers",
+    month = oct,
+    year = "2018",
+    address = "Belgium, Brussels",
+    publisher = "Association for Computational Linguistics",
+    url = "https://www.aclweb.org/anthology/W18-6319",
+    pages = "186--191",
+}
+"""
+
+_DESCRIPTION = """\
+Metric to evaluate text-to-text models on the natural language generation task.
+"""
+
+_KWARGS_DESCRIPTION = """
+Calculates corpus-level BLEU-4 using sacreBLEU
+Args:
+    predictions: list of predictions to score. Each prediction
+        should be a string.
+    references: list of references, one per prediction. Each
+        reference should be a string.
+Returns:
+    bleu: corpus-bleu score
+Examples:
+
+    >>> nlg_metric = datasets.load_metric("nlg_metric.py")
+    >>> predictions = ["hello there general kenobi", "foo bar foobar"]
+    >>> references = ["hello there kenobi", "foo bar foobar"]
+    >>> results = nlg_metric.compute(predictions=predictions, references=references)
+    >>> print(results)
+    {'bleu': 35.35533905932737}
+"""
+
+
+@datasets.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
+class NLGMetrics(datasets.Metric):
+    """Metric to evaluate text-to-text models on the natural language generation task."""
+
+    def _info(self):
+        return datasets.MetricInfo(
+            description=_DESCRIPTION,
+            citation=_CITATION,
+            inputs_description=_KWARGS_DESCRIPTION,
+            # This defines the format of each prediction and reference
+            features=datasets.Features({
+                'predictions': datasets.Value('string'),
+                'references': datasets.Value('string'),
+            })
+        )
+
+    def _compute(self, predictions, references):
+        """Returns the scores: bleu"""
+        bleu = sacrebleu.corpus_bleu(predictions, [references], lowercase=True).score
+        
+        return {
+            "bleu": bleu
+        }
diff --git a/convlab2/base_models/t5/nlu/nlu_metric.py b/convlab2/base_models/t5/nlu/nlu_metric.py
index a2f3c0bc4f9a50464eebc67d220964c39a45f2be..d337887201e683eebb5bfa932714e2ee4e2cfae2 100644
--- a/convlab2/base_models/t5/nlu/nlu_metric.py
+++ b/convlab2/base_models/t5/nlu/nlu_metric.py
@@ -36,21 +36,15 @@ Returns:
     seq_em: sequence exact match
     accuracy: dialog acts accuracy
     overall_f1: dialog acts overall f1
-    binary_f1: binary dialog acts f1
-    categorical_f1: categorical dialog acts f1
-    non-categorical_f1: non-categorical dialog acts f1
 Examples:
 
     >>> nlu_metric = datasets.load_metric("nlu_metric.py")
-    >>> predictions = ["[binary][thank][general][]", "[non-categorical][inform][taxi][leave at][17:15]"]
-    >>> references = ["[binary][thank][general][]", "[non-categorical][inform][train][leave at][17:15]"]
+    >>> predictions = ["[thank][general]([][])", "[inform][taxi]([leave at][17:15])"]
+    >>> references = ["[thank][general]([][])", "[inform][train]([leave at][17:15])"]
     >>> results = nlu_metric.compute(predictions=predictions, references=references)
     >>> print(results)
     {'seq_em': 0.5, 'accuracy': 0.5, 
-    'overall_f1': 0.5, 'overall_precision': 0.5, 'overall_recall': 0.5, 
-    'binary_f1': 1.0, 'binary_precision': 1.0, 'binary_recall': 1.0, 
-    'categorical_f1': 0.0, 'categorical_precision': 0.0, 'categorical_recall': 0.0, 
-    'non-categorical_f1': 0.0, 'non-categorical_precision': 0.0, 'non-categorical_recall': 0.0}
+    'overall_f1': 0.5, 'overall_precision': 0.5, 'overall_recall': 0.5}
 """
 
 
@@ -74,45 +68,33 @@ class NLUMetrics(datasets.Metric):
         """Returns the scores: sequence exact match, dialog acts accuracy and f1"""
         seq_em = []
         acc = []
-        f1_metrics = {x: {'TP':0, 'FP':0, 'FN':0} for x in ['overall', 'binary', 'categorical', 'non-categorical']}
+        f1_metrics = {'TP':0, 'FP':0, 'FN':0}
 
         for prediction, reference in zip(predictions, references):
             seq_em.append(prediction.strip()==reference.strip())
             pred_da = deserialize_dialogue_acts(prediction)
             gold_da = deserialize_dialogue_acts(reference)
-            flag = True
-            for da_type in ['binary', 'categorical', 'non-categorical']:
-                if da_type == 'binary':
-                    predicts = sorted(list({(x['intent'], x['domain'], x['slot']) for x in pred_da[da_type]}))
-                    labels = sorted(list({(x['intent'], x['domain'], x['slot']) for x in gold_da[da_type]}))
+            pred_da = sorted(list({(da['intent'], da['domain'], da['slot'], ''.join(da.get('value', '').split()).lower()) for da in pred_da}))
+            gold_da = sorted(list({(da['intent'], da['domain'], da['slot'], ''.join(da.get('value', '').split()).lower()) for da in gold_da}))
+            acc.append(pred_da==gold_da)
+            for ele in pred_da:
+                if ele in gold_da:
+                    f1_metrics['TP'] += 1
                 else:
-                    predicts = sorted(list({(x['intent'], x['domain'], x['slot'], ''.join(x['value'].split()).lower()) for x in pred_da[da_type]}))
-                    labels = sorted(list({(x['intent'], x['domain'], x['slot'], ''.join(x['value'].split()).lower()) for x in gold_da[da_type]}))
-                for ele in predicts:
-                    if ele in labels:
-                        f1_metrics['overall']['TP'] += 1
-                        f1_metrics[da_type]['TP'] += 1
-                    else:
-                        f1_metrics['overall']['FP'] += 1
-                        f1_metrics[da_type]['FP'] += 1
-                for ele in labels:
-                    if ele not in predicts:
-                        f1_metrics['overall']['FN'] += 1
-                        f1_metrics[da_type]['FN'] += 1
-                flag &= (predicts==labels)
-            acc.append(flag)
+                    f1_metrics['FP'] += 1
+            for ele in gold_da:
+                if ele not in pred_da:
+                    f1_metrics['FN'] += 1
 
-        for metric in list(f1_metrics.keys()):
-            TP = f1_metrics[metric].pop('TP')
-            FP = f1_metrics[metric].pop('FP')
-            FN = f1_metrics[metric].pop('FN')
-            precision = 1.0 * TP / (TP + FP) if TP + FP else 0.
-            recall = 1.0 * TP / (TP + FN) if TP + FN else 0.
-            f1 = 2.0 * precision * recall / (precision + recall) if precision + recall else 0.
-            f1_metrics.pop(metric)
-            f1_metrics[f'{metric}_f1'] = f1
-            f1_metrics[f'{metric}_precision'] = precision
-            f1_metrics[f'{metric}_recall'] = recall
+        TP = f1_metrics.pop('TP')
+        FP = f1_metrics.pop('FP')
+        FN = f1_metrics.pop('FN')
+        precision = 1.0 * TP / (TP + FP) if TP + FP else 0.
+        recall = 1.0 * TP / (TP + FN) if TP + FN else 0.
+        f1 = 2.0 * precision * recall / (precision + recall) if precision + recall else 0.
+        f1_metrics['overall_f1'] = f1
+        f1_metrics['overall_precision'] = precision
+        f1_metrics['overall_recall'] = recall
 
         return {
             "seq_em": sum(seq_em)/len(seq_em),
diff --git a/convlab2/base_models/t5/nlu/run_multiwoz21_user.sh b/convlab2/base_models/t5/nlu/run_multiwoz21_user.sh
deleted file mode 100644
index a9e9d6c55b38bfa1b43b0a837e4eef9d60a7e233..0000000000000000000000000000000000000000
--- a/convlab2/base_models/t5/nlu/run_multiwoz21_user.sh
+++ /dev/null
@@ -1,68 +0,0 @@
-n_gpus=1
-task_name="nlu"
-dataset_name="multiwoz21"
-speaker="user"
-context_window_size=0
-data_dir="data/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
-output_dir="output/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
-cache_dir="../cache"
-logging_dir="${output_dir}/runs"
-train_file="${data_dir}/train.json"
-validation_file="${data_dir}/validation.json"
-test_file="${data_dir}/test.json"
-metric_name_or_path="nlu_metric.py"
-metric_for_best_model="overall_f1"
-source_column="context"
-target_column="dialogue_acts_seq"
-model_name_or_path="t5-small"
-per_device_train_batch_size=128
-per_device_eval_batch_size=64
-gradient_accumulation_steps=2
-lr=1e-3
-num_train_epochs=10
-
-python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --speaker ${speaker} --context_window_size ${context_window_size}
-
-python -m torch.distributed.launch \
-    --nproc_per_node ${n_gpus} ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --train_file ${train_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --model_name_or_path ${model_name_or_path} \
-    --do_train \
-    --save_strategy epoch \
-    --prediction_loss_only \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_train_batch_size ${per_device_train_batch_size} \
-    --gradient_accumulation_steps ${gradient_accumulation_steps} \
-    --learning_rate ${lr} \
-    --num_train_epochs ${num_train_epochs} \
-    --debug underflow_overflow \
-    --adafactor \
-    --gradient_checkpointing
-
-python -m torch.distributed.launch \
-    --nproc_per_node ${n_gpus} ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --test_file ${test_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --model_name_or_path ${output_dir} \
-    --do_predict \
-    --predict_with_generate \
-    --metric_name_or_path ${metric_name_or_path} \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
-
-python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
-
-python ../../../nlu/evaluate_unified_datasets.py -p ${output_dir}/predictions.json
diff --git a/convlab2/base_models/t5/nlu/run_multiwoz21_user_context3.sh b/convlab2/base_models/t5/nlu/run_multiwoz21_user_context3.sh
deleted file mode 100644
index e90e71d459da3bd43eeccc82d7ad192d7f751996..0000000000000000000000000000000000000000
--- a/convlab2/base_models/t5/nlu/run_multiwoz21_user_context3.sh
+++ /dev/null
@@ -1,68 +0,0 @@
-n_gpus=1
-task_name="nlu"
-dataset_name="multiwoz21"
-speaker="user"
-context_window_size=3
-data_dir="data/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
-output_dir="output/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
-cache_dir="../cache"
-logging_dir="${output_dir}/runs"
-train_file="${data_dir}/train.json"
-validation_file="${data_dir}/validation.json"
-test_file="${data_dir}/test.json"
-metric_name_or_path="nlu_metric.py"
-metric_for_best_model="overall_f1"
-source_column="context"
-target_column="dialogue_acts_seq"
-model_name_or_path="t5-small"
-per_device_train_batch_size=128
-per_device_eval_batch_size=64
-gradient_accumulation_steps=2
-lr=1e-3
-num_train_epochs=10
-
-python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --speaker ${speaker} --context_window_size ${context_window_size}
-
-python -m torch.distributed.launch \
-    --nproc_per_node ${n_gpus} ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --train_file ${train_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --model_name_or_path ${model_name_or_path} \
-    --do_train \
-    --save_strategy epoch \
-    --prediction_loss_only \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_train_batch_size ${per_device_train_batch_size} \
-    --gradient_accumulation_steps ${gradient_accumulation_steps} \
-    --learning_rate ${lr} \
-    --num_train_epochs ${num_train_epochs} \
-    --debug underflow_overflow \
-    --adafactor \
-    --gradient_checkpointing
-
-python -m torch.distributed.launch \
-    --nproc_per_node ${n_gpus} ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --test_file ${test_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --model_name_or_path ${output_dir} \
-    --do_predict \
-    --predict_with_generate \
-    --metric_name_or_path ${metric_name_or_path} \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
-
-python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
-
-python ../../../nlu/evaluate_unified_datasets.py -p ${output_dir}/predictions.json
diff --git a/convlab2/base_models/t5/dst/run_multiwoz21.sh b/convlab2/base_models/t5/nlu/run_nlu.sh
similarity index 76%
rename from convlab2/base_models/t5/dst/run_multiwoz21.sh
rename to convlab2/base_models/t5/nlu/run_nlu.sh
index 1c383cbb24130db89c4304912f74a8323dadd343..71b5f5f8e488b2c44e76637cc0152b8ff9db8671 100644
--- a/convlab2/base_models/t5/dst/run_multiwoz21.sh
+++ b/convlab2/base_models/t5/nlu/run_nlu.sh
@@ -1,8 +1,8 @@
-n_gpus=4
-task_name="dst"
-dataset_name="multiwoz21"
+n_gpus=1
+task_name="nlu"
+dataset_name=$1
 speaker="user"
-context_window_size=100
+context_window_size=$2
 data_dir="data/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
 output_dir="output/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
 cache_dir="../cache"
@@ -10,28 +10,26 @@ logging_dir="${output_dir}/runs"
 train_file="${data_dir}/train.json"
 validation_file="${data_dir}/validation.json"
 test_file="${data_dir}/test.json"
-metric_name_or_path="dst_metric.py"
-metric_for_best_model="accuracy"
+metric_name_or_path="nlu_metric.py"
+metric_for_best_model="overall_f1"
 source_column="context"
-target_column="state_seq"
+target_column="dialogue_acts_seq"
 truncation_side="left"
 max_source_length=512
 max_target_length=512
 model_name_or_path="t5-small"
-per_device_train_batch_size=64
+per_device_train_batch_size=128
 per_device_eval_batch_size=64
 gradient_accumulation_steps=2
 lr=1e-3
 num_train_epochs=10
 
-python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --speaker ${speaker} --context_window_size ${context_window_size}
+python ../create_data.py -t ${task_name} -d ${dataset_name} -s ${speaker} -c ${context_window_size}
 
-python -m torch.distributed.launch \
-    --nproc_per_node ${n_gpus} ../run_seq2seq.py \
+python ../run_seq2seq.py \
     --task_name ${task_name} \
     --train_file ${train_file} \
     --validation_file ${validation_file} \
-    --test_file ${test_file} \
     --source_column ${source_column} \
     --target_column ${target_column} \
     --max_source_length ${max_source_length} \
@@ -40,7 +38,6 @@ python -m torch.distributed.launch \
     --model_name_or_path ${model_name_or_path} \
     --do_train \
     --do_eval \
-    --do_predict \
     --save_strategy epoch \
     --evaluation_strategy epoch \
     --prediction_loss_only \
@@ -58,8 +55,7 @@ python -m torch.distributed.launch \
     --adafactor \
     --gradient_checkpointing
 
-python -m torch.distributed.launch \
-    --nproc_per_node ${n_gpus} ../run_seq2seq.py \
+python ../run_seq2seq.py \
     --task_name ${task_name} \
     --test_file ${test_file} \
     --source_column ${source_column} \
@@ -76,8 +72,8 @@ python -m torch.distributed.launch \
     --logging_dir ${logging_dir} \
     --overwrite_output_dir \
     --preprocessing_num_workers 4 \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
+    --per_device_eval_batch_size ${per_device_eval_batch_size}
 
 python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
 
-python ../../../dst/evaluate_unified_datasets.py -p ${output_dir}/predictions.json
+python ../../../nlu/evaluate_unified_datasets.py -p ${output_dir}/predictions.json
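The per-dataset NLU scripts deleted in this patch are all subsumed by the parameterized `run_nlu.sh` above: the dataset name and context window size become positional arguments (e.g. `bash run_nlu.sh multiwoz21 0`), single-process `python` invocations replace `torch.distributed.launch`, and `--do_predict`/`--test_file` are dropped from the training run, leaving test-set prediction entirely to the second invocation.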
diff --git a/convlab2/base_models/t5/nlu/run_tm1_user.sh b/convlab2/base_models/t5/nlu/run_tm1_user.sh
deleted file mode 100644
index 5372441aa284067f137e892454b8fed76e251e6e..0000000000000000000000000000000000000000
--- a/convlab2/base_models/t5/nlu/run_tm1_user.sh
+++ /dev/null
@@ -1,68 +0,0 @@
-n_gpus=1
-task_name="nlu"
-dataset_name="tm1"
-speaker="user"
-context_window_size=0
-data_dir="data/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
-output_dir="output/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
-cache_dir="../cache"
-logging_dir="${output_dir}/runs"
-train_file="${data_dir}/train.json"
-validation_file="${data_dir}/validation.json"
-test_file="${data_dir}/test.json"
-metric_name_or_path="nlu_metric.py"
-metric_for_best_model="overall_f1"
-source_column="context"
-target_column="dialogue_acts_seq"
-model_name_or_path="t5-small"
-per_device_train_batch_size=128
-per_device_eval_batch_size=64
-gradient_accumulation_steps=2
-lr=1e-3
-num_train_epochs=10
-
-python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --speaker ${speaker} --context_window_size ${context_window_size}
-
-python -m torch.distributed.launch \
-    --nproc_per_node ${n_gpus} ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --train_file ${train_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --model_name_or_path ${model_name_or_path} \
-    --do_train \
-    --save_strategy epoch \
-    --prediction_loss_only \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_train_batch_size ${per_device_train_batch_size} \
-    --gradient_accumulation_steps ${gradient_accumulation_steps} \
-    --learning_rate ${lr} \
-    --num_train_epochs ${num_train_epochs} \
-    --debug underflow_overflow \
-    --adafactor \
-    --gradient_checkpointing
-
-python -m torch.distributed.launch \
-    --nproc_per_node ${n_gpus} ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --test_file ${test_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --model_name_or_path ${output_dir} \
-    --do_predict \
-    --predict_with_generate \
-    --metric_name_or_path ${metric_name_or_path} \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
-
-python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
-
-python ../../../nlu/evaluate_unified_datasets.py -p ${output_dir}/predictions.json
diff --git a/convlab2/base_models/t5/nlu/run_tm1_user_context3.sh b/convlab2/base_models/t5/nlu/run_tm1_user_context3.sh
deleted file mode 100644
index 65482a1f517b7c1eb8607e0858d3ae576d1483b4..0000000000000000000000000000000000000000
--- a/convlab2/base_models/t5/nlu/run_tm1_user_context3.sh
+++ /dev/null
@@ -1,68 +0,0 @@
-n_gpus=1
-task_name="nlu"
-dataset_name="tm1"
-speaker="user"
-context_window_size=3
-data_dir="data/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
-output_dir="output/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
-cache_dir="../cache"
-logging_dir="${output_dir}/runs"
-train_file="${data_dir}/train.json"
-validation_file="${data_dir}/validation.json"
-test_file="${data_dir}/test.json"
-metric_name_or_path="nlu_metric.py"
-metric_for_best_model="overall_f1"
-source_column="context"
-target_column="dialogue_acts_seq"
-model_name_or_path="t5-small"
-per_device_train_batch_size=64
-per_device_eval_batch_size=64
-gradient_accumulation_steps=4
-lr=1e-3
-num_train_epochs=10
-
-python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --speaker ${speaker} --context_window_size ${context_window_size}
-
-python -m torch.distributed.launch \
-    --nproc_per_node ${n_gpus} ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --train_file ${train_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --model_name_or_path ${model_name_or_path} \
-    --do_train \
-    --save_strategy epoch \
-    --prediction_loss_only \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_train_batch_size ${per_device_train_batch_size} \
-    --gradient_accumulation_steps ${gradient_accumulation_steps} \
-    --learning_rate ${lr} \
-    --num_train_epochs ${num_train_epochs} \
-    --debug underflow_overflow \
-    --adafactor \
-    --gradient_checkpointing
-
-python -m torch.distributed.launch \
-    --nproc_per_node ${n_gpus} ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --test_file ${test_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --model_name_or_path ${output_dir} \
-    --do_predict \
-    --predict_with_generate \
-    --metric_name_or_path ${metric_name_or_path} \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
-
-python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
-
-python ../../../nlu/evaluate_unified_datasets.py -p ${output_dir}/predictions.json
diff --git a/convlab2/base_models/t5/nlu/run_tm2_user.sh b/convlab2/base_models/t5/nlu/run_tm2_user.sh
deleted file mode 100644
index 84dc8b71ae560dcb481dee8bcfc31340ed4a778d..0000000000000000000000000000000000000000
--- a/convlab2/base_models/t5/nlu/run_tm2_user.sh
+++ /dev/null
@@ -1,68 +0,0 @@
-n_gpus=1
-task_name="nlu"
-dataset_name="tm2"
-speaker="user"
-context_window_size=0
-data_dir="data/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
-output_dir="output/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
-cache_dir="../cache"
-logging_dir="${output_dir}/runs"
-train_file="${data_dir}/train.json"
-validation_file="${data_dir}/validation.json"
-test_file="${data_dir}/test.json"
-metric_name_or_path="nlu_metric.py"
-metric_for_best_model="overall_f1"
-source_column="context"
-target_column="dialogue_acts_seq"
-model_name_or_path="t5-small"
-per_device_train_batch_size=128
-per_device_eval_batch_size=64
-gradient_accumulation_steps=2
-lr=1e-3
-num_train_epochs=10
-
-python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --speaker ${speaker} --context_window_size ${context_window_size}
-
-python -m torch.distributed.launch \
-    --nproc_per_node ${n_gpus} ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --train_file ${train_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --model_name_or_path ${model_name_or_path} \
-    --do_train \
-    --save_strategy epoch \
-    --prediction_loss_only \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_train_batch_size ${per_device_train_batch_size} \
-    --gradient_accumulation_steps ${gradient_accumulation_steps} \
-    --learning_rate ${lr} \
-    --num_train_epochs ${num_train_epochs} \
-    --debug underflow_overflow \
-    --adafactor \
-    --gradient_checkpointing
-
-python -m torch.distributed.launch \
-    --nproc_per_node ${n_gpus} ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --test_file ${test_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --model_name_or_path ${output_dir} \
-    --do_predict \
-    --predict_with_generate \
-    --metric_name_or_path ${metric_name_or_path} \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
-
-python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
-
-python ../../../nlu/evaluate_unified_datasets.py -p ${output_dir}/predictions.json
diff --git a/convlab2/base_models/t5/nlu/run_tm2_user_context3.sh b/convlab2/base_models/t5/nlu/run_tm2_user_context3.sh
deleted file mode 100644
index abca0a60a98c1a71295ae1ad77791dec6b482547..0000000000000000000000000000000000000000
--- a/convlab2/base_models/t5/nlu/run_tm2_user_context3.sh
+++ /dev/null
@@ -1,68 +0,0 @@
-n_gpus=1
-task_name="nlu"
-dataset_name="tm2"
-speaker="user"
-context_window_size=3
-data_dir="data/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
-output_dir="output/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
-cache_dir="../cache"
-logging_dir="${output_dir}/runs"
-train_file="${data_dir}/train.json"
-validation_file="${data_dir}/validation.json"
-test_file="${data_dir}/test.json"
-metric_name_or_path="nlu_metric.py"
-metric_for_best_model="overall_f1"
-source_column="context"
-target_column="dialogue_acts_seq"
-model_name_or_path="t5-small"
-per_device_train_batch_size=128
-per_device_eval_batch_size=64
-gradient_accumulation_steps=2
-lr=1e-3
-num_train_epochs=10
-
-python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --speaker ${speaker} --context_window_size ${context_window_size}
-
-python -m torch.distributed.launch \
-    --nproc_per_node ${n_gpus} ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --train_file ${train_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --model_name_or_path ${model_name_or_path} \
-    --do_train \
-    --save_strategy epoch \
-    --prediction_loss_only \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_train_batch_size ${per_device_train_batch_size} \
-    --gradient_accumulation_steps ${gradient_accumulation_steps} \
-    --learning_rate ${lr} \
-    --num_train_epochs ${num_train_epochs} \
-    --debug underflow_overflow \
-    --adafactor \
-    --gradient_checkpointing
-
-python -m torch.distributed.launch \
-    --nproc_per_node ${n_gpus} ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --test_file ${test_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --model_name_or_path ${output_dir} \
-    --do_predict \
-    --predict_with_generate \
-    --metric_name_or_path ${metric_name_or_path} \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
-
-python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
-
-python ../../../nlu/evaluate_unified_datasets.py -p ${output_dir}/predictions.json
diff --git a/convlab2/base_models/t5/nlu/run_tm3_user.sh b/convlab2/base_models/t5/nlu/run_tm3_user.sh
deleted file mode 100644
index 689a626c9a871581b49eb84a7db1e9af1152e32e..0000000000000000000000000000000000000000
--- a/convlab2/base_models/t5/nlu/run_tm3_user.sh
+++ /dev/null
@@ -1,68 +0,0 @@
-n_gpus=1
-task_name="nlu"
-dataset_name="tm3"
-speaker="user"
-context_window_size=0
-data_dir="data/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
-output_dir="output/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
-cache_dir="../cache"
-logging_dir="${output_dir}/runs"
-train_file="${data_dir}/train.json"
-validation_file="${data_dir}/validation.json"
-test_file="${data_dir}/test.json"
-metric_name_or_path="nlu_metric.py"
-metric_for_best_model="overall_f1"
-source_column="context"
-target_column="dialogue_acts_seq"
-model_name_or_path="t5-small"
-per_device_train_batch_size=128
-per_device_eval_batch_size=64
-gradient_accumulation_steps=2
-lr=1e-3
-num_train_epochs=10
-
-python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --speaker ${speaker} --context_window_size ${context_window_size}
-
-python -m torch.distributed.launch \
-    --nproc_per_node ${n_gpus} ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --train_file ${train_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --model_name_or_path ${model_name_or_path} \
-    --do_train \
-    --save_strategy epoch \
-    --prediction_loss_only \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_train_batch_size ${per_device_train_batch_size} \
-    --gradient_accumulation_steps ${gradient_accumulation_steps} \
-    --learning_rate ${lr} \
-    --num_train_epochs ${num_train_epochs} \
-    --debug underflow_overflow \
-    --adafactor \
-    --gradient_checkpointing
-
-python -m torch.distributed.launch \
-    --nproc_per_node ${n_gpus} ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --test_file ${test_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --model_name_or_path ${output_dir} \
-    --do_predict \
-    --predict_with_generate \
-    --metric_name_or_path ${metric_name_or_path} \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
-
-python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
-
-python ../../../nlu/evaluate_unified_datasets.py -p ${output_dir}/predictions.json
diff --git a/convlab2/base_models/t5/nlu/run_tm3_user_context3.sh b/convlab2/base_models/t5/nlu/run_tm3_user_context3.sh
deleted file mode 100644
index e2ded66dbe940387f1997ba92028ef4dc4a5b5c5..0000000000000000000000000000000000000000
--- a/convlab2/base_models/t5/nlu/run_tm3_user_context3.sh
+++ /dev/null
@@ -1,68 +0,0 @@
-n_gpus=1
-task_name="nlu"
-dataset_name="tm3"
-speaker="user"
-context_window_size=3
-data_dir="data/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
-output_dir="output/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
-cache_dir="../cache"
-logging_dir="${output_dir}/runs"
-train_file="${data_dir}/train.json"
-validation_file="${data_dir}/validation.json"
-test_file="${data_dir}/test.json"
-metric_name_or_path="nlu_metric.py"
-metric_for_best_model="overall_f1"
-source_column="context"
-target_column="dialogue_acts_seq"
-model_name_or_path="t5-small"
-per_device_train_batch_size=128
-per_device_eval_batch_size=64
-gradient_accumulation_steps=2
-lr=1e-3
-num_train_epochs=10
-
-python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --speaker ${speaker} --context_window_size ${context_window_size}
-
-python -m torch.distributed.launch \
-    --nproc_per_node ${n_gpus} ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --train_file ${train_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --model_name_or_path ${model_name_or_path} \
-    --do_train \
-    --save_strategy epoch \
-    --prediction_loss_only \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_train_batch_size ${per_device_train_batch_size} \
-    --gradient_accumulation_steps ${gradient_accumulation_steps} \
-    --learning_rate ${lr} \
-    --num_train_epochs ${num_train_epochs} \
-    --debug underflow_overflow \
-    --adafactor \
-    --gradient_checkpointing
-
-python -m torch.distributed.launch \
-    --nproc_per_node ${n_gpus} ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --test_file ${test_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --model_name_or_path ${output_dir} \
-    --do_predict \
-    --predict_with_generate \
-    --metric_name_or_path ${metric_name_or_path} \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
-
-python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
-
-python ../../../nlu/evaluate_unified_datasets.py -p ${output_dir}/predictions.json
diff --git a/convlab2/base_models/t5/nlu/serialization.py b/convlab2/base_models/t5/nlu/serialization.py
index 7c9a764fb9e4d04ec4036722448b4a1a00636a35..bb1a8b88b81ca23e97271c9bae211fbfd775149b 100644
--- a/convlab2/base_models/t5/nlu/serialization.py
+++ b/convlab2/base_models/t5/nlu/serialization.py
@@ -6,28 +6,35 @@ def serialize_dialogue_acts(dialogue_acts):
             intent_domain = f'[{intent}][{domain}]'
             da_dict.setdefault(intent_domain, [])
             da_dict[intent_domain].append(f'[{slot}][{value}]')
-    return ';'.join([intent_domain+'{'+','.join(slot_values)+'}' for intent_domain, slot_values in da_dict.items()])
+    return ';'.join([intent_domain+'('+','.join(slot_values)+')' for intent_domain, slot_values in da_dict.items()])
 
 def deserialize_dialogue_acts(das_seq):
     dialogue_acts = []
     if len(das_seq) == 0:
         return dialogue_acts
-    da_seqs = das_seq.split(']};[')  # will consume "]}" and "["
+    da_seqs = das_seq.split(']);[')  # will consume "])" and "["
     for i, da_seq in enumerate(da_seqs):
-        if len(da_seq) == 0 or len(da_seq.split(']{[')) != 2:
+        if len(da_seq) == 0 or len(da_seq.split(']([')) != 2:
             continue
         if i == 0:
             if da_seq[0] == '[':
                 da_seq = da_seq[1:]
         if i == len(da_seqs) - 1:
-            if da_seq[-2:] == ']}':
+            if da_seq[-2:] == '])':
                 da_seq = da_seq[:-2]
         
-        intent_domain, slot_values = da_seq.split(']{[')
-        intent, domain = intent_domain.split('][')
+        try:
+            intent_domain, slot_values = da_seq.split(']([')
+            intent, domain = intent_domain.split('][')
+        except ValueError:  # malformed intent-domain header; skip this group
+            continue
         for slot_value in slot_values.split('],['):
-            slot, value = slot_value.split('][')
+            try:
+                slot, value = slot_value.split('][')
+            except ValueError:  # malformed slot-value pair; skip it
+                continue
             dialogue_acts.append({'intent': intent, 'domain': domain, 'slot': slot, 'value': value})
+
     return dialogue_acts
 
 def equal_da_seq(dialogue_acts, das_seq):
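The added try/except guards make act deserialization tolerant of malformed generations: unparseable fragments are skipped rather than crashing the metric. For example, assuming `convlab2` is importable:

```python
# Sketch: deserialization skips fragments it cannot parse.
from convlab2.base_models.t5.nlu.serialization import deserialize_dialogue_acts

good = "[inform][taxi]([leave at][17:15])"
print(deserialize_dialogue_acts(good))
# -> [{'intent': 'inform', 'domain': 'taxi', 'slot': 'leave at', 'value': '17:15'}]

truncated = "[inform][taxi]([leave at"  # generation cut off mid-sequence
print(deserialize_dialogue_acts(truncated))
# -> [] instead of an unhandled ValueError
```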
diff --git a/convlab2/nlu/evaluate_unified_datasets.py b/convlab2/nlu/evaluate_unified_datasets.py
index 907b1afaaee6788c1e90e3bd85b67b3360c9c2da..2eee44d7a02b405d526ce9389993b9cf0783d48a 100644
--- a/convlab2/nlu/evaluate_unified_datasets.py
+++ b/convlab2/nlu/evaluate_unified_datasets.py
@@ -10,28 +10,43 @@ def evaluate(predict_result):
 
     for sample in predict_result:
         flag = True
-        for da_type in ['binary', 'categorical', 'non-categorical']:
-            if da_type == 'binary':
-                predicts = [(x['intent'], x['domain'], x['slot']) for x in sample['predictions']['dialogue_acts'][da_type]]
-                labels = [(x['intent'], x['domain'], x['slot']) for x in sample['dialogue_acts'][da_type]]
-            else:
-                predicts = [(x['intent'], x['domain'], x['slot'], ''.join(x['value'].split()).lower()) for x in sample['predictions']['dialogue_acts'][da_type]]
-                labels = [(x['intent'], x['domain'], x['slot'], ''.join(x['value'].split()).lower()) for x in sample['dialogue_acts'][da_type]]
-            predicts = sorted(list(set(predicts)))
-            labels = sorted(list(set(labels)))
-            for ele in predicts:
-                if ele in labels:
+        if isinstance(sample['predictions']['dialogue_acts'], dict):
+            for da_type in ['binary', 'categorical', 'non-categorical']:
+                if da_type == 'binary':
+                    predicts = [(x['intent'], x['domain'], x['slot']) for x in sample['predictions']['dialogue_acts'][da_type]]
+                    labels = [(x['intent'], x['domain'], x['slot']) for x in sample['dialogue_acts'][da_type]]
+                else:
+                    predicts = [(x['intent'], x['domain'], x['slot'], ''.join(x['value'].split()).lower()) for x in sample['predictions']['dialogue_acts'][da_type]]
+                    labels = [(x['intent'], x['domain'], x['slot'], ''.join(x['value'].split()).lower()) for x in sample['dialogue_acts'][da_type]]
+                predicts = sorted(list(set(predicts)))
+                labels = sorted(list(set(labels)))
+                for ele in predicts:
+                    if ele in labels:
+                        metrics['overall']['TP'] += 1
+                        metrics[da_type]['TP'] += 1
+                    else:
+                        metrics['overall']['FP'] += 1
+                        metrics[da_type]['FP'] += 1
+                for ele in labels:
+                    if ele not in predicts:
+                        metrics['overall']['FN'] += 1
+                        metrics[da_type]['FN'] += 1
+                flag &= (predicts==labels)
+            acc.append(flag)
+        elif isinstance(sample['predictions']['dialogue_acts'], list):
+            gold_da = sorted(list({(da['intent'], da['domain'], da['slot'], ''.join(da.get('value', '').split()).lower()) for da_type in ['binary', 'categorical', 'non-categorical'] for da in sample['dialogue_acts'][da_type]}))
+            pred_da = sorted(list({(da['intent'], da['domain'], da['slot'], ''.join(da.get('value', '').split()).lower()) for da in sample['predictions']['dialogue_acts']}))
+            acc.append(pred_da==gold_da)
+            for ele in pred_da:
+                if ele in gold_da:
                     metrics['overall']['TP'] += 1
-                    metrics[da_type]['TP'] += 1
                 else:
                     metrics['overall']['FP'] += 1
-                    metrics[da_type]['FP'] += 1
-            for ele in labels:
-                if ele not in predicts:
+            for ele in gold_da:
+                if ele not in pred_da:
                     metrics['overall']['FN'] += 1
-                    metrics[da_type]['FN'] += 1
-            flag &= (predicts==labels)
-        acc.append(flag)
+        else:
+            raise TypeError("predictions['dialogue_acts'] must be a dict or a list")
     
     for metric in metrics:
         TP = metrics[metric].pop('TP')
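`evaluate()` now dispatches on the shape of `predictions['dialogue_acts']`: the original per-type dict keeps the full per-type metric breakdown, while the flat list produced by `deserialize_dialogue_acts` is scored on overall metrics only. A sketch of one sample in the new list format (field contents illustrative):

```python
# Sketch: a predict_result sample in the new flat-list prediction format.
sample = {
    "dialogue_acts": {  # gold acts keep the grouped unified format
        "binary": [],
        "categorical": [],
        "non-categorical": [
            {"intent": "inform", "domain": "taxi", "slot": "leave at", "value": "17:15"}
        ],
    },
    "predictions": {
        # a flat list, as produced by deserialize_dialogue_acts
        "dialogue_acts": [
            {"intent": "inform", "domain": "taxi", "slot": "leave at", "value": "17:15"}
        ]
    },
}
```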