diff --git a/convlab2/base_models/gpt/create_data.py b/convlab2/base_models/gpt/create_data.py
new file mode 100644
index 0000000000000000000000000000000000000000..dd616b590ce0d725a9ae393de4f180a3bfd6d097
--- /dev/null
+++ b/convlab2/base_models/gpt/create_data.py
@@ -0,0 +1,39 @@
+import os
+import json
+from tqdm import tqdm
+import re
+from convlab2.util import load_dataset
+
+
+def create_lm_data(dataset, data_dir, args):
+    data_by_split = dataset
+    os.makedirs(data_dir, exist_ok=True)
+
+    data_splits = data_by_split.keys()
+    for data_split in data_splits:
+        data = []
+        for sample in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False):
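+            # DialoGPT format: join utterances with the <|endoftext|> turn separator;
+            # plain GPT format: one "speaker: utterance" line per turn.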
+            if args.model_type == 'dialogpt':
+                dialogue = ' <|endoftext|> '.join([turn['utterance'] for turn in sample['turns']]) + ' <|endoftext|>'
+            else:
+                dialogue = '\n'.join([f"{turn['speaker']}: {turn['utterance']}" for turn in sample['turns']])
+            data.append(json.dumps({'dialogue': dialogue}, ensure_ascii=False)+'\n')
+
+        file_name = os.path.join(data_dir, f"{data_split}.json")
+        with open(file_name, "w", encoding='utf-8') as f:
+            f.writelines(data)
+
+
+if __name__ == '__main__':
+    from argparse import ArgumentParser
+    parser = ArgumentParser(description="create data for seq2seq training")
+    parser.add_argument('--tasks', '-t', metavar='task_name', nargs='*', choices=['lm'], help='names of tasks')
+    parser.add_argument('--datasets', '-d', metavar='dataset_name', nargs='*', help='names of unified datasets')
+    parser.add_argument('--model_type', '-m', metavar='model_type', help='type of the language model: gpt, dialogpt, ...')
+    args = parser.parse_args()
+    print(args)
+    for dataset_name in tqdm(args.datasets, desc='datasets'):
+        dataset = load_dataset(dataset_name)
+        for task_name in tqdm(args.tasks, desc='tasks', leave=False):
+            data_dir = os.path.join('data', task_name, dataset_name, args.model_type)
+            eval(f"create_{task_name}_data")(dataset, data_dir, args)
diff --git a/convlab2/base_models/gpt/keyword_extraction/get_keywords.sh b/convlab2/base_models/gpt/keyword_extraction/get_keywords.sh
new file mode 100644
index 0000000000000000000000000000000000000000..6dd2680bc3c4390cf2d85cff46d7000c5293ef70
--- /dev/null
+++ b/convlab2/base_models/gpt/keyword_extraction/get_keywords.sh
@@ -0,0 +1,20 @@
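+# extract per-turn keywords from the word-level LM loss file produced by get_word_loss.sh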
+model_type=dialogpt
+dataset_name=multiwoz21
+model_name=dialogpt-large
+data_dir="data/lm/${dataset_name}/${model_type}"
+word_loss_file="${data_dir}/${model_name}_${dataset_name}_word_loss.json"
+keywords_num=5
+keywords_ratio=1
+keywords_th=0
+stopwords=True
+output_file="${data_dir}/${dataset_name}_keywords_${model_name}_topk_${keywords_num}_ratio_${keywords_ratio}_th_${keywords_th}_stopwords_${stopwords}.json"
+
+python lmloss2keywords.py \
+    --model_type ${model_type} \
+    --word_loss_file ${word_loss_file} \
+    --keywords_num ${keywords_num} \
+    --keywords_ratio ${keywords_ratio} \
+    --keywords_th ${keywords_th} \
+    --stopwords ${stopwords} \
+    --output_file ${output_file}
+    
\ No newline at end of file
diff --git a/convlab2/base_models/gpt/keyword_extraction/get_word_loss.sh b/convlab2/base_models/gpt/keyword_extraction/get_word_loss.sh
new file mode 100644
index 0000000000000000000000000000000000000000..2aad467cf181c08532505a1523af746e52aacb4a
--- /dev/null
+++ b/convlab2/base_models/gpt/keyword_extraction/get_word_loss.sh
@@ -0,0 +1,65 @@
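+# compute per-token LM loss on the multiwoz21 validation split with three language models and convert it to per-word loss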
+set -e
+n_gpus=1
+task_name="lm"
+dataset_name="multiwoz21"
+model_type="dialogpt"
+data_dir="data/${task_name}/${dataset_name}/${model_type}"
+output_dir="output/${task_name}/${dataset_name}/${model_type}"
+cache_dir="../cache"
+validation_file="${data_dir}/validation.json"
+source_column="dialogue"
+max_length=512
+model_name_or_path="microsoft/DialoGPT-large"
+per_device_eval_batch_size=4
+
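+# 1) per-token loss of the pretrained microsoft/DialoGPT-large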
+dump_eval_loss_to="${data_dir}/dialogpt-large_${dataset_name}_token_loss.json"
+python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --model_type dialogpt
+python ../run_clm.py \
+    --dump_eval_loss_to ${dump_eval_loss_to}\
+    --model_name_or_path ${model_name_or_path} \
+    --output_dir ${data_dir} \
+    --validation_file ${validation_file} \
+    --source_column ${source_column} \
+    --max_length ${max_length} \
+    --do_eval \
+    --prediction_loss_only \
+    --cache_dir ${cache_dir} \
+    --preprocessing_num_workers 4 \
+    --per_device_eval_batch_size ${per_device_eval_batch_size}
+python lmloss2keywords.py --token_loss_file ${dump_eval_loss_to} --model_type ${model_type}
+
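+# 2) per-token loss of DialoGPT-large fine-tuned on multiwoz21 (the model trained by train_lm.sh into ${output_dir})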
+dump_eval_loss_to="${data_dir}/dialogpt-large-mwoz_${dataset_name}_token_loss.json"
+python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --model_type dialogpt
+python ../run_clm.py \
+    --dump_eval_loss_to ${dump_eval_loss_to}\
+    --model_name_or_path ${output_dir} \
+    --output_dir ${data_dir} \
+    --validation_file ${validation_file} \
+    --source_column ${source_column} \
+    --max_length ${max_length} \
+    --do_eval \
+    --prediction_loss_only \
+    --cache_dir ${cache_dir} \
+    --preprocessing_num_workers 4 \
+    --per_device_eval_batch_size ${per_device_eval_batch_size}
+python lmloss2keywords.py --token_loss_file ${dump_eval_loss_to} --model_type ${model_type}
+
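+# 3) per-token loss of the pretrained gpt2-large using the plain "speaker: utterance" data format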
+model_type="gpt"
+data_dir="data/${task_name}/${dataset_name}/${model_type}"
+validation_file="${data_dir}/validation.json"
+model_name_or_path="gpt2-large"
+dump_eval_loss_to="${data_dir}/gpt2-large_${dataset_name}_token_loss.json"
+python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --model_type gpt
+python ../run_clm.py \
+    --dump_eval_loss_to ${dump_eval_loss_to}\
+    --model_name_or_path ${model_name_or_path} \
+    --output_dir ${data_dir} \
+    --validation_file ${validation_file} \
+    --source_column ${source_column} \
+    --max_length ${max_length} \
+    --do_eval \
+    --prediction_loss_only \
+    --cache_dir ${cache_dir} \
+    --preprocessing_num_workers 4 \
+    --per_device_eval_batch_size ${per_device_eval_batch_size}
+python lmloss2keywords.py --token_loss_file ${dump_eval_loss_to} --model_type ${model_type}
diff --git a/convlab2/base_models/gpt/keyword_extraction/lmloss2keywords.py b/convlab2/base_models/gpt/keyword_extraction/lmloss2keywords.py
new file mode 100644
index 0000000000000000000000000000000000000000..307d57edf4d09c8a72968f35051d451afe21bc64
--- /dev/null
+++ b/convlab2/base_models/gpt/keyword_extraction/lmloss2keywords.py
@@ -0,0 +1,123 @@
+import json
+import json_lines
+from pprint import pprint
+import os
+from tqdm import tqdm
+import numpy as np
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
+
+
+def merge_tokens(tokens, losses, loss_merge_func=np.mean):
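+    """Merge GPT-2 BPE tokens back into words; merge each word's token losses with loss_merge_func."""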
+    res = []
+    i = 0
+    while i < len(tokens):
+        token = tokens[i]
+        loss = losses[i]
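+        # 'Ġ' (space) and 'Ċ' (newline) markers from GPT-2 BPE are skipped; after a newline the next token starts a new word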
+        if token in ['Ġ', 'Ċ']:
+            if token == 'Ċ' and i < len(tokens) - 1:
+                tokens[i+1] = 'Ġ'+tokens[i+1]
+            i += 1
+            continue
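+        # drop the "user:"/"system:" speaker prefix; for non-initial turns the colon becomes the <|endoftext|> turn separator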
+        if token in ['user', 'system', 'Ġuser', 'Ġsystem'] and i < len(tokens)-1 and tokens[i+1] == ':':
+            if i > 0:
+                tokens[i+1] = '<|endoftext|>'
+                i += 1
+            else:
+                i += 2
+            continue
+        if token.startswith('Ġ'):
+            # Ġ means space
+            token = token.replace("Ġ", "")
+            res.append([token, loss])
+        elif token == '<|endoftext|>':
+            res.append([token, loss])
+        else:
+            assert 'Ġ' not in token
+            if len(res) > 0:
+                res[-1][0] += token
+                res[-1].append(loss)
+            else:
+                res.append([token, loss])
+        i += 1
+    if loss_merge_func:
+        for i in range(len(res)):
+            res[i] = [res[i][0], loss_merge_func(res[i][1:])]
+    return res
+
+
+def convert_token_loss2word_loss(token_loss_file, loss_merge_func=np.mean):
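+    """Convert a token-level loss file (JSON Lines with tokens and losses) into a word-level one saved alongside it."""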
+    word_loss_file = os.path.join(os.path.dirname(token_loss_file), token_loss_file.split('/')[-1].replace('token', 'word'))
+    fin = open(token_loss_file, 'rb')
+    fout = open(word_loss_file, 'w', encoding='utf-8')
+    lines = []
+
+    for item in tqdm(json_lines.reader(fin)):
+        tokens, losses = item['tokens'], item['losses']
+        assert len(tokens) == len(losses)
+        word2losses = merge_tokens(tokens, losses, loss_merge_func)
+        lines.append({"words": [x[0] for x in word2losses], "losses": [x[1] for x in word2losses]})
+        fout.write(json.dumps(lines[-1], ensure_ascii=False)+'\n')
+
+    fin.close()
+    fout.close()
+    return lines
+
+def main(args):
+    if not args.word_loss_file:
+        word_loss_list = convert_token_loss2word_loss(args.token_loss_file)
+    else:
+        fin = open(args.word_loss_file, 'rb')
+        word_loss_list = []
+        for item in json_lines.reader(fin):
+            words, losses = item['words'], item['losses']
+            word_loss_list.append({"words": words, "losses": losses})
+        fin.close()
+
+    if not args.output_file:
+        return
+
+    stop_words = set(stopwords.words('english'))
+
+    dialogs = []
+    for item in word_loss_list:
+        words = item['words']
+        losses = item['losses']
+        turns = []
+        turn = {'words': [], 'losses': []}
+        for word, loss in zip(words, losses):
+            if word == '<|endoftext|>':
+                # switch turn
+                turn['utterance'] = ' '.join(turn['words'])
+                turn['keywords'] = list(zip(turn['words'], turn['losses']))
+                if args.stopwords:
+                    turn['keywords'] = [x for x in turn['keywords'] if not any([w.lower() in stop_words for w in word_tokenize(x[0])])]
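+                # rank words by LM loss (harder-to-predict words are treated as keywords), then apply the threshold, ratio and top-k cut-offs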
+                turn['keywords'] = sorted(turn['keywords'], key=lambda x: x[1], reverse=True)
+                turn['keywords'] = [x for x in turn['keywords'] if x[1] > args.keywords_th][:min(round(args.keywords_ratio*len(turn['keywords'])), args.keywords_num)]
+                turn.pop('words')
+                turn.pop('losses')
+                turns.append(turn)
+                turn = {'words': [], 'losses': []}
+            else:
+                turn['words'].append(word)
+                turn['losses'].append(loss)
+        dialogs.append(turns)
+    json.dump(dialogs, open(args.output_file, "w", encoding='utf-8'), indent=2, ensure_ascii=False)
+
+
+if __name__ == '__main__':
+    from argparse import ArgumentParser
+    parser = ArgumentParser(description="extract keywords according to lm loss")
+    parser.add_argument('--model_type', '-m', type=str, help='gpt or dialogpt')
+    parser.add_argument('--token_loss_file', '-t', type=str, help='path to the token loss file that contains two columns: [tokens, losses]')
+    parser.add_argument('--word_loss_file', '-w', type=str, help='path to the word loss file that contains two columns: [words, losses]')
+    parser.add_argument('--output_file', '-o', type=str, help='path to the output file')
+    parser.add_argument('--keywords_num', '-n', type=int, default=100, help='how many words in an utterance serve as keywords')
+    parser.add_argument('--keywords_ratio', '-r', type=float, default=1.0, help='how many words (in ratio) in an utterance serve as keywords')
+    parser.add_argument('--keywords_th', '-th', type=float, default=0., help='loss threshold for the keywords')
+    parser.add_argument('--stopwords', '-s', type=lambda x: bool(eval(x)), default=True, help='filter out stopwords')
+    args = parser.parse_args()
+    print(args)
+    main(args)
diff --git a/convlab2/base_models/gpt/keyword_extraction/merge_keywords_res.py b/convlab2/base_models/gpt/keyword_extraction/merge_keywords_res.py
new file mode 100644
index 0000000000000000000000000000000000000000..a8ebd5ba0b623cae37521bfe93fe046d8cd0c53e
--- /dev/null
+++ b/convlab2/base_models/gpt/keyword_extraction/merge_keywords_res.py
@@ -0,0 +1,29 @@
+import json
+
+def main(args):
+    filename2data = {f.split('/')[-1]: json.load(open(f)) for f in args.keywords_files}
+    first_filename = args.keywords_files[0].split('/')[-1]
+    dialogs = []
+    for i in range(len(filename2data[first_filename])):
+        turns = []
+        for j in range(len(filename2data[first_filename][i])):
+            utt = filename2data[first_filename][i][j]['utterance']
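+            # one column per input file, keyed by "<model>_nonstopword<stopwords-flag>" parsed from the keywords file name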
+            keywords = {filename.split('_')[2]+'_nonstopword'+filename.split('_')[-1]: ' | '.join([x[0] for x in filename2data[filename][i][j]['keywords']]) for filename in filename2data}
+            turns.append({
+                "utterance": utt,
+                **keywords
+            })
+        dialogs.append(turns)
+    json.dump(dialogs, open(args.output_file, "w", encoding='utf-8'), indent=2, ensure_ascii=False)
+
+
+if __name__ == '__main__':
+    from argparse import ArgumentParser
+    parser = ArgumentParser(description="calculate NLU metrics for unified datasets")
+    parser.add_argument('--keywords_files', '-f', metavar='keywords_files', nargs='*', help='keywords files')
+    parser.add_argument('--output_file', '-o', type=str, help='path to the output file')
+    args = parser.parse_args()
+    print(args)
+    main(args)
diff --git a/convlab2/base_models/gpt/keyword_extraction/train_lm.sh b/convlab2/base_models/gpt/keyword_extraction/train_lm.sh
new file mode 100644
index 0000000000000000000000000000000000000000..4ae47c3296e5ca7150cbbffcb7a7d247973613de
--- /dev/null
+++ b/convlab2/base_models/gpt/keyword_extraction/train_lm.sh
@@ -0,0 +1,47 @@
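+# fine-tune DialoGPT-large on multiwoz21 dialogues with a causal LM objective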
+set -e
+n_gpus=1
+task_name="lm"
+dataset_name="multiwoz21"
+model_type="dialogpt"
+data_dir="data/${task_name}/${dataset_name}/${model_type}"
+output_dir="output/${task_name}/${dataset_name}/${model_type}"
+cache_dir="../cache"
+logging_dir="${output_dir}/runs"
+train_file="${data_dir}/train.json"
+validation_file="${data_dir}/validation.json"
+test_file="${data_dir}/test.json"
+source_column="dialogue"
+max_length=512
+model_name_or_path="microsoft/DialoGPT-large"
+per_device_train_batch_size=16
+per_device_eval_batch_size=16
+gradient_accumulation_steps=4
+lr=5e-5
+num_train_epochs=3
+
+python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --model_type dialogpt
+
+python ../run_clm.py \
+    --model_name_or_path ${model_name_or_path} \
+    --train_file ${train_file} \
+    --validation_file ${validation_file} \
+    --source_column ${source_column} \
+    --max_length ${max_length} \
+    --do_train \
+    --do_eval \
+    --save_strategy epoch \
+    --evaluation_strategy epoch \
+    --load_best_model_at_end \
+    --prediction_loss_only \
+    --cache_dir ${cache_dir} \
+    --output_dir ${output_dir} \
+    --logging_dir ${logging_dir} \
+    --overwrite_output_dir \
+    --preprocessing_num_workers 4 \
+    --per_device_train_batch_size ${per_device_train_batch_size} \
+    --per_device_eval_batch_size ${per_device_eval_batch_size} \
+    --gradient_accumulation_steps ${gradient_accumulation_steps} \
+    --learning_rate ${lr} \
+    --num_train_epochs ${num_train_epochs} \
+    --debug underflow_overflow \
+    --gradient_checkpointing
diff --git a/convlab2/base_models/gpt/run_clm.py b/convlab2/base_models/gpt/run_clm.py
new file mode 100644
index 0000000000000000000000000000000000000000..9dff4a0ae4106964c0ccba636953bbd6544baded
--- /dev/null
+++ b/convlab2/base_models/gpt/run_clm.py
@@ -0,0 +1,601 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2020 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Fine-tuning the library models for causal language modeling (GPT, GPT-2, CTRL, ...) on a text file or a dataset.
+Modified from https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_clm.py
+Here is the full list of checkpoints on the hub that can be fine-tuned by this script:
+https://huggingface.co/models?filter=text-generation
+"""
+# You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments.
+
+import logging
+import math
+import os
+import sys
+from dataclasses import dataclass, field
+from itertools import chain
+from typing import Optional
+
+import datasets
+from datasets import load_dataset
+from tqdm import tqdm
+from torch.utils.data import DataLoader
+import torch
+import json
+
+import transformers
+from transformers import (
+    CONFIG_MAPPING,
+    MODEL_FOR_CAUSAL_LM_MAPPING,
+    AutoConfig,
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    HfArgumentParser,
+    Trainer,
+    TrainingArguments,
+    DataCollatorForTokenClassification,
+    is_torch_tpu_available,
+    set_seed,
+)
+from transformers.trainer_utils import get_last_checkpoint
+from transformers.utils import check_min_version
+from transformers.utils.versions import require_version
+
+
+# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
+check_min_version("4.17.0")
+
+require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
+
+logger = logging.getLogger(__name__)
+
+
+MODEL_CONFIG_CLASSES = list(MODEL_FOR_CAUSAL_LM_MAPPING.keys())
+MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
+
+
+@dataclass
+class ModelArguments:
+    """
+    Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
+    """
+
+    model_name_or_path: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "The model checkpoint for weights initialization."
+            "Don't set if you want to train a model from scratch."
+        },
+    )
+    model_type: Optional[str] = field(
+        default=None,
+        metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)},
+    )
+    config_overrides: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "Override some existing default config settings when a model is trained from scratch. Example: "
+            "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index"
+        },
+    )
+    config_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
+    )
+    tokenizer_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
+    )
+    cache_dir: Optional[str] = field(
+        default=None,
+        metadata={"help": "Where to store the pretrained models downloaded from huggingface.co"},
+    )
+    use_fast_tokenizer: bool = field(
+        default=True,
+        metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
+    )
+    truncation_side: Optional[str] = field(
+        default="right",
+        metadata={"help": "Which side to truncate, left or right."}
+    )
+    model_revision: str = field(
+        default="main",
+        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
+    )
+    use_auth_token: bool = field(
+        default=False,
+        metadata={
+            "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script "
+            "with private models)."
+        },
+    )
+    resize_position_embeddings: Optional[bool] = field(
+        default=None,
+        metadata={
+            "help": "Whether to automatically resize the position embeddings if `max_source_length` exceeds "
+                    "the model's position embeddings."
+        },
+    )
+
+    def __post_init__(self):
+        if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None):
+            raise ValueError(
+                "--config_overrides can't be used in combination with --config_name or --model_name_or_path"
+            )
+
+
+@dataclass
+class DataTrainingArguments:
+    """
+    Arguments pertaining to what data we are going to input our model for training and eval.
+    """
+
+    dataset_name: Optional[str] = field(
+        default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
+    )
+    dataset_config_name: Optional[str] = field(
+        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
+    )
+    source_column: Optional[str] = field(
+        default=None,
+        metadata={"help": "The name of the column in the datasets containing the texts."},
+    )
+    train_file: Optional[str] = field(
+        default=None, metadata={"help": "The input training data file (a text, jsonlines or csv file)."}
+    )
+    validation_file: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "An optional input evaluation data file to evaluate the metrics on (a text, jsonlines or csv file)."
+        },
+    )
+    dump_eval_loss_to: Optional[str] = field(
+        default=None, metadata={"help": "Where to dump the tokens' losses in the evaluation data, default not to"}
+    )
+    overwrite_cache: bool = field(
+        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
+    )
+    preprocessing_num_workers: Optional[int] = field(
+        default=None,
+        metadata={"help": "The number of processes to use for the preprocessing."},
+    )
+    max_length: Optional[int] = field(
+        default=1024,
+        metadata={
+            "help": "The maximum total input sequence length after tokenization. Sequences longer "
+                    "than this will be truncated, sequences shorter will be padded."
+        },
+    )
+    pad_to_max_length: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to pad all samples to model maximum sentence length. "
+                    "If False, will pad the samples dynamically when batching to the maximum length in the batch. More "
+                    "efficient on GPU but very bad for TPU."
+        },
+    )
+    max_train_samples: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
+            "value if set."
+        },
+    )
+    max_eval_samples: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
+            "value if set."
+        },
+    )
+    ignore_pad_token_for_loss: bool = field(
+        default=True,
+        metadata={
+            "help": "Whether to ignore the tokens corresponding to padded labels in the loss computation or not."
+        },
+    )
+    validation_split_percentage: Optional[int] = field(
+        default=5,
+        metadata={
+            "help": "The percentage of the train set used as validation set in case there's no validation split"
+        },
+    )
+    keep_linebreaks: bool = field(
+        default=True, metadata={"help": "Whether to keep line breaks when using TXT files or not."}
+    )
+
+    def __post_init__(self):
+        if self.dataset_name is None and self.train_file is None and self.validation_file is None:
+            raise ValueError("Need either a dataset name or a training/validation file.")
+        else:
+            if self.train_file is not None:
+                extension = self.train_file.split(".")[-1]
+                assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file."
+            if self.validation_file is not None:
+                extension = self.validation_file.split(".")[-1]
+                assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file."
+
+
+def main():
+    # See all possible arguments in src/transformers/training_args.py
+    # or by passing the --help flag to this script.
+    # We now keep distinct sets of args, for a cleaner separation of concerns.
+
+    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
+    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
+        # If we pass only one argument to the script and it's the path to a json file,
+        # let's parse it to get our arguments.
+        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
+    else:
+        model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+
+    # Setup logging
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        handlers=[logging.StreamHandler(sys.stdout)],
+    )
+    log_level = training_args.get_process_log_level()
+    logger.setLevel(log_level)
+    datasets.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.enable_default_handler()
+    transformers.utils.logging.enable_explicit_format()
+
+    # Log on each process the small summary:
+    logger.warning(
+        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
+        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
+    )
+    logger.info(f"Training/evaluation parameters {training_args}")
+
+    # Detecting last checkpoint.
+    last_checkpoint = None
+    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
+        last_checkpoint = get_last_checkpoint(training_args.output_dir)
+        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
+            raise ValueError(
+                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
+                "Use --overwrite_output_dir to overcome."
+            )
+        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
+            logger.info(
+                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
+                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
+            )
+
+    # Set seed before initializing model.
+    set_seed(training_args.seed)
+
+    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
+    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
+    # (the dataset will be downloaded automatically from the datasets Hub).
+    #
+    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
+    # 'text' is found. You can easily tweak this behavior (see below).
+    #
+    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
+    # download the dataset.
+    if data_args.dataset_name is not None:
+        # Downloading and loading a dataset from the hub.
+        raw_datasets = load_dataset(
+            data_args.dataset_name,
+            data_args.dataset_config_name,
+            cache_dir=model_args.cache_dir,
+            use_auth_token=True if model_args.use_auth_token else None,
+        )
+        if "validation" not in raw_datasets.keys():
+            raw_datasets["validation"] = load_dataset(
+                data_args.dataset_name,
+                data_args.dataset_config_name,
+                split=f"train[:{data_args.validation_split_percentage}%]",
+                cache_dir=model_args.cache_dir,
+                use_auth_token=True if model_args.use_auth_token else None,
+            )
+            raw_datasets["train"] = load_dataset(
+                data_args.dataset_name,
+                data_args.dataset_config_name,
+                split=f"train[{data_args.validation_split_percentage}%:]",
+                cache_dir=model_args.cache_dir,
+                use_auth_token=True if model_args.use_auth_token else None,
+            )
+    else:
+        data_files = {}
+        dataset_args = {}
+        if data_args.train_file is not None:
+            data_files["train"] = data_args.train_file
+        if data_args.validation_file is not None:
+            data_files["validation"] = data_args.validation_file
+        extension = (
+            data_args.train_file.split(".")[-1]
+            if data_args.train_file is not None
+            else data_args.validation_file.split(".")[-1]
+        )
+        if extension == "txt":
+            extension = "text"
+            dataset_args["keep_linebreaks"] = data_args.keep_linebreaks
+        raw_datasets = load_dataset(
+            extension,
+            data_files=data_files,
+            cache_dir=model_args.cache_dir,
+            use_auth_token=True if model_args.use_auth_token else None,
+            **dataset_args,
+        )
+        # If no validation data is there, validation_split_percentage will be used to divide the dataset.
+        if "validation" not in raw_datasets.keys():
+            raw_datasets["validation"] = load_dataset(
+                extension,
+                data_files=data_files,
+                split=f"train[:{data_args.validation_split_percentage}%]",
+                cache_dir=model_args.cache_dir,
+                use_auth_token=True if model_args.use_auth_token else None,
+                **dataset_args,
+            )
+            raw_datasets["train"] = load_dataset(
+                extension,
+                data_files=data_files,
+                split=f"train[{data_args.validation_split_percentage}%:]",
+                cache_dir=model_args.cache_dir,
+                use_auth_token=True if model_args.use_auth_token else None,
+                **dataset_args,
+            )
+
+    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
+    # https://huggingface.co/docs/datasets/loading_datasets.html.
+
+    # Load pretrained model and tokenizer
+    #
+    # Distributed training:
+    # The .from_pretrained methods guarantee that only one local process can concurrently
+    # download model & vocab.
+    config_kwargs = {
+        "cache_dir": model_args.cache_dir,
+        "revision": model_args.model_revision,
+        "use_auth_token": True if model_args.use_auth_token else None,
+    }
+    if model_args.config_name:
+        config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs)
+    elif model_args.model_name_or_path:
+        config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs)
+    else:
+        config = CONFIG_MAPPING[model_args.model_type]()
+        logger.warning("You are instantiating a new config instance from scratch.")
+        if model_args.config_overrides is not None:
+            logger.info(f"Overriding config: {model_args.config_overrides}")
+            config.update_from_string(model_args.config_overrides)
+            logger.info(f"New config: {config}")
+
+    tokenizer_kwargs = {
+        "cache_dir": model_args.cache_dir,
+        "use_fast": model_args.use_fast_tokenizer,
+        "truncation_side": model_args.truncation_side,
+        "revision": model_args.model_revision,
+        "use_auth_token": True if model_args.use_auth_token else None,
+    }
+    if model_args.tokenizer_name:
+        tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs)
+    elif model_args.model_name_or_path:
+        tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs)
+    else:
+        raise ValueError(
+            "You are instantiating a new tokenizer from scratch. This is not supported by this script."
+            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
+        )
+
+    if not tokenizer.pad_token:
+        tokenizer.pad_token = tokenizer.eos_token
+
+    if model_args.model_name_or_path:
+        model = AutoModelForCausalLM.from_pretrained(
+            model_args.model_name_or_path,
+            from_tf=bool(".ckpt" in model_args.model_name_or_path),
+            config=config,
+            cache_dir=model_args.cache_dir,
+            revision=model_args.model_revision,
+            use_auth_token=True if model_args.use_auth_token else None,
+        )
+    else:
+        model = AutoModelForCausalLM.from_config(config)
+        n_params = sum(dict((p.data_ptr(), p.numel()) for p in model.parameters()).values())
+        logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params")
+
+    model.resize_token_embeddings(len(tokenizer))
+
+    if training_args.gradient_checkpointing:
+        # use_cache=True is incompatible with gradient checkpointing.
+        config.use_cache = False
+
+    # Preprocessing the datasets.
+    # First we tokenize all the texts.
+    if training_args.do_train:
+        column_names = raw_datasets["train"].column_names
+    elif training_args.do_eval:
+        column_names = raw_datasets["validation"].column_names
+    else:
+        logger.info("There is nothing to do. Please pass `do_train` and/or `do_eval`.")
+        return
+    if data_args.source_column is None:
+        source_column = column_names[0]
+    else:
+        source_column = data_args.source_column
+        if source_column not in column_names:
+            raise ValueError(
+                f"--source_column' value '{data_args.source_column}' needs to be one of: {', '.join(column_names)}"
+            )
+
+    def preprocess_function(examples):
+
+        inputs = []
+        for i in range(len(examples[source_column])):
+            if len(examples[source_column][i]) > 0:
+                inputs.append(examples[source_column][i])
+        
+        padding = "max_length" if data_args.pad_to_max_length else False
+        model_inputs = tokenizer(inputs, max_length=data_args.max_length, padding=padding, truncation=True)
+
+        # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
+        # padding in the loss. Else pad in data_collator.
+        if padding == "max_length" and data_args.ignore_pad_token_for_loss:
+            model_inputs["labels"] = [
+                [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in model_inputs["input_ids"]
+            ]
+        else:
+            model_inputs["labels"] = model_inputs["input_ids"].copy()
+
+        return model_inputs
+
+    with training_args.main_process_first(desc="dataset map tokenization"):
+        tokenized_datasets = raw_datasets.map(
+            preprocess_function,
+            batched=True,
+            num_proc=data_args.preprocessing_num_workers,
+            remove_columns=column_names,
+            load_from_cache_file=not data_args.overwrite_cache,
+            desc="Running tokenizer on dataset",
+        )
+    
+    lm_datasets = tokenized_datasets
+
+    if training_args.do_train:
+        if "train" not in tokenized_datasets:
+            raise ValueError("--do_train requires a train dataset")
+        train_dataset = lm_datasets["train"]
+        if data_args.max_train_samples is not None:
+            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
+            train_dataset = train_dataset.select(range(max_train_samples))
+
+    if training_args.do_eval:
+        if "validation" not in tokenized_datasets:
+            raise ValueError("--do_eval requires a validation dataset")
+        eval_dataset = lm_datasets["validation"]
+        if data_args.max_eval_samples is not None:
+            max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
+            eval_dataset = eval_dataset.select(range(max_eval_samples))
+
+    # Data collator
+    label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id
+    data_collator = DataCollatorForTokenClassification(
+        tokenizer,
+        label_pad_token_id=label_pad_token_id,
+        pad_to_multiple_of=8 if training_args.fp16 else None,
+    )
+
+    # Initialize our Trainer
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=train_dataset if training_args.do_train else None,
+        eval_dataset=eval_dataset if training_args.do_eval else None,
+        tokenizer=tokenizer,
+        # Data collator will default to DataCollatorWithPadding, so we change it.
+        data_collator=data_collator
+    )
+
+    # Training
+    if training_args.do_train:
+        checkpoint = None
+        if training_args.resume_from_checkpoint is not None:
+            checkpoint = training_args.resume_from_checkpoint
+        elif last_checkpoint is not None:
+            checkpoint = last_checkpoint
+        train_result = trainer.train(resume_from_checkpoint=checkpoint)
+        trainer.save_model()  # Saves the tokenizer too for easy upload
+
+        metrics = train_result.metrics
+        max_train_samples = (
+            data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
+        )
+        metrics["train_samples"] = min(max_train_samples, len(train_dataset))
+
+        trainer.log_metrics("train", metrics)
+        trainer.save_metrics("train", metrics)
+        trainer.save_state()
+
+    # Evaluation
+    if training_args.do_eval:
+        logger.info("*** Evaluate ***")
+        if not data_args.dump_eval_loss_to:
+            metrics = trainer.evaluate(metric_key_prefix="eval")
+            max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
+            metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
+            try:
+                perplexity = math.exp(metrics["eval_loss"])
+            except OverflowError:
+                perplexity = float("inf")
+            metrics["eval_perplexity"] = perplexity
+            logger.info(f"eval_perplexity: {perplexity}")
+
+            trainer.log_metrics("eval", metrics)
+            trainer.save_metrics("eval", metrics)
+        else:
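+            # instead of aggregate eval metrics, dump per-token losses for every validation example to a JSON Lines file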
+            if trainer.is_world_process_zero():
+                output_prediction_file = data_args.dump_eval_loss_to
+                writer = open(output_prediction_file, "w", encoding='utf-8')
+
+                eval_dataloader = DataLoader(
+                    eval_dataset, collate_fn=lambda x: {k: v.to(model.device) for k, v in data_collator(x).items()}, batch_size=training_args.per_device_eval_batch_size
+                )
+                model.eval()
+                losses = []
+                loss_fct = torch.nn.CrossEntropyLoss(reduction='none')
+                for batch in tqdm(eval_dataloader):
+                    with torch.no_grad():
+                        outputs = model(**batch)
+
+                    loss = outputs.loss
+                    losses.append(loss.repeat(training_args.per_device_eval_batch_size))
+                    
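+                    # recompute per-token losses: shift logits/labels by one position (causal LM) and keep one loss per position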
+                    shift_logits = outputs.logits[..., :-1, :].contiguous()
+                    shift_labels = batch['labels'][..., 1:].contiguous()
+                    batch_token_loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
+                    batch_token_loss = batch_token_loss.view(shift_labels.size()).tolist()
+                    labels = batch['labels'].tolist()
+                    for i in range(len(labels)):
+                        token_ids = [x for x in labels[i] if x != -100]
+                        tokens = tokenizer.convert_ids_to_tokens(token_ids)
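+                        # the first token has no prediction loss, so prepend 0; -100 padding positions are excluded via token_ids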
+                        token_losses = [0] + batch_token_loss[i][:len(token_ids)-1]
+                        writer.write(json.dumps({"tokens": tokens, "losses": token_losses}, ensure_ascii=False)+'\n')
+
+                losses = torch.cat(losses)
+                losses = losses[: len(eval_dataset)]
+                try:
+                    perplexity = math.exp(torch.mean(losses))
+                except OverflowError:
+                    perplexity = float("inf")
+                logger.info(f"perplexity: {perplexity}")
+
+                writer.close()
+
+    kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-generation"}
+    if data_args.dataset_name is not None:
+        kwargs["dataset_tags"] = data_args.dataset_name
+        if data_args.dataset_config_name is not None:
+            kwargs["dataset_args"] = data_args.dataset_config_name
+            kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
+        else:
+            kwargs["dataset"] = data_args.dataset_name
+
+    if training_args.push_to_hub:
+        trainer.push_to_hub(**kwargs)
+    else:
+        trainer.create_model_card(**kwargs)
+
+
+def _mp_fn(index):
+    # For xla_spawn (TPUs)
+    main()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/convlab2/base_models/t5/create_data.py b/convlab2/base_models/t5/create_data.py
index d4b99d9a7ebfc294433f06e795aff2620f0c1fb5..c2f3da96523266936e4f15ad19682de62e5da3da 100644
--- a/convlab2/base_models/t5/create_data.py
+++ b/convlab2/base_models/t5/create_data.py
@@ -2,6 +2,7 @@ import os
 import json
 from tqdm import tqdm
 import re
+from transformers import AutoTokenizer
 from convlab2.util import load_dataset, load_nlu_data, load_dst_data, load_policy_data, load_nlg_data, load_e2e_data, load_rg_data
 from convlab2.base_models.t5.nlu.serialization import serialize_dialogue_acts, deserialize_dialogue_acts, equal_da_seq
 from convlab2.base_models.t5.dst.serialization import serialize_dialogue_state, deserialize_dialogue_state, equal_state_seq
@@ -15,13 +16,16 @@ def create_rg_data(dataset, data_dir, args):
     for data_split in data_splits:
         data = []
         for sample in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False):
-            context = ' '.join([f"{turn['speaker']}: {turn['utterance']}" for turn in sample['context']])
-            response = f"{sample['speaker']}: {sample['utterance']}"
-            data.append(json.dumps({'context': context, 'response': response}, ensure_ascii=False)+'\n')
+            if len(sample['context']) == 0:
+                continue
+            context = '\n'.join([f"{turn['speaker']}: {turn['utterance']}" for turn in sample['context']]+[f'{sample["speaker"]}: '])
+            data.append(json.dumps({'context': context, 'response': sample['utterance']}, ensure_ascii=False)+'\n')
 
         file_name = os.path.join(data_dir, f"{data_split}.json")
         with open(file_name, "w", encoding='utf-8') as f:
             f.writelines(data)
+        data_by_split[data_split] = data
+    return data_by_split
 
 def create_nlu_data(dataset, data_dir, args):
     data_by_split = load_nlu_data(dataset, speaker=args.speaker, use_context=args.context_window_size>0, context_window_size=args.context_window_size)
@@ -34,7 +38,7 @@ def create_nlu_data(dataset, data_dir, args):
         for sample in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False):
             response = f"{sample['speaker']}: {sample['utterance']}"
             if args.context_window_size>0:
-                context = ' '.join([f"{turn['speaker']}: {turn['utterance']}" for turn in sample['context']]+[response])
+                context = '\n'.join([f"{turn['speaker']}: {turn['utterance']}" for turn in sample['context']]+[response])
             else:
                 context = response
             dialogue_acts_seq = serialize_dialogue_acts(sample['dialogue_acts'])
@@ -44,6 +48,8 @@ def create_nlu_data(dataset, data_dir, args):
         file_name = os.path.join(data_dir, f"{data_split}.json")
         with open(file_name, "w", encoding='utf-8') as f:
             f.writelines(data)
+        data_by_split[data_split] = data
+    return data_by_split
 
 def create_dst_data(dataset, data_dir, args):
     data_by_split = load_dst_data(dataset, speaker=args.speaker, use_context=args.context_window_size>0, context_window_size=args.context_window_size)
@@ -56,7 +62,7 @@ def create_dst_data(dataset, data_dir, args):
         for sample in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False):
             response = f"{sample['speaker']}: {sample['utterance']}"
             if args.context_window_size>0:
-                context = ' '.join([f"{turn['speaker']}: {turn['utterance']}" for turn in sample['context']]+[response])
+                context = '\n'.join([f"{turn['speaker']}: {turn['utterance']}" for turn in sample['context']]+[response])
             else:
                 context = response
             state_seq = serialize_dialogue_state(sample['state'])
@@ -66,6 +72,32 @@ def create_dst_data(dataset, data_dir, args):
         file_name = os.path.join(data_dir, f"{data_split}.json")
         with open(file_name, "w", encoding='utf-8') as f:
             f.writelines(data)
+        data_by_split[data_split] = data
+    return data_by_split
+
+def create_nlg_data(dataset, data_dir, args):
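+    # NLG reuses the NLU data: serialized dialogue acts (plus optional context) form the input, the utterance is the target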
+    data_by_split = load_nlu_data(dataset, speaker=args.speaker, use_context=args.context_window_size>0, context_window_size=args.context_window_size)
+    data_dir = os.path.join(data_dir, args.speaker, f'context_{args.context_window_size}')
+    os.makedirs(data_dir, exist_ok=True)
+
+    data_splits = data_by_split.keys()
+    for data_split in data_splits:
+        data = []
+        for sample in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False):
+            dialogue_acts_seq = serialize_dialogue_acts(sample['dialogue_acts'])
+            if args.context_window_size>0:
+                context = '\n'.join([f"{turn['speaker']}: {turn['utterance']}" for turn in sample['context']]+[f'{sample["speaker"]}: '])
+                context = f'{dialogue_acts_seq}\n{context}'
+            else:
+                context = f'{dialogue_acts_seq}\n{sample["speaker"]}: '
+            assert equal_da_seq(sample['dialogue_acts'], dialogue_acts_seq), print(sample['dialogue_acts'], dialogue_acts_seq, deserialize_dialogue_acts(dialogue_acts_seq))
+            data.append(json.dumps({'context+da': context, 'response': sample['utterance']}, ensure_ascii=False)+'\n')
+
+        file_name = os.path.join(data_dir, f"{data_split}.json")
+        with open(file_name, "w", encoding='utf-8') as f:
+            f.writelines(data)
+        data_by_split[data_split] = data
+    return data_by_split
 
 def create_goal2dialogue_data(dataset, data_dir, args):
     data_by_split = dataset
@@ -76,25 +108,44 @@ def create_goal2dialogue_data(dataset, data_dir, args):
         data = []
         for sample in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False):
             goal = re.sub(r'<.*?>', '', sample['goal']['description'])
-            dialogue = ' '.join([f"{turn['speaker']}: {turn['utterance']}" for turn in sample['turns']])
+            dialogue = '\n'.join([f"{turn['speaker']}: {turn['utterance']}" for turn in sample['turns']])
             data.append(json.dumps({'goal': goal, 'dialogue': dialogue}, ensure_ascii=False)+'\n')
 
         file_name = os.path.join(data_dir, f"{data_split}.json")
         with open(file_name, "w", encoding='utf-8') as f:
             f.writelines(data)
+        data_by_split[data_split] = data
+    return data_by_split
+
+def get_max_len(data_by_split, tokenizer):
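+    # report the max/mean tokenized length of every column to help choose --max_length for training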
+    for data_split in data_by_split.keys():
+        seq_len = {}
+        for line in data_by_split[data_split]:
+            item = json.loads(line.strip())
+            for column, seq in item.items():
+                seq_len.setdefault(column, [])
+                seq_len[column].append(len(tokenizer.tokenize(seq)))
+        print(f"data split: {data_split}")
+        for column, lens in seq_len.items():
+            print(f'\t{column}\tmax_len: {max(lens)}\tmean_len: {round(sum(lens)/len(lens),2)}')
 
 
 if __name__ == '__main__':
     from argparse import ArgumentParser
     parser = ArgumentParser(description="create data for seq2seq training")
-    parser.add_argument('--tasks', '-t', metavar='task_name', nargs='*', choices=['rg', 'nlu', 'dst', 'goal2dialogue'], help='names of tasks')
+    parser.add_argument('--tasks', '-t', metavar='task_name', nargs='*', choices=['rg', 'nlu', 'dst', 'nlg', 'goal2dialogue'], help='names of tasks')
     parser.add_argument('--datasets', '-d', metavar='dataset_name', nargs='*', help='names of unified datasets')
     parser.add_argument('--speaker', '-s', type=str, choices=['user', 'system', 'all'], help='speaker(s)')
     parser.add_argument('--context_window_size', '-c', type=int, default=0, help='how many contextual utterances are considered')
+    parser.add_argument('--len_tokenizer', '-l', type=str, default=None, help='name or path of the tokenizer used to compute sequence lengths')
     args = parser.parse_args()
     print(args)
+    if args.len_tokenizer:
+        tokenizer = AutoTokenizer.from_pretrained(args.len_tokenizer)
     for dataset_name in tqdm(args.datasets, desc='datasets'):
         dataset = load_dataset(dataset_name)
         for task_name in tqdm(args.tasks, desc='tasks', leave=False):
             data_dir = os.path.join('data', task_name, dataset_name)
-            eval(f"create_{task_name}_data")(dataset, data_dir, args)
+            data_by_split = eval(f"create_{task_name}_data")(dataset, data_dir, args)
+            if args.len_tokenizer:
+                get_max_len(data_by_split, tokenizer)
diff --git a/convlab2/base_models/t5/dst/serialization.py b/convlab2/base_models/t5/dst/serialization.py
index 6ccf25ae76048a11834566163591366ae5cdc61a..f452ae2d8ef3f63844b7df0c5034c145080771d1 100644
--- a/convlab2/base_models/t5/dst/serialization.py
+++ b/convlab2/base_models/t5/dst/serialization.py
@@ -1,32 +1,38 @@
 def serialize_dialogue_state(state):
-    state_seqs = []
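+    # serialize the state as "[domain]([slot][value],[slot][value],...);[domain](...)"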
+    state_dict = {}
     for domain in state:
-        for slot, value in state[domain].items():
+        for slot, value in sorted(state[domain].items()):
             if len(value) > 0:
-                state_seqs.append(f'[{domain}][{slot}][{value}]')
-    
-    return ';'.join(state_seqs)
+                state_dict.setdefault(f'[{domain}]', [])
+                state_dict[f'[{domain}]'].append(f'[{slot}][{value}]')
+    return ';'.join([domain+'('+','.join(slot_values)+')' for domain, slot_values in state_dict.items()])
 
 def deserialize_dialogue_state(state_seq):
     state = {}
     if len(state_seq) == 0:
         return state
-    state_seqs = state_seq.split('];[')
+    state_seqs = state_seq.split(']);[')  # will consume "])" and "["
     for i, state_seq in enumerate(state_seqs):
-        if len(state_seq) == 0:
+        if len(state_seq) == 0 or len(state_seq.split(']([')) != 2:
             continue
         if i == 0:
             if state_seq[0] == '[':
                 state_seq = state_seq[1:]
         if i == len(state_seqs) - 1:
-            if state_seq[-1] == ']':
-                state_seq = state_seq[:-1]
-        s = state_seq.split('][')
-        if len(s) != 3:
+            if state_seq[-2:] == '])':
+                state_seq = state_seq[:-2]
+        
+        try:
+            domain, slot_values = state_seq.split(']([')
+        except ValueError:
             continue
-        domain, slot, value = s
-        state.setdefault(domain, {})
-        state[domain][slot] = value
+        for slot_value in slot_values.split('],['):
+            try:
+                slot, value = slot_value.split('][')
+            except ValueError:
+                continue
+            state.setdefault(domain, {})
+            state[domain][slot] = value
     return state
 
 def equal_state_seq(state, state_seq):
diff --git a/convlab2/base_models/t5/nlg/nlg_metric.py b/convlab2/base_models/t5/nlg/nlg_metric.py
new file mode 100644
index 0000000000000000000000000000000000000000..0c0155fff3fcd021a2d07c25528e75c4fd912d83
--- /dev/null
+++ b/convlab2/base_models/t5/nlg/nlg_metric.py
@@ -0,0 +1,80 @@
+# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""NLG Metric"""
+
+import datasets
+import sacrebleu
+
+# BibTeX citation for sacreBLEU (Post, 2018), which is used to compute BLEU
+_CITATION = """\
+@inproceedings{post-2018-call,
+    title = "A Call for Clarity in Reporting {BLEU} Scores",
+    author = "Post, Matt",
+    booktitle = "Proceedings of the Third Conference on Machine Translation: Research Papers",
+    month = oct,
+    year = "2018",
+    address = "Belgium, Brussels",
+    publisher = "Association for Computational Linguistics",
+    url = "https://www.aclweb.org/anthology/W18-6319",
+    pages = "186--191",
+}
+"""
+
+_DESCRIPTION = """\
+Metric to evaluate text-to-text models on the natural language generation task.
+"""
+
+_KWARGS_DESCRIPTION = """
+Calculates corpus-bleu4
+Args:
+    predictions: list of predictions to score. Each predictions
+        should be a string.
+    references: list of reference for each prediction. Each
+        reference should be a string.
+Returns:
+    bleu: corpus-bleu score
+Examples:
+
+    >>> nlg_metric = datasets.load_metric("nlg_metric.py")
+    >>> predictions = ["hello there general kenobi", "foo bar foobar"]
+    >>> references = ["hello there kenobi", "foo bar foobar"]
+    >>> results = nlg_metric.compute(predictions=predictions, references=references)
+    >>> print(results)
+    {'bleu': 35.35533905932737}
+"""
+
+
+@datasets.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
+class NLGMetrics(datasets.Metric):
+    """Metric to evaluate text-to-text models on the natural language generation task."""
+
+    def _info(self):
+        return datasets.MetricInfo(
+            description=_DESCRIPTION,
+            citation=_CITATION,
+            inputs_description=_KWARGS_DESCRIPTION,
+            # This defines the format of each prediction and reference
+            features=datasets.Features({
+                'predictions': datasets.Value('string'),
+                'references': datasets.Value('string'),
+            })
+        )
+
+    def _compute(self, predictions, references):
+        """Returns the scores: bleu"""
+        bleu = sacrebleu.corpus_bleu(predictions, [references], lowercase=True).score
+        
+        return {
+            "bleu": bleu
+        }
diff --git a/convlab2/base_models/t5/nlu/nlu_metric.py b/convlab2/base_models/t5/nlu/nlu_metric.py
index a2f3c0bc4f9a50464eebc67d220964c39a45f2be..d337887201e683eebb5bfa932714e2ee4e2cfae2 100644
--- a/convlab2/base_models/t5/nlu/nlu_metric.py
+++ b/convlab2/base_models/t5/nlu/nlu_metric.py
@@ -36,21 +36,15 @@ Returns:
     seq_em: sequence exact match
     accuracy: dialog acts accuracy
     overall_f1: dialog acts overall f1
-    binary_f1: binary dialog acts f1
-    categorical_f1: categorical dialog acts f1
-    non-categorical_f1: non-categorical dialog acts f1
 Examples:
 
     >>> nlu_metric = datasets.load_metric("nlu_metric.py")
-    >>> predictions = ["[binary][thank][general][]", "[non-categorical][inform][taxi][leave at][17:15]"]
-    >>> references = ["[binary][thank][general][]", "[non-categorical][inform][train][leave at][17:15]"]
+    >>> predictions = ["[thank][general]{[][]}", "[inform][taxi]{[leave at][17:15]}"]
+    >>> references = ["[thank][general]{[][]}", "[inform][train]{[leave at][17:15]}"]
     >>> results = nlu_metric.compute(predictions=predictions, references=references)
     >>> print(results)
     {'seq_em': 0.5, 'accuracy': 0.5, 
-    'overall_f1': 0.5, 'overall_precision': 0.5, 'overall_recall': 0.5, 
-    'binary_f1': 1.0, 'binary_precision': 1.0, 'binary_recall': 1.0, 
-    'categorical_f1': 0.0, 'categorical_precision': 0.0, 'categorical_recall': 0.0, 
-    'non-categorical_f1': 0.0, 'non-categorical_precision': 0.0, 'non-categorical_recall': 0.0}
+    'overall_f1': 0.5, 'overall_precision': 0.5, 'overall_recall': 0.5}
 """
 
 
@@ -74,45 +68,33 @@ class NLUMetrics(datasets.Metric):
         """Returns the scores: sequence exact match, dialog acts accuracy and f1"""
         seq_em = []
         acc = []
-        f1_metrics = {x: {'TP':0, 'FP':0, 'FN':0} for x in ['overall', 'binary', 'categorical', 'non-categorical']}
+        f1_metrics = {'TP':0, 'FP':0, 'FN':0}
 
         for prediction, reference in zip(predictions, references):
             seq_em.append(prediction.strip()==reference.strip())
             pred_da = deserialize_dialogue_acts(prediction)
             gold_da = deserialize_dialogue_acts(reference)
-            flag = True
-            for da_type in ['binary', 'categorical', 'non-categorical']:
-                if da_type == 'binary':
-                    predicts = sorted(list({(x['intent'], x['domain'], x['slot']) for x in pred_da[da_type]}))
-                    labels = sorted(list({(x['intent'], x['domain'], x['slot']) for x in gold_da[da_type]}))
+            pred_da = sorted(list({(da['intent'], da['domain'], da['slot'], ''.join(da.get('value', '').split()).lower()) for da in pred_da}))
+            gold_da = sorted(list({(da['intent'], da['domain'], da['slot'], ''.join(da.get('value', '').split()).lower()) for da in gold_da}))
+            acc.append(pred_da==gold_da)
+            for ele in pred_da:
+                if ele in gold_da:
+                    f1_metrics['TP'] += 1
                 else:
-                    predicts = sorted(list({(x['intent'], x['domain'], x['slot'], ''.join(x['value'].split()).lower()) for x in pred_da[da_type]}))
-                    labels = sorted(list({(x['intent'], x['domain'], x['slot'], ''.join(x['value'].split()).lower()) for x in gold_da[da_type]}))
-                for ele in predicts:
-                    if ele in labels:
-                        f1_metrics['overall']['TP'] += 1
-                        f1_metrics[da_type]['TP'] += 1
-                    else:
-                        f1_metrics['overall']['FP'] += 1
-                        f1_metrics[da_type]['FP'] += 1
-                for ele in labels:
-                    if ele not in predicts:
-                        f1_metrics['overall']['FN'] += 1
-                        f1_metrics[da_type]['FN'] += 1
-                flag &= (predicts==labels)
-            acc.append(flag)
+                    f1_metrics['FP'] += 1
+            for ele in gold_da:
+                if ele not in pred_da:
+                    f1_metrics['FN'] += 1
 
-        for metric in list(f1_metrics.keys()):
-            TP = f1_metrics[metric].pop('TP')
-            FP = f1_metrics[metric].pop('FP')
-            FN = f1_metrics[metric].pop('FN')
-            precision = 1.0 * TP / (TP + FP) if TP + FP else 0.
-            recall = 1.0 * TP / (TP + FN) if TP + FN else 0.
-            f1 = 2.0 * precision * recall / (precision + recall) if precision + recall else 0.
-            f1_metrics.pop(metric)
-            f1_metrics[f'{metric}_f1'] = f1
-            f1_metrics[f'{metric}_precision'] = precision
-            f1_metrics[f'{metric}_recall'] = recall
+        TP = f1_metrics.pop('TP')
+        FP = f1_metrics.pop('FP')
+        FN = f1_metrics.pop('FN')
+        precision = 1.0 * TP / (TP + FP) if TP + FP else 0.
+        recall = 1.0 * TP / (TP + FN) if TP + FN else 0.
+        f1 = 2.0 * precision * recall / (precision + recall) if precision + recall else 0.
+        f1_metrics['overall_f1'] = f1
+        f1_metrics['overall_precision'] = precision
+        f1_metrics['overall_recall'] = recall
 
         return {
             "seq_em": sum(seq_em)/len(seq_em),
diff --git a/convlab2/base_models/t5/nlu/run_multiwoz21_user.sh b/convlab2/base_models/t5/nlu/run_multiwoz21_user.sh
deleted file mode 100644
index a9e9d6c55b38bfa1b43b0a837e4eef9d60a7e233..0000000000000000000000000000000000000000
--- a/convlab2/base_models/t5/nlu/run_multiwoz21_user.sh
+++ /dev/null
@@ -1,68 +0,0 @@
-n_gpus=1
-task_name="nlu"
-dataset_name="multiwoz21"
-speaker="user"
-context_window_size=0
-data_dir="data/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
-output_dir="output/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
-cache_dir="../cache"
-logging_dir="${output_dir}/runs"
-train_file="${data_dir}/train.json"
-validation_file="${data_dir}/validation.json"
-test_file="${data_dir}/test.json"
-metric_name_or_path="nlu_metric.py"
-metric_for_best_model="overall_f1"
-source_column="context"
-target_column="dialogue_acts_seq"
-model_name_or_path="t5-small"
-per_device_train_batch_size=128
-per_device_eval_batch_size=64
-gradient_accumulation_steps=2
-lr=1e-3
-num_train_epochs=10
-
-python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --speaker ${speaker} --context_window_size ${context_window_size}
-
-python -m torch.distributed.launch \
-    --nproc_per_node ${n_gpus} ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --train_file ${train_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --model_name_or_path ${model_name_or_path} \
-    --do_train \
-    --save_strategy epoch \
-    --prediction_loss_only \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_train_batch_size ${per_device_train_batch_size} \
-    --gradient_accumulation_steps ${gradient_accumulation_steps} \
-    --learning_rate ${lr} \
-    --num_train_epochs ${num_train_epochs} \
-    --debug underflow_overflow \
-    --adafactor \
-    --gradient_checkpointing
-
-python -m torch.distributed.launch \
-    --nproc_per_node ${n_gpus} ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --test_file ${test_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --model_name_or_path ${output_dir} \
-    --do_predict \
-    --predict_with_generate \
-    --metric_name_or_path ${metric_name_or_path} \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
-
-python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
-
-python ../../../nlu/evaluate_unified_datasets.py -p ${output_dir}/predictions.json
diff --git a/convlab2/base_models/t5/nlu/run_multiwoz21_user_context3.sh b/convlab2/base_models/t5/nlu/run_multiwoz21_user_context3.sh
deleted file mode 100644
index e90e71d459da3bd43eeccc82d7ad192d7f751996..0000000000000000000000000000000000000000
--- a/convlab2/base_models/t5/nlu/run_multiwoz21_user_context3.sh
+++ /dev/null
@@ -1,68 +0,0 @@
-n_gpus=1
-task_name="nlu"
-dataset_name="multiwoz21"
-speaker="user"
-context_window_size=3
-data_dir="data/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
-output_dir="output/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
-cache_dir="../cache"
-logging_dir="${output_dir}/runs"
-train_file="${data_dir}/train.json"
-validation_file="${data_dir}/validation.json"
-test_file="${data_dir}/test.json"
-metric_name_or_path="nlu_metric.py"
-metric_for_best_model="overall_f1"
-source_column="context"
-target_column="dialogue_acts_seq"
-model_name_or_path="t5-small"
-per_device_train_batch_size=128
-per_device_eval_batch_size=64
-gradient_accumulation_steps=2
-lr=1e-3
-num_train_epochs=10
-
-python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --speaker ${speaker} --context_window_size ${context_window_size}
-
-python -m torch.distributed.launch \
-    --nproc_per_node ${n_gpus} ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --train_file ${train_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --model_name_or_path ${model_name_or_path} \
-    --do_train \
-    --save_strategy epoch \
-    --prediction_loss_only \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_train_batch_size ${per_device_train_batch_size} \
-    --gradient_accumulation_steps ${gradient_accumulation_steps} \
-    --learning_rate ${lr} \
-    --num_train_epochs ${num_train_epochs} \
-    --debug underflow_overflow \
-    --adafactor \
-    --gradient_checkpointing
-
-python -m torch.distributed.launch \
-    --nproc_per_node ${n_gpus} ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --test_file ${test_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --model_name_or_path ${output_dir} \
-    --do_predict \
-    --predict_with_generate \
-    --metric_name_or_path ${metric_name_or_path} \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
-
-python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
-
-python ../../../nlu/evaluate_unified_datasets.py -p ${output_dir}/predictions.json
diff --git a/convlab2/base_models/t5/dst/run_multiwoz21.sh b/convlab2/base_models/t5/nlu/run_nlu.sh
similarity index 76%
rename from convlab2/base_models/t5/dst/run_multiwoz21.sh
rename to convlab2/base_models/t5/nlu/run_nlu.sh
index 1c383cbb24130db89c4304912f74a8323dadd343..71b5f5f8e488b2c44e76637cc0152b8ff9db8671 100644
--- a/convlab2/base_models/t5/dst/run_multiwoz21.sh
+++ b/convlab2/base_models/t5/nlu/run_nlu.sh
@@ -1,8 +1,8 @@
-n_gpus=4
-task_name="dst"
-dataset_name="multiwoz21"
+n_gpus=1
+task_name="nlu"
+dataset_name=$1
 speaker="user"
-context_window_size=100
+context_window_size=$2
 data_dir="data/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
 output_dir="output/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
 cache_dir="../cache"
@@ -10,28 +10,26 @@ logging_dir="${output_dir}/runs"
 train_file="${data_dir}/train.json"
 validation_file="${data_dir}/validation.json"
 test_file="${data_dir}/test.json"
-metric_name_or_path="dst_metric.py"
-metric_for_best_model="accuracy"
+metric_name_or_path="nlu_metric.py"
+metric_for_best_model="overall_f1"
 source_column="context"
-target_column="state_seq"
+target_column="dialogue_acts_seq"
 truncation_side="left"
 max_source_length=512
 max_target_length=512
 model_name_or_path="t5-small"
-per_device_train_batch_size=64
+per_device_train_batch_size=128
 per_device_eval_batch_size=64
 gradient_accumulation_steps=2
 lr=1e-3
 num_train_epochs=10
 
-python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --speaker ${speaker} --context_window_size ${context_window_size}
+python ../create_data.py -t ${task_name} -d ${dataset_name} -s ${speaker} -c ${context_window_size}
 
-python -m torch.distributed.launch \
-    --nproc_per_node ${n_gpus} ../run_seq2seq.py \
+python ../run_seq2seq.py \
     --task_name ${task_name} \
     --train_file ${train_file} \
     --validation_file ${validation_file} \
-    --test_file ${test_file} \
     --source_column ${source_column} \
     --target_column ${target_column} \
     --max_source_length ${max_source_length} \
@@ -40,7 +38,6 @@ python -m torch.distributed.launch \
     --model_name_or_path ${model_name_or_path} \
     --do_train \
     --do_eval \
-    --do_predict \
     --save_strategy epoch \
     --evaluation_strategy epoch \
     --prediction_loss_only \
@@ -58,8 +55,7 @@ python -m torch.distributed.launch \
     --adafactor \
     --gradient_checkpointing
 
-python -m torch.distributed.launch \
-    --nproc_per_node ${n_gpus} ../run_seq2seq.py \
+python ../run_seq2seq.py \
     --task_name ${task_name} \
     --test_file ${test_file} \
     --source_column ${source_column} \
@@ -76,8 +72,8 @@ python -m torch.distributed.launch \
     --logging_dir ${logging_dir} \
     --overwrite_output_dir \
     --preprocessing_num_workers 4 \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
+    --per_device_eval_batch_size ${per_device_eval_batch_size}
 
 python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
 
-python ../../../dst/evaluate_unified_datasets.py -p ${output_dir}/predictions.json
+python ../../../nlu/evaluate_unified_datasets.py -p ${output_dir}/predictions.json
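
Usage note: since the dataset name and context window size are now positional arguments, this generic script would presumably be invoked as, e.g., `bash run_nlu.sh multiwoz21 0` or `bash run_nlu.sh tm1 3`, replacing the per-dataset scripts deleted above and below.
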
diff --git a/convlab2/base_models/t5/nlu/run_tm1_user.sh b/convlab2/base_models/t5/nlu/run_tm1_user.sh
deleted file mode 100644
index 5372441aa284067f137e892454b8fed76e251e6e..0000000000000000000000000000000000000000
--- a/convlab2/base_models/t5/nlu/run_tm1_user.sh
+++ /dev/null
@@ -1,68 +0,0 @@
-n_gpus=1
-task_name="nlu"
-dataset_name="tm1"
-speaker="user"
-context_window_size=0
-data_dir="data/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
-output_dir="output/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
-cache_dir="../cache"
-logging_dir="${output_dir}/runs"
-train_file="${data_dir}/train.json"
-validation_file="${data_dir}/validation.json"
-test_file="${data_dir}/test.json"
-metric_name_or_path="nlu_metric.py"
-metric_for_best_model="overall_f1"
-source_column="context"
-target_column="dialogue_acts_seq"
-model_name_or_path="t5-small"
-per_device_train_batch_size=128
-per_device_eval_batch_size=64
-gradient_accumulation_steps=2
-lr=1e-3
-num_train_epochs=10
-
-python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --speaker ${speaker} --context_window_size ${context_window_size}
-
-python -m torch.distributed.launch \
-    --nproc_per_node ${n_gpus} ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --train_file ${train_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --model_name_or_path ${model_name_or_path} \
-    --do_train \
-    --save_strategy epoch \
-    --prediction_loss_only \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_train_batch_size ${per_device_train_batch_size} \
-    --gradient_accumulation_steps ${gradient_accumulation_steps} \
-    --learning_rate ${lr} \
-    --num_train_epochs ${num_train_epochs} \
-    --debug underflow_overflow \
-    --adafactor \
-    --gradient_checkpointing
-
-python -m torch.distributed.launch \
-    --nproc_per_node ${n_gpus} ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --test_file ${test_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --model_name_or_path ${output_dir} \
-    --do_predict \
-    --predict_with_generate \
-    --metric_name_or_path ${metric_name_or_path} \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
-
-python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
-
-python ../../../nlu/evaluate_unified_datasets.py -p ${output_dir}/predictions.json
diff --git a/convlab2/base_models/t5/nlu/run_tm1_user_context3.sh b/convlab2/base_models/t5/nlu/run_tm1_user_context3.sh
deleted file mode 100644
index 65482a1f517b7c1eb8607e0858d3ae576d1483b4..0000000000000000000000000000000000000000
--- a/convlab2/base_models/t5/nlu/run_tm1_user_context3.sh
+++ /dev/null
@@ -1,68 +0,0 @@
-n_gpus=1
-task_name="nlu"
-dataset_name="tm1"
-speaker="user"
-context_window_size=3
-data_dir="data/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
-output_dir="output/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
-cache_dir="../cache"
-logging_dir="${output_dir}/runs"
-train_file="${data_dir}/train.json"
-validation_file="${data_dir}/validation.json"
-test_file="${data_dir}/test.json"
-metric_name_or_path="nlu_metric.py"
-metric_for_best_model="overall_f1"
-source_column="context"
-target_column="dialogue_acts_seq"
-model_name_or_path="t5-small"
-per_device_train_batch_size=64
-per_device_eval_batch_size=64
-gradient_accumulation_steps=4
-lr=1e-3
-num_train_epochs=10
-
-python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --speaker ${speaker} --context_window_size ${context_window_size}
-
-python -m torch.distributed.launch \
-    --nproc_per_node ${n_gpus} ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --train_file ${train_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --model_name_or_path ${model_name_or_path} \
-    --do_train \
-    --save_strategy epoch \
-    --prediction_loss_only \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_train_batch_size ${per_device_train_batch_size} \
-    --gradient_accumulation_steps ${gradient_accumulation_steps} \
-    --learning_rate ${lr} \
-    --num_train_epochs ${num_train_epochs} \
-    --debug underflow_overflow \
-    --adafactor \
-    --gradient_checkpointing
-
-python -m torch.distributed.launch \
-    --nproc_per_node ${n_gpus} ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --test_file ${test_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --model_name_or_path ${output_dir} \
-    --do_predict \
-    --predict_with_generate \
-    --metric_name_or_path ${metric_name_or_path} \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
-
-python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
-
-python ../../../nlu/evaluate_unified_datasets.py -p ${output_dir}/predictions.json
diff --git a/convlab2/base_models/t5/nlu/run_tm2_user.sh b/convlab2/base_models/t5/nlu/run_tm2_user.sh
deleted file mode 100644
index 84dc8b71ae560dcb481dee8bcfc31340ed4a778d..0000000000000000000000000000000000000000
--- a/convlab2/base_models/t5/nlu/run_tm2_user.sh
+++ /dev/null
@@ -1,68 +0,0 @@
-n_gpus=1
-task_name="nlu"
-dataset_name="tm2"
-speaker="user"
-context_window_size=0
-data_dir="data/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
-output_dir="output/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
-cache_dir="../cache"
-logging_dir="${output_dir}/runs"
-train_file="${data_dir}/train.json"
-validation_file="${data_dir}/validation.json"
-test_file="${data_dir}/test.json"
-metric_name_or_path="nlu_metric.py"
-metric_for_best_model="overall_f1"
-source_column="context"
-target_column="dialogue_acts_seq"
-model_name_or_path="t5-small"
-per_device_train_batch_size=128
-per_device_eval_batch_size=64
-gradient_accumulation_steps=2
-lr=1e-3
-num_train_epochs=10
-
-python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --speaker ${speaker} --context_window_size ${context_window_size}
-
-python -m torch.distributed.launch \
-    --nproc_per_node ${n_gpus} ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --train_file ${train_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --model_name_or_path ${model_name_or_path} \
-    --do_train \
-    --save_strategy epoch \
-    --prediction_loss_only \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_train_batch_size ${per_device_train_batch_size} \
-    --gradient_accumulation_steps ${gradient_accumulation_steps} \
-    --learning_rate ${lr} \
-    --num_train_epochs ${num_train_epochs} \
-    --debug underflow_overflow \
-    --adafactor \
-    --gradient_checkpointing
-
-python -m torch.distributed.launch \
-    --nproc_per_node ${n_gpus} ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --test_file ${test_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --model_name_or_path ${output_dir} \
-    --do_predict \
-    --predict_with_generate \
-    --metric_name_or_path ${metric_name_or_path} \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
-
-python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
-
-python ../../../nlu/evaluate_unified_datasets.py -p ${output_dir}/predictions.json
diff --git a/convlab2/base_models/t5/nlu/run_tm2_user_context3.sh b/convlab2/base_models/t5/nlu/run_tm2_user_context3.sh
deleted file mode 100644
index abca0a60a98c1a71295ae1ad77791dec6b482547..0000000000000000000000000000000000000000
--- a/convlab2/base_models/t5/nlu/run_tm2_user_context3.sh
+++ /dev/null
@@ -1,68 +0,0 @@
-n_gpus=1
-task_name="nlu"
-dataset_name="tm2"
-speaker="user"
-context_window_size=3
-data_dir="data/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
-output_dir="output/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
-cache_dir="../cache"
-logging_dir="${output_dir}/runs"
-train_file="${data_dir}/train.json"
-validation_file="${data_dir}/validation.json"
-test_file="${data_dir}/test.json"
-metric_name_or_path="nlu_metric.py"
-metric_for_best_model="overall_f1"
-source_column="context"
-target_column="dialogue_acts_seq"
-model_name_or_path="t5-small"
-per_device_train_batch_size=128
-per_device_eval_batch_size=64
-gradient_accumulation_steps=2
-lr=1e-3
-num_train_epochs=10
-
-python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --speaker ${speaker} --context_window_size ${context_window_size}
-
-python -m torch.distributed.launch \
-    --nproc_per_node ${n_gpus} ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --train_file ${train_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --model_name_or_path ${model_name_or_path} \
-    --do_train \
-    --save_strategy epoch \
-    --prediction_loss_only \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_train_batch_size ${per_device_train_batch_size} \
-    --gradient_accumulation_steps ${gradient_accumulation_steps} \
-    --learning_rate ${lr} \
-    --num_train_epochs ${num_train_epochs} \
-    --debug underflow_overflow \
-    --adafactor \
-    --gradient_checkpointing
-
-python -m torch.distributed.launch \
-    --nproc_per_node ${n_gpus} ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --test_file ${test_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --model_name_or_path ${output_dir} \
-    --do_predict \
-    --predict_with_generate \
-    --metric_name_or_path ${metric_name_or_path} \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
-
-python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
-
-python ../../../nlu/evaluate_unified_datasets.py -p ${output_dir}/predictions.json
diff --git a/convlab2/base_models/t5/nlu/run_tm3_user.sh b/convlab2/base_models/t5/nlu/run_tm3_user.sh
deleted file mode 100644
index 689a626c9a871581b49eb84a7db1e9af1152e32e..0000000000000000000000000000000000000000
--- a/convlab2/base_models/t5/nlu/run_tm3_user.sh
+++ /dev/null
@@ -1,68 +0,0 @@
-n_gpus=1
-task_name="nlu"
-dataset_name="tm3"
-speaker="user"
-context_window_size=0
-data_dir="data/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
-output_dir="output/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
-cache_dir="../cache"
-logging_dir="${output_dir}/runs"
-train_file="${data_dir}/train.json"
-validation_file="${data_dir}/validation.json"
-test_file="${data_dir}/test.json"
-metric_name_or_path="nlu_metric.py"
-metric_for_best_model="overall_f1"
-source_column="context"
-target_column="dialogue_acts_seq"
-model_name_or_path="t5-small"
-per_device_train_batch_size=128
-per_device_eval_batch_size=64
-gradient_accumulation_steps=2
-lr=1e-3
-num_train_epochs=10
-
-python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --speaker ${speaker} --context_window_size ${context_window_size}
-
-python -m torch.distributed.launch \
-    --nproc_per_node ${n_gpus} ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --train_file ${train_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --model_name_or_path ${model_name_or_path} \
-    --do_train \
-    --save_strategy epoch \
-    --prediction_loss_only \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_train_batch_size ${per_device_train_batch_size} \
-    --gradient_accumulation_steps ${gradient_accumulation_steps} \
-    --learning_rate ${lr} \
-    --num_train_epochs ${num_train_epochs} \
-    --debug underflow_overflow \
-    --adafactor \
-    --gradient_checkpointing
-
-python -m torch.distributed.launch \
-    --nproc_per_node ${n_gpus} ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --test_file ${test_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --model_name_or_path ${output_dir} \
-    --do_predict \
-    --predict_with_generate \
-    --metric_name_or_path ${metric_name_or_path} \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
-
-python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
-
-python ../../../nlu/evaluate_unified_datasets.py -p ${output_dir}/predictions.json
diff --git a/convlab2/base_models/t5/nlu/run_tm3_user_context3.sh b/convlab2/base_models/t5/nlu/run_tm3_user_context3.sh
deleted file mode 100644
index e2ded66dbe940387f1997ba92028ef4dc4a5b5c5..0000000000000000000000000000000000000000
--- a/convlab2/base_models/t5/nlu/run_tm3_user_context3.sh
+++ /dev/null
@@ -1,68 +0,0 @@
-n_gpus=1
-task_name="nlu"
-dataset_name="tm3"
-speaker="user"
-context_window_size=3
-data_dir="data/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
-output_dir="output/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
-cache_dir="../cache"
-logging_dir="${output_dir}/runs"
-train_file="${data_dir}/train.json"
-validation_file="${data_dir}/validation.json"
-test_file="${data_dir}/test.json"
-metric_name_or_path="nlu_metric.py"
-metric_for_best_model="overall_f1"
-source_column="context"
-target_column="dialogue_acts_seq"
-model_name_or_path="t5-small"
-per_device_train_batch_size=128
-per_device_eval_batch_size=64
-gradient_accumulation_steps=2
-lr=1e-3
-num_train_epochs=10
-
-python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --speaker ${speaker} --context_window_size ${context_window_size}
-
-python -m torch.distributed.launch \
-    --nproc_per_node ${n_gpus} ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --train_file ${train_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --model_name_or_path ${model_name_or_path} \
-    --do_train \
-    --save_strategy epoch \
-    --prediction_loss_only \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_train_batch_size ${per_device_train_batch_size} \
-    --gradient_accumulation_steps ${gradient_accumulation_steps} \
-    --learning_rate ${lr} \
-    --num_train_epochs ${num_train_epochs} \
-    --debug underflow_overflow \
-    --adafactor \
-    --gradient_checkpointing
-
-python -m torch.distributed.launch \
-    --nproc_per_node ${n_gpus} ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --test_file ${test_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --model_name_or_path ${output_dir} \
-    --do_predict \
-    --predict_with_generate \
-    --metric_name_or_path ${metric_name_or_path} \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
-
-python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
-
-python ../../../nlu/evaluate_unified_datasets.py -p ${output_dir}/predictions.json
diff --git a/convlab2/base_models/t5/nlu/serialization.py b/convlab2/base_models/t5/nlu/serialization.py
index 5a620f4689519accaccdc1149a54ed6c8efb52d8..bb1a8b88b81ca23e97271c9bae211fbfd775149b 100644
--- a/convlab2/base_models/t5/nlu/serialization.py
+++ b/convlab2/base_models/t5/nlu/serialization.py
@@ -1,51 +1,46 @@
 def serialize_dialogue_acts(dialogue_acts):
-    da_seqs = []
+    da_dict = {}
     for da_type in dialogue_acts:
         for da in dialogue_acts[da_type]:
-            intent, domain, slot = da['intent'], da['domain'], da['slot']
-            if da_type == 'binary':
-                da_seq = f'[{da_type}][{intent}][{domain}][{slot}]'
-            else:
-                value = da['value']
-                da_seq = f'[{da_type}][{intent}][{domain}][{slot}][{value}]'
-            da_seqs.append(da_seq)
-    return ';'.join(da_seqs)
+            intent, domain, slot, value = da['intent'], da['domain'], da['slot'], da.get('value', '')
+            intent_domain = f'[{intent}][{domain}]'
+            da_dict.setdefault(intent_domain, [])
+            da_dict[intent_domain].append(f'[{slot}][{value}]')
+    return ';'.join([intent_domain+'('+','.join(slot_values)+')' for intent_domain, slot_values in da_dict.items()])
 
 def deserialize_dialogue_acts(das_seq):
-    dialogue_acts = {'binary': [], 'categorical': [], 'non-categorical': []}
+    dialogue_acts = []
     if len(das_seq) == 0:
         return dialogue_acts
-    da_seqs = das_seq.split('];[')
+    da_seqs = das_seq.split(']);[')  # will consume "])" and "["
     for i, da_seq in enumerate(da_seqs):
-        if len(da_seq) == 0:
+        if len(da_seq) == 0 or len(da_seq.split(']([')) != 2:
             continue
         if i == 0:
             if da_seq[0] == '[':
                 da_seq = da_seq[1:]
         if i == len(da_seqs) - 1:
-            if da_seq[-1] == ']':
-                da_seq = da_seq[:-1]
-        da = da_seq.split('][')
-        if len(da) == 0:
+            if da_seq[-2:] == '])':
+                da_seq = da_seq[:-2]
+        
+        try:
+            intent_domain, slot_values = da_seq.split(']([')
+            intent, domain = intent_domain.split('][')
+        except ValueError:  # malformed act group, skip
             continue
-        da_type = da[0]
-        if len(da) == 5 and da_type in ['categorical', 'non-categorical']:
-            dialogue_acts[da_type].append({'intent': da[1], 'domain': da[2], 'slot': da[3], 'value': da[4]})
-        elif len(da) == 4 and da_type == 'binary':
-            dialogue_acts[da_type].append({'intent': da[1], 'domain': da[2], 'slot': da[3]})
-        else:
-            # invalid da format, skip
-            # print(das_seq)
-            # print(da_seq)
-            # print()
-            pass
+        for slot_value in slot_values.split('],['):
+            try:
+                slot, value = slot_value.split('][')
+            except ValueError:  # malformed slot-value pair, skip
+                continue
+            dialogue_acts.append({'intent': intent, 'domain': domain, 'slot': slot, 'value': value})
+        
     return dialogue_acts
 
 def equal_da_seq(dialogue_acts, das_seq):
     predict_dialogue_acts = deserialize_dialogue_acts(das_seq)
-    for da_type in ['binary', 'categorical', 'non-categorical']:
-        das = sorted([(da['intent'], da['domain'], da['slot'], da.get('value', '')) for da in dialogue_acts[da_type]])
-        predict_das = sorted([(da['intent'], da['domain'], da['slot'], da.get('value', '')) for da in predict_dialogue_acts[da_type]])
-        if das != predict_das:
-            return False
+    das = sorted([(da['intent'], da['domain'], da['slot'], da.get('value', '')) for da_type in ['binary', 'categorical', 'non-categorical'] for da in dialogue_acts[da_type]])
+    predict_das = sorted([(da['intent'], da['domain'], da['slot'], da.get('value', '')) for da in predict_dialogue_acts])
+    if das != predict_das:
+        return False
     return True
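
To illustrate the new flat serialization, in which slot-value pairs are grouped under an [intent][domain] header, here is a small round-trip sketch over a hypothetical dialogue-act dict (assuming the module is importable from the repository root):

from convlab2.base_models.t5.nlu.serialization import serialize_dialogue_acts, deserialize_dialogue_acts

# Hypothetical input in the unified dialogue-act format.
dialogue_acts = {
    'binary': [{'intent': 'thank', 'domain': 'general', 'slot': ''}],
    'categorical': [],
    'non-categorical': [{'intent': 'inform', 'domain': 'taxi', 'slot': 'leave at', 'value': '17:15'}],
}
seq = serialize_dialogue_acts(dialogue_acts)
# '[thank][general]([][]);[inform][taxi]([leave at][17:15])'
print(deserialize_dialogue_acts(seq))
# [{'intent': 'thank', 'domain': 'general', 'slot': '', 'value': ''},
#  {'intent': 'inform', 'domain': 'taxi', 'slot': 'leave at', 'value': '17:15'}]
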
diff --git a/convlab2/base_models/t5/run_seq2seq.py b/convlab2/base_models/t5/run_seq2seq.py
index dace9713d540b7fe2aa1c552132cc4c54d698989..2f0f5481243c2f78eac4d352786482508f70e617 100644
--- a/convlab2/base_models/t5/run_seq2seq.py
+++ b/convlab2/base_models/t5/run_seq2seq.py
@@ -212,8 +212,8 @@ class DataTrainingArguments:
             "help": "Whether to ignore the tokens corresponding to padded labels in the loss computation or not."
         },
     )
-    source_prefix_filepath: Optional[str] = field(
-        default=None, metadata={"help": "A file whose first line is the prefix to add before every source text (useful for T5 models)."}
+    source_prefix: Optional[str] = field(
+        default=None, metadata={"help": "A prefix to add before every source text (useful for T5 models)."}
     )
 
     def __post_init__(self):
@@ -271,7 +271,7 @@ def main():
     )
     logger.info(f"Training/evaluation parameters {training_args}")
 
-    if data_args.source_prefix_filepath is None and model_args.model_name_or_path in [
+    if data_args.source_prefix is None and model_args.model_name_or_path in [
         "t5-small",
         "t5-base",
         "t5-large",
@@ -280,7 +280,7 @@ def main():
     ]:
         logger.warning(
             "You're running a t5 model but didn't provide a source prefix, which is the expected, e.g. with "
-            "`--source_prefix_filepath 'path_to_prefix_file' ` whose first line is the source prefix"
+            "`--source_prefix 'summarize: ' `"
         )
 
     # Detecting last checkpoint.
@@ -386,10 +386,7 @@ def main():
                 "resize the model's position encodings by passing `--resize_position_embeddings`."
             )
 
-    if data_args.source_prefix_filepath is not None:
-        prefix = open(data_args.source_prefix_filepath, 'r', encoding='utf-8').readline().strip('\n')
-    else:
-        prefix = ""
+    prefix = data_args.source_prefix if data_args.source_prefix is not None else ""
     
     logger.info(f'source prefix: "{prefix}"')
 
diff --git a/convlab2/nlu/evaluate_unified_datasets.py b/convlab2/nlu/evaluate_unified_datasets.py
index 907b1afaaee6788c1e90e3bd85b67b3360c9c2da..2eee44d7a02b405d526ce9389993b9cf0783d48a 100644
--- a/convlab2/nlu/evaluate_unified_datasets.py
+++ b/convlab2/nlu/evaluate_unified_datasets.py
@@ -10,28 +10,43 @@ def evaluate(predict_result):
 
     for sample in predict_result:
         flag = True
-        for da_type in ['binary', 'categorical', 'non-categorical']:
-            if da_type == 'binary':
-                predicts = [(x['intent'], x['domain'], x['slot']) for x in sample['predictions']['dialogue_acts'][da_type]]
-                labels = [(x['intent'], x['domain'], x['slot']) for x in sample['dialogue_acts'][da_type]]
-            else:
-                predicts = [(x['intent'], x['domain'], x['slot'], ''.join(x['value'].split()).lower()) for x in sample['predictions']['dialogue_acts'][da_type]]
-                labels = [(x['intent'], x['domain'], x['slot'], ''.join(x['value'].split()).lower()) for x in sample['dialogue_acts'][da_type]]
-            predicts = sorted(list(set(predicts)))
-            labels = sorted(list(set(labels)))
-            for ele in predicts:
-                if ele in labels:
+        if isinstance(sample['predictions']['dialogue_acts'], dict):
+            for da_type in ['binary', 'categorical', 'non-categorical']:
+                if da_type == 'binary':
+                    predicts = [(x['intent'], x['domain'], x['slot']) for x in sample['predictions']['dialogue_acts'][da_type]]
+                    labels = [(x['intent'], x['domain'], x['slot']) for x in sample['dialogue_acts'][da_type]]
+                else:
+                    predicts = [(x['intent'], x['domain'], x['slot'], ''.join(x['value'].split()).lower()) for x in sample['predictions']['dialogue_acts'][da_type]]
+                    labels = [(x['intent'], x['domain'], x['slot'], ''.join(x['value'].split()).lower()) for x in sample['dialogue_acts'][da_type]]
+                predicts = sorted(list(set(predicts)))
+                labels = sorted(list(set(labels)))
+                for ele in predicts:
+                    if ele in labels:
+                        metrics['overall']['TP'] += 1
+                        metrics[da_type]['TP'] += 1
+                    else:
+                        metrics['overall']['FP'] += 1
+                        metrics[da_type]['FP'] += 1
+                for ele in labels:
+                    if ele not in predicts:
+                        metrics['overall']['FN'] += 1
+                        metrics[da_type]['FN'] += 1
+                flag &= (predicts==labels)
+            acc.append(flag)
+        elif isinstance(sample['predictions']['dialogue_acts'], list):
+            gold_da = sorted(list({(da['intent'], da['domain'], da['slot'], ''.join(da.get('value', '').split()).lower()) for da_type in ['binary', 'categorical', 'non-categorical'] for da in sample['dialogue_acts'][da_type]}))
+            pred_da = sorted(list({(da['intent'], da['domain'], da['slot'], ''.join(da.get('value', '').split()).lower()) for da in sample['predictions']['dialogue_acts']}))
+            acc.append(pred_da==gold_da)
+            for ele in pred_da:
+                if ele in gold_da:
                     metrics['overall']['TP'] += 1
-                    metrics[da_type]['TP'] += 1
                 else:
                     metrics['overall']['FP'] += 1
-                    metrics[da_type]['FP'] += 1
-            for ele in labels:
-                if ele not in predicts:
+            for ele in gold_da:
+                if ele not in pred_da:
                     metrics['overall']['FN'] += 1
-                    metrics[da_type]['FN'] += 1
-            flag &= (predicts==labels)
-        acc.append(flag)
+        else:
+            raise TypeError("predictions['dialogue_acts'] should be a dict or a list")
     
     for metric in metrics:
         TP = metrics[metric].pop('TP')
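
The evaluation script now accepts predicted dialogue acts either in the original dict form (grouped into binary/categorical/non-categorical) or as the flat list produced by the T5 NLU deserialization above. Two hypothetical predictions.json entries for illustration; only the overall metrics are populated for the list form:

act = {'intent': 'inform', 'domain': 'taxi', 'slot': 'leave at', 'value': '17:15'}
gold = {'binary': [], 'categorical': [], 'non-categorical': [act]}

# Dict form: predictions grouped by act type, scored per type and overall.
sample_dict_form = {'dialogue_acts': gold,
                    'predictions': {'dialogue_acts': {'binary': [], 'categorical': [], 'non-categorical': [act]}}}

# List form: a flat list of acts, scored on the overall metrics only.
sample_list_form = {'dialogue_acts': gold,
                    'predictions': {'dialogue_acts': [act]}}
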
diff --git a/convlab2/util/unified_datasets_util.py b/convlab2/util/unified_datasets_util.py
index 46ba13d79c2bde1693454caad7c95596e7e63d81..014f5306a4f41637d9dab92d1fcb00a4d41a3463 100644
--- a/convlab2/util/unified_datasets_util.py
+++ b/convlab2/util/unified_datasets_util.py
@@ -124,7 +124,7 @@ def load_unified_data(
                         sample['domains'] = dialogue['domains']
                     if terminated:
                         sample['terminated'] = turn['utt_idx'] == len(dialogue['turns']) - 1
-                    if speaker == 'system':
+                    if speaker == 'system' and 'booked' in turn:
                         sample['booked'] = turn['booked']
                     data_by_split[data_split].append(sample)
             if not split_to_turn: