diff --git a/convlab2/base_models/t5/create_data.py b/convlab2/base_models/t5/create_data.py
index 19be0b81520cf4077ac34166e7b2e7a0d12f80a3..d4b99d9a7ebfc294433f06e795aff2620f0c1fb5 100644
--- a/convlab2/base_models/t5/create_data.py
+++ b/convlab2/base_models/t5/create_data.py
@@ -4,6 +4,7 @@ from tqdm import tqdm
 import re
 from convlab2.util import load_dataset, load_nlu_data, load_dst_data, load_policy_data, load_nlg_data, load_e2e_data, load_rg_data
 from convlab2.base_models.t5.nlu.serialization import serialize_dialogue_acts, deserialize_dialogue_acts, equal_da_seq
+from convlab2.base_models.t5.dst.serialization import serialize_dialogue_state, deserialize_dialogue_state, equal_state_seq
 
 def create_rg_data(dataset, data_dir, args):
     data_by_split = load_rg_data(dataset, speaker=args.speaker)
@@ -11,9 +12,6 @@ def create_rg_data(dataset, data_dir, args):
     os.makedirs(data_dir, exist_ok=True)
 
     data_splits = data_by_split.keys()
-    file_name = os.path.join(data_dir, f"source_prefix.txt")
-    with open(file_name, "w") as f:
-        f.write("generate a system response according to the context: ")
     for data_split in data_splits:
         data = []
         for sample in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False):
@@ -31,9 +29,6 @@ def create_nlu_data(dataset, data_dir, args):
     os.makedirs(data_dir, exist_ok=True)
 
     data_splits = data_by_split.keys()
-    file_name = os.path.join(data_dir, f"source_prefix.txt")
-    with open(file_name, "w") as f:
-        f.write("parse the dialogue action of the last utterance: ")
     for data_split in data_splits:
         data = []
         for sample in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False):
@@ -50,14 +45,33 @@ def create_nlu_data(dataset, data_dir, args):
         with open(file_name, "w", encoding='utf-8') as f:
             f.writelines(data)
 
+def create_dst_data(dataset, data_dir, args):
+    data_by_split = load_dst_data(dataset, speaker=args.speaker, use_context=args.context_window_size>0, context_window_size=args.context_window_size)
+    data_dir = os.path.join(data_dir, args.speaker, f'context_{args.context_window_size}')
+    os.makedirs(data_dir, exist_ok=True)
+
+    data_splits = data_by_split.keys()
+    for data_split in data_splits:
+        data = []
+        for sample in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False):
+            response = f"{sample['speaker']}: {sample['utterance']}"
+            if args.context_window_size>0:
+                context = ' '.join([f"{turn['speaker']}: {turn['utterance']}" for turn in sample['context']]+[response])
+            else:
+                context = response
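+            # Serialize the gold state into "[domain][slot][value]" triplets joined by ';' (see dst/serialization.py).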
+            state_seq = serialize_dialogue_state(sample['state'])
+            assert equal_state_seq(sample['state'], state_seq), f"unequal state sequence:\n{sample['state']}\n{state_seq}\n{deserialize_dialogue_state(state_seq)}"
+            data.append(json.dumps({'context': context, 'state_seq': state_seq}, ensure_ascii=False)+'\n')
+
+        file_name = os.path.join(data_dir, f"{data_split}.json")
+        with open(file_name, "w", encoding='utf-8') as f:
+            f.writelines(data)
+
 def create_goal2dialogue_data(dataset, data_dir, args):
     data_by_split = dataset
     os.makedirs(data_dir, exist_ok=True)
 
     data_splits = data_by_split.keys()
-    file_name = os.path.join(data_dir, f"source_prefix.txt")
-    with open(file_name, "w") as f:
-        f.write("generate a dialogue between user and system according to the user goal: ")
     for data_split in data_splits:
         data = []
         for sample in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False):
@@ -73,7 +87,7 @@ def create_goal2dialogue_data(dataset, data_dir, args):
 if __name__ == '__main__':
     from argparse import ArgumentParser
     parser = ArgumentParser(description="create data for seq2seq training")
-    parser.add_argument('--tasks', '-t', metavar='task_name', nargs='*', choices=['rg', 'nlu', 'goal2dialogue'], help='names of tasks')
+    parser.add_argument('--tasks', '-t', metavar='task_name', nargs='*', choices=['rg', 'nlu', 'dst', 'goal2dialogue'], help='names of tasks')
     parser.add_argument('--datasets', '-d', metavar='dataset_name', nargs='*', help='names of unified datasets')
     parser.add_argument('--speaker', '-s', type=str, choices=['user', 'system', 'all'], help='speaker(s)')
     parser.add_argument('--context_window_size', '-c', type=int, default=0, help='how many contextual utterances are considered')
diff --git a/convlab2/base_models/t5/dst/dst_metric.py b/convlab2/base_models/t5/dst/dst_metric.py
new file mode 100644
index 0000000000000000000000000000000000000000..8a4f73b02a08b4b924a1d4f1bc545a023664e2bc
--- /dev/null
+++ b/convlab2/base_models/t5/dst/dst_metric.py
@@ -0,0 +1,107 @@
+# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""DST Metric"""
+
+import datasets
+from convlab2.base_models.t5.dst.serialization import deserialize_dialogue_state
+
+
+# TODO: Add BibTeX citation
+_CITATION = """\
+"""
+
+_DESCRIPTION = """\
+Metric to evaluate text-to-text models on the dialog state tracking task.
+"""
+
+_KWARGS_DESCRIPTION = """
+Calculates sequence exact match, joint goal accuracy and slot f1
+Args:
+    predictions: list of predictions to score. Each prediction
+        should be a string.
+    references: list of references, one per prediction. Each
+        reference should be a string.
+Returns:
+    seq_em: sequence exact match
+    accuracy: joint goal accuracy (exact match of the full dialog state)
+    slot_f1: slot f1
+Examples:
+
+    >>> dst_metric = datasets.load_metric("dst_metric.py")
+    >>> predictions = ["[restaurant][price range][moderate]", "[restaurant][price range][moderate];[restaurant][food][catalan];[restaurant][area][centre]"]
+    >>> references = ["[restaurant][price range][moderate]", "[restaurant][price range][moderate];[restaurant][food][catalan];[attraction][area][centre]"]
+    >>> results = dst_metric.compute(predictions=predictions, references=references)
+    >>> print(results)
+    {'seq_em': 0.5, 'accuracy': 0.5, 'slot_f1': 0.75, 'slot_precision': 0.75, 'slot_recall': 0.75}
+"""
+
+
+@datasets.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
+class DSTMetrics(datasets.Metric):
+    """Metric to evaluate text-to-text models on the dialog state tracking task."""
+
+    def _info(self):
+        return datasets.MetricInfo(
+            description=_DESCRIPTION,
+            citation=_CITATION,
+            inputs_description=_KWARGS_DESCRIPTION,
+            # This defines the format of each prediction and reference
+            features=datasets.Features({
+                'predictions': datasets.Value('string'),
+                'references': datasets.Value('string'),
+            })
+        )
+
+    def _compute(self, predictions, references):
+        """Returns the scores: sequence exact match, joint goal accuracy and slot f1"""
+        seq_em = []
+        acc = []
+        f1_metrics = {'TP':0, 'FP':0, 'FN':0}
+
+        for prediction, reference in zip(predictions, references):
+            seq_em.append(prediction.strip()==reference.strip())
+            pred_state = deserialize_dialogue_state(prediction)
+            gold_state = deserialize_dialogue_state(reference)
+
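+            # Flatten each state into (domain, slot, value) triples, dropping empty values, for slot-level precision/recall/F1.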
+            predicts = sorted(list({(domain, slot, value) for domain in pred_state for slot, value in pred_state[domain].items() if len(value)>0}))
+            labels = sorted(list({(domain, slot, value) for domain in gold_state for slot, value in gold_state[domain].items() if len(value)>0}))
+
+            for ele in predicts:
+                if ele in labels:
+                    f1_metrics['TP'] += 1
+                else:
+                    f1_metrics['FP'] += 1
+            for ele in labels:
+                if ele not in predicts:
+                    f1_metrics['FN'] += 1
+            # Joint goal accuracy: every slot of the predicted state must match the gold state.
+            acc.append(predicts == labels)
+
+        TP = f1_metrics.pop('TP')
+        FP = f1_metrics.pop('FP')
+        FN = f1_metrics.pop('FN')
+        precision = 1.0 * TP / (TP + FP) if TP + FP else 0.
+        recall = 1.0 * TP / (TP + FN) if TP + FN else 0.
+        f1 = 2.0 * precision * recall / (precision + recall) if precision + recall else 0.
+        f1_metrics['slot_f1'] = f1
+        f1_metrics['slot_precision'] = precision
+        f1_metrics['slot_recall'] = recall
+
+        return {
+            "seq_em": sum(seq_em)/len(seq_em),
+            "accuracy": sum(acc)/len(acc),
+            **f1_metrics
+        }
diff --git a/convlab2/base_models/t5/dst/merge_predict_res.py b/convlab2/base_models/t5/dst/merge_predict_res.py
new file mode 100755
index 0000000000000000000000000000000000000000..ebdada8a6cd5d428e02f72f7d40448076454a9e9
--- /dev/null
+++ b/convlab2/base_models/t5/dst/merge_predict_res.py
@@ -0,0 +1,34 @@
+import json
+import os
+from convlab2.util import load_dataset, load_dst_data
+from convlab2.base_models.t5.dst.serialization import deserialize_dialogue_state
+
+
+def merge(dataset_name, speaker, save_dir, context_window_size, predict_result):
+    assert os.path.exists(predict_result)
+    dataset = load_dataset(dataset_name)
+    data = load_dst_data(dataset, data_split='test', speaker=speaker, use_context=context_window_size>0, context_window_size=context_window_size)['test']
+    
+    if save_dir is None:
+        save_dir = os.path.dirname(predict_result)
+    else:
+        os.makedirs(save_dir, exist_ok=True)
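+    # The predict file is read as JSON Lines: one {"predictions": "..."} object per test sample.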
+    predict_result = [deserialize_dialogue_state(json.loads(x)['predictions'].strip()) for x in open(predict_result)]
+
+    for sample, prediction in zip(data, predict_result):
+        sample['predictions'] = {'state': prediction}
+
+    json.dump(data, open(os.path.join(save_dir, 'predictions.json'), 'w', encoding='utf-8'), indent=2, ensure_ascii=False)
+
+
+if __name__ == '__main__':
+    from argparse import ArgumentParser
+    parser = ArgumentParser(description="merge predict results with original data for unified NLU evaluation")
+    parser.add_argument('--dataset', '-d', metavar='dataset_name', type=str, help='name of the unified dataset')
+    parser.add_argument('--speaker', '-s', type=str, choices=['user', 'system', 'all'], help='speaker(s) of utterances')
+    parser.add_argument('--save_dir', type=str, help='merged data will be saved as $save_dir/predictions.json; defaults to the directory of predict_result')
+    parser.add_argument('--context_window_size', '-c', type=int, default=0, help='how many contextual utterances are considered')
+    parser.add_argument('--predict_result', '-p', type=str, required=True, help='path to the output file generated_predictions.json')
+    args = parser.parse_args()
+    print(args)
+    merge(args.dataset, args.speaker, args.save_dir, args.context_window_size, args.predict_result)
diff --git a/convlab2/base_models/t5/dst/run_multiwoz21.sh b/convlab2/base_models/t5/dst/run_multiwoz21.sh
new file mode 100644
index 0000000000000000000000000000000000000000..e031be48b32d252a0b835d51ac4674ee8b927723
--- /dev/null
+++ b/convlab2/base_models/t5/dst/run_multiwoz21.sh
@@ -0,0 +1,83 @@
+n_gpus=4
+task_name="dst"
+dataset_name="multiwoz21"
+speaker="user"
+context_window_size=100
+data_dir="data/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
+output_dir="output/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
+cache_dir="../cache"
+logging_dir="${output_dir}/runs"
+train_file="${data_dir}/train.json"
+validation_file="${data_dir}/validation.json"
+test_file="${data_dir}/test.json"
+metric_name_or_path="dst_metric.py"
+metric_for_best_model="accuracy"
+source_column="context"
+target_column="state_seq"
+truncation_side="left"
+max_source_length=512
+max_target_length=512
+model_name_or_path="t5-small"
+per_device_train_batch_size=64
+per_device_eval_batch_size=64
+gradient_accumulation_steps=2
+lr=1e-3
+num_train_epochs=10
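+# Effective train batch size: n_gpus * per_device_train_batch_size * gradient_accumulation_steps = 4 * 64 * 2 = 512.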
+
+python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --speaker ${speaker} --context_window_size ${context_window_size}
+
+python -m torch.distributed.launch --master_port 29501 \
+    --nproc_per_node ${n_gpus} ../run_seq2seq.py \
+    --task_name ${task_name} \
+    --train_file ${train_file} \
+    --validation_file ${validation_file} \
+    --test_file ${test_file} \
+    --source_column ${source_column} \
+    --target_column ${target_column} \
+    --max_source_length ${max_source_length} \
+    --max_target_length ${max_target_length} \
+    --truncation_side ${truncation_side} \
+    --model_name_or_path ${model_name_or_path} \
+    --do_train \
+    --do_eval \
+    --do_predict \
+    --save_strategy epoch \
+    --evaluation_strategy epoch \
+    --load_best_model_at_end \
+    --predict_with_generate \
+    --metric_name_or_path ${metric_name_or_path} \
+    --cache_dir ${cache_dir} \
+    --output_dir ${output_dir} \
+    --logging_dir ${logging_dir} \
+    --overwrite_output_dir \
+    --preprocessing_num_workers 4 \
+    --per_device_train_batch_size ${per_device_train_batch_size} \
+    --per_device_eval_batch_size ${per_device_eval_batch_size} \
+    --gradient_accumulation_steps ${gradient_accumulation_steps} \
+    --learning_rate ${lr} \
+    --num_train_epochs ${num_train_epochs} \
+    --debug underflow_overflow \
+    --adafactor \
+    --gradient_checkpointing
+
+# python -m torch.distributed.launch \
+#     --nproc_per_node ${n_gpus} ../run_seq2seq.py \
+#     --task_name ${task_name} \
+#     --test_file ${test_file} \
+#     --source_column ${source_column} \
+#     --target_column ${target_column} \
+#     --max_source_length ${max_source_length} \
+#     --max_target_length ${max_target_length} \
+#     --truncation_side ${truncation_side} \
+#     --model_name_or_path ${output_dir} \
+#     --do_predict \
+#     --predict_with_generate \
+#     --metric_name_or_path ${metric_name_or_path} \
+#     --cache_dir ${cache_dir} \
+#     --output_dir ${output_dir} \
+#     --logging_dir ${logging_dir} \
+#     --overwrite_output_dir \
+#     --preprocessing_num_workers 4 \
+#     --per_device_eval_batch_size ${per_device_eval_batch_size} \
+
+python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
diff --git a/convlab2/base_models/t5/dst/serialization.py b/convlab2/base_models/t5/dst/serialization.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ccf25ae76048a11834566163591366ae5cdc61a
--- /dev/null
+++ b/convlab2/base_models/t5/dst/serialization.py
@@ -0,0 +1,38 @@
+def serialize_dialogue_state(state):
+    state_seqs = []
+    for domain in state:
+        for slot, value in state[domain].items():
+            if len(value) > 0:
+                state_seqs.append(f'[{domain}][{slot}][{value}]')
+
+    return ';'.join(state_seqs)
+
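+# A round-trip sanity check (hypothetical state, shape per the unified data format):
+#   seq = serialize_dialogue_state({'restaurant': {'area': 'centre', 'food': ''}})
+#   assert seq == '[restaurant][area][centre]'  # empty values are skipped
+#   assert deserialize_dialogue_state(seq) == {'restaurant': {'area': 'centre'}}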
+def deserialize_dialogue_state(state_seq):
+    state = {}
+    if len(state_seq) == 0:
+        return state
+    state_seqs = state_seq.split('];[')
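+    # Splitting on '];[' leaves a leading '[' on the first triplet and a trailing ']' on the last; strip them in the loop.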
+    for i, triplet in enumerate(state_seqs):
+        if len(triplet) == 0:
+            continue
+        if i == 0 and triplet[0] == '[':
+            triplet = triplet[1:]
+        if i == len(state_seqs) - 1 and triplet[-1] == ']':
+            triplet = triplet[:-1]
+        s = triplet.split('][')
+        if len(s) != 3:
+            continue
+        domain, slot, value = s
+        state.setdefault(domain, {})
+        state[domain][slot] = value
+    return state
+
+def equal_state_seq(state, state_seq):
+    predict_state = deserialize_dialogue_state(state_seq)
+    svs = sorted([(domain, slot, value) for domain in state for slot, value in state[domain].items() if len(value)>0])
+    predict_svs = sorted([(domain, slot, value) for domain in predict_state for slot, value in predict_state[domain].items() if len(value)>0])
+    return svs == predict_svs
diff --git a/convlab2/base_models/t5/goal2dialogue/run_goal2dialogue.sh b/convlab2/base_models/t5/goal2dialogue/run_goal2dialogue.sh
index a60f77c2bbd9b2660b5616dcad8c9a162f728f12..09a2c33aa06fa5134dba0707e1df5e633ac9f269 100644
--- a/convlab2/base_models/t5/goal2dialogue/run_goal2dialogue.sh
+++ b/convlab2/base_models/t5/goal2dialogue/run_goal2dialogue.sh
@@ -8,7 +8,6 @@ logging_dir="${output_dir}/runs"
 train_file="${data_dir}/train.json"
 validation_file="${data_dir}/validation.json"
 test_file="${data_dir}/test.json"
-source_prefix="${data_dir}/source_prefix.txt"
 source_column="goal"
 target_column="dialogue"
 max_target_length=1024
@@ -30,7 +29,6 @@ python -m torch.distributed.launch \
     --source_column ${source_column} \
     --target_column ${target_column} \
     --max_target_length ${max_target_length} \
-    --source_prefix ${source_prefix} \
     --model_name_or_path ${model_name_or_path} \
     --do_train \
     --do_eval \
diff --git a/convlab2/base_models/t5/nlu/nlu_metric.py b/convlab2/base_models/t5/nlu/nlu_metric.py
index 1eb57c84a02bd1f019eb1978271cb53c3b3a1916..a2f3c0bc4f9a50464eebc67d220964c39a45f2be 100644
--- a/convlab2/base_models/t5/nlu/nlu_metric.py
+++ b/convlab2/base_models/t5/nlu/nlu_metric.py
@@ -83,13 +83,11 @@ class NLUMetrics(datasets.Metric):
             flag = True
             for da_type in ['binary', 'categorical', 'non-categorical']:
                 if da_type == 'binary':
-                    predicts = [(x['intent'], x['domain'], x['slot']) for x in pred_da[da_type]]
-                    labels = [(x['intent'], x['domain'], x['slot']) for x in gold_da[da_type]]
+                    predicts = sorted(list({(x['intent'], x['domain'], x['slot']) for x in pred_da[da_type]}))
+                    labels = sorted(list({(x['intent'], x['domain'], x['slot']) for x in gold_da[da_type]}))
                 else:
-                    predicts = [(x['intent'], x['domain'], x['slot'], ''.join(x['value'].split()).lower()) for x in pred_da[da_type]]
-                    labels = [(x['intent'], x['domain'], x['slot'], ''.join(x['value'].split()).lower()) for x in gold_da[da_type]]
-                predicts = sorted(list(set(predicts)))
-                labels = sorted(list(set(labels)))
+                    predicts = sorted(list({(x['intent'], x['domain'], x['slot'], ''.join(x['value'].split()).lower()) for x in pred_da[da_type]}))
+                    labels = sorted(list({(x['intent'], x['domain'], x['slot'], ''.join(x['value'].split()).lower()) for x in gold_da[da_type]}))
                 for ele in predicts:
                     if ele in labels:
                         f1_metrics['overall']['TP'] += 1
diff --git a/convlab2/base_models/t5/nlu/run_multiwoz21_user.sh b/convlab2/base_models/t5/nlu/run_multiwoz21_user.sh
index 85f3ec8302d161b29ba71b760a56d0f64a6b4dfc..4080a09249fbed8260ed8e0b0fab7961d9d9120a 100644
--- a/convlab2/base_models/t5/nlu/run_multiwoz21_user.sh
+++ b/convlab2/base_models/t5/nlu/run_multiwoz21_user.sh
@@ -12,7 +12,6 @@ validation_file="${data_dir}/validation.json"
 test_file="${data_dir}/test.json"
 metric_name_or_path="nlu_metric.py"
 metric_for_best_model="overall_f1"
-source_prefix="${data_dir}/source_prefix.txt"
 source_column="context"
 target_column="dialogue_acts_seq"
 model_name_or_path="t5-small"
@@ -30,7 +29,6 @@ python -m torch.distributed.launch \
     --train_file ${train_file} \
     --source_column ${source_column} \
     --target_column ${target_column} \
-    --source_prefix ${source_prefix} \
     --model_name_or_path ${model_name_or_path} \
     --do_train \
     --save_strategy epoch \
@@ -41,7 +39,6 @@ python -m torch.distributed.launch \
     --overwrite_output_dir \
     --preprocessing_num_workers 4 \
     --per_device_train_batch_size ${per_device_train_batch_size} \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
@@ -55,7 +52,6 @@ python -m torch.distributed.launch \
     --test_file ${test_file} \
     --source_column ${source_column} \
     --target_column ${target_column} \
-    --source_prefix ${source_prefix} \
     --model_name_or_path ${output_dir} \
     --do_predict \
     --predict_with_generate \
diff --git a/convlab2/base_models/t5/nlu/run_multiwoz21_user_context3.sh b/convlab2/base_models/t5/nlu/run_multiwoz21_user_context3.sh
index 8d7b5c93e8deb9c8c5da9ecd03e42bbc53341442..a8cbc9bd6c30b91a16c1e278450161992d8dca9a 100644
--- a/convlab2/base_models/t5/nlu/run_multiwoz21_user_context3.sh
+++ b/convlab2/base_models/t5/nlu/run_multiwoz21_user_context3.sh
@@ -12,7 +12,6 @@ validation_file="${data_dir}/validation.json"
 test_file="${data_dir}/test.json"
 metric_name_or_path="nlu_metric.py"
 metric_for_best_model="overall_f1"
-source_prefix="${data_dir}/source_prefix.txt"
 source_column="context"
 target_column="dialogue_acts_seq"
 model_name_or_path="t5-small"
@@ -30,7 +29,6 @@ python -m torch.distributed.launch \
     --train_file ${train_file} \
     --source_column ${source_column} \
     --target_column ${target_column} \
-    --source_prefix ${source_prefix} \
     --model_name_or_path ${model_name_or_path} \
     --do_train \
     --save_strategy epoch \
@@ -41,7 +39,6 @@ python -m torch.distributed.launch \
     --overwrite_output_dir \
     --preprocessing_num_workers 4 \
     --per_device_train_batch_size ${per_device_train_batch_size} \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
@@ -55,7 +52,6 @@ python -m torch.distributed.launch \
     --test_file ${test_file} \
     --source_column ${source_column} \
     --target_column ${target_column} \
-    --source_prefix ${source_prefix} \
     --model_name_or_path ${output_dir} \
     --do_predict \
     --predict_with_generate \
diff --git a/convlab2/base_models/t5/nlu/run_tm1_user.sh b/convlab2/base_models/t5/nlu/run_tm1_user.sh
index 16a16fdb106f09a7001190477de8b0878d2e20f3..9faedd7f664fb40ba20883033aaa1d3817d66c1c 100644
--- a/convlab2/base_models/t5/nlu/run_tm1_user.sh
+++ b/convlab2/base_models/t5/nlu/run_tm1_user.sh
@@ -12,7 +12,6 @@ validation_file="${data_dir}/validation.json"
 test_file="${data_dir}/test.json"
 metric_name_or_path="nlu_metric.py"
 metric_for_best_model="overall_f1"
-source_prefix="${data_dir}/source_prefix.txt"
 source_column="context"
 target_column="dialogue_acts_seq"
 model_name_or_path="t5-small"
@@ -30,7 +29,6 @@ python -m torch.distributed.launch \
     --train_file ${train_file} \
     --source_column ${source_column} \
     --target_column ${target_column} \
-    --source_prefix ${source_prefix} \
     --model_name_or_path ${model_name_or_path} \
     --do_train \
     --save_strategy epoch \
@@ -41,7 +39,6 @@ python -m torch.distributed.launch \
     --overwrite_output_dir \
     --preprocessing_num_workers 4 \
     --per_device_train_batch_size ${per_device_train_batch_size} \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
@@ -55,7 +52,6 @@ python -m torch.distributed.launch \
     --test_file ${test_file} \
     --source_column ${source_column} \
     --target_column ${target_column} \
-    --source_prefix ${source_prefix} \
     --model_name_or_path ${output_dir} \
     --do_predict \
     --predict_with_generate \
diff --git a/convlab2/base_models/t5/nlu/run_tm1_user_context3.sh b/convlab2/base_models/t5/nlu/run_tm1_user_context3.sh
index ccb67609279be5c4b044a9baadc19672d69c1532..bb6b55fe06c54bab7294a55d8abda30e959acf34 100644
--- a/convlab2/base_models/t5/nlu/run_tm1_user_context3.sh
+++ b/convlab2/base_models/t5/nlu/run_tm1_user_context3.sh
@@ -12,7 +12,6 @@ validation_file="${data_dir}/validation.json"
 test_file="${data_dir}/test.json"
 metric_name_or_path="nlu_metric.py"
 metric_for_best_model="overall_f1"
-source_prefix="${data_dir}/source_prefix.txt"
 source_column="context"
 target_column="dialogue_acts_seq"
 model_name_or_path="t5-small"
@@ -30,7 +29,6 @@ python -m torch.distributed.launch \
     --train_file ${train_file} \
     --source_column ${source_column} \
     --target_column ${target_column} \
-    --source_prefix ${source_prefix} \
     --model_name_or_path ${model_name_or_path} \
     --do_train \
     --save_strategy epoch \
@@ -41,7 +39,6 @@ python -m torch.distributed.launch \
     --overwrite_output_dir \
     --preprocessing_num_workers 4 \
     --per_device_train_batch_size ${per_device_train_batch_size} \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
@@ -55,7 +52,6 @@ python -m torch.distributed.launch \
     --test_file ${test_file} \
     --source_column ${source_column} \
     --target_column ${target_column} \
-    --source_prefix ${source_prefix} \
     --model_name_or_path ${output_dir} \
     --do_predict \
     --predict_with_generate \
diff --git a/convlab2/base_models/t5/nlu/run_tm2_user.sh b/convlab2/base_models/t5/nlu/run_tm2_user.sh
index 8686822fea882cb75776bee89dbd4344b71ea64b..728a8a94748c8344104fb9176fd8d2599580b11d 100644
--- a/convlab2/base_models/t5/nlu/run_tm2_user.sh
+++ b/convlab2/base_models/t5/nlu/run_tm2_user.sh
@@ -12,7 +12,6 @@ validation_file="${data_dir}/validation.json"
 test_file="${data_dir}/test.json"
 metric_name_or_path="nlu_metric.py"
 metric_for_best_model="overall_f1"
-source_prefix="${data_dir}/source_prefix.txt"
 source_column="context"
 target_column="dialogue_acts_seq"
 model_name_or_path="t5-small"
@@ -30,7 +29,6 @@ python -m torch.distributed.launch \
     --train_file ${train_file} \
     --source_column ${source_column} \
     --target_column ${target_column} \
-    --source_prefix ${source_prefix} \
     --model_name_or_path ${model_name_or_path} \
     --do_train \
     --save_strategy epoch \
@@ -41,7 +39,6 @@ python -m torch.distributed.launch \
     --overwrite_output_dir \
     --preprocessing_num_workers 4 \
     --per_device_train_batch_size ${per_device_train_batch_size} \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
@@ -55,7 +52,6 @@ python -m torch.distributed.launch \
     --test_file ${test_file} \
     --source_column ${source_column} \
     --target_column ${target_column} \
-    --source_prefix ${source_prefix} \
     --model_name_or_path ${output_dir} \
     --do_predict \
     --predict_with_generate \
diff --git a/convlab2/base_models/t5/nlu/run_tm2_user_context3.sh b/convlab2/base_models/t5/nlu/run_tm2_user_context3.sh
index 03c2489940e38dd16256f6b4f2683a413f514235..8ebb102dd99c22a9e6dc752c09b48b1538c77ad8 100644
--- a/convlab2/base_models/t5/nlu/run_tm2_user_context3.sh
+++ b/convlab2/base_models/t5/nlu/run_tm2_user_context3.sh
@@ -12,7 +12,6 @@ validation_file="${data_dir}/validation.json"
 test_file="${data_dir}/test.json"
 metric_name_or_path="nlu_metric.py"
 metric_for_best_model="overall_f1"
-source_prefix="${data_dir}/source_prefix.txt"
 source_column="context"
 target_column="dialogue_acts_seq"
 model_name_or_path="t5-small"
@@ -30,7 +29,6 @@ python -m torch.distributed.launch \
     --train_file ${train_file} \
     --source_column ${source_column} \
     --target_column ${target_column} \
-    --source_prefix ${source_prefix} \
     --model_name_or_path ${model_name_or_path} \
     --do_train \
     --save_strategy epoch \
@@ -41,7 +39,6 @@ python -m torch.distributed.launch \
     --overwrite_output_dir \
     --preprocessing_num_workers 4 \
     --per_device_train_batch_size ${per_device_train_batch_size} \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
@@ -55,7 +52,6 @@ python -m torch.distributed.launch \
     --test_file ${test_file} \
     --source_column ${source_column} \
     --target_column ${target_column} \
-    --source_prefix ${source_prefix} \
     --model_name_or_path ${output_dir} \
     --do_predict \
     --predict_with_generate \
diff --git a/convlab2/base_models/t5/nlu/run_tm3_user.sh b/convlab2/base_models/t5/nlu/run_tm3_user.sh
index 470cb7d71c2b7a630e6917912e21d2c61ca1c075..0d775f7ae41a63f72fc93539186b63aa2b4a551f 100644
--- a/convlab2/base_models/t5/nlu/run_tm3_user.sh
+++ b/convlab2/base_models/t5/nlu/run_tm3_user.sh
@@ -12,7 +12,6 @@ validation_file="${data_dir}/validation.json"
 test_file="${data_dir}/test.json"
 metric_name_or_path="nlu_metric.py"
 metric_for_best_model="overall_f1"
-source_prefix="${data_dir}/source_prefix.txt"
 source_column="context"
 target_column="dialogue_acts_seq"
 model_name_or_path="t5-small"
@@ -30,7 +29,6 @@ python -m torch.distributed.launch \
     --train_file ${train_file} \
     --source_column ${source_column} \
     --target_column ${target_column} \
-    --source_prefix ${source_prefix} \
     --model_name_or_path ${model_name_or_path} \
     --do_train \
     --save_strategy epoch \
@@ -41,7 +39,6 @@ python -m torch.distributed.launch \
     --overwrite_output_dir \
     --preprocessing_num_workers 4 \
     --per_device_train_batch_size ${per_device_train_batch_size} \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
@@ -55,7 +52,6 @@ python -m torch.distributed.launch \
     --test_file ${test_file} \
     --source_column ${source_column} \
     --target_column ${target_column} \
-    --source_prefix ${source_prefix} \
     --model_name_or_path ${output_dir} \
     --do_predict \
     --predict_with_generate \
diff --git a/convlab2/base_models/t5/nlu/run_tm3_user_context3.sh b/convlab2/base_models/t5/nlu/run_tm3_user_context3.sh
index 5e325d1fe2b127ef1af0b0733dd5db03bb1cbe3c..c8a4a9f6b5e77ee6e05ae36aea2e002202243c72 100644
--- a/convlab2/base_models/t5/nlu/run_tm3_user_context3.sh
+++ b/convlab2/base_models/t5/nlu/run_tm3_user_context3.sh
@@ -12,7 +12,6 @@ validation_file="${data_dir}/validation.json"
 test_file="${data_dir}/test.json"
 metric_name_or_path="nlu_metric.py"
 metric_for_best_model="overall_f1"
-source_prefix="${data_dir}/source_prefix.txt"
 source_column="context"
 target_column="dialogue_acts_seq"
 model_name_or_path="t5-small"
@@ -30,7 +29,6 @@ python -m torch.distributed.launch \
     --train_file ${train_file} \
     --source_column ${source_column} \
     --target_column ${target_column} \
-    --source_prefix ${source_prefix} \
     --model_name_or_path ${model_name_or_path} \
     --do_train \
     --save_strategy epoch \
@@ -41,7 +39,6 @@ python -m torch.distributed.launch \
     --overwrite_output_dir \
     --preprocessing_num_workers 4 \
     --per_device_train_batch_size ${per_device_train_batch_size} \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
@@ -55,7 +52,6 @@ python -m torch.distributed.launch \
     --test_file ${test_file} \
     --source_column ${source_column} \
     --target_column ${target_column} \
-    --source_prefix ${source_prefix} \
     --model_name_or_path ${output_dir} \
     --do_predict \
     --predict_with_generate \
diff --git a/convlab2/base_models/t5/rg/run_rg.sh b/convlab2/base_models/t5/rg/run_rg.sh
index 8bf742d51b1f28765a5d28775970d12a25178434..55accadfaeceb7c43ad9df079f054aa3e00c5a1c 100644
--- a/convlab2/base_models/t5/rg/run_rg.sh
+++ b/convlab2/base_models/t5/rg/run_rg.sh
@@ -1,7 +1,8 @@
-n_gpus=8
+set -e
+n_gpus=2
 task_name="rg"
-dataset_name="multiwoz21"
-speaker="system"
+dataset_name="metalwoz+sgd+tm1+tm2+tm3"
+speaker="all"
 data_dir="data/${task_name}/${dataset_name}/${speaker}"
 output_dir="output/${task_name}/${dataset_name}/${speaker}"
 cache_dir="../cache"
@@ -9,17 +10,30 @@ logging_dir="${output_dir}/runs"
 train_file="${data_dir}/train.json"
 validation_file="${data_dir}/validation.json"
 test_file="${data_dir}/test.json"
-source_prefix="${data_dir}/source_prefix.txt"
 source_column="context"
 target_column="response"
+truncation_side="left"
+max_source_length=512
+max_target_length=128
 model_name_or_path="t5-small"
 per_device_train_batch_size=32
 per_device_eval_batch_size=128
-gradient_accumulation_steps=1
+gradient_accumulation_steps=4
 lr=1e-3
 num_train_epochs=5
 
-python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --speaker ${speaker}
+# names=$(echo ${dataset_name} | tr "+" "\n")
+# mkdir -p ${data_dir}
+# for name in ${names};
+# do
+#     echo "preprocessing ${name}"
+#     python ../create_data.py --tasks ${task_name} --datasets ${name} --speaker ${speaker}
+#     if [ "${name}" != "${dataset_name}" ]; then
+#         cat "data/${task_name}/${name}/${speaker}/train.json" >> ${train_file}
+#         cat "data/${task_name}/${name}/${speaker}/validation.json" >> ${validation_file}
+#         cat "data/${task_name}/${name}/${speaker}/test.json" >> ${test_file}
+#     fi
+# done
 
 python -m torch.distributed.launch \
     --nproc_per_node ${n_gpus} ../run_seq2seq.py \
@@ -29,7 +43,9 @@ python -m torch.distributed.launch \
     --test_file ${test_file} \
     --source_column ${source_column} \
     --target_column ${target_column} \
-    --source_prefix ${source_prefix} \
+    --max_source_length ${max_source_length} \
+    --max_target_length ${max_target_length} \
+    --truncation_side ${truncation_side} \
     --model_name_or_path ${model_name_or_path} \
     --do_train \
     --do_eval \
diff --git a/convlab2/base_models/t5/run_seq2seq.py b/convlab2/base_models/t5/run_seq2seq.py
index e9348f5579d35dffe85b9b7e9b90cedf176d0de3..dace9713d540b7fe2aa1c552132cc4c54d698989 100644
--- a/convlab2/base_models/t5/run_seq2seq.py
+++ b/convlab2/base_models/t5/run_seq2seq.py
@@ -47,7 +47,7 @@ from transformers.utils.versions import require_version
 
 
 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.12.5")
+check_min_version("4.17.0")
 
 require_version("datasets>=1.16.1")
 
@@ -78,6 +78,10 @@ class ModelArguments:
         default=True,
         metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
     )
+    truncation_side: Optional[str] = field(
+        default="right",
+        metadata={"help": "Which side to truncate, left or right."}
+    )
     model_revision: str = field(
         default="main",
         metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
@@ -341,6 +345,7 @@ def main():
         model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
         cache_dir=model_args.cache_dir,
         use_fast=model_args.use_fast_tokenizer,
+        truncation_side=model_args.truncation_side,
         revision=model_args.model_revision,
         use_auth_token=True if model_args.use_auth_token else None,
     )
@@ -382,10 +387,11 @@ def main():
             )
 
     if data_args.source_prefix_filepath is not None:
-        prefix = open(data_args.source_prefix_filepath, 'r', encoding='utf-8').readline().strip()
+        prefix = open(data_args.source_prefix_filepath, 'r', encoding='utf-8').readline().strip('\n')
     else:
         prefix = ""
     
+    logger.info(f'source prefix: "{prefix}"')
 
     # Preprocessing the datasets.
     # We need to tokenize inputs and targets.