diff --git a/convlab/base_models/t5/dst/merge_predict_res.py b/convlab/base_models/t5/dst/merge_predict_res.py
index a1445ea35525ac403e6990edb677f801eb0d6468..9b519229b5b0a941a6d4172bd1473c6c8077658a 100755
--- a/convlab/base_models/t5/dst/merge_predict_res.py
+++ b/convlab/base_models/t5/dst/merge_predict_res.py
@@ -4,7 +4,7 @@ from convlab.util import load_dataset, load_dst_data
 from convlab.base_models.t5.dst.serialization import deserialize_dialogue_state
 
 
-def merge(dataset_names, speaker, save_dir, context_window_size, predict_result):
+def merge(dataset_names, speaker, save_dir, context_window_size, predict_result, dial_ids_order):
     assert os.path.exists(predict_result)
     
     if save_dir is None:
@@ -17,14 +17,18 @@ def merge(dataset_names, speaker, save_dir, context_window_size, predict_result)
     i = 0
     for dataset_name in dataset_names.split('+'):
         print(dataset_name)
-        dataset = load_dataset(dataset_name, args.dial_ids_order)
+        single = []
+        dataset = load_dataset(dataset_name, dial_ids_order)
         data = load_dst_data(dataset, data_split='test', speaker=speaker, use_context=context_window_size>0, context_window_size=context_window_size)['test']
     
         for sample in data:
             sample['predictions'] = {'state': predict_result[i]}
             i += 1
+            single.append(sample)
             merged.append(sample)
 
+        json.dump(single, open(os.path.join(save_dir, f'{dataset_name}_predictions.json'), 'w', encoding='utf-8'), indent=2, ensure_ascii=False)
+
     json.dump(merged, open(os.path.join(save_dir, 'predictions.json'), 'w', encoding='utf-8'), indent=2, ensure_ascii=False)
 
 
@@ -39,4 +43,4 @@ if __name__ == '__main__':
     parser.add_argument('--dial_ids_order', '-o', type=int, default=None, help='which data order is used for experiments')
     args = parser.parse_args()
     print(args)
-    merge(args.dataset, args.speaker, args.save_dir, args.context_window_size, args.predict_result)
+    merge(args.dataset, args.speaker, args.save_dir, args.context_window_size, args.predict_result, args.dial_ids_order)
diff --git a/convlab/base_models/t5/dst/run_dst_multitask.sh b/convlab/base_models/t5/dst/run_dst_multitask.sh
index 4803b521fee6b5a464e08fca93b4116466373b6b..7846e049aecf9302063517533e2810aaea9d339c 100644
--- a/convlab/base_models/t5/dst/run_dst_multitask.sh
+++ b/convlab/base_models/t5/dst/run_dst_multitask.sh
@@ -30,7 +30,7 @@ mkdir -p ${data_dir}
 for name in ${names};
 do
     echo "preprocessing ${name}"
-    # python ../create_data.py -t ${task_name} -d ${name} -s ${speaker} -c ${context_window_size}
+    python ../create_data.py -t ${task_name} -d ${name} -s ${speaker} -c ${context_window_size}
 done
 
 python merge_data.py $(echo ${dataset_name} | tr "+" " ")
@@ -91,4 +91,8 @@ python ../run_seq2seq.py \
 
 python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/test_generated_predictions.json
 
-python ../../../dst/evaluate_unified_datasets.py -p ${output_dir}/predictions.json
+for name in ${names};
+do
+    echo "evaluating ${name}"
+    python ../../../dst/evaluate_unified_datasets.py -p ${output_dir}/${name}_predictions.json
+done
\ No newline at end of file
diff --git a/convlab/base_models/t5/nlg/merge_predict_res.py b/convlab/base_models/t5/nlg/merge_predict_res.py
index 9e4615cfb0ad527e12a5294f1dbdcb69ba267651..a4d1b34155fda3826b6aaa100ddf7a8f5c37c49f 100755
--- a/convlab/base_models/t5/nlg/merge_predict_res.py
+++ b/convlab/base_models/t5/nlg/merge_predict_res.py
@@ -3,7 +3,7 @@ import os
 from convlab.util import load_dataset, load_nlg_data
 
 
-def merge(dataset_names, speaker, save_dir, context_window_size, predict_result):
+def merge(dataset_names, speaker, save_dir, context_window_size, predict_result, dial_ids_order):
     assert os.path.exists(predict_result)
     
     if save_dir is None:
@@ -16,7 +16,8 @@ def merge(dataset_names, speaker, save_dir, context_window_size, predict_result)
     i = 0
     for dataset_name in dataset_names.split('+'):
         print(dataset_name)
-        dataset = load_dataset(dataset_name, args.dial_ids_order)
+        single = []
+        dataset = load_dataset(dataset_name, dial_ids_order)
         data = load_nlg_data(dataset, data_split='test', speaker=speaker, use_context=context_window_size>0, context_window_size=context_window_size)['test']
     
         for sample in data:
@@ -24,23 +25,18 @@ def merge(dataset_names, speaker, save_dir, context_window_size, predict_result)
                 continue
             sample['predictions'] = {'utterance': predict_result[i]}
             i += 1
-            if args.sub_dataset:
-                if dataset_name == args.sub_dataset:
-                    merged.append(sample)
-            else:
-                merged.append(sample)
+            single.append(sample)
+            merged.append(sample)
 
-    if args.sub_dataset:
-        json.dump(merged, open(os.path.join(save_dir, f'{args.sub_dataset}predictions.json'), 'w', encoding='utf-8'), indent=2, ensure_ascii=False)
-    else:
-        json.dump(merged, open(os.path.join(save_dir, 'predictions.json'), 'w', encoding='utf-8'), indent=2, ensure_ascii=False)
+        json.dump(single, open(os.path.join(save_dir, f'{dataset_name}_predictions.json'), 'w', encoding='utf-8'), indent=2, ensure_ascii=False)
+
+    json.dump(merged, open(os.path.join(save_dir, 'predictions.json'), 'w', encoding='utf-8'), indent=2, ensure_ascii=False)
 
 
 if __name__ == '__main__':
     from argparse import ArgumentParser
     parser = ArgumentParser(description="merge predict results with original data for unified NLU evaluation")
     parser.add_argument('--dataset', '-d', metavar='dataset_name', type=str, help='name of the unified dataset')
-    parser.add_argument('--sub_dataset', metavar='sub dataset_name', type=str, help='name of the unified dataset')
     parser.add_argument('--speaker', '-s', type=str, choices=['user', 'system', 'all'], help='speaker(s) of utterances')
     parser.add_argument('--save_dir', type=str, help='merged data will be saved as $save_dir/predictions.json. default: on the same directory as predict_result')
     parser.add_argument('--context_window_size', '-c', type=int, default=0, help='how many contextual utterances are considered')
@@ -48,4 +44,4 @@ if __name__ == '__main__':
     parser.add_argument('--dial_ids_order', '-o', type=int, default=None, help='which data order is used for experiments')
     args = parser.parse_args()
     print(args)
-    merge(args.dataset, args.speaker, args.save_dir, args.context_window_size, args.predict_result)
+    merge(args.dataset, args.speaker, args.save_dir, args.context_window_size, args.predict_result, args.dial_ids_order)
diff --git a/convlab/base_models/t5/nlg/run_nlg.sh b/convlab/base_models/t5/nlg/run_nlg.sh
index b1ca7bf400c8c8c232dae9054e9413a22527d9ac..718dca4a2e344fc903e20f48e75cccd46b7821ce 100644
--- a/convlab/base_models/t5/nlg/run_nlg.sh
+++ b/convlab/base_models/t5/nlg/run_nlg.sh
@@ -24,36 +24,36 @@ gradient_accumulation_steps=8
 lr=1e-3
 num_train_epochs=10
 
-# python ../create_data.py -t ${task_name} -d ${dataset_name} -s ${speaker} -c ${context_window_size}
+python ../create_data.py -t ${task_name} -d ${dataset_name} -s ${speaker} -c ${context_window_size}
 
-# python ../run_seq2seq.py \
-#     --task_name ${task_name} \
-#     --train_file ${train_file} \
-#     --validation_file ${validation_file} \
-#     --source_column ${source_column} \
-#     --target_column ${target_column} \
-#     --max_source_length ${max_source_length} \
-#     --max_target_length ${max_target_length} \
-#     --truncation_side ${truncation_side} \
-#     --model_name_or_path ${model_name_or_path} \
-#     --do_train \
-#     --do_eval \
-#     --save_strategy epoch \
-#     --evaluation_strategy epoch \
-#     --save_total_limit 1 \
-#     --prediction_loss_only \
-#     --cache_dir ${cache_dir} \
-#     --output_dir ${output_dir} \
-#     --logging_dir ${logging_dir} \
-#     --overwrite_output_dir \
-#     --preprocessing_num_workers 4 \
-#     --per_device_train_batch_size ${per_device_train_batch_size} \
-#     --per_device_eval_batch_size ${per_device_eval_batch_size} \
-#     --gradient_accumulation_steps ${gradient_accumulation_steps} \
-#     --learning_rate ${lr} \
-#     --num_train_epochs ${num_train_epochs} \
-#     --optim adafactor \
-#     --gradient_checkpointing
+python ../run_seq2seq.py \
+    --task_name ${task_name} \
+    --train_file ${train_file} \
+    --validation_file ${validation_file} \
+    --source_column ${source_column} \
+    --target_column ${target_column} \
+    --max_source_length ${max_source_length} \
+    --max_target_length ${max_target_length} \
+    --truncation_side ${truncation_side} \
+    --model_name_or_path ${model_name_or_path} \
+    --do_train \
+    --do_eval \
+    --save_strategy epoch \
+    --evaluation_strategy epoch \
+    --save_total_limit 1 \
+    --prediction_loss_only \
+    --cache_dir ${cache_dir} \
+    --output_dir ${output_dir} \
+    --logging_dir ${logging_dir} \
+    --overwrite_output_dir \
+    --preprocessing_num_workers 4 \
+    --per_device_train_batch_size ${per_device_train_batch_size} \
+    --per_device_eval_batch_size ${per_device_eval_batch_size} \
+    --gradient_accumulation_steps ${gradient_accumulation_steps} \
+    --learning_rate ${lr} \
+    --num_train_epochs ${num_train_epochs} \
+    --optim adafactor \
+    --gradient_checkpointing
 
 python ../run_seq2seq.py \
     --task_name ${task_name} \
diff --git a/convlab/base_models/t5/nlg/run_nlg_multitask.sh b/convlab/base_models/t5/nlg/run_nlg_multitask.sh
index 6caf22391c7f0690c8f788ea37ed5d15c9415583..9556bdcc0d55e4e520c371386438f9a99c71280c 100644
--- a/convlab/base_models/t5/nlg/run_nlg_multitask.sh
+++ b/convlab/base_models/t5/nlg/run_nlg_multitask.sh
@@ -1,6 +1,6 @@
 n_gpus=1
 task_name="nlg"
-dataset_name="tm1+tm2+tm3"
+dataset_name="sgd+tm1+tm2+tm3+multiwoz21"
 speaker="system"
 context_window_size=0
 data_dir="data/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
@@ -25,70 +25,74 @@ lr=1e-3
 num_train_epochs=10
 
 names=$(echo ${dataset_name} | tr "+" "\n")
-# rm -r ${data_dir}
-# mkdir -p ${data_dir}
-# for name in ${names};
-# do
-#     echo "preprocessing ${name}"
-#     python ../create_data.py -t ${task_name} -d ${name} -s ${speaker} -c ${context_window_size}
-# done
+rm -r ${data_dir}
+mkdir -p ${data_dir}
+for name in ${names};
+do
+    echo "preprocessing ${name}"
+    python ../create_data.py -t ${task_name} -d ${name} -s ${speaker} -c ${context_window_size}
+done
 
-# python merge_data.py $(echo ${dataset_name} | tr "+" " ")
+python merge_data.py $(echo ${dataset_name} | tr "+" " ")
 
-# python ../run_seq2seq.py \
-#     --task_name ${task_name} \
-#     --train_file ${train_file} \
-#     --validation_file ${validation_file} \
-#     --source_column ${source_column} \
-#     --target_column ${target_column} \
-#     --max_source_length ${max_source_length} \
-#     --max_target_length ${max_target_length} \
-#     --truncation_side ${truncation_side} \
-#     --model_name_or_path ${model_name_or_path} \
-#     --do_train \
-#     --do_eval \
-#     --save_strategy epoch \
-#     --evaluation_strategy epoch \
-#     --save_total_limit 1 \
-#     --prediction_loss_only \
-#     --cache_dir ${cache_dir} \
-#     --output_dir ${output_dir} \
-#     --logging_dir ${logging_dir} \
-#     --overwrite_output_dir \
-#     --preprocessing_num_workers 4 \
-#     --per_device_train_batch_size ${per_device_train_batch_size} \
-#     --per_device_eval_batch_size ${per_device_eval_batch_size} \
-#     --gradient_accumulation_steps ${gradient_accumulation_steps} \
-#     --learning_rate ${lr} \
-#     --num_train_epochs ${num_train_epochs} \
-#     --optim adafactor \
-#     --gradient_checkpointing
+python ../run_seq2seq.py \
+    --task_name ${task_name} \
+    --train_file ${train_file} \
+    --validation_file ${validation_file} \
+    --source_column ${source_column} \
+    --target_column ${target_column} \
+    --max_source_length ${max_source_length} \
+    --max_target_length ${max_target_length} \
+    --truncation_side ${truncation_side} \
+    --model_name_or_path ${model_name_or_path} \
+    --do_train \
+    --do_eval \
+    --save_strategy epoch \
+    --evaluation_strategy epoch \
+    --save_total_limit 1 \
+    --prediction_loss_only \
+    --cache_dir ${cache_dir} \
+    --output_dir ${output_dir} \
+    --logging_dir ${logging_dir} \
+    --overwrite_output_dir \
+    --preprocessing_num_workers 4 \
+    --per_device_train_batch_size ${per_device_train_batch_size} \
+    --per_device_eval_batch_size ${per_device_eval_batch_size} \
+    --gradient_accumulation_steps ${gradient_accumulation_steps} \
+    --learning_rate ${lr} \
+    --num_train_epochs ${num_train_epochs} \
+    --optim adafactor \
+    --gradient_checkpointing
 
-# python ../run_seq2seq.py \
-#     --task_name ${task_name} \
-#     --test_file ${test_file} \
-#     --source_column ${source_column} \
-#     --target_column ${target_column} \
-#     --max_source_length ${max_source_length} \
-#     --max_target_length ${max_target_length} \
-#     --truncation_side ${truncation_side} \
-#     --model_name_or_path ${output_dir} \
-#     --do_predict \
-#     --predict_with_generate \
-#     --metric_name_or_path ${metric_name_or_path} \
-#     --cache_dir ${cache_dir} \
-#     --output_dir ${output_dir} \
-#     --logging_dir ${logging_dir} \
-#     --overwrite_output_dir \
-#     --preprocessing_num_workers 4 \
-#     --per_device_train_batch_size ${per_device_train_batch_size} \
-#     --per_device_eval_batch_size ${per_device_eval_batch_size} \
-#     --gradient_accumulation_steps ${gradient_accumulation_steps} \
-#     --learning_rate ${lr} \
-#     --num_train_epochs ${num_train_epochs} \
-#     --optim adafactor \
-#     --gradient_checkpointing
+python ../run_seq2seq.py \
+    --task_name ${task_name} \
+    --test_file ${test_file} \
+    --source_column ${source_column} \
+    --target_column ${target_column} \
+    --max_source_length ${max_source_length} \
+    --max_target_length ${max_target_length} \
+    --truncation_side ${truncation_side} \
+    --model_name_or_path ${output_dir} \
+    --do_predict \
+    --predict_with_generate \
+    --metric_name_or_path ${metric_name_or_path} \
+    --cache_dir ${cache_dir} \
+    --output_dir ${output_dir} \
+    --logging_dir ${logging_dir} \
+    --overwrite_output_dir \
+    --preprocessing_num_workers 4 \
+    --per_device_train_batch_size ${per_device_train_batch_size} \
+    --per_device_eval_batch_size ${per_device_eval_batch_size} \
+    --gradient_accumulation_steps ${gradient_accumulation_steps} \
+    --learning_rate ${lr} \
+    --num_train_epochs ${num_train_epochs} \
+    --optim adafactor \
+    --gradient_checkpointing
 
-# python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/test_generated_predictions.json
+python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/test_generated_predictions.json
 
-python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/Nonepredictions.json --dataset_name tm1
+for name in ${names};
+do
+    echo "evaluating ${name}"
+    python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/${name}_predictions.json --dataset_name ${name}
+done
\ No newline at end of file
diff --git a/convlab/base_models/t5/nlu/merge_predict_res.py b/convlab/base_models/t5/nlu/merge_predict_res.py
index bdb7e404f4b348f5c5125f9b934d3509a771371b..8c522063f98e38773841554ddddfc9f476b18049 100755
--- a/convlab/base_models/t5/nlu/merge_predict_res.py
+++ b/convlab/base_models/t5/nlu/merge_predict_res.py
@@ -4,7 +4,7 @@ from convlab.util import load_dataset, load_nlu_data
 from convlab.base_models.t5.nlu.serialization import deserialize_dialogue_acts
 
 
-def merge(dataset_names, speaker, save_dir, context_window_size, predict_result):
+def merge(dataset_names, speaker, save_dir, context_window_size, predict_result, dial_ids_order):
     assert os.path.exists(predict_result)
     
     if save_dir is None:
@@ -17,14 +17,18 @@ def merge(dataset_names, speaker, save_dir, context_window_size, predict_result)
     i = 0
     for dataset_name in dataset_names.split('+'):
         print(dataset_name)
-        dataset = load_dataset(dataset_name, args.dial_ids_order)
+        single = []
+        dataset = load_dataset(dataset_name, dial_ids_order)
         data = load_nlu_data(dataset, data_split='test', speaker=speaker, use_context=context_window_size>0, context_window_size=context_window_size)['test']
         
         for sample in data:
             sample['predictions'] = {'dialogue_acts': predict_result[i]}
             i += 1
+            single.append(sample)
             merged.append(sample)
 
+        json.dump(single, open(os.path.join(save_dir, f'{dataset_name}_predictions.json'), 'w', encoding='utf-8'), indent=2, ensure_ascii=False)
+
     json.dump(merged, open(os.path.join(save_dir, 'predictions.json'), 'w', encoding='utf-8'), indent=2, ensure_ascii=False)
 
 
@@ -39,4 +43,4 @@ if __name__ == '__main__':
     parser.add_argument('--dial_ids_order', '-o', type=int, default=None, help='which data order is used for experiments')
     args = parser.parse_args()
     print(args)
-    merge(args.dataset, args.speaker, args.save_dir, args.context_window_size, args.predict_result)
+    merge(args.dataset, args.speaker, args.save_dir, args.context_window_size, args.predict_result, args.dial_ids_order)
diff --git a/convlab/base_models/t5/nlu/run_nlu_multitask.sh b/convlab/base_models/t5/nlu/run_nlu_multitask.sh
index 9e4425b4b8b677d537dc2ca46a471a5ea52c2efe..4e29168007763e44817b8c8215705864d0131a8f 100644
--- a/convlab/base_models/t5/nlu/run_nlu_multitask.sh
+++ b/convlab/base_models/t5/nlu/run_nlu_multitask.sh
@@ -91,4 +91,8 @@ python ../run_seq2seq.py \
 
 python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/test_generated_predictions.json
 
-python ../../../nlu/evaluate_unified_datasets.py -p ${output_dir}/predictions.json
+for name in ${names};
+do
+    echo "evaluating ${name}"
+    python ../../../nlu/evaluate_unified_datasets.py -p ${output_dir}/${name}_predictions.json
+done
\ No newline at end of file