diff --git a/convlab/base_models/gpt/keyword_extraction/train_t5_key2gen+key2gen_noisy.sh b/convlab/base_models/gpt/keyword_extraction/train_t5_key2gen+key2gen_noisy.sh
index 878acdd6c71dc4b20a2946b6a58049f5463f0c9e..8e0b3617210408d3226bd7da9f675534c9458398 100644
--- a/convlab/base_models/gpt/keyword_extraction/train_t5_key2gen+key2gen_noisy.sh
+++ b/convlab/base_models/gpt/keyword_extraction/train_t5_key2gen+key2gen_noisy.sh
@@ -35,7 +35,7 @@ python -m torch.distributed.launch --master_port ${master_port} \
     --model_name_or_path ${model_name_or_path} \
     --do_train \
     --save_steps 5000 \
-    --save_total_limit 3 \
+    --save_total_limit 1 \
     --cache_dir ${cache_dir} \
     --output_dir ${output_dir} \
     --logging_dir ${logging_dir} \
diff --git a/convlab/base_models/gpt/keyword_extraction/train_t5_rg.sh b/convlab/base_models/gpt/keyword_extraction/train_t5_rg.sh
index b2de55410064b0234f8416b6338e2a070c79147f..8d9a019bd0fa10d63586c023705807a3eafd5ff0 100644
--- a/convlab/base_models/gpt/keyword_extraction/train_t5_rg.sh
+++ b/convlab/base_models/gpt/keyword_extraction/train_t5_rg.sh
@@ -35,7 +35,7 @@ python -m torch.distributed.launch --master_port ${master_port} \
     --model_name_or_path ${model_name_or_path} \
     --do_train \
     --save_steps 5000 \
-    --save_total_limit 3 \
+    --save_total_limit 1 \
     --cache_dir ${cache_dir} \
     --output_dir ${output_dir} \
     --logging_dir ${logging_dir} \
diff --git a/convlab/base_models/gpt/keyword_extraction/train_t5_rg_key2gen+key2gen_noisy.sh b/convlab/base_models/gpt/keyword_extraction/train_t5_rg_key2gen+key2gen_noisy.sh
index 74c418164815cfd538e17cb08cd0de7c24ba7624..75b79932bb94b0699d2e2349a4c8cb8846915cb3 100644
--- a/convlab/base_models/gpt/keyword_extraction/train_t5_rg_key2gen+key2gen_noisy.sh
+++ b/convlab/base_models/gpt/keyword_extraction/train_t5_rg_key2gen+key2gen_noisy.sh
@@ -35,7 +35,7 @@ python -m torch.distributed.launch --master_port ${master_port} \
     --model_name_or_path ${model_name_or_path} \
     --do_train \
     --save_steps 5000 \
-    --save_total_limit 3 \
+    --save_total_limit 1 \
     --cache_dir ${cache_dir} \
     --output_dir ${output_dir} \
     --logging_dir ${logging_dir} \
diff --git a/convlab/base_models/t5/dst/run_dst.sh b/convlab/base_models/t5/dst/run_dst.sh
index 0704ebf9257be910c2148d052574b535182be07e..05975400bd1ca901e1058dc80587e2cce0b0f1bb 100644
--- a/convlab/base_models/t5/dst/run_dst.sh
+++ b/convlab/base_models/t5/dst/run_dst.sh
@@ -52,7 +52,7 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --adafactor \
+    --optim adafactor \
     --gradient_checkpointing
 
 python ../run_seq2seq.py \
@@ -77,7 +77,7 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --adafactor \
+    --optim adafactor \
     --gradient_checkpointing
 
 python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
diff --git a/convlab/base_models/t5/dst/run_dst_fewshot.sh b/convlab/base_models/t5/dst/run_dst_fewshot.sh
index f548c053b544b51101f0cfbcc0b1a7b3a09c8088..4acd605706752c67d1f1df3b5fa04df13d2e46ad 100644
--- a/convlab/base_models/t5/dst/run_dst_fewshot.sh
+++ b/convlab/base_models/t5/dst/run_dst_fewshot.sh
@@ -54,7 +54,7 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --adafactor \
+    --optim adafactor \
     --gradient_checkpointing
 
 python ../run_seq2seq.py \
@@ -79,7 +79,7 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --adafactor \
+    --optim adafactor \
     --gradient_checkpointing
 
 python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json -o ${dial_ids_order}
diff --git a/convlab/base_models/t5/dst/run_dst_multitask.sh b/convlab/base_models/t5/dst/run_dst_multitask.sh
index 0f3b60a63a2f1bf861cb430a247121d966aac822..aefb1d5200db292d0e68e7d39498dfbd182d1fa0 100644
--- a/convlab/base_models/t5/dst/run_dst_multitask.sh
+++ b/convlab/base_models/t5/dst/run_dst_multitask.sh
@@ -61,7 +61,7 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --adafactor \
+    --optim adafactor \
     --gradient_checkpointing
 
 python ../run_seq2seq.py \
@@ -86,7 +86,7 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --adafactor \
+    --optim adafactor \
     --gradient_checkpointing
 
 python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
diff --git a/convlab/base_models/t5/dst/run_dst_pretrain.sh b/convlab/base_models/t5/dst/run_dst_pretrain.sh
index 29cc280e590ebeb700e2f60af6d1dd6f36d5741f..1a995f4cbd24cd6a84151844b54559e3ce332b28 100644
--- a/convlab/base_models/t5/dst/run_dst_pretrain.sh
+++ b/convlab/base_models/t5/dst/run_dst_pretrain.sh
@@ -63,5 +63,5 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --adafactor \
+    --optim adafactor \
     --gradient_checkpointing
diff --git a/convlab/base_models/t5/goal2dialogue/run_goal2dialogue.sh b/convlab/base_models/t5/goal2dialogue/run_goal2dialogue.sh
index 0d10fe5072c238b47b2302a7b0d4cfa8ced9ed73..ac0a877a146815395f392655b9ad861adbc2311e 100644
--- a/convlab/base_models/t5/goal2dialogue/run_goal2dialogue.sh
+++ b/convlab/base_models/t5/goal2dialogue/run_goal2dialogue.sh
@@ -47,5 +47,5 @@ python -m torch.distributed.launch \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --adafactor \
+    --optim adafactor \
     --gradient_checkpointing
diff --git a/convlab/base_models/t5/nlg/run_nlg.sh b/convlab/base_models/t5/nlg/run_nlg.sh
index c45079a6e5b253accb9e463407cef42ba2223272..0b5fa390dcaf98b098abc17f18026994ee54702c 100644
--- a/convlab/base_models/t5/nlg/run_nlg.sh
+++ b/convlab/base_models/t5/nlg/run_nlg.sh
@@ -52,7 +52,7 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --adafactor \
+    --optim adafactor \
     --gradient_checkpointing
 
 python ../run_seq2seq.py \
@@ -77,7 +77,7 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --adafactor \
+    --optim adafactor \
     --gradient_checkpointing
 
 python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
diff --git a/convlab/base_models/t5/nlg/run_nlg_fewshot.sh b/convlab/base_models/t5/nlg/run_nlg_fewshot.sh
index 4e00fb9dbd0b7c0f98850c300960787aafcc6909..61e50cdaa094b301660d38f74fcf8420424a7d3f 100644
--- a/convlab/base_models/t5/nlg/run_nlg_fewshot.sh
+++ b/convlab/base_models/t5/nlg/run_nlg_fewshot.sh
@@ -55,7 +55,7 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --adafactor \
+    --optim adafactor \
     --gradient_checkpointing
 
 python ../run_seq2seq.py \
@@ -80,7 +80,7 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --adafactor \
+    --optim adafactor \
     --gradient_checkpointing
 
 python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json -o ${dial_ids_order}
diff --git a/convlab/base_models/t5/nlg/run_nlg_multitask.sh b/convlab/base_models/t5/nlg/run_nlg_multitask.sh
index 9b0a3d47e68ddaffe53112b3eb6a51030902b44d..dec894aab37a37ba7923d60431fb22ef5ac4d6b6 100644
--- a/convlab/base_models/t5/nlg/run_nlg_multitask.sh
+++ b/convlab/base_models/t5/nlg/run_nlg_multitask.sh
@@ -61,7 +61,7 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --adafactor \
+    --optim adafactor \
     --gradient_checkpointing
 
 python ../run_seq2seq.py \
@@ -86,7 +86,7 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --adafactor \
+    --optim adafactor \
     --gradient_checkpointing
 
 python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
diff --git a/convlab/base_models/t5/nlg/run_nlg_pretrain.sh b/convlab/base_models/t5/nlg/run_nlg_pretrain.sh
index a1a1b6010c14953a18d3842235eb04e0f2d0fcc9..7ce91a5000cd4627d1549efe88e4d4826bfca0a1 100644
--- a/convlab/base_models/t5/nlg/run_nlg_pretrain.sh
+++ b/convlab/base_models/t5/nlg/run_nlg_pretrain.sh
@@ -60,5 +60,5 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --adafactor \
+    --optim adafactor \
     --gradient_checkpointing
diff --git a/convlab/base_models/t5/nlu/run_nlu.sh b/convlab/base_models/t5/nlu/run_nlu.sh
index 8cba74aca0510464d176aa44ef0388c914796f5f..b81b04c0f360fe55c25e55f85ff8ceac3578a99d 100644
--- a/convlab/base_models/t5/nlu/run_nlu.sh
+++ b/convlab/base_models/t5/nlu/run_nlu.sh
@@ -52,7 +52,7 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --adafactor \
+    --optim adafactor \
     --gradient_checkpointing
 
 python ../run_seq2seq.py \
@@ -77,7 +77,7 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --adafactor \
+    --optim adafactor \
     --gradient_checkpointing
 
 python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
diff --git a/convlab/base_models/t5/nlu/run_nlu_fewshot.sh b/convlab/base_models/t5/nlu/run_nlu_fewshot.sh
index 8da69801df77d8f72d23204c8cf008ea7512d10c..a966310a5bea242db413dda7b9ca12bcbda0ae43 100644
--- a/convlab/base_models/t5/nlu/run_nlu_fewshot.sh
+++ b/convlab/base_models/t5/nlu/run_nlu_fewshot.sh
@@ -55,7 +55,7 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --adafactor \
+    --optim adafactor \
     --gradient_checkpointing
 
 python ../run_seq2seq.py \
@@ -80,7 +80,7 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --adafactor \
+    --optim adafactor \
     --gradient_checkpointing
 
 python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json -o ${dial_ids_order}
diff --git a/convlab/base_models/t5/nlu/run_nlu_multitask.sh b/convlab/base_models/t5/nlu/run_nlu_multitask.sh
index 6380acff2fc5e8a2712e530823c5d0b61af451a2..b91f21e3f02270ff2f1dfa42fe8baa8f16a20acc 100644
--- a/convlab/base_models/t5/nlu/run_nlu_multitask.sh
+++ b/convlab/base_models/t5/nlu/run_nlu_multitask.sh
@@ -61,7 +61,7 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --adafactor \
+    --optim adafactor \
     --gradient_checkpointing
 
 python ../run_seq2seq.py \
@@ -86,7 +86,7 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --adafactor \
+    --optim adafactor \
     --gradient_checkpointing
 
 python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
diff --git a/convlab/base_models/t5/nlu/run_nlu_pretrain.sh b/convlab/base_models/t5/nlu/run_nlu_pretrain.sh
index 0f500292b54ff16381783248652f893b218bc2e8..c0511254f44ff39328ec5759253df5aae0a0d360 100644
--- a/convlab/base_models/t5/nlu/run_nlu_pretrain.sh
+++ b/convlab/base_models/t5/nlu/run_nlu_pretrain.sh
@@ -63,5 +63,5 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --adafactor \
+    --optim adafactor \
     --gradient_checkpointing
diff --git a/convlab/base_models/t5/nlu/run_retnlu.sh b/convlab/base_models/t5/nlu/run_retnlu.sh
index b45a0e45643fd5a2247633305df7e0c1f11ce848..fd44e063dc84da86e4f77ead69b0e329ac0cc7d1 100644
--- a/convlab/base_models/t5/nlu/run_retnlu.sh
+++ b/convlab/base_models/t5/nlu/run_retnlu.sh
@@ -53,7 +53,7 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --adafactor \
+    --optim adafactor \
     --gradient_checkpointing
 
 python ../run_seq2seq.py \
@@ -78,7 +78,7 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --adafactor \
+    --optim adafactor \
     --gradient_checkpointing
 
 python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
diff --git a/convlab/base_models/t5/nlu/run_retnlu_fewshot.sh b/convlab/base_models/t5/nlu/run_retnlu_fewshot.sh
index d165859b01485b7885f88e5b1ae3a279e41f4caf..e778c80bdc844dfea732421e9234e8965e20d987 100644
--- a/convlab/base_models/t5/nlu/run_retnlu_fewshot.sh
+++ b/convlab/base_models/t5/nlu/run_retnlu_fewshot.sh
@@ -56,7 +56,7 @@ num_train_epochs=100
 #     --gradient_accumulation_steps ${gradient_accumulation_steps} \
 #     --learning_rate ${lr} \
 #     --num_train_epochs ${num_train_epochs} \
-#     --adafactor \
+#     --optim adafactor \
 #     --gradient_checkpointing
 
 # python ../run_seq2seq.py \
@@ -81,7 +81,7 @@ num_train_epochs=100
 #     --gradient_accumulation_steps ${gradient_accumulation_steps} \
 #     --learning_rate ${lr} \
 #     --num_train_epochs ${num_train_epochs} \
-#     --adafactor \
+#     --optim adafactor \
 #     --gradient_checkpointing
 
 # python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json -o ${dial_ids_order}
diff --git a/convlab/base_models/t5/nlu/run_retnlu_in_context.sh b/convlab/base_models/t5/nlu/run_retnlu_in_context.sh
index 82dae873ebb419d1d311f347a813f1da6071dccb..775b4b06ed35f82610466ca96e518e95eb9b86f8 100644
--- a/convlab/base_models/t5/nlu/run_retnlu_in_context.sh
+++ b/convlab/base_models/t5/nlu/run_retnlu_in_context.sh
@@ -53,7 +53,7 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --adafactor \
+    --optim adafactor \
     --gradient_checkpointing
 
 python ../run_seq2seq.py \
@@ -78,7 +78,7 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --adafactor \
+    --optim adafactor \
     --gradient_checkpointing
 
 python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
diff --git a/convlab/base_models/t5/nlu/run_retnlu_in_context_fewshot.sh b/convlab/base_models/t5/nlu/run_retnlu_in_context_fewshot.sh
index 836152f80e9d21695aaadde0016aa7399eedbdf2..913ef7cbad5fae0b3092c29fe0cd5f44604c333d 100644
--- a/convlab/base_models/t5/nlu/run_retnlu_in_context_fewshot.sh
+++ b/convlab/base_models/t5/nlu/run_retnlu_in_context_fewshot.sh
@@ -56,7 +56,7 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --adafactor \
+    --optim adafactor \
     --gradient_checkpointing
 
 python ../run_seq2seq.py \
@@ -81,7 +81,7 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --adafactor \
+    --optim adafactor \
     --gradient_checkpointing
 
 python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json -o ${dial_ids_order}
diff --git a/convlab/base_models/t5/rg/run_rg.sh b/convlab/base_models/t5/rg/run_rg.sh
index ff97ce5a9f63deae0ee5660c60bf111e891ef7da..976e862e40c32d1a2f78c77564e577af5edcccfa 100644
--- a/convlab/base_models/t5/rg/run_rg.sh
+++ b/convlab/base_models/t5/rg/run_rg.sh
@@ -65,5 +65,5 @@ python -m torch.distributed.launch \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --adafactor \
+    --optim adafactor \
     --gradient_checkpointing
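
Note on the two substitutions above: HuggingFace Transformers deprecated the boolean `--adafactor` Trainer flag in favor of selecting the optimizer by name via `--optim adafactor`, and lowering `--save_total_limit` from 3 to 1 keeps only the most recent checkpoint on disk. A minimal Python sketch of the equivalent programmatic configuration follows; the `output_dir` value is a placeholder, and the scripts above pass these same settings on the command line to run_seq2seq.py:

    # Sketch: TrainingArguments equivalent of the changed flags.
    # optim="adafactor" replaces the deprecated adafactor=True field;
    # save_total_limit=1 retains only the newest checkpoint.
    from transformers import TrainingArguments

    args = TrainingArguments(
        output_dir="output/t5",       # placeholder path
        do_train=True,
        save_steps=5000,
        save_total_limit=1,
        optim="adafactor",
        gradient_checkpointing=True,
    )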