diff --git a/convlab2/base_models/bert/infer_bio.sh b/convlab2/base_models/bert/infer_bio.sh
index c07f55375dac3f589845781a5cf91ba7b140557c..455c76a8256503e28f0f60842c844094e4a4ad07 100644
--- a/convlab2/base_models/bert/infer_bio.sh
+++ b/convlab2/base_models/bert/infer_bio.sh
@@ -30,8 +30,7 @@ do
         --logging_dir ${logging_dir} \
         --overwrite_output_dir \
         --preprocessing_num_workers 4 \
-        --per_device_eval_batch_size ${per_device_eval_batch_size} \
-        --debug underflow_overflow
+        --per_device_eval_batch_size ${per_device_eval_batch_size}
 
     mv ${output_dir}/predictions.json ${output_dir}/${split}.json
 done
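
The same `--debug underflow_overflow` flag is dropped from every training and inference script below. In Hugging Face `transformers`, that flag wraps the model in `DebugUnderflowOverflow`, which registers forward hooks on every submodule to catch inf/nan activations and abort the run with a per-module report; it is a debugging aid with per-batch overhead, not something to leave on in routine runs. A minimal sketch of what the flag enables (assuming a recent `transformers` release; the checkpoint name is illustrative):

    from transformers import AutoModelForSeq2SeqLM
    from transformers.debug_utils import DebugUnderflowOverflow

    model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")  # illustrative checkpoint
    # Registers forward hooks on every submodule; each batch is scanned for
    # inf/nan activations, and the run aborts with a trace of the offending
    # module when one is found. Useful for diagnosing fp16 instability,
    # pure overhead otherwise.
    debug_overflow = DebugUnderflowOverflow(model)
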
diff --git a/convlab2/base_models/bert/train_bio.sh b/convlab2/base_models/bert/train_bio.sh
index 4d6a4d8df84f5811484f8a5459cdd02d20759b4a..7b30df7515e32b670a9ad3ee99af61e465cddb7a 100644
--- a/convlab2/base_models/bert/train_bio.sh
+++ b/convlab2/base_models/bert/train_bio.sh
@@ -58,5 +58,4 @@ CUDA_VISIBLE_DEVICES=1,2 python -m torch.distributed.launch \
     --per_device_eval_batch_size ${per_device_eval_batch_size} \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
-    --num_train_epochs ${num_train_epochs} \
-    --debug underflow_overflow
+    --num_train_epochs ${num_train_epochs}
diff --git a/convlab2/base_models/gpt/keyword_extraction/test_t5_key2gen.sh b/convlab2/base_models/gpt/keyword_extraction/test_t5_key2gen.sh
index 469ec695ba681a835c7d9c51e95803c674c87d11..ac204b5d564fe0acb5fb2ac1b49d4d1d6bcad17d 100644
--- a/convlab2/base_models/gpt/keyword_extraction/test_t5_key2gen.sh
+++ b/convlab2/base_models/gpt/keyword_extraction/test_t5_key2gen.sh
@@ -45,6 +45,5 @@ python -m torch.distributed.launch \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --debug underflow_overflow \
     --adafactor \
     --gradient_checkpointing
diff --git a/convlab2/base_models/gpt/keyword_extraction/train_lm_dialogpt.sh b/convlab2/base_models/gpt/keyword_extraction/train_lm_dialogpt.sh
index 303ecb3e0c660a13e190b193c5b1769fbe70812d..f260f7071529e6837f9c7807d6d5ecf2469494a2 100644
--- a/convlab2/base_models/gpt/keyword_extraction/train_lm_dialogpt.sh
+++ b/convlab2/base_models/gpt/keyword_extraction/train_lm_dialogpt.sh
@@ -43,5 +43,4 @@ python ../run_clm.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --debug underflow_overflow \
     --gradient_checkpointing
diff --git a/convlab2/base_models/gpt/keyword_extraction/train_lm_gpt.sh b/convlab2/base_models/gpt/keyword_extraction/train_lm_gpt.sh
index fb510c880b25505e83780eeab76760e30dbccf9d..82c63a1f4c4a1633ad5e7d4a721a3bbac558cefb 100644
--- a/convlab2/base_models/gpt/keyword_extraction/train_lm_gpt.sh
+++ b/convlab2/base_models/gpt/keyword_extraction/train_lm_gpt.sh
@@ -43,5 +43,4 @@ python ../run_clm.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --debug underflow_overflow \
     --gradient_checkpointing
diff --git a/convlab2/base_models/gpt/keyword_extraction/train_t5_key2gen.sh b/convlab2/base_models/gpt/keyword_extraction/train_t5_key2gen.sh
index d92365e787e2c58fdd8b6f4a4f870053c7561f2e..2c795ecf58e331e2acbe8ada66b4cf057ed83037 100644
--- a/convlab2/base_models/gpt/keyword_extraction/train_t5_key2gen.sh
+++ b/convlab2/base_models/gpt/keyword_extraction/train_t5_key2gen.sh
@@ -52,6 +52,5 @@ python -m torch.distributed.launch \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --debug underflow_overflow \
     --adafactor \
     --gradient_checkpointing
diff --git a/convlab2/base_models/t5/dst/run_dst.sh b/convlab2/base_models/t5/dst/run_dst.sh
index c678005ef1284bcb40333ff47e9a1fbf06c90c16..2dfc622d88a9b1b38e70e15f1f5cefd2d4a78661 100644
--- a/convlab2/base_models/t5/dst/run_dst.sh
+++ b/convlab2/base_models/t5/dst/run_dst.sh
@@ -52,7 +52,6 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --debug underflow_overflow \
     --adafactor \
     --gradient_checkpointing
 
@@ -78,7 +77,6 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --debug underflow_overflow \
     --adafactor \
     --gradient_checkpointing
 
diff --git a/convlab2/base_models/t5/dst/run_dst_fewshot.sh b/convlab2/base_models/t5/dst/run_dst_fewshot.sh
index 298a37f17a1c0817ff257742b5aa6e61bb9cd5d0..d45719112e50dd44672ab52b28c04014cb5d6e5c 100644
--- a/convlab2/base_models/t5/dst/run_dst_fewshot.sh
+++ b/convlab2/base_models/t5/dst/run_dst_fewshot.sh
@@ -54,7 +54,6 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --debug underflow_overflow \
     --adafactor \
     --gradient_checkpointing
 
@@ -80,7 +79,6 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --debug underflow_overflow \
     --adafactor \
     --gradient_checkpointing
 
diff --git a/convlab2/base_models/t5/dst/run_dst_pretrain.sh b/convlab2/base_models/t5/dst/run_dst_pretrain.sh
index 119959431f07485f613de28e5f338b307c9647d6..29cc280e590ebeb700e2f60af6d1dd6f36d5741f 100644
--- a/convlab2/base_models/t5/dst/run_dst_pretrain.sh
+++ b/convlab2/base_models/t5/dst/run_dst_pretrain.sh
@@ -63,6 +63,5 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --debug underflow_overflow \
     --adafactor \
     --gradient_checkpointing
diff --git a/convlab2/base_models/t5/goal2dialogue/run_goal2dialogue.sh b/convlab2/base_models/t5/goal2dialogue/run_goal2dialogue.sh
index 09a2c33aa06fa5134dba0707e1df5e633ac9f269..0d10fe5072c238b47b2302a7b0d4cfa8ced9ed73 100644
--- a/convlab2/base_models/t5/goal2dialogue/run_goal2dialogue.sh
+++ b/convlab2/base_models/t5/goal2dialogue/run_goal2dialogue.sh
@@ -47,6 +47,5 @@ python -m torch.distributed.launch \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --debug underflow_overflow \
     --adafactor \
     --gradient_checkpointing
diff --git a/convlab2/base_models/t5/nlg/run_nlg.sh b/convlab2/base_models/t5/nlg/run_nlg.sh
index 3352e6c14f4a5c2f61690f0c32fc31b709c73a23..9de7fece68dc5d3a7721001c4d18bdf8712e4c4a 100644
--- a/convlab2/base_models/t5/nlg/run_nlg.sh
+++ b/convlab2/base_models/t5/nlg/run_nlg.sh
@@ -52,7 +52,6 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --debug underflow_overflow \
     --adafactor \
     --gradient_checkpointing
 
@@ -78,7 +77,6 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --debug underflow_overflow \
     --adafactor \
     --gradient_checkpointing
 
diff --git a/convlab2/base_models/t5/nlg/run_nlg_fewshot.sh b/convlab2/base_models/t5/nlg/run_nlg_fewshot.sh
index 45d1964a21c44c898958d32f53af7f995a53281a..6f7c8d177e014b5505267060c6d20f0113266140 100644
--- a/convlab2/base_models/t5/nlg/run_nlg_fewshot.sh
+++ b/convlab2/base_models/t5/nlg/run_nlg_fewshot.sh
@@ -55,7 +55,6 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --debug underflow_overflow \
     --adafactor \
     --gradient_checkpointing
 
@@ -81,7 +80,6 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --debug underflow_overflow \
     --adafactor \
     --gradient_checkpointing
 
diff --git a/convlab2/base_models/t5/nlg/run_nlg_pretrain.sh b/convlab2/base_models/t5/nlg/run_nlg_pretrain.sh
index 4d2b440ece35fd448277a4a50b5e3c122606a8ae..8af5dd10dfafd71a23ebd287ad1f77cc1486c787 100644
--- a/convlab2/base_models/t5/nlg/run_nlg_pretrain.sh
+++ b/convlab2/base_models/t5/nlg/run_nlg_pretrain.sh
@@ -63,6 +63,5 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --debug underflow_overflow \
     --adafactor \
     --gradient_checkpointing
diff --git a/convlab2/base_models/t5/nlu/run_nlu.sh b/convlab2/base_models/t5/nlu/run_nlu.sh
index 05671139cfd691422c924c9da880af317c916a19..fb9be0227b3cced261ed6ccbffa9857e477012a2 100644
--- a/convlab2/base_models/t5/nlu/run_nlu.sh
+++ b/convlab2/base_models/t5/nlu/run_nlu.sh
@@ -52,7 +52,6 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --debug underflow_overflow \
     --adafactor \
     --gradient_checkpointing
 
@@ -78,7 +77,6 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --debug underflow_overflow \
     --adafactor \
     --gradient_checkpointing
 
diff --git a/convlab2/base_models/t5/nlu/run_nlu_fewshot.sh b/convlab2/base_models/t5/nlu/run_nlu_fewshot.sh
index 2c783912d66281c8e44f01aafc232b4051e73f86..568c271323cf2472f7989e0cb68e9af051bcc89b 100644
--- a/convlab2/base_models/t5/nlu/run_nlu_fewshot.sh
+++ b/convlab2/base_models/t5/nlu/run_nlu_fewshot.sh
@@ -55,7 +55,6 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --debug underflow_overflow \
     --adafactor \
     --gradient_checkpointing
 
@@ -81,7 +80,6 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --debug underflow_overflow \
     --adafactor \
     --gradient_checkpointing
 
diff --git a/convlab2/base_models/t5/nlu/run_nlu_pretrain.sh b/convlab2/base_models/t5/nlu/run_nlu_pretrain.sh
index ccc7c08fcf1ebbd5d71ed800ce56b7413432631a..0f500292b54ff16381783248652f893b218bc2e8 100644
--- a/convlab2/base_models/t5/nlu/run_nlu_pretrain.sh
+++ b/convlab2/base_models/t5/nlu/run_nlu_pretrain.sh
@@ -63,6 +63,5 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --debug underflow_overflow \
     --adafactor \
     --gradient_checkpointing
diff --git a/convlab2/base_models/t5/rg/run_rg.sh b/convlab2/base_models/t5/rg/run_rg.sh
index 308c639c71ceeb8957188ab499341e8fa2229943..ff97ce5a9f63deae0ee5660c60bf111e891ef7da 100644
--- a/convlab2/base_models/t5/rg/run_rg.sh
+++ b/convlab2/base_models/t5/rg/run_rg.sh
@@ -65,6 +65,5 @@ python -m torch.distributed.launch \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --debug underflow_overflow \
     --adafactor \
     --gradient_checkpointing
diff --git a/convlab2/base_models/t5/run_seq2seq.py b/convlab2/base_models/t5/run_seq2seq.py
index c76bb5cd690001e550aef4a1ce287d007c5a066d..c702897d5c2d19d164ae00ee058718ff0dc0be96 100644
--- a/convlab2/base_models/t5/run_seq2seq.py
+++ b/convlab2/base_models/t5/run_seq2seq.py
@@ -565,7 +565,7 @@ def main():
         data_collator=data_collator,
         compute_metrics=compute_metrics if training_args.predict_with_generate else None,
     )
-    if data_args.early_stopping_patience > 0:
+    if training_args.load_best_model_at_end:
         trainer.add_callback(EarlyStoppingCallback(early_stopping_patience=data_args.early_stopping_patience))
 
     # Training
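
Unlike the shell-script hunks, this last change fixes a latent bug rather than removing a debug flag: in `transformers`, `EarlyStoppingCallback` requires `load_best_model_at_end=True` (its `on_train_begin` asserts it, since early stopping relies on the best-metric tracking that option enables), so gating the callback on `early_stopping_patience > 0` alone could register it in runs that then fail at train start. A hedged sketch of the failure mode the new guard avoids (argument values are illustrative; the assertion message may vary across `transformers` releases):

    from transformers import EarlyStoppingCallback, TrainingArguments

    args = TrainingArguments(output_dir="out")  # load_best_model_at_end defaults to False
    callback = EarlyStoppingCallback(early_stopping_patience=3)
    # on_train_begin checks args.load_best_model_at_end (and that
    # metric_for_best_model is set) before touching state or control,
    # so this raises an AssertionError:
    callback.on_train_begin(args, state=None, control=None)

Keying the guard off `training_args.load_best_model_at_end` ties callback registration to the one setting the callback actually depends on, while `data_args.early_stopping_patience` still controls how many evaluations without improvement are tolerated.
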