From 96c469b984ee10cc371a59dafbc3004baea3ab38 Mon Sep 17 00:00:00 2001
From: zqwerty <zhuq96@hotmail.com>
Date: Wed, 11 May 2022 16:14:10 +0800
Subject: [PATCH] run_seq2seq will use early stopping (patience=10 epochs) when
 the load_best_model_at_end param is set

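Tie early stopping to the load_best_model_at_end training argument instead
of a standalone patience check. This matches the Hugging Face Trainer
contract: EarlyStoppingCallback requires load_best_model_at_end=True
(together with metric_for_best_model and a save strategy matching the
evaluation strategy) so that the best checkpoint can be restored once
training stops. Also remove the leftover --debug underflow_overflow flag
from the training/inference scripts.

A minimal sketch of the new gating, outside the repo (output_dir and the
tracked metric below are illustrative placeholders, not this project's
actual config):

    from transformers import EarlyStoppingCallback, Seq2SeqTrainingArguments

    training_args = Seq2SeqTrainingArguments(
        output_dir="output",             # placeholder path
        evaluation_strategy="epoch",     # patience is counted in eval rounds
        save_strategy="epoch",           # must match evaluation_strategy
        load_best_model_at_end=True,     # the flag that now enables early stopping
        metric_for_best_model="loss",    # metric tracked by EarlyStoppingCallback
    )

    callbacks = []
    if training_args.load_best_model_at_end:
        # With per-epoch evaluation, patience=10 stops training after 10
        # consecutive epochs without improvement on the tracked metric.
        callbacks.append(EarlyStoppingCallback(early_stopping_patience=10))

In run_seq2seq.py the patience value still comes from
data_args.early_stopping_patience, which is the "patience=10 epochs"
referred to in the subject.
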
---
 convlab2/base_models/bert/infer_bio.sh                         | 3 +--
 convlab2/base_models/bert/train_bio.sh                         | 3 +--
 convlab2/base_models/gpt/keyword_extraction/test_t5_key2gen.sh | 1 -
 .../base_models/gpt/keyword_extraction/train_lm_dialogpt.sh    | 1 -
 convlab2/base_models/gpt/keyword_extraction/train_lm_gpt.sh    | 1 -
 .../base_models/gpt/keyword_extraction/train_t5_key2gen.sh     | 1 -
 convlab2/base_models/t5/dst/run_dst.sh                         | 2 --
 convlab2/base_models/t5/dst/run_dst_fewshot.sh                 | 2 --
 convlab2/base_models/t5/dst/run_dst_pretrain.sh                | 1 -
 convlab2/base_models/t5/goal2dialogue/run_goal2dialogue.sh     | 1 -
 convlab2/base_models/t5/nlg/run_nlg.sh                         | 2 --
 convlab2/base_models/t5/nlg/run_nlg_fewshot.sh                 | 2 --
 convlab2/base_models/t5/nlg/run_nlg_pretrain.sh                | 1 -
 convlab2/base_models/t5/nlu/run_nlu.sh                         | 2 --
 convlab2/base_models/t5/nlu/run_nlu_fewshot.sh                 | 2 --
 convlab2/base_models/t5/nlu/run_nlu_pretrain.sh                | 1 -
 convlab2/base_models/t5/rg/run_rg.sh                           | 1 -
 convlab2/base_models/t5/run_seq2seq.py                         | 2 +-
 18 files changed, 3 insertions(+), 26 deletions(-)

diff --git a/convlab2/base_models/bert/infer_bio.sh b/convlab2/base_models/bert/infer_bio.sh
index c07f5537..455c76a8 100644
--- a/convlab2/base_models/bert/infer_bio.sh
+++ b/convlab2/base_models/bert/infer_bio.sh
@@ -30,8 +30,7 @@ do
         --logging_dir ${logging_dir} \
         --overwrite_output_dir \
         --preprocessing_num_workers 4 \
-        --per_device_eval_batch_size ${per_device_eval_batch_size} \
-        --debug underflow_overflow
+        --per_device_eval_batch_size ${per_device_eval_batch_size}
 
     mv ${output_dir}/predictions.json ${output_dir}/${split}.json
 done
diff --git a/convlab2/base_models/bert/train_bio.sh b/convlab2/base_models/bert/train_bio.sh
index 4d6a4d8d..7b30df75 100644
--- a/convlab2/base_models/bert/train_bio.sh
+++ b/convlab2/base_models/bert/train_bio.sh
@@ -58,5 +58,4 @@ CUDA_VISIBLE_DEVICES=1,2 python -m torch.distributed.launch \
     --per_device_eval_batch_size ${per_device_eval_batch_size} \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
-    --num_train_epochs ${num_train_epochs} \
-    --debug underflow_overflow
+    --num_train_epochs ${num_train_epochs}
diff --git a/convlab2/base_models/gpt/keyword_extraction/test_t5_key2gen.sh b/convlab2/base_models/gpt/keyword_extraction/test_t5_key2gen.sh
index 469ec695..ac204b5d 100644
--- a/convlab2/base_models/gpt/keyword_extraction/test_t5_key2gen.sh
+++ b/convlab2/base_models/gpt/keyword_extraction/test_t5_key2gen.sh
@@ -45,6 +45,5 @@ python -m torch.distributed.launch \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --debug underflow_overflow \
     --adafactor \
     --gradient_checkpointing
diff --git a/convlab2/base_models/gpt/keyword_extraction/train_lm_dialogpt.sh b/convlab2/base_models/gpt/keyword_extraction/train_lm_dialogpt.sh
index 303ecb3e..f260f707 100644
--- a/convlab2/base_models/gpt/keyword_extraction/train_lm_dialogpt.sh
+++ b/convlab2/base_models/gpt/keyword_extraction/train_lm_dialogpt.sh
@@ -43,5 +43,4 @@ python ../run_clm.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --debug underflow_overflow \
     --gradient_checkpointing
diff --git a/convlab2/base_models/gpt/keyword_extraction/train_lm_gpt.sh b/convlab2/base_models/gpt/keyword_extraction/train_lm_gpt.sh
index fb510c88..82c63a1f 100644
--- a/convlab2/base_models/gpt/keyword_extraction/train_lm_gpt.sh
+++ b/convlab2/base_models/gpt/keyword_extraction/train_lm_gpt.sh
@@ -43,5 +43,4 @@ python ../run_clm.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --debug underflow_overflow \
     --gradient_checkpointing
diff --git a/convlab2/base_models/gpt/keyword_extraction/train_t5_key2gen.sh b/convlab2/base_models/gpt/keyword_extraction/train_t5_key2gen.sh
index d92365e7..2c795ecf 100644
--- a/convlab2/base_models/gpt/keyword_extraction/train_t5_key2gen.sh
+++ b/convlab2/base_models/gpt/keyword_extraction/train_t5_key2gen.sh
@@ -52,6 +52,5 @@ python -m torch.distributed.launch \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --debug underflow_overflow \
     --adafactor \
     --gradient_checkpointing
diff --git a/convlab2/base_models/t5/dst/run_dst.sh b/convlab2/base_models/t5/dst/run_dst.sh
index c678005e..2dfc622d 100644
--- a/convlab2/base_models/t5/dst/run_dst.sh
+++ b/convlab2/base_models/t5/dst/run_dst.sh
@@ -52,7 +52,6 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --debug underflow_overflow \
     --adafactor \
     --gradient_checkpointing
 
@@ -78,7 +77,6 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --debug underflow_overflow \
     --adafactor \
     --gradient_checkpointing
 
diff --git a/convlab2/base_models/t5/dst/run_dst_fewshot.sh b/convlab2/base_models/t5/dst/run_dst_fewshot.sh
index 298a37f1..d4571911 100644
--- a/convlab2/base_models/t5/dst/run_dst_fewshot.sh
+++ b/convlab2/base_models/t5/dst/run_dst_fewshot.sh
@@ -54,7 +54,6 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --debug underflow_overflow \
     --adafactor \
     --gradient_checkpointing
 
@@ -80,7 +79,6 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --debug underflow_overflow \
     --adafactor \
     --gradient_checkpointing
 
diff --git a/convlab2/base_models/t5/dst/run_dst_pretrain.sh b/convlab2/base_models/t5/dst/run_dst_pretrain.sh
index 11995943..29cc280e 100644
--- a/convlab2/base_models/t5/dst/run_dst_pretrain.sh
+++ b/convlab2/base_models/t5/dst/run_dst_pretrain.sh
@@ -63,6 +63,5 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --debug underflow_overflow \
     --adafactor \
     --gradient_checkpointing
diff --git a/convlab2/base_models/t5/goal2dialogue/run_goal2dialogue.sh b/convlab2/base_models/t5/goal2dialogue/run_goal2dialogue.sh
index 09a2c33a..0d10fe50 100644
--- a/convlab2/base_models/t5/goal2dialogue/run_goal2dialogue.sh
+++ b/convlab2/base_models/t5/goal2dialogue/run_goal2dialogue.sh
@@ -47,6 +47,5 @@ python -m torch.distributed.launch \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --debug underflow_overflow \
     --adafactor \
     --gradient_checkpointing
diff --git a/convlab2/base_models/t5/nlg/run_nlg.sh b/convlab2/base_models/t5/nlg/run_nlg.sh
index 3352e6c1..9de7fece 100644
--- a/convlab2/base_models/t5/nlg/run_nlg.sh
+++ b/convlab2/base_models/t5/nlg/run_nlg.sh
@@ -52,7 +52,6 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --debug underflow_overflow \
     --adafactor \
     --gradient_checkpointing
 
@@ -78,7 +77,6 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --debug underflow_overflow \
     --adafactor \
     --gradient_checkpointing
 
diff --git a/convlab2/base_models/t5/nlg/run_nlg_fewshot.sh b/convlab2/base_models/t5/nlg/run_nlg_fewshot.sh
index 45d1964a..6f7c8d17 100644
--- a/convlab2/base_models/t5/nlg/run_nlg_fewshot.sh
+++ b/convlab2/base_models/t5/nlg/run_nlg_fewshot.sh
@@ -55,7 +55,6 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --debug underflow_overflow \
     --adafactor \
     --gradient_checkpointing
 
@@ -81,7 +80,6 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --debug underflow_overflow \
     --adafactor \
     --gradient_checkpointing
 
diff --git a/convlab2/base_models/t5/nlg/run_nlg_pretrain.sh b/convlab2/base_models/t5/nlg/run_nlg_pretrain.sh
index 4d2b440e..8af5dd10 100644
--- a/convlab2/base_models/t5/nlg/run_nlg_pretrain.sh
+++ b/convlab2/base_models/t5/nlg/run_nlg_pretrain.sh
@@ -63,6 +63,5 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --debug underflow_overflow \
     --adafactor \
     --gradient_checkpointing
diff --git a/convlab2/base_models/t5/nlu/run_nlu.sh b/convlab2/base_models/t5/nlu/run_nlu.sh
index 05671139..fb9be022 100644
--- a/convlab2/base_models/t5/nlu/run_nlu.sh
+++ b/convlab2/base_models/t5/nlu/run_nlu.sh
@@ -52,7 +52,6 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --debug underflow_overflow \
     --adafactor \
     --gradient_checkpointing
 
@@ -78,7 +77,6 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --debug underflow_overflow \
     --adafactor \
     --gradient_checkpointing
 
diff --git a/convlab2/base_models/t5/nlu/run_nlu_fewshot.sh b/convlab2/base_models/t5/nlu/run_nlu_fewshot.sh
index 2c783912..568c2713 100644
--- a/convlab2/base_models/t5/nlu/run_nlu_fewshot.sh
+++ b/convlab2/base_models/t5/nlu/run_nlu_fewshot.sh
@@ -55,7 +55,6 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --debug underflow_overflow \
     --adafactor \
     --gradient_checkpointing
 
@@ -81,7 +80,6 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --debug underflow_overflow \
     --adafactor \
     --gradient_checkpointing
 
diff --git a/convlab2/base_models/t5/nlu/run_nlu_pretrain.sh b/convlab2/base_models/t5/nlu/run_nlu_pretrain.sh
index ccc7c08f..0f500292 100644
--- a/convlab2/base_models/t5/nlu/run_nlu_pretrain.sh
+++ b/convlab2/base_models/t5/nlu/run_nlu_pretrain.sh
@@ -63,6 +63,5 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --debug underflow_overflow \
     --adafactor \
     --gradient_checkpointing
diff --git a/convlab2/base_models/t5/rg/run_rg.sh b/convlab2/base_models/t5/rg/run_rg.sh
index 308c639c..ff97ce5a 100644
--- a/convlab2/base_models/t5/rg/run_rg.sh
+++ b/convlab2/base_models/t5/rg/run_rg.sh
@@ -65,6 +65,5 @@ python -m torch.distributed.launch \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --debug underflow_overflow \
     --adafactor \
     --gradient_checkpointing
diff --git a/convlab2/base_models/t5/run_seq2seq.py b/convlab2/base_models/t5/run_seq2seq.py
index c76bb5cd..c702897d 100644
--- a/convlab2/base_models/t5/run_seq2seq.py
+++ b/convlab2/base_models/t5/run_seq2seq.py
@@ -565,7 +565,7 @@ def main():
         data_collator=data_collator,
         compute_metrics=compute_metrics if training_args.predict_with_generate else None,
     )
-    if data_args.early_stopping_patience > 0:
+    if training_args.load_best_model_at_end:
         trainer.add_callback(EarlyStoppingCallback(early_stopping_patience=data_args.early_stopping_patience))
 
     # Training
-- 
GitLab