Commit 3775f099 authored by zqwerty

change --adafactor to --optim adafactor

parent 824068f8
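
For context: this commit tracks an upstream HuggingFace transformers deprecation. The standalone `--adafactor` boolean flag of the Trainer CLI is deprecated (slated for removal in v5) in favor of the generic `--optim` argument, which selects the optimizer by name, so `--optim adafactor` is the supported equivalent. The diff also lowers `--save_total_limit` from 3 to 1 in the pretraining scripts, so the Trainer keeps only the most recent checkpoint on disk. A minimal sketch of the equivalence at the `TrainingArguments` level (assuming a transformers release that supports `optim`, roughly v4.17+):

from transformers import TrainingArguments

# Deprecated spelling: boolean flag, scheduled for removal in transformers v5.
args_old = TrainingArguments(output_dir="tmp", adafactor=True)

# Current spelling: generic optimizer selector, as used in this commit.
args_new = TrainingArguments(output_dir="tmp", optim="adafactor")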
Showing 20 changed files with 33 additions and 33 deletions
@@ -35,7 +35,7 @@ python -m torch.distributed.launch --master_port ${master_port} \
     --model_name_or_path ${model_name_or_path} \
     --do_train \
     --save_steps 5000 \
-    --save_total_limit 3 \
+    --save_total_limit 1 \
     --cache_dir ${cache_dir} \
     --output_dir ${output_dir} \
     --logging_dir ${logging_dir} \

@@ -35,7 +35,7 @@ python -m torch.distributed.launch --master_port ${master_port} \
     --model_name_or_path ${model_name_or_path} \
     --do_train \
     --save_steps 5000 \
-    --save_total_limit 3 \
+    --save_total_limit 1 \
     --cache_dir ${cache_dir} \
     --output_dir ${output_dir} \
     --logging_dir ${logging_dir} \

@@ -35,7 +35,7 @@ python -m torch.distributed.launch --master_port ${master_port} \
     --model_name_or_path ${model_name_or_path} \
     --do_train \
     --save_steps 5000 \
-    --save_total_limit 3 \
+    --save_total_limit 1 \
     --cache_dir ${cache_dir} \
     --output_dir ${output_dir} \
     --logging_dir ${logging_dir} \

@@ -52,7 +52,7 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --adafactor \
+    --optim adafactor \
     --gradient_checkpointing

 python ../run_seq2seq.py \
@@ -77,7 +77,7 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --adafactor \
+    --optim adafactor \
     --gradient_checkpointing

 python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json

@@ -54,7 +54,7 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --adafactor \
+    --optim adafactor \
     --gradient_checkpointing

 python ../run_seq2seq.py \
@@ -79,7 +79,7 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --adafactor \
+    --optim adafactor \
     --gradient_checkpointing

 python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json -o ${dial_ids_order}

@@ -61,7 +61,7 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --adafactor \
+    --optim adafactor \
     --gradient_checkpointing

 python ../run_seq2seq.py \
@@ -86,7 +86,7 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --adafactor \
+    --optim adafactor \
     --gradient_checkpointing

 python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json

@@ -63,5 +63,5 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --adafactor \
+    --optim adafactor \
     --gradient_checkpointing

@@ -47,5 +47,5 @@ python -m torch.distributed.launch \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --adafactor \
+    --optim adafactor \
     --gradient_checkpointing

@@ -52,7 +52,7 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --adafactor \
+    --optim adafactor \
     --gradient_checkpointing

 python ../run_seq2seq.py \
@@ -77,7 +77,7 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --adafactor \
+    --optim adafactor \
     --gradient_checkpointing

 python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json

@@ -55,7 +55,7 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --adafactor \
+    --optim adafactor \
     --gradient_checkpointing

 python ../run_seq2seq.py \
@@ -80,7 +80,7 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --adafactor \
+    --optim adafactor \
     --gradient_checkpointing

 python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json -o ${dial_ids_order}

@@ -61,7 +61,7 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --adafactor \
+    --optim adafactor \
     --gradient_checkpointing

 python ../run_seq2seq.py \
@@ -86,7 +86,7 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --adafactor \
+    --optim adafactor \
     --gradient_checkpointing

 python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json

@@ -60,5 +60,5 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --adafactor \
+    --optim adafactor \
     --gradient_checkpointing

@@ -52,7 +52,7 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --adafactor \
+    --optim adafactor \
     --gradient_checkpointing

 python ../run_seq2seq.py \
@@ -77,7 +77,7 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --adafactor \
+    --optim adafactor \
     --gradient_checkpointing

 python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json

@@ -55,7 +55,7 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --adafactor \
+    --optim adafactor \
     --gradient_checkpointing

 python ../run_seq2seq.py \
@@ -80,7 +80,7 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --adafactor \
+    --optim adafactor \
     --gradient_checkpointing

 python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json -o ${dial_ids_order}

@@ -61,7 +61,7 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --adafactor \
+    --optim adafactor \
     --gradient_checkpointing

 python ../run_seq2seq.py \
@@ -86,7 +86,7 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --adafactor \
+    --optim adafactor \
     --gradient_checkpointing

 python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json

@@ -63,5 +63,5 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --adafactor \
+    --optim adafactor \
     --gradient_checkpointing

@@ -53,7 +53,7 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --adafactor \
+    --optim adafactor \
     --gradient_checkpointing

 python ../run_seq2seq.py \
@@ -78,7 +78,7 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --adafactor \
+    --optim adafactor \
     --gradient_checkpointing

 python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json

@@ -56,7 +56,7 @@ num_train_epochs=100
 # --gradient_accumulation_steps ${gradient_accumulation_steps} \
 # --learning_rate ${lr} \
 # --num_train_epochs ${num_train_epochs} \
-# --adafactor \
+# --optim adafactor \
 # --gradient_checkpointing

 # python ../run_seq2seq.py \
@@ -81,7 +81,7 @@ num_train_epochs=100
 # --gradient_accumulation_steps ${gradient_accumulation_steps} \
 # --learning_rate ${lr} \
 # --num_train_epochs ${num_train_epochs} \
-# --adafactor \
+# --optim adafactor \
 # --gradient_checkpointing

 # python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json -o ${dial_ids_order}

@@ -53,7 +53,7 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --adafactor \
+    --optim adafactor \
     --gradient_checkpointing

 python ../run_seq2seq.py \
@@ -78,7 +78,7 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --adafactor \
+    --optim adafactor \
     --gradient_checkpointing

 python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json

@@ -56,7 +56,7 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --adafactor \
+    --optim adafactor \
     --gradient_checkpointing

 python ../run_seq2seq.py \
@@ -81,7 +81,7 @@ python ../run_seq2seq.py \
     --gradient_accumulation_steps ${gradient_accumulation_steps} \
     --learning_rate ${lr} \
     --num_train_epochs ${num_train_epochs} \
-    --adafactor \
+    --optim adafactor \
     --gradient_checkpointing

 python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json -o ${dial_ids_order}

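The same one-line substitution recurs in every script above, including the commented-out blocks. A hypothetical way to apply such a bulk edit (the commit does not record how it was actually made; this is an illustration, not the author's method):

import pathlib

# Replace the deprecated flag with the new spelling in every shell script
# under the current directory. Plain string replacement also rewrites
# commented-out lines, matching what the diff above shows.
for path in pathlib.Path(".").rglob("*.sh"):
    text = path.read_text()
    if "--adafactor" in text:
        path.write_text(text.replace("--adafactor", "--optim adafactor"))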