Commit 5a1ed974 authored by zqwerty

add finetune suite and update pre-training scripts

parent fb70aed4
Showing 14 additions and 1210 deletions
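
# --- kvret: fine-tune the key2gen pre-trained model, few-shot setting ---
# usage: bash <script> <ratio> <dial_ids_order>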
n_gpus=1
task_name="kvret"
dataset_name="kvret"
speaker="system"
ratio=$1
dial_ids_order=$2
data_dir="data/${task_name}/key2gen_${dataset_name}_${ratio}_order${dial_ids_order}"
output_dir="output/${task_name}/key2gen_${dataset_name}_${ratio}_order${dial_ids_order}"
cache_dir="../cache"
logging_dir="${output_dir}/runs"
train_file="${data_dir}/train.json"
validation_file="${data_dir}/validation.json"
test_file="${data_dir}/test.json"
metric_name_or_path="../nlg/nlg_metric.py"
metric_for_best_model="bleu"
source_column="context+db"
target_column="response"
truncation_side="left"
max_source_length=1024
max_target_length=512
model_name_or_path="../../gpt/keyword_extraction/output/key2gen_noisy/gpt/dailydialog+metalwoz+tm1+tm2+tm3"
per_device_train_batch_size=32
per_device_eval_batch_size=64
gradient_accumulation_steps=4
lr=1e-3
num_train_epochs=100
python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -r ${ratio} -o ${dial_ids_order} --key2gen
python ../run_seq2seq.py \
--task_name ${task_name} \
--train_file ${train_file} \
--validation_file ${validation_file} \
--source_column ${source_column} \
--target_column ${target_column} \
--max_source_length ${max_source_length} \
--max_target_length ${max_target_length} \
--truncation_side ${truncation_side} \
--model_name_or_path ${model_name_or_path} \
--do_train \
--do_eval \
--save_strategy epoch \
--evaluation_strategy epoch \
--save_total_limit 1 \
--prediction_loss_only \
--load_best_model_at_end \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_train_batch_size ${per_device_train_batch_size} \
--per_device_eval_batch_size ${per_device_eval_batch_size} \
--gradient_accumulation_steps ${gradient_accumulation_steps} \
--learning_rate ${lr} \
--num_train_epochs ${num_train_epochs} \
--adafactor \
--gradient_checkpointing
python ../run_seq2seq.py \
--task_name ${task_name} \
--test_file ${test_file} \
--source_column ${source_column} \
--target_column ${target_column} \
--max_source_length ${max_source_length} \
--max_target_length ${max_target_length} \
--truncation_side ${truncation_side} \
--model_name_or_path ${output_dir} \
--do_predict \
--predict_with_generate \
--metric_name_or_path ${metric_name_or_path} \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_train_batch_size ${per_device_train_batch_size} \
--per_device_eval_batch_size ${per_device_eval_batch_size} \
--gradient_accumulation_steps ${gradient_accumulation_steps} \
--learning_rate ${lr} \
--num_train_epochs ${num_train_epochs} \
--adafactor \
--gradient_checkpointing
# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
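
# A minimal sweep driver for the few-shot script above; the file name
# finetune_key2gen_kvret_fewshot.sh and the ratio/order values are
# illustrative assumptions, not part of this commit:
# for ratio in 0.01 0.05 0.1; do
#     for dial_ids_order in 0 1 2; do
#         bash finetune_key2gen_kvret_fewshot.sh ${ratio} ${dial_ids_order}
#     done
# done

# --- kvret: fine-tune the key2gen pre-trained model, full data ---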
n_gpus=1
task_name="kvret"
dataset_name="kvret"
speaker="system"
data_dir="data/${task_name}/${dataset_name}"
output_dir="output/${task_name}/key2gen_${dataset_name}"
cache_dir="../cache"
logging_dir="${output_dir}/runs"
train_file="${data_dir}/train.json"
validation_file="${data_dir}/validation.json"
test_file="${data_dir}/test.json"
metric_name_or_path="../nlg/nlg_metric.py"
metric_for_best_model="bleu"
source_column="context+db"
target_column="response"
truncation_side="left"
max_source_length=1024
max_target_length=512
model_name_or_path="../../gpt/keyword_extraction/output/key2gen_noisy/gpt/dailydialog+metalwoz+tm1+tm2+tm3"
per_device_train_batch_size=32
per_device_eval_batch_size=64
gradient_accumulation_steps=4
lr=1e-3
num_train_epochs=10
python create_data_key2gen.py -t ${task_name} -d ${dataset_name}
python ../run_seq2seq.py \
--task_name ${task_name} \
--train_file ${train_file} \
--validation_file ${validation_file} \
--source_column ${source_column} \
--target_column ${target_column} \
--max_source_length ${max_source_length} \
--max_target_length ${max_target_length} \
--truncation_side ${truncation_side} \
--model_name_or_path ${model_name_or_path} \
--do_train \
--do_eval \
--save_strategy epoch \
--evaluation_strategy epoch \
--save_total_limit 1 \
--prediction_loss_only \
--load_best_model_at_end \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_train_batch_size ${per_device_train_batch_size} \
--per_device_eval_batch_size ${per_device_eval_batch_size} \
--gradient_accumulation_steps ${gradient_accumulation_steps} \
--learning_rate ${lr} \
--num_train_epochs ${num_train_epochs} \
--adafactor \
--gradient_checkpointing
python ../run_seq2seq.py \
--task_name ${task_name} \
--test_file ${test_file} \
--source_column ${source_column} \
--target_column ${target_column} \
--max_source_length ${max_source_length} \
--max_target_length ${max_target_length} \
--truncation_side ${truncation_side} \
--model_name_or_path ${output_dir} \
--do_predict \
--predict_with_generate \
--metric_name_or_path ${metric_name_or_path} \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_train_batch_size ${per_device_train_batch_size} \
--per_device_eval_batch_size ${per_device_eval_batch_size} \
--gradient_accumulation_steps ${gradient_accumulation_steps} \
--learning_rate ${lr} \
--num_train_epochs ${num_train_epochs} \
--adafactor \
--gradient_checkpointing
# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
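
# --- nlg: t5-small baseline, full data ---
# usage: bash <script> <dataset_name> <context_window_size>
# Data creation, training, and prediction are commented out below;
# only merging predictions and evaluation run.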
n_gpus=1
task_name="nlg"
dataset_name=$1
speaker="system"
context_window_size=$2
data_dir="data/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
output_dir="output/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
cache_dir="../cache"
logging_dir="${output_dir}/runs"
train_file="${data_dir}/train.json"
validation_file="${data_dir}/validation.json"
test_file="${data_dir}/test.json"
metric_name_or_path="../nlg/nlg_metric.py"
metric_for_best_model="bleu"
source_column="context+da"
target_column="response"
truncation_side="left"
max_source_length=512
max_target_length=512
model_name_or_path="t5-small"
per_device_train_batch_size=128
per_device_eval_batch_size=64
gradient_accumulation_steps=4
lr=1e-3
num_train_epochs=10
# python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -s ${speaker} -c ${context_window_size}
# python ../run_seq2seq.py \
# --task_name ${task_name} \
# --train_file ${train_file} \
# --validation_file ${validation_file} \
# --source_column ${source_column} \
# --target_column ${target_column} \
# --max_source_length ${max_source_length} \
# --max_target_length ${max_target_length} \
# --truncation_side ${truncation_side} \
# --model_name_or_path ${model_name_or_path} \
# --do_train \
# --do_eval \
# --save_strategy epoch \
# --evaluation_strategy epoch \
# --save_total_limit 1 \
# --prediction_loss_only \
# --load_best_model_at_end \
# --cache_dir ${cache_dir} \
# --output_dir ${output_dir} \
# --logging_dir ${logging_dir} \
# --overwrite_output_dir \
# --preprocessing_num_workers 4 \
# --per_device_train_batch_size ${per_device_train_batch_size} \
# --per_device_eval_batch_size ${per_device_eval_batch_size} \
# --gradient_accumulation_steps ${gradient_accumulation_steps} \
# --learning_rate ${lr} \
# --num_train_epochs ${num_train_epochs} \
# --adafactor \
# --gradient_checkpointing
# python ../run_seq2seq.py \
# --task_name ${task_name} \
# --test_file ${test_file} \
# --source_column ${source_column} \
# --target_column ${target_column} \
# --max_source_length ${max_source_length} \
# --max_target_length ${max_target_length} \
# --truncation_side ${truncation_side} \
# --model_name_or_path ${output_dir} \
# --do_predict \
# --predict_with_generate \
# --metric_name_or_path ${metric_name_or_path} \
# --cache_dir ${cache_dir} \
# --output_dir ${output_dir} \
# --logging_dir ${logging_dir} \
# --overwrite_output_dir \
# --preprocessing_num_workers 4 \
# --per_device_train_batch_size ${per_device_train_batch_size} \
# --per_device_eval_batch_size ${per_device_eval_batch_size} \
# --gradient_accumulation_steps ${gradient_accumulation_steps} \
# --learning_rate ${lr} \
# --num_train_epochs ${num_train_epochs} \
# --adafactor \
# --gradient_checkpointing
python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
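
# --- nlg: t5-small baseline, few-shot setting ---
# usage: bash <script> <dataset_name> <context_window_size> <ratio> <dial_ids_order>
# Data creation, training, and prediction are commented out below;
# only merging predictions and evaluation run.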
n_gpus=1
task_name="nlg"
dataset_name=$1
speaker="system"
context_window_size=$2
ratio=$3
dial_ids_order=$4
data_dir="data/${task_name}/${dataset_name}_${ratio}_order${dial_ids_order}/${speaker}/context_${context_window_size}"
output_dir="output/${task_name}/${dataset_name}_${ratio}_order${dial_ids_order}/${speaker}/context_${context_window_size}"
cache_dir="../cache"
logging_dir="${output_dir}/runs"
train_file="${data_dir}/train.json"
validation_file="${data_dir}/validation.json"
test_file="${data_dir}/test.json"
metric_name_or_path="../nlg/nlg_metric.py"
metric_for_best_model="bleu"
source_column="context+da"
target_column="response"
truncation_side="left"
max_source_length=512
max_target_length=512
model_name_or_path="t5-small"
per_device_train_batch_size=128
per_device_eval_batch_size=64
gradient_accumulation_steps=4
lr=1e-3
num_train_epochs=100
# python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -s ${speaker} -c ${context_window_size} -r ${ratio} -o ${dial_ids_order}
# python ../run_seq2seq.py \
# --task_name ${task_name} \
# --train_file ${train_file} \
# --validation_file ${validation_file} \
# --source_column ${source_column} \
# --target_column ${target_column} \
# --max_source_length ${max_source_length} \
# --max_target_length ${max_target_length} \
# --truncation_side ${truncation_side} \
# --model_name_or_path ${model_name_or_path} \
# --do_train \
# --do_eval \
# --save_strategy epoch \
# --evaluation_strategy epoch \
# --save_total_limit 1 \
# --prediction_loss_only \
# --load_best_model_at_end \
# --cache_dir ${cache_dir} \
# --output_dir ${output_dir} \
# --logging_dir ${logging_dir} \
# --overwrite_output_dir \
# --preprocessing_num_workers 4 \
# --per_device_train_batch_size ${per_device_train_batch_size} \
# --per_device_eval_batch_size ${per_device_eval_batch_size} \
# --gradient_accumulation_steps ${gradient_accumulation_steps} \
# --learning_rate ${lr} \
# --num_train_epochs ${num_train_epochs} \
# --adafactor \
# --gradient_checkpointing
# python ../run_seq2seq.py \
# --task_name ${task_name} \
# --test_file ${test_file} \
# --source_column ${source_column} \
# --target_column ${target_column} \
# --max_source_length ${max_source_length} \
# --max_target_length ${max_target_length} \
# --truncation_side ${truncation_side} \
# --model_name_or_path ${output_dir} \
# --do_predict \
# --predict_with_generate \
# --metric_name_or_path ${metric_name_or_path} \
# --cache_dir ${cache_dir} \
# --output_dir ${output_dir} \
# --logging_dir ${logging_dir} \
# --overwrite_output_dir \
# --preprocessing_num_workers 4 \
# --per_device_train_batch_size ${per_device_train_batch_size} \
# --per_device_eval_batch_size ${per_device_eval_batch_size} \
# --gradient_accumulation_steps ${gradient_accumulation_steps} \
# --learning_rate ${lr} \
# --num_train_epochs ${num_train_epochs} \
# --adafactor \
# --gradient_checkpointing
python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json -o ${dial_ids_order}
python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
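
# --- nlg: fine-tune the key2gen pre-trained model, few-shot setting ---
# usage: bash <script> <dataset_name> <context_window_size> <ratio> <dial_ids_order>
# Data creation, training, and prediction are commented out below;
# only merging predictions and evaluation run.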
n_gpus=1
task_name="nlg"
dataset_name=$1
speaker="system"
context_window_size=$2
ratio=$3
dial_ids_order=$4
data_dir="data/${task_name}/key2gen_${dataset_name}_${ratio}_order${dial_ids_order}/${speaker}/context_${context_window_size}"
output_dir="output/${task_name}/key2gen_${dataset_name}_${ratio}_order${dial_ids_order}/${speaker}/context_${context_window_size}"
cache_dir="../cache"
logging_dir="${output_dir}/runs"
train_file="${data_dir}/train.json"
validation_file="${data_dir}/validation.json"
test_file="${data_dir}/test.json"
metric_name_or_path="../nlg/nlg_metric.py"
metric_for_best_model="bleu"
source_column="context+da"
target_column="response"
truncation_side="left"
max_source_length=512
max_target_length=512
model_name_or_path="../../gpt/keyword_extraction/output/key2gen_noisy/gpt/dailydialog+metalwoz+tm1+tm2+tm3"
per_device_train_batch_size=128
per_device_eval_batch_size=64
gradient_accumulation_steps=4
lr=1e-3
num_train_epochs=100
# python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -s ${speaker} -c ${context_window_size} -r ${ratio} -o ${dial_ids_order} --key2gen
# python ../run_seq2seq.py \
# --task_name ${task_name} \
# --train_file ${train_file} \
# --validation_file ${validation_file} \
# --source_column ${source_column} \
# --target_column ${target_column} \
# --max_source_length ${max_source_length} \
# --max_target_length ${max_target_length} \
# --truncation_side ${truncation_side} \
# --model_name_or_path ${model_name_or_path} \
# --do_train \
# --do_eval \
# --save_strategy epoch \
# --evaluation_strategy epoch \
# --save_total_limit 1 \
# --prediction_loss_only \
# --load_best_model_at_end \
# --cache_dir ${cache_dir} \
# --output_dir ${output_dir} \
# --logging_dir ${logging_dir} \
# --overwrite_output_dir \
# --preprocessing_num_workers 4 \
# --per_device_train_batch_size ${per_device_train_batch_size} \
# --per_device_eval_batch_size ${per_device_eval_batch_size} \
# --gradient_accumulation_steps ${gradient_accumulation_steps} \
# --learning_rate ${lr} \
# --num_train_epochs ${num_train_epochs} \
# --adafactor \
# --gradient_checkpointing
# python ../run_seq2seq.py \
# --task_name ${task_name} \
# --test_file ${test_file} \
# --source_column ${source_column} \
# --target_column ${target_column} \
# --max_source_length ${max_source_length} \
# --max_target_length ${max_target_length} \
# --truncation_side ${truncation_side} \
# --model_name_or_path ${output_dir} \
# --do_predict \
# --predict_with_generate \
# --metric_name_or_path ${metric_name_or_path} \
# --cache_dir ${cache_dir} \
# --output_dir ${output_dir} \
# --logging_dir ${logging_dir} \
# --overwrite_output_dir \
# --preprocessing_num_workers 4 \
# --per_device_train_batch_size ${per_device_train_batch_size} \
# --per_device_eval_batch_size ${per_device_eval_batch_size} \
# --gradient_accumulation_steps ${gradient_accumulation_steps} \
# --learning_rate ${lr} \
# --num_train_epochs ${num_train_epochs} \
# --adafactor \
# --gradient_checkpointing
python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json -o ${dial_ids_order}
python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
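
# --- nlg: fine-tune the key2gen pre-trained model, full data ---
# usage: bash <script> <dataset_name> <context_window_size>
# Data creation, training, and prediction are commented out below;
# only merging predictions and evaluation run.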
n_gpus=1
task_name="nlg"
dataset_name=$1
speaker="system"
context_window_size=$2
data_dir="data/${task_name}/key2gen_${dataset_name}/${speaker}/context_${context_window_size}"
output_dir="output/${task_name}/key2gen_${dataset_name}/${speaker}/context_${context_window_size}"
cache_dir="../cache"
logging_dir="${output_dir}/runs"
train_file="${data_dir}/train.json"
validation_file="${data_dir}/validation.json"
test_file="${data_dir}/test.json"
metric_name_or_path="../nlg/nlg_metric.py"
metric_for_best_model="bleu"
source_column="context+da"
target_column="response"
truncation_side="left"
max_source_length=512
max_target_length=512
model_name_or_path="../../gpt/keyword_extraction/output/key2gen_noisy/gpt/dailydialog+metalwoz+tm1+tm2+tm3"
per_device_train_batch_size=128
per_device_eval_batch_size=64
gradient_accumulation_steps=4
lr=1e-3
num_train_epochs=10
# python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -s ${speaker} -c ${context_window_size} --key2gen
# python ../run_seq2seq.py \
# --task_name ${task_name} \
# --train_file ${train_file} \
# --validation_file ${validation_file} \
# --source_column ${source_column} \
# --target_column ${target_column} \
# --max_source_length ${max_source_length} \
# --max_target_length ${max_target_length} \
# --truncation_side ${truncation_side} \
# --model_name_or_path ${model_name_or_path} \
# --do_train \
# --do_eval \
# --save_strategy epoch \
# --evaluation_strategy epoch \
# --save_total_limit 1 \
# --prediction_loss_only \
# --load_best_model_at_end \
# --cache_dir ${cache_dir} \
# --output_dir ${output_dir} \
# --logging_dir ${logging_dir} \
# --overwrite_output_dir \
# --preprocessing_num_workers 4 \
# --per_device_train_batch_size ${per_device_train_batch_size} \
# --per_device_eval_batch_size ${per_device_eval_batch_size} \
# --gradient_accumulation_steps ${gradient_accumulation_steps} \
# --learning_rate ${lr} \
# --num_train_epochs ${num_train_epochs} \
# --adafactor \
# --gradient_checkpointing
# python ../run_seq2seq.py \
# --task_name ${task_name} \
# --test_file ${test_file} \
# --source_column ${source_column} \
# --target_column ${target_column} \
# --max_source_length ${max_source_length} \
# --max_target_length ${max_target_length} \
# --truncation_side ${truncation_side} \
# --model_name_or_path ${output_dir} \
# --do_predict \
# --predict_with_generate \
# --metric_name_or_path ${metric_name_or_path} \
# --cache_dir ${cache_dir} \
# --output_dir ${output_dir} \
# --logging_dir ${logging_dir} \
# --overwrite_output_dir \
# --preprocessing_num_workers 4 \
# --per_device_train_batch_size ${per_device_train_batch_size} \
# --per_device_eval_batch_size ${per_device_eval_batch_size} \
# --gradient_accumulation_steps ${gradient_accumulation_steps} \
# --learning_rate ${lr} \
# --num_train_epochs ${num_train_epochs} \
# --adafactor \
# --gradient_checkpointing
python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
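
# --- personachat: t5-small baseline, full data ---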
n_gpus=1
task_name="personachat"
dataset_name="personachat"
speaker="system"
data_dir="data/${task_name}/${dataset_name}"
output_dir="output/${task_name}/${dataset_name}"
cache_dir="../cache"
logging_dir="${output_dir}/runs"
train_file="${data_dir}/train.json"
validation_file="${data_dir}/validation.json"
test_file="${data_dir}/test.json"
metric_name_or_path="metric.py"
metric_for_best_model="bleu"
source_column="context+persona"
target_column="response"
truncation_side="left"
max_source_length=1024
max_target_length=512
model_name_or_path="t5-small"
per_device_train_batch_size=32
per_device_eval_batch_size=64
gradient_accumulation_steps=4
lr=1e-3
num_train_epochs=10
python create_data_key2gen.py -t ${task_name} -d ${dataset_name}
python ../run_seq2seq.py \
--task_name ${task_name} \
--train_file ${train_file} \
--validation_file ${validation_file} \
--source_column ${source_column} \
--target_column ${target_column} \
--max_source_length ${max_source_length} \
--max_target_length ${max_target_length} \
--truncation_side ${truncation_side} \
--model_name_or_path ${model_name_or_path} \
--do_train \
--do_eval \
--save_strategy epoch \
--evaluation_strategy epoch \
--save_total_limit 1 \
--prediction_loss_only \
--load_best_model_at_end \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_train_batch_size ${per_device_train_batch_size} \
--per_device_eval_batch_size ${per_device_eval_batch_size} \
--gradient_accumulation_steps ${gradient_accumulation_steps} \
--learning_rate ${lr} \
--num_train_epochs ${num_train_epochs} \
--adafactor \
--gradient_checkpointing
python ../run_seq2seq.py \
--task_name ${task_name} \
--test_file ${test_file} \
--source_column ${source_column} \
--target_column ${target_column} \
--max_source_length ${max_source_length} \
--max_target_length ${max_target_length} \
--truncation_side ${truncation_side} \
--model_name_or_path ${output_dir} \
--do_predict \
--predict_with_generate \
--metric_name_or_path ${metric_name_or_path} \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_train_batch_size ${per_device_train_batch_size} \
--per_device_eval_batch_size ${per_device_eval_batch_size} \
--gradient_accumulation_steps ${gradient_accumulation_steps} \
--learning_rate ${lr} \
--num_train_epochs ${num_train_epochs} \
--adafactor \
--gradient_checkpointing
# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
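
# --- personachat: t5-small baseline, few-shot setting ---
# usage: bash <script> <ratio> <dial_ids_order>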
n_gpus=1
task_name="personachat"
dataset_name="personachat"
speaker="system"
ratio=$1
dial_ids_order=$2
data_dir="data/${task_name}/${dataset_name}_${ratio}_order${dial_ids_order}"
output_dir="output/${task_name}/${dataset_name}_${ratio}_order${dial_ids_order}"
cache_dir="../cache"
logging_dir="${output_dir}/runs"
train_file="${data_dir}/train.json"
validation_file="${data_dir}/validation.json"
test_file="${data_dir}/test.json"
metric_name_or_path="metric.py"
metric_for_best_model="bleu"
source_column="context+persona"
target_column="response"
truncation_side="left"
max_source_length=1024
max_target_length=512
model_name_or_path="t5-small"
per_device_train_batch_size=32
per_device_eval_batch_size=64
gradient_accumulation_steps=4
lr=1e-3
num_train_epochs=100
python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -r ${ratio} -o ${dial_ids_order}
python ../run_seq2seq.py \
--task_name ${task_name} \
--train_file ${train_file} \
--validation_file ${validation_file} \
--source_column ${source_column} \
--target_column ${target_column} \
--max_source_length ${max_source_length} \
--max_target_length ${max_target_length} \
--truncation_side ${truncation_side} \
--model_name_or_path ${model_name_or_path} \
--do_train \
--do_eval \
--save_strategy epoch \
--evaluation_strategy epoch \
--save_total_limit 1 \
--prediction_loss_only \
--load_best_model_at_end \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_train_batch_size ${per_device_train_batch_size} \
--per_device_eval_batch_size ${per_device_eval_batch_size} \
--gradient_accumulation_steps ${gradient_accumulation_steps} \
--learning_rate ${lr} \
--num_train_epochs ${num_train_epochs} \
--adafactor \
--gradient_checkpointing
python ../run_seq2seq.py \
--task_name ${task_name} \
--test_file ${test_file} \
--source_column ${source_column} \
--target_column ${target_column} \
--max_source_length ${max_source_length} \
--max_target_length ${max_target_length} \
--truncation_side ${truncation_side} \
--model_name_or_path ${output_dir} \
--do_predict \
--predict_with_generate \
--metric_name_or_path ${metric_name_or_path} \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_train_batch_size ${per_device_train_batch_size} \
--per_device_eval_batch_size ${per_device_eval_batch_size} \
--gradient_accumulation_steps ${gradient_accumulation_steps} \
--learning_rate ${lr} \
--num_train_epochs ${num_train_epochs} \
--adafactor \
--gradient_checkpointing
# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
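
# --- personachat: fine-tune the key2gen pre-trained model (wikidialog in the
# pre-training mix), few-shot setting ---
# usage: bash <script> <ratio> <dial_ids_order>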
n_gpus=1
task_name="personachat"
dataset_name="personachat"
speaker="system"
ratio=$1
dial_ids_order=$2
data_dir="data/${task_name}/key2gen_${dataset_name}_${ratio}_order${dial_ids_order}"
output_dir="output/${task_name}/key2gen_${dataset_name}_${ratio}_order${dial_ids_order}"
cache_dir="../cache"
logging_dir="${output_dir}/runs"
train_file="${data_dir}/train.json"
validation_file="${data_dir}/validation.json"
test_file="${data_dir}/test.json"
metric_name_or_path="metric.py"
metric_for_best_model="bleu"
source_column="context+persona"
target_column="response"
truncation_side="left"
max_source_length=1024
max_target_length=512
model_name_or_path="../../gpt/keyword_extraction/output/key2gen_noisy/gpt/dailydialog+metalwoz+tm1+tm2+tm3+wikidialog"
per_device_train_batch_size=32
per_device_eval_batch_size=64
gradient_accumulation_steps=4
lr=1e-3
num_train_epochs=100
python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -r ${ratio} -o ${dial_ids_order} --key2gen
python ../run_seq2seq.py \
--task_name ${task_name} \
--train_file ${train_file} \
--validation_file ${validation_file} \
--source_column ${source_column} \
--target_column ${target_column} \
--max_source_length ${max_source_length} \
--max_target_length ${max_target_length} \
--truncation_side ${truncation_side} \
--model_name_or_path ${model_name_or_path} \
--do_train \
--do_eval \
--save_strategy epoch \
--evaluation_strategy epoch \
--save_total_limit 1 \
--prediction_loss_only \
--load_best_model_at_end \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_train_batch_size ${per_device_train_batch_size} \
--per_device_eval_batch_size ${per_device_eval_batch_size} \
--gradient_accumulation_steps ${gradient_accumulation_steps} \
--learning_rate ${lr} \
--num_train_epochs ${num_train_epochs} \
--adafactor \
--gradient_checkpointing
python ../run_seq2seq.py \
--task_name ${task_name} \
--test_file ${test_file} \
--source_column ${source_column} \
--target_column ${target_column} \
--max_source_length ${max_source_length} \
--max_target_length ${max_target_length} \
--truncation_side ${truncation_side} \
--model_name_or_path ${output_dir} \
--do_predict \
--predict_with_generate \
--metric_name_or_path ${metric_name_or_path} \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_train_batch_size ${per_device_train_batch_size} \
--per_device_eval_batch_size ${per_device_eval_batch_size} \
--gradient_accumulation_steps ${gradient_accumulation_steps} \
--learning_rate ${lr} \
--num_train_epochs ${num_train_epochs} \
--adafactor \
--gradient_checkpointing
# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
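
# --- personachat: fine-tune the key2gen pre-trained model (wikidialog in the
# pre-training mix), full data ---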
n_gpus=1
task_name="personachat"
dataset_name="personachat"
speaker="system"
data_dir="data/${task_name}/key2gen_${dataset_name}"
output_dir="output/${task_name}/key2gen_${dataset_name}"
cache_dir="../cache"
logging_dir="${output_dir}/runs"
train_file="${data_dir}/train.json"
validation_file="${data_dir}/validation.json"
test_file="${data_dir}/test.json"
metric_name_or_path="metric.py"
metric_for_best_model="bleu"
source_column="context+persona"
target_column="response"
truncation_side="left"
max_source_length=1024
max_target_length=512
model_name_or_path="../../gpt/keyword_extraction/output/key2gen_noisy/gpt/dailydialog+metalwoz+tm1+tm2+tm3+wikidialog"
per_device_train_batch_size=32
per_device_eval_batch_size=64
gradient_accumulation_steps=4
lr=1e-3
num_train_epochs=10
python create_data_key2gen.py -t ${task_name} -d ${dataset_name} --key2gen
python ../run_seq2seq.py \
--task_name ${task_name} \
--train_file ${train_file} \
--validation_file ${validation_file} \
--source_column ${source_column} \
--target_column ${target_column} \
--max_source_length ${max_source_length} \
--max_target_length ${max_target_length} \
--truncation_side ${truncation_side} \
--model_name_or_path ${model_name_or_path} \
--do_train \
--do_eval \
--save_strategy epoch \
--evaluation_strategy epoch \
--save_total_limit 1 \
--prediction_loss_only \
--load_best_model_at_end \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_train_batch_size ${per_device_train_batch_size} \
--per_device_eval_batch_size ${per_device_eval_batch_size} \
--gradient_accumulation_steps ${gradient_accumulation_steps} \
--learning_rate ${lr} \
--num_train_epochs ${num_train_epochs} \
--adafactor \
--gradient_checkpointing
python ../run_seq2seq.py \
--task_name ${task_name} \
--test_file ${test_file} \
--source_column ${source_column} \
--target_column ${target_column} \
--max_source_length ${max_source_length} \
--max_target_length ${max_target_length} \
--truncation_side ${truncation_side} \
--model_name_or_path ${output_dir} \
--do_predict \
--predict_with_generate \
--metric_name_or_path ${metric_name_or_path} \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_train_batch_size ${per_device_train_batch_size} \
--per_device_eval_batch_size ${per_device_eval_batch_size} \
--gradient_accumulation_steps ${gradient_accumulation_steps} \
--learning_rate ${lr} \
--num_train_epochs ${num_train_epochs} \
--adafactor \
--gradient_checkpointing
# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
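
# --- wow: t5-small baseline, full data (evaluates on test_unseen) ---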
n_gpus=1
task_name="wow"
dataset_name="wow"
speaker="system"
data_dir="data/${task_name}/${dataset_name}"
output_dir="output/${task_name}/${dataset_name}"
cache_dir="../cache"
logging_dir="${output_dir}/runs"
train_file="${data_dir}/train.json"
validation_file="${data_dir}/validation.json"
test_file="${data_dir}/test_unseen.json"
metric_name_or_path="metric.py"
metric_for_best_model="bleu"
source_column="context+knowledge"
target_column="response"
truncation_side="left"
max_source_length=1024
max_target_length=512
model_name_or_path="t5-small"
per_device_train_batch_size=32
per_device_eval_batch_size=64
gradient_accumulation_steps=4
lr=1e-3
num_train_epochs=10
python create_data_key2gen.py -t ${task_name} -d ${dataset_name}
python ../run_seq2seq.py \
--task_name ${task_name} \
--train_file ${train_file} \
--validation_file ${validation_file} \
--source_column ${source_column} \
--target_column ${target_column} \
--max_source_length ${max_source_length} \
--max_target_length ${max_target_length} \
--truncation_side ${truncation_side} \
--model_name_or_path ${model_name_or_path} \
--do_train \
--do_eval \
--save_strategy epoch \
--evaluation_strategy epoch \
--save_total_limit 1 \
--prediction_loss_only \
--load_best_model_at_end \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_train_batch_size ${per_device_train_batch_size} \
--per_device_eval_batch_size ${per_device_eval_batch_size} \
--gradient_accumulation_steps ${gradient_accumulation_steps} \
--learning_rate ${lr} \
--num_train_epochs ${num_train_epochs} \
--adafactor \
--gradient_checkpointing
python ../run_seq2seq.py \
--task_name ${task_name} \
--test_file ${test_file} \
--source_column ${source_column} \
--target_column ${target_column} \
--max_source_length ${max_source_length} \
--max_target_length ${max_target_length} \
--truncation_side ${truncation_side} \
--model_name_or_path ${output_dir} \
--do_predict \
--predict_with_generate \
--metric_name_or_path ${metric_name_or_path} \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_train_batch_size ${per_device_train_batch_size} \
--per_device_eval_batch_size ${per_device_eval_batch_size} \
--gradient_accumulation_steps ${gradient_accumulation_steps} \
--learning_rate ${lr} \
--num_train_epochs ${num_train_epochs} \
--adafactor \
--gradient_checkpointing
# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
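
# --- wow: t5-small baseline, few-shot setting (evaluates on test_unseen) ---
# usage: bash <script> <ratio> <dial_ids_order>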
n_gpus=1
task_name="wow"
dataset_name="wow"
speaker="system"
ratio=$1
dial_ids_order=$2
data_dir="data/${task_name}/${dataset_name}_${ratio}_order${dial_ids_order}"
output_dir="output/${task_name}/${dataset_name}_${ratio}_order${dial_ids_order}"
cache_dir="../cache"
logging_dir="${output_dir}/runs"
train_file="${data_dir}/train.json"
validation_file="${data_dir}/validation.json"
test_file="${data_dir}/test_unseen.json"
metric_name_or_path="metric.py"
metric_for_best_model="bleu"
source_column="context+knowledge"
target_column="response"
truncation_side="left"
max_source_length=1024
max_target_length=512
model_name_or_path="t5-small"
per_device_train_batch_size=32
per_device_eval_batch_size=64
gradient_accumulation_steps=4
lr=1e-3
num_train_epochs=100
python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -r ${ratio} -o ${dial_ids_order}
python ../run_seq2seq.py \
--task_name ${task_name} \
--train_file ${train_file} \
--validation_file ${validation_file} \
--source_column ${source_column} \
--target_column ${target_column} \
--max_source_length ${max_source_length} \
--max_target_length ${max_target_length} \
--truncation_side ${truncation_side} \
--model_name_or_path ${model_name_or_path} \
--do_train \
--do_eval \
--save_strategy epoch \
--evaluation_strategy epoch \
--save_total_limit 1 \
--prediction_loss_only \
--load_best_model_at_end \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_train_batch_size ${per_device_train_batch_size} \
--per_device_eval_batch_size ${per_device_eval_batch_size} \
--gradient_accumulation_steps ${gradient_accumulation_steps} \
--learning_rate ${lr} \
--num_train_epochs ${num_train_epochs} \
--adafactor \
--gradient_checkpointing
python ../run_seq2seq.py \
--task_name ${task_name} \
--test_file ${test_file} \
--source_column ${source_column} \
--target_column ${target_column} \
--max_source_length ${max_source_length} \
--max_target_length ${max_target_length} \
--truncation_side ${truncation_side} \
--model_name_or_path ${output_dir} \
--do_predict \
--predict_with_generate \
--metric_name_or_path ${metric_name_or_path} \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_train_batch_size ${per_device_train_batch_size} \
--per_device_eval_batch_size ${per_device_eval_batch_size} \
--gradient_accumulation_steps ${gradient_accumulation_steps} \
--learning_rate ${lr} \
--num_train_epochs ${num_train_epochs} \
--adafactor \
--gradient_checkpointing
# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
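
# --- wow: fine-tune the key2gen pre-trained model (wikidialog in the
# pre-training mix), few-shot setting (evaluates on test_unseen) ---
# usage: bash <script> <ratio> <dial_ids_order>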
n_gpus=1
task_name="wow"
dataset_name="wow"
speaker="system"
ratio=$1
dial_ids_order=$2
data_dir="data/${task_name}/key2gen_${dataset_name}_${ratio}_order${dial_ids_order}"
output_dir="output/${task_name}/key2gen_${dataset_name}_${ratio}_order${dial_ids_order}"
cache_dir="../cache"
logging_dir="${output_dir}/runs"
train_file="${data_dir}/train.json"
validation_file="${data_dir}/validation.json"
test_file="${data_dir}/test_unseen.json"
metric_name_or_path="metric.py"
metric_for_best_model="bleu"
source_column="context+knowledge"
target_column="response"
truncation_side="left"
max_source_length=1024
max_target_length=512
model_name_or_path="../../gpt/keyword_extraction/output/key2gen_noisy/gpt/dailydialog+metalwoz+tm1+tm2+tm3+wikidialog"
per_device_train_batch_size=32
per_device_eval_batch_size=64
gradient_accumulation_steps=4
lr=1e-3
num_train_epochs=100
python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -r ${ratio} -o ${dial_ids_order} --key2gen
python ../run_seq2seq.py \
--task_name ${task_name} \
--train_file ${train_file} \
--validation_file ${validation_file} \
--source_column ${source_column} \
--target_column ${target_column} \
--max_source_length ${max_source_length} \
--max_target_length ${max_target_length} \
--truncation_side ${truncation_side} \
--model_name_or_path ${model_name_or_path} \
--do_train \
--do_eval \
--save_strategy epoch \
--evaluation_strategy epoch \
--save_total_limit 1 \
--prediction_loss_only \
--load_best_model_at_end \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_train_batch_size ${per_device_train_batch_size} \
--per_device_eval_batch_size ${per_device_eval_batch_size} \
--gradient_accumulation_steps ${gradient_accumulation_steps} \
--learning_rate ${lr} \
--num_train_epochs ${num_train_epochs} \
--adafactor \
--gradient_checkpointing
python ../run_seq2seq.py \
--task_name ${task_name} \
--test_file ${test_file} \
--source_column ${source_column} \
--target_column ${target_column} \
--max_source_length ${max_source_length} \
--max_target_length ${max_target_length} \
--truncation_side ${truncation_side} \
--model_name_or_path ${output_dir} \
--do_predict \
--predict_with_generate \
--metric_name_or_path ${metric_name_or_path} \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_train_batch_size ${per_device_train_batch_size} \
--per_device_eval_batch_size ${per_device_eval_batch_size} \
--gradient_accumulation_steps ${gradient_accumulation_steps} \
--learning_rate ${lr} \
--num_train_epochs ${num_train_epochs} \
--adafactor \
--gradient_checkpointing
# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
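
# --- wow: fine-tune the key2gen pre-trained model (wikidialog in the
# pre-training mix), full data (evaluates on test_unseen) ---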
n_gpus=1
task_name="wow"
dataset_name="wow"
speaker="system"
data_dir="data/${task_name}/key2gen_${dataset_name}"
output_dir="output/${task_name}/key2gen_${dataset_name}"
cache_dir="../cache"
logging_dir="${output_dir}/runs"
train_file="${data_dir}/train.json"
validation_file="${data_dir}/validation.json"
test_file="${data_dir}/test_unseen.json"
metric_name_or_path="metric.py"
metric_for_best_model="bleu"
source_column="context+knowledge"
target_column="response"
truncation_side="left"
max_source_length=1024
max_target_length=512
model_name_or_path="../../gpt/keyword_extraction/output/key2gen_noisy/gpt/dailydialog+metalwoz+tm1+tm2+tm3+wikidialog"
per_device_train_batch_size=32
per_device_eval_batch_size=64
gradient_accumulation_steps=4
lr=1e-3
num_train_epochs=10
python create_data_key2gen.py -t ${task_name} -d ${dataset_name} --key2gen
python ../run_seq2seq.py \
--task_name ${task_name} \
--train_file ${train_file} \
--validation_file ${validation_file} \
--source_column ${source_column} \
--target_column ${target_column} \
--max_source_length ${max_source_length} \
--max_target_length ${max_target_length} \
--truncation_side ${truncation_side} \
--model_name_or_path ${model_name_or_path} \
--do_train \
--do_eval \
--save_strategy epoch \
--evaluation_strategy epoch \
--save_total_limit 1 \
--prediction_loss_only \
--load_best_model_at_end \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_train_batch_size ${per_device_train_batch_size} \
--per_device_eval_batch_size ${per_device_eval_batch_size} \
--gradient_accumulation_steps ${gradient_accumulation_steps} \
--learning_rate ${lr} \
--num_train_epochs ${num_train_epochs} \
--adafactor \
--gradient_checkpointing
python ../run_seq2seq.py \
--task_name ${task_name} \
--test_file ${test_file} \
--source_column ${source_column} \
--target_column ${target_column} \
--max_source_length ${max_source_length} \
--max_target_length ${max_target_length} \
--truncation_side ${truncation_side} \
--model_name_or_path ${output_dir} \
--do_predict \
--predict_with_generate \
--metric_name_or_path ${metric_name_or_path} \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_train_batch_size ${per_device_train_batch_size} \
--per_device_eval_batch_size ${per_device_eval_batch_size} \
--gradient_accumulation_steps ${gradient_accumulation_steps} \
--learning_rate ${lr} \
--num_train_epochs ${num_train_epochs} \
--adafactor \
--gradient_checkpointing
# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
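
The remaining hunks update the shared seq2seq training script (presumably the ../run_seq2seq.py invoked above): they add a metric_config_name data argument, pass explicit data files to load_dataset when a hub dataset is named, and forward the metric config name to load_metric.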
@@ -149,6 +149,9 @@ class DataTrainingArguments:
             "help": "An optional metric name or file to evaluate the model."
         },
     )
+    metric_config_name: Optional[str] = field(
+        default=None, metadata={"help": "The configuration name of the metric to use (via the datasets library)."}
+    )
     overwrite_cache: bool = field(
         default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
     )
@@ -317,8 +320,17 @@ def main():
     # download the dataset.
     if data_args.dataset_name is not None:
         # Downloading and loading a dataset from the hub.
+        data_files = {}
+        if data_args.train_file is not None:
+            data_files["train"] = data_args.train_file
+        if data_args.validation_file is not None:
+            data_files["validation"] = data_args.validation_file
+        if data_args.test_file is not None:
+            data_files["test"] = data_args.test_file
         raw_datasets = load_dataset(
-            data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir
+            data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir,
+            data_files=data_files if len(data_files) > 0 else None
         )
     else:
         data_files = {}
@@ -528,7 +540,7 @@ def main():
     # compute custom metric at evaluation.
     if data_args.metric_name_or_path:
-        metric = load_metric(data_args.metric_name_or_path)
+        metric = load_metric(data_args.metric_name_or_path, data_args.metric_config_name)
         # Must take a EvalPrediction and return a dictionary string to metric values.
         def compute_metrics(p: EvalPrediction):
             preds, labels = p.predictions, p.label_ids
...