diff --git a/convlab/base_models/t5/key2gen/evaluate.py b/convlab/base_models/t5/key2gen/evaluate.py
index 7acb1118cc857d4cd3e1b401b1d8ecddab2288e9..769fdfcf3d1c899aad1b5389dad2c8d9465c05c6 100644
--- a/convlab/base_models/t5/key2gen/evaluate.py
+++ b/convlab/base_models/t5/key2gen/evaluate.py
@@ -59,31 +59,26 @@ if __name__ == '__main__':
     for shot in tqdm(args.shots, desc='shots', leave=False):
         for output_dir in tqdm(args.output_dirs, desc='models', leave=False):
             model_name = output_dir.split('/')[-1]
-            if task_name == "wow":
-                test_splits = ["_seen", "_unseen"]
-            else:
-                test_splits = [""]
-            for test_split in test_splits:
-                results = []
-                for dial_ids_order in tqdm(args.dial_ids_orders, desc='dial_ids_orders', leave=False):
-                    result_dir = os.path.join(output_dir, task_name, f"{dataset_name}_{shot}shot_order{dial_ids_order}/gen{test_split}")
-                    result_file = os.path.join(result_dir, "result.json")
-                    if not os.path.exists(result_file):
-                        filename = os.path.join(output_dir, task_name, f"{dataset_name}_{shot}shot_order{dial_ids_order}/gen{test_split}/generated_predictions.json")
-                        result = evaluate(filename, metric)
-                        json.dump(result, open(result_file, 'w', encoding='utf-8'), indent=2, ensure_ascii=False)
-                    else:
-                        result = json.load(open(result_file))
-                    results.append(result)
-                res = {
-                    "dataset": f"{task_name}{test_split}-{shot}shot",
-                    "model": f"{model_name}",
-                    **avg_result(results)
-                }
-                table.append(res)
-                for k in res:
-                    if k not in fieldnames:
-                        fieldnames.append(k)
+            results = []
+            for dial_ids_order in tqdm(args.dial_ids_orders, desc='dial_ids_orders', leave=False):
+                result_dir = os.path.join(output_dir, task_name, f"{dataset_name}_{shot}shot_order{dial_ids_order}/gen")
+                result_file = os.path.join(result_dir, "result.json")
+                if not os.path.exists(result_file):
+                    filename = os.path.join(output_dir, task_name, f"{dataset_name}_{shot}shot_order{dial_ids_order}/gen/generated_predictions.json")
+                    result = evaluate(filename, metric)
+                    json.dump(result, open(result_file, 'w', encoding='utf-8'), indent=2, ensure_ascii=False)
+                else:
+                    result = json.load(open(result_file))
+                results.append(result)
+            res = {
+                "dataset": f"{task_name}-{shot}shot",
+                "model": f"{model_name}",
+                **avg_result(results)
+            }
+            table.append(res)
+            for k in res:
+                if k not in fieldnames:
+                    fieldnames.append(k)
     res = tabulate(table, headers='keys', tablefmt='github')
     with open(f'eval_results.txt', 'w', encoding='utf-8') as f:
diff --git a/convlab/base_models/t5/key2gen/finetune.sh b/convlab/base_models/t5/key2gen/finetune.sh
index 390ea1a908dcf9f335ec74d35a422239e9a923ca..8b2eb8d208966ed8f8056f01ece1b1a373033014 100644
--- a/convlab/base_models/t5/key2gen/finetune.sh
+++ b/convlab/base_models/t5/key2gen/finetune.sh
@@ -1,15 +1,24 @@
+set -e
+dataset_path=$1
+model_name=$2
+model_name_or_path=$3
+dataset_name=$4
+if [ "${dataset_name}" == "multiwoz21" ]
+then
+    task_name="nlg"
+else
+    task_name=${dataset_name}
+fi
+master_port=$5
+
 n_gpus=2
-master_port=23456
 cache_dir="../cache"
-dataset_path="dataset_vanilla.py"
 metric_name_or_path="metric.py"
 source_column="context+knowledge"
 target_column="response"
 truncation_side="left"
 max_source_length=512
 max_target_length=512
-model_name="t5-small"
-model_name_or_path="t5-small"
 per_device_train_batch_size=64
 per_device_eval_batch_size=64
 gradient_accumulation_steps=1
@@ -17,137 +26,91 @@ num_workers=16
 lr=1e-3
 num_train_epochs=100
 
-for dataset_name in multiwoz21 kvret opendialkg wow personachat
+for shot in 50 100 200
 do
-    if [ "${dataset_name}" == "multiwoz21" ]
-    then
-        task_name="nlg"
-    else
-        task_name=${dataset_name}
-    fi
-    for shot in 50 100 200
+    for dial_ids_order in 0 1 2 3 4
     do
-        for dial_ids_order in 0 1 2 3 4
-        do
-            python create_data.py -t ${task_name} -d ${dataset_name} -o ${dial_ids_order} -s ${shot}
+        python create_data.py -t ${task_name} -d ${dataset_name} -o ${dial_ids_order} -s ${shot}
 
-            data_dir="data/${task_name}/${dataset_name}_${shot}shot_order${dial_ids_order}"
-            output_dir="output/${model_name}/${task_name}/${dataset_name}_${shot}shot_order${dial_ids_order}"
-            logging_dir="${output_dir}/runs"
-            train_file="${data_dir}/train.json"
-            validation_file="${data_dir}/validation.json"
+        data_dir="data/${task_name}/${dataset_name}_${shot}shot_order${dial_ids_order}"
+        output_dir="output/${model_name}/${task_name}/${dataset_name}_${shot}shot_order${dial_ids_order}"
+        logging_dir="${output_dir}/runs"
+        train_file="${data_dir}/train.json"
+        validation_file="${data_dir}/validation.json"
 
-            # training
-            python -m torch.distributed.launch --master_port ${master_port} \
-                --nproc_per_node ${n_gpus} ../run_seq2seq.py \
-                --task_name ${task_name} \
-                --dataset_name ${dataset_path} \
-                --dataset_config_name ${task_name} \
-                --train_file ${train_file} \
-                --validation_file ${validation_file} \
-                --source_column ${source_column} \
-                --target_column ${target_column} \
-                --max_source_length ${max_source_length} \
-                --max_target_length ${max_target_length} \
-                --truncation_side ${truncation_side} \
-                --model_name_or_path ${model_name_or_path} \
-                --do_train \
-                --do_eval \
-                --save_strategy epoch \
-                --evaluation_strategy epoch \
-                --save_total_limit 1 \
-                --prediction_loss_only \
-                --load_best_model_at_end \
-                --overwrite_output_dir \
-                --cache_dir ${cache_dir} \
-                --output_dir ${output_dir} \
-                --logging_dir ${logging_dir} \
-                --preprocessing_num_workers ${num_workers} \
-                --dataloader_num_workers ${num_workers} \
-                --per_device_train_batch_size ${per_device_train_batch_size} \
-                --per_device_eval_batch_size ${per_device_eval_batch_size} \
-                --gradient_accumulation_steps ${gradient_accumulation_steps} \
-                --learning_rate ${lr} \
-                --num_train_epochs ${num_train_epochs} \
-                --optim adafactor \
-                --lr_scheduler_type constant \
-                --gradient_checkpointing
+        # training
+        python -m torch.distributed.launch --master_port ${master_port} \
+            --nproc_per_node ${n_gpus} ../run_seq2seq.py \
+            --task_name ${task_name} \
+            --dataset_name ${dataset_path} \
+            --dataset_config_name ${task_name} \
+            --train_file ${train_file} \
+            --validation_file ${validation_file} \
+            --source_column ${source_column} \
+            --target_column ${target_column} \
+            --max_source_length ${max_source_length} \
+            --max_target_length ${max_target_length} \
+            --truncation_side ${truncation_side} \
+            --model_name_or_path ${model_name_or_path} \
+            --do_train \
+            --do_eval \
+            --save_strategy epoch \
+            --evaluation_strategy epoch \
+            --save_total_limit 1 \
+            --prediction_loss_only \
+            --load_best_model_at_end \
+            --overwrite_output_dir \
+            --cache_dir ${cache_dir} \
+            --output_dir ${output_dir} \
+            --logging_dir ${logging_dir} \
+            --preprocessing_num_workers ${num_workers} \
+            --dataloader_num_workers ${num_workers} \
+            --per_device_train_batch_size ${per_device_train_batch_size} \
+            --per_device_eval_batch_size ${per_device_eval_batch_size} \
+            --gradient_accumulation_steps ${gradient_accumulation_steps} \
+            --learning_rate ${lr} \
+            --num_train_epochs ${num_train_epochs} \
+            --optim adafactor \
+            --lr_scheduler_type constant \
+            --gradient_checkpointing
 
-            # inference
-            if [ "${dataset_name}" == "wow" ]
-            then
-                for test_split in seen unseen
-                do
-                    test_file="data/${task_name}/test_${test_split}.json"
-                    gen_output_dir="${output_dir}/gen_${test_split}"
+        # inference
+        test_file="data/${task_name}/test.json"
+        gen_output_dir="${output_dir}/gen"
 
-                    python -m torch.distributed.launch --master_port ${master_port} \
-                        --nproc_per_node ${n_gpus} ../run_seq2seq.py \
-                        --task_name ${task_name} \
-                        --dataset_name ${dataset_path} \
-                        --dataset_config_name ${task_name} \
-                        --test_file ${test_file} \
-                        --source_column ${source_column} \
-                        --target_column ${target_column} \
-                        --max_source_length ${max_source_length} \
-                        --max_target_length ${max_target_length} \
-                        --truncation_side ${truncation_side} \
-                        --model_name_or_path ${output_dir} \
-                        --do_predict \
-                        --predict_with_generate \
-                        --cache_dir ${cache_dir} \
-                        --output_dir ${gen_output_dir} \
-                        --logging_dir ${logging_dir} \
-                        --overwrite_output_dir \
-                        --preprocessing_num_workers ${num_workers} \
-                        --dataloader_num_workers ${num_workers} \
-                        --per_device_train_batch_size ${per_device_train_batch_size} \
-                        --per_device_eval_batch_size ${per_device_eval_batch_size} \
-                        --gradient_accumulation_steps ${gradient_accumulation_steps} \
-                        --learning_rate ${lr} \
-                        --num_train_epochs ${num_train_epochs} \
-                        --optim adafactor \
-                        --lr_scheduler_type constant \
-                        --gradient_checkpointing
-                done
-            else
-                test_file="data/${task_name}/test.json"
-                gen_output_dir="${output_dir}/gen"
-
-                python -m torch.distributed.launch --master_port ${master_port} \
-                    --nproc_per_node ${n_gpus} ../run_seq2seq.py \
-                    --task_name ${task_name} \
-                    --dataset_name ${dataset_path} \
-                    --dataset_config_name ${task_name} \
-                    --metric_name_or_path ${metric_name_or_path} \
-                    --metric_config_name ${task_name} \
-                    --test_file ${test_file} \
-                    --source_column ${source_column} \
-                    --target_column ${target_column} \
-                    --max_source_length ${max_source_length} \
-                    --max_target_length ${max_target_length} \
-                    --truncation_side ${truncation_side} \
-                    --model_name_or_path ${output_dir} \
-                    --do_predict \
-                    --predict_with_generate \
-                    --cache_dir ${cache_dir} \
-                    --output_dir ${gen_output_dir} \
-                    --logging_dir ${logging_dir} \
-                    --overwrite_output_dir \
-                    --preprocessing_num_workers ${num_workers} \
-                    --dataloader_num_workers ${num_workers} \
-                    --per_device_train_batch_size ${per_device_train_batch_size} \
-                    --per_device_eval_batch_size ${per_device_eval_batch_size} \
-                    --gradient_accumulation_steps ${gradient_accumulation_steps} \
-                    --learning_rate ${lr} \
-                    --num_train_epochs ${num_train_epochs} \
-                    --optim adafactor \
-                    --lr_scheduler_type constant \
-                    --gradient_checkpointing
-            fi
-        done
-    done
+        python -m torch.distributed.launch --master_port ${master_port} \
+            --nproc_per_node ${n_gpus} ../run_seq2seq.py \
+            --task_name ${task_name} \
+            --dataset_name ${dataset_path} \
+            --dataset_config_name ${task_name} \
+            --metric_name_or_path ${metric_name_or_path} \
+            --metric_config_name ${task_name} \
+            --test_file ${test_file} \
+            --source_column ${source_column} \
+            --target_column ${target_column} \
+            --max_source_length ${max_source_length} \
+            --max_target_length ${max_target_length} \
+            --truncation_side ${truncation_side} \
+            --model_name_or_path ${output_dir} \
+            --do_predict \
+            --predict_with_generate \
+            --cache_dir ${cache_dir} \
+            --output_dir ${gen_output_dir} \
+            --logging_dir ${logging_dir} \
+            --overwrite_output_dir \
+            --preprocessing_num_workers ${num_workers} \
+            --dataloader_num_workers ${num_workers} \
+            --per_device_train_batch_size ${per_device_train_batch_size} \
+            --per_device_eval_batch_size ${per_device_eval_batch_size} \
+            --gradient_accumulation_steps ${gradient_accumulation_steps} \
+            --learning_rate ${lr} \
+            --num_train_epochs ${num_train_epochs} \
+            --optim adafactor \
+            --lr_scheduler_type constant \
+            --gradient_checkpointing
+    done
 done
 
 # evaluation
-python evaluate.py --output_dirs output/${model_name} -t nlg kvret opendialkg personachat wow -s 50 100 200 -o 0 1 2 3 4
\ No newline at end of file
+python evaluate.py --output_dirs output/${model_name} -t ${task_name} -s 50 100 200 -o 0 1 2 3 4
diff --git a/convlab/base_models/t5/key2gen/metric.py b/convlab/base_models/t5/key2gen/metric.py
index d3e493188194639adad04539619bb63f14284841..808934b65268ab2ae4180b9bbe64457fb5ca1b68 100644
--- a/convlab/base_models/t5/key2gen/metric.py
+++ b/convlab/base_models/t5/key2gen/metric.py
@@ -65,15 +65,6 @@ Returns:
     unigram f1: unigram overlap, from parlai
     distinct-1/2: from parlai
     other knowledge utility score: task-specific knowledge utility metrics
-
-Examples:
-
-    >>> nlg_metric = datasets.load_metric("metric.py", "nlg")
-    >>> predictions = ["hello there general kenobi", "foo bar foobar"]
-    >>> references = ["hello there kenobi", "foo bar foobar"]
-    >>> results = nlg_metric.compute(predictions=predictions, references=references)
-    >>> print(results)
-    {"bleu": 35.35533905932737}
 """
 
 re_art = re.compile(r'\b(a|an|the)\b')
@@ -325,12 +316,12 @@ def f1_score(y_pred, y_true, average="micro"):
 
     if average == "macro":
         F1_macro_score = F1_pred / float(F1_count) if F1_count != 0 else 0
-        return F1_macro_score
+        return F1_macro_score * 100
     elif average == "micro":
         P_score = TP_all / float(TP_all + FP_all) if (TP_all + FP_all) != 0 else 0
         R_score = TP_all / float(TP_all + FN_all) if (TP_all + FN_all) != 0 else 0
         F1_micro_score = _compute_F1(P_score, R_score)
-        return F1_micro_score
+        return F1_micro_score * 100
     else:
         raise ValueError("Options other than micro/macro are not supported.")
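
Note on the evaluate.py hunk: with the wow seen/unseen split removed, each model/shot pair now collapses its dial_ids_order runs through avg_result into a single table row. avg_result is defined earlier in evaluate.py and is not part of this diff; the sketch below is a hypothetical stand-in, assuming it takes the arithmetic mean of each metric key, to make the aggregation concrete.

# Hypothetical stand-in for avg_result (the real definition lives earlier
# in evaluate.py and is untouched by this diff): average every metric key
# across the result dicts collected over dial_ids_order 0..4.
def avg_result(results):
    keys = results[0].keys()
    return {k: sum(r[k] for r in results) / len(results) for k in keys}

# Example: two orders' results collapse into one row of the final table.
print(avg_result([{"bleu": 30.0}, {"bleu": 34.0}]))  # {'bleu': 32.0}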
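Note on the finetune.sh hunk: the script no longer iterates over datasets internally; it now reads five positional arguments (dataset_path, model_name, model_name_or_path, dataset_name, master_port) and derives task_name from dataset_name. A hypothetical driver that reproduces the removed five-dataset sweep, reusing the hard-coded defaults this diff deletes, could look like:

# Hypothetical driver; the literal argument values below are the former
# defaults removed from finetune.sh, not new requirements.
import subprocess

for dataset in ["multiwoz21", "kvret", "opendialkg", "wow", "personachat"]:
    subprocess.run(
        ["bash", "finetune.sh",
         "dataset_vanilla.py",  # $1 dataset_path (former default)
         "t5-small",            # $2 model_name (former default)
         "t5-small",            # $3 model_name_or_path (former default)
         dataset,               # $4 dataset_name; multiwoz21 maps to task_name=nlg
         "23456"],              # $5 master_port (former default)
        check=True,
    )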
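Note on the metric.py hunk: the only behavioral change is that f1_score now returns values on a 0-100 scale instead of 0-1, presumably so F1 lines up with the percentage-style BLEU scores this metric suite already reports. The snippet below illustrates the rescaling; its local _compute_F1 is an assumption (the standard harmonic mean) standing in for the helper metric.py actually calls.

# Illustration only; this _compute_F1 is assumed to mirror metric.py's helper.
def _compute_F1(precision, recall):
    return 2 * precision * recall / (precision + recall) if precision + recall else 0

p_score, r_score = 0.5, 0.5
print(_compute_F1(p_score, r_score) * 100)  # 50.0 -- previously returned 0.5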