diff --git a/convlab/base_models/t5/dst/merge_predict_res.py b/convlab/base_models/t5/dst/merge_predict_res.py index a1445ea35525ac403e6990edb677f801eb0d6468..9b519229b5b0a941a6d4172bd1473c6c8077658a 100755 --- a/convlab/base_models/t5/dst/merge_predict_res.py +++ b/convlab/base_models/t5/dst/merge_predict_res.py @@ -4,7 +4,7 @@ from convlab.util import load_dataset, load_dst_data from convlab.base_models.t5.dst.serialization import deserialize_dialogue_state -def merge(dataset_names, speaker, save_dir, context_window_size, predict_result): +def merge(dataset_names, speaker, save_dir, context_window_size, predict_result, dial_ids_order): assert os.path.exists(predict_result) if save_dir is None: @@ -17,14 +17,18 @@ def merge(dataset_names, speaker, save_dir, context_window_size, predict_result) i = 0 for dataset_name in dataset_names.split('+'): print(dataset_name) - dataset = load_dataset(dataset_name, args.dial_ids_order) + single = [] + dataset = load_dataset(dataset_name, dial_ids_order) data = load_dst_data(dataset, data_split='test', speaker=speaker, use_context=context_window_size>0, context_window_size=context_window_size)['test'] for sample in data: sample['predictions'] = {'state': predict_result[i]} i += 1 + single.append(sample) merged.append(sample) + json.dump(single, open(os.path.join(save_dir, f'{dataset_name}_predictions.json'), 'w', encoding='utf-8'), indent=2, ensure_ascii=False) + json.dump(merged, open(os.path.join(save_dir, 'predictions.json'), 'w', encoding='utf-8'), indent=2, ensure_ascii=False) @@ -39,4 +43,4 @@ if __name__ == '__main__': parser.add_argument('--dial_ids_order', '-o', type=int, default=None, help='which data order is used for experiments') args = parser.parse_args() print(args) - merge(args.dataset, args.speaker, args.save_dir, args.context_window_size, args.predict_result) + merge(args.dataset, args.speaker, args.save_dir, args.context_window_size, args.predict_result, args.dial_ids_order) diff --git a/convlab/base_models/t5/dst/run_dst_multitask.sh b/convlab/base_models/t5/dst/run_dst_multitask.sh index 4803b521fee6b5a464e08fca93b4116466373b6b..7846e049aecf9302063517533e2810aaea9d339c 100644 --- a/convlab/base_models/t5/dst/run_dst_multitask.sh +++ b/convlab/base_models/t5/dst/run_dst_multitask.sh @@ -30,7 +30,7 @@ mkdir -p ${data_dir} for name in ${names}; do echo "preprocessing ${name}" - # python ../create_data.py -t ${task_name} -d ${name} -s ${speaker} -c ${context_window_size} + python ../create_data.py -t ${task_name} -d ${name} -s ${speaker} -c ${context_window_size} done python merge_data.py $(echo ${dataset_name} | tr "+" " ") @@ -91,4 +91,8 @@ python ../run_seq2seq.py \ python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/test_generated_predictions.json -python ../../../dst/evaluate_unified_datasets.py -p ${output_dir}/predictions.json +for name in ${names}; +do + echo "evaluating ${name}" + python ../../../dst/evaluate_unified_datasets.py -p ${output_dir}/${name}_predictions.json +done \ No newline at end of file diff --git a/convlab/base_models/t5/nlg/merge_predict_res.py b/convlab/base_models/t5/nlg/merge_predict_res.py index 9e4615cfb0ad527e12a5294f1dbdcb69ba267651..a4d1b34155fda3826b6aaa100ddf7a8f5c37c49f 100755 --- a/convlab/base_models/t5/nlg/merge_predict_res.py +++ b/convlab/base_models/t5/nlg/merge_predict_res.py @@ -3,7 +3,7 @@ import os from convlab.util import load_dataset, load_nlg_data -def merge(dataset_names, speaker, save_dir, context_window_size, predict_result): +def merge(dataset_names, speaker, save_dir, context_window_size, predict_result, dial_ids_order): assert os.path.exists(predict_result) if save_dir is None: @@ -16,7 +16,8 @@ def merge(dataset_names, speaker, save_dir, context_window_size, predict_result) i = 0 for dataset_name in dataset_names.split('+'): print(dataset_name) - dataset = load_dataset(dataset_name, args.dial_ids_order) + single = [] + dataset = load_dataset(dataset_name, dial_ids_order) data = load_nlg_data(dataset, data_split='test', speaker=speaker, use_context=context_window_size>0, context_window_size=context_window_size)['test'] for sample in data: @@ -24,23 +25,18 @@ def merge(dataset_names, speaker, save_dir, context_window_size, predict_result) continue sample['predictions'] = {'utterance': predict_result[i]} i += 1 - if args.sub_dataset: - if dataset_name == args.sub_dataset: - merged.append(sample) - else: - merged.append(sample) + single.append(sample) + merged.append(sample) - if args.sub_dataset: - json.dump(merged, open(os.path.join(save_dir, f'{args.sub_dataset}predictions.json'), 'w', encoding='utf-8'), indent=2, ensure_ascii=False) - else: - json.dump(merged, open(os.path.join(save_dir, 'predictions.json'), 'w', encoding='utf-8'), indent=2, ensure_ascii=False) + json.dump(single, open(os.path.join(save_dir, f'{dataset_name}_predictions.json'), 'w', encoding='utf-8'), indent=2, ensure_ascii=False) + + json.dump(merged, open(os.path.join(save_dir, 'predictions.json'), 'w', encoding='utf-8'), indent=2, ensure_ascii=False) if __name__ == '__main__': from argparse import ArgumentParser parser = ArgumentParser(description="merge predict results with original data for unified NLU evaluation") parser.add_argument('--dataset', '-d', metavar='dataset_name', type=str, help='name of the unified dataset') - parser.add_argument('--sub_dataset', metavar='sub dataset_name', type=str, help='name of the unified dataset') parser.add_argument('--speaker', '-s', type=str, choices=['user', 'system', 'all'], help='speaker(s) of utterances') parser.add_argument('--save_dir', type=str, help='merged data will be saved as $save_dir/predictions.json. default: on the same directory as predict_result') parser.add_argument('--context_window_size', '-c', type=int, default=0, help='how many contextual utterances are considered') @@ -48,4 +44,4 @@ if __name__ == '__main__': parser.add_argument('--dial_ids_order', '-o', type=int, default=None, help='which data order is used for experiments') args = parser.parse_args() print(args) - merge(args.dataset, args.speaker, args.save_dir, args.context_window_size, args.predict_result) + merge(args.dataset, args.speaker, args.save_dir, args.context_window_size, args.predict_result, args.dial_ids_order) diff --git a/convlab/base_models/t5/nlg/run_nlg.sh b/convlab/base_models/t5/nlg/run_nlg.sh index b1ca7bf400c8c8c232dae9054e9413a22527d9ac..718dca4a2e344fc903e20f48e75cccd46b7821ce 100644 --- a/convlab/base_models/t5/nlg/run_nlg.sh +++ b/convlab/base_models/t5/nlg/run_nlg.sh @@ -24,36 +24,36 @@ gradient_accumulation_steps=8 lr=1e-3 num_train_epochs=10 -# python ../create_data.py -t ${task_name} -d ${dataset_name} -s ${speaker} -c ${context_window_size} +python ../create_data.py -t ${task_name} -d ${dataset_name} -s ${speaker} -c ${context_window_size} -# python ../run_seq2seq.py \ -# --task_name ${task_name} \ -# --train_file ${train_file} \ -# --validation_file ${validation_file} \ -# --source_column ${source_column} \ -# --target_column ${target_column} \ -# --max_source_length ${max_source_length} \ -# --max_target_length ${max_target_length} \ -# --truncation_side ${truncation_side} \ -# --model_name_or_path ${model_name_or_path} \ -# --do_train \ -# --do_eval \ -# --save_strategy epoch \ -# --evaluation_strategy epoch \ -# --save_total_limit 1 \ -# --prediction_loss_only \ -# --cache_dir ${cache_dir} \ -# --output_dir ${output_dir} \ -# --logging_dir ${logging_dir} \ -# --overwrite_output_dir \ -# --preprocessing_num_workers 4 \ -# --per_device_train_batch_size ${per_device_train_batch_size} \ -# --per_device_eval_batch_size ${per_device_eval_batch_size} \ -# --gradient_accumulation_steps ${gradient_accumulation_steps} \ -# --learning_rate ${lr} \ -# --num_train_epochs ${num_train_epochs} \ -# --optim adafactor \ -# --gradient_checkpointing +python ../run_seq2seq.py \ + --task_name ${task_name} \ + --train_file ${train_file} \ + --validation_file ${validation_file} \ + --source_column ${source_column} \ + --target_column ${target_column} \ + --max_source_length ${max_source_length} \ + --max_target_length ${max_target_length} \ + --truncation_side ${truncation_side} \ + --model_name_or_path ${model_name_or_path} \ + --do_train \ + --do_eval \ + --save_strategy epoch \ + --evaluation_strategy epoch \ + --save_total_limit 1 \ + --prediction_loss_only \ + --cache_dir ${cache_dir} \ + --output_dir ${output_dir} \ + --logging_dir ${logging_dir} \ + --overwrite_output_dir \ + --preprocessing_num_workers 4 \ + --per_device_train_batch_size ${per_device_train_batch_size} \ + --per_device_eval_batch_size ${per_device_eval_batch_size} \ + --gradient_accumulation_steps ${gradient_accumulation_steps} \ + --learning_rate ${lr} \ + --num_train_epochs ${num_train_epochs} \ + --optim adafactor \ + --gradient_checkpointing python ../run_seq2seq.py \ --task_name ${task_name} \ diff --git a/convlab/base_models/t5/nlg/run_nlg_multitask.sh b/convlab/base_models/t5/nlg/run_nlg_multitask.sh index 6caf22391c7f0690c8f788ea37ed5d15c9415583..9556bdcc0d55e4e520c371386438f9a99c71280c 100644 --- a/convlab/base_models/t5/nlg/run_nlg_multitask.sh +++ b/convlab/base_models/t5/nlg/run_nlg_multitask.sh @@ -1,6 +1,6 @@ n_gpus=1 task_name="nlg" -dataset_name="tm1+tm2+tm3" +dataset_name="sgd+tm1+tm2+tm3+multiwoz21" speaker="system" context_window_size=0 data_dir="data/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}" @@ -25,70 +25,74 @@ lr=1e-3 num_train_epochs=10 names=$(echo ${dataset_name} | tr "+" "\n") -# rm -r ${data_dir} -# mkdir -p ${data_dir} -# for name in ${names}; -# do -# echo "preprocessing ${name}" -# python ../create_data.py -t ${task_name} -d ${name} -s ${speaker} -c ${context_window_size} -# done +rm -r ${data_dir} +mkdir -p ${data_dir} +for name in ${names}; +do + echo "preprocessing ${name}" + python ../create_data.py -t ${task_name} -d ${name} -s ${speaker} -c ${context_window_size} +done -# python merge_data.py $(echo ${dataset_name} | tr "+" " ") +python merge_data.py $(echo ${dataset_name} | tr "+" " ") -# python ../run_seq2seq.py \ -# --task_name ${task_name} \ -# --train_file ${train_file} \ -# --validation_file ${validation_file} \ -# --source_column ${source_column} \ -# --target_column ${target_column} \ -# --max_source_length ${max_source_length} \ -# --max_target_length ${max_target_length} \ -# --truncation_side ${truncation_side} \ -# --model_name_or_path ${model_name_or_path} \ -# --do_train \ -# --do_eval \ -# --save_strategy epoch \ -# --evaluation_strategy epoch \ -# --save_total_limit 1 \ -# --prediction_loss_only \ -# --cache_dir ${cache_dir} \ -# --output_dir ${output_dir} \ -# --logging_dir ${logging_dir} \ -# --overwrite_output_dir \ -# --preprocessing_num_workers 4 \ -# --per_device_train_batch_size ${per_device_train_batch_size} \ -# --per_device_eval_batch_size ${per_device_eval_batch_size} \ -# --gradient_accumulation_steps ${gradient_accumulation_steps} \ -# --learning_rate ${lr} \ -# --num_train_epochs ${num_train_epochs} \ -# --optim adafactor \ -# --gradient_checkpointing +python ../run_seq2seq.py \ + --task_name ${task_name} \ + --train_file ${train_file} \ + --validation_file ${validation_file} \ + --source_column ${source_column} \ + --target_column ${target_column} \ + --max_source_length ${max_source_length} \ + --max_target_length ${max_target_length} \ + --truncation_side ${truncation_side} \ + --model_name_or_path ${model_name_or_path} \ + --do_train \ + --do_eval \ + --save_strategy epoch \ + --evaluation_strategy epoch \ + --save_total_limit 1 \ + --prediction_loss_only \ + --cache_dir ${cache_dir} \ + --output_dir ${output_dir} \ + --logging_dir ${logging_dir} \ + --overwrite_output_dir \ + --preprocessing_num_workers 4 \ + --per_device_train_batch_size ${per_device_train_batch_size} \ + --per_device_eval_batch_size ${per_device_eval_batch_size} \ + --gradient_accumulation_steps ${gradient_accumulation_steps} \ + --learning_rate ${lr} \ + --num_train_epochs ${num_train_epochs} \ + --optim adafactor \ + --gradient_checkpointing -# python ../run_seq2seq.py \ -# --task_name ${task_name} \ -# --test_file ${test_file} \ -# --source_column ${source_column} \ -# --target_column ${target_column} \ -# --max_source_length ${max_source_length} \ -# --max_target_length ${max_target_length} \ -# --truncation_side ${truncation_side} \ -# --model_name_or_path ${output_dir} \ -# --do_predict \ -# --predict_with_generate \ -# --metric_name_or_path ${metric_name_or_path} \ -# --cache_dir ${cache_dir} \ -# --output_dir ${output_dir} \ -# --logging_dir ${logging_dir} \ -# --overwrite_output_dir \ -# --preprocessing_num_workers 4 \ -# --per_device_train_batch_size ${per_device_train_batch_size} \ -# --per_device_eval_batch_size ${per_device_eval_batch_size} \ -# --gradient_accumulation_steps ${gradient_accumulation_steps} \ -# --learning_rate ${lr} \ -# --num_train_epochs ${num_train_epochs} \ -# --optim adafactor \ -# --gradient_checkpointing +python ../run_seq2seq.py \ + --task_name ${task_name} \ + --test_file ${test_file} \ + --source_column ${source_column} \ + --target_column ${target_column} \ + --max_source_length ${max_source_length} \ + --max_target_length ${max_target_length} \ + --truncation_side ${truncation_side} \ + --model_name_or_path ${output_dir} \ + --do_predict \ + --predict_with_generate \ + --metric_name_or_path ${metric_name_or_path} \ + --cache_dir ${cache_dir} \ + --output_dir ${output_dir} \ + --logging_dir ${logging_dir} \ + --overwrite_output_dir \ + --preprocessing_num_workers 4 \ + --per_device_train_batch_size ${per_device_train_batch_size} \ + --per_device_eval_batch_size ${per_device_eval_batch_size} \ + --gradient_accumulation_steps ${gradient_accumulation_steps} \ + --learning_rate ${lr} \ + --num_train_epochs ${num_train_epochs} \ + --optim adafactor \ + --gradient_checkpointing -# python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/test_generated_predictions.json +python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/test_generated_predictions.json -python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/Nonepredictions.json --dataset_name tm1 +for name in ${names}; +do + echo "evaluating ${name}" + python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/${name}_predictions.json --dataset_name ${name} +done \ No newline at end of file diff --git a/convlab/base_models/t5/nlu/merge_predict_res.py b/convlab/base_models/t5/nlu/merge_predict_res.py index bdb7e404f4b348f5c5125f9b934d3509a771371b..8c522063f98e38773841554ddddfc9f476b18049 100755 --- a/convlab/base_models/t5/nlu/merge_predict_res.py +++ b/convlab/base_models/t5/nlu/merge_predict_res.py @@ -4,7 +4,7 @@ from convlab.util import load_dataset, load_nlu_data from convlab.base_models.t5.nlu.serialization import deserialize_dialogue_acts -def merge(dataset_names, speaker, save_dir, context_window_size, predict_result): +def merge(dataset_names, speaker, save_dir, context_window_size, predict_result, dial_ids_order): assert os.path.exists(predict_result) if save_dir is None: @@ -17,14 +17,18 @@ def merge(dataset_names, speaker, save_dir, context_window_size, predict_result) i = 0 for dataset_name in dataset_names.split('+'): print(dataset_name) - dataset = load_dataset(dataset_name, args.dial_ids_order) + single = [] + dataset = load_dataset(dataset_name, dial_ids_order) data = load_nlu_data(dataset, data_split='test', speaker=speaker, use_context=context_window_size>0, context_window_size=context_window_size)['test'] for sample in data: sample['predictions'] = {'dialogue_acts': predict_result[i]} i += 1 + single.append(sample) merged.append(sample) + json.dump(single, open(os.path.join(save_dir, f'{dataset_name}_predictions.json'), 'w', encoding='utf-8'), indent=2, ensure_ascii=False) + json.dump(merged, open(os.path.join(save_dir, 'predictions.json'), 'w', encoding='utf-8'), indent=2, ensure_ascii=False) @@ -39,4 +43,4 @@ if __name__ == '__main__': parser.add_argument('--dial_ids_order', '-o', type=int, default=None, help='which data order is used for experiments') args = parser.parse_args() print(args) - merge(args.dataset, args.speaker, args.save_dir, args.context_window_size, args.predict_result) + merge(args.dataset, args.speaker, args.save_dir, args.context_window_size, args.predict_result, args.dial_ids_order) diff --git a/convlab/base_models/t5/nlu/run_nlu_multitask.sh b/convlab/base_models/t5/nlu/run_nlu_multitask.sh index 9e4425b4b8b677d537dc2ca46a471a5ea52c2efe..4e29168007763e44817b8c8215705864d0131a8f 100644 --- a/convlab/base_models/t5/nlu/run_nlu_multitask.sh +++ b/convlab/base_models/t5/nlu/run_nlu_multitask.sh @@ -91,4 +91,8 @@ python ../run_seq2seq.py \ python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/test_generated_predictions.json -python ../../../nlu/evaluate_unified_datasets.py -p ${output_dir}/predictions.json +for name in ${names}; +do + echo "evaluating ${name}" + python ../../../nlu/evaluate_unified_datasets.py -p ${output_dir}/${name}_predictions.json +done \ No newline at end of file