diff --git a/convlab2/base_models/t5/create_data.py b/convlab2/base_models/t5/create_data.py
index 19be0b81520cf4077ac34166e7b2e7a0d12f80a3..77c817b3ebf3c8f53a101c4f1d270b03c592988f 100644
--- a/convlab2/base_models/t5/create_data.py
+++ b/convlab2/base_models/t5/create_data.py
@@ -11,9 +11,6 @@ def create_rg_data(dataset, data_dir, args):
     os.makedirs(data_dir, exist_ok=True)

     data_splits = data_by_split.keys()
-    file_name = os.path.join(data_dir, f"source_prefix.txt")
-    with open(file_name, "w") as f:
-        f.write("generate a system response according to the context: ")
     for data_split in data_splits:
         data = []
         for sample in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False):
@@ -31,9 +28,6 @@ def create_nlu_data(dataset, data_dir, args):
     os.makedirs(data_dir, exist_ok=True)

     data_splits = data_by_split.keys()
-    file_name = os.path.join(data_dir, f"source_prefix.txt")
-    with open(file_name, "w") as f:
-        f.write("parse the dialogue action of the last utterance: ")
     for data_split in data_splits:
         data = []
         for sample in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False):
@@ -55,9 +49,6 @@ def create_goal2dialogue_data(dataset, data_dir, args):
     os.makedirs(data_dir, exist_ok=True)

     data_splits = data_by_split.keys()
-    file_name = os.path.join(data_dir, f"source_prefix.txt")
-    with open(file_name, "w") as f:
-        f.write("generate a dialogue between user and system according to the user goal: ")
     for data_split in data_splits:
         data = []
         for sample in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False):
diff --git a/convlab2/base_models/t5/goal2dialogue/run_goal2dialogue.sh b/convlab2/base_models/t5/goal2dialogue/run_goal2dialogue.sh
index a60f77c2bbd9b2660b5616dcad8c9a162f728f12..09a2c33aa06fa5134dba0707e1df5e633ac9f269 100644
--- a/convlab2/base_models/t5/goal2dialogue/run_goal2dialogue.sh
+++ b/convlab2/base_models/t5/goal2dialogue/run_goal2dialogue.sh
@@ -8,7 +8,6 @@ logging_dir="${output_dir}/runs"
 train_file="${data_dir}/train.json"
 validation_file="${data_dir}/validation.json"
 test_file="${data_dir}/test.json"
-source_prefix="${data_dir}/source_prefix.txt"
 source_column="goal"
 target_column="dialogue"
 max_target_length=1024
@@ -30,7 +29,6 @@ python -m torch.distributed.launch \
     --source_column ${source_column} \
     --target_column ${target_column} \
     --max_target_length ${max_target_length} \
-    --source_prefix ${source_prefix} \
     --model_name_or_path ${model_name_or_path} \
     --do_train \
     --do_eval \
diff --git a/convlab2/base_models/t5/nlu/run_multiwoz21_user.sh b/convlab2/base_models/t5/nlu/run_multiwoz21_user.sh
index 85f3ec8302d161b29ba71b760a56d0f64a6b4dfc..12e9c9c32e784c85940831d76a9cd7e6f1f124a2 100644
--- a/convlab2/base_models/t5/nlu/run_multiwoz21_user.sh
+++ b/convlab2/base_models/t5/nlu/run_multiwoz21_user.sh
@@ -12,7 +12,6 @@ validation_file="${data_dir}/validation.json"
 test_file="${data_dir}/test.json"
 metric_name_or_path="nlu_metric.py"
 metric_for_best_model="overall_f1"
-source_prefix="${data_dir}/source_prefix.txt"
 source_column="context"
 target_column="dialogue_acts_seq"
 model_name_or_path="t5-small"
@@ -30,7 +29,6 @@ python -m torch.distributed.launch \
     --train_file ${train_file} \
     --source_column ${source_column} \
     --target_column ${target_column} \
-    --source_prefix ${source_prefix} \
     --model_name_or_path ${model_name_or_path} \
     --do_train \
     --save_strategy epoch \
@@ -55,7 +53,6 @@ python -m torch.distributed.launch \
     --test_file ${test_file} \
     --source_column ${source_column} \
     --target_column ${target_column} \
-    --source_prefix ${source_prefix} \
     --model_name_or_path ${output_dir} \
     --do_predict \
     --predict_with_generate \
diff --git a/convlab2/base_models/t5/nlu/run_multiwoz21_user_context3.sh b/convlab2/base_models/t5/nlu/run_multiwoz21_user_context3.sh
index 8d7b5c93e8deb9c8c5da9ecd03e42bbc53341442..a12b28d34524e6503b8824cadeacbbf800df9b3b 100644
--- a/convlab2/base_models/t5/nlu/run_multiwoz21_user_context3.sh
+++ b/convlab2/base_models/t5/nlu/run_multiwoz21_user_context3.sh
@@ -12,7 +12,6 @@ validation_file="${data_dir}/validation.json"
 test_file="${data_dir}/test.json"
 metric_name_or_path="nlu_metric.py"
 metric_for_best_model="overall_f1"
-source_prefix="${data_dir}/source_prefix.txt"
 source_column="context"
 target_column="dialogue_acts_seq"
 model_name_or_path="t5-small"
@@ -30,7 +29,6 @@ python -m torch.distributed.launch \
     --train_file ${train_file} \
     --source_column ${source_column} \
     --target_column ${target_column} \
-    --source_prefix ${source_prefix} \
     --model_name_or_path ${model_name_or_path} \
     --do_train \
     --save_strategy epoch \
@@ -55,7 +53,6 @@ python -m torch.distributed.launch \
     --test_file ${test_file} \
     --source_column ${source_column} \
     --target_column ${target_column} \
-    --source_prefix ${source_prefix} \
     --model_name_or_path ${output_dir} \
     --do_predict \
     --predict_with_generate \
diff --git a/convlab2/base_models/t5/nlu/run_tm1_user.sh b/convlab2/base_models/t5/nlu/run_tm1_user.sh
index 16a16fdb106f09a7001190477de8b0878d2e20f3..ec45989743cf0b3a3aae66a3237cfb3ca95d69da 100644
--- a/convlab2/base_models/t5/nlu/run_tm1_user.sh
+++ b/convlab2/base_models/t5/nlu/run_tm1_user.sh
@@ -12,7 +12,6 @@ validation_file="${data_dir}/validation.json"
 test_file="${data_dir}/test.json"
 metric_name_or_path="nlu_metric.py"
 metric_for_best_model="overall_f1"
-source_prefix="${data_dir}/source_prefix.txt"
 source_column="context"
 target_column="dialogue_acts_seq"
 model_name_or_path="t5-small"
@@ -30,7 +29,6 @@ python -m torch.distributed.launch \
     --train_file ${train_file} \
     --source_column ${source_column} \
     --target_column ${target_column} \
-    --source_prefix ${source_prefix} \
     --model_name_or_path ${model_name_or_path} \
     --do_train \
     --save_strategy epoch \
@@ -55,7 +53,6 @@ python -m torch.distributed.launch \
     --test_file ${test_file} \
     --source_column ${source_column} \
     --target_column ${target_column} \
-    --source_prefix ${source_prefix} \
     --model_name_or_path ${output_dir} \
     --do_predict \
     --predict_with_generate \
diff --git a/convlab2/base_models/t5/nlu/run_tm1_user_context3.sh b/convlab2/base_models/t5/nlu/run_tm1_user_context3.sh
index ccb67609279be5c4b044a9baadc19672d69c1532..12afd6fa2e1501b9c0e2823d705ca705950c09fe 100644
--- a/convlab2/base_models/t5/nlu/run_tm1_user_context3.sh
+++ b/convlab2/base_models/t5/nlu/run_tm1_user_context3.sh
@@ -12,7 +12,6 @@ validation_file="${data_dir}/validation.json"
 test_file="${data_dir}/test.json"
 metric_name_or_path="nlu_metric.py"
 metric_for_best_model="overall_f1"
-source_prefix="${data_dir}/source_prefix.txt"
 source_column="context"
 target_column="dialogue_acts_seq"
 model_name_or_path="t5-small"
@@ -30,7 +29,6 @@ python -m torch.distributed.launch \
     --train_file ${train_file} \
     --source_column ${source_column} \
     --target_column ${target_column} \
-    --source_prefix ${source_prefix} \
     --model_name_or_path ${model_name_or_path} \
     --do_train \
     --save_strategy epoch \
@@ -55,7 +53,6 @@ python -m torch.distributed.launch \
     --test_file ${test_file} \
     --source_column ${source_column} \
     --target_column ${target_column} \
-    --source_prefix ${source_prefix} \
     --model_name_or_path ${output_dir} \
     --do_predict \
     --predict_with_generate \
diff --git a/convlab2/base_models/t5/nlu/run_tm2_user.sh b/convlab2/base_models/t5/nlu/run_tm2_user.sh
index 8686822fea882cb75776bee89dbd4344b71ea64b..d918d97e52059a8a4abad5797eed3c42bdfc1f84 100644
--- a/convlab2/base_models/t5/nlu/run_tm2_user.sh
+++ b/convlab2/base_models/t5/nlu/run_tm2_user.sh
@@ -12,7 +12,6 @@ validation_file="${data_dir}/validation.json"
 test_file="${data_dir}/test.json"
 metric_name_or_path="nlu_metric.py"
 metric_for_best_model="overall_f1"
-source_prefix="${data_dir}/source_prefix.txt"
 source_column="context"
 target_column="dialogue_acts_seq"
 model_name_or_path="t5-small"
@@ -30,7 +29,6 @@ python -m torch.distributed.launch \
     --train_file ${train_file} \
     --source_column ${source_column} \
     --target_column ${target_column} \
-    --source_prefix ${source_prefix} \
     --model_name_or_path ${model_name_or_path} \
     --do_train \
     --save_strategy epoch \
@@ -55,7 +53,6 @@ python -m torch.distributed.launch \
     --test_file ${test_file} \
     --source_column ${source_column} \
     --target_column ${target_column} \
-    --source_prefix ${source_prefix} \
     --model_name_or_path ${output_dir} \
     --do_predict \
     --predict_with_generate \
diff --git a/convlab2/base_models/t5/nlu/run_tm2_user_context3.sh b/convlab2/base_models/t5/nlu/run_tm2_user_context3.sh
index 03c2489940e38dd16256f6b4f2683a413f514235..fe3d35983ff8e0daf1b93a929a302455a4c842b3 100644
--- a/convlab2/base_models/t5/nlu/run_tm2_user_context3.sh
+++ b/convlab2/base_models/t5/nlu/run_tm2_user_context3.sh
@@ -12,7 +12,6 @@ validation_file="${data_dir}/validation.json"
 test_file="${data_dir}/test.json"
 metric_name_or_path="nlu_metric.py"
 metric_for_best_model="overall_f1"
-source_prefix="${data_dir}/source_prefix.txt"
 source_column="context"
 target_column="dialogue_acts_seq"
 model_name_or_path="t5-small"
@@ -30,7 +29,6 @@ python -m torch.distributed.launch \
     --train_file ${train_file} \
     --source_column ${source_column} \
     --target_column ${target_column} \
-    --source_prefix ${source_prefix} \
     --model_name_or_path ${model_name_or_path} \
     --do_train \
     --save_strategy epoch \
@@ -55,7 +53,6 @@ python -m torch.distributed.launch \
     --test_file ${test_file} \
     --source_column ${source_column} \
     --target_column ${target_column} \
-    --source_prefix ${source_prefix} \
     --model_name_or_path ${output_dir} \
     --do_predict \
     --predict_with_generate \
diff --git a/convlab2/base_models/t5/nlu/run_tm3_user.sh b/convlab2/base_models/t5/nlu/run_tm3_user.sh
index 470cb7d71c2b7a630e6917912e21d2c61ca1c075..71623e18eff12a4578b7078205df7b8366a5377b 100644
--- a/convlab2/base_models/t5/nlu/run_tm3_user.sh
+++ b/convlab2/base_models/t5/nlu/run_tm3_user.sh
@@ -12,7 +12,6 @@ validation_file="${data_dir}/validation.json"
 test_file="${data_dir}/test.json"
 metric_name_or_path="nlu_metric.py"
 metric_for_best_model="overall_f1"
-source_prefix="${data_dir}/source_prefix.txt"
 source_column="context"
 target_column="dialogue_acts_seq"
 model_name_or_path="t5-small"
@@ -30,7 +29,6 @@ python -m torch.distributed.launch \
     --train_file ${train_file} \
     --source_column ${source_column} \
     --target_column ${target_column} \
-    --source_prefix ${source_prefix} \
     --model_name_or_path ${model_name_or_path} \
     --do_train \
     --save_strategy epoch \
@@ -55,7 +53,6 @@ python -m torch.distributed.launch \
     --test_file ${test_file} \
     --source_column ${source_column} \
     --target_column ${target_column} \
-    --source_prefix ${source_prefix} \
     --model_name_or_path ${output_dir} \
     --do_predict \
     --predict_with_generate \
diff --git a/convlab2/base_models/t5/nlu/run_tm3_user_context3.sh b/convlab2/base_models/t5/nlu/run_tm3_user_context3.sh
index 5e325d1fe2b127ef1af0b0733dd5db03bb1cbe3c..aa22b362e1e6d6eaa30a0e5a29816f4cf3239d6f 100644
--- a/convlab2/base_models/t5/nlu/run_tm3_user_context3.sh
+++ b/convlab2/base_models/t5/nlu/run_tm3_user_context3.sh
@@ -12,7 +12,6 @@ validation_file="${data_dir}/validation.json"
 test_file="${data_dir}/test.json"
 metric_name_or_path="nlu_metric.py"
 metric_for_best_model="overall_f1"
-source_prefix="${data_dir}/source_prefix.txt"
 source_column="context"
 target_column="dialogue_acts_seq"
 model_name_or_path="t5-small"
@@ -30,7 +29,6 @@ python -m torch.distributed.launch \
     --train_file ${train_file} \
     --source_column ${source_column} \
     --target_column ${target_column} \
-    --source_prefix ${source_prefix} \
     --model_name_or_path ${model_name_or_path} \
     --do_train \
     --save_strategy epoch \
@@ -55,7 +53,6 @@ python -m torch.distributed.launch \
     --test_file ${test_file} \
     --source_column ${source_column} \
     --target_column ${target_column} \
-    --source_prefix ${source_prefix} \
     --model_name_or_path ${output_dir} \
     --do_predict \
     --predict_with_generate \
diff --git a/convlab2/base_models/t5/rg/run_rg.sh b/convlab2/base_models/t5/rg/run_rg.sh
index 8bf742d51b1f28765a5d28775970d12a25178434..55accadfaeceb7c43ad9df079f054aa3e00c5a1c 100644
--- a/convlab2/base_models/t5/rg/run_rg.sh
+++ b/convlab2/base_models/t5/rg/run_rg.sh
@@ -1,7 +1,8 @@
-n_gpus=8
+set -e
+n_gpus=2
 task_name="rg"
-dataset_name="multiwoz21"
-speaker="system"
+dataset_name="metalwoz+sgd+tm1+tm2+tm3"
+speaker="all"
 data_dir="data/${task_name}/${dataset_name}/${speaker}"
 output_dir="output/${task_name}/${dataset_name}/${speaker}"
 cache_dir="../cache"
@@ -9,17 +10,30 @@ logging_dir="${output_dir}/runs"
 train_file="${data_dir}/train.json"
 validation_file="${data_dir}/validation.json"
 test_file="${data_dir}/test.json"
-source_prefix="${data_dir}/source_prefix.txt"
 source_column="context"
 target_column="response"
+truncation_side="left"
+max_source_length=512
+max_target_length=128
 model_name_or_path="t5-small"
 per_device_train_batch_size=32
 per_device_eval_batch_size=128
-gradient_accumulation_steps=1
+gradient_accumulation_steps=4
 lr=1e-3
 num_train_epochs=5

-python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --speaker ${speaker}
+# names=$(echo ${dataset_name} | tr "+" "\n")
+# mkdir -p ${data_dir}
+# for name in ${names};
+# do
+#     echo "preprocessing ${name}"
+#     python ../create_data.py --tasks ${task_name} --datasets ${name} --speaker ${speaker}
+#     if [ "${name}" != "${dataset_name}" ]; then
+#         cat "data/${task_name}/${name}/${speaker}/train.json" >> ${train_file}
+#         cat "data/${task_name}/${name}/${speaker}/validation.json" >> ${validation_file}
+#         cat "data/${task_name}/${name}/${speaker}/test.json" >> ${test_file}
+#     fi
+# done

 python -m torch.distributed.launch \
     --nproc_per_node ${n_gpus} ../run_seq2seq.py \
@@ -29,7 +43,9 @@ python -m torch.distributed.launch \
     --test_file ${test_file} \
     --source_column ${source_column} \
     --target_column ${target_column} \
-    --source_prefix ${source_prefix} \
+    --max_source_length ${max_source_length} \
+    --max_target_length ${max_target_length} \
+    --truncation_side ${truncation_side} \
     --model_name_or_path ${model_name_or_path} \
     --do_train \
     --do_eval \
diff --git a/convlab2/base_models/t5/run_seq2seq.py b/convlab2/base_models/t5/run_seq2seq.py
index e9348f5579d35dffe85b9b7e9b90cedf176d0de3..dace9713d540b7fe2aa1c552132cc4c54d698989 100644
--- a/convlab2/base_models/t5/run_seq2seq.py
+++ b/convlab2/base_models/t5/run_seq2seq.py
@@ -47,7 +47,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.12.5")
+check_min_version("4.17.0")

 require_version("datasets>=1.16.1")

@@ -78,6 +78,10 @@ class ModelArguments:
         default=True,
         metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
     )
+    truncation_side: Optional[str] = field(
+        default="right",
+        metadata={"help": "Which side to truncate, left or right."}
+    )
     model_revision: str = field(
         default="main",
         metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
@@ -341,6 +345,7 @@ def main():
         model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
         cache_dir=model_args.cache_dir,
         use_fast=model_args.use_fast_tokenizer,
+        truncation_side=model_args.truncation_side,
         revision=model_args.model_revision,
         use_auth_token=True if model_args.use_auth_token else None,
     )
@@ -382,10 +387,11 @@ def main():
             )

     if data_args.source_prefix_filepath is not None:
-        prefix = open(data_args.source_prefix_filepath, 'r', encoding='utf-8').readline().strip()
+        prefix = open(data_args.source_prefix_filepath, 'r', encoding='utf-8').readline().strip('\n')
     else:
         prefix = ""

+    logger.info(f'source prefix: "{prefix}"')

     # Preprocessing the datasets.
     # We need to tokenize inputs and targets.
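
A minimal usage sketch (not part of the patch) of the new truncation_side handling: run_seq2seq.py now forwards the argument to AutoTokenizer.from_pretrained, and run_rg.sh sets truncation_side="left" together with max_source_length=512, so over-long dialogue contexts drop their oldest turns rather than the most recent ones. The snippet assumes transformers>=4.17.0 (the version the script now checks) and the t5-small checkpoint used in the shell scripts; the toy context string is made up for illustration.

# Sketch of the truncation_side behaviour added in this patch (assumption:
# transformers>=4.17.0 installed, t5-small checkpoint available).
from transformers import AutoTokenizer

# A long dialogue context, well beyond the 512-token source limit in run_rg.sh.
context = " ".join(f"user: turn {i} system: reply {i}" for i in range(300))

# truncation_side is passed through from_pretrained, as run_seq2seq.py now
# does with model_args.truncation_side.
left = AutoTokenizer.from_pretrained("t5-small", truncation_side="left")
right = AutoTokenizer.from_pretrained("t5-small", truncation_side="right")

ids_left = left(context, max_length=512, truncation=True)["input_ids"]
ids_right = right(context, max_length=512, truncation=True)["input_ids"]

# Left truncation keeps the most recent turns (the end of the context);
# right truncation would keep the oldest turns instead.
print(left.decode(ids_left[-12:]))    # ends with the latest turns
print(right.decode(ids_right[-12:]))  # still early in the dialogue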