diff --git a/convlab2/base_models/t5/create_data.py b/convlab2/base_models/t5/create_data.py
index cc9e651291bcaf2284750901c8e3c5f386b0c43f..c2f3da96523266936e4f15ad19682de62e5da3da 100644
--- a/convlab2/base_models/t5/create_data.py
+++ b/convlab2/base_models/t5/create_data.py
@@ -2,6 +2,7 @@ import os
 import json
 from tqdm import tqdm
 import re
+from transformers import AutoTokenizer
 from convlab2.util import load_dataset, load_nlu_data, load_dst_data, load_policy_data, load_nlg_data, load_e2e_data, load_rg_data
 from convlab2.base_models.t5.nlu.serialization import serialize_dialogue_acts, deserialize_dialogue_acts, equal_da_seq
 from convlab2.base_models.t5.dst.serialization import serialize_dialogue_state, deserialize_dialogue_state, equal_state_seq
@@ -23,6 +24,8 @@ def create_rg_data(dataset, data_dir, args):
         file_name = os.path.join(data_dir, f"{data_split}.json")
         with open(file_name, "w", encoding='utf-8') as f:
             f.writelines(data)
+        data_by_split[data_split] = data
+    return data_by_split
 
 def create_nlu_data(dataset, data_dir, args):
     data_by_split = load_nlu_data(dataset, speaker=args.speaker, use_context=args.context_window_size>0, context_window_size=args.context_window_size)
@@ -45,6 +48,8 @@ def create_nlu_data(dataset, data_dir, args):
         file_name = os.path.join(data_dir, f"{data_split}.json")
         with open(file_name, "w", encoding='utf-8') as f:
             f.writelines(data)
+        data_by_split[data_split] = data
+    return data_by_split
 
 def create_dst_data(dataset, data_dir, args):
     data_by_split = load_dst_data(dataset, speaker=args.speaker, use_context=args.context_window_size>0, context_window_size=args.context_window_size)
@@ -67,6 +72,8 @@ def create_dst_data(dataset, data_dir, args):
         file_name = os.path.join(data_dir, f"{data_split}.json")
         with open(file_name, "w", encoding='utf-8') as f:
             f.writelines(data)
+        data_by_split[data_split] = data
+    return data_by_split
 
 def create_nlg_data(dataset, data_dir, args):
     data_by_split = load_nlu_data(dataset, speaker=args.speaker, use_context=args.context_window_size>0, context_window_size=args.context_window_size)
@@ -89,6 +96,8 @@ def create_nlg_data(dataset, data_dir, args):
         file_name = os.path.join(data_dir, f"{data_split}.json")
         with open(file_name, "w", encoding='utf-8') as f:
             f.writelines(data)
+        data_by_split[data_split] = data
+    return data_by_split
 
 def create_goal2dialogue_data(dataset, data_dir, args):
     data_by_split = dataset
@@ -105,6 +114,20 @@ def create_goal2dialogue_data(dataset, data_dir, args):
         file_name = os.path.join(data_dir, f"{data_split}.json")
         with open(file_name, "w", encoding='utf-8') as f:
             f.writelines(data)
+        data_by_split[data_split] = data
+    return data_by_split
+
+def get_max_len(data_by_split, tokenizer):
+    for data_split in data_by_split.keys():
+        seq_len = {}
+        for line in data_by_split[data_split]:
+            item = json.loads(line.strip())
+            for column, seq in item.items():
+                seq_len.setdefault(column, [])
+                seq_len[column].append(len(tokenizer.tokenize(seq)))
+        print(f"data split: {data_split}")
+        for column, lens in seq_len.items():
+            print(f'\t{column}\tmax_len: {max(lens)}\tmean_len: {round(sum(lens)/len(lens),2)}')
 
 
 if __name__ == '__main__':
@@ -114,10 +137,15 @@ if __name__ == '__main__':
     parser.add_argument('--datasets', '-d', metavar='dataset_name', nargs='*', help='names of unified datasets')
     parser.add_argument('--speaker', '-s', type=str, choices=['user', 'system', 'all'], help='speaker(s)')
     parser.add_argument('--context_window_size', '-c', type=int, default=0, help='how many contextual utterances are considered')
+    parser.add_argument('--len_tokenizer', '-l', type=str, default=None, help='name or path of the tokenizer used to compute sequence lengths')
     args = parser.parse_args()
     print(args)
+    if args.len_tokenizer:
+        tokenizer = AutoTokenizer.from_pretrained(args.len_tokenizer)
     for dataset_name in tqdm(args.datasets, desc='datasets'):
         dataset = load_dataset(dataset_name)
         for task_name in tqdm(args.tasks, desc='tasks', leave=False):
             data_dir = os.path.join('data', task_name, dataset_name)
-            eval(f"create_{task_name}_data")(dataset, data_dir, args)
+            data_by_split = eval(f"create_{task_name}_data")(dataset, data_dir, args)
+            if args.len_tokenizer:
+                get_max_len(data_by_split, tokenizer)
diff --git a/convlab2/base_models/t5/dst/serialization.py b/convlab2/base_models/t5/dst/serialization.py
index 96cc0beb105b5c9ae22aeffb02881a28e0a443ae..f452ae2d8ef3f63844b7df0c5034c145080771d1 100644
--- a/convlab2/base_models/t5/dst/serialization.py
+++ b/convlab2/base_models/t5/dst/serialization.py
@@ -5,26 +5,32 @@ def serialize_dialogue_state(state):
             if len(value) > 0:
                 state_dict.setdefault(f'[{domain}]', [])
                 state_dict[f'[{domain}]'].append(f'[{slot}][{value}]')
-    return ';'.join([domain+'{'+','.join(slot_values)+'}' for domain, slot_values in state_dict.items()])
+    return ';'.join([domain+'('+','.join(slot_values)+')' for domain, slot_values in state_dict.items()])
 
 def deserialize_dialogue_state(state_seq):
     state = {}
     if len(state_seq) == 0:
         return state
-    state_seqs = state_seq.split(']};[') # will consume "]}" and "["
+    state_seqs = state_seq.split(']);[') # will consume "])" and "["
     for i, state_seq in enumerate(state_seqs):
-        if len(state_seq) == 0 or len(state_seq.split(']{[')) != 2:
+        if len(state_seq) == 0 or len(state_seq.split(']([')) != 2:
             continue
         if i == 0:
             if state_seq[0] == '[':
                 state_seq = state_seq[1:]
         if i == len(state_seqs) - 1:
-            if state_seq[-2:] == ']}':
+            if state_seq[-2:] == '])':
                 state_seq = state_seq[:-2]
-        domain, slot_values = state_seq.split(']{[')
+        try:
+            domain, slot_values = state_seq.split(']([')
+        except ValueError:
+            continue
        for slot_value in slot_values.split('],['):
-            slot, value = slot_value.split('][')
+            try:
+                slot, value = slot_value.split('][')
+            except ValueError:
+                continue
             state.setdefault(domain, {})
             state[domain][slot] = value
     return state
 
diff --git a/convlab2/base_models/t5/nlg/nlg_metric.py b/convlab2/base_models/t5/nlg/nlg_metric.py
new file mode 100644
index 0000000000000000000000000000000000000000..0c0155fff3fcd021a2d07c25528e75c4fd912d83
--- /dev/null
+++ b/convlab2/base_models/t5/nlg/nlg_metric.py
@@ -0,0 +1,79 @@
+# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""NLG Metric""" + +import datasets +import sacrebleu + +# TODO: Add BibTeX citation +_CITATION = """\ +@inproceedings{post-2018-call, + title = "A Call for Clarity in Reporting {BLEU} Scores", + author = "Post, Matt", + booktitle = "Proceedings of the Third Conference on Machine Translation: Research Papers", + month = oct, + year = "2018", + address = "Belgium, Brussels", + publisher = "Association for Computational Linguistics", + url = "https://www.aclweb.org/anthology/W18-6319", + pages = "186--191", +} +""" + +_DESCRIPTION = """\ +Metric to evaluate text-to-text models on the natural language generation task. +""" + +_KWARGS_DESCRIPTION = """ +Calculates corpus-bleu4 +Args: + predictions: list of predictions to score. Each predictions + should be a string. + references: list of reference for each prediction. Each + reference should be a string. +Returns: + bleu: corpus-bleu score +Examples: + + >>> nlg_metric = datasets.load_metric("nlg_metric.py") + >>> predictions = ["hello there general kenobi", "foo bar foobar"] + >>> references = ["hello there kenobi", "foo bar foobar"] + >>> results = nlg_metric.compute(predictions=predictions, references=references) + >>> print(results) + {'bleu': 35.35533905932737} +""" + + +@datasets.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) +class NLGMetrics(datasets.Metric): + """Metric to evaluate text-to-text models on the natural language generation task.""" + + def _info(self): + return datasets.MetricInfo( + description=_DESCRIPTION, + citation=_CITATION, + inputs_description=_KWARGS_DESCRIPTION, + # This defines the format of each prediction and reference + features=datasets.Features({ + 'predictions': datasets.Value('string'), + 'references': datasets.Value('string'), + }) + ) + + def _compute(self, predictions, references): + """Returns the scores: bleu""" + bleu = sacrebleu.corpus_bleu(predictions, [references], lowercase=True).score + + return { + "bleu": bleu + } diff --git a/convlab2/base_models/t5/nlu/nlu_metric.py b/convlab2/base_models/t5/nlu/nlu_metric.py index a2f3c0bc4f9a50464eebc67d220964c39a45f2be..d337887201e683eebb5bfa932714e2ee4e2cfae2 100644 --- a/convlab2/base_models/t5/nlu/nlu_metric.py +++ b/convlab2/base_models/t5/nlu/nlu_metric.py @@ -36,21 +36,15 @@ Returns: seq_em: sequence exact match accuracy: dialog acts accuracy overall_f1: dialog acts overall f1 - binary_f1: binary dialog acts f1 - categorical_f1: categorical dialog acts f1 - non-categorical_f1: non-categorical dialog acts f1 Examples: >>> nlu_metric = datasets.load_metric("nlu_metric.py") - >>> predictions = ["[binary][thank][general][]", "[non-categorical][inform][taxi][leave at][17:15]"] - >>> references = ["[binary][thank][general][]", "[non-categorical][inform][train][leave at][17:15]"] + >>> predictions = ["[thank][general]{[][]}", "[inform][taxi]{[leave at][17:15]}"] + >>> references = ["[thank][general]{[][]}", "[inform][train]{[leave at][17:15]}"] >>> results = nlu_metric.compute(predictions=predictions, references=references) >>> print(results) {'seq_em': 0.5, 'accuracy': 0.5, - 'overall_f1': 0.5, 'overall_precision': 0.5, 'overall_recall': 0.5, - 'binary_f1': 1.0, 'binary_precision': 1.0, 'binary_recall': 1.0, - 'categorical_f1': 0.0, 'categorical_precision': 0.0, 'categorical_recall': 0.0, - 'non-categorical_f1': 0.0, 'non-categorical_precision': 0.0, 'non-categorical_recall': 0.0} + 'overall_f1': 0.5, 'overall_precision': 0.5, 'overall_recall': 0.5} """ @@ -74,45 +68,33 @@ class NLUMetrics(datasets.Metric): """Returns the 
scores: sequence exact match, dialog acts accuracy and f1""" seq_em = [] acc = [] - f1_metrics = {x: {'TP':0, 'FP':0, 'FN':0} for x in ['overall', 'binary', 'categorical', 'non-categorical']} + f1_metrics = {'TP':0, 'FP':0, 'FN':0} for prediction, reference in zip(predictions, references): seq_em.append(prediction.strip()==reference.strip()) pred_da = deserialize_dialogue_acts(prediction) gold_da = deserialize_dialogue_acts(reference) - flag = True - for da_type in ['binary', 'categorical', 'non-categorical']: - if da_type == 'binary': - predicts = sorted(list({(x['intent'], x['domain'], x['slot']) for x in pred_da[da_type]})) - labels = sorted(list({(x['intent'], x['domain'], x['slot']) for x in gold_da[da_type]})) + pred_da = sorted(list({(da['intent'], da['domain'], da['slot'], ''.join(da.get('value', '').split()).lower()) for da in pred_da})) + gold_da = sorted(list({(da['intent'], da['domain'], da['slot'], ''.join(da.get('value', '').split()).lower()) for da in gold_da})) + acc.append(pred_da==gold_da) + for ele in pred_da: + if ele in gold_da: + f1_metrics['TP'] += 1 else: - predicts = sorted(list({(x['intent'], x['domain'], x['slot'], ''.join(x['value'].split()).lower()) for x in pred_da[da_type]})) - labels = sorted(list({(x['intent'], x['domain'], x['slot'], ''.join(x['value'].split()).lower()) for x in gold_da[da_type]})) - for ele in predicts: - if ele in labels: - f1_metrics['overall']['TP'] += 1 - f1_metrics[da_type]['TP'] += 1 - else: - f1_metrics['overall']['FP'] += 1 - f1_metrics[da_type]['FP'] += 1 - for ele in labels: - if ele not in predicts: - f1_metrics['overall']['FN'] += 1 - f1_metrics[da_type]['FN'] += 1 - flag &= (predicts==labels) - acc.append(flag) + f1_metrics['FP'] += 1 + for ele in gold_da: + if ele not in pred_da: + f1_metrics['FN'] += 1 - for metric in list(f1_metrics.keys()): - TP = f1_metrics[metric].pop('TP') - FP = f1_metrics[metric].pop('FP') - FN = f1_metrics[metric].pop('FN') - precision = 1.0 * TP / (TP + FP) if TP + FP else 0. - recall = 1.0 * TP / (TP + FN) if TP + FN else 0. - f1 = 2.0 * precision * recall / (precision + recall) if precision + recall else 0. - f1_metrics.pop(metric) - f1_metrics[f'{metric}_f1'] = f1 - f1_metrics[f'{metric}_precision'] = precision - f1_metrics[f'{metric}_recall'] = recall + TP = f1_metrics.pop('TP') + FP = f1_metrics.pop('FP') + FN = f1_metrics.pop('FN') + precision = 1.0 * TP / (TP + FP) if TP + FP else 0. + recall = 1.0 * TP / (TP + FN) if TP + FN else 0. + f1 = 2.0 * precision * recall / (precision + recall) if precision + recall else 0. 
+        f1_metrics['overall_f1'] = f1
+        f1_metrics['overall_precision'] = precision
+        f1_metrics['overall_recall'] = recall
 
         return {
             "seq_em": sum(seq_em)/len(seq_em),
diff --git a/convlab2/base_models/t5/nlu/run_multiwoz21_user.sh b/convlab2/base_models/t5/nlu/run_multiwoz21_user.sh
deleted file mode 100644
index a9e9d6c55b38bfa1b43b0a837e4eef9d60a7e233..0000000000000000000000000000000000000000
--- a/convlab2/base_models/t5/nlu/run_multiwoz21_user.sh
+++ /dev/null
@@ -1,68 +0,0 @@
-n_gpus=1
-task_name="nlu"
-dataset_name="multiwoz21"
-speaker="user"
-context_window_size=0
-data_dir="data/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
-output_dir="output/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
-cache_dir="../cache"
-logging_dir="${output_dir}/runs"
-train_file="${data_dir}/train.json"
-validation_file="${data_dir}/validation.json"
-test_file="${data_dir}/test.json"
-metric_name_or_path="nlu_metric.py"
-metric_for_best_model="overall_f1"
-source_column="context"
-target_column="dialogue_acts_seq"
-model_name_or_path="t5-small"
-per_device_train_batch_size=128
-per_device_eval_batch_size=64
-gradient_accumulation_steps=2
-lr=1e-3
-num_train_epochs=10
-
-python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --speaker ${speaker} --context_window_size ${context_window_size}
-
-python -m torch.distributed.launch \
-    --nproc_per_node ${n_gpus} ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --train_file ${train_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --model_name_or_path ${model_name_or_path} \
-    --do_train \
-    --save_strategy epoch \
-    --prediction_loss_only \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_train_batch_size ${per_device_train_batch_size} \
-    --gradient_accumulation_steps ${gradient_accumulation_steps} \
-    --learning_rate ${lr} \
-    --num_train_epochs ${num_train_epochs} \
-    --debug underflow_overflow \
-    --adafactor \
-    --gradient_checkpointing
-
-python -m torch.distributed.launch \
-    --nproc_per_node ${n_gpus} ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --test_file ${test_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --model_name_or_path ${output_dir} \
-    --do_predict \
-    --predict_with_generate \
-    --metric_name_or_path ${metric_name_or_path} \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
-
-python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
-
-python ../../../nlu/evaluate_unified_datasets.py -p ${output_dir}/predictions.json
diff --git a/convlab2/base_models/t5/nlu/run_multiwoz21_user_context3.sh b/convlab2/base_models/t5/nlu/run_multiwoz21_user_context3.sh
deleted file mode 100644
index e90e71d459da3bd43eeccc82d7ad192d7f751996..0000000000000000000000000000000000000000
--- a/convlab2/base_models/t5/nlu/run_multiwoz21_user_context3.sh
+++ /dev/null
@@ -1,68 +0,0 @@
-n_gpus=1
-task_name="nlu"
-dataset_name="multiwoz21"
-speaker="user"
-context_window_size=3
-data_dir="data/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
-output_dir="output/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
-cache_dir="../cache" -logging_dir="${output_dir}/runs" -train_file="${data_dir}/train.json" -validation_file="${data_dir}/validation.json" -test_file="${data_dir}/test.json" -metric_name_or_path="nlu_metric.py" -metric_for_best_model="overall_f1" -source_column="context" -target_column="dialogue_acts_seq" -model_name_or_path="t5-small" -per_device_train_batch_size=128 -per_device_eval_batch_size=64 -gradient_accumulation_steps=2 -lr=1e-3 -num_train_epochs=10 - -python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --speaker ${speaker} --context_window_size ${context_window_size} - -python -m torch.distributed.launch \ - --nproc_per_node ${n_gpus} ../run_seq2seq.py \ - --task_name ${task_name} \ - --train_file ${train_file} \ - --source_column ${source_column} \ - --target_column ${target_column} \ - --model_name_or_path ${model_name_or_path} \ - --do_train \ - --save_strategy epoch \ - --prediction_loss_only \ - --cache_dir ${cache_dir} \ - --output_dir ${output_dir} \ - --logging_dir ${logging_dir} \ - --overwrite_output_dir \ - --preprocessing_num_workers 4 \ - --per_device_train_batch_size ${per_device_train_batch_size} \ - --gradient_accumulation_steps ${gradient_accumulation_steps} \ - --learning_rate ${lr} \ - --num_train_epochs ${num_train_epochs} \ - --debug underflow_overflow \ - --adafactor \ - --gradient_checkpointing - -python -m torch.distributed.launch \ - --nproc_per_node ${n_gpus} ../run_seq2seq.py \ - --task_name ${task_name} \ - --test_file ${test_file} \ - --source_column ${source_column} \ - --target_column ${target_column} \ - --model_name_or_path ${output_dir} \ - --do_predict \ - --predict_with_generate \ - --metric_name_or_path ${metric_name_or_path} \ - --cache_dir ${cache_dir} \ - --output_dir ${output_dir} \ - --logging_dir ${logging_dir} \ - --overwrite_output_dir \ - --preprocessing_num_workers 4 \ - --per_device_eval_batch_size ${per_device_eval_batch_size} \ - -python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json - -python ../../../nlu/evaluate_unified_datasets.py -p ${output_dir}/predictions.json diff --git a/convlab2/base_models/t5/dst/run_multiwoz21.sh b/convlab2/base_models/t5/nlu/run_nlu.sh similarity index 76% rename from convlab2/base_models/t5/dst/run_multiwoz21.sh rename to convlab2/base_models/t5/nlu/run_nlu.sh index 1c383cbb24130db89c4304912f74a8323dadd343..71b5f5f8e488b2c44e76637cc0152b8ff9db8671 100644 --- a/convlab2/base_models/t5/dst/run_multiwoz21.sh +++ b/convlab2/base_models/t5/nlu/run_nlu.sh @@ -1,8 +1,8 @@ -n_gpus=4 -task_name="dst" -dataset_name="multiwoz21" +n_gpus=1 +task_name="nlu" +dataset_name=$1 speaker="user" -context_window_size=100 +context_window_size=$2 data_dir="data/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}" output_dir="output/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}" cache_dir="../cache" @@ -10,28 +10,26 @@ logging_dir="${output_dir}/runs" train_file="${data_dir}/train.json" validation_file="${data_dir}/validation.json" test_file="${data_dir}/test.json" -metric_name_or_path="dst_metric.py" -metric_for_best_model="accuracy" +metric_name_or_path="nlu_metric.py" +metric_for_best_model="overall_f1" source_column="context" -target_column="state_seq" +target_column="dialogue_acts_seq" truncation_side="left" max_source_length=512 max_target_length=512 model_name_or_path="t5-small" -per_device_train_batch_size=64 +per_device_train_batch_size=128 per_device_eval_batch_size=64 
gradient_accumulation_steps=2 lr=1e-3 num_train_epochs=10 -python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --speaker ${speaker} --context_window_size ${context_window_size} +python ../create_data.py -t ${task_name} -d ${dataset_name} -s ${speaker} -c ${context_window_size} -python -m torch.distributed.launch \ - --nproc_per_node ${n_gpus} ../run_seq2seq.py \ +python ../run_seq2seq.py \ --task_name ${task_name} \ --train_file ${train_file} \ --validation_file ${validation_file} \ - --test_file ${test_file} \ --source_column ${source_column} \ --target_column ${target_column} \ --max_source_length ${max_source_length} \ @@ -40,7 +38,6 @@ python -m torch.distributed.launch \ --model_name_or_path ${model_name_or_path} \ --do_train \ --do_eval \ - --do_predict \ --save_strategy epoch \ --evaluation_strategy epoch \ --prediction_loss_only \ @@ -58,8 +55,7 @@ python -m torch.distributed.launch \ --adafactor \ --gradient_checkpointing -python -m torch.distributed.launch \ - --nproc_per_node ${n_gpus} ../run_seq2seq.py \ +python ../run_seq2seq.py \ --task_name ${task_name} \ --test_file ${test_file} \ --source_column ${source_column} \ @@ -76,8 +72,8 @@ python -m torch.distributed.launch \ --logging_dir ${logging_dir} \ --overwrite_output_dir \ --preprocessing_num_workers 4 \ - --per_device_eval_batch_size ${per_device_eval_batch_size} \ + --per_device_eval_batch_size ${per_device_eval_batch_size} python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json -python ../../../dst/evaluate_unified_datasets.py -p ${output_dir}/predictions.json +python ../../../nlu/evaluate_unified_datasets.py -p ${output_dir}/predictions.json diff --git a/convlab2/base_models/t5/nlu/run_tm1_user.sh b/convlab2/base_models/t5/nlu/run_tm1_user.sh deleted file mode 100644 index 5372441aa284067f137e892454b8fed76e251e6e..0000000000000000000000000000000000000000 --- a/convlab2/base_models/t5/nlu/run_tm1_user.sh +++ /dev/null @@ -1,68 +0,0 @@ -n_gpus=1 -task_name="nlu" -dataset_name="tm1" -speaker="user" -context_window_size=0 -data_dir="data/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}" -output_dir="output/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}" -cache_dir="../cache" -logging_dir="${output_dir}/runs" -train_file="${data_dir}/train.json" -validation_file="${data_dir}/validation.json" -test_file="${data_dir}/test.json" -metric_name_or_path="nlu_metric.py" -metric_for_best_model="overall_f1" -source_column="context" -target_column="dialogue_acts_seq" -model_name_or_path="t5-small" -per_device_train_batch_size=128 -per_device_eval_batch_size=64 -gradient_accumulation_steps=2 -lr=1e-3 -num_train_epochs=10 - -python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --speaker ${speaker} --context_window_size ${context_window_size} - -python -m torch.distributed.launch \ - --nproc_per_node ${n_gpus} ../run_seq2seq.py \ - --task_name ${task_name} \ - --train_file ${train_file} \ - --source_column ${source_column} \ - --target_column ${target_column} \ - --model_name_or_path ${model_name_or_path} \ - --do_train \ - --save_strategy epoch \ - --prediction_loss_only \ - --cache_dir ${cache_dir} \ - --output_dir ${output_dir} \ - --logging_dir ${logging_dir} \ - --overwrite_output_dir \ - --preprocessing_num_workers 4 \ - --per_device_train_batch_size ${per_device_train_batch_size} \ - --gradient_accumulation_steps ${gradient_accumulation_steps} \ - --learning_rate ${lr} \ 
- --num_train_epochs ${num_train_epochs} \ - --debug underflow_overflow \ - --adafactor \ - --gradient_checkpointing - -python -m torch.distributed.launch \ - --nproc_per_node ${n_gpus} ../run_seq2seq.py \ - --task_name ${task_name} \ - --test_file ${test_file} \ - --source_column ${source_column} \ - --target_column ${target_column} \ - --model_name_or_path ${output_dir} \ - --do_predict \ - --predict_with_generate \ - --metric_name_or_path ${metric_name_or_path} \ - --cache_dir ${cache_dir} \ - --output_dir ${output_dir} \ - --logging_dir ${logging_dir} \ - --overwrite_output_dir \ - --preprocessing_num_workers 4 \ - --per_device_eval_batch_size ${per_device_eval_batch_size} \ - -python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json - -python ../../../nlu/evaluate_unified_datasets.py -p ${output_dir}/predictions.json diff --git a/convlab2/base_models/t5/nlu/run_tm1_user_context3.sh b/convlab2/base_models/t5/nlu/run_tm1_user_context3.sh deleted file mode 100644 index 65482a1f517b7c1eb8607e0858d3ae576d1483b4..0000000000000000000000000000000000000000 --- a/convlab2/base_models/t5/nlu/run_tm1_user_context3.sh +++ /dev/null @@ -1,68 +0,0 @@ -n_gpus=1 -task_name="nlu" -dataset_name="tm1" -speaker="user" -context_window_size=3 -data_dir="data/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}" -output_dir="output/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}" -cache_dir="../cache" -logging_dir="${output_dir}/runs" -train_file="${data_dir}/train.json" -validation_file="${data_dir}/validation.json" -test_file="${data_dir}/test.json" -metric_name_or_path="nlu_metric.py" -metric_for_best_model="overall_f1" -source_column="context" -target_column="dialogue_acts_seq" -model_name_or_path="t5-small" -per_device_train_batch_size=64 -per_device_eval_batch_size=64 -gradient_accumulation_steps=4 -lr=1e-3 -num_train_epochs=10 - -python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --speaker ${speaker} --context_window_size ${context_window_size} - -python -m torch.distributed.launch \ - --nproc_per_node ${n_gpus} ../run_seq2seq.py \ - --task_name ${task_name} \ - --train_file ${train_file} \ - --source_column ${source_column} \ - --target_column ${target_column} \ - --model_name_or_path ${model_name_or_path} \ - --do_train \ - --save_strategy epoch \ - --prediction_loss_only \ - --cache_dir ${cache_dir} \ - --output_dir ${output_dir} \ - --logging_dir ${logging_dir} \ - --overwrite_output_dir \ - --preprocessing_num_workers 4 \ - --per_device_train_batch_size ${per_device_train_batch_size} \ - --gradient_accumulation_steps ${gradient_accumulation_steps} \ - --learning_rate ${lr} \ - --num_train_epochs ${num_train_epochs} \ - --debug underflow_overflow \ - --adafactor \ - --gradient_checkpointing - -python -m torch.distributed.launch \ - --nproc_per_node ${n_gpus} ../run_seq2seq.py \ - --task_name ${task_name} \ - --test_file ${test_file} \ - --source_column ${source_column} \ - --target_column ${target_column} \ - --model_name_or_path ${output_dir} \ - --do_predict \ - --predict_with_generate \ - --metric_name_or_path ${metric_name_or_path} \ - --cache_dir ${cache_dir} \ - --output_dir ${output_dir} \ - --logging_dir ${logging_dir} \ - --overwrite_output_dir \ - --preprocessing_num_workers 4 \ - --per_device_eval_batch_size ${per_device_eval_batch_size} \ - -python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p 
${output_dir}/generated_predictions.json - -python ../../../nlu/evaluate_unified_datasets.py -p ${output_dir}/predictions.json diff --git a/convlab2/base_models/t5/nlu/run_tm2_user.sh b/convlab2/base_models/t5/nlu/run_tm2_user.sh deleted file mode 100644 index 84dc8b71ae560dcb481dee8bcfc31340ed4a778d..0000000000000000000000000000000000000000 --- a/convlab2/base_models/t5/nlu/run_tm2_user.sh +++ /dev/null @@ -1,68 +0,0 @@ -n_gpus=1 -task_name="nlu" -dataset_name="tm2" -speaker="user" -context_window_size=0 -data_dir="data/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}" -output_dir="output/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}" -cache_dir="../cache" -logging_dir="${output_dir}/runs" -train_file="${data_dir}/train.json" -validation_file="${data_dir}/validation.json" -test_file="${data_dir}/test.json" -metric_name_or_path="nlu_metric.py" -metric_for_best_model="overall_f1" -source_column="context" -target_column="dialogue_acts_seq" -model_name_or_path="t5-small" -per_device_train_batch_size=128 -per_device_eval_batch_size=64 -gradient_accumulation_steps=2 -lr=1e-3 -num_train_epochs=10 - -python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --speaker ${speaker} --context_window_size ${context_window_size} - -python -m torch.distributed.launch \ - --nproc_per_node ${n_gpus} ../run_seq2seq.py \ - --task_name ${task_name} \ - --train_file ${train_file} \ - --source_column ${source_column} \ - --target_column ${target_column} \ - --model_name_or_path ${model_name_or_path} \ - --do_train \ - --save_strategy epoch \ - --prediction_loss_only \ - --cache_dir ${cache_dir} \ - --output_dir ${output_dir} \ - --logging_dir ${logging_dir} \ - --overwrite_output_dir \ - --preprocessing_num_workers 4 \ - --per_device_train_batch_size ${per_device_train_batch_size} \ - --gradient_accumulation_steps ${gradient_accumulation_steps} \ - --learning_rate ${lr} \ - --num_train_epochs ${num_train_epochs} \ - --debug underflow_overflow \ - --adafactor \ - --gradient_checkpointing - -python -m torch.distributed.launch \ - --nproc_per_node ${n_gpus} ../run_seq2seq.py \ - --task_name ${task_name} \ - --test_file ${test_file} \ - --source_column ${source_column} \ - --target_column ${target_column} \ - --model_name_or_path ${output_dir} \ - --do_predict \ - --predict_with_generate \ - --metric_name_or_path ${metric_name_or_path} \ - --cache_dir ${cache_dir} \ - --output_dir ${output_dir} \ - --logging_dir ${logging_dir} \ - --overwrite_output_dir \ - --preprocessing_num_workers 4 \ - --per_device_eval_batch_size ${per_device_eval_batch_size} \ - -python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json - -python ../../../nlu/evaluate_unified_datasets.py -p ${output_dir}/predictions.json diff --git a/convlab2/base_models/t5/nlu/run_tm2_user_context3.sh b/convlab2/base_models/t5/nlu/run_tm2_user_context3.sh deleted file mode 100644 index abca0a60a98c1a71295ae1ad77791dec6b482547..0000000000000000000000000000000000000000 --- a/convlab2/base_models/t5/nlu/run_tm2_user_context3.sh +++ /dev/null @@ -1,68 +0,0 @@ -n_gpus=1 -task_name="nlu" -dataset_name="tm2" -speaker="user" -context_window_size=3 -data_dir="data/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}" -output_dir="output/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}" -cache_dir="../cache" -logging_dir="${output_dir}/runs" -train_file="${data_dir}/train.json" 
-validation_file="${data_dir}/validation.json" -test_file="${data_dir}/test.json" -metric_name_or_path="nlu_metric.py" -metric_for_best_model="overall_f1" -source_column="context" -target_column="dialogue_acts_seq" -model_name_or_path="t5-small" -per_device_train_batch_size=128 -per_device_eval_batch_size=64 -gradient_accumulation_steps=2 -lr=1e-3 -num_train_epochs=10 - -python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --speaker ${speaker} --context_window_size ${context_window_size} - -python -m torch.distributed.launch \ - --nproc_per_node ${n_gpus} ../run_seq2seq.py \ - --task_name ${task_name} \ - --train_file ${train_file} \ - --source_column ${source_column} \ - --target_column ${target_column} \ - --model_name_or_path ${model_name_or_path} \ - --do_train \ - --save_strategy epoch \ - --prediction_loss_only \ - --cache_dir ${cache_dir} \ - --output_dir ${output_dir} \ - --logging_dir ${logging_dir} \ - --overwrite_output_dir \ - --preprocessing_num_workers 4 \ - --per_device_train_batch_size ${per_device_train_batch_size} \ - --gradient_accumulation_steps ${gradient_accumulation_steps} \ - --learning_rate ${lr} \ - --num_train_epochs ${num_train_epochs} \ - --debug underflow_overflow \ - --adafactor \ - --gradient_checkpointing - -python -m torch.distributed.launch \ - --nproc_per_node ${n_gpus} ../run_seq2seq.py \ - --task_name ${task_name} \ - --test_file ${test_file} \ - --source_column ${source_column} \ - --target_column ${target_column} \ - --model_name_or_path ${output_dir} \ - --do_predict \ - --predict_with_generate \ - --metric_name_or_path ${metric_name_or_path} \ - --cache_dir ${cache_dir} \ - --output_dir ${output_dir} \ - --logging_dir ${logging_dir} \ - --overwrite_output_dir \ - --preprocessing_num_workers 4 \ - --per_device_eval_batch_size ${per_device_eval_batch_size} \ - -python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json - -python ../../../nlu/evaluate_unified_datasets.py -p ${output_dir}/predictions.json diff --git a/convlab2/base_models/t5/nlu/run_tm3_user.sh b/convlab2/base_models/t5/nlu/run_tm3_user.sh deleted file mode 100644 index 689a626c9a871581b49eb84a7db1e9af1152e32e..0000000000000000000000000000000000000000 --- a/convlab2/base_models/t5/nlu/run_tm3_user.sh +++ /dev/null @@ -1,68 +0,0 @@ -n_gpus=1 -task_name="nlu" -dataset_name="tm3" -speaker="user" -context_window_size=0 -data_dir="data/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}" -output_dir="output/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}" -cache_dir="../cache" -logging_dir="${output_dir}/runs" -train_file="${data_dir}/train.json" -validation_file="${data_dir}/validation.json" -test_file="${data_dir}/test.json" -metric_name_or_path="nlu_metric.py" -metric_for_best_model="overall_f1" -source_column="context" -target_column="dialogue_acts_seq" -model_name_or_path="t5-small" -per_device_train_batch_size=128 -per_device_eval_batch_size=64 -gradient_accumulation_steps=2 -lr=1e-3 -num_train_epochs=10 - -python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --speaker ${speaker} --context_window_size ${context_window_size} - -python -m torch.distributed.launch \ - --nproc_per_node ${n_gpus} ../run_seq2seq.py \ - --task_name ${task_name} \ - --train_file ${train_file} \ - --source_column ${source_column} \ - --target_column ${target_column} \ - --model_name_or_path ${model_name_or_path} \ - --do_train \ - --save_strategy epoch \ - 
--prediction_loss_only \ - --cache_dir ${cache_dir} \ - --output_dir ${output_dir} \ - --logging_dir ${logging_dir} \ - --overwrite_output_dir \ - --preprocessing_num_workers 4 \ - --per_device_train_batch_size ${per_device_train_batch_size} \ - --gradient_accumulation_steps ${gradient_accumulation_steps} \ - --learning_rate ${lr} \ - --num_train_epochs ${num_train_epochs} \ - --debug underflow_overflow \ - --adafactor \ - --gradient_checkpointing - -python -m torch.distributed.launch \ - --nproc_per_node ${n_gpus} ../run_seq2seq.py \ - --task_name ${task_name} \ - --test_file ${test_file} \ - --source_column ${source_column} \ - --target_column ${target_column} \ - --model_name_or_path ${output_dir} \ - --do_predict \ - --predict_with_generate \ - --metric_name_or_path ${metric_name_or_path} \ - --cache_dir ${cache_dir} \ - --output_dir ${output_dir} \ - --logging_dir ${logging_dir} \ - --overwrite_output_dir \ - --preprocessing_num_workers 4 \ - --per_device_eval_batch_size ${per_device_eval_batch_size} \ - -python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json - -python ../../../nlu/evaluate_unified_datasets.py -p ${output_dir}/predictions.json diff --git a/convlab2/base_models/t5/nlu/run_tm3_user_context3.sh b/convlab2/base_models/t5/nlu/run_tm3_user_context3.sh deleted file mode 100644 index e2ded66dbe940387f1997ba92028ef4dc4a5b5c5..0000000000000000000000000000000000000000 --- a/convlab2/base_models/t5/nlu/run_tm3_user_context3.sh +++ /dev/null @@ -1,68 +0,0 @@ -n_gpus=1 -task_name="nlu" -dataset_name="tm3" -speaker="user" -context_window_size=3 -data_dir="data/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}" -output_dir="output/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}" -cache_dir="../cache" -logging_dir="${output_dir}/runs" -train_file="${data_dir}/train.json" -validation_file="${data_dir}/validation.json" -test_file="${data_dir}/test.json" -metric_name_or_path="nlu_metric.py" -metric_for_best_model="overall_f1" -source_column="context" -target_column="dialogue_acts_seq" -model_name_or_path="t5-small" -per_device_train_batch_size=128 -per_device_eval_batch_size=64 -gradient_accumulation_steps=2 -lr=1e-3 -num_train_epochs=10 - -python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --speaker ${speaker} --context_window_size ${context_window_size} - -python -m torch.distributed.launch \ - --nproc_per_node ${n_gpus} ../run_seq2seq.py \ - --task_name ${task_name} \ - --train_file ${train_file} \ - --source_column ${source_column} \ - --target_column ${target_column} \ - --model_name_or_path ${model_name_or_path} \ - --do_train \ - --save_strategy epoch \ - --prediction_loss_only \ - --cache_dir ${cache_dir} \ - --output_dir ${output_dir} \ - --logging_dir ${logging_dir} \ - --overwrite_output_dir \ - --preprocessing_num_workers 4 \ - --per_device_train_batch_size ${per_device_train_batch_size} \ - --gradient_accumulation_steps ${gradient_accumulation_steps} \ - --learning_rate ${lr} \ - --num_train_epochs ${num_train_epochs} \ - --debug underflow_overflow \ - --adafactor \ - --gradient_checkpointing - -python -m torch.distributed.launch \ - --nproc_per_node ${n_gpus} ../run_seq2seq.py \ - --task_name ${task_name} \ - --test_file ${test_file} \ - --source_column ${source_column} \ - --target_column ${target_column} \ - --model_name_or_path ${output_dir} \ - --do_predict \ - --predict_with_generate \ - --metric_name_or_path 
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
-
-python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
-
-python ../../../nlu/evaluate_unified_datasets.py -p ${output_dir}/predictions.json
diff --git a/convlab2/base_models/t5/nlu/serialization.py b/convlab2/base_models/t5/nlu/serialization.py
index 7c9a764fb9e4d04ec4036722448b4a1a00636a35..bb1a8b88b81ca23e97271c9bae211fbfd775149b 100644
--- a/convlab2/base_models/t5/nlu/serialization.py
+++ b/convlab2/base_models/t5/nlu/serialization.py
@@ -6,28 +6,35 @@ def serialize_dialogue_acts(dialogue_acts):
         intent_domain = f'[{intent}][{domain}]'
         da_dict.setdefault(intent_domain, [])
         da_dict[intent_domain].append(f'[{slot}][{value}]')
-    return ';'.join([intent_domain+'{'+','.join(slot_values)+'}' for intent_domain, slot_values in da_dict.items()])
+    return ';'.join([intent_domain+'('+','.join(slot_values)+')' for intent_domain, slot_values in da_dict.items()])
 
 def deserialize_dialogue_acts(das_seq):
     dialogue_acts = []
     if len(das_seq) == 0:
         return dialogue_acts
-    da_seqs = das_seq.split(']};[') # will consume "]}" and "["
+    da_seqs = das_seq.split(']);[') # will consume "])" and "["
     for i, da_seq in enumerate(da_seqs):
-        if len(da_seq) == 0 or len(da_seq.split(']{[')) != 2:
+        if len(da_seq) == 0 or len(da_seq.split(']([')) != 2:
             continue
         if i == 0:
             if da_seq[0] == '[':
                 da_seq = da_seq[1:]
         if i == len(da_seqs) - 1:
-            if da_seq[-2:] == ']}':
+            if da_seq[-2:] == '])':
                 da_seq = da_seq[:-2]
-        intent_domain, slot_values = da_seq.split(']{[')
-        intent, domain = intent_domain.split('][')
+        try:
+            intent_domain, slot_values = da_seq.split(']([')
+            intent, domain = intent_domain.split('][')
+        except ValueError:
+            continue
         for slot_value in slot_values.split('],['):
-            slot, value = slot_value.split('][')
+            try:
+                slot, value = slot_value.split('][')
+            except ValueError:
+                continue
             dialogue_acts.append({'intent': intent, 'domain': domain, 'slot': slot, 'value': value})
+    return dialogue_acts
 
 def equal_da_seq(dialogue_acts, das_seq):
diff --git a/convlab2/nlu/evaluate_unified_datasets.py b/convlab2/nlu/evaluate_unified_datasets.py
index 907b1afaaee6788c1e90e3bd85b67b3360c9c2da..2eee44d7a02b405d526ce9389993b9cf0783d48a 100644
--- a/convlab2/nlu/evaluate_unified_datasets.py
+++ b/convlab2/nlu/evaluate_unified_datasets.py
@@ -10,28 +10,43 @@ def evaluate(predict_result):
 
     for sample in predict_result:
         flag = True
-        for da_type in ['binary', 'categorical', 'non-categorical']:
-            if da_type == 'binary':
-                predicts = [(x['intent'], x['domain'], x['slot']) for x in sample['predictions']['dialogue_acts'][da_type]]
-                labels = [(x['intent'], x['domain'], x['slot']) for x in sample['dialogue_acts'][da_type]]
-            else:
-                predicts = [(x['intent'], x['domain'], x['slot'], ''.join(x['value'].split()).lower()) for x in sample['predictions']['dialogue_acts'][da_type]]
-                labels = [(x['intent'], x['domain'], x['slot'], ''.join(x['value'].split()).lower()) for x in sample['dialogue_acts'][da_type]]
-            predicts = sorted(list(set(predicts)))
-            labels = sorted(list(set(labels)))
-            for ele in predicts:
-                if ele in labels:
+        if isinstance(sample['predictions']['dialogue_acts'], dict):
+            for da_type in ['binary', 'categorical', 'non-categorical']:
+                if da_type == 'binary':
+                    predicts = [(x['intent'], x['domain'], x['slot']) for x in sample['predictions']['dialogue_acts'][da_type]]
+                    labels = [(x['intent'], x['domain'], x['slot']) for x in sample['dialogue_acts'][da_type]]
+                else:
+                    predicts = [(x['intent'], x['domain'], x['slot'], ''.join(x['value'].split()).lower()) for x in sample['predictions']['dialogue_acts'][da_type]]
+                    labels = [(x['intent'], x['domain'], x['slot'], ''.join(x['value'].split()).lower()) for x in sample['dialogue_acts'][da_type]]
+                predicts = sorted(list(set(predicts)))
+                labels = sorted(list(set(labels)))
+                for ele in predicts:
+                    if ele in labels:
+                        metrics['overall']['TP'] += 1
+                        metrics[da_type]['TP'] += 1
+                    else:
+                        metrics['overall']['FP'] += 1
+                        metrics[da_type]['FP'] += 1
+                for ele in labels:
+                    if ele not in predicts:
+                        metrics['overall']['FN'] += 1
+                        metrics[da_type]['FN'] += 1
+                flag &= (predicts==labels)
+            acc.append(flag)
+        elif isinstance(sample['predictions']['dialogue_acts'], list):
+            gold_da = sorted(list({(da['intent'], da['domain'], da['slot'], ''.join(da.get('value', '').split()).lower()) for da_type in ['binary', 'categorical', 'non-categorical'] for da in sample['dialogue_acts'][da_type]}))
+            pred_da = sorted(list({(da['intent'], da['domain'], da['slot'], ''.join(da.get('value', '').split()).lower()) for da in sample['predictions']['dialogue_acts']}))
+            acc.append(pred_da==gold_da)
+            for ele in pred_da:
+                if ele in gold_da:
                     metrics['overall']['TP'] += 1
-                    metrics[da_type]['TP'] += 1
                 else:
                     metrics['overall']['FP'] += 1
-                    metrics[da_type]['FP'] += 1
-            for ele in labels:
-                if ele not in predicts:
+            for ele in gold_da:
+                if ele not in pred_da:
                     metrics['overall']['FN'] += 1
-                    metrics[da_type]['FN'] += 1
-            flag &= (predicts==labels)
-        acc.append(flag)
+        else:
+            raise TypeError("sample['predictions']['dialogue_acts'] should be a dict or a list")
 
     for metric in metrics:
         TP = metrics[metric].pop('TP')
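
Note on the serialization change above: nlu/serialization.py and dst/serialization.py both switch the slot-value wrapper from "[intent][domain]{...}" to "[intent][domain](...)". This is likely because T5's SentencePiece vocabulary does not cover the curly-brace characters, so the model could never generate the old format verbatim, while parentheses are in-vocabulary. A minimal round-trip sketch of the new format follows; it mirrors the patched serialize_dialogue_acts rather than importing it, so the names here are illustrative:

def serialize_dialogue_acts(dialogue_acts):
    # Group slot-value pairs under their [intent][domain] prefix, then join the
    # groups with ';'. The group wrapper is now '(' ... ')' instead of '{' ... '}'.
    da_dict = {}
    for da in dialogue_acts:
        prefix = f"[{da['intent']}][{da['domain']}]"
        da_dict.setdefault(prefix, [])
        da_dict[prefix].append(f"[{da['slot']}][{da['value']}]")
    return ';'.join(prefix + '(' + ','.join(svs) + ')' for prefix, svs in da_dict.items())

acts = [
    {'intent': 'thank', 'domain': 'general', 'slot': '', 'value': ''},
    {'intent': 'inform', 'domain': 'taxi', 'slot': 'leave at', 'value': '17:15'},
]
print(serialize_dialogue_acts(acts))
# -> [thank][general]([][]);[inform][taxi]([leave at][17:15])

deserialize_dialogue_acts in the patch recovers the flat list of acts by splitting this string on ']);[' and then on '](['  and '],['; the try/except guards added above mean a malformed generation degrades to an empty parse instead of crashing evaluation.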