diff --git a/.gitignore b/.gitignore
index b8590175ec627a7a5ac34b117b814962e0037997..60e65936e5ba6b821ca44873109eb6ec04f383a6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,6 +14,8 @@ __pycache__
 .vscode
 
 # data
+data/unified_datasets/multiwoz21/MultiWOZ_2.1.zip
+data/unified_datasets/tm1/master.zip
 data/**/train.json
 data/**/val.json
 data/**/test.json
@@ -25,6 +27,9 @@ data/schema/dstc8-schema-guided-dialogue-master
 data/mdbt/data
 data/mdbt/models
 data/mdbt/word-vectors
+convlab2/**/data
+convlab2/**/output
+convlab2/**/cache
 convlab2/nlg/sclstm/**/resource/*
 convlab2/nlg/sclstm/**/resource_usr/*
 convlab2/nlg/sclstm/**/sclstm.pt
diff --git a/convlab2/base_models/bert/create_data.py b/convlab2/base_models/bert/create_data.py
index 825c736d2259552017166c93db168069fe0a6976..fcaad6d711a1d20a9fd94c70b994696e001ba2c7 100644
--- a/convlab2/base_models/bert/create_data.py
+++ b/convlab2/base_models/bert/create_data.py
@@ -4,8 +4,10 @@
 from tqdm import tqdm
 from convlab2.util import load_dataset, load_nlu_data, load_dst_data, load_policy_data, load_nlg_data, load_e2e_data, load_rg_data
 from nltk.tokenize import TreebankWordTokenizer, PunktSentenceTokenizer
 from collections import Counter
+import json_lines
+from convlab2.util.unified_datasets_util import create_delex_data
 
-def create_bio_data(dataset, data_dir):
+def create_bio_data(dataset, data_dir, args):
     data_by_split = load_nlu_data(dataset, speaker='all')
     os.makedirs(data_dir, exist_ok=True)
@@ -47,7 +49,7 @@ def create_bio_data(dataset, data_dir):
         f.writelines(data)
     print('num of spans in utterances', cnt)
 
-def create_dialogBIO_data(dataset, data_dir):
+def create_dialogBIO_data(dataset, data_dir, args):
     data_by_split = load_nlu_data(dataset, split_to_turn=False)
     os.makedirs(data_dir, exist_ok=True)
@@ -94,16 +96,83 @@ def create_dialogBIO_data(dataset, data_dir):
         f.writelines(data)
     print('num of spans in utterances', cnt)
 
+def create_revert_dialogBIO_data(dataset, data_dir, args):
+    def tag2da(tokens, tags):
+        assert len(tokens)==len(tags)
+        triples = []
+        i = 0
+        utt = ''
+        while i < len(tags):
+            tag = tags[i]
+            if tag == 'B':
+                value = tokens[i]
+                j = i + 1
+                while j < len(tags):
+                    next_tag = tags[j]
+                    if next_tag == 'I':
+                        value += ' ' + tokens[j]
+                        i += 1
+                        j += 1
+                    else:
+                        break
+                triples.append({'intent':'', 'domain':'', 'slot':'', 'value': value, 'start': len(utt), 'end': len(utt)+len(value)})
+                utt += value + ' '
+                assert utt[triples[-1]['start']:triples[-1]['end']] == value, print(utt[triples[-1]['start']:triples[-1]['end']],triples[-1])
+            else:
+                utt += tokens[i] + ' '
+            i += 1
+        utt = utt[:-1]
+        assert utt == ' '.join(tokens), print(utt, '\n', ' '.join(tokens))
+        return triples
+
+    def dialog2turn(tokens, labels):
+        turns = []
+        turn = {'tokens': [], 'tags': []}
+        i = 0
+        while i < len(tokens):
+            token = tokens[i]
+            if i < len(tokens) - 1 and token in ['user', 'system'] and tokens[i+1] == ':':
+                turns.append(turn)
+                turn = {'tokens': [], 'tags': []}
+                i += 2
+                continue
+            turn['tokens'].append(token)
+            turn['tags'].append(labels[i])
+            i += 1
+        turns.pop(0)
+        for turn in turns:
+            da = {'binary': [], 'categorical': [], 'non-categorical': []}
+            da['non-categorical'] = tag2da(turn['tokens'], turn['tags'])
+            turn['utterance'] = ' '.join(turn['tokens'])
+            turn['dialogue_acts'] = da
+        return turns
+
+    for data_split in dataset:
+        infer_output_data_path = os.path.join(args.infer_data_dir, f'{data_split}.json')
+        for original_dial, bio_dial in zip(dataset[data_split], json_lines.reader(open(infer_output_data_path))):
+            bio_turns = dialog2turn(bio_dial['tokens'], bio_dial['labels'])
+            original_dial['turns'] = original_dial['turns'][:len(bio_turns)]
+            assert len(bio_turns) == len(original_dial['turns']), print(len(bio_turns), len(original_dial['turns']))
+            for ori_turn, new_turn in zip(original_dial['turns'], bio_turns):
+                ori_turn['original_utterance'] = ori_turn['utterance']
+                ori_turn['utterance'] = new_turn['utterance']
+                ori_turn['original_dialogue_acts'] = ori_turn['dialogue_acts']
+                ori_turn['dialogue_acts'] = new_turn['dialogue_acts']
+    dataset, _ = create_delex_data(dataset, delex_func=lambda d,s,v: f'<v>{v}</v>')
+    os.makedirs(data_dir, exist_ok=True)
+    json.dump(dataset, open(os.path.join(data_dir, 'data.json'), 'w', encoding='utf-8'), ensure_ascii=False, indent=2)
+
 if __name__ == '__main__':
     from argparse import ArgumentParser
     parser = ArgumentParser(description="create data for seq2seq training")
-    parser.add_argument('--tasks', metavar='task_name', nargs='*', choices=['bio', 'dialogBIO'], help='names of tasks')
+    parser.add_argument('--tasks', metavar='task_name', nargs='*', choices=['bio', 'dialogBIO', 'revert_dialogBIO'], help='names of tasks')
     parser.add_argument('--datasets', metavar='dataset_name', nargs='*', help='names of unified datasets')
     parser.add_argument('--save_dir', metavar='save_directory', type=str, default='data', help='directory to save the data, default: data/$task_name/$dataset_name')
+    parser.add_argument('--infer_data_dir', metavar='infer_data_dir', type=str, default=None, help='directory of inference output data, default: None')
     args = parser.parse_args()
     print(args)
     for dataset_name in tqdm(args.datasets, desc='datasets'):
         dataset = load_dataset(dataset_name)
         for task_name in tqdm(args.tasks, desc='tasks', leave=False):
             data_dir = os.path.join(args.save_dir, task_name, dataset_name)
-            eval(f"create_{task_name}_data")(dataset, data_dir)
+            eval(f"create_{task_name}_data")(dataset, data_dir, args)
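A minimal standalone sketch of the B/I span grouping that the new tag2da helper performs (bio_spans and the toy sentence below are invented for illustration; this is not the repository code):

    # Group each maximal B I I ... run of BIO tags into one value span with character
    # offsets into the space-joined utterance, mirroring tag2da in create_data.py above.
    def bio_spans(tokens, tags):
        spans, utt, in_span = [], '', False
        for token, tag in zip(tokens, tags):
            if tag == 'B':
                spans.append({'value': token, 'start': len(utt), 'end': len(utt) + len(token)})
                in_span = True
            elif tag == 'I' and in_span:
                spans[-1]['value'] += ' ' + token
                spans[-1]['end'] = len(utt) + len(token)
            else:
                in_span = False
            utt += token + ' '
        return utt.rstrip(), spans

    utt, spans = bio_spans(['i', 'want', 'a', 'cheap', 'chinese', 'restaurant'],
                           ['O', 'O', 'O', 'B', 'I', 'O'])
    assert utt[spans[0]['start']:spans[0]['end']] == 'cheap chinese'

In create_revert_dialogBIO_data the recovered spans are stored as non-categorical dialogue acts with empty intent/domain/slot, and create_delex_data then presumably wraps each recovered value in the utterance with <v>...</v> via the delex_func lambda.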
diff --git a/convlab2/base_models/bert/infer_bio.sh b/convlab2/base_models/bert/infer_bio.sh
index ed784c515c6703088313da0809b7c0442bcec333..c07f55375dac3f589845781a5cf91ba7b140557c 100644
--- a/convlab2/base_models/bert/infer_bio.sh
+++ b/convlab2/base_models/bert/infer_bio.sh
@@ -1,5 +1,5 @@
 set -e
-n_gpus=3
+n_gpus=2
 task_name="dialogBIO"
 dataset_name="multiwoz21"
 data_dir="data/${task_name}/${dataset_name}"
@@ -8,14 +8,14 @@ cache_dir="cache"
 logging_dir="${output_dir}/runs"
 source_column="tokens"
 target_column="labels"
-model_name_or_path="output/dialogBIO/sgd"
-per_device_eval_batch_size=16
+model_name_or_path="output/dialogBIO/sgd+tm1+tm2+tm3"
+per_device_eval_batch_size=32
 
 python create_data.py --tasks ${task_name} --datasets ${dataset_name} --save_dir "data"
 
 for split in test validation train
 do
-    python -m torch.distributed.launch \
+    CUDA_VISIBLE_DEVICES=1,2 python -m torch.distributed.launch \
         --nproc_per_node ${n_gpus} run_token_classification.py \
         --task_name ${task_name} \
         --train_file ${data_dir}/${split}.json \
diff --git a/convlab2/base_models/bert/run_token_classification.py b/convlab2/base_models/bert/run_token_classification.py
index c97fc60aa49a50d42a8470522d2dfaa09227b2ce..19c581efb3f423863dd46e51c39dc73dee9ce3a4 100644
--- a/convlab2/base_models/bert/run_token_classification.py
+++ b/convlab2/base_models/bert/run_token_classification.py
@@ -135,7 +135,7 @@ class DataTrainingArguments:
         metadata={"help": "The number of processes to use for the preprocessing."},
     )
     max_seq_length: int = field(
-        default=None,
+        default=512,
         metadata={
             "help": "The maximum total input sequence length after tokenization. If set, sequences longer "
             "than this will be truncated, sequences shorter will be padded."
diff --git a/convlab2/base_models/bert/train_bio.sh b/convlab2/base_models/bert/train_bio.sh
index db2ee860d2464c57dfb20d57a54ea5b34cda85b1..59973634fe819c0c2d8b8c7fb6b2b79d420ecc2d 100644
--- a/convlab2/base_models/bert/train_bio.sh
+++ b/convlab2/base_models/bert/train_bio.sh
@@ -1,6 +1,7 @@
-n_gpus=3
+set -e
+n_gpus=2
 task_name="dialogBIO"
-dataset_name="sgd"
+dataset_name="sgd+tm1+tm2+tm3"
 data_dir="data/${task_name}/${dataset_name}"
 output_dir="output/${task_name}/${dataset_name}"
 cache_dir="cache"
@@ -18,9 +19,21 @@ lr=2e-5
 num_train_epochs=1
 metric_for_best_model="f1"
 
-python create_data.py --tasks ${task_name} --datasets ${dataset_name} --save_dir "data"
+names=$(echo ${dataset_name} | tr "+" "\n")
+mkdir -p ${data_dir}
+for name in ${names};
+do
+    echo "preprocessing ${name}"
+    python create_data.py --tasks ${task_name} --datasets ${name} --save_dir "data"
+    if [ "${name}" != "${dataset_name}" ]; then
+        cat "data/${task_name}/${name}/train.json" >> ${train_file}
+        cat "data/${task_name}/${name}/validation.json" >> ${validation_file}
+        cat "data/${task_name}/${name}/test.json" >> ${test_file}
+    fi
+done
 
-python -m torch.distributed.launch \
+
+CUDA_VISIBLE_DEVICES=1,2 python -m torch.distributed.launch \
     --nproc_per_node ${n_gpus} run_token_classification.py \
     --task_name ${task_name} \
     --train_file ${train_file} \
@@ -39,7 +52,6 @@ python -m torch.distributed.launch \
     --cache_dir ${cache_dir} \
     --output_dir ${output_dir} \
     --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
     --preprocessing_num_workers 4 \
     --per_device_train_batch_size ${per_device_train_batch_size} \
     --per_device_eval_batch_size ${per_device_eval_batch_size} \
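For orientation, the ${split}.json files that train_bio.sh and infer_bio.sh feed to run_token_classification.py appear to be JSON lines carrying parallel token and BIO-label sequences (inferred from source_column="tokens"/target_column="labels" above and from how create_revert_dialogBIO_data reads bio_dial['tokens']/bio_dial['labels'] via json_lines); a made-up line might look like this:

    # Hypothetical dialogBIO example line; the dialogue, tokens and labels are invented,
    # and the real files may carry additional fields.
    import json

    line = {
        "tokens": ["user", ":", "i", "need", "a", "cheap", "hotel"],
        "labels": ["O", "O", "O", "O", "O", "B", "O"],
    }
    print(json.dumps(line))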
diff --git a/convlab2/base_models/gpt/keyword_extraction/gen_pretraining_data.py b/convlab2/base_models/gpt/keyword_extraction/gen_pretraining_data.py
index b8b5af5c41d5d40db1ec16ac63427920c3cd8b5e..0f9c841257387a293866b6d0727900d626c8047f 100644
--- a/convlab2/base_models/gpt/keyword_extraction/gen_pretraining_data.py
+++ b/convlab2/base_models/gpt/keyword_extraction/gen_pretraining_data.py
@@ -1,22 +1,42 @@
 import json
 import os
+import random
+from tqdm import tqdm
 
 def main(args):
+    random.seed(45)
     os.makedirs(args.output_dir, exist_ok=True)
     filenames = [f for (_, _, fs) in os.walk(args.input_dir) for f in fs if 'keywords' in f]
     for filename in filenames:
         data = json.load(open(os.path.join(args.input_dir, filename)))
         fout = open(os.path.join(args.output_dir, f"{filename.split('/')[-1].split('_')[1]}.json"), 'w', encoding='utf-8')
-        for dial in data:
+        turn_keywords = [turn['keywords'] for dial in data for turn in dial]
+        random.shuffle(turn_keywords)
+        cnt = 0
+        # keywords_set = {keyword for keywords in turn_keywords_set for keyword in keywords}
+        for dial in tqdm(data):
             context = []
             for i, turn in enumerate(dial):
                 speaker = 'user' if i%2 == 0 else 'system'
-                keywords = ', '.join(turn['keywords'])
+                random.shuffle(turn['keywords'])
+                keywords = ' | '.join(turn['keywords'])
                 utt = turn['utterance']
-                input_seq = '\n'.join([f"{turn['speaker']}: {turn['utt']}" for turn in context]+[f'{speaker}: '])
-                input_seq = f'{keywords}\n{input_seq}'
+                context_seq = '\n'.join([f"{turn['speaker']}: {turn['utt']}" for turn in context]+[f'{speaker}: '])
+                input_seq = f'keywords: {keywords}\n\ncontext: {context_seq}'
                 context.append({'speaker': speaker, 'utt':utt})
                 fout.write(json.dumps({'keywords+context': input_seq, 'response': utt}, ensure_ascii=False)+'\n')
+
+                # min_neg = len(turn['keywords'])
+                # max_neg = 4 * min_neg
+                # negative_keywords = random.sample(keywords_set, random.randint(min_neg, max_neg))
+                # negative_keywords = random.sample(turn_keywords_set, 1)[0]
+                negative_keywords = turn_keywords[cnt]
+                cnt += 1
+                possible_keywords = turn['keywords'] + list(negative_keywords)
+                random.shuffle(possible_keywords)
+                possible_keywords = ' | '.join(possible_keywords)
+                input_seq = f'possible keywords: {possible_keywords}\n\ncontext: {context_seq}'
+                fout.write(json.dumps({'keywords+context': input_seq, 'response': utt}, ensure_ascii=False)+'\n')
 
 if __name__ == '__main__':
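To make the new pretraining format concrete: each turn now yields two JSON lines, one conditioned on the gold (shuffled) keywords and one conditioned on "possible keywords" padded with keywords drawn from a randomly chosen other turn as distractors. A rough illustration with invented keyword and utterance values:

    # Sketch of the two lines gen_pretraining_data.py emits per turn; the values here are
    # made up, and the real keywords come from the upstream keyword-extraction output.
    import json

    clean = {"keywords+context": "keywords: cheap | restaurant\n\ncontext: user: ",
             "response": "i want a cheap restaurant"}
    noisy = {"keywords+context": "possible keywords: cheap | moving | restaurant | flight\n\ncontext: user: ",
             "response": "i want a cheap restaurant"}
    print(json.dumps(clean, ensure_ascii=False))
    print(json.dumps(noisy, ensure_ascii=False))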
diff --git a/convlab2/base_models/gpt/keyword_extraction/gen_pretraining_data.sh b/convlab2/base_models/gpt/keyword_extraction/gen_pretraining_data.sh
index 7fd24bdfee565e73b7e326f71514d565573e5e7a..8a4290c3f2cd4fcae432d7fb94e4da25ec4ec727 100644
--- a/convlab2/base_models/gpt/keyword_extraction/gen_pretraining_data.sh
+++ b/convlab2/base_models/gpt/keyword_extraction/gen_pretraining_data.sh
@@ -1,7 +1,7 @@
-dataset_name="sgd+metalwoz+tm1+tm2+tm3"
+dataset_name="metalwoz+sgd+tm1+tm2+tm3"
 names=$(echo ${dataset_name} | tr "+" "\n")
 model_type="gpt"
-data_dir=data/key2gen/${model_type}/${name}/${dataset_name}
+data_dir=data/key2gen_shuffle_noisy/${model_type}/${name}/${dataset_name}
 mkdir -p ${data_dir}
 train_file="${data_dir}/train.json"
 validation_file="${data_dir}/validation.json"
@@ -9,10 +9,11 @@ test_file="${data_dir}/test.json"
 for name in ${names}
 do
     echo "preprocessing ${name}"
-    python gen_pretraining_data.py -i data/lm/${name}/${model_type} -o data/keygen/${model_type}/${name}
+    python gen_pretraining_data.py -i data/lm/${name}/${model_type} -o data/key2gen_shuffle_noisy/${model_type}/${name}
     if [ "${name}" != "${dataset_name}" ]; then
-        cat "data/keygen/gpt/${name}/train.json" >> ${train_file}
-        cat "data/keygen/gpt/${name}/validation.json" >> ${validation_file}
-        cat "data/keygen/gpt/${name}/test.json" >> ${test_file}
+        cat "data/key2gen_shuffle_noisy/gpt/${name}/train.json" >> ${train_file}
+        cat "data/key2gen_shuffle_noisy/gpt/${name}/validation.json" >> ${validation_file}
+        cat "data/key2gen_shuffle_noisy/gpt/${name}/test.json" >> ${test_file}
     fi
 done
+python gen_pretraining_data.py -i data/lm/multiwoz21/${model_type} -o data/key2gen_shuffle_noisy/${model_type}/multiwoz21
\ No newline at end of file
diff --git a/convlab2/base_models/gpt/keyword_extraction/train_t5_key2gen.sh b/convlab2/base_models/gpt/keyword_extraction/train_t5_key2gen.sh
index 8bd5de0590addaed4c6f32feb7f7d7bd18acd23f..c04d68fc374c38eb27b78f0ac288d04470e98d05 100644
--- a/convlab2/base_models/gpt/keyword_extraction/train_t5_key2gen.sh
+++ b/convlab2/base_models/gpt/keyword_extraction/train_t5_key2gen.sh
@@ -1,6 +1,6 @@
 set -e
-n_gpus=2
-task_name="key2gen"
+n_gpus=1
+task_name="key2gen_shuffle_noisy"
 dataset_name="metalwoz+sgd+tm1+tm2+tm3"
 speaker="all"
 model_type="gpt"
@@ -19,7 +19,7 @@ max_target_length=128
 model_name_or_path="t5-small"
 per_device_train_batch_size=128
 per_device_eval_batch_size=128
-gradient_accumulation_steps=4
+gradient_accumulation_steps=8
 lr=1e-3
 num_train_epochs=1
diff --git a/convlab2/base_models/t5/rg/run_rg.sh b/convlab2/base_models/t5/rg/run_rg.sh
index 6fcffca23995db20fab4d581fe350b5d47894882..ac065b81fefa2c406a469a6658263477f1fbe18d 100644
--- a/convlab2/base_models/t5/rg/run_rg.sh
+++ b/convlab2/base_models/t5/rg/run_rg.sh
@@ -20,7 +20,7 @@ per_device_train_batch_size=128
 per_device_eval_batch_size=128
 gradient_accumulation_steps=4
 lr=1e-3
-num_train_epochs=5
+num_train_epochs=1
 
 names=$(echo ${dataset_name} | tr "+" "\n")
 mkdir -p ${data_dir}