diff --git a/convlab/base_models/gpt/keyword_extraction/run.sh b/convlab/base_models/gpt/keyword_extraction/run.sh deleted file mode 100644 index f35c2403ce21f9450d3d7a84dc8e7076ee6f5f89..0000000000000000000000000000000000000000 --- a/convlab/base_models/gpt/keyword_extraction/run.sh +++ /dev/null @@ -1,5 +0,0 @@ -set -e -for dataset_name in dailydialog metalwoz tm1 tm2 tm3 sgd multiwoz21 -do - bash get_keywords.sh ${dataset_name} -done \ No newline at end of file diff --git a/convlab/base_models/gpt/keyword_extraction/test_t5_key2gen.sh b/convlab/base_models/gpt/keyword_extraction/test_t5_key2gen.sh deleted file mode 100644 index faaef560c20bd1a928f9c99503277780c4e8c26d..0000000000000000000000000000000000000000 --- a/convlab/base_models/gpt/keyword_extraction/test_t5_key2gen.sh +++ /dev/null @@ -1,49 +0,0 @@ -set -e -n_gpus=2 -master_port=23457 -task_name="key2gen_noisy" -dataset_name=$1 -model_type="gpt" -data_dir="data/${task_name}/${model_type}/${dataset_name}" -output_dir="output/${task_name}/${model_type}/${dataset_name}" -cache_dir="../cache" -logging_dir="${output_dir}/runs" -train_file="${data_dir}/train.json" -validation_file="${data_dir}/validation.json" -test_file="${data_dir}/test.json" -source_column="source" -target_column="target" -truncation_side="left" -max_source_length=512 -max_target_length=128 -model_name_or_path="output/${task_name}/${model_type}/dailydialog+metalwoz+tm1+tm2+tm3" -per_device_train_batch_size=128 -per_device_eval_batch_size=128 -gradient_accumulation_steps=2 -lr=1e-3 -num_train_epochs=3 - -python -m torch.distributed.launch --master_port ${master_port} \ - --nproc_per_node ${n_gpus} ../../t5/run_seq2seq.py \ - --task_name ${task_name} \ - --test_file ${test_file} \ - --source_column ${source_column} \ - --target_column ${target_column} \ - --max_source_length ${max_source_length} \ - --max_target_length ${max_target_length} \ - --truncation_side ${truncation_side} \ - --model_name_or_path ${model_name_or_path} \ - --do_predict \ - --predict_with_generate \ - --cache_dir ${cache_dir} \ - --output_dir ${output_dir} \ - --logging_dir ${logging_dir} \ - --overwrite_output_dir \ - --preprocessing_num_workers 16 \ - --per_device_train_batch_size ${per_device_train_batch_size} \ - --per_device_eval_batch_size ${per_device_eval_batch_size} \ - --gradient_accumulation_steps ${gradient_accumulation_steps} \ - --learning_rate ${lr} \ - --num_train_epochs ${num_train_epochs} \ - --adafactor \ - --gradient_checkpointing diff --git a/convlab/base_models/gpt/keyword_extraction/train_t5_key2gen.sh b/convlab/base_models/gpt/keyword_extraction/train_t5_key2gen.sh deleted file mode 100644 index 36e24587d3d7181e4a92b649e5f1b1e47cdcd9f3..0000000000000000000000000000000000000000 --- a/convlab/base_models/gpt/keyword_extraction/train_t5_key2gen.sh +++ /dev/null @@ -1,56 +0,0 @@ -set -e -n_gpus=2 -master_port=23457 -task_name="key2gen_noisy" -dataset_name="dailydialog+metalwoz+tm1+tm2+tm3+wikidialog" -model_type="gpt" -data_dir="data/${task_name}/${model_type}/${dataset_name}" -output_dir="output/${task_name}/${model_type}/${dataset_name}" -cache_dir="../cache" -logging_dir="${output_dir}/runs" -train_file="${data_dir}/train.json" -validation_file="${data_dir}/validation.json" -test_file="${data_dir}/test.json" -source_column="source" -target_column="target" -truncation_side="left" -max_source_length=512 -max_target_length=128 -model_name_or_path="t5-small" -per_device_train_batch_size=128 -per_device_eval_batch_size=128 -gradient_accumulation_steps=4 -lr=1e-3 -num_train_epochs=3 - -python -m torch.distributed.launch --master_port ${master_port} \ - --nproc_per_node ${n_gpus} ../../t5/run_seq2seq.py \ - --task_name ${task_name} \ - --train_file ${train_file} \ - --validation_file ${validation_file} \ - --test_file ${test_file} \ - --source_column ${source_column} \ - --target_column ${target_column} \ - --max_source_length ${max_source_length} \ - --max_target_length ${max_target_length} \ - --truncation_side ${truncation_side} \ - --model_name_or_path ${model_name_or_path} \ - --do_train \ - --do_eval \ - --do_predict \ - --save_strategy epoch \ - --evaluation_strategy epoch \ - --load_best_model_at_end \ - --prediction_loss_only \ - --cache_dir ${cache_dir} \ - --output_dir ${output_dir} \ - --logging_dir ${logging_dir} \ - --overwrite_output_dir \ - --preprocessing_num_workers 16 \ - --per_device_train_batch_size ${per_device_train_batch_size} \ - --per_device_eval_batch_size ${per_device_eval_batch_size} \ - --gradient_accumulation_steps ${gradient_accumulation_steps} \ - --learning_rate ${lr} \ - --num_train_epochs ${num_train_epochs} \ - --adafactor \ - --gradient_checkpointing diff --git a/convlab/base_models/gpt/keyword_extraction/train_t5_rg.sh b/convlab/base_models/gpt/keyword_extraction/train_t5_rg.sh index 4d628f7f2c53d766c4a0b92861ac7681b8c80b02..b2de55410064b0234f8416b6338e2a070c79147f 100644 --- a/convlab/base_models/gpt/keyword_extraction/train_t5_rg.sh +++ b/convlab/base_models/gpt/keyword_extraction/train_t5_rg.sh @@ -4,8 +4,9 @@ master_port=23456 task_name="rg" dataset_name="dailydialog+metalwoz+tm1+tm2+tm3+sgd+reddit+wikidialog" model_type="gpt" +model_name="t5-small" data_dir="data/${task_name}/${model_type}/${dataset_name}" -output_dir="output/${task_name}/${model_type}/${dataset_name}" +output_dir="output/${task_name}/${model_name}/${dataset_name}" cache_dir="../cache" logging_dir="${output_dir}/runs" train_file="${data_dir}/train.json" @@ -14,7 +15,7 @@ target_column="target" truncation_side="left" max_source_length=512 max_target_length=128 -model_name_or_path="t5-small" +model_name_or_path="${model_name}" per_device_train_batch_size=64 per_device_eval_batch_size=128 gradient_accumulation_steps=1 diff --git a/convlab/base_models/gpt/keyword_extraction/train_t5_multitask.sh b/convlab/base_models/gpt/keyword_extraction/train_t5_rg_key2gen+key2gen_noisy.sh similarity index 91% rename from convlab/base_models/gpt/keyword_extraction/train_t5_multitask.sh rename to convlab/base_models/gpt/keyword_extraction/train_t5_rg_key2gen+key2gen_noisy.sh index dd8d5a460478ae068143b5d18914500e31bae439..74c418164815cfd538e17cb08cd0de7c24ba7624 100644 --- a/convlab/base_models/gpt/keyword_extraction/train_t5_multitask.sh +++ b/convlab/base_models/gpt/keyword_extraction/train_t5_rg_key2gen+key2gen_noisy.sh @@ -4,8 +4,9 @@ master_port=23456 task_name="key2gen+key2gen_noisy" dataset_name="dailydialog+metalwoz+tm1+tm2+tm3+sgd+reddit+wikidialog" model_type="gpt" +model_name="t5-small" data_dir="data/${task_name}/${model_type}/${dataset_name}" -output_dir="output/${task_name}/${model_type}/${dataset_name}" +output_dir="output/${task_name}/${model_name}/${dataset_name}" cache_dir="../cache" logging_dir="${output_dir}/runs" train_file="${data_dir}/train.json" @@ -14,7 +15,7 @@ target_column="target" truncation_side="left" max_source_length=512 max_target_length=128 -model_name_or_path="output/rg/${model_type}/${dataset_name}" +model_name_or_path="output/rg/${model_name}/${dataset_name}" per_device_train_batch_size=64 per_device_eval_batch_size=128 gradient_accumulation_steps=1 diff --git a/convlab/base_models/t5/key2gen/create_data.py b/convlab/base_models/t5/key2gen/create_data.py new file mode 100644 index 0000000000000000000000000000000000000000..cb4e12c8e720f3031808ec0972c38926a617ef68 --- /dev/null +++ b/convlab/base_models/t5/key2gen/create_data.py @@ -0,0 +1,159 @@ +import os +import json +from tqdm import tqdm +from convlab.util import load_dataset, load_unified_data, load_nlu_data + +def create_nlg_data(dataset, data_dir, args): + data_by_split = load_nlu_data(dataset, speaker='system', use_context=True, context_window_size=3) + os.makedirs(data_dir, exist_ok=True) + + data_splits = data_by_split.keys() + for data_split in data_splits: + data = [] + for sample in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False): + context = [(turn['speaker'], turn['utterance']) for turn in sample['context']] + response = sample['utterance'] + if len(context) > 0 and len(response) > 0: + knowledge = sample['dialogue_acts'] + data.append(json.dumps({'context': context, 'knowledge': knowledge, 'response': response}, ensure_ascii=False)+'\n') + + if 'test' in data_split: + file_name = os.path.join(os.path.dirname(data_dir), f"{data_split}.json") + else: + file_name = os.path.join(data_dir, f"{data_split}.json") + with open(file_name, "w", encoding='utf-8') as f: + f.writelines(data) + data_by_split[data_split] = data + return data_by_split + +def create_kvret_data(dataset, data_dir, args): + data_by_split = load_unified_data(dataset, speaker='system', utterance=True, db_results=True, use_context=True, context_window_size=100) + os.makedirs(data_dir, exist_ok=True) + + domain2entity_col = {'schedule': 'event' ,'navigate': 'poi', 'weather': 'location'} + data_splits = data_by_split.keys() + for data_split in data_splits: + data = [] + for sample in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False): + context = [(turn['speaker'], turn['utterance']) for turn in sample['context']] + response = sample['utterance'] + if len(context) > 0 and len(response) > 0: + knowledge = sample['db_results'] + for domain, db_items in knowledge.items(): + entity_col = domain2entity_col[domain] + for db_item in db_items: + db_item['entity'] = db_item.pop(entity_col) + + data.append(json.dumps({'context': context, 'knowledge': knowledge, 'response': response}, ensure_ascii=False)+'\n') + + if 'test' in data_split: + file_name = os.path.join(os.path.dirname(data_dir), f"{data_split}.json") + else: + file_name = os.path.join(data_dir, f"{data_split}.json") + with open(file_name, "w", encoding='utf-8') as f: + f.writelines(data) + data_by_split[data_split] = data + return data_by_split + +def create_personachat_data(dataset, data_dir, args): + data_by_split = dataset + os.makedirs(data_dir, exist_ok=True) + + data_splits = data_by_split.keys() + for data_split in data_splits: + data = [] + for dial in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False): + knowledge = dial['persona']['system'] + context = [] + for turn in dial['turns']: + response = turn['utterance'] + if turn['speaker'] == 'system' and len(context) > 0 and len(response) > 0: + data.append(json.dumps({'context': context, 'knowledge': knowledge, 'response': response}, ensure_ascii=False)+'\n') + context.append((turn['speaker'], turn['utterance'])) + + if 'test' in data_split: + file_name = os.path.join(os.path.dirname(data_dir), f"{data_split}.json") + else: + file_name = os.path.join(data_dir, f"{data_split}.json") + with open(file_name, "w", encoding='utf-8') as f: + f.writelines(data) + data_by_split[data_split] = data + return data_by_split + +def create_wow_data(dataset, data_dir, args): + data_by_split = dataset + os.makedirs(data_dir, exist_ok=True) + + data_splits = data_by_split.keys() + for data_split in data_splits: + data = [] + for dial in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False): + context = [] + for turn in dial['turns']: + response = turn['utterance'] + if turn['speaker'] == 'system' and len(context) > 0 and len(response) > 0: + knowledge = turn['checked_passage'] + if knowledge is None: + knowledge = [] + elif isinstance(knowledge, str): + knowledge = [knowledge] + data.append(json.dumps({'context': context, 'knowledge': knowledge, 'response': response}, ensure_ascii=False)+'\n') + context.append((turn['speaker'], turn['utterance'])) + + if 'test' in data_split: + file_name = os.path.join(os.path.dirname(data_dir), f"{data_split}.json") + else: + file_name = os.path.join(data_dir, f"{data_split}.json") + with open(file_name, "w", encoding='utf-8') as f: + f.writelines(data) + data_by_split[data_split] = data + return data_by_split + +def create_opendialkg_data(dataset, data_dir, args): + data_by_split = dataset + os.makedirs(data_dir, exist_ok=True) + + data_splits = data_by_split.keys() + for data_split in data_splits: + data = [] + for dial in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False): + context = [] + for turn in dial['turns']: + response = turn['utterance'] + if turn['speaker'] == 'system' and 'kg_path' in turn and len(context) > 0 and len(response) > 0: + knowledge = turn['kg_path']['triples'] + data.append(json.dumps({'context': context, 'knowledge': knowledge, 'response': response}, ensure_ascii=False)+'\n') + context.append((turn['speaker'], turn['utterance'])) + + if 'test' in data_split: + file_name = os.path.join(os.path.dirname(data_dir), f"{data_split}.json") + else: + file_name = os.path.join(data_dir, f"{data_split}.json") + with open(file_name, "w", encoding='utf-8') as f: + f.writelines(data) + data_by_split[data_split] = data + return data_by_split + + +if __name__ == '__main__': + from argparse import ArgumentParser + parser = ArgumentParser(description="create data for seq2seq training") + parser.add_argument('--tasks', '-t', metavar='task_name', nargs='*', choices=['nlg', 'kvret', 'opendialkg', 'personachat', 'wow'], help='names of tasks') + parser.add_argument('--datasets', '-d', metavar='dataset_name', nargs='*', help='names of unified datasets') + parser.add_argument('--shot', '-s', type=float, default=None, help='how many data is used for training and evaluation, ratio if < 1 else absolute number') + parser.add_argument('--dial_ids_order', '-o', type=int, default=None, help='which data order is used for experiments') + args = parser.parse_args() + print(args) + for dataset_name in tqdm(args.datasets, desc='datasets'): + dataset = load_dataset(dataset_name, dial_ids_order=args.dial_ids_order) + if args.shot: + if args.shot < 1: + dataset['train'] = dataset['train'][:round(len(dataset['train'])*args.shot)] + dataset['validation'] = dataset['validation'][:round(len(dataset['validation'])*args.shot)] + else: + args.shot = int(args.shot) + dataset['train'] = dataset['train'][:args.shot] + dataset['validation'] = dataset['validation'][:args.shot] + for task_name in tqdm(args.tasks, desc='tasks', leave=False): + data_dir = os.path.join('data', task_name, (dataset_name if not args.shot else f'{dataset_name}_{args.shot}shot_order{args.dial_ids_order}')) + data_by_split = eval(f"create_{task_name}_data")(dataset, data_dir, args) diff --git a/convlab/base_models/t5/key2gen/create_data_key2gen.py b/convlab/base_models/t5/key2gen/create_data_key2gen.py deleted file mode 100644 index 6c6d9a0fa6f2435f6b657c44d9ce714d3a66ddfb..0000000000000000000000000000000000000000 --- a/convlab/base_models/t5/key2gen/create_data_key2gen.py +++ /dev/null @@ -1,269 +0,0 @@ -import os -import json -from tqdm import tqdm -from transformers import AutoTokenizer -from convlab.util import load_dataset, load_unified_data, load_nlu_data -from convlab.base_models.t5.nlu.serialization import serialize_dialogue_acts -from collections import Counter -from nltk.tokenize import word_tokenize, sent_tokenize -from nltk.corpus import stopwords -from string import punctuation - -def create_nlg_data(dataset, data_dir, args): - data_by_split = load_nlu_data(dataset, speaker=args.speaker, use_context=args.context_window_size>0, context_window_size=args.context_window_size) - data_dir = os.path.join(data_dir, args.speaker, f'context_{args.context_window_size}') - os.makedirs(data_dir, exist_ok=True) - - data_splits = data_by_split.keys() - for data_split in data_splits: - data = [] - for sample in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False): - if args.key2gen: - keywords = [] - for da_type in sample['dialogue_acts']: - for da in sample['dialogue_acts'][da_type]: - intent, domain, slot, value = da['intent'], da['domain'], da['slot'], da.get('value', '') - intent_domain = f'{intent}-{domain}' - keywords.append([intent_domain]) - if len(slot) > 0: - keywords[-1].append(slot) - if len(value) > 0: - keywords[-1].append(value) - dialogue_acts_seq = '| {} |'.format(' | '.join([' : '.join(da_keywords) for da_keywords in keywords])) - else: - dialogue_acts_seq = serialize_dialogue_acts(sample['dialogue_acts']) - - if args.context_window_size>0: - context = '\n'.join([f"{turn['speaker']}: {turn['utterance']}" for turn in sample['context']]+[f'{sample["speaker"]}: ']) - context = f'generate a response: grounded knowledge: {dialogue_acts_seq} context:\n\n{context}' - else: - context = f'generate a response: grounded knowledge: {dialogue_acts_seq} context:\n\n{sample["speaker"]}: ' - - data.append(json.dumps({'context+da': context, 'response': sample['utterance']}, ensure_ascii=False)+'\n') - - file_name = os.path.join(data_dir, f"{data_split}.json") - with open(file_name, "w", encoding='utf-8') as f: - f.writelines(data) - data_by_split[data_split] = data - return data_by_split - -def create_dart_data(dataset, data_dir, args): - data_by_split = dataset - os.makedirs(data_dir, exist_ok=True) - - data_splits = data_by_split.keys() - for data_split in data_splits: - data = [] - for sample in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False): - sample = sample['turns'][0] - triples = sample['tripleset'] - if args.key2gen: - keywords = [w for triple in triples for w in triple] - # DONE: try adding prompt, no improvement - entity_cnt = Counter() - for triple in triples: - e1, r, e2 = triple - for e in [e1, e2]: - if e.startswith('[') and e.endswith(']'): - continue - entity_cnt[e] += 1 - - assert len(entity_cnt) > 0 - common_entity = entity_cnt.most_common(1)[0][0] - context = f'{" | ".join(keywords)}\n\ncontext: user: tell me something about {common_entity}. system: ' - else: - triples = [' : '.join(triple) for triple in triples] - context = f'{" | ".join(triples)}\n\ncontext: system: ' - - data.append(json.dumps({'triples': context, 'text': sample['utterance']}, ensure_ascii=False)+'\n') - - file_name = os.path.join(data_dir, f"{data_split}.json") - with open(file_name, "w", encoding='utf-8') as f: - f.writelines(data) - data_by_split[data_split] = data - return data_by_split - -def create_commongen_data(dataset, data_dir, args): - data_by_split = dataset - os.makedirs(data_dir, exist_ok=True) - - data_splits = data_by_split.keys() - for data_split in data_splits: - data = [] - for sample in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False): - sample = sample['turns'][0] - concepts = sample['concepts'] - context = f'{" | ".join(concepts)}\n\ncontext: system: ' - - data.append(json.dumps({'concepts': context, 'text': sample['utterance']}, ensure_ascii=False)+'\n') - - file_name = os.path.join(data_dir, f"{data_split}.json") - with open(file_name, "w", encoding='utf-8') as f: - f.writelines(data) - data_by_split[data_split] = data - return data_by_split - -def create_kvret_data(dataset, data_dir, args): - data_by_split = load_unified_data(dataset, speaker='system', utterance=True, db_results=True, use_context=True, context_window_size=100) - os.makedirs(data_dir, exist_ok=True) - - domain2entity_col = {'schedule': 'event' ,'navigate': 'poi', 'weather': 'location'} - data_splits = data_by_split.keys() - for data_split in data_splits: - data = [] - for sample in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False): - if len(sample['utterance']) == 0: - continue - db_results = sample['db_results'] - db_seqs = [] - for domain, db_items in db_results.items(): - entity_col = domain2entity_col[domain] - for db_item in db_items: - entity = db_item[entity_col] - for db_key, db_value in db_item.items(): - if db_key == entity_col: - continue - db_seqs.append(' : '.join([entity, db_key, db_value])) - db_seq = ' |\n'.join(db_seqs) - - context = '\n'.join([f"{turn['speaker']}: {turn['utterance']}" for turn in sample['context']]+[f'{sample["speaker"]}: ']) - context = f'generate a response: all knowledge:\n\n| {db_seq} | context:\n\n{context}' - data.append(json.dumps({'context+db': context, 'response': sample['utterance']}, ensure_ascii=False)+'\n') - - file_name = os.path.join(data_dir, f"{data_split}.json") - with open(file_name, "w", encoding='utf-8') as f: - f.writelines(data) - data_by_split[data_split] = data - return data_by_split - -def create_personachat_data(dataset, data_dir, args): - data_by_split = dataset - os.makedirs(data_dir, exist_ok=True) - - stop_words = set(stopwords.words('english')) | set(punctuation) - def sentence2keywords(sentence): - index2keyword = {} - for i, w in enumerate(word_tokenize(sentence)): - if not w.lower() in stop_words: - index2keyword[i] = w - indexes = sorted(index2keyword.keys()) - keywords = [] - for i, index in enumerate(indexes): - if i > 0 and index == indexes[i-1] + 1: - keywords[-1]+= ' '+index2keyword[index] - else: - keywords.append(index2keyword[index]) - return keywords - - data_splits = data_by_split.keys() - for data_split in data_splits: - data = [] - for dial in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False): - persona = dial['persona']['system'] - if args.key2gen: - persona_seq = ' |\n'.join([' : '.join(sentence2keywords(s)) for s in persona]) - else: - persona_seq = ' | '.join(persona) - context = [] - for turn in dial['turns']: - if turn['speaker'] == 'system': - context_seq = '\n'.join([f"{t['speaker']}: {t['utterance']}" for t in context]+[f'{turn["speaker"]}: ']) - context_seq = f'generate a response: all knowledge:\n\n| {persona_seq} | context:\n\n{context_seq}' - data.append(json.dumps({'context+persona': context_seq, 'response': turn['utterance']}, ensure_ascii=False)+'\n') - context.append({'speaker': turn['speaker'], 'utterance': turn['utterance']}) - - file_name = os.path.join(data_dir, f"{data_split}.json") - with open(file_name, "w", encoding='utf-8') as f: - f.writelines(data) - data_by_split[data_split] = data - return data_by_split - -def create_wow_data(dataset, data_dir, args): - data_by_split = dataset - os.makedirs(data_dir, exist_ok=True) - - stop_words = set(stopwords.words('english')) | set(punctuation) - def sentence2keywords(sentence): - index2keyword = {} - for i, w in enumerate(word_tokenize(sentence)): - if not w.lower() in stop_words: - index2keyword[i] = w - indexes = sorted(index2keyword.keys()) - keywords = [] - for i, index in enumerate(indexes): - if i > 0 and index == indexes[i-1] + 1: - keywords[-1]+= ' '+index2keyword[index] - else: - keywords.append(index2keyword[index]) - return keywords - - def sentences2keywords_seq(sentences): - return ' |\n'.join([' : '.join(sentence2keywords(sentence)) for sentence in sent_tokenize(sentences)]) - - - data_splits = data_by_split.keys() - for data_split in data_splits: - data = [] - for dial in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False): - context = [] - for turn in dial['turns']: - if turn['speaker'] == 'system': - if turn['checked_sentence']: - if args.key2gen: - know_seq = f" | {sentences2keywords_seq(turn['checked_sentence'])} |" - else: - know_seq = turn['checked_sentence'] - else: - know_seq = '' - context_seq = '\n'.join([f"{t['speaker']}: {t['utterance']}" for t in context]+[f'{turn["speaker"]}: ']) - context_seq = f'generate a response: grounded knowledge:\n\n{know_seq} context:\n\n{context_seq}' - data.append(json.dumps({'context+knowledge': context_seq, 'response': turn['utterance']}, ensure_ascii=False)+'\n') - context.append({'speaker': turn['speaker'], 'utterance': turn['utterance']}) - - file_name = os.path.join(data_dir, f"{data_split}.json") - with open(file_name, "w", encoding='utf-8') as f: - f.writelines(data) - data_by_split[data_split] = data - return data_by_split - -def get_max_len(data_by_split, tokenizer): - for data_split in data_by_split.keys(): - seq_len = {} - for line in data_by_split[data_split]: - item = json.loads(line.strip()) - for column, seq in item.items(): - seq_len.setdefault(column, []) - seq_len[column].append(len(tokenizer.tokenize(seq))) - print(f"data split: {data_split}") - for column, lens in seq_len.items(): - print(f'\t{column}\tmax_len: {max(lens)}\tmean_len: {round(sum(lens)/len(lens),2)}') - - -if __name__ == '__main__': - from argparse import ArgumentParser - parser = ArgumentParser(description="create data for seq2seq training") - parser.add_argument('--tasks', '-t', metavar='task_name', nargs='*', choices=['nlg', 'dart', 'commongen', 'kvret', 'personachat', 'wow'], help='names of tasks') - parser.add_argument('--datasets', '-d', metavar='dataset_name', nargs='*', help='names of unified datasets') - parser.add_argument('--speaker', '-s', type=str, choices=['user', 'system', 'all'], help='speaker(s)') - parser.add_argument('--context_window_size', '-c', type=int, default=0, help='how many contextual utterances are considered') - parser.add_argument('--len_tokenizer', '-l', type=str, default=None, help='name or path of tokenizer that used to get seq len') - parser.add_argument('--ratio', '-r', type=float, default=None, help='how many data is used for training and evaluation') - parser.add_argument('--dial_ids_order', '-o', type=int, default=None, help='which data order is used for experiments') - parser.add_argument('--key2gen', '-k', action='store_true', default=False, help='generate data for key2gen models') - args = parser.parse_args() - print(args) - if args.len_tokenizer: - tokenizer = AutoTokenizer.from_pretrained(args.len_tokenizer) - for dataset_name in tqdm(args.datasets, desc='datasets'): - dataset = load_dataset(dataset_name, args.dial_ids_order) - if args.ratio: - dataset['train'] = dataset['train'][:round(len(dataset['train'])*args.ratio)] - dataset['validation'] = dataset['validation'][:round(len(dataset['validation'])*args.ratio)] - for task_name in tqdm(args.tasks, desc='tasks', leave=False): - if args.key2gen: - data_dir = os.path.join('data', task_name, "key2gen_"+(dataset_name if not args.ratio else f'{dataset_name}_{args.ratio}_order{args.dial_ids_order}')) - else: - data_dir = os.path.join('data', task_name, (dataset_name if not args.ratio else f'{dataset_name}_{args.ratio}_order{args.dial_ids_order}')) - data_by_split = eval(f"create_{task_name}_data")(dataset, data_dir, args) - if args.len_tokenizer: - get_max_len(data_by_split, tokenizer) diff --git a/convlab/base_models/t5/key2gen/dataset_vanilla.py b/convlab/base_models/t5/key2gen/dataset_vanilla.py new file mode 100644 index 0000000000000000000000000000000000000000..083c277f2c0f0af346d5b33e2248c40bb010cec0 --- /dev/null +++ b/convlab/base_models/t5/key2gen/dataset_vanilla.py @@ -0,0 +1,123 @@ +# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Data processing for vanilla generator""" + +import json +import datasets +from convlab.base_models.t5.key2gen.features import FEATURES +from copy import deepcopy + + +class VanillaDataset(datasets.GeneratorBasedBuilder): + """Dataset for vanilla generator (e.g., t5)""" + + VERSION = datasets.Version("1.18.0") + + BUILDER_CONFIGS = [ + datasets.BuilderConfig(name="nlg", version=VERSION, description="DA grounded generation task"), + datasets.BuilderConfig(name="kvret", version=VERSION, description="KB grounded generation task"), + datasets.BuilderConfig(name="opendialkg", version=VERSION, description="KG grounded generation task"), + datasets.BuilderConfig(name="wow", version=VERSION, description="Passage grounded generation task"), + datasets.BuilderConfig(name="personachat", version=VERSION, description="Persona grounded generation task"), + ] + + def _info(self): + return datasets.DatasetInfo( + description=f"Vanilla Dataset for {self.config.description}", + features=datasets.Features(deepcopy(FEATURES[self.config.name])) + ) + + def _split_generators(self, dl_manager): + generators = [] + if "train" in self.config.data_files: + generators.append(datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": self.config.data_files["train"][0], + "split": "train", + }, + )) + if "validation" in self.config.data_files: + generators.append(datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={ + "filepath": self.config.data_files["validation"][0], + "split": "validation", + }, + )) + if "test" in self.config.data_files: + generators.append(datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "filepath": self.config.data_files["test"][0], + "split": "test", + }, + )) + + return generators + + def _generate_examples(self, filepath, split): + with open(filepath, encoding="utf-8") as f: + for key, row in enumerate(f): + item = json.loads(row) + if self.config.name == "nlg": + knowledge = item["knowledge"] + triples = [] + for da_type in knowledge: + for da in knowledge[da_type]: + intent, domain, slot, value = da["intent"], da["domain"], da["slot"], da.get("value", "") + if 'start' in da: + da.pop('start') + da.pop('end') + intent_domain = f"{intent}-{domain}" + triples.append([intent_domain]) + if len(slot) > 0: + triples[-1].append(slot) + if len(value) > 0: + triples[-1].append(value) + knowledge_seq = "| {} |".format(" | ".join([" : ".join(da_keywords) for da_keywords in triples])) + + elif self.config.name == "kvret": + knowledge = {"schedule": [], "weather": [], "navigate": []} + triples = [] + for domain, db_items in item["knowledge"].items(): + knowledge[domain] = db_items + for db_item in db_items: + entity = db_item["entity"] + for db_key, db_value in db_item.items(): + if db_key == "entity": + continue + triples.append([entity, db_key, db_value]) + knowledge_seq = "| {} |".format(" | ".join([" : ".join(triple) for triple in triples])) + + elif self.config.name == "opendialkg": + knowledge = item["knowledge"] + knowledge_seq = "| {} |".format(" | ".join([" : ".join(triple) for triple in item["knowledge"]])) + + elif self.config.name in ["wow", "personachat"]: + knowledge = item["knowledge"] + try: + knowledge_seq = "| {} |".format(" | ".join(item["knowledge"])) + except: + print([knowledge]) + raise + + context = "\n".join([f"{turn[0]}: {turn[1]}" for turn in item["context"]]+["system: "]) + context_knowledge = f"generate a response: grounded knowledge: \n\n{knowledge_seq} context:\n\n{context}" + + yield key, { + "context+knowledge": context_knowledge, + "response": item["response"], + "knowledge": knowledge, + } diff --git a/convlab/base_models/t5/key2gen/evaluate.py b/convlab/base_models/t5/key2gen/evaluate.py new file mode 100644 index 0000000000000000000000000000000000000000..63bf815de92d5cc5aa1b6adda3d8edfe2321c130 --- /dev/null +++ b/convlab/base_models/t5/key2gen/evaluate.py @@ -0,0 +1,78 @@ +from tabulate import tabulate +import os +import json +from tqdm import tqdm +from datasets import load_metric +import numpy as np + +def evaluate(filename, metric): + """ + It reads the predictions, references, and knowledge from a file, and then computes the metric + + :param filename: the path to the file containing the predictions + :param metric: the metric to use for evaluation + :return: The result of the evaluation. + """ + predictions, references, knowledge = [], [], [] + with open(filename, 'r') as f: + for line in f: + item = json.loads(line) + predictions.append(item['predictions']) + references.append(item['response']) + knowledge.append(item['knowledge']) + result = metric.compute(predictions=predictions, references=references, knowledge=knowledge) + return result + + +def avg_result(results): + """ + It takes a list of dictionaries, and returns a dictionary with the same keys, but the values are the + mean and standard deviation of the values in the input dictionaries + + :param results: a list of dictionaries, each dictionary is the result of a single run of the model + :return: The average and standard deviation of the results. + """ + ret = {} + for k in results[0]: + m = round(np.mean([result[k] for result in results]), 2) + v = round(np.std([result[k] for result in results], ddof=1), 2) if len(results) > 1 else None + ret[k] = f"{m}({v})" + return ret + + +if __name__ == '__main__': + from argparse import ArgumentParser + parser = ArgumentParser(description="create data for seq2seq training") + parser.add_argument("--output_dirs", type=str, nargs='*', required=True) + parser.add_argument('--tasks', '-t', type=str, nargs='*', choices=['nlg', 'kvret', 'opendialkg', 'personachat', 'wow'], help='names of tasks') + parser.add_argument('--shots', '-s', type=int, nargs='*', help='how many data is used for training and evaluation, ratio if < 1 else absolute number') + parser.add_argument('--dial_ids_orders', '-o', type=int, nargs='*', help='which data order is used for experiments') + args = parser.parse_args() + print(args) + + tables = [] + for task_name in tqdm(args.tasks, desc='tasks'): + metric = load_metric("metric.py", task_name) + dataset_name = task_name if task_name != "nlg" else "multiwoz21" + for shot in tqdm(args.shots, desc='shots'): + for output_dir in tqdm(args.output_dirs, desc='models'): + model_name = output_dir.split('/')[-1] + if task_name == "wow": + test_splits = ["_seen", "_unseen"] + else: + test_splits = [""] + for test_split in test_splits: + results = [] + for dial_ids_order in tqdm(args.dial_ids_orders, desc='dial_ids_orders'): + filename = os.path.join(output_dir, task_name, f"{dataset_name}_{shot}shot_order{dial_ids_order}/gen{test_split}/generated_predictions.json") + results.append(evaluate(filename, metric)) + res = { + "dataset": f"{task_name}-{shot}shot", + "model": f"{model_name}{test_split}", + **avg_result(results) + } + tables.append(res) + # print(res) + res = tabulate(tables, headers='keys', tablefmt='github') + with open(f'eval_results.txt', 'w', encoding='utf-8') as f: + print(res, file=f) diff --git a/convlab/base_models/t5/key2gen/features.py b/convlab/base_models/t5/key2gen/features.py new file mode 100644 index 0000000000000000000000000000000000000000..0ac768b5cbe61d46e430580b025182e515db93ef --- /dev/null +++ b/convlab/base_models/t5/key2gen/features.py @@ -0,0 +1,72 @@ +import datasets + +FEATURES = { + "nlg": { + "context+knowledge": datasets.Value("string"), + "response": datasets.Value("string"), + "knowledge": { + "categorical": datasets.Sequence({ + "intent": datasets.Value("string"), + "domain": datasets.Value("string"), + "slot": datasets.Value("string"), + "value": datasets.Value("string"), + }), + "non-categorical": datasets.Sequence({ + "intent": datasets.Value("string"), + "domain": datasets.Value("string"), + "slot": datasets.Value("string"), + "value": datasets.Value("string"), + }), + "binary": datasets.Sequence({ + "intent": datasets.Value("string"), + "domain": datasets.Value("string"), + "slot": datasets.Value("string"), + }) + }}, + "kvret": { + "context+knowledge": datasets.Value("string"), + "response": datasets.Value("string"), + "knowledge": { + "schedule": datasets.Sequence({ + "entity": datasets.Value("string"), + "time": datasets.Value("string"), + "date": datasets.Value("string"), + "party": datasets.Value("string"), + "room": datasets.Value("string"), + "agenda": datasets.Value("string") + }), + "weather": datasets.Sequence({ + "entity": datasets.Value("string"), + "today": datasets.Value("string"), + "monday": datasets.Value("string"), + "tuesday": datasets.Value("string"), + "wednesday": datasets.Value("string"), + "thursday": datasets.Value("string"), + "friday": datasets.Value("string"), + "saturday": datasets.Value("string"), + "sunday": datasets.Value("string"), + }), + "navigate": datasets.Sequence({ + "entity": datasets.Value("string"), + "traffic_info": datasets.Value("string"), + "poi_type": datasets.Value("string"), + "address": datasets.Value("string"), + "distance": datasets.Value("string") + }) + }}, + "opendialkg": { + "context+knowledge": datasets.Value("string"), + "response": datasets.Value("string"), + "knowledge": datasets.Sequence(datasets.Sequence(datasets.Value("string"))), + }, + "wow": { + "context+knowledge": datasets.Value("string"), + "response": datasets.Value("string"), + "knowledge": datasets.Sequence(datasets.Value("string")), + }, + "personachat": { + "context+knowledge": datasets.Value("string"), + "response": datasets.Value("string"), + "knowledge": datasets.Sequence(datasets.Value("string")), + } +} \ No newline at end of file diff --git a/convlab/base_models/t5/key2gen/finetune.sh b/convlab/base_models/t5/key2gen/finetune.sh new file mode 100644 index 0000000000000000000000000000000000000000..390ea1a908dcf9f335ec74d35a422239e9a923ca --- /dev/null +++ b/convlab/base_models/t5/key2gen/finetune.sh @@ -0,0 +1,153 @@ +n_gpus=2 +master_port=23456 +cache_dir="../cache" +dataset_path="dataset_vanilla.py" +metric_name_or_path="metric.py" +source_column="context+knowledge" +target_column="response" +truncation_side="left" +max_source_length=512 +max_target_length=512 +model_name="t5-small" +model_name_or_path="t5-small" +per_device_train_batch_size=64 +per_device_eval_batch_size=64 +gradient_accumulation_steps=1 +num_workers=16 +lr=1e-3 +num_train_epochs=100 + +for dataset_name in multiwoz21 kvret opendialkg wow personachat +do + if [ "${dataset_name}" == "multiwoz21" ] + then + task_name="nlg" + else + task_name=${dataset_name} + fi + for shot in 50 100 200 + do + for dial_ids_order in 0 1 2 3 4 + do + python create_data.py -t ${task_name} -d ${dataset_name} -o ${dial_ids_order} -s ${shot} + + data_dir="data/${task_name}/${dataset_name}_${shot}shot_order${dial_ids_order}" + output_dir="output/${model_name}/${task_name}/${dataset_name}_${shot}shot_order${dial_ids_order}" + logging_dir="${output_dir}/runs" + train_file="${data_dir}/train.json" + validation_file="${data_dir}/validation.json" + + # training + python -m torch.distributed.launch --master_port ${master_port} \ + --nproc_per_node ${n_gpus} ../run_seq2seq.py \ + --task_name ${task_name} \ + --dataset_name ${dataset_path} \ + --dataset_config_name ${task_name} \ + --train_file ${train_file} \ + --validation_file ${validation_file} \ + --source_column ${source_column} \ + --target_column ${target_column} \ + --max_source_length ${max_source_length} \ + --max_target_length ${max_target_length} \ + --truncation_side ${truncation_side} \ + --model_name_or_path ${model_name_or_path} \ + --do_train \ + --do_eval \ + --save_strategy epoch \ + --evaluation_strategy epoch \ + --save_total_limit 1 \ + --prediction_loss_only \ + --load_best_model_at_end \ + --overwrite_output_dir \ + --cache_dir ${cache_dir} \ + --output_dir ${output_dir} \ + --logging_dir ${logging_dir} \ + --preprocessing_num_workers ${num_workers} \ + --dataloader_num_workers ${num_workers} \ + --per_device_train_batch_size ${per_device_train_batch_size} \ + --per_device_eval_batch_size ${per_device_eval_batch_size} \ + --gradient_accumulation_steps ${gradient_accumulation_steps} \ + --learning_rate ${lr} \ + --num_train_epochs ${num_train_epochs} \ + --optim adafactor \ + --lr_scheduler_type constant \ + --gradient_checkpointing + + # inference + if [ "${dataset_name}" == "wow" ] + then + for test_split in seen unseen + do + test_file="data/${task_name}/test_${test_split}.json" + gen_output_dir="${output_dir}/gen_${test_split}" + + python -m torch.distributed.launch --master_port ${master_port} \ + --nproc_per_node ${n_gpus} ../run_seq2seq.py \ + --task_name ${task_name} \ + --dataset_name ${dataset_path} \ + --dataset_config_name ${task_name} \ + --test_file ${test_file} \ + --source_column ${source_column} \ + --target_column ${target_column} \ + --max_source_length ${max_source_length} \ + --max_target_length ${max_target_length} \ + --truncation_side ${truncation_side} \ + --model_name_or_path ${output_dir} \ + --do_predict \ + --predict_with_generate \ + --cache_dir ${cache_dir} \ + --output_dir ${gen_output_dir} \ + --logging_dir ${logging_dir} \ + --overwrite_output_dir \ + --preprocessing_num_workers ${num_workers} \ + --dataloader_num_workers ${num_workers} \ + --per_device_train_batch_size ${per_device_train_batch_size} \ + --per_device_eval_batch_size ${per_device_eval_batch_size} \ + --gradient_accumulation_steps ${gradient_accumulation_steps} \ + --learning_rate ${lr} \ + --num_train_epochs ${num_train_epochs} \ + --optim adafactor \ + --lr_scheduler_type constant \ + --gradient_checkpointing + done + else + test_file="data/${task_name}/test.json" + gen_output_dir="${output_dir}/gen" + + python -m torch.distributed.launch --master_port ${master_port} \ + --nproc_per_node ${n_gpus} ../run_seq2seq.py \ + --task_name ${task_name} \ + --dataset_name ${dataset_path} \ + --dataset_config_name ${task_name} \ + --metric_name_or_path ${metric_name_or_path} \ + --metric_config_name ${task_name} \ + --test_file ${test_file} \ + --source_column ${source_column} \ + --target_column ${target_column} \ + --max_source_length ${max_source_length} \ + --max_target_length ${max_target_length} \ + --truncation_side ${truncation_side} \ + --model_name_or_path ${output_dir} \ + --do_predict \ + --predict_with_generate \ + --cache_dir ${cache_dir} \ + --output_dir ${gen_output_dir} \ + --logging_dir ${logging_dir} \ + --overwrite_output_dir \ + --preprocessing_num_workers ${num_workers} \ + --dataloader_num_workers ${num_workers} \ + --per_device_train_batch_size ${per_device_train_batch_size} \ + --per_device_eval_batch_size ${per_device_eval_batch_size} \ + --gradient_accumulation_steps ${gradient_accumulation_steps} \ + --learning_rate ${lr} \ + --num_train_epochs ${num_train_epochs} \ + --optim adafactor \ + --lr_scheduler_type constant \ + --gradient_checkpointing + fi + done + done +done + +# evaluation +python evaluate.py --output_dirs output/${model_name} -t nlg kvret opendialkg personachat wow -s 50 100 200 -o 0 1 2 3 4 \ No newline at end of file diff --git a/convlab/base_models/t5/key2gen/metric.py b/convlab/base_models/t5/key2gen/metric.py index 050db04d120383bece5bfd5e9cb3362f66c8ecac..d3e493188194639adad04539619bb63f14284841 100644 --- a/convlab/base_models/t5/key2gen/metric.py +++ b/convlab/base_models/t5/key2gen/metric.py @@ -11,8 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""NLG Metric""" +"""Grounded Dialog Generation Metric""" +from weakref import ref import datasets from sacrebleu.metrics import BLEU from sacrebleu.utils import sum_of_lists @@ -20,169 +21,423 @@ import re from collections import Counter import numpy as np from nltk.corpus import stopwords -from nltk import sent_tokenize from rouge_score import rouge_scorer, scoring from nltk.translate import meteor_score from datasets.config import importlib_metadata, version +from convlab.base_models.t5.key2gen.features import FEATURES +from convlab.util import load_ontology +from copy import deepcopy NLTK_VERSION = version.parse(importlib_metadata.version("nltk")) if NLTK_VERSION >= version.Version("3.6.5"): from nltk import word_tokenize +# Uncomment to download nltk_data for the first time running. +# import nltk +# nltk.download("wordnet") +# if NLTK_VERSION >= version.Version("3.6.5"): +# nltk.download("punkt") +# if NLTK_VERSION >= version.Version("3.6.6"): +# nltk.download("omw-1.4") -# TODO: Add BibTeX citation -_CITATION = """\ -@inproceedings{post-2018-call, - title = "A Call for Clarity in Reporting {BLEU} Scores", - author = "Post, Matt", - booktitle = "Proceedings of the Third Conference on Machine Translation: Research Papers", - month = oct, - year = "2018", - address = "Belgium, Brussels", - publisher = "Association for Computational Linguistics", - url = "https://www.aclweb.org/anthology/W18-6319", - pages = "186--191", -} + +_CITATION = """ """ _DESCRIPTION = """\ -Metric to evaluate text-to-text models on the natural language generation task. +Metric to evaluate text generation models on the grounded dialog generation task. """ +# TODO _KWARGS_DESCRIPTION = """ -Calculates corpus-bleu4 Args: predictions: list of predictions to score. Each predictions should be a string. references: list of reference for each prediction. Each reference should be a string. + knowledge: task-specific grounded knowledge + Returns: - bleu: corpus-bleu score + bleu-1/2/3/4: corpus-bleu score, from sacrebleu + rouge-1/2/L: ROUGE-F1, from rouge_score + meteor: METEOR, from nltk + unigram f1: unigram overlap, from parlai + distinct-1/2: from parlai + other knowledge utility score: task-specific knowledge utility metrics + Examples: - >>> nlg_metric = datasets.load_metric("nlg_metric.py") + >>> nlg_metric = datasets.load_metric("metric.py", "nlg") >>> predictions = ["hello there general kenobi", "foo bar foobar"] >>> references = ["hello there kenobi", "foo bar foobar"] >>> results = nlg_metric.compute(predictions=predictions, references=references) >>> print(results) - {'bleu': 35.35533905932737} + {"bleu": 35.35533905932737} """ +re_art = re.compile(r'\b(a|an|the)\b') +re_punc = re.compile(r'[!"#$%&()*+,-./:;<=>?@\[\]\\^`{|}~_\']') +stop_words = set(stopwords.words("english")) +def utt2words(s): + """Lower text and remove punctuation, articles and extra whitespace. + from parlai https://github.com/facebookresearch/ParlAI/blob/9daae69320c07104493486e022c0e46a7871b253/parlai/core/metrics.py#L810""" + s = s.lower() + s = re_punc.sub(' ', s) + s = re_art.sub(' ', s) + return s.split() + + +def get_bleu(predictions, references): + """bleu-1/2/3/4 from sacrebleu""" + references = [" " if ref=="" else ref for ref in references] + metrics = {} + bleu = BLEU(lowercase=True, force=False, tokenize=BLEU.TOKENIZER_DEFAULT, smooth_method="exp", smooth_value=None, effective_order=False) + stats = sum_of_lists(bleu._extract_corpus_statistics(predictions, [references])) + for n in range(1,5): + metrics[f"bleu-{n}"] = bleu.compute_bleu( + correct=stats[2: 2 + bleu.max_ngram_order], + total=stats[2 + bleu.max_ngram_order:], + sys_len=int(stats[0]), ref_len=int(stats[1]), + smooth_method=bleu.smooth_method, smooth_value=bleu.smooth_value, + effective_order=bleu.effective_order, + max_ngram_order=n).score + return metrics + + +def get_unigram_f1(predictions, references): + """unigram f1 between prediction and reference, from parlai""" + metrics = {} + metrics["unigram f1"] = [] + metrics["unigram f1 (non-stop words)"] = [] + for prediction, reference in zip(predictions, references): + pred_items = utt2words(prediction) + gold_items = utt2words(reference) + for remove_stopwords in [False, True]: + if remove_stopwords: + pred_items = [w for w in pred_items if w not in stop_words] + gold_items = [w for w in gold_items if w not in stop_words] + common = Counter(pred_items) & Counter(gold_items) + num_same = sum(common.values()) + if num_same == 0: + f1 = 0 + else: + precision = 1.0 * num_same / len(pred_items) + recall = 1.0 * num_same / len(gold_items) + f1 = (2 * precision * recall) / (precision + recall) + if not remove_stopwords: + metrics["unigram f1"].append(f1) + else: + metrics["unigram f1 (non-stop words)"].append(f1) + metrics["unigram f1"] = np.mean(metrics["unigram f1"]) * 100 + metrics["unigram f1 (non-stop words)"] = np.mean(metrics["unigram f1 (non-stop words)"]) * 100 + return metrics + + +def get_rouge(predictions, references): + """rouge-1/2/L from rouge-score""" + rouge_types=["rouge1", "rouge2", "rougeL"] + scorer = rouge_scorer.RougeScorer(rouge_types=rouge_types, use_stemmer=True) + aggregator = scoring.BootstrapAggregator() + + for prediction, reference in zip(predictions, references): + score = scorer.score(reference, prediction) + aggregator.add_scores(score) + + return {key: 100 * (value.mid.fmeasure if key == "rougeL" else value.mid.recall) for key, value in aggregator.aggregate().items()} + + +def get_meteor(predictions, references): + """meteor from nltk""" + alpha=0.9 + beta=3 + gamma=0.5 + if NLTK_VERSION >= version.Version("3.6.5"): + scores = [ + meteor_score.single_meteor_score( + word_tokenize(ref), word_tokenize(pred), alpha=alpha, beta=beta, gamma=gamma + ) + for ref, pred in zip(references, predictions) + ] + else: + scores = [ + meteor_score.single_meteor_score(ref, pred, alpha=alpha, beta=beta, gamma=gamma) + for ref, pred in zip(references, predictions) + ] + return {"meteor": np.mean(scores) * 100} + + +def get_distinct(predictions): + """distinct-1/2 + from parlai https://github.com/facebookresearch/ParlAI/blob/9daae69320c07104493486e022c0e46a7871b253/parlai/core/metrics.py#L781""" + def _ngram(seq, n): + for i in range(len(seq) - n + 1): + yield tuple(seq[i : i + n]) + + metrics = {} + for k in [1, 2]: + inter_cnt = Counter() + for prediction in predictions: + ngram = Counter(_ngram(utt2words(prediction), k)) + inter_cnt += ngram + metrics[f"distinct-{k}"] = max(len(inter_cnt), 1e-12) / max(sum(inter_cnt.values()), 1e-5) * 100 + return metrics + + +def get_nlg_slot_err(predictions, knowledge): + """slot error rate: (missing_count + redundant_count) / all_count for value in dialog acts""" + val2ds_dict = {} + ontology = load_ontology("multiwoz21") + for domain_name in ontology["domains"]: + domain = ontology["domains"][domain_name] + for slot_name in domain["slots"]: + slot = domain["slots"][slot_name] + if "possible_values" not in slot: + continue + possible_vals = slot["possible_values"] + if len(possible_vals) > 0: + for val in possible_vals: + val2ds_dict[val] = f"{domain_name}-{slot_name}" + score_list = [] + for utterance, da in zip(predictions, knowledge): + missing_count = 0 + redundant_count = 0 + all_count = 0 + all_values = set() + ## missing values + # print(da) + # print(utterance) + for key in ['categorical', 'non-categorical']: + for value in da[key]['value']: + if len(value) > 0: + # print(value) + all_values.add(value) + if value.strip().lower() not in utterance.lower(): + missing_count += 1 + # print(f"\tmissing: {value}") + all_count += 1 + if all_count == 0: + continue + ## redundant values + for val in val2ds_dict: + if f" {val.strip().lower()} " in f" {utterance.strip().lower()} " and val.strip().lower() not in all_values: + wlist = val2ds_dict[val].split("-") + domain, slot = wlist[0], wlist[1] + if f" {slot.strip().lower()}" in f" {utterance.strip().lower()} ": + redundant_count += 1 + # print(f"redundant: {val}/{val2ds_dict[val]}") + item_score = float(missing_count + redundant_count) / all_count + # print(f"\tredundant: {redundant_count} | missing_count: {missing_count} |all_count: {all_count}") + # print('-'*100) + score_list.append(item_score) + return {"err": np.mean(score_list) * 100} + + +def load_entities(): + """modified (load from unified ontology) from UnifiedSKG + https://github.com/HKUNLP/UnifiedSKG/blob/49a2ff950bb312b980c22ad72b11520db72ab6a3/metrics/kvret/evaluator.py#L8""" + + ontology = load_ontology("kvret") + all_entities = set() + for domain in ontology["domains"]: + for slot in ontology["domains"][domain]["slots"]: + all_entities |= set(ontology["domains"][domain]["slots"][slot]["possible_values"]) + missed_entities = ["yoga", "tennis", "swimming", "football", " lab ", "doctor", "optometrist", "dentist", "1st", + "2nd", "3rd", "4th", "5th", "6th", "7th", "8th", "9th", "10th", + "11th", "12th", "13th", "14th", "15th", "16th", "17th", "18th", "19th", "20th", "Jill", + "Jack"] + all_entities |= set(missed_entities) + all_entities.remove("HR") + all_entities.add(" HR ") + all_entities = sorted(list(all_entities), key=lambda i: len(i), reverse=True) + return all_entities + + +def check_sub_str(str_list: list, sub_str: str): + """ + It takes a list of strings and a substring as input, and returns True if the substring is found + in any of the strings in the list, and False otherwise + """ + for str_item in str_list: + if sub_str in str_item or sub_str.lower() in str_item.lower(): + return True + return False + + +def extract_entities_from_utterance(utterance, sorted_entities): + """modified (remove underscore) from UnifiedSKG + https://github.com/HKUNLP/UnifiedSKG/blob/49a2ff950bb312b980c22ad72b11520db72ab6a3/metrics/kvret/response_entity_hit.py#L45""" + + utterance = " {} ".format(utterance) # for entity matching + for h in range(0, 13): # for formulating am & pm + utterance = utterance.replace("{} am".format(h), "{}am".format(h)) + utterance = utterance.replace("{} pm".format(h), "{}pm".format(h)) + for entity_item_a in [20, 30, 40, 50, 60, 70, 80, 90, 100]: + for entity_item_b in [20, 30, 40, 50, 60, 70, 80, 90, 100]: + utterance = utterance.replace("{}-{}f".format(str(entity_item_a), str(entity_item_b)), "{}f-{}f".format(str(entity_item_a), str(entity_item_b))) + entities_in_this_utterance = [] + for entity in sorted_entities: + # len(entity) decreases + if (entity in utterance) or (entity.lower() in utterance.lower()): + if not check_sub_str(entities_in_this_utterance, entity): + # in case of "week & weekend", "week & next_week" etc + entities_in_this_utterance.append(entity) + return entities_in_this_utterance + + +def f1_score(y_pred, y_true, average="micro"): + """micro/marco-F1 score, modified from UnifiedSKG + https://github.com/HKUNLP/UnifiedSKG/blob/49a2ff950bb312b980c22ad72b11520db72ab6a3/metrics/kvret/response_entity_hit.py#L76""" + + assert len(y_pred) == len(y_true) + + def _compute_F1(precision, recall): + return 2 * precision * recall / float(precision + recall) if (precision + recall) != 0 else 0 + + def _compute_prf(gold, pred): + TP, FP, FN = 0, 0, 0 + if len(gold) != 0: + count = 1 + for g in gold: + if g in pred: + TP += 1 + else: + FN += 1 + for p in set(pred): + if p not in gold: + FP += 1 + precision = TP / float(TP + FP) if (TP + FP) != 0 else 0 + recall = TP / float(TP + FN) if (TP + FN) != 0 else 0 + F1 = _compute_F1(precision, recall) + else: + precision, recall, F1, count = 0, 0, 0, 0 + return TP, FP, FN, F1, count + + F1_pred, F1_count, TP_all, FP_all, FN_all = 0, 0, 0, 0, 0 + + for y_true_item, y_pred_item in zip(y_true, y_pred): + single_tp, single_fp, single_fn, single_f1, count = _compute_prf(y_true_item, y_pred_item) + F1_pred += single_f1 + F1_count += count + TP_all += single_tp + FP_all += single_fp + FN_all += single_fn + + if average == "macro": + F1_macro_score = F1_pred / float(F1_count) if F1_count != 0 else 0 + return F1_macro_score + elif average == "micro": + P_score = TP_all / float(TP_all + FP_all) if (TP_all + FP_all) != 0 else 0 + R_score = TP_all / float(TP_all + FN_all) if (TP_all + FN_all) != 0 else 0 + F1_micro_score = _compute_F1(P_score, R_score) + return F1_micro_score + else: + raise ValueError("Options other than micro/macro are not supported.") + + +def get_kvret_entity_f1(predictions, references, knowledge): + """entity f1 for kvret, modified from + https://github.com/HKUNLP/UnifiedSKG/blob/49a2ff950bb312b980c22ad72b11520db72ab6a3/metrics/kvret/response_entity_hit.py#L178""" + + global_entities = load_entities() + F1_scores = {} + entities_from_predictions_and_references = { + d: {"predictions_entities": [], "references_entities": []} for d in ["all", "schedule", "weather", "navigate"] + } + for prediction, reference, kb in zip(predictions, references, knowledge): + prediction_entities = extract_entities_from_utterance(utterance=prediction, sorted_entities=global_entities) + reference_entities = extract_entities_from_utterance(utterance=reference, sorted_entities=global_entities) + entities_from_predictions_and_references["all"]["predictions_entities"].append(prediction_entities) + entities_from_predictions_and_references["all"]["references_entities"].append(reference_entities) + domain = "schedule" + for d in kb: + if len(kb[d]["entity"]) > 0: + domain = d + break + entities_from_predictions_and_references[domain]["predictions_entities"].append(prediction_entities) + entities_from_predictions_and_references[domain]["references_entities"].append(reference_entities) + + for category in entities_from_predictions_and_references.keys(): + predictions_entities = entities_from_predictions_and_references[category]["predictions_entities"] + references_entities = entities_from_predictions_and_references[category]["references_entities"] + F1_scores["{} micro entity F1".format(category)] = f1_score(y_pred=predictions_entities, y_true=references_entities, average="micro") + F1_scores["{} macro entity F1".format(category)] = f1_score(y_pred=predictions_entities, y_true=references_entities, average="macro") + + return {**F1_scores} + + +def get_opendialkg_entity_f1(predictions, references, knowledge): + predictions_entities, references_entities = [], [] + for prediction, reference, kg_path in zip(predictions, references, knowledge): + kg_entities = set() + for kg_triple in kg_path: + # add head and tail entities + kg_entities.add(kg_triple[0]) + kg_entities.add(kg_triple[-1]) + kg_entities = sorted(list(kg_entities), key=lambda i: len(i), reverse=True) + + for utterance, entities in zip([prediction, reference], [predictions_entities, references_entities]): + entities_in_this_utterance = [] + for entity in kg_entities: + if (entity in utterance) or (entity.lower() in utterance.lower()): + if not check_sub_str(entities_in_this_utterance, entity): + # in case of "week & weekend", "week & next_week" etc + entities_in_this_utterance.append(entity) + entities.append(entities_in_this_utterance) + + return { + "micro entity f1": f1_score(y_pred=predictions_entities, y_true=references_entities, average="micro"), + "macro entity f1": f1_score(y_pred=predictions_entities, y_true=references_entities, average="macro") + } + +def get_knowledge_sentences_f1(predictions, knowledge): + knowledge_reference = [' '.join(k_sens) for k_sens in knowledge] + f1_score = get_unigram_f1(predictions, knowledge_reference) + return {f"knowledge {k}": v for k, v in f1_score.items()} + @datasets.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) -class NLGMetrics(datasets.Metric): - """Metric to evaluate text-to-text models on the natural language generation task.""" +class GroundedDialogGenerationMetrics(datasets.Metric): + """Metric to evaluate text generation models on the grounded dialog generation task.""" def _info(self): return datasets.MetricInfo( description=_DESCRIPTION, citation=_CITATION, inputs_description=_KWARGS_DESCRIPTION, - # This defines the format of each prediction and reference features=datasets.Features({ - 'predictions': datasets.Value('string'), - 'references': datasets.Value('string'), + "predictions": datasets.Value("string"), + "references": datasets.Value("string"), + "knowledge": deepcopy(FEATURES[self.config_name]["knowledge"]) }) ) - # def _download_and_prepare(self, dl_manager): - # import nltk - # nltk.download("wordnet") - # if NLTK_VERSION >= version.Version("3.6.5"): - # nltk.download("punkt") - # if NLTK_VERSION >= version.Version("3.6.6"): - # nltk.download("omw-1.4") - - def _compute(self, predictions, references): + def compute(self, predictions, references, knowledge=None): """Returns the scores: bleu""" metrics = {} + # bleu - bleu = BLEU(lowercase=True, force=False, tokenize=BLEU.TOKENIZER_DEFAULT, smooth_method='exp', smooth_value=None, effective_order=False) - stats = sum_of_lists(bleu._extract_corpus_statistics(predictions, [references])) - for n in range(1,5): - metrics[f'bleu-{n}'] = bleu.compute_bleu( - correct=stats[2: 2 + bleu.max_ngram_order], - total=stats[2 + bleu.max_ngram_order:], - sys_len=int(stats[0]), ref_len=int(stats[1]), - smooth_method=bleu.smooth_method, smooth_value=bleu.smooth_value, - effective_order=bleu.effective_order, - max_ngram_order=n).score + metrics.update(get_bleu(predictions, references)) # unigram f1 - re_art = re.compile(r'\b(a|an|the)\b') - re_punc = re.compile(r'[!"#$%&()*+,-./:;<=>?@\[\]\\^`{|}~_\']') - stop_words = set(stopwords.words('english')) - def utt2words(s): - """Lower text and remove punctuation, articles and extra whitespace.""" - s = s.lower() - s = re_punc.sub(' ', s) - s = re_art.sub(' ', s) - return s.split() - - metrics['unigram f1'] = [] - metrics['unigram f1 (non-stop words)'] = [] - for prediction, reference in zip(predictions, references): - pred_items = utt2words(prediction) - gold_items = utt2words(reference) - for remove_stopwords in [False, True]: - if remove_stopwords: - pred_items = [w for w in pred_items if w not in stop_words] - gold_items = [w for w in gold_items if w not in stop_words] - common = Counter(pred_items) & Counter(gold_items) - num_same = sum(common.values()) - if num_same == 0: - f1 = 0 - else: - precision = 1.0 * num_same / len(pred_items) - recall = 1.0 * num_same / len(gold_items) - f1 = (2 * precision * recall) / (precision + recall) - if not remove_stopwords: - metrics['unigram f1'].append(f1) - else: - metrics['unigram f1 (non-stop words)'].append(f1) - metrics['unigram f1'] = np.mean(metrics['unigram f1']) - metrics['unigram f1 (non-stop words)'] = np.mean(metrics['unigram f1 (non-stop words)']) - + metrics.update(get_unigram_f1(predictions, references)) + # rouge-1/2/L-fmeasure - rouge_types=["rouge1", "rouge2", "rougeL"] - scorer = rouge_scorer.RougeScorer(rouge_types=rouge_types, use_stemmer=True) - aggregator = scoring.BootstrapAggregator() - - for prediction, reference in zip(predictions, references): - score = scorer.score(reference, prediction) - aggregator.add_scores(score) - - result = aggregator.aggregate() - metrics.update({key: value.mid.fmeasure * 100 for key, value in result.items()}) + metrics.update(get_rouge(predictions, references)) # meteor - alpha=0.9 - beta=3 - gamma=0.5 - if NLTK_VERSION >= version.Version("3.6.5"): - scores = [ - meteor_score.single_meteor_score( - word_tokenize(ref), word_tokenize(pred), alpha=alpha, beta=beta, gamma=gamma - ) - for ref, pred in zip(references, predictions) - ] - else: - scores = [ - meteor_score.single_meteor_score(ref, pred, alpha=alpha, beta=beta, gamma=gamma) - for ref, pred in zip(references, predictions) - ] - metrics.update({"meteor": np.mean(scores)}) - - # inter/intra-distinct-1/2 - def _ngram(seq, n): - for i in range(len(seq) - n + 1): - yield tuple(seq[i : i + n]) + metrics.update(get_meteor(predictions, references)) + + # inter-distinct-1/2 + metrics.update(get_distinct(predictions)) - for k in [1, 2]: - inter_cnt = Counter() - for prediction in predictions: - ngram = Counter(_ngram(utt2words(prediction), k)) - inter_cnt += ngram - metrics[f'distinct-{k}'] = max(len(inter_cnt), 1e-12) / max(sum(inter_cnt.values()), 1e-5) + if knowledge is not None: + if self.config_name == "nlg": + metrics.update(get_nlg_slot_err(predictions, knowledge)) + elif self.config_name == "kvret": + metrics.update(get_kvret_entity_f1(predictions, references, knowledge)) + elif self.config_name == "opendialkg": + metrics.update(get_opendialkg_entity_f1(predictions, references, knowledge)) + elif self.config_name in ["wow", "personachat"]: + metrics.update(get_knowledge_sentences_f1(predictions, knowledge)) return metrics diff --git a/convlab/base_models/t5/key2gen/run.sh b/convlab/base_models/t5/key2gen/run.sh deleted file mode 100644 index 74e44434ea1ab9e0099e680f2862521f7063049b..0000000000000000000000000000000000000000 --- a/convlab/base_models/t5/key2gen/run.sh +++ /dev/null @@ -1,7 +0,0 @@ -for ratio in 0.1 0.01 -do - for dial_ids_order in 0 1 2 - do - bash run_persona_fewshot_key2gen.sh ${ratio} ${dial_ids_order} - done -done \ No newline at end of file diff --git a/convlab/base_models/t5/key2gen/run2.sh b/convlab/base_models/t5/key2gen/run2.sh deleted file mode 100644 index 7e7bf5d6a75e72d6c236fbf738a8c7d010f0e18b..0000000000000000000000000000000000000000 --- a/convlab/base_models/t5/key2gen/run2.sh +++ /dev/null @@ -1,7 +0,0 @@ -for ratio in 0.1 0.01 -do - for dial_ids_order in 0 1 2 - do - bash run_wow_fewshot_key2gen.sh ${ratio} ${dial_ids_order} - done -done \ No newline at end of file diff --git a/convlab/base_models/t5/key2gen/run_commongen.sh b/convlab/base_models/t5/key2gen/run_commongen.sh deleted file mode 100644 index 5952725709afa1848695158b76f04aa81ff328a6..0000000000000000000000000000000000000000 --- a/convlab/base_models/t5/key2gen/run_commongen.sh +++ /dev/null @@ -1,85 +0,0 @@ -n_gpus=1 -task_name="commongen" -dataset_name="commongen" -speaker="system" -context_window_size=0 -data_dir="data/${task_name}/${dataset_name}" -output_dir="output/${task_name}/${dataset_name}" -cache_dir="../cache" -logging_dir="${output_dir}/runs" -train_file="${data_dir}/train.json" -validation_file="${data_dir}/validation.json" -test_file="${data_dir}/validation.json" -metric_name_or_path="../nlg/nlg_metric.py" -metric_for_best_model="bleu" -source_column="concepts" -target_column="text" -truncation_side="left" -max_source_length=512 -max_target_length=512 -model_name_or_path="t5-small" -per_device_train_batch_size=128 -per_device_eval_batch_size=64 -gradient_accumulation_steps=4 -lr=1e-3 -num_train_epochs=10 - -python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -s ${speaker} -c ${context_window_size} - -python ../run_seq2seq.py \ - --task_name ${task_name} \ - --train_file ${train_file} \ - --validation_file ${validation_file} \ - --source_column ${source_column} \ - --target_column ${target_column} \ - --max_source_length ${max_source_length} \ - --max_target_length ${max_target_length} \ - --truncation_side ${truncation_side} \ - --model_name_or_path ${model_name_or_path} \ - --do_train \ - --do_eval \ - --save_strategy epoch \ - --evaluation_strategy epoch \ - --save_total_limit 3 \ - --prediction_loss_only \ - --cache_dir ${cache_dir} \ - --output_dir ${output_dir} \ - --logging_dir ${logging_dir} \ - --overwrite_output_dir \ - --preprocessing_num_workers 4 \ - --per_device_train_batch_size ${per_device_train_batch_size} \ - --per_device_eval_batch_size ${per_device_eval_batch_size} \ - --gradient_accumulation_steps ${gradient_accumulation_steps} \ - --learning_rate ${lr} \ - --num_train_epochs ${num_train_epochs} \ - --adafactor \ - --gradient_checkpointing - -python ../run_seq2seq.py \ - --task_name ${task_name} \ - --test_file ${test_file} \ - --source_column ${source_column} \ - --target_column ${target_column} \ - --max_source_length ${max_source_length} \ - --max_target_length ${max_target_length} \ - --truncation_side ${truncation_side} \ - --model_name_or_path ${output_dir} \ - --do_predict \ - --predict_with_generate \ - --metric_name_or_path ${metric_name_or_path} \ - --cache_dir ${cache_dir} \ - --output_dir ${output_dir} \ - --logging_dir ${logging_dir} \ - --overwrite_output_dir \ - --preprocessing_num_workers 4 \ - --per_device_train_batch_size ${per_device_train_batch_size} \ - --per_device_eval_batch_size ${per_device_eval_batch_size} \ - --gradient_accumulation_steps ${gradient_accumulation_steps} \ - --learning_rate ${lr} \ - --num_train_epochs ${num_train_epochs} \ - --adafactor \ - --gradient_checkpointing - -python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json - -python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name} diff --git a/convlab/base_models/t5/key2gen/run_commongen_key2gen.sh b/convlab/base_models/t5/key2gen/run_commongen_key2gen.sh deleted file mode 100644 index e09d3b464e2685dd8e0d774539f8c8dc4808f99c..0000000000000000000000000000000000000000 --- a/convlab/base_models/t5/key2gen/run_commongen_key2gen.sh +++ /dev/null @@ -1,88 +0,0 @@ -n_gpus=1 -task_name="commongen" -dataset_name="commongen" -speaker="system" -context_window_size=0 -data_dir="data/${task_name}/key2gen_${dataset_name}" -output_dir="output/${task_name}/key2gen_${dataset_name}" -cache_dir="../cache" -logging_dir="${output_dir}/runs" -train_file="${data_dir}/train.json" -validation_file="${data_dir}/validation.json" -test_file="${data_dir}/validation.json" -metric_name_or_path="../nlg/nlg_metric.py" -metric_for_best_model="bleu" -source_column="concepts" -target_column="text" -source_prefix="keywords: " -truncation_side="left" -max_source_length=512 -max_target_length=512 -model_name_or_path="../../gpt/keyword_extraction/output/key2gen_shuffle_noisy/gpt/dailydialog+metalwoz+sgd+tm1+tm2+tm3" -per_device_train_batch_size=128 -per_device_eval_batch_size=64 -gradient_accumulation_steps=4 -lr=1e-3 -num_train_epochs=10 - -python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -s ${speaker} -c ${context_window_size} --key2gen - -python ../run_seq2seq.py \ - --task_name ${task_name} \ - --train_file ${train_file} \ - --validation_file ${validation_file} \ - --source_column ${source_column} \ - --target_column ${target_column} \ - --source_prefix ${source_prefix} \ - --max_source_length ${max_source_length} \ - --max_target_length ${max_target_length} \ - --truncation_side ${truncation_side} \ - --model_name_or_path ${model_name_or_path} \ - --do_train \ - --do_eval \ - --save_strategy epoch \ - --evaluation_strategy epoch \ - --save_total_limit 3 \ - --prediction_loss_only \ - --cache_dir ${cache_dir} \ - --output_dir ${output_dir} \ - --logging_dir ${logging_dir} \ - --overwrite_output_dir \ - --preprocessing_num_workers 4 \ - --per_device_train_batch_size ${per_device_train_batch_size} \ - --per_device_eval_batch_size ${per_device_eval_batch_size} \ - --gradient_accumulation_steps ${gradient_accumulation_steps} \ - --learning_rate ${lr} \ - --num_train_epochs ${num_train_epochs} \ - --adafactor \ - --gradient_checkpointing - -python ../run_seq2seq.py \ - --task_name ${task_name} \ - --test_file ${test_file} \ - --source_column ${source_column} \ - --target_column ${target_column} \ - --source_prefix ${source_prefix} \ - --max_source_length ${max_source_length} \ - --max_target_length ${max_target_length} \ - --truncation_side ${truncation_side} \ - --model_name_or_path ${output_dir} \ - --do_predict \ - --predict_with_generate \ - --metric_name_or_path ${metric_name_or_path} \ - --cache_dir ${cache_dir} \ - --output_dir ${output_dir} \ - --logging_dir ${logging_dir} \ - --overwrite_output_dir \ - --preprocessing_num_workers 4 \ - --per_device_train_batch_size ${per_device_train_batch_size} \ - --per_device_eval_batch_size ${per_device_eval_batch_size} \ - --gradient_accumulation_steps ${gradient_accumulation_steps} \ - --learning_rate ${lr} \ - --num_train_epochs ${num_train_epochs} \ - --adafactor \ - --gradient_checkpointing - -python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json - -python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name} diff --git a/convlab/base_models/t5/key2gen/run_dart.sh b/convlab/base_models/t5/key2gen/run_dart.sh deleted file mode 100644 index 62eb3236f0c468c6b185eb9c5c39e807e3a92800..0000000000000000000000000000000000000000 --- a/convlab/base_models/t5/key2gen/run_dart.sh +++ /dev/null @@ -1,85 +0,0 @@ -n_gpus=1 -task_name="dart" -dataset_name="dart" -speaker="system" -context_window_size=0 -data_dir="data/${task_name}/key2gen_${dataset_name}" -output_dir="output/${task_name}/${dataset_name}" -cache_dir="../cache" -logging_dir="${output_dir}/runs" -train_file="${data_dir}/train.json" -validation_file="${data_dir}/validation.json" -test_file="${data_dir}/test.json" -metric_name_or_path="../nlg/nlg_metric.py" -metric_for_best_model="bleu" -source_column="triples" -target_column="text" -truncation_side="left" -max_source_length=512 -max_target_length=512 -model_name_or_path="t5-small" -per_device_train_batch_size=128 -per_device_eval_batch_size=64 -gradient_accumulation_steps=4 -lr=1e-3 -num_train_epochs=10 - -# python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -s ${speaker} -c ${context_window_size} - -# python ../run_seq2seq.py \ -# --task_name ${task_name} \ -# --train_file ${train_file} \ -# --validation_file ${validation_file} \ -# --source_column ${source_column} \ -# --target_column ${target_column} \ -# --max_source_length ${max_source_length} \ -# --max_target_length ${max_target_length} \ -# --truncation_side ${truncation_side} \ -# --model_name_or_path ${model_name_or_path} \ -# --do_train \ -# --do_eval \ -# --save_strategy epoch \ -# --evaluation_strategy epoch \ -# --save_total_limit 3 \ -# --prediction_loss_only \ -# --cache_dir ${cache_dir} \ -# --output_dir ${output_dir} \ -# --logging_dir ${logging_dir} \ -# --overwrite_output_dir \ -# --preprocessing_num_workers 4 \ -# --per_device_train_batch_size ${per_device_train_batch_size} \ -# --per_device_eval_batch_size ${per_device_eval_batch_size} \ -# --gradient_accumulation_steps ${gradient_accumulation_steps} \ -# --learning_rate ${lr} \ -# --num_train_epochs ${num_train_epochs} \ -# --adafactor \ -# --gradient_checkpointing - -python ../run_seq2seq.py \ - --task_name ${task_name} \ - --test_file ${test_file} \ - --source_column ${source_column} \ - --target_column ${target_column} \ - --max_source_length ${max_source_length} \ - --max_target_length ${max_target_length} \ - --truncation_side ${truncation_side} \ - --model_name_or_path ${model_name_or_path} \ - --do_predict \ - --predict_with_generate \ - --metric_name_or_path ${metric_name_or_path} \ - --cache_dir ${cache_dir} \ - --output_dir ${output_dir} \ - --logging_dir ${logging_dir} \ - --overwrite_output_dir \ - --preprocessing_num_workers 4 \ - --per_device_train_batch_size ${per_device_train_batch_size} \ - --per_device_eval_batch_size ${per_device_eval_batch_size} \ - --gradient_accumulation_steps ${gradient_accumulation_steps} \ - --learning_rate ${lr} \ - --num_train_epochs ${num_train_epochs} \ - --adafactor \ - --gradient_checkpointing - -# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json - -# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name} diff --git a/convlab/base_models/t5/key2gen/run_dart_key2gen.sh b/convlab/base_models/t5/key2gen/run_dart_key2gen.sh deleted file mode 100644 index b729da6fdf27ed61dad7a2a48120fbbbaef49afc..0000000000000000000000000000000000000000 --- a/convlab/base_models/t5/key2gen/run_dart_key2gen.sh +++ /dev/null @@ -1,88 +0,0 @@ -n_gpus=1 -task_name="dart" -dataset_name="dart" -speaker="system" -context_window_size=0 -data_dir="data/${task_name}/${dataset_name}" -output_dir="output/${task_name}/key2gen_${dataset_name}" -cache_dir="../cache" -logging_dir="${output_dir}/runs" -train_file="${data_dir}/train.json" -validation_file="${data_dir}/validation.json" -test_file="${data_dir}/test.json" -metric_name_or_path="../nlg/nlg_metric.py" -metric_for_best_model="bleu" -source_column="triples" -target_column="text" -source_prefix="possible keywords: " -truncation_side="left" -max_source_length=512 -max_target_length=512 -model_name_or_path="../../gpt/keyword_extraction/output/key2gen_shuffle_noisy/gpt/dailydialog+metalwoz+sgd+tm1+tm2+tm3" -per_device_train_batch_size=128 -per_device_eval_batch_size=64 -gradient_accumulation_steps=4 -lr=1e-3 -num_train_epochs=10 - -# python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -s ${speaker} -c ${context_window_size} --key2gen - -# python ../run_seq2seq.py \ -# --task_name ${task_name} \ -# --train_file ${train_file} \ -# --validation_file ${validation_file} \ -# --source_column ${source_column} \ -# --target_column ${target_column} \ -# --source_prefix "${source_prefix}" \ -# --max_source_length ${max_source_length} \ -# --max_target_length ${max_target_length} \ -# --truncation_side ${truncation_side} \ -# --model_name_or_path ${model_name_or_path} \ -# --do_train \ -# --do_eval \ -# --save_strategy epoch \ -# --evaluation_strategy epoch \ -# --save_total_limit 3 \ -# --prediction_loss_only \ -# --cache_dir ${cache_dir} \ -# --output_dir ${output_dir} \ -# --logging_dir ${logging_dir} \ -# --overwrite_output_dir \ -# --preprocessing_num_workers 4 \ -# --per_device_train_batch_size ${per_device_train_batch_size} \ -# --per_device_eval_batch_size ${per_device_eval_batch_size} \ -# --gradient_accumulation_steps ${gradient_accumulation_steps} \ -# --learning_rate ${lr} \ -# --num_train_epochs ${num_train_epochs} \ -# --adafactor \ -# --gradient_checkpointing - -python ../run_seq2seq.py \ - --task_name ${task_name} \ - --test_file ${test_file} \ - --source_column ${source_column} \ - --target_column ${target_column} \ - --source_prefix "${source_prefix}" \ - --max_source_length ${max_source_length} \ - --max_target_length ${max_target_length} \ - --truncation_side ${truncation_side} \ - --model_name_or_path ${model_name_or_path} \ - --do_predict \ - --predict_with_generate \ - --metric_name_or_path ${metric_name_or_path} \ - --cache_dir ${cache_dir} \ - --output_dir ${output_dir} \ - --logging_dir ${logging_dir} \ - --overwrite_output_dir \ - --preprocessing_num_workers 4 \ - --per_device_train_batch_size ${per_device_train_batch_size} \ - --per_device_eval_batch_size ${per_device_eval_batch_size} \ - --gradient_accumulation_steps ${gradient_accumulation_steps} \ - --learning_rate ${lr} \ - --num_train_epochs ${num_train_epochs} \ - --adafactor \ - --gradient_checkpointing - -# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json - -# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name} diff --git a/convlab/base_models/t5/key2gen/run_kvret.sh b/convlab/base_models/t5/key2gen/run_kvret.sh deleted file mode 100644 index b960ed18735d804f82116b4307223b4245f2b368..0000000000000000000000000000000000000000 --- a/convlab/base_models/t5/key2gen/run_kvret.sh +++ /dev/null @@ -1,85 +0,0 @@ -n_gpus=1 -task_name="kvret" -dataset_name="kvret" -speaker="system" -data_dir="data/${task_name}/${dataset_name}" -output_dir="output/${task_name}/${dataset_name}" -cache_dir="../cache" -logging_dir="${output_dir}/runs" -train_file="${data_dir}/train.json" -validation_file="${data_dir}/validation.json" -test_file="${data_dir}/test.json" -metric_name_or_path="../nlg/nlg_metric.py" -metric_for_best_model="bleu" -source_column="context+db" -target_column="response" -truncation_side="left" -max_source_length=1024 -max_target_length=512 -model_name_or_path="t5-small" -per_device_train_batch_size=32 -per_device_eval_batch_size=64 -gradient_accumulation_steps=4 -lr=1e-3 -num_train_epochs=10 - -python create_data_key2gen.py -t ${task_name} -d ${dataset_name} - -python ../run_seq2seq.py \ - --task_name ${task_name} \ - --train_file ${train_file} \ - --validation_file ${validation_file} \ - --source_column ${source_column} \ - --target_column ${target_column} \ - --max_source_length ${max_source_length} \ - --max_target_length ${max_target_length} \ - --truncation_side ${truncation_side} \ - --model_name_or_path ${model_name_or_path} \ - --do_train \ - --do_eval \ - --save_strategy epoch \ - --evaluation_strategy epoch \ - --save_total_limit 1 \ - --prediction_loss_only \ - --load_best_model_at_end \ - --cache_dir ${cache_dir} \ - --output_dir ${output_dir} \ - --logging_dir ${logging_dir} \ - --overwrite_output_dir \ - --preprocessing_num_workers 4 \ - --per_device_train_batch_size ${per_device_train_batch_size} \ - --per_device_eval_batch_size ${per_device_eval_batch_size} \ - --gradient_accumulation_steps ${gradient_accumulation_steps} \ - --learning_rate ${lr} \ - --num_train_epochs ${num_train_epochs} \ - --adafactor \ - --gradient_checkpointing - -python ../run_seq2seq.py \ - --task_name ${task_name} \ - --test_file ${test_file} \ - --source_column ${source_column} \ - --target_column ${target_column} \ - --max_source_length ${max_source_length} \ - --max_target_length ${max_target_length} \ - --truncation_side ${truncation_side} \ - --model_name_or_path ${output_dir} \ - --do_predict \ - --predict_with_generate \ - --metric_name_or_path ${metric_name_or_path} \ - --cache_dir ${cache_dir} \ - --output_dir ${output_dir} \ - --logging_dir ${logging_dir} \ - --overwrite_output_dir \ - --preprocessing_num_workers 4 \ - --per_device_train_batch_size ${per_device_train_batch_size} \ - --per_device_eval_batch_size ${per_device_eval_batch_size} \ - --gradient_accumulation_steps ${gradient_accumulation_steps} \ - --learning_rate ${lr} \ - --num_train_epochs ${num_train_epochs} \ - --adafactor \ - --gradient_checkpointing - -# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json - -# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name} diff --git a/convlab/base_models/t5/key2gen/run_kvret_fewshot.sh b/convlab/base_models/t5/key2gen/run_kvret_fewshot.sh deleted file mode 100644 index a6523d6cea506b6433f2cd853ca6255a28cfd8a6..0000000000000000000000000000000000000000 --- a/convlab/base_models/t5/key2gen/run_kvret_fewshot.sh +++ /dev/null @@ -1,87 +0,0 @@ -n_gpus=1 -task_name="kvret" -dataset_name="kvret" -speaker="system" -ratio=$1 -dial_ids_order=$2 -data_dir="data/${task_name}/${dataset_name}_${ratio}_order${dial_ids_order}" -output_dir="output/${task_name}/${dataset_name}_${ratio}_order${dial_ids_order}" -cache_dir="../cache" -logging_dir="${output_dir}/runs" -train_file="${data_dir}/train.json" -validation_file="${data_dir}/validation.json" -test_file="${data_dir}/test.json" -metric_name_or_path="../nlg/nlg_metric.py" -metric_for_best_model="bleu" -source_column="context+db" -target_column="response" -truncation_side="left" -max_source_length=1024 -max_target_length=512 -model_name_or_path="t5-small" -per_device_train_batch_size=32 -per_device_eval_batch_size=64 -gradient_accumulation_steps=4 -lr=1e-3 -num_train_epochs=100 - -python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -r ${ratio} -o ${dial_ids_order} - -python ../run_seq2seq.py \ - --task_name ${task_name} \ - --train_file ${train_file} \ - --validation_file ${validation_file} \ - --source_column ${source_column} \ - --target_column ${target_column} \ - --max_source_length ${max_source_length} \ - --max_target_length ${max_target_length} \ - --truncation_side ${truncation_side} \ - --model_name_or_path ${model_name_or_path} \ - --do_train \ - --do_eval \ - --save_strategy epoch \ - --evaluation_strategy epoch \ - --save_total_limit 1 \ - --prediction_loss_only \ - --load_best_model_at_end \ - --cache_dir ${cache_dir} \ - --output_dir ${output_dir} \ - --logging_dir ${logging_dir} \ - --overwrite_output_dir \ - --preprocessing_num_workers 4 \ - --per_device_train_batch_size ${per_device_train_batch_size} \ - --per_device_eval_batch_size ${per_device_eval_batch_size} \ - --gradient_accumulation_steps ${gradient_accumulation_steps} \ - --learning_rate ${lr} \ - --num_train_epochs ${num_train_epochs} \ - --adafactor \ - --gradient_checkpointing - -python ../run_seq2seq.py \ - --task_name ${task_name} \ - --test_file ${test_file} \ - --source_column ${source_column} \ - --target_column ${target_column} \ - --max_source_length ${max_source_length} \ - --max_target_length ${max_target_length} \ - --truncation_side ${truncation_side} \ - --model_name_or_path ${output_dir} \ - --do_predict \ - --predict_with_generate \ - --metric_name_or_path ${metric_name_or_path} \ - --cache_dir ${cache_dir} \ - --output_dir ${output_dir} \ - --logging_dir ${logging_dir} \ - --overwrite_output_dir \ - --preprocessing_num_workers 4 \ - --per_device_train_batch_size ${per_device_train_batch_size} \ - --per_device_eval_batch_size ${per_device_eval_batch_size} \ - --gradient_accumulation_steps ${gradient_accumulation_steps} \ - --learning_rate ${lr} \ - --num_train_epochs ${num_train_epochs} \ - --adafactor \ - --gradient_checkpointing - -# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json - -# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name} diff --git a/convlab/base_models/t5/key2gen/run_kvret_fewshot_key2gen.sh b/convlab/base_models/t5/key2gen/run_kvret_fewshot_key2gen.sh deleted file mode 100644 index d19ae9c1ca3890896491839c76e3d7beff9bdbfc..0000000000000000000000000000000000000000 --- a/convlab/base_models/t5/key2gen/run_kvret_fewshot_key2gen.sh +++ /dev/null @@ -1,87 +0,0 @@ -n_gpus=1 -task_name="kvret" -dataset_name="kvret" -speaker="system" -ratio=$1 -dial_ids_order=$2 -data_dir="data/${task_name}/key2gen_${dataset_name}_${ratio}_order${dial_ids_order}" -output_dir="output/${task_name}/key2gen_${dataset_name}_${ratio}_order${dial_ids_order}" -cache_dir="../cache" -logging_dir="${output_dir}/runs" -train_file="${data_dir}/train.json" -validation_file="${data_dir}/validation.json" -test_file="${data_dir}/test.json" -metric_name_or_path="../nlg/nlg_metric.py" -metric_for_best_model="bleu" -source_column="context+db" -target_column="response" -truncation_side="left" -max_source_length=1024 -max_target_length=512 -model_name_or_path="../../gpt/keyword_extraction/output/key2gen_noisy/gpt/dailydialog+metalwoz+tm1+tm2+tm3" -per_device_train_batch_size=32 -per_device_eval_batch_size=64 -gradient_accumulation_steps=4 -lr=1e-3 -num_train_epochs=100 - -python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -r ${ratio} -o ${dial_ids_order} --key2gen - -python ../run_seq2seq.py \ - --task_name ${task_name} \ - --train_file ${train_file} \ - --validation_file ${validation_file} \ - --source_column ${source_column} \ - --target_column ${target_column} \ - --max_source_length ${max_source_length} \ - --max_target_length ${max_target_length} \ - --truncation_side ${truncation_side} \ - --model_name_or_path ${model_name_or_path} \ - --do_train \ - --do_eval \ - --save_strategy epoch \ - --evaluation_strategy epoch \ - --save_total_limit 1 \ - --prediction_loss_only \ - --load_best_model_at_end \ - --cache_dir ${cache_dir} \ - --output_dir ${output_dir} \ - --logging_dir ${logging_dir} \ - --overwrite_output_dir \ - --preprocessing_num_workers 4 \ - --per_device_train_batch_size ${per_device_train_batch_size} \ - --per_device_eval_batch_size ${per_device_eval_batch_size} \ - --gradient_accumulation_steps ${gradient_accumulation_steps} \ - --learning_rate ${lr} \ - --num_train_epochs ${num_train_epochs} \ - --adafactor \ - --gradient_checkpointing - -python ../run_seq2seq.py \ - --task_name ${task_name} \ - --test_file ${test_file} \ - --source_column ${source_column} \ - --target_column ${target_column} \ - --max_source_length ${max_source_length} \ - --max_target_length ${max_target_length} \ - --truncation_side ${truncation_side} \ - --model_name_or_path ${output_dir} \ - --do_predict \ - --predict_with_generate \ - --metric_name_or_path ${metric_name_or_path} \ - --cache_dir ${cache_dir} \ - --output_dir ${output_dir} \ - --logging_dir ${logging_dir} \ - --overwrite_output_dir \ - --preprocessing_num_workers 4 \ - --per_device_train_batch_size ${per_device_train_batch_size} \ - --per_device_eval_batch_size ${per_device_eval_batch_size} \ - --gradient_accumulation_steps ${gradient_accumulation_steps} \ - --learning_rate ${lr} \ - --num_train_epochs ${num_train_epochs} \ - --adafactor \ - --gradient_checkpointing - -# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json - -# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name} diff --git a/convlab/base_models/t5/key2gen/run_kvret_key2gen.sh b/convlab/base_models/t5/key2gen/run_kvret_key2gen.sh deleted file mode 100644 index 44f8dbc6732f8ccd3470b8d5fd62a886898b949c..0000000000000000000000000000000000000000 --- a/convlab/base_models/t5/key2gen/run_kvret_key2gen.sh +++ /dev/null @@ -1,85 +0,0 @@ -n_gpus=1 -task_name="kvret" -dataset_name="kvret" -speaker="system" -data_dir="data/${task_name}/${dataset_name}" -output_dir="output/${task_name}/key2gen_${dataset_name}" -cache_dir="../cache" -logging_dir="${output_dir}/runs" -train_file="${data_dir}/train.json" -validation_file="${data_dir}/validation.json" -test_file="${data_dir}/test.json" -metric_name_or_path="../nlg/nlg_metric.py" -metric_for_best_model="bleu" -source_column="context+db" -target_column="response" -truncation_side="left" -max_source_length=1024 -max_target_length=512 -model_name_or_path="../../gpt/keyword_extraction/output/key2gen_noisy/gpt/dailydialog+metalwoz+tm1+tm2+tm3" -per_device_train_batch_size=32 -per_device_eval_batch_size=64 -gradient_accumulation_steps=4 -lr=1e-3 -num_train_epochs=10 - -python create_data_key2gen.py -t ${task_name} -d ${dataset_name} - -python ../run_seq2seq.py \ - --task_name ${task_name} \ - --train_file ${train_file} \ - --validation_file ${validation_file} \ - --source_column ${source_column} \ - --target_column ${target_column} \ - --max_source_length ${max_source_length} \ - --max_target_length ${max_target_length} \ - --truncation_side ${truncation_side} \ - --model_name_or_path ${model_name_or_path} \ - --do_train \ - --do_eval \ - --save_strategy epoch \ - --evaluation_strategy epoch \ - --save_total_limit 1 \ - --prediction_loss_only \ - --load_best_model_at_end \ - --cache_dir ${cache_dir} \ - --output_dir ${output_dir} \ - --logging_dir ${logging_dir} \ - --overwrite_output_dir \ - --preprocessing_num_workers 4 \ - --per_device_train_batch_size ${per_device_train_batch_size} \ - --per_device_eval_batch_size ${per_device_eval_batch_size} \ - --gradient_accumulation_steps ${gradient_accumulation_steps} \ - --learning_rate ${lr} \ - --num_train_epochs ${num_train_epochs} \ - --adafactor \ - --gradient_checkpointing - -python ../run_seq2seq.py \ - --task_name ${task_name} \ - --test_file ${test_file} \ - --source_column ${source_column} \ - --target_column ${target_column} \ - --max_source_length ${max_source_length} \ - --max_target_length ${max_target_length} \ - --truncation_side ${truncation_side} \ - --model_name_or_path ${output_dir} \ - --do_predict \ - --predict_with_generate \ - --metric_name_or_path ${metric_name_or_path} \ - --cache_dir ${cache_dir} \ - --output_dir ${output_dir} \ - --logging_dir ${logging_dir} \ - --overwrite_output_dir \ - --preprocessing_num_workers 4 \ - --per_device_train_batch_size ${per_device_train_batch_size} \ - --per_device_eval_batch_size ${per_device_eval_batch_size} \ - --gradient_accumulation_steps ${gradient_accumulation_steps} \ - --learning_rate ${lr} \ - --num_train_epochs ${num_train_epochs} \ - --adafactor \ - --gradient_checkpointing - -# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json - -# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name} diff --git a/convlab/base_models/t5/key2gen/run_nlg.sh b/convlab/base_models/t5/key2gen/run_nlg.sh deleted file mode 100644 index ec225ef719f25e970bd00ca662491fe76fd6b8af..0000000000000000000000000000000000000000 --- a/convlab/base_models/t5/key2gen/run_nlg.sh +++ /dev/null @@ -1,86 +0,0 @@ -n_gpus=1 -task_name="nlg" -dataset_name=$1 -speaker="system" -context_window_size=$2 -data_dir="data/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}" -output_dir="output/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}" -cache_dir="../cache" -logging_dir="${output_dir}/runs" -train_file="${data_dir}/train.json" -validation_file="${data_dir}/validation.json" -test_file="${data_dir}/test.json" -metric_name_or_path="../nlg/nlg_metric.py" -metric_for_best_model="bleu" -source_column="context+da" -target_column="response" -truncation_side="left" -max_source_length=512 -max_target_length=512 -model_name_or_path="t5-small" -per_device_train_batch_size=128 -per_device_eval_batch_size=64 -gradient_accumulation_steps=4 -lr=1e-3 -num_train_epochs=10 - -# python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -s ${speaker} -c ${context_window_size} - -# python ../run_seq2seq.py \ -# --task_name ${task_name} \ -# --train_file ${train_file} \ -# --validation_file ${validation_file} \ -# --source_column ${source_column} \ -# --target_column ${target_column} \ -# --max_source_length ${max_source_length} \ -# --max_target_length ${max_target_length} \ -# --truncation_side ${truncation_side} \ -# --model_name_or_path ${model_name_or_path} \ -# --do_train \ -# --do_eval \ -# --save_strategy epoch \ -# --evaluation_strategy epoch \ -# --save_total_limit 1 \ -# --prediction_loss_only \ -# --load_best_model_at_end \ -# --cache_dir ${cache_dir} \ -# --output_dir ${output_dir} \ -# --logging_dir ${logging_dir} \ -# --overwrite_output_dir \ -# --preprocessing_num_workers 4 \ -# --per_device_train_batch_size ${per_device_train_batch_size} \ -# --per_device_eval_batch_size ${per_device_eval_batch_size} \ -# --gradient_accumulation_steps ${gradient_accumulation_steps} \ -# --learning_rate ${lr} \ -# --num_train_epochs ${num_train_epochs} \ -# --adafactor \ -# --gradient_checkpointing - -# python ../run_seq2seq.py \ -# --task_name ${task_name} \ -# --test_file ${test_file} \ -# --source_column ${source_column} \ -# --target_column ${target_column} \ -# --max_source_length ${max_source_length} \ -# --max_target_length ${max_target_length} \ -# --truncation_side ${truncation_side} \ -# --model_name_or_path ${output_dir} \ -# --do_predict \ -# --predict_with_generate \ -# --metric_name_or_path ${metric_name_or_path} \ -# --cache_dir ${cache_dir} \ -# --output_dir ${output_dir} \ -# --logging_dir ${logging_dir} \ -# --overwrite_output_dir \ -# --preprocessing_num_workers 4 \ -# --per_device_train_batch_size ${per_device_train_batch_size} \ -# --per_device_eval_batch_size ${per_device_eval_batch_size} \ -# --gradient_accumulation_steps ${gradient_accumulation_steps} \ -# --learning_rate ${lr} \ -# --num_train_epochs ${num_train_epochs} \ -# --adafactor \ -# --gradient_checkpointing - -python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json - -python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name} diff --git a/convlab/base_models/t5/key2gen/run_nlg_fewshot.sh b/convlab/base_models/t5/key2gen/run_nlg_fewshot.sh deleted file mode 100644 index f8ab2ee0f5969f2c27552de4fcf82ede5526c384..0000000000000000000000000000000000000000 --- a/convlab/base_models/t5/key2gen/run_nlg_fewshot.sh +++ /dev/null @@ -1,88 +0,0 @@ -n_gpus=1 -task_name="nlg" -dataset_name=$1 -speaker="system" -context_window_size=$2 -ratio=$3 -dial_ids_order=$4 -data_dir="data/${task_name}/${dataset_name}_${ratio}_order${dial_ids_order}/${speaker}/context_${context_window_size}" -output_dir="output/${task_name}/${dataset_name}_${ratio}_order${dial_ids_order}/${speaker}/context_${context_window_size}" -cache_dir="../cache" -logging_dir="${output_dir}/runs" -train_file="${data_dir}/train.json" -validation_file="${data_dir}/validation.json" -test_file="${data_dir}/test.json" -metric_name_or_path="../nlg/nlg_metric.py" -metric_for_best_model="bleu" -source_column="context+da" -target_column="response" -truncation_side="left" -max_source_length=512 -max_target_length=512 -model_name_or_path="t5-small" -per_device_train_batch_size=128 -per_device_eval_batch_size=64 -gradient_accumulation_steps=4 -lr=1e-3 -num_train_epochs=100 - -# python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -s ${speaker} -c ${context_window_size} -r ${ratio} -o ${dial_ids_order} - -# python ../run_seq2seq.py \ -# --task_name ${task_name} \ -# --train_file ${train_file} \ -# --validation_file ${validation_file} \ -# --source_column ${source_column} \ -# --target_column ${target_column} \ -# --max_source_length ${max_source_length} \ -# --max_target_length ${max_target_length} \ -# --truncation_side ${truncation_side} \ -# --model_name_or_path ${model_name_or_path} \ -# --do_train \ -# --do_eval \ -# --save_strategy epoch \ -# --evaluation_strategy epoch \ -# --save_total_limit 1 \ -# --prediction_loss_only \ -# --load_best_model_at_end \ -# --cache_dir ${cache_dir} \ -# --output_dir ${output_dir} \ -# --logging_dir ${logging_dir} \ -# --overwrite_output_dir \ -# --preprocessing_num_workers 4 \ -# --per_device_train_batch_size ${per_device_train_batch_size} \ -# --per_device_eval_batch_size ${per_device_eval_batch_size} \ -# --gradient_accumulation_steps ${gradient_accumulation_steps} \ -# --learning_rate ${lr} \ -# --num_train_epochs ${num_train_epochs} \ -# --adafactor \ -# --gradient_checkpointing - -# python ../run_seq2seq.py \ -# --task_name ${task_name} \ -# --test_file ${test_file} \ -# --source_column ${source_column} \ -# --target_column ${target_column} \ -# --max_source_length ${max_source_length} \ -# --max_target_length ${max_target_length} \ -# --truncation_side ${truncation_side} \ -# --model_name_or_path ${output_dir} \ -# --do_predict \ -# --predict_with_generate \ -# --metric_name_or_path ${metric_name_or_path} \ -# --cache_dir ${cache_dir} \ -# --output_dir ${output_dir} \ -# --logging_dir ${logging_dir} \ -# --overwrite_output_dir \ -# --preprocessing_num_workers 4 \ -# --per_device_train_batch_size ${per_device_train_batch_size} \ -# --per_device_eval_batch_size ${per_device_eval_batch_size} \ -# --gradient_accumulation_steps ${gradient_accumulation_steps} \ -# --learning_rate ${lr} \ -# --num_train_epochs ${num_train_epochs} \ -# --adafactor \ -# --gradient_checkpointing - -python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json -o ${dial_ids_order} - -python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name} diff --git a/convlab/base_models/t5/key2gen/run_nlg_fewshot_key2gen.sh b/convlab/base_models/t5/key2gen/run_nlg_fewshot_key2gen.sh deleted file mode 100644 index c13a187799bcfb6364d5d9d8a22f4ede96c87897..0000000000000000000000000000000000000000 --- a/convlab/base_models/t5/key2gen/run_nlg_fewshot_key2gen.sh +++ /dev/null @@ -1,88 +0,0 @@ -n_gpus=1 -task_name="nlg" -dataset_name=$1 -speaker="system" -context_window_size=$2 -ratio=$3 -dial_ids_order=$4 -data_dir="data/${task_name}/key2gen_${dataset_name}_${ratio}_order${dial_ids_order}/${speaker}/context_${context_window_size}" -output_dir="output/${task_name}/key2gen_${dataset_name}_${ratio}_order${dial_ids_order}/${speaker}/context_${context_window_size}" -cache_dir="../cache" -logging_dir="${output_dir}/runs" -train_file="${data_dir}/train.json" -validation_file="${data_dir}/validation.json" -test_file="${data_dir}/test.json" -metric_name_or_path="../nlg/nlg_metric.py" -metric_for_best_model="bleu" -source_column="context+da" -target_column="response" -truncation_side="left" -max_source_length=512 -max_target_length=512 -model_name_or_path="../../gpt/keyword_extraction/output/key2gen_noisy/gpt/dailydialog+metalwoz+tm1+tm2+tm3" -per_device_train_batch_size=128 -per_device_eval_batch_size=64 -gradient_accumulation_steps=4 -lr=1e-3 -num_train_epochs=100 - -# python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -s ${speaker} -c ${context_window_size} -r ${ratio} -o ${dial_ids_order} --key2gen - -# python ../run_seq2seq.py \ -# --task_name ${task_name} \ -# --train_file ${train_file} \ -# --validation_file ${validation_file} \ -# --source_column ${source_column} \ -# --target_column ${target_column} \ -# --max_source_length ${max_source_length} \ -# --max_target_length ${max_target_length} \ -# --truncation_side ${truncation_side} \ -# --model_name_or_path ${model_name_or_path} \ -# --do_train \ -# --do_eval \ -# --save_strategy epoch \ -# --evaluation_strategy epoch \ -# --save_total_limit 1 \ -# --prediction_loss_only \ -# --load_best_model_at_end \ -# --cache_dir ${cache_dir} \ -# --output_dir ${output_dir} \ -# --logging_dir ${logging_dir} \ -# --overwrite_output_dir \ -# --preprocessing_num_workers 4 \ -# --per_device_train_batch_size ${per_device_train_batch_size} \ -# --per_device_eval_batch_size ${per_device_eval_batch_size} \ -# --gradient_accumulation_steps ${gradient_accumulation_steps} \ -# --learning_rate ${lr} \ -# --num_train_epochs ${num_train_epochs} \ -# --adafactor \ -# --gradient_checkpointing - -# python ../run_seq2seq.py \ -# --task_name ${task_name} \ -# --test_file ${test_file} \ -# --source_column ${source_column} \ -# --target_column ${target_column} \ -# --max_source_length ${max_source_length} \ -# --max_target_length ${max_target_length} \ -# --truncation_side ${truncation_side} \ -# --model_name_or_path ${output_dir} \ -# --do_predict \ -# --predict_with_generate \ -# --metric_name_or_path ${metric_name_or_path} \ -# --cache_dir ${cache_dir} \ -# --output_dir ${output_dir} \ -# --logging_dir ${logging_dir} \ -# --overwrite_output_dir \ -# --preprocessing_num_workers 4 \ -# --per_device_train_batch_size ${per_device_train_batch_size} \ -# --per_device_eval_batch_size ${per_device_eval_batch_size} \ -# --gradient_accumulation_steps ${gradient_accumulation_steps} \ -# --learning_rate ${lr} \ -# --num_train_epochs ${num_train_epochs} \ -# --adafactor \ -# --gradient_checkpointing - -python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json -o ${dial_ids_order} - -python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name} diff --git a/convlab/base_models/t5/key2gen/run_nlg_key2gen.sh b/convlab/base_models/t5/key2gen/run_nlg_key2gen.sh deleted file mode 100644 index f9b89aae85ca2963a162acde2f670caee0f1790e..0000000000000000000000000000000000000000 --- a/convlab/base_models/t5/key2gen/run_nlg_key2gen.sh +++ /dev/null @@ -1,86 +0,0 @@ -n_gpus=1 -task_name="nlg" -dataset_name=$1 -speaker="system" -context_window_size=$2 -data_dir="data/${task_name}/key2gen_${dataset_name}/${speaker}/context_${context_window_size}" -output_dir="output/${task_name}/key2gen_${dataset_name}/${speaker}/context_${context_window_size}" -cache_dir="../cache" -logging_dir="${output_dir}/runs" -train_file="${data_dir}/train.json" -validation_file="${data_dir}/validation.json" -test_file="${data_dir}/test.json" -metric_name_or_path="../nlg/nlg_metric.py" -metric_for_best_model="bleu" -source_column="context+da" -target_column="response" -truncation_side="left" -max_source_length=512 -max_target_length=512 -model_name_or_path="../../gpt/keyword_extraction/output/key2gen_noisy/gpt/dailydialog+metalwoz+tm1+tm2+tm3" -per_device_train_batch_size=128 -per_device_eval_batch_size=64 -gradient_accumulation_steps=4 -lr=1e-3 -num_train_epochs=10 - -# python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -s ${speaker} -c ${context_window_size} --key2gen - -# python ../run_seq2seq.py \ -# --task_name ${task_name} \ -# --train_file ${train_file} \ -# --validation_file ${validation_file} \ -# --source_column ${source_column} \ -# --target_column ${target_column} \ -# --max_source_length ${max_source_length} \ -# --max_target_length ${max_target_length} \ -# --truncation_side ${truncation_side} \ -# --model_name_or_path ${model_name_or_path} \ -# --do_train \ -# --do_eval \ -# --save_strategy epoch \ -# --evaluation_strategy epoch \ -# --save_total_limit 1 \ -# --prediction_loss_only \ -# --load_best_model_at_end \ -# --cache_dir ${cache_dir} \ -# --output_dir ${output_dir} \ -# --logging_dir ${logging_dir} \ -# --overwrite_output_dir \ -# --preprocessing_num_workers 4 \ -# --per_device_train_batch_size ${per_device_train_batch_size} \ -# --per_device_eval_batch_size ${per_device_eval_batch_size} \ -# --gradient_accumulation_steps ${gradient_accumulation_steps} \ -# --learning_rate ${lr} \ -# --num_train_epochs ${num_train_epochs} \ -# --adafactor \ -# --gradient_checkpointing - -# python ../run_seq2seq.py \ -# --task_name ${task_name} \ -# --test_file ${test_file} \ -# --source_column ${source_column} \ -# --target_column ${target_column} \ -# --max_source_length ${max_source_length} \ -# --max_target_length ${max_target_length} \ -# --truncation_side ${truncation_side} \ -# --model_name_or_path ${output_dir} \ -# --do_predict \ -# --predict_with_generate \ -# --metric_name_or_path ${metric_name_or_path} \ -# --cache_dir ${cache_dir} \ -# --output_dir ${output_dir} \ -# --logging_dir ${logging_dir} \ -# --overwrite_output_dir \ -# --preprocessing_num_workers 4 \ -# --per_device_train_batch_size ${per_device_train_batch_size} \ -# --per_device_eval_batch_size ${per_device_eval_batch_size} \ -# --gradient_accumulation_steps ${gradient_accumulation_steps} \ -# --learning_rate ${lr} \ -# --num_train_epochs ${num_train_epochs} \ -# --adafactor \ -# --gradient_checkpointing - -python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json - -python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name} diff --git a/convlab/base_models/t5/key2gen/run_persona.sh b/convlab/base_models/t5/key2gen/run_persona.sh deleted file mode 100644 index 20ae23608ca8ebcd05cb937b53a088df924e0d66..0000000000000000000000000000000000000000 --- a/convlab/base_models/t5/key2gen/run_persona.sh +++ /dev/null @@ -1,85 +0,0 @@ -n_gpus=1 -task_name="personachat" -dataset_name="personachat" -speaker="system" -data_dir="data/${task_name}/${dataset_name}" -output_dir="output/${task_name}/${dataset_name}" -cache_dir="../cache" -logging_dir="${output_dir}/runs" -train_file="${data_dir}/train.json" -validation_file="${data_dir}/validation.json" -test_file="${data_dir}/test.json" -metric_name_or_path="metric.py" -metric_for_best_model="bleu" -source_column="context+persona" -target_column="response" -truncation_side="left" -max_source_length=1024 -max_target_length=512 -model_name_or_path="t5-small" -per_device_train_batch_size=32 -per_device_eval_batch_size=64 -gradient_accumulation_steps=4 -lr=1e-3 -num_train_epochs=10 - -python create_data_key2gen.py -t ${task_name} -d ${dataset_name} - -python ../run_seq2seq.py \ - --task_name ${task_name} \ - --train_file ${train_file} \ - --validation_file ${validation_file} \ - --source_column ${source_column} \ - --target_column ${target_column} \ - --max_source_length ${max_source_length} \ - --max_target_length ${max_target_length} \ - --truncation_side ${truncation_side} \ - --model_name_or_path ${model_name_or_path} \ - --do_train \ - --do_eval \ - --save_strategy epoch \ - --evaluation_strategy epoch \ - --save_total_limit 1 \ - --prediction_loss_only \ - --load_best_model_at_end \ - --cache_dir ${cache_dir} \ - --output_dir ${output_dir} \ - --logging_dir ${logging_dir} \ - --overwrite_output_dir \ - --preprocessing_num_workers 4 \ - --per_device_train_batch_size ${per_device_train_batch_size} \ - --per_device_eval_batch_size ${per_device_eval_batch_size} \ - --gradient_accumulation_steps ${gradient_accumulation_steps} \ - --learning_rate ${lr} \ - --num_train_epochs ${num_train_epochs} \ - --adafactor \ - --gradient_checkpointing - -python ../run_seq2seq.py \ - --task_name ${task_name} \ - --test_file ${test_file} \ - --source_column ${source_column} \ - --target_column ${target_column} \ - --max_source_length ${max_source_length} \ - --max_target_length ${max_target_length} \ - --truncation_side ${truncation_side} \ - --model_name_or_path ${output_dir} \ - --do_predict \ - --predict_with_generate \ - --metric_name_or_path ${metric_name_or_path} \ - --cache_dir ${cache_dir} \ - --output_dir ${output_dir} \ - --logging_dir ${logging_dir} \ - --overwrite_output_dir \ - --preprocessing_num_workers 4 \ - --per_device_train_batch_size ${per_device_train_batch_size} \ - --per_device_eval_batch_size ${per_device_eval_batch_size} \ - --gradient_accumulation_steps ${gradient_accumulation_steps} \ - --learning_rate ${lr} \ - --num_train_epochs ${num_train_epochs} \ - --adafactor \ - --gradient_checkpointing - -# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json - -# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name} diff --git a/convlab/base_models/t5/key2gen/run_persona_fewshot.sh b/convlab/base_models/t5/key2gen/run_persona_fewshot.sh deleted file mode 100644 index 4ecf435e52293d2d8cc03411940ce55f5e73f3ab..0000000000000000000000000000000000000000 --- a/convlab/base_models/t5/key2gen/run_persona_fewshot.sh +++ /dev/null @@ -1,87 +0,0 @@ -n_gpus=1 -task_name="personachat" -dataset_name="personachat" -speaker="system" -ratio=$1 -dial_ids_order=$2 -data_dir="data/${task_name}/${dataset_name}_${ratio}_order${dial_ids_order}" -output_dir="output/${task_name}/${dataset_name}_${ratio}_order${dial_ids_order}" -cache_dir="../cache" -logging_dir="${output_dir}/runs" -train_file="${data_dir}/train.json" -validation_file="${data_dir}/validation.json" -test_file="${data_dir}/test.json" -metric_name_or_path="metric.py" -metric_for_best_model="bleu" -source_column="context+persona" -target_column="response" -truncation_side="left" -max_source_length=1024 -max_target_length=512 -model_name_or_path="t5-small" -per_device_train_batch_size=32 -per_device_eval_batch_size=64 -gradient_accumulation_steps=4 -lr=1e-3 -num_train_epochs=100 - -python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -r ${ratio} -o ${dial_ids_order} - -python ../run_seq2seq.py \ - --task_name ${task_name} \ - --train_file ${train_file} \ - --validation_file ${validation_file} \ - --source_column ${source_column} \ - --target_column ${target_column} \ - --max_source_length ${max_source_length} \ - --max_target_length ${max_target_length} \ - --truncation_side ${truncation_side} \ - --model_name_or_path ${model_name_or_path} \ - --do_train \ - --do_eval \ - --save_strategy epoch \ - --evaluation_strategy epoch \ - --save_total_limit 1 \ - --prediction_loss_only \ - --load_best_model_at_end \ - --cache_dir ${cache_dir} \ - --output_dir ${output_dir} \ - --logging_dir ${logging_dir} \ - --overwrite_output_dir \ - --preprocessing_num_workers 4 \ - --per_device_train_batch_size ${per_device_train_batch_size} \ - --per_device_eval_batch_size ${per_device_eval_batch_size} \ - --gradient_accumulation_steps ${gradient_accumulation_steps} \ - --learning_rate ${lr} \ - --num_train_epochs ${num_train_epochs} \ - --adafactor \ - --gradient_checkpointing - -python ../run_seq2seq.py \ - --task_name ${task_name} \ - --test_file ${test_file} \ - --source_column ${source_column} \ - --target_column ${target_column} \ - --max_source_length ${max_source_length} \ - --max_target_length ${max_target_length} \ - --truncation_side ${truncation_side} \ - --model_name_or_path ${output_dir} \ - --do_predict \ - --predict_with_generate \ - --metric_name_or_path ${metric_name_or_path} \ - --cache_dir ${cache_dir} \ - --output_dir ${output_dir} \ - --logging_dir ${logging_dir} \ - --overwrite_output_dir \ - --preprocessing_num_workers 4 \ - --per_device_train_batch_size ${per_device_train_batch_size} \ - --per_device_eval_batch_size ${per_device_eval_batch_size} \ - --gradient_accumulation_steps ${gradient_accumulation_steps} \ - --learning_rate ${lr} \ - --num_train_epochs ${num_train_epochs} \ - --adafactor \ - --gradient_checkpointing - -# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json - -# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name} diff --git a/convlab/base_models/t5/key2gen/run_persona_fewshot_key2gen.sh b/convlab/base_models/t5/key2gen/run_persona_fewshot_key2gen.sh deleted file mode 100644 index 6b95839a376e4857c387535ef0a47a5317530498..0000000000000000000000000000000000000000 --- a/convlab/base_models/t5/key2gen/run_persona_fewshot_key2gen.sh +++ /dev/null @@ -1,87 +0,0 @@ -n_gpus=1 -task_name="personachat" -dataset_name="personachat" -speaker="system" -ratio=$1 -dial_ids_order=$2 -data_dir="data/${task_name}/key2gen_${dataset_name}_${ratio}_order${dial_ids_order}" -output_dir="output/${task_name}/key2gen_${dataset_name}_${ratio}_order${dial_ids_order}" -cache_dir="../cache" -logging_dir="${output_dir}/runs" -train_file="${data_dir}/train.json" -validation_file="${data_dir}/validation.json" -test_file="${data_dir}/test.json" -metric_name_or_path="metric.py" -metric_for_best_model="bleu" -source_column="context+persona" -target_column="response" -truncation_side="left" -max_source_length=1024 -max_target_length=512 -model_name_or_path="../../gpt/keyword_extraction/output/key2gen_noisy/gpt/dailydialog+metalwoz+tm1+tm2+tm3+wikidialog" -per_device_train_batch_size=32 -per_device_eval_batch_size=64 -gradient_accumulation_steps=4 -lr=1e-3 -num_train_epochs=100 - -python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -r ${ratio} -o ${dial_ids_order} --key2gen - -python ../run_seq2seq.py \ - --task_name ${task_name} \ - --train_file ${train_file} \ - --validation_file ${validation_file} \ - --source_column ${source_column} \ - --target_column ${target_column} \ - --max_source_length ${max_source_length} \ - --max_target_length ${max_target_length} \ - --truncation_side ${truncation_side} \ - --model_name_or_path ${model_name_or_path} \ - --do_train \ - --do_eval \ - --save_strategy epoch \ - --evaluation_strategy epoch \ - --save_total_limit 1 \ - --prediction_loss_only \ - --load_best_model_at_end \ - --cache_dir ${cache_dir} \ - --output_dir ${output_dir} \ - --logging_dir ${logging_dir} \ - --overwrite_output_dir \ - --preprocessing_num_workers 4 \ - --per_device_train_batch_size ${per_device_train_batch_size} \ - --per_device_eval_batch_size ${per_device_eval_batch_size} \ - --gradient_accumulation_steps ${gradient_accumulation_steps} \ - --learning_rate ${lr} \ - --num_train_epochs ${num_train_epochs} \ - --adafactor \ - --gradient_checkpointing - -python ../run_seq2seq.py \ - --task_name ${task_name} \ - --test_file ${test_file} \ - --source_column ${source_column} \ - --target_column ${target_column} \ - --max_source_length ${max_source_length} \ - --max_target_length ${max_target_length} \ - --truncation_side ${truncation_side} \ - --model_name_or_path ${output_dir} \ - --do_predict \ - --predict_with_generate \ - --metric_name_or_path ${metric_name_or_path} \ - --cache_dir ${cache_dir} \ - --output_dir ${output_dir} \ - --logging_dir ${logging_dir} \ - --overwrite_output_dir \ - --preprocessing_num_workers 4 \ - --per_device_train_batch_size ${per_device_train_batch_size} \ - --per_device_eval_batch_size ${per_device_eval_batch_size} \ - --gradient_accumulation_steps ${gradient_accumulation_steps} \ - --learning_rate ${lr} \ - --num_train_epochs ${num_train_epochs} \ - --adafactor \ - --gradient_checkpointing - -# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json - -# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name} diff --git a/convlab/base_models/t5/key2gen/run_persona_key2gen.sh b/convlab/base_models/t5/key2gen/run_persona_key2gen.sh deleted file mode 100644 index a2fea9b1fea8540731fd7e3d499156bfe7025690..0000000000000000000000000000000000000000 --- a/convlab/base_models/t5/key2gen/run_persona_key2gen.sh +++ /dev/null @@ -1,85 +0,0 @@ -n_gpus=1 -task_name="personachat" -dataset_name="personachat" -speaker="system" -data_dir="data/${task_name}/key2gen_${dataset_name}" -output_dir="output/${task_name}/key2gen_${dataset_name}" -cache_dir="../cache" -logging_dir="${output_dir}/runs" -train_file="${data_dir}/train.json" -validation_file="${data_dir}/validation.json" -test_file="${data_dir}/test.json" -metric_name_or_path="metric.py" -metric_for_best_model="bleu" -source_column="context+persona" -target_column="response" -truncation_side="left" -max_source_length=1024 -max_target_length=512 -model_name_or_path="../../gpt/keyword_extraction/output/key2gen_noisy/gpt/dailydialog+metalwoz+tm1+tm2+tm3+wikidialog" -per_device_train_batch_size=32 -per_device_eval_batch_size=64 -gradient_accumulation_steps=4 -lr=1e-3 -num_train_epochs=10 - -python create_data_key2gen.py -t ${task_name} -d ${dataset_name} --key2gen - -python ../run_seq2seq.py \ - --task_name ${task_name} \ - --train_file ${train_file} \ - --validation_file ${validation_file} \ - --source_column ${source_column} \ - --target_column ${target_column} \ - --max_source_length ${max_source_length} \ - --max_target_length ${max_target_length} \ - --truncation_side ${truncation_side} \ - --model_name_or_path ${model_name_or_path} \ - --do_train \ - --do_eval \ - --save_strategy epoch \ - --evaluation_strategy epoch \ - --save_total_limit 1 \ - --prediction_loss_only \ - --load_best_model_at_end \ - --cache_dir ${cache_dir} \ - --output_dir ${output_dir} \ - --logging_dir ${logging_dir} \ - --overwrite_output_dir \ - --preprocessing_num_workers 4 \ - --per_device_train_batch_size ${per_device_train_batch_size} \ - --per_device_eval_batch_size ${per_device_eval_batch_size} \ - --gradient_accumulation_steps ${gradient_accumulation_steps} \ - --learning_rate ${lr} \ - --num_train_epochs ${num_train_epochs} \ - --adafactor \ - --gradient_checkpointing - -python ../run_seq2seq.py \ - --task_name ${task_name} \ - --test_file ${test_file} \ - --source_column ${source_column} \ - --target_column ${target_column} \ - --max_source_length ${max_source_length} \ - --max_target_length ${max_target_length} \ - --truncation_side ${truncation_side} \ - --model_name_or_path ${output_dir} \ - --do_predict \ - --predict_with_generate \ - --metric_name_or_path ${metric_name_or_path} \ - --cache_dir ${cache_dir} \ - --output_dir ${output_dir} \ - --logging_dir ${logging_dir} \ - --overwrite_output_dir \ - --preprocessing_num_workers 4 \ - --per_device_train_batch_size ${per_device_train_batch_size} \ - --per_device_eval_batch_size ${per_device_eval_batch_size} \ - --gradient_accumulation_steps ${gradient_accumulation_steps} \ - --learning_rate ${lr} \ - --num_train_epochs ${num_train_epochs} \ - --adafactor \ - --gradient_checkpointing - -# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json - -# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name} diff --git a/convlab/base_models/t5/key2gen/run_wow.sh b/convlab/base_models/t5/key2gen/run_wow.sh deleted file mode 100644 index 0fd9fb78ea5e260ef30fdf00658d4a390421f900..0000000000000000000000000000000000000000 --- a/convlab/base_models/t5/key2gen/run_wow.sh +++ /dev/null @@ -1,85 +0,0 @@ -n_gpus=1 -task_name="wow" -dataset_name="wow" -speaker="system" -data_dir="data/${task_name}/${dataset_name}" -output_dir="output/${task_name}/${dataset_name}" -cache_dir="../cache" -logging_dir="${output_dir}/runs" -train_file="${data_dir}/train.json" -validation_file="${data_dir}/validation.json" -test_file="${data_dir}/test_unseen.json" -metric_name_or_path="metric.py" -metric_for_best_model="bleu" -source_column="context+knowledge" -target_column="response" -truncation_side="left" -max_source_length=1024 -max_target_length=512 -model_name_or_path="t5-small" -per_device_train_batch_size=32 -per_device_eval_batch_size=64 -gradient_accumulation_steps=4 -lr=1e-3 -num_train_epochs=10 - -python create_data_key2gen.py -t ${task_name} -d ${dataset_name} - -python ../run_seq2seq.py \ - --task_name ${task_name} \ - --train_file ${train_file} \ - --validation_file ${validation_file} \ - --source_column ${source_column} \ - --target_column ${target_column} \ - --max_source_length ${max_source_length} \ - --max_target_length ${max_target_length} \ - --truncation_side ${truncation_side} \ - --model_name_or_path ${model_name_or_path} \ - --do_train \ - --do_eval \ - --save_strategy epoch \ - --evaluation_strategy epoch \ - --save_total_limit 1 \ - --prediction_loss_only \ - --load_best_model_at_end \ - --cache_dir ${cache_dir} \ - --output_dir ${output_dir} \ - --logging_dir ${logging_dir} \ - --overwrite_output_dir \ - --preprocessing_num_workers 4 \ - --per_device_train_batch_size ${per_device_train_batch_size} \ - --per_device_eval_batch_size ${per_device_eval_batch_size} \ - --gradient_accumulation_steps ${gradient_accumulation_steps} \ - --learning_rate ${lr} \ - --num_train_epochs ${num_train_epochs} \ - --adafactor \ - --gradient_checkpointing - -python ../run_seq2seq.py \ - --task_name ${task_name} \ - --test_file ${test_file} \ - --source_column ${source_column} \ - --target_column ${target_column} \ - --max_source_length ${max_source_length} \ - --max_target_length ${max_target_length} \ - --truncation_side ${truncation_side} \ - --model_name_or_path ${output_dir} \ - --do_predict \ - --predict_with_generate \ - --metric_name_or_path ${metric_name_or_path} \ - --cache_dir ${cache_dir} \ - --output_dir ${output_dir} \ - --logging_dir ${logging_dir} \ - --overwrite_output_dir \ - --preprocessing_num_workers 4 \ - --per_device_train_batch_size ${per_device_train_batch_size} \ - --per_device_eval_batch_size ${per_device_eval_batch_size} \ - --gradient_accumulation_steps ${gradient_accumulation_steps} \ - --learning_rate ${lr} \ - --num_train_epochs ${num_train_epochs} \ - --adafactor \ - --gradient_checkpointing - -# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json - -# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name} diff --git a/convlab/base_models/t5/key2gen/run_wow_fewshot.sh b/convlab/base_models/t5/key2gen/run_wow_fewshot.sh deleted file mode 100644 index c9cdbe645d771d099403c9f840a08114e005a18d..0000000000000000000000000000000000000000 --- a/convlab/base_models/t5/key2gen/run_wow_fewshot.sh +++ /dev/null @@ -1,87 +0,0 @@ -n_gpus=1 -task_name="wow" -dataset_name="wow" -speaker="system" -ratio=$1 -dial_ids_order=$2 -data_dir="data/${task_name}/${dataset_name}_${ratio}_order${dial_ids_order}" -output_dir="output/${task_name}/${dataset_name}_${ratio}_order${dial_ids_order}" -cache_dir="../cache" -logging_dir="${output_dir}/runs" -train_file="${data_dir}/train.json" -validation_file="${data_dir}/validation.json" -test_file="${data_dir}/test_unseen.json" -metric_name_or_path="metric.py" -metric_for_best_model="bleu" -source_column="context+knowledge" -target_column="response" -truncation_side="left" -max_source_length=1024 -max_target_length=512 -model_name_or_path="t5-small" -per_device_train_batch_size=32 -per_device_eval_batch_size=64 -gradient_accumulation_steps=4 -lr=1e-3 -num_train_epochs=100 - -python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -r ${ratio} -o ${dial_ids_order} - -python ../run_seq2seq.py \ - --task_name ${task_name} \ - --train_file ${train_file} \ - --validation_file ${validation_file} \ - --source_column ${source_column} \ - --target_column ${target_column} \ - --max_source_length ${max_source_length} \ - --max_target_length ${max_target_length} \ - --truncation_side ${truncation_side} \ - --model_name_or_path ${model_name_or_path} \ - --do_train \ - --do_eval \ - --save_strategy epoch \ - --evaluation_strategy epoch \ - --save_total_limit 1 \ - --prediction_loss_only \ - --load_best_model_at_end \ - --cache_dir ${cache_dir} \ - --output_dir ${output_dir} \ - --logging_dir ${logging_dir} \ - --overwrite_output_dir \ - --preprocessing_num_workers 4 \ - --per_device_train_batch_size ${per_device_train_batch_size} \ - --per_device_eval_batch_size ${per_device_eval_batch_size} \ - --gradient_accumulation_steps ${gradient_accumulation_steps} \ - --learning_rate ${lr} \ - --num_train_epochs ${num_train_epochs} \ - --adafactor \ - --gradient_checkpointing - -python ../run_seq2seq.py \ - --task_name ${task_name} \ - --test_file ${test_file} \ - --source_column ${source_column} \ - --target_column ${target_column} \ - --max_source_length ${max_source_length} \ - --max_target_length ${max_target_length} \ - --truncation_side ${truncation_side} \ - --model_name_or_path ${output_dir} \ - --do_predict \ - --predict_with_generate \ - --metric_name_or_path ${metric_name_or_path} \ - --cache_dir ${cache_dir} \ - --output_dir ${output_dir} \ - --logging_dir ${logging_dir} \ - --overwrite_output_dir \ - --preprocessing_num_workers 4 \ - --per_device_train_batch_size ${per_device_train_batch_size} \ - --per_device_eval_batch_size ${per_device_eval_batch_size} \ - --gradient_accumulation_steps ${gradient_accumulation_steps} \ - --learning_rate ${lr} \ - --num_train_epochs ${num_train_epochs} \ - --adafactor \ - --gradient_checkpointing - -# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json - -# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name} diff --git a/convlab/base_models/t5/key2gen/run_wow_fewshot_key2gen.sh b/convlab/base_models/t5/key2gen/run_wow_fewshot_key2gen.sh deleted file mode 100644 index 6aaf8ca4dd160f8c108281413441e021b5bd0571..0000000000000000000000000000000000000000 --- a/convlab/base_models/t5/key2gen/run_wow_fewshot_key2gen.sh +++ /dev/null @@ -1,87 +0,0 @@ -n_gpus=1 -task_name="wow" -dataset_name="wow" -speaker="system" -ratio=$1 -dial_ids_order=$2 -data_dir="data/${task_name}/key2gen_${dataset_name}_${ratio}_order${dial_ids_order}" -output_dir="output/${task_name}/key2gen_${dataset_name}_${ratio}_order${dial_ids_order}" -cache_dir="../cache" -logging_dir="${output_dir}/runs" -train_file="${data_dir}/train.json" -validation_file="${data_dir}/validation.json" -test_file="${data_dir}/test_unseen.json" -metric_name_or_path="metric.py" -metric_for_best_model="bleu" -source_column="context+knowledge" -target_column="response" -truncation_side="left" -max_source_length=1024 -max_target_length=512 -model_name_or_path="../../gpt/keyword_extraction/output/key2gen_noisy/gpt/dailydialog+metalwoz+tm1+tm2+tm3+wikidialog" -per_device_train_batch_size=32 -per_device_eval_batch_size=64 -gradient_accumulation_steps=4 -lr=1e-3 -num_train_epochs=100 - -python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -r ${ratio} -o ${dial_ids_order} --key2gen - -python ../run_seq2seq.py \ - --task_name ${task_name} \ - --train_file ${train_file} \ - --validation_file ${validation_file} \ - --source_column ${source_column} \ - --target_column ${target_column} \ - --max_source_length ${max_source_length} \ - --max_target_length ${max_target_length} \ - --truncation_side ${truncation_side} \ - --model_name_or_path ${model_name_or_path} \ - --do_train \ - --do_eval \ - --save_strategy epoch \ - --evaluation_strategy epoch \ - --save_total_limit 1 \ - --prediction_loss_only \ - --load_best_model_at_end \ - --cache_dir ${cache_dir} \ - --output_dir ${output_dir} \ - --logging_dir ${logging_dir} \ - --overwrite_output_dir \ - --preprocessing_num_workers 4 \ - --per_device_train_batch_size ${per_device_train_batch_size} \ - --per_device_eval_batch_size ${per_device_eval_batch_size} \ - --gradient_accumulation_steps ${gradient_accumulation_steps} \ - --learning_rate ${lr} \ - --num_train_epochs ${num_train_epochs} \ - --adafactor \ - --gradient_checkpointing - -python ../run_seq2seq.py \ - --task_name ${task_name} \ - --test_file ${test_file} \ - --source_column ${source_column} \ - --target_column ${target_column} \ - --max_source_length ${max_source_length} \ - --max_target_length ${max_target_length} \ - --truncation_side ${truncation_side} \ - --model_name_or_path ${output_dir} \ - --do_predict \ - --predict_with_generate \ - --metric_name_or_path ${metric_name_or_path} \ - --cache_dir ${cache_dir} \ - --output_dir ${output_dir} \ - --logging_dir ${logging_dir} \ - --overwrite_output_dir \ - --preprocessing_num_workers 4 \ - --per_device_train_batch_size ${per_device_train_batch_size} \ - --per_device_eval_batch_size ${per_device_eval_batch_size} \ - --gradient_accumulation_steps ${gradient_accumulation_steps} \ - --learning_rate ${lr} \ - --num_train_epochs ${num_train_epochs} \ - --adafactor \ - --gradient_checkpointing - -# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json - -# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name} diff --git a/convlab/base_models/t5/key2gen/run_wow_key2gen.sh b/convlab/base_models/t5/key2gen/run_wow_key2gen.sh deleted file mode 100644 index b5badb7f7fefb979f6d9dea201d87e384a403493..0000000000000000000000000000000000000000 --- a/convlab/base_models/t5/key2gen/run_wow_key2gen.sh +++ /dev/null @@ -1,85 +0,0 @@ -n_gpus=1 -task_name="wow" -dataset_name="wow" -speaker="system" -data_dir="data/${task_name}/key2gen_${dataset_name}" -output_dir="output/${task_name}/key2gen_${dataset_name}" -cache_dir="../cache" -logging_dir="${output_dir}/runs" -train_file="${data_dir}/train.json" -validation_file="${data_dir}/validation.json" -test_file="${data_dir}/test_unseen.json" -metric_name_or_path="metric.py" -metric_for_best_model="bleu" -source_column="context+knowledge" -target_column="response" -truncation_side="left" -max_source_length=1024 -max_target_length=512 -model_name_or_path="../../gpt/keyword_extraction/output/key2gen_noisy/gpt/dailydialog+metalwoz+tm1+tm2+tm3+wikidialog" -per_device_train_batch_size=32 -per_device_eval_batch_size=64 -gradient_accumulation_steps=4 -lr=1e-3 -num_train_epochs=10 - -python create_data_key2gen.py -t ${task_name} -d ${dataset_name} --key2gen - -python ../run_seq2seq.py \ - --task_name ${task_name} \ - --train_file ${train_file} \ - --validation_file ${validation_file} \ - --source_column ${source_column} \ - --target_column ${target_column} \ - --max_source_length ${max_source_length} \ - --max_target_length ${max_target_length} \ - --truncation_side ${truncation_side} \ - --model_name_or_path ${model_name_or_path} \ - --do_train \ - --do_eval \ - --save_strategy epoch \ - --evaluation_strategy epoch \ - --save_total_limit 1 \ - --prediction_loss_only \ - --load_best_model_at_end \ - --cache_dir ${cache_dir} \ - --output_dir ${output_dir} \ - --logging_dir ${logging_dir} \ - --overwrite_output_dir \ - --preprocessing_num_workers 4 \ - --per_device_train_batch_size ${per_device_train_batch_size} \ - --per_device_eval_batch_size ${per_device_eval_batch_size} \ - --gradient_accumulation_steps ${gradient_accumulation_steps} \ - --learning_rate ${lr} \ - --num_train_epochs ${num_train_epochs} \ - --adafactor \ - --gradient_checkpointing - -python ../run_seq2seq.py \ - --task_name ${task_name} \ - --test_file ${test_file} \ - --source_column ${source_column} \ - --target_column ${target_column} \ - --max_source_length ${max_source_length} \ - --max_target_length ${max_target_length} \ - --truncation_side ${truncation_side} \ - --model_name_or_path ${output_dir} \ - --do_predict \ - --predict_with_generate \ - --metric_name_or_path ${metric_name_or_path} \ - --cache_dir ${cache_dir} \ - --output_dir ${output_dir} \ - --logging_dir ${logging_dir} \ - --overwrite_output_dir \ - --preprocessing_num_workers 4 \ - --per_device_train_batch_size ${per_device_train_batch_size} \ - --per_device_eval_batch_size ${per_device_eval_batch_size} \ - --gradient_accumulation_steps ${gradient_accumulation_steps} \ - --learning_rate ${lr} \ - --num_train_epochs ${num_train_epochs} \ - --adafactor \ - --gradient_checkpointing - -# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json - -# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name} diff --git a/convlab/base_models/t5/run_seq2seq.py b/convlab/base_models/t5/run_seq2seq.py index 7aac3c70746e877469fc34892cd3f93f9fd01f22..5fa921f0d4c855dc17b7f3b5d1daa8cc404f957c 100644 --- a/convlab/base_models/t5/run_seq2seq.py +++ b/convlab/base_models/t5/run_seq2seq.py @@ -149,6 +149,9 @@ class DataTrainingArguments: "help": "An optional metric name or file to evaluate the model." }, ) + metric_config_name: Optional[str] = field( + default=None, metadata={"help": "The configuration name of the metric to use (via the datasets library)."} + ) overwrite_cache: bool = field( default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} ) @@ -317,8 +320,17 @@ def main(): # download the dataset. if data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. + data_files = {} + if data_args.train_file is not None: + data_files["train"] = data_args.train_file + if data_args.validation_file is not None: + data_files["validation"] = data_args.validation_file + if data_args.test_file is not None: + data_files["test"] = data_args.test_file + raw_datasets = load_dataset( - data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir + data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, + data_files=data_files if len(data_files) > 0 else None ) else: data_files = {} @@ -528,7 +540,7 @@ def main(): # compute custom metric at evaluation. if data_args.metric_name_or_path: - metric = load_metric(data_args.metric_name_or_path) + metric = load_metric(data_args.metric_name_or_path, data_args.metric_config_name) # Must take a EvalPrediction and return a dictionary string to metric values. def compute_metrics(p: EvalPrediction): preds, labels = p.predictions, p.label_ids