diff --git a/convlab2/base_models/gpt/keyword_extraction/gen_pretraining_data.sh b/convlab2/base_models/gpt/keyword_extraction/gen_pretraining_data.sh
index f24058ecfa63c40c9100f03f061d64b58946796f..c98060c6044359fabbb4ee9295f2e3e70df86eec 100644
--- a/convlab2/base_models/gpt/keyword_extraction/gen_pretraining_data.sh
+++ b/convlab2/base_models/gpt/keyword_extraction/gen_pretraining_data.sh
@@ -1,5 +1,5 @@
 task_name="key2gen_noisy"
-dataset_name="dailydialog+metalwoz+tm1+tm2+tm3"
+dataset_name="dailydialog+metalwoz+tm1+tm2+tm3+wikidialog"
 names=$(echo ${dataset_name} | tr "+" "\n")
 model_type="gpt"
 data_dir=data/${task_name}/${model_type}/${name}/${dataset_name}
diff --git a/convlab2/base_models/gpt/keyword_extraction/train_t5_key2gen.sh b/convlab2/base_models/gpt/keyword_extraction/train_t5_key2gen.sh
index 9a413f2dd91381cd0a05b13fdc79e5be588f8cc2..36e24587d3d7181e4a92b649e5f1b1e47cdcd9f3 100644
--- a/convlab2/base_models/gpt/keyword_extraction/train_t5_key2gen.sh
+++ b/convlab2/base_models/gpt/keyword_extraction/train_t5_key2gen.sh
@@ -2,7 +2,7 @@ set -e
 n_gpus=2
 master_port=23457
 task_name="key2gen_noisy"
-dataset_name="dailydialog+metalwoz+tm1+tm2+tm3"
+dataset_name="dailydialog+metalwoz+tm1+tm2+tm3+wikidialog"
 model_type="gpt"
 data_dir="data/${task_name}/${model_type}/${dataset_name}"
 output_dir="output/${task_name}/${model_type}/${dataset_name}"
diff --git a/convlab2/base_models/t5/key2gen/create_data_key2gen.py b/convlab2/base_models/t5/key2gen/create_data_key2gen.py
index edf4a130997f9d6ed0716f1e99577fb4621e8cb5..d2a688558617c3f6aee306e0d6895021482b888c 100644
--- a/convlab2/base_models/t5/key2gen/create_data_key2gen.py
+++ b/convlab2/base_models/t5/key2gen/create_data_key2gen.py
@@ -1,11 +1,13 @@
 import os
 import json
 from tqdm import tqdm
-import re
 from transformers import AutoTokenizer
-from convlab2.util import load_dataset, load_nlu_data, load_dst_data, load_policy_data, load_nlg_data, load_e2e_data, load_rg_data
-from convlab2.base_models.t5.nlu.serialization import serialize_dialogue_acts, deserialize_dialogue_acts, equal_da_seq
+from convlab2.util import load_dataset, load_unified_data, load_nlu_data
+from convlab2.base_models.t5.nlu.serialization import serialize_dialogue_acts
 from collections import Counter
+from nltk.tokenize import word_tokenize, sent_tokenize
+from nltk.corpus import stopwords
+from string import punctuation
 
 def create_nlg_data(dataset, data_dir, args):
     data_by_split = load_nlu_data(dataset, speaker=args.speaker, use_context=args.context_window_size>0, context_window_size=args.context_window_size)
@@ -17,30 +19,26 @@ def create_nlg_data(dataset, data_dir, args):
         data = []
         for sample in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False):
             if args.key2gen:
-                da_dict = {}
+                keywords = []
                 for da_type in sample['dialogue_acts']:
                     for da in sample['dialogue_acts'][da_type]:
                         intent, domain, slot, value = da['intent'], da['domain'], da['slot'], da.get('value', '')
                         intent_domain = f'{intent}-{domain}'
-                        da_dict.setdefault(intent_domain, [])
-                        da_dict[intent_domain].append((slot, value))
-                keywords = []
-                for intent_domain, slot_values in da_dict.items():
-                    keywords.append(intent_domain)
-                    for slot, value in slot_values:
+                        keywords.append([intent_domain])
                         if len(slot) > 0:
-                            keywords.append(slot)
+                            keywords[-1].append(slot)
                         if len(value) > 0:
-                            keywords.append(value)
-                dialogue_acts_seq = ' | '.join(keywords)
+                            keywords[-1].append(value)
+                dialogue_acts_seq = '| {} |'.format(' | '.join([' : '.join(da_keywords) for da_keywords in keywords]))
             else:
                 dialogue_acts_seq = serialize_dialogue_acts(sample['dialogue_acts'])
 
             if args.context_window_size>0:
                 context = '\n'.join([f"{turn['speaker']}: {turn['utterance']}" for turn in sample['context']]+[f'{sample["speaker"]}: '])
-                context = f'{dialogue_acts_seq}\n\ncontext: {context}'
+                context = f'generate a response: grounded knowledge: {dialogue_acts_seq} context:\n\n{context}'
             else:
-                context = f'{dialogue_acts_seq}\n\ncontext: {sample["speaker"]}: '
+                context = f'generate a response: grounded knowledge: {dialogue_acts_seq} context:\n\n{sample["speaker"]}: '
+
             data.append(json.dumps({'context+da': context, 'response': sample['utterance']}, ensure_ascii=False)+'\n')
 
         file_name = os.path.join(data_dir, f"{data_split}.json")
@@ -61,23 +59,21 @@ def create_dart_data(dataset, data_dir, args):
             triples = sample['tripleset']
             if args.key2gen:
                 keywords = [w for triple in triples for w in triple]
-                # TODO: try adding prompt
-                # entity_cnt = Counter()
-                # for triple in triples:
-                #     e1, r, e2 = triple
-                #     for e in [e1, e2]:
-                #         if e.startswith('[') and e.endswith(']'):
-                #             continue
-                #         entity_cnt[e] += 1
+                # DONE: try adding prompt, no improvement
+                entity_cnt = Counter()
+                for triple in triples:
+                    e1, r, e2 = triple
+                    for e in [e1, e2]:
+                        if e.startswith('[') and e.endswith(']'):
+                            continue
+                        entity_cnt[e] += 1
 
-                # assert len(entity_cnt) > 0
-                # common_entity = entity_cnt.most_common(1)[0][0]
-                # context = f'{" | ".join(keywords)}\n\ncontext: user: tell me something about {common_entity}. system: '
-
-                context = f'{" | ".join(keywords)}\n\ncontext: system: '
+                assert len(entity_cnt) > 0
+                common_entity = entity_cnt.most_common(1)[0][0]
+                context = f'{" | ".join(keywords)}\n\ncontext: user: tell me something about {common_entity}. system: '
             else:
-                triples = [f"[{triple[0]}][{triple[1]}][{triple[2]}]" for triple in triples]
-                context = f'{";".join(triples)}\n\ncontext: system: '
+                triples = [' : '.join(triple) for triple in triples]
+                context = f'{" | ".join(triples)}\n\ncontext: system: '
 
             data.append(json.dumps({'triples': context, 'text': sample['utterance']}, ensure_ascii=False)+'\n')
 
@@ -87,6 +83,148 @@
+def create_commongen_data(dataset, data_dir, args):
+    data_by_split = dataset
+    os.makedirs(data_dir, exist_ok=True)
+
+    data_splits = data_by_split.keys()
+    for data_split in data_splits:
+        data = []
+        for sample in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False):
+            sample = sample['turns'][0]
+            concepts = sample['concepts']
+            context = f'{" | ".join(concepts)}\n\ncontext: system: '
+
+            data.append(json.dumps({'concepts': context, 'text': sample['utterance']}, ensure_ascii=False)+'\n')
+
+        file_name = os.path.join(data_dir, f"{data_split}.json")
+        with open(file_name, "w", encoding='utf-8') as f:
+            f.writelines(data)
+        data_by_split[data_split] = data
+    return data_by_split
+
+def create_kvret_data(dataset, data_dir, args):
+    data_by_split = load_unified_data(dataset, speaker='system', utterance=True, db_results=True, use_context=True, context_window_size=100)
+    os.makedirs(data_dir, exist_ok=True)
+
+    domain2entity_col = {'schedule': 'event' ,'navigate': 'poi', 'weather': 'location'}
+    data_splits = data_by_split.keys()
+    for data_split in data_splits:
+        data = []
+        for sample in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False):
+            if len(sample['utterance']) == 0:
+                continue
+            db_results = sample['db_results']
+            db_seqs = []
+            for domain, db_items in db_results.items():
+                entity_col = domain2entity_col[domain]
+                for db_item in db_items:
+                    entity = db_item[entity_col]
+                    for db_key, db_value in db_item.items():
+                        if db_key == entity_col:
+                            continue
+                        db_seqs.append(' : '.join([entity, db_key, db_value]))
+            db_seq = ' |\n'.join(db_seqs)
+
+            context = '\n'.join([f"{turn['speaker']}: {turn['utterance']}" for turn in sample['context']]+[f'{sample["speaker"]}: '])
+            context = f'generate a response: all knowledge:\n\n| {db_seq} | context:\n\n{context}'
+            data.append(json.dumps({'context+db': context, 'response': sample['utterance']}, ensure_ascii=False)+'\n')
+
+        file_name = os.path.join(data_dir, f"{data_split}.json")
+        with open(file_name, "w", encoding='utf-8') as f:
+            f.writelines(data)
+        data_by_split[data_split] = data
+    return data_by_split
+
+def create_personachat_data(dataset, data_dir, args):
+    data_by_split = dataset
+    os.makedirs(data_dir, exist_ok=True)
+
+    stop_words = set(stopwords.words('english')) | set(punctuation)
+    def sentence2keywords(sentence):
+        index2keyword = {}
+        for i, w in enumerate(word_tokenize(sentence)):
+            if not w.lower() in stop_words:
+                index2keyword[i] = w
+        indexes = sorted(index2keyword.keys())
+        keywords = []
+        for i, index in enumerate(indexes):
+            if i > 0 and index == indexes[i-1] + 1:
+                keywords[-1]+= ' '+index2keyword[index]
+            else:
+                keywords.append(index2keyword[index])
+        return keywords
+
+    data_splits = data_by_split.keys()
+    for data_split in data_splits:
+        data = []
+        for dial in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False):
+            persona = dial['persona']['system']
+            if args.key2gen:
+                persona_seq = ' |\n'.join([' : '.join(sentence2keywords(s)) for s in persona])
+            else:
+                persona_seq = ' | '.join(persona)
+            context = []
+            for turn in dial['turns']:
+                if turn['speaker'] == 'system':
+                    context_seq = '\n'.join([f"{t['speaker']}: {t['utterance']}" for t in context]+[f'{turn["speaker"]}: '])
+                    context_seq = f'generate a response: all knowledge:\n\n| {persona_seq} | context:\n\n{context_seq}'
+                    data.append(json.dumps({'context+persona': context_seq, 'response': turn['utterance']}, ensure_ascii=False)+'\n')
+                context.append({'speaker': turn['speaker'], 'utterance': turn['utterance']})
+
+        file_name = os.path.join(data_dir, f"{data_split}.json")
+        with open(file_name, "w", encoding='utf-8') as f:
+            f.writelines(data)
+        data_by_split[data_split] = data
+    return data_by_split
+
+def create_wow_data(dataset, data_dir, args):
+    data_by_split = dataset
+    os.makedirs(data_dir, exist_ok=True)
+
+    stop_words = set(stopwords.words('english')) | set(punctuation)
+    def sentence2keywords(sentence):
+        index2keyword = {}
+        for i, w in enumerate(word_tokenize(sentence)):
+            if not w.lower() in stop_words:
+                index2keyword[i] = w
+        indexes = sorted(index2keyword.keys())
+        keywords = []
+        for i, index in enumerate(indexes):
+            if i > 0 and index == indexes[i-1] + 1:
+                keywords[-1]+= ' '+index2keyword[index]
+            else:
+                keywords.append(index2keyword[index])
+        return keywords
+
+    def sentences2keywords_seq(sentences):
+        return ' |\n'.join([' : '.join(sentence2keywords(sentence)) for sentence in sent_tokenize(sentences)])
+
+
+    data_splits = data_by_split.keys()
+    for data_split in data_splits:
+        data = []
+        for dial in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False):
+            context = []
+            for turn in dial['turns']:
+                if turn['speaker'] == 'system':
+                    if turn['checked_sentence']:
+                        if args.key2gen:
+                            know_seq = f" | {sentences2keywords_seq(turn['checked_sentence'])} |"
+                        else:
+                            know_seq = turn['checked_sentence']
+                    else:
+                        know_seq = ''
+                    context_seq = '\n'.join([f"{t['speaker']}: {t['utterance']}" for t in context]+[f'{turn["speaker"]}: '])
+                    context_seq = f'generate a response: grounded knowledge:\n\n{know_seq} context:\n\n{context_seq}'
+                    data.append(json.dumps({'context+knowledge': context_seq, 'response': turn['utterance']}, ensure_ascii=False)+'\n')
+                context.append({'speaker': turn['speaker'], 'utterance': turn['utterance']})
+
+        file_name = os.path.join(data_dir, f"{data_split}.json")
+        with open(file_name, "w", encoding='utf-8') as f:
+            f.writelines(data)
+        data_by_split[data_split] = data
+    return data_by_split
 
 def get_max_len(data_by_split, tokenizer):
     for data_split in data_by_split.keys():
@@ -104,7 +242,7 @@
 if __name__ == '__main__':
     from argparse import ArgumentParser
     parser = ArgumentParser(description="create data for seq2seq training")
-    parser.add_argument('--tasks', '-t', metavar='task_name', nargs='*', choices=['nlg', 'dart'], help='names of tasks')
+    parser.add_argument('--tasks', '-t', metavar='task_name', nargs='*', choices=['nlg', 'dart', 'commongen', 'kvret', 'personachat', 'wow'], help='names of tasks')
     parser.add_argument('--datasets', '-d', metavar='dataset_name', nargs='*', help='names of unified datasets')
     parser.add_argument('--speaker', '-s', type=str, choices=['user', 'system', 'all'], help='speaker(s)')
     parser.add_argument('--context_window_size', '-c', type=int, default=0, help='how many contextual utterances are considered')
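For reference, sentence2keywords (defined identically inside create_personachat_data and create_wow_data above) keeps the non-stopword, non-punctuation tokens of a sentence and merges adjacent survivors into multi-word keywords. Below is a minimal standalone sketch of the same logic; the example sentence is invented, and the NLTK punkt and stopwords data are assumed to be downloaded:

    from string import punctuation
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize

    stop_words = set(stopwords.words('english')) | set(punctuation)

    def sentence2keywords(sentence):
        # remember the position of every token that survives the stopword filter
        index2keyword = {i: w for i, w in enumerate(word_tokenize(sentence))
                         if w.lower() not in stop_words}
        keywords, prev = [], None
        for index in sorted(index2keyword):
            if prev is not None and index == prev + 1:
                keywords[-1] += ' ' + index2keyword[index]  # adjacent tokens form one phrase
            else:
                keywords.append(index2keyword[index])
            prev = index
        return keywords

    # hypothetical input, not taken from any of the datasets above
    print(sentence2keywords("i love watching science fiction movies on weekends"))
    # -> ['love watching science fiction movies', 'weekends']

Joining each sentence's keywords with ' : ' and separating sentences with ' |\n' yields exactly the persona_seq / know_seq strings built above.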
"]},{"cell_type":"code","execution_count":51,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["0.01 1\n","{'bleu-1': 24.322560358946276, 'bleu-2': 13.03630111937752, 'bleu-3': 7.43647978674912, 'bleu-4': 4.450365738541082, 'unigram f1': 0.20101056184593705, 'unigram f1 (non-stop words)': 0.09881569367818614, 'rouge1': 21.359332522961864, 'rouge2': 6.532120354812852, 'rougeL': 19.76437990594138}\n"]}],"source":["for ratio in [0.01]:\n"," for order in [1]:\n"," origin = read_jsonline(f'output/personachat/personachat/generated_predictions.json')\n"," score = metric.compute(predictions=[d['predictions'] for d in origin], references=[d['response'] for d in origin])\n"," print(ratio, order)\n"," print(score)\n"," "]},{"cell_type":"code","execution_count":4,"metadata":{},"outputs":[],"source":["from datasets import load_metric"]},{"cell_type":"code","execution_count":7,"metadata":{},"outputs":[],"source":["metric = load_metric('metric.py')"]},{"cell_type":"code","execution_count":58,"metadata":{},"outputs":[{"data":{"text/plain":["{'bleu-1': 47.9848465486215,\n"," 'bleu-2': 37.18000679532912,\n"," 'bleu-3': 29.346646172092814,\n"," 'bleu-4': 23.410526740211363,\n"," 'unigram f1': 0.4999850046010773,\n"," 'unigram f1 (non-stop words)': 0.5150265227462978,\n"," 'rouge1': 50.536642578692195,\n"," 'rouge2': 33.10681789367832,\n"," 'rougeL': 46.84702913163778,\n"," 'meteor': 0.4641962079490068}"]},"execution_count":58,"metadata":{},"output_type":"execute_result"}],"source":["metric.compute(predictions=[d['predictions'] for d in key2gen], references=[d['response'] for d in key2gen])"]},{"cell_type":"code","execution_count":8,"metadata":{},"outputs":[{"data":{"text/plain":["{'bleu-1': 37.570099942714585,\n"," 'bleu-2': 26.77393964962893,\n"," 'bleu-3': 21.115954644820572,\n"," 'bleu-4': 17.513316671216046,\n"," 'unigram f1': 0.3656930567072274,\n"," 'unigram f1 (non-stop words)': 0.36456219281235724,\n"," 'rouge1': 39.1982724920493,\n"," 'rouge2': 20.825159884632743,\n"," 'rougeL': 34.98278542180112,\n"," 'meteor': 0.3405671227693821,\n"," 'distinct-1': 0.07838670580160921,\n"," 'distinct-2': 0.29689084413659694}"]},"execution_count":8,"metadata":{},"output_type":"execute_result"}],"source":["metric.compute(predictions=[d['predictions'] for d in origin], references=[d['response'] for d in origin])"]},{"cell_type":"code","execution_count":34,"metadata":{},"outputs":[{"data":{"text/plain":["{'bleu-1': 47.9848465486215,\n"," 'bleu-2': 37.18000679532912,\n"," 'bleu-3': 29.346646172092814,\n"," 'bleu-4': 23.410526740211363,\n"," 'unigram f1': 0.4999850046010773,\n"," 'unigram f1 (non-stop words)': 0.5150265227462978,\n"," 'rouge1': AggregateScore(low=Score(precision=0.5301926525013549, recall=0.4821419251082986, fmeasure=0.48565655175230005), mid=Score(precision=0.5513392693168799, recall=0.50235850981064, fmeasure=0.5053664257869219), high=Score(precision=0.5760132731228504, recall=0.5268580272115051, fmeasure=0.5279111393835526)),\n"," 'rouge2': AggregateScore(low=Score(precision=0.34772127155901306, recall=0.30411953889228, fmeasure=0.31029658993105447), mid=Score(precision=0.3696898381097765, recall=0.32612705034192035, fmeasure=0.3310681789367832), high=Score(precision=0.3947745596965405, recall=0.34880792116864995, fmeasure=0.35356317521641434)),\n"," 'rougeL': AggregateScore(low=Score(precision=0.4874189522136045, recall=0.4413343070361347, fmeasure=0.4464463084888409), mid=Score(precision=0.5108530997712726, recall=0.4642203560120527, fmeasure=0.46847029131637785), 
high=Score(precision=0.5350154077389535, recall=0.4855131911095939, fmeasure=0.4899950876629784)),\n"," 'rougeLsum': AggregateScore(low=Score(precision=0.4871840444049138, recall=0.44081531444183386, fmeasure=0.44514075751478493), mid=Score(precision=0.5105975305923949, recall=0.4639265647317744, fmeasure=0.46779186414456864), high=Score(precision=0.5348015149575474, recall=0.48693312722760357, fmeasure=0.4918651382986408))}"]},"execution_count":34,"metadata":{},"output_type":"execute_result"}],"source":["metric.compute(predictions=[d['predictions'] for d in key2gen], references=[d['response'] for d in key2gen])"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":[]}],"metadata":{"interpreter":{"hash":"0f9333403d680bc010aa5ce5a2f27ba398c9e47e92ba3724506306aa234cd07d"},"kernelspec":{"display_name":"Python 3.8.12 ('py38')","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.8.12"},"orig_nbformat":4},"nbformat":4,"nbformat_minor":2} diff --git a/convlab2/base_models/t5/key2gen/metric.py b/convlab2/base_models/t5/key2gen/metric.py new file mode 100644 index 0000000000000000000000000000000000000000..050db04d120383bece5bfd5e9cb3362f66c8ecac --- /dev/null +++ b/convlab2/base_models/t5/key2gen/metric.py @@ -0,0 +1,188 @@ +# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""NLG Metric""" + +import datasets +from sacrebleu.metrics import BLEU +from sacrebleu.utils import sum_of_lists +import re +from collections import Counter +import numpy as np +from nltk.corpus import stopwords +from nltk import sent_tokenize +from rouge_score import rouge_scorer, scoring +from nltk.translate import meteor_score +from datasets.config import importlib_metadata, version + + +NLTK_VERSION = version.parse(importlib_metadata.version("nltk")) +if NLTK_VERSION >= version.Version("3.6.5"): + from nltk import word_tokenize + + +# TODO: Add BibTeX citation +_CITATION = """\ +@inproceedings{post-2018-call, + title = "A Call for Clarity in Reporting {BLEU} Scores", + author = "Post, Matt", + booktitle = "Proceedings of the Third Conference on Machine Translation: Research Papers", + month = oct, + year = "2018", + address = "Belgium, Brussels", + publisher = "Association for Computational Linguistics", + url = "https://www.aclweb.org/anthology/W18-6319", + pages = "186--191", +} +""" + +_DESCRIPTION = """\ +Metric to evaluate text-to-text models on the natural language generation task. +""" + +_KWARGS_DESCRIPTION = """ +Calculates corpus-bleu4 +Args: + predictions: list of predictions to score. Each predictions + should be a string. + references: list of reference for each prediction. Each + reference should be a string. 
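The notebook above relies on the custom metric script added in the next file; outside Jupyter, the same evaluation can be reproduced in a few lines. The output path below is illustrative; any generated_predictions.json (or merged test file) written by run_seq2seq.py with 'predictions' and 'response' fields works:

    import json
    from datasets import load_metric

    def read_jsonline(path):
        return [json.loads(line) for line in open(path)]

    data = read_jsonline('output/wow/key2gen_wow/test_unseen.json')  # assumed path
    metric = load_metric('metric.py')
    score = metric.compute(predictions=[d['predictions'] for d in data],
                           references=[d['response'] for d in data])
    print(score)  # bleu-1/2/3/4, unigram f1, rouge1/2/L, meteor, distinct-1/2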
diff --git a/convlab2/base_models/t5/key2gen/metric.py b/convlab2/base_models/t5/key2gen/metric.py
new file mode 100644
index 0000000000000000000000000000000000000000..050db04d120383bece5bfd5e9cb3362f66c8ecac
--- /dev/null
+++ b/convlab2/base_models/t5/key2gen/metric.py
@@ -0,0 +1,188 @@
+# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""NLG Metric"""
+
+import datasets
+from sacrebleu.metrics import BLEU
+from sacrebleu.utils import sum_of_lists
+import re
+from collections import Counter
+import numpy as np
+from nltk.corpus import stopwords
+from nltk import sent_tokenize
+from rouge_score import rouge_scorer, scoring
+from nltk.translate import meteor_score
+from datasets.config import importlib_metadata, version
+
+
+NLTK_VERSION = version.parse(importlib_metadata.version("nltk"))
+if NLTK_VERSION >= version.Version("3.6.5"):
+    from nltk import word_tokenize
+
+
+_CITATION = """\
+@inproceedings{post-2018-call,
+    title = "A Call for Clarity in Reporting {BLEU} Scores",
+    author = "Post, Matt",
+    booktitle = "Proceedings of the Third Conference on Machine Translation: Research Papers",
+    month = oct,
+    year = "2018",
+    address = "Belgium, Brussels",
+    publisher = "Association for Computational Linguistics",
+    url = "https://www.aclweb.org/anthology/W18-6319",
+    pages = "186--191",
+}
+"""
+
+_DESCRIPTION = """\
+Metric to evaluate text-to-text models on the natural language generation task.
+"""
+
+_KWARGS_DESCRIPTION = """
+Calculates corpus BLEU-1/2/3/4, unigram F1, ROUGE-1/2/L, METEOR, and distinct-1/2.
+Args:
+    predictions: list of predictions to score. Each prediction
+        should be a string.
+    references: list of references, one for each prediction. Each
+        reference should be a string.
+Returns:
+    a dict mapping each metric name to its corpus-level score: 'bleu-1'..'bleu-4',
+    'unigram f1', 'unigram f1 (non-stop words)', 'rouge1', 'rouge2', 'rougeL',
+    'meteor', 'distinct-1', 'distinct-2'
+Examples:
+
+    >>> nlg_metric = datasets.load_metric("metric.py")
+    >>> predictions = ["hello there general kenobi", "foo bar foobar"]
+    >>> references = ["hello there kenobi", "foo bar foobar"]
+    >>> results = nlg_metric.compute(predictions=predictions, references=references)
+    >>> sorted(results.keys())[:4]
+    ['bleu-1', 'bleu-2', 'bleu-3', 'bleu-4']
+"""
+
+
+@datasets.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
+class NLGMetrics(datasets.Metric):
+    """Metric to evaluate text-to-text models on the natural language generation task."""
+    def _info(self):
+        return datasets.MetricInfo(
+            description=_DESCRIPTION,
+            citation=_CITATION,
+            inputs_description=_KWARGS_DESCRIPTION,
+            # This defines the format of each prediction and reference
+            features=datasets.Features({
+                'predictions': datasets.Value('string'),
+                'references': datasets.Value('string'),
+            })
+        )
+
+    # def _download_and_prepare(self, dl_manager):
+    #     import nltk
+    #     nltk.download("wordnet")
+    #     if NLTK_VERSION >= version.Version("3.6.5"):
+    #         nltk.download("punkt")
+    #     if NLTK_VERSION >= version.Version("3.6.6"):
+    #         nltk.download("omw-1.4")
+
+    def _compute(self, predictions, references):
+        """Returns the scores: bleu-1/2/3/4, unigram f1, rouge-1/2/L, meteor, distinct-1/2"""
+        metrics = {}
+        # bleu
+        bleu = BLEU(lowercase=True, force=False, tokenize=BLEU.TOKENIZER_DEFAULT, smooth_method='exp', smooth_value=None, effective_order=False)
+        stats = sum_of_lists(bleu._extract_corpus_statistics(predictions, [references]))
+        for n in range(1,5):
+            metrics[f'bleu-{n}'] = bleu.compute_bleu(
+                correct=stats[2: 2 + bleu.max_ngram_order],
+                total=stats[2 + bleu.max_ngram_order:],
+                sys_len=int(stats[0]), ref_len=int(stats[1]),
+                smooth_method=bleu.smooth_method, smooth_value=bleu.smooth_value,
+                effective_order=bleu.effective_order,
+                max_ngram_order=n).score
+
+        # unigram f1
+        re_art = re.compile(r'\b(a|an|the)\b')
+        re_punc = re.compile(r'[!"#$%&()*+,-./:;<=>?@\[\]\\^`{|}~_\']')
+        stop_words = set(stopwords.words('english'))
+        def utt2words(s):
+            """Lower text and remove punctuation, articles and extra whitespace."""
+            s = s.lower()
+            s = re_punc.sub(' ', s)
+            s = re_art.sub(' ', s)
+            return s.split()
+
+        metrics['unigram f1'] = []
+        metrics['unigram f1 (non-stop words)'] = []
+        for prediction, reference in zip(predictions, references):
+            pred_items = utt2words(prediction)
+            gold_items = utt2words(reference)
+            for remove_stopwords in [False, True]:
+                if remove_stopwords:
+                    pred_items = [w for w in pred_items if w not in stop_words]
+                    gold_items = [w for w in gold_items if w not in stop_words]
+                common = Counter(pred_items) & Counter(gold_items)
+                num_same = sum(common.values())
+                if num_same == 0:
+                    f1 = 0
+                else:
+                    precision = 1.0 * num_same / len(pred_items)
+                    recall = 1.0 * num_same / len(gold_items)
+                    f1 = (2 * precision * recall) / (precision + recall)
+                if not remove_stopwords:
+                    metrics['unigram f1'].append(f1)
+                else:
+                    metrics['unigram f1 (non-stop words)'].append(f1)
+        metrics['unigram f1'] = np.mean(metrics['unigram f1'])
+        metrics['unigram f1 (non-stop words)'] = np.mean(metrics['unigram f1 (non-stop words)'])
+
+        # rouge-1/2/L-fmeasure
+        rouge_types=["rouge1", "rouge2", "rougeL"]
+        scorer = rouge_scorer.RougeScorer(rouge_types=rouge_types, use_stemmer=True)
+        aggregator = scoring.BootstrapAggregator()
+
+        for prediction, reference in zip(predictions, references):
+            score = scorer.score(reference, prediction)
+            aggregator.add_scores(score)
+
+        result = aggregator.aggregate()
+        metrics.update({key: value.mid.fmeasure * 100 for key, value in result.items()})
+
+        # meteor
+        alpha=0.9
+        beta=3
+        gamma=0.5
+        if NLTK_VERSION >= version.Version("3.6.5"):
+            scores = [
+                meteor_score.single_meteor_score(
+                    word_tokenize(ref), word_tokenize(pred), alpha=alpha, beta=beta, gamma=gamma
+                )
+                for ref, pred in zip(references, predictions)
+            ]
+        else:
+            scores = [
+                meteor_score.single_meteor_score(ref, pred, alpha=alpha, beta=beta, gamma=gamma)
+                for ref, pred in zip(references, predictions)
+            ]
+        metrics.update({"meteor": np.mean(scores)})
+
+        # inter-distinct-1/2 (distinct n-grams across the whole prediction set)
+        def _ngram(seq, n):
+            for i in range(len(seq) - n + 1):
+                yield tuple(seq[i : i + n])
+
+        for k in [1, 2]:
+            inter_cnt = Counter()
+            for prediction in predictions:
+                ngram = Counter(_ngram(utt2words(prediction), k))
+                inter_cnt += ngram
+            metrics[f'distinct-{k}'] = max(len(inter_cnt), 1e-12) / max(sum(inter_cnt.values()), 1e-5)
+
+        return metrics
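As a sanity check on the unigram F1 definition in _compute above (token overlap after lowercasing and stripping punctuation and articles), here is the arithmetic for one hand-picked prediction/reference pair:

    from collections import Counter

    pred_items = ['hello', 'there', 'general', 'kenobi']  # utt2words("hello there general kenobi")
    gold_items = ['hello', 'there', 'kenobi']             # utt2words("hello there kenobi")
    common = Counter(pred_items) & Counter(gold_items)    # hello, there, kenobi
    num_same = sum(common.values())                       # 3
    precision = num_same / len(pred_items)                # 0.75
    recall = num_same / len(gold_items)                   # 1.0
    f1 = 2 * precision * recall / (precision + recall)    # ~0.857
    print(round(f1, 3))

The corpus-level 'unigram f1' reported by the metric is the mean of this per-pair score.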
diff --git a/convlab2/base_models/t5/key2gen/run.sh b/convlab2/base_models/t5/key2gen/run.sh
new file mode 100644
index 0000000000000000000000000000000000000000..74e44434ea1ab9e0099e680f2862521f7063049b
--- /dev/null
+++ b/convlab2/base_models/t5/key2gen/run.sh
@@ -0,0 +1,7 @@
+for ratio in 0.1 0.01
+do
+    for dial_ids_order in 0 1 2
+    do
+        bash run_persona_fewshot_key2gen.sh ${ratio} ${dial_ids_order}
+    done
+done
\ No newline at end of file
diff --git a/convlab2/base_models/t5/key2gen/run2.sh b/convlab2/base_models/t5/key2gen/run2.sh
new file mode 100644
index 0000000000000000000000000000000000000000..7e7bf5d6a75e72d6c236fbf738a8c7d010f0e18b
--- /dev/null
+++ b/convlab2/base_models/t5/key2gen/run2.sh
@@ -0,0 +1,7 @@
+for ratio in 0.1 0.01
+do
+    for dial_ids_order in 0 1 2
+    do
+        bash run_wow_fewshot_key2gen.sh ${ratio} ${dial_ids_order}
+    done
+done
\ No newline at end of file
diff --git a/convlab2/base_models/t5/key2gen/run_commongen.sh b/convlab2/base_models/t5/key2gen/run_commongen.sh
new file mode 100644
index 0000000000000000000000000000000000000000..5952725709afa1848695158b76f04aa81ff328a6
--- /dev/null
+++ b/convlab2/base_models/t5/key2gen/run_commongen.sh
@@ -0,0 +1,85 @@
+n_gpus=1
+task_name="commongen"
+dataset_name="commongen"
+speaker="system"
+context_window_size=0
+data_dir="data/${task_name}/${dataset_name}"
+output_dir="output/${task_name}/${dataset_name}"
+cache_dir="../cache"
+logging_dir="${output_dir}/runs"
+train_file="${data_dir}/train.json"
+validation_file="${data_dir}/validation.json"
+test_file="${data_dir}/validation.json"
+metric_name_or_path="../nlg/nlg_metric.py"
+metric_for_best_model="bleu"
+source_column="concepts"
+target_column="text"
+truncation_side="left"
+max_source_length=512
+max_target_length=512
+model_name_or_path="t5-small"
+per_device_train_batch_size=128
+per_device_eval_batch_size=64
+gradient_accumulation_steps=4
+lr=1e-3
+num_train_epochs=10
+
+python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -s ${speaker} -c ${context_window_size}
+
+python ../run_seq2seq.py \
+    --task_name ${task_name} \
+    --train_file ${train_file} \
+    --validation_file ${validation_file} \
+    --source_column ${source_column} \
+    --target_column ${target_column} \
+    --max_source_length ${max_source_length} \
+    --max_target_length ${max_target_length} \
+    --truncation_side ${truncation_side} \
+    --model_name_or_path ${model_name_or_path} \
+    --do_train \
+    --do_eval \
+    --save_strategy epoch \
+    --evaluation_strategy epoch \
+    --save_total_limit 3 \
+    --prediction_loss_only \
+    --cache_dir ${cache_dir} \
+    --output_dir ${output_dir} \
+    --logging_dir ${logging_dir} \
+    --overwrite_output_dir \
+    --preprocessing_num_workers 4 \
+    --per_device_train_batch_size ${per_device_train_batch_size} \
+    --per_device_eval_batch_size ${per_device_eval_batch_size} \
+    --gradient_accumulation_steps ${gradient_accumulation_steps} \
+    --learning_rate ${lr} \
+    --num_train_epochs ${num_train_epochs} \
+    --adafactor \
+    --gradient_checkpointing
+
+python ../run_seq2seq.py \
+    --task_name ${task_name} \
+    --test_file ${test_file} \
+    --source_column ${source_column} \
+    --target_column ${target_column} \
+    --max_source_length ${max_source_length} \
+    --max_target_length ${max_target_length} \
+    --truncation_side ${truncation_side} \
+    --model_name_or_path ${output_dir} \
+    --do_predict \
+    --predict_with_generate \
+    --metric_name_or_path ${metric_name_or_path} \
+    --cache_dir ${cache_dir} \
+    --output_dir ${output_dir} \
+    --logging_dir ${logging_dir} \
+    --overwrite_output_dir \
+    --preprocessing_num_workers 4 \
+    --per_device_train_batch_size ${per_device_train_batch_size} \
+    --per_device_eval_batch_size ${per_device_eval_batch_size} \
+    --gradient_accumulation_steps ${gradient_accumulation_steps} \
+    --learning_rate ${lr} \
+    --num_train_epochs ${num_train_epochs} \
+    --adafactor \
+    --gradient_checkpointing
+
+python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
+
+python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
diff --git a/convlab2/base_models/t5/key2gen/run_commongen_key2gen.sh b/convlab2/base_models/t5/key2gen/run_commongen_key2gen.sh
new file mode 100644
index 0000000000000000000000000000000000000000..e09d3b464e2685dd8e0d774539f8c8dc4808f99c
--- /dev/null
+++ b/convlab2/base_models/t5/key2gen/run_commongen_key2gen.sh
@@ -0,0 +1,88 @@
+n_gpus=1
+task_name="commongen"
+dataset_name="commongen"
+speaker="system"
+context_window_size=0
+data_dir="data/${task_name}/key2gen_${dataset_name}"
+output_dir="output/${task_name}/key2gen_${dataset_name}"
+cache_dir="../cache"
+logging_dir="${output_dir}/runs"
+train_file="${data_dir}/train.json"
+validation_file="${data_dir}/validation.json"
+test_file="${data_dir}/validation.json"
+metric_name_or_path="../nlg/nlg_metric.py"
+metric_for_best_model="bleu"
+source_column="concepts"
+target_column="text"
+source_prefix="keywords: "
+truncation_side="left"
+max_source_length=512
+max_target_length=512
+model_name_or_path="../../gpt/keyword_extraction/output/key2gen_shuffle_noisy/gpt/dailydialog+metalwoz+sgd+tm1+tm2+tm3"
+per_device_train_batch_size=128
+per_device_eval_batch_size=64
+gradient_accumulation_steps=4
+lr=1e-3
+num_train_epochs=10
+
+python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -s ${speaker} -c ${context_window_size} --key2gen
+
+python ../run_seq2seq.py \
+    --task_name ${task_name} \
+    --train_file ${train_file} \
+    --validation_file ${validation_file} \
+    --source_column ${source_column} \
+    --target_column ${target_column} \
+    --source_prefix "${source_prefix}" \
+    --max_source_length ${max_source_length} \
+    --max_target_length ${max_target_length} \
+    --truncation_side ${truncation_side} \
+    --model_name_or_path ${model_name_or_path} \
+    --do_train \
+    --do_eval \
+    --save_strategy epoch \
+    --evaluation_strategy epoch \
+    --save_total_limit 3 \
+    --prediction_loss_only \
+    --cache_dir ${cache_dir} \
+    --output_dir ${output_dir} \
+    --logging_dir ${logging_dir} \
+    --overwrite_output_dir \
+    --preprocessing_num_workers 4 \
+    --per_device_train_batch_size ${per_device_train_batch_size} \
+    --per_device_eval_batch_size ${per_device_eval_batch_size} \
+    --gradient_accumulation_steps ${gradient_accumulation_steps} \
+    --learning_rate ${lr} \
+    --num_train_epochs ${num_train_epochs} \
+    --adafactor \
+    --gradient_checkpointing
+
+python ../run_seq2seq.py \
+    --task_name ${task_name} \
+    --test_file ${test_file} \
+    --source_column ${source_column} \
+    --target_column ${target_column} \
+    --source_prefix "${source_prefix}" \
+    --max_source_length ${max_source_length} \
+    --max_target_length ${max_target_length} \
+    --truncation_side ${truncation_side} \
+    --model_name_or_path ${output_dir} \
+    --do_predict \
+    --predict_with_generate \
+    --metric_name_or_path ${metric_name_or_path} \
+    --cache_dir ${cache_dir} \
+    --output_dir ${output_dir} \
+    --logging_dir ${logging_dir} \
+    --overwrite_output_dir \
+    --preprocessing_num_workers 4 \
+    --per_device_train_batch_size ${per_device_train_batch_size} \
+    --per_device_eval_batch_size ${per_device_eval_batch_size} \
+    --gradient_accumulation_steps ${gradient_accumulation_steps} \
+    --learning_rate ${lr} \
+    --num_train_epochs ${num_train_epochs} \
+    --adafactor \
+    --gradient_checkpointing
+
+python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
+
+python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
diff --git a/convlab2/base_models/t5/key2gen/run_dart.sh b/convlab2/base_models/t5/key2gen/run_dart.sh
index 6c346408c18c3ead330018cae58effd97fb45265..62eb3236f0c468c6b185eb9c5c39e807e3a92800 100644
--- a/convlab2/base_models/t5/key2gen/run_dart.sh
+++ b/convlab2/base_models/t5/key2gen/run_dart.sh
@@ -3,7 +3,7 @@ task_name="dart"
 dataset_name="dart"
 speaker="system"
 context_window_size=0
-data_dir="data/${task_name}/${dataset_name}"
+data_dir="data/${task_name}/key2gen_${dataset_name}"
 output_dir="output/${task_name}/${dataset_name}"
 cache_dir="../cache"
 logging_dir="${output_dir}/runs"
@@ -24,36 +24,36 @@ gradient_accumulation_steps=4
 lr=1e-3
 num_train_epochs=10
 
-python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -s ${speaker} -c ${context_window_size}
+# python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -s ${speaker} -c ${context_window_size}
 
-python ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --train_file ${train_file} \
-    --validation_file ${validation_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --max_source_length ${max_source_length} \
-    --max_target_length ${max_target_length} \
-    --truncation_side ${truncation_side} \
-    --model_name_or_path ${model_name_or_path} \
-    --do_train \
-    --do_eval \
-    --save_strategy epoch \
-    --evaluation_strategy epoch \
-    --save_total_limit 3 \
-    --prediction_loss_only \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_train_batch_size ${per_device_train_batch_size} \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
-    --gradient_accumulation_steps ${gradient_accumulation_steps} \
-    --learning_rate ${lr} \
-    --num_train_epochs ${num_train_epochs} \
-    --adafactor \
-    --gradient_checkpointing
+# python ../run_seq2seq.py \
+#     --task_name ${task_name} \
+#     --train_file ${train_file} \
+#     --validation_file ${validation_file} \
+#     --source_column ${source_column} \
+#     --target_column ${target_column} \
+#     --max_source_length ${max_source_length} \
+#     --max_target_length ${max_target_length} \
+#     --truncation_side ${truncation_side} \
+#     --model_name_or_path ${model_name_or_path} \
+#     --do_train \
+#     --do_eval \
+#     --save_strategy epoch \
+#     --evaluation_strategy epoch \
+#     --save_total_limit 3 \
+#     --prediction_loss_only \
+#     --cache_dir ${cache_dir} \
+#     --output_dir ${output_dir} \
+#     --logging_dir ${logging_dir} \
+#     --overwrite_output_dir \
+#     --preprocessing_num_workers 4 \
+#     --per_device_train_batch_size ${per_device_train_batch_size} \
+#     --per_device_eval_batch_size ${per_device_eval_batch_size} \
+#     --gradient_accumulation_steps ${gradient_accumulation_steps} \
+#     --learning_rate ${lr} \
+#     --num_train_epochs ${num_train_epochs} \
+#     --adafactor \
+#     --gradient_checkpointing
 
 python ../run_seq2seq.py \
     --task_name ${task_name} \
@@ -63,7 +63,7 @@ python ../run_seq2seq.py \
     --max_source_length ${max_source_length} \
     --max_target_length ${max_target_length} \
     --truncation_side ${truncation_side} \
-    --model_name_or_path ${output_dir} \
+    --model_name_or_path ${model_name_or_path} \
     --do_predict \
     --predict_with_generate \
     --metric_name_or_path ${metric_name_or_path} \
@@ -80,6 +80,6 @@ python ../run_seq2seq.py \
     --adafactor \
     --gradient_checkpointing
 
-python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
+# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
 
-python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
+# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
diff --git a/convlab2/base_models/t5/key2gen/run_dart_key2gen.sh b/convlab2/base_models/t5/key2gen/run_dart_key2gen.sh
index 8f539f469d58ff169ad7a6c71f0a38a894acf566..b729da6fdf27ed61dad7a2a48120fbbbaef49afc 100644
--- a/convlab2/base_models/t5/key2gen/run_dart_key2gen.sh
+++ b/convlab2/base_models/t5/key2gen/run_dart_key2gen.sh
@@ -3,7 +3,7 @@ task_name="dart"
 dataset_name="dart"
 speaker="system"
 context_window_size=0
-data_dir="data/${task_name}/key2gen_${dataset_name}"
+data_dir="data/${task_name}/${dataset_name}"
 output_dir="output/${task_name}/key2gen_${dataset_name}"
 cache_dir="../cache"
 logging_dir="${output_dir}/runs"
@@ -14,56 +14,59 @@ metric_name_or_path="../nlg/nlg_metric.py"
 metric_for_best_model="bleu"
 source_column="triples"
 target_column="text"
+source_prefix="possible keywords: "
 truncation_side="left"
 max_source_length=512
 max_target_length=512
-model_name_or_path="t5-small"
+model_name_or_path="../../gpt/keyword_extraction/output/key2gen_shuffle_noisy/gpt/dailydialog+metalwoz+sgd+tm1+tm2+tm3"
 per_device_train_batch_size=128
 per_device_eval_batch_size=64
 gradient_accumulation_steps=4
 lr=1e-3
 num_train_epochs=10
 
-python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -s ${speaker} -c ${context_window_size} --key2gen
+# python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -s ${speaker} -c ${context_window_size} --key2gen
 
-python ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --train_file ${train_file} \
-    --validation_file ${validation_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --max_source_length ${max_source_length} \
-    --max_target_length ${max_target_length} \
-    --truncation_side ${truncation_side} \
-    --model_name_or_path ${model_name_or_path} \
-    --do_train \
-    --do_eval \
-    --save_strategy epoch \
-    --evaluation_strategy epoch \
-    --save_total_limit 3 \
-    --prediction_loss_only \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_train_batch_size ${per_device_train_batch_size} \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
-    --gradient_accumulation_steps ${gradient_accumulation_steps} \
-    --learning_rate ${lr} \
-    --num_train_epochs ${num_train_epochs} \
-    --adafactor \
-    --gradient_checkpointing
+# python ../run_seq2seq.py \
+#     --task_name ${task_name} \
+#     --train_file ${train_file} \
+#     --validation_file ${validation_file} \
+#     --source_column ${source_column} \
+#     --target_column ${target_column} \
+#     --source_prefix "${source_prefix}" \
+#     --max_source_length ${max_source_length} \
+#     --max_target_length ${max_target_length} \
+#     --truncation_side ${truncation_side} \
+#     --model_name_or_path ${model_name_or_path} \
+#     --do_train \
+#     --do_eval \
+#     --save_strategy epoch \
+#     --evaluation_strategy epoch \
+#     --save_total_limit 3 \
+#     --prediction_loss_only \
+#     --cache_dir ${cache_dir} \
+#     --output_dir ${output_dir} \
+#     --logging_dir ${logging_dir} \
+#     --overwrite_output_dir \
+#     --preprocessing_num_workers 4 \
+#     --per_device_train_batch_size ${per_device_train_batch_size} \
+#     --per_device_eval_batch_size ${per_device_eval_batch_size} \
+#     --gradient_accumulation_steps ${gradient_accumulation_steps} \
+#     --learning_rate ${lr} \
+#     --num_train_epochs ${num_train_epochs} \
+#     --adafactor \
+#     --gradient_checkpointing
 
 python ../run_seq2seq.py \
     --task_name ${task_name} \
     --test_file ${test_file} \
     --source_column ${source_column} \
     --target_column ${target_column} \
+    --source_prefix "${source_prefix}" \
     --max_source_length ${max_source_length} \
     --max_target_length ${max_target_length} \
     --truncation_side ${truncation_side} \
-    --model_name_or_path ${output_dir} \
+    --model_name_or_path ${model_name_or_path} \
     --do_predict \
     --predict_with_generate \
     --metric_name_or_path ${metric_name_or_path} \
@@ -80,6 +83,6 @@ python ../run_seq2seq.py \
     --adafactor \
     --gradient_checkpointing
 
-python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
+# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
 
-python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
+# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
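For reference, the --key2gen branch of create_dart_data above turns a DART tripleset into flat keywords plus a synthetic user prompt about the most frequent entity. A worked example with an invented tripleset (the logic mirrors the function verbatim):

    from collections import Counter

    # hypothetical DART-style tripleset, invented for illustration
    triples = [['Mars Hill College', 'JOINED', 'Conference'],
               ['Mars Hill College', 'LOCATION', 'Mars Hill, North Carolina']]
    keywords = [w for triple in triples for w in triple]
    entity_cnt = Counter()
    for e1, r, e2 in triples:
        for e in [e1, e2]:
            if e.startswith('[') and e.endswith(']'):  # skip bracketed placeholders
                continue
            entity_cnt[e] += 1
    common_entity = entity_cnt.most_common(1)[0][0]  # 'Mars Hill College' (count 2)
    context = f'{" | ".join(keywords)}\n\ncontext: user: tell me something about {common_entity}. system: '
    print(context)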
diff --git a/convlab2/base_models/t5/key2gen/run_kvret.sh b/convlab2/base_models/t5/key2gen/run_kvret.sh
new file mode 100644
index 0000000000000000000000000000000000000000..b960ed18735d804f82116b4307223b4245f2b368
--- /dev/null
+++ b/convlab2/base_models/t5/key2gen/run_kvret.sh
@@ -0,0 +1,85 @@
+n_gpus=1
+task_name="kvret"
+dataset_name="kvret"
+speaker="system"
+data_dir="data/${task_name}/${dataset_name}"
+output_dir="output/${task_name}/${dataset_name}"
+cache_dir="../cache"
+logging_dir="${output_dir}/runs"
+train_file="${data_dir}/train.json"
+validation_file="${data_dir}/validation.json"
+test_file="${data_dir}/test.json"
+metric_name_or_path="../nlg/nlg_metric.py"
+metric_for_best_model="bleu"
+source_column="context+db"
+target_column="response"
+truncation_side="left"
+max_source_length=1024
+max_target_length=512
+model_name_or_path="t5-small"
+per_device_train_batch_size=32
+per_device_eval_batch_size=64
+gradient_accumulation_steps=4
+lr=1e-3
+num_train_epochs=10
+
+python create_data_key2gen.py -t ${task_name} -d ${dataset_name}
+
+python ../run_seq2seq.py \
+    --task_name ${task_name} \
+    --train_file ${train_file} \
+    --validation_file ${validation_file} \
+    --source_column ${source_column} \
+    --target_column ${target_column} \
+    --max_source_length ${max_source_length} \
+    --max_target_length ${max_target_length} \
+    --truncation_side ${truncation_side} \
+    --model_name_or_path ${model_name_or_path} \
+    --do_train \
+    --do_eval \
+    --save_strategy epoch \
+    --evaluation_strategy epoch \
+    --save_total_limit 1 \
+    --prediction_loss_only \
+    --load_best_model_at_end \
+    --cache_dir ${cache_dir} \
+    --output_dir ${output_dir} \
+    --logging_dir ${logging_dir} \
+    --overwrite_output_dir \
+    --preprocessing_num_workers 4 \
+    --per_device_train_batch_size ${per_device_train_batch_size} \
+    --per_device_eval_batch_size ${per_device_eval_batch_size} \
+    --gradient_accumulation_steps ${gradient_accumulation_steps} \
+    --learning_rate ${lr} \
+    --num_train_epochs ${num_train_epochs} \
+    --adafactor \
+    --gradient_checkpointing
+
+python ../run_seq2seq.py \
+    --task_name ${task_name} \
+    --test_file ${test_file} \
+    --source_column ${source_column} \
+    --target_column ${target_column} \
+    --max_source_length ${max_source_length} \
+    --max_target_length ${max_target_length} \
+    --truncation_side ${truncation_side} \
+    --model_name_or_path ${output_dir} \
+    --do_predict \
+    --predict_with_generate \
+    --metric_name_or_path ${metric_name_or_path} \
+    --cache_dir ${cache_dir} \
+    --output_dir ${output_dir} \
+    --logging_dir ${logging_dir} \
+    --overwrite_output_dir \
+    --preprocessing_num_workers 4 \
+    --per_device_train_batch_size ${per_device_train_batch_size} \
+    --per_device_eval_batch_size ${per_device_eval_batch_size} \
+    --gradient_accumulation_steps ${gradient_accumulation_steps} \
+    --learning_rate ${lr} \
+    --num_train_epochs ${num_train_epochs} \
+    --adafactor \
+    --gradient_checkpointing
+
+# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
+
+# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
diff --git a/convlab2/base_models/t5/key2gen/run_kvret_fewshot.sh b/convlab2/base_models/t5/key2gen/run_kvret_fewshot.sh
new file mode 100644
index 0000000000000000000000000000000000000000..a6523d6cea506b6433f2cd853ca6255a28cfd8a6
--- /dev/null
+++ b/convlab2/base_models/t5/key2gen/run_kvret_fewshot.sh
@@ -0,0 +1,87 @@
+n_gpus=1
+task_name="kvret"
+dataset_name="kvret"
+speaker="system"
+ratio=$1
+dial_ids_order=$2
+data_dir="data/${task_name}/${dataset_name}_${ratio}_order${dial_ids_order}"
+output_dir="output/${task_name}/${dataset_name}_${ratio}_order${dial_ids_order}"
+cache_dir="../cache"
+logging_dir="${output_dir}/runs"
+train_file="${data_dir}/train.json"
+validation_file="${data_dir}/validation.json"
+test_file="${data_dir}/test.json"
+metric_name_or_path="../nlg/nlg_metric.py"
+metric_for_best_model="bleu"
+source_column="context+db"
+target_column="response"
+truncation_side="left"
+max_source_length=1024
+max_target_length=512
+model_name_or_path="t5-small"
+per_device_train_batch_size=32
+per_device_eval_batch_size=64
+gradient_accumulation_steps=4
+lr=1e-3
+num_train_epochs=100
+
+python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -r ${ratio} -o ${dial_ids_order}
+
+python ../run_seq2seq.py \
+    --task_name ${task_name} \
+    --train_file ${train_file} \
+    --validation_file ${validation_file} \
+    --source_column ${source_column} \
+    --target_column ${target_column} \
+    --max_source_length ${max_source_length} \
+    --max_target_length ${max_target_length} \
+    --truncation_side ${truncation_side} \
+    --model_name_or_path ${model_name_or_path} \
+    --do_train \
+    --do_eval \
+    --save_strategy epoch \
+    --evaluation_strategy epoch \
+    --save_total_limit 1 \
+    --prediction_loss_only \
+    --load_best_model_at_end \
+    --cache_dir ${cache_dir} \
+    --output_dir ${output_dir} \
+    --logging_dir ${logging_dir} \
+    --overwrite_output_dir \
+    --preprocessing_num_workers 4 \
+    --per_device_train_batch_size ${per_device_train_batch_size} \
+    --per_device_eval_batch_size ${per_device_eval_batch_size} \
+    --gradient_accumulation_steps ${gradient_accumulation_steps} \
+    --learning_rate ${lr} \
+    --num_train_epochs ${num_train_epochs} \
+    --adafactor \
+    --gradient_checkpointing
+
+python ../run_seq2seq.py \
+    --task_name ${task_name} \
+    --test_file ${test_file} \
+    --source_column ${source_column} \
+    --target_column ${target_column} \
+    --max_source_length ${max_source_length} \
+    --max_target_length ${max_target_length} \
+    --truncation_side ${truncation_side} \
+    --model_name_or_path ${output_dir} \
+    --do_predict \
+    --predict_with_generate \
+    --metric_name_or_path ${metric_name_or_path} \
+    --cache_dir ${cache_dir} \
+    --output_dir ${output_dir} \
+    --logging_dir ${logging_dir} \
+    --overwrite_output_dir \
+    --preprocessing_num_workers 4 \
+    --per_device_train_batch_size ${per_device_train_batch_size} \
+    --per_device_eval_batch_size ${per_device_eval_batch_size} \
+    --gradient_accumulation_steps ${gradient_accumulation_steps} \
+    --learning_rate ${lr} \
+    --num_train_epochs ${num_train_epochs} \
+    --adafactor \
+    --gradient_checkpointing
+
+# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
+
+# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
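The context+db source column consumed by the kvret scripts is built by create_kvret_data above: each database row is flattened into 'entity : key : value' items. A toy illustration of that serialization; the db_results content here is invented:

    domain2entity_col = {'schedule': 'event', 'navigate': 'poi', 'weather': 'location'}

    # hypothetical db_results for one system turn
    db_results = {'weather': [{'location': 'boston', 'monday': 'clear skies, low of 50F'}]}
    db_seqs = []
    for domain, db_items in db_results.items():
        entity_col = domain2entity_col[domain]
        for db_item in db_items:
            entity = db_item[entity_col]
            for db_key, db_value in db_item.items():
                if db_key == entity_col:  # the entity column itself is not repeated
                    continue
                db_seqs.append(' : '.join([entity, db_key, db_value]))
    db_seq = ' |\n'.join(db_seqs)
    print(f'generate a response: all knowledge:\n\n| {db_seq} | context:\n\n...')
    # -> ... | boston : monday : clear skies, low of 50F | ...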
diff --git a/convlab2/base_models/t5/key2gen/run_kvret_fewshot_key2gen.sh b/convlab2/base_models/t5/key2gen/run_kvret_fewshot_key2gen.sh
new file mode 100644
index 0000000000000000000000000000000000000000..d19ae9c1ca3890896491839c76e3d7beff9bdbfc
--- /dev/null
+++ b/convlab2/base_models/t5/key2gen/run_kvret_fewshot_key2gen.sh
@@ -0,0 +1,87 @@
+n_gpus=1
+task_name="kvret"
+dataset_name="kvret"
+speaker="system"
+ratio=$1
+dial_ids_order=$2
+data_dir="data/${task_name}/key2gen_${dataset_name}_${ratio}_order${dial_ids_order}"
+output_dir="output/${task_name}/key2gen_${dataset_name}_${ratio}_order${dial_ids_order}"
+cache_dir="../cache"
+logging_dir="${output_dir}/runs"
+train_file="${data_dir}/train.json"
+validation_file="${data_dir}/validation.json"
+test_file="${data_dir}/test.json"
+metric_name_or_path="../nlg/nlg_metric.py"
+metric_for_best_model="bleu"
+source_column="context+db"
+target_column="response"
+truncation_side="left"
+max_source_length=1024
+max_target_length=512
+model_name_or_path="../../gpt/keyword_extraction/output/key2gen_noisy/gpt/dailydialog+metalwoz+tm1+tm2+tm3"
+per_device_train_batch_size=32
+per_device_eval_batch_size=64
+gradient_accumulation_steps=4
+lr=1e-3
+num_train_epochs=100
+
+python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -r ${ratio} -o ${dial_ids_order} --key2gen
+
+python ../run_seq2seq.py \
+    --task_name ${task_name} \
+    --train_file ${train_file} \
+    --validation_file ${validation_file} \
+    --source_column ${source_column} \
+    --target_column ${target_column} \
+    --max_source_length ${max_source_length} \
+    --max_target_length ${max_target_length} \
+    --truncation_side ${truncation_side} \
+    --model_name_or_path ${model_name_or_path} \
+    --do_train \
+    --do_eval \
+    --save_strategy epoch \
+    --evaluation_strategy epoch \
+    --save_total_limit 1 \
+    --prediction_loss_only \
+    --load_best_model_at_end \
+    --cache_dir ${cache_dir} \
+    --output_dir ${output_dir} \
+    --logging_dir ${logging_dir} \
+    --overwrite_output_dir \
+    --preprocessing_num_workers 4 \
+    --per_device_train_batch_size ${per_device_train_batch_size} \
+    --per_device_eval_batch_size ${per_device_eval_batch_size} \
+    --gradient_accumulation_steps ${gradient_accumulation_steps} \
+    --learning_rate ${lr} \
+    --num_train_epochs ${num_train_epochs} \
+    --adafactor \
+    --gradient_checkpointing
+
+python ../run_seq2seq.py \
+    --task_name ${task_name} \
+    --test_file ${test_file} \
+    --source_column ${source_column} \
+    --target_column ${target_column} \
+    --max_source_length ${max_source_length} \
+    --max_target_length ${max_target_length} \
+    --truncation_side ${truncation_side} \
+    --model_name_or_path ${output_dir} \
+    --do_predict \
+    --predict_with_generate \
+    --metric_name_or_path ${metric_name_or_path} \
+    --cache_dir ${cache_dir} \
+    --output_dir ${output_dir} \
+    --logging_dir ${logging_dir} \
+    --overwrite_output_dir \
+    --preprocessing_num_workers 4 \
+    --per_device_train_batch_size ${per_device_train_batch_size} \
+    --per_device_eval_batch_size ${per_device_eval_batch_size} \
+    --gradient_accumulation_steps ${gradient_accumulation_steps} \
+    --learning_rate ${lr} \
+    --num_train_epochs ${num_train_epochs} \
+    --adafactor \
+    --gradient_checkpointing
+
+# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
+
+# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
diff --git a/convlab2/base_models/t5/key2gen/run_kvret_key2gen.sh b/convlab2/base_models/t5/key2gen/run_kvret_key2gen.sh
new file mode 100644
index 0000000000000000000000000000000000000000..44f8dbc6732f8ccd3470b8d5fd62a886898b949c
--- /dev/null
+++ b/convlab2/base_models/t5/key2gen/run_kvret_key2gen.sh
@@ -0,0 +1,85 @@
+n_gpus=1
+task_name="kvret"
+dataset_name="kvret"
+speaker="system"
+data_dir="data/${task_name}/${dataset_name}"
+output_dir="output/${task_name}/key2gen_${dataset_name}"
+cache_dir="../cache"
+logging_dir="${output_dir}/runs"
+train_file="${data_dir}/train.json"
+validation_file="${data_dir}/validation.json"
+test_file="${data_dir}/test.json"
+metric_name_or_path="../nlg/nlg_metric.py"
+metric_for_best_model="bleu"
+source_column="context+db"
+target_column="response"
+truncation_side="left"
+max_source_length=1024
+max_target_length=512
+model_name_or_path="../../gpt/keyword_extraction/output/key2gen_noisy/gpt/dailydialog+metalwoz+tm1+tm2+tm3"
+per_device_train_batch_size=32
+per_device_eval_batch_size=64
+gradient_accumulation_steps=4
+lr=1e-3
+num_train_epochs=10
+
+python create_data_key2gen.py -t ${task_name} -d ${dataset_name}
+
+python ../run_seq2seq.py \
+    --task_name ${task_name} \
+    --train_file ${train_file} \
+    --validation_file ${validation_file} \
+    --source_column ${source_column} \
+    --target_column ${target_column} \
+    --max_source_length ${max_source_length} \
+    --max_target_length ${max_target_length} \
+    --truncation_side ${truncation_side} \
+    --model_name_or_path ${model_name_or_path} \
+    --do_train \
+    --do_eval \
+    --save_strategy epoch \
+    --evaluation_strategy epoch \
+    --save_total_limit 1 \
+    --prediction_loss_only \
+    --load_best_model_at_end \
+    --cache_dir ${cache_dir} \
+    --output_dir ${output_dir} \
+    --logging_dir ${logging_dir} \
+    --overwrite_output_dir \
+    --preprocessing_num_workers 4 \
+    --per_device_train_batch_size ${per_device_train_batch_size} \
+    --per_device_eval_batch_size ${per_device_eval_batch_size} \
+    --gradient_accumulation_steps ${gradient_accumulation_steps} \
+    --learning_rate ${lr} \
+    --num_train_epochs ${num_train_epochs} \
+    --adafactor \
+    --gradient_checkpointing
+
+python ../run_seq2seq.py \
+    --task_name ${task_name} \
+    --test_file ${test_file} \
+    --source_column ${source_column} \
+    --target_column ${target_column} \
+    --max_source_length ${max_source_length} \
+    --max_target_length ${max_target_length} \
+    --truncation_side ${truncation_side} \
+    --model_name_or_path ${output_dir} \
+    --do_predict \
+    --predict_with_generate \
+    --metric_name_or_path ${metric_name_or_path} \
+    --cache_dir ${cache_dir} \
+    --output_dir ${output_dir} \
+    --logging_dir ${logging_dir} \
+    --overwrite_output_dir \
+    --preprocessing_num_workers 4 \
+    --per_device_train_batch_size ${per_device_train_batch_size} \
+    --per_device_eval_batch_size ${per_device_eval_batch_size} \
+    --gradient_accumulation_steps ${gradient_accumulation_steps} \
+    --learning_rate ${lr} \
+    --num_train_epochs ${num_train_epochs} \
+    --adafactor \
+    --gradient_checkpointing
+
+# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
+
+# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
\ +# --train_file ${train_file} \ +# --validation_file ${validation_file} \ +# --source_column ${source_column} \ +# --target_column ${target_column} \ +# --max_source_length ${max_source_length} \ +# --max_target_length ${max_target_length} \ +# --truncation_side ${truncation_side} \ +# --model_name_or_path ${model_name_or_path} \ +# --do_train \ +# --do_eval \ +# --save_strategy epoch \ +# --evaluation_strategy epoch \ +# --save_total_limit 1 \ +# --prediction_loss_only \ +# --load_best_model_at_end \ +# --cache_dir ${cache_dir} \ +# --output_dir ${output_dir} \ +# --logging_dir ${logging_dir} \ +# --overwrite_output_dir \ +# --preprocessing_num_workers 4 \ +# --per_device_train_batch_size ${per_device_train_batch_size} \ +# --per_device_eval_batch_size ${per_device_eval_batch_size} \ +# --gradient_accumulation_steps ${gradient_accumulation_steps} \ +# --learning_rate ${lr} \ +# --num_train_epochs ${num_train_epochs} \ +# --adafactor \ +# --gradient_checkpointing -python ../run_seq2seq.py \ - --task_name ${task_name} \ - --test_file ${test_file} \ - --source_column ${source_column} \ - --target_column ${target_column} \ - --max_source_length ${max_source_length} \ - --max_target_length ${max_target_length} \ - --truncation_side ${truncation_side} \ - --model_name_or_path ${output_dir} \ - --do_predict \ - --predict_with_generate \ - --metric_name_or_path ${metric_name_or_path} \ - --cache_dir ${cache_dir} \ - --output_dir ${output_dir} \ - --logging_dir ${logging_dir} \ - --overwrite_output_dir \ - --preprocessing_num_workers 4 \ - --per_device_train_batch_size ${per_device_train_batch_size} \ - --per_device_eval_batch_size ${per_device_eval_batch_size} \ - --gradient_accumulation_steps ${gradient_accumulation_steps} \ - --learning_rate ${lr} \ - --num_train_epochs ${num_train_epochs} \ - --adafactor \ - --gradient_checkpointing +# python ../run_seq2seq.py \ +# --task_name ${task_name} \ +# --test_file ${test_file} \ +# --source_column ${source_column} \ +# --target_column ${target_column} \ +# --max_source_length ${max_source_length} \ +# --max_target_length ${max_target_length} \ +# --truncation_side ${truncation_side} \ +# --model_name_or_path ${output_dir} \ +# --do_predict \ +# --predict_with_generate \ +# --metric_name_or_path ${metric_name_or_path} \ +# --cache_dir ${cache_dir} \ +# --output_dir ${output_dir} \ +# --logging_dir ${logging_dir} \ +# --overwrite_output_dir \ +# --preprocessing_num_workers 4 \ +# --per_device_train_batch_size ${per_device_train_batch_size} \ +# --per_device_eval_batch_size ${per_device_eval_batch_size} \ +# --gradient_accumulation_steps ${gradient_accumulation_steps} \ +# --learning_rate ${lr} \ +# --num_train_epochs ${num_train_epochs} \ +# --adafactor \ +# --gradient_checkpointing python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json diff --git a/convlab2/base_models/t5/key2gen/run_nlg_fewshot.sh b/convlab2/base_models/t5/key2gen/run_nlg_fewshot.sh index b831beef20ca0d44eb8809c8823d2fdb56b656f7..f8ab2ee0f5969f2c27552de4fcf82ede5526c384 100644 --- a/convlab2/base_models/t5/key2gen/run_nlg_fewshot.sh +++ b/convlab2/base_models/t5/key2gen/run_nlg_fewshot.sh @@ -26,62 +26,62 @@ gradient_accumulation_steps=4 lr=1e-3 num_train_epochs=100 -python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -s ${speaker} -c ${context_window_size} -r ${ratio} -o ${dial_ids_order} +# python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -s ${speaker} -c 
-python ../run_seq2seq.py \
- --task_name ${task_name} \
- --train_file ${train_file} \
- --validation_file ${validation_file} \
- --source_column ${source_column} \
- --target_column ${target_column} \
- --max_source_length ${max_source_length} \
- --max_target_length ${max_target_length} \
- --truncation_side ${truncation_side} \
- --model_name_or_path ${model_name_or_path} \
- --do_train \
- --do_eval \
- --save_strategy epoch \
- --evaluation_strategy epoch \
- --save_total_limit 3 \
- --prediction_loss_only \
- --load_best_model_at_end \
- --cache_dir ${cache_dir} \
- --output_dir ${output_dir} \
- --logging_dir ${logging_dir} \
- --overwrite_output_dir \
- --preprocessing_num_workers 4 \
- --per_device_train_batch_size ${per_device_train_batch_size} \
- --per_device_eval_batch_size ${per_device_eval_batch_size} \
- --gradient_accumulation_steps ${gradient_accumulation_steps} \
- --learning_rate ${lr} \
- --num_train_epochs ${num_train_epochs} \
- --adafactor \
- --gradient_checkpointing
+# python ../run_seq2seq.py \
+# --task_name ${task_name} \
+# --train_file ${train_file} \
+# --validation_file ${validation_file} \
+# --source_column ${source_column} \
+# --target_column ${target_column} \
+# --max_source_length ${max_source_length} \
+# --max_target_length ${max_target_length} \
+# --truncation_side ${truncation_side} \
+# --model_name_or_path ${model_name_or_path} \
+# --do_train \
+# --do_eval \
+# --save_strategy epoch \
+# --evaluation_strategy epoch \
+# --save_total_limit 1 \
+# --prediction_loss_only \
+# --load_best_model_at_end \
+# --cache_dir ${cache_dir} \
+# --output_dir ${output_dir} \
+# --logging_dir ${logging_dir} \
+# --overwrite_output_dir \
+# --preprocessing_num_workers 4 \
+# --per_device_train_batch_size ${per_device_train_batch_size} \
+# --per_device_eval_batch_size ${per_device_eval_batch_size} \
+# --gradient_accumulation_steps ${gradient_accumulation_steps} \
+# --learning_rate ${lr} \
+# --num_train_epochs ${num_train_epochs} \
+# --adafactor \
+# --gradient_checkpointing
-python ../run_seq2seq.py \
- --task_name ${task_name} \
- --test_file ${test_file} \
- --source_column ${source_column} \
- --target_column ${target_column} \
- --max_source_length ${max_source_length} \
- --max_target_length ${max_target_length} \
- --truncation_side ${truncation_side} \
- --model_name_or_path ${output_dir} \
- --do_predict \
- --predict_with_generate \
- --metric_name_or_path ${metric_name_or_path} \
- --cache_dir ${cache_dir} \
- --output_dir ${output_dir} \
- --logging_dir ${logging_dir} \
- --overwrite_output_dir \
- --preprocessing_num_workers 4 \
- --per_device_train_batch_size ${per_device_train_batch_size} \
- --per_device_eval_batch_size ${per_device_eval_batch_size} \
- --gradient_accumulation_steps ${gradient_accumulation_steps} \
- --learning_rate ${lr} \
- --num_train_epochs ${num_train_epochs} \
- --adafactor \
- --gradient_checkpointing
+# python ../run_seq2seq.py \
+# --task_name ${task_name} \
+# --test_file ${test_file} \
+# --source_column ${source_column} \
+# --target_column ${target_column} \
+# --max_source_length ${max_source_length} \
+# --max_target_length ${max_target_length} \
+# --truncation_side ${truncation_side} \
+# --model_name_or_path ${output_dir} \
+# --do_predict \
+# --predict_with_generate \
+# --metric_name_or_path ${metric_name_or_path} \
+# --cache_dir ${cache_dir} \
+# --output_dir ${output_dir} \
+# --logging_dir ${logging_dir} \
+# --overwrite_output_dir \
+# --preprocessing_num_workers 4 \
+# --per_device_train_batch_size ${per_device_train_batch_size} \
+# --per_device_eval_batch_size ${per_device_eval_batch_size} \
+# --gradient_accumulation_steps ${gradient_accumulation_steps} \
+# --learning_rate ${lr} \
+# --num_train_epochs ${num_train_epochs} \
+# --adafactor \
+# --gradient_checkpointing
python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json -o ${dial_ids_order}
diff --git a/convlab2/base_models/t5/key2gen/run_nlg_fewshot_key2gen.sh b/convlab2/base_models/t5/key2gen/run_nlg_fewshot_key2gen.sh
index 1ffcc869e554e6002cb4408e7eb3607acb9f1119..c13a187799bcfb6364d5d9d8a22f4ede96c87897 100644
--- a/convlab2/base_models/t5/key2gen/run_nlg_fewshot_key2gen.sh
+++ b/convlab2/base_models/t5/key2gen/run_nlg_fewshot_key2gen.sh
@@ -19,69 +19,69 @@ target_column="response"
truncation_side="left"
max_source_length=512
max_target_length=512
-model_name_or_path="t5-small"
+model_name_or_path="../../gpt/keyword_extraction/output/key2gen_noisy/gpt/dailydialog+metalwoz+tm1+tm2+tm3"
per_device_train_batch_size=128
per_device_eval_batch_size=64
gradient_accumulation_steps=4
lr=1e-3
num_train_epochs=100
-python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -s ${speaker} -c ${context_window_size} -r ${ratio} -o ${dial_ids_order} --key2gen
+# python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -s ${speaker} -c ${context_window_size} -r ${ratio} -o ${dial_ids_order} --key2gen
-python ../run_seq2seq.py \
- --task_name ${task_name} \
- --train_file ${train_file} \
- --validation_file ${validation_file} \
- --source_column ${source_column} \
- --target_column ${target_column} \
- --max_source_length ${max_source_length} \
- --max_target_length ${max_target_length} \
- --truncation_side ${truncation_side} \
- --model_name_or_path ${model_name_or_path} \
- --do_train \
- --do_eval \
- --save_strategy epoch \
- --evaluation_strategy epoch \
- --save_total_limit 3 \
- --prediction_loss_only \
- --load_best_model_at_end \
- --cache_dir ${cache_dir} \
- --output_dir ${output_dir} \
- --logging_dir ${logging_dir} \
- --overwrite_output_dir \
- --preprocessing_num_workers 4 \
- --per_device_train_batch_size ${per_device_train_batch_size} \
- --per_device_eval_batch_size ${per_device_eval_batch_size} \
- --gradient_accumulation_steps ${gradient_accumulation_steps} \
- --learning_rate ${lr} \
- --num_train_epochs ${num_train_epochs} \
- --adafactor \
- --gradient_checkpointing
+# python ../run_seq2seq.py \
+# --task_name ${task_name} \
+# --train_file ${train_file} \
+# --validation_file ${validation_file} \
+# --source_column ${source_column} \
+# --target_column ${target_column} \
+# --max_source_length ${max_source_length} \
+# --max_target_length ${max_target_length} \
+# --truncation_side ${truncation_side} \
+# --model_name_or_path ${model_name_or_path} \
+# --do_train \
+# --do_eval \
+# --save_strategy epoch \
+# --evaluation_strategy epoch \
+# --save_total_limit 1 \
+# --prediction_loss_only \
+# --load_best_model_at_end \
+# --cache_dir ${cache_dir} \
+# --output_dir ${output_dir} \
+# --logging_dir ${logging_dir} \
+# --overwrite_output_dir \
+# --preprocessing_num_workers 4 \
+# --per_device_train_batch_size ${per_device_train_batch_size} \
+# --per_device_eval_batch_size ${per_device_eval_batch_size} \
+# --gradient_accumulation_steps ${gradient_accumulation_steps} \
+# --learning_rate ${lr} \
+# --num_train_epochs ${num_train_epochs} \
+# --adafactor \
+# --gradient_checkpointing
-python ../run_seq2seq.py \
- --task_name ${task_name} \
- --test_file ${test_file} \
- --source_column ${source_column} \
- --target_column ${target_column} \
- --max_source_length ${max_source_length} \
- --max_target_length ${max_target_length} \
- --truncation_side ${truncation_side} \
- --model_name_or_path ${output_dir} \
- --do_predict \
- --predict_with_generate \
- --metric_name_or_path ${metric_name_or_path} \
- --cache_dir ${cache_dir} \
- --output_dir ${output_dir} \
- --logging_dir ${logging_dir} \
- --overwrite_output_dir \
- --preprocessing_num_workers 4 \
- --per_device_train_batch_size ${per_device_train_batch_size} \
- --per_device_eval_batch_size ${per_device_eval_batch_size} \
- --gradient_accumulation_steps ${gradient_accumulation_steps} \
- --learning_rate ${lr} \
- --num_train_epochs ${num_train_epochs} \
- --adafactor \
- --gradient_checkpointing
+# python ../run_seq2seq.py \
+# --task_name ${task_name} \
+# --test_file ${test_file} \
+# --source_column ${source_column} \
+# --target_column ${target_column} \
+# --max_source_length ${max_source_length} \
+# --max_target_length ${max_target_length} \
+# --truncation_side ${truncation_side} \
+# --model_name_or_path ${output_dir} \
+# --do_predict \
+# --predict_with_generate \
+# --metric_name_or_path ${metric_name_or_path} \
+# --cache_dir ${cache_dir} \
+# --output_dir ${output_dir} \
+# --logging_dir ${logging_dir} \
+# --overwrite_output_dir \
+# --preprocessing_num_workers 4 \
+# --per_device_train_batch_size ${per_device_train_batch_size} \
+# --per_device_eval_batch_size ${per_device_eval_batch_size} \
+# --gradient_accumulation_steps ${gradient_accumulation_steps} \
+# --learning_rate ${lr} \
+# --num_train_epochs ${num_train_epochs} \
+# --adafactor \
+# --gradient_checkpointing
python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json -o ${dial_ids_order}
diff --git a/convlab2/base_models/t5/key2gen/run_nlg_key2gen.sh b/convlab2/base_models/t5/key2gen/run_nlg_key2gen.sh
index 38807e9a97027a636a7cd54a552f60283fceda0a..f9b89aae85ca2963a162acde2f670caee0f1790e 100644
--- a/convlab2/base_models/t5/key2gen/run_nlg_key2gen.sh
+++ b/convlab2/base_models/t5/key2gen/run_nlg_key2gen.sh
@@ -17,68 +17,69 @@ target_column="response"
truncation_side="left"
max_source_length=512
max_target_length=512
-model_name_or_path="t5-small"
+model_name_or_path="../../gpt/keyword_extraction/output/key2gen_noisy/gpt/dailydialog+metalwoz+tm1+tm2+tm3"
per_device_train_batch_size=128
per_device_eval_batch_size=64
gradient_accumulation_steps=4
lr=1e-3
num_train_epochs=10
-python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -s ${speaker} -c ${context_window_size} --key2gen
+# python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -s ${speaker} -c ${context_window_size} --key2gen
-python ../run_seq2seq.py \
- --task_name ${task_name} \
- --train_file ${train_file} \
- --validation_file ${validation_file} \
- --source_column ${source_column} \
- --target_column ${target_column} \
- --max_source_length ${max_source_length} \
- --max_target_length ${max_target_length} \
- --truncation_side ${truncation_side} \
- --model_name_or_path ${model_name_or_path} \
- --do_train \
- --do_eval \
- --save_strategy epoch \
- --evaluation_strategy epoch \
- --save_total_limit 3 \
- --prediction_loss_only \
- --cache_dir ${cache_dir} \
- --output_dir ${output_dir} \
- --logging_dir ${logging_dir} \
- --overwrite_output_dir \
- --preprocessing_num_workers 4 \
- --per_device_train_batch_size ${per_device_train_batch_size} \
- --per_device_eval_batch_size ${per_device_eval_batch_size} \
- --gradient_accumulation_steps ${gradient_accumulation_steps} \
- --learning_rate ${lr} \
- --num_train_epochs ${num_train_epochs} \
- --adafactor \
- --gradient_checkpointing
+# python ../run_seq2seq.py \
+# --task_name ${task_name} \
+# --train_file ${train_file} \
+# --validation_file ${validation_file} \
+# --source_column ${source_column} \
+# --target_column ${target_column} \
+# --max_source_length ${max_source_length} \
+# --max_target_length ${max_target_length} \
+# --truncation_side ${truncation_side} \
+# --model_name_or_path ${model_name_or_path} \
+# --do_train \
+# --do_eval \
+# --save_strategy epoch \
+# --evaluation_strategy epoch \
+# --save_total_limit 1 \
+# --prediction_loss_only \
+# --load_best_model_at_end \
+# --cache_dir ${cache_dir} \
+# --output_dir ${output_dir} \
+# --logging_dir ${logging_dir} \
+# --overwrite_output_dir \
+# --preprocessing_num_workers 4 \
+# --per_device_train_batch_size ${per_device_train_batch_size} \
+# --per_device_eval_batch_size ${per_device_eval_batch_size} \
+# --gradient_accumulation_steps ${gradient_accumulation_steps} \
+# --learning_rate ${lr} \
+# --num_train_epochs ${num_train_epochs} \
+# --adafactor \
+# --gradient_checkpointing
-python ../run_seq2seq.py \
- --task_name ${task_name} \
- --test_file ${test_file} \
- --source_column ${source_column} \
- --target_column ${target_column} \
- --max_source_length ${max_source_length} \
- --max_target_length ${max_target_length} \
- --truncation_side ${truncation_side} \
- --model_name_or_path ${output_dir} \
- --do_predict \
- --predict_with_generate \
- --metric_name_or_path ${metric_name_or_path} \
- --cache_dir ${cache_dir} \
- --output_dir ${output_dir} \
- --logging_dir ${logging_dir} \
- --overwrite_output_dir \
- --preprocessing_num_workers 4 \
- --per_device_train_batch_size ${per_device_train_batch_size} \
- --per_device_eval_batch_size ${per_device_eval_batch_size} \
- --gradient_accumulation_steps ${gradient_accumulation_steps} \
- --learning_rate ${lr} \
- --num_train_epochs ${num_train_epochs} \
- --adafactor \
- --gradient_checkpointing
+# python ../run_seq2seq.py \
+# --task_name ${task_name} \
+# --test_file ${test_file} \
+# --source_column ${source_column} \
+# --target_column ${target_column} \
+# --max_source_length ${max_source_length} \
+# --max_target_length ${max_target_length} \
+# --truncation_side ${truncation_side} \
+# --model_name_or_path ${output_dir} \
+# --do_predict \
+# --predict_with_generate \
+# --metric_name_or_path ${metric_name_or_path} \
+# --cache_dir ${cache_dir} \
+# --output_dir ${output_dir} \
+# --logging_dir ${logging_dir} \
+# --overwrite_output_dir \
+# --preprocessing_num_workers 4 \
+# --per_device_train_batch_size ${per_device_train_batch_size} \
+# --per_device_eval_batch_size ${per_device_eval_batch_size} \
+# --gradient_accumulation_steps ${gradient_accumulation_steps} \
+# --learning_rate ${lr} \
+# --num_train_epochs ${num_train_epochs} \
+# --adafactor \
+# --gradient_checkpointing
python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
diff --git a/convlab2/base_models/t5/key2gen/run_persona.sh b/convlab2/base_models/t5/key2gen/run_persona.sh
new file mode 100644
index 0000000000000000000000000000000000000000..20ae23608ca8ebcd05cb937b53a088df924e0d66
--- /dev/null
+++ b/convlab2/base_models/t5/key2gen/run_persona.sh
@@ -0,0 +1,85 @@
+n_gpus=1
+task_name="personachat"
+dataset_name="personachat"
+speaker="system"
+data_dir="data/${task_name}/${dataset_name}"
+output_dir="output/${task_name}/${dataset_name}"
+cache_dir="../cache"
+logging_dir="${output_dir}/runs"
+train_file="${data_dir}/train.json"
+validation_file="${data_dir}/validation.json"
+test_file="${data_dir}/test.json"
+metric_name_or_path="metric.py"
+metric_for_best_model="bleu"
+source_column="context+persona"
+target_column="response"
+truncation_side="left"
+max_source_length=1024
+max_target_length=512
+model_name_or_path="t5-small"
+per_device_train_batch_size=32
+per_device_eval_batch_size=64
+gradient_accumulation_steps=4
+lr=1e-3
+num_train_epochs=10
+
+python create_data_key2gen.py -t ${task_name} -d ${dataset_name}
+
+python ../run_seq2seq.py \
+ --task_name ${task_name} \
+ --train_file ${train_file} \
+ --validation_file ${validation_file} \
+ --source_column ${source_column} \
+ --target_column ${target_column} \
+ --max_source_length ${max_source_length} \
+ --max_target_length ${max_target_length} \
+ --truncation_side ${truncation_side} \
+ --model_name_or_path ${model_name_or_path} \
+ --do_train \
+ --do_eval \
+ --save_strategy epoch \
+ --evaluation_strategy epoch \
+ --save_total_limit 1 \
+ --prediction_loss_only \
+ --load_best_model_at_end \
+ --cache_dir ${cache_dir} \
+ --output_dir ${output_dir} \
+ --logging_dir ${logging_dir} \
+ --overwrite_output_dir \
+ --preprocessing_num_workers 4 \
+ --per_device_train_batch_size ${per_device_train_batch_size} \
+ --per_device_eval_batch_size ${per_device_eval_batch_size} \
+ --gradient_accumulation_steps ${gradient_accumulation_steps} \
+ --learning_rate ${lr} \
+ --num_train_epochs ${num_train_epochs} \
+ --adafactor \
+ --gradient_checkpointing
+
+python ../run_seq2seq.py \
+ --task_name ${task_name} \
+ --test_file ${test_file} \
+ --source_column ${source_column} \
+ --target_column ${target_column} \
+ --max_source_length ${max_source_length} \
+ --max_target_length ${max_target_length} \
+ --truncation_side ${truncation_side} \
+ --model_name_or_path ${output_dir} \
+ --do_predict \
+ --predict_with_generate \
+ --metric_name_or_path ${metric_name_or_path} \
+ --cache_dir ${cache_dir} \
+ --output_dir ${output_dir} \
+ --logging_dir ${logging_dir} \
+ --overwrite_output_dir \
+ --preprocessing_num_workers 4 \
+ --per_device_train_batch_size ${per_device_train_batch_size} \
+ --per_device_eval_batch_size ${per_device_eval_batch_size} \
+ --gradient_accumulation_steps ${gradient_accumulation_steps} \
+ --learning_rate ${lr} \
+ --num_train_epochs ${num_train_epochs} \
+ --adafactor \
+ --gradient_checkpointing
+
+# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
+
+# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
diff --git a/convlab2/base_models/t5/key2gen/run_persona_fewshot.sh b/convlab2/base_models/t5/key2gen/run_persona_fewshot.sh
new file mode 100644
index 0000000000000000000000000000000000000000..4ecf435e52293d2d8cc03411940ce55f5e73f3ab
--- /dev/null
+++ b/convlab2/base_models/t5/key2gen/run_persona_fewshot.sh
@@ -0,0 +1,87 @@
+n_gpus=1
+task_name="personachat"
+dataset_name="personachat"
+speaker="system"
+ratio=$1
+dial_ids_order=$2
+data_dir="data/${task_name}/${dataset_name}_${ratio}_order${dial_ids_order}"
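+# ratio ($1) and dial_ids_order ($2) select the few-shot split; e.g. a
+# hypothetical invocation `bash run_persona_fewshot.sh 0.1 0` would train on a
+# 10% dialogue sample and write under the *_0.1_order0 dirs below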
+output_dir="output/${task_name}/${dataset_name}_${ratio}_order${dial_ids_order}"
+cache_dir="../cache"
+logging_dir="${output_dir}/runs"
+train_file="${data_dir}/train.json"
+validation_file="${data_dir}/validation.json"
+test_file="${data_dir}/test.json"
+metric_name_or_path="metric.py"
+metric_for_best_model="bleu"
+source_column="context+persona"
+target_column="response"
+truncation_side="left"
+max_source_length=1024
+max_target_length=512
+model_name_or_path="t5-small"
+per_device_train_batch_size=32
+per_device_eval_batch_size=64
+gradient_accumulation_steps=4
+lr=1e-3
+num_train_epochs=100
+
+python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -r ${ratio} -o ${dial_ids_order}
+
+python ../run_seq2seq.py \
+ --task_name ${task_name} \
+ --train_file ${train_file} \
+ --validation_file ${validation_file} \
+ --source_column ${source_column} \
+ --target_column ${target_column} \
+ --max_source_length ${max_source_length} \
+ --max_target_length ${max_target_length} \
+ --truncation_side ${truncation_side} \
+ --model_name_or_path ${model_name_or_path} \
+ --do_train \
+ --do_eval \
+ --save_strategy epoch \
+ --evaluation_strategy epoch \
+ --save_total_limit 1 \
+ --prediction_loss_only \
+ --load_best_model_at_end \
+ --cache_dir ${cache_dir} \
+ --output_dir ${output_dir} \
+ --logging_dir ${logging_dir} \
+ --overwrite_output_dir \
+ --preprocessing_num_workers 4 \
+ --per_device_train_batch_size ${per_device_train_batch_size} \
+ --per_device_eval_batch_size ${per_device_eval_batch_size} \
+ --gradient_accumulation_steps ${gradient_accumulation_steps} \
+ --learning_rate ${lr} \
+ --num_train_epochs ${num_train_epochs} \
+ --adafactor \
+ --gradient_checkpointing
+
+python ../run_seq2seq.py \
+ --task_name ${task_name} \
+ --test_file ${test_file} \
+ --source_column ${source_column} \
+ --target_column ${target_column} \
+ --max_source_length ${max_source_length} \
+ --max_target_length ${max_target_length} \
+ --truncation_side ${truncation_side} \
+ --model_name_or_path ${output_dir} \
+ --do_predict \
+ --predict_with_generate \
+ --metric_name_or_path ${metric_name_or_path} \
+ --cache_dir ${cache_dir} \
+ --output_dir ${output_dir} \
+ --logging_dir ${logging_dir} \
+ --overwrite_output_dir \
+ --preprocessing_num_workers 4 \
+ --per_device_train_batch_size ${per_device_train_batch_size} \
+ --per_device_eval_batch_size ${per_device_eval_batch_size} \
+ --gradient_accumulation_steps ${gradient_accumulation_steps} \
+ --learning_rate ${lr} \
+ --num_train_epochs ${num_train_epochs} \
+ --adafactor \
+ --gradient_checkpointing
+
+# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
+
+# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
diff --git a/convlab2/base_models/t5/key2gen/run_persona_fewshot_key2gen.sh b/convlab2/base_models/t5/key2gen/run_persona_fewshot_key2gen.sh
new file mode 100644
index 0000000000000000000000000000000000000000..6b95839a376e4857c387535ef0a47a5317530498
--- /dev/null
+++ b/convlab2/base_models/t5/key2gen/run_persona_fewshot_key2gen.sh
@@ -0,0 +1,87 @@
+n_gpus=1
+task_name="personachat"
+dataset_name="personachat"
+speaker="system"
+ratio=$1
+dial_ids_order=$2
+data_dir="data/${task_name}/key2gen_${dataset_name}_${ratio}_order${dial_ids_order}"
+output_dir="output/${task_name}/key2gen_${dataset_name}_${ratio}_order${dial_ids_order}"
+cache_dir="../cache"
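+# the key2gen_ prefix on data_dir/output_dir keeps this variant's files apart
+# from the plain few-shot baseline in run_persona_fewshot.sh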
+logging_dir="${output_dir}/runs"
+train_file="${data_dir}/train.json"
+validation_file="${data_dir}/validation.json"
+test_file="${data_dir}/test.json"
+metric_name_or_path="metric.py"
+metric_for_best_model="bleu"
+source_column="context+persona"
+target_column="response"
+truncation_side="left"
+max_source_length=1024
+max_target_length=512
+model_name_or_path="../../gpt/keyword_extraction/output/key2gen_noisy/gpt/dailydialog+metalwoz+tm1+tm2+tm3+wikidialog"
+per_device_train_batch_size=32
+per_device_eval_batch_size=64
+gradient_accumulation_steps=4
+lr=1e-3
+num_train_epochs=100
+
+python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -r ${ratio} -o ${dial_ids_order} --key2gen
+
+python ../run_seq2seq.py \
+ --task_name ${task_name} \
+ --train_file ${train_file} \
+ --validation_file ${validation_file} \
+ --source_column ${source_column} \
+ --target_column ${target_column} \
+ --max_source_length ${max_source_length} \
+ --max_target_length ${max_target_length} \
+ --truncation_side ${truncation_side} \
+ --model_name_or_path ${model_name_or_path} \
+ --do_train \
+ --do_eval \
+ --save_strategy epoch \
+ --evaluation_strategy epoch \
+ --save_total_limit 1 \
+ --prediction_loss_only \
+ --load_best_model_at_end \
+ --cache_dir ${cache_dir} \
+ --output_dir ${output_dir} \
+ --logging_dir ${logging_dir} \
+ --overwrite_output_dir \
+ --preprocessing_num_workers 4 \
+ --per_device_train_batch_size ${per_device_train_batch_size} \
+ --per_device_eval_batch_size ${per_device_eval_batch_size} \
+ --gradient_accumulation_steps ${gradient_accumulation_steps} \
+ --learning_rate ${lr} \
+ --num_train_epochs ${num_train_epochs} \
+ --adafactor \
+ --gradient_checkpointing
+
+python ../run_seq2seq.py \
+ --task_name ${task_name} \
+ --test_file ${test_file} \
+ --source_column ${source_column} \
+ --target_column ${target_column} \
+ --max_source_length ${max_source_length} \
+ --max_target_length ${max_target_length} \
+ --truncation_side ${truncation_side} \
+ --model_name_or_path ${output_dir} \
+ --do_predict \
+ --predict_with_generate \
+ --metric_name_or_path ${metric_name_or_path} \
+ --cache_dir ${cache_dir} \
+ --output_dir ${output_dir} \
+ --logging_dir ${logging_dir} \
+ --overwrite_output_dir \
+ --preprocessing_num_workers 4 \
+ --per_device_train_batch_size ${per_device_train_batch_size} \
+ --per_device_eval_batch_size ${per_device_eval_batch_size} \
+ --gradient_accumulation_steps ${gradient_accumulation_steps} \
+ --learning_rate ${lr} \
+ --num_train_epochs ${num_train_epochs} \
+ --adafactor \
+ --gradient_checkpointing
+
+# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
+
+# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
diff --git a/convlab2/base_models/t5/key2gen/run_persona_key2gen.sh b/convlab2/base_models/t5/key2gen/run_persona_key2gen.sh
new file mode 100644
index 0000000000000000000000000000000000000000..a2fea9b1fea8540731fd7e3d499156bfe7025690
--- /dev/null
+++ b/convlab2/base_models/t5/key2gen/run_persona_key2gen.sh
@@ -0,0 +1,85 @@
+n_gpus=1
+task_name="personachat"
+dataset_name="personachat"
+speaker="system"
+data_dir="data/${task_name}/key2gen_${dataset_name}"
+output_dir="output/${task_name}/key2gen_${dataset_name}"
+cache_dir="../cache"
+logging_dir="${output_dir}/runs"
+train_file="${data_dir}/train.json"
+validation_file="${data_dir}/validation.json"
+test_file="${data_dir}/test.json"
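+# train/validation/test.json are JSON-lines files written by
+# create_data_key2gen.py below; each line is expected to carry the fields
+# named by source_column and target_column for run_seq2seq.py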
+metric_name_or_path="metric.py"
+metric_for_best_model="bleu"
+source_column="context+persona"
+target_column="response"
+truncation_side="left"
+max_source_length=1024
+max_target_length=512
+model_name_or_path="../../gpt/keyword_extraction/output/key2gen_noisy/gpt/dailydialog+metalwoz+tm1+tm2+tm3+wikidialog"
+per_device_train_batch_size=32
+per_device_eval_batch_size=64
+gradient_accumulation_steps=4
+lr=1e-3
+num_train_epochs=10
+
+python create_data_key2gen.py -t ${task_name} -d ${dataset_name} --key2gen
+
+python ../run_seq2seq.py \
+ --task_name ${task_name} \
+ --train_file ${train_file} \
+ --validation_file ${validation_file} \
+ --source_column ${source_column} \
+ --target_column ${target_column} \
+ --max_source_length ${max_source_length} \
+ --max_target_length ${max_target_length} \
+ --truncation_side ${truncation_side} \
+ --model_name_or_path ${model_name_or_path} \
+ --do_train \
+ --do_eval \
+ --save_strategy epoch \
+ --evaluation_strategy epoch \
+ --save_total_limit 1 \
+ --prediction_loss_only \
+ --load_best_model_at_end \
+ --cache_dir ${cache_dir} \
+ --output_dir ${output_dir} \
+ --logging_dir ${logging_dir} \
+ --overwrite_output_dir \
+ --preprocessing_num_workers 4 \
+ --per_device_train_batch_size ${per_device_train_batch_size} \
+ --per_device_eval_batch_size ${per_device_eval_batch_size} \
+ --gradient_accumulation_steps ${gradient_accumulation_steps} \
+ --learning_rate ${lr} \
+ --num_train_epochs ${num_train_epochs} \
+ --adafactor \
+ --gradient_checkpointing
+
+python ../run_seq2seq.py \
+ --task_name ${task_name} \
+ --test_file ${test_file} \
+ --source_column ${source_column} \
+ --target_column ${target_column} \
+ --max_source_length ${max_source_length} \
+ --max_target_length ${max_target_length} \
+ --truncation_side ${truncation_side} \
+ --model_name_or_path ${output_dir} \
+ --do_predict \
+ --predict_with_generate \
+ --metric_name_or_path ${metric_name_or_path} \
+ --cache_dir ${cache_dir} \
+ --output_dir ${output_dir} \
+ --logging_dir ${logging_dir} \
+ --overwrite_output_dir \
+ --preprocessing_num_workers 4 \
+ --per_device_train_batch_size ${per_device_train_batch_size} \
+ --per_device_eval_batch_size ${per_device_eval_batch_size} \
+ --gradient_accumulation_steps ${gradient_accumulation_steps} \
+ --learning_rate ${lr} \
+ --num_train_epochs ${num_train_epochs} \
+ --adafactor \
+ --gradient_checkpointing
+
+# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
+
+# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
diff --git a/convlab2/base_models/t5/key2gen/run_wow.sh b/convlab2/base_models/t5/key2gen/run_wow.sh
new file mode 100644
index 0000000000000000000000000000000000000000..0fd9fb78ea5e260ef30fdf00658d4a390421f900
--- /dev/null
+++ b/convlab2/base_models/t5/key2gen/run_wow.sh
@@ -0,0 +1,85 @@
+n_gpus=1
+task_name="wow"
+dataset_name="wow"
+speaker="system"
+data_dir="data/${task_name}/${dataset_name}"
+output_dir="output/${task_name}/${dataset_name}"
+cache_dir="../cache"
+logging_dir="${output_dir}/runs"
+train_file="${data_dir}/train.json"
+validation_file="${data_dir}/validation.json"
+test_file="${data_dir}/test_unseen.json"
+metric_name_or_path="metric.py"
+metric_for_best_model="bleu"
+source_column="context+knowledge"
+target_column="response"
+truncation_side="left"
+max_source_length=1024
+max_target_length=512
+model_name_or_path="t5-small"
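+# baseline: fine-tune vanilla t5-small; the *_key2gen.sh counterparts instead
+# start from the key2gen_noisy checkpoint under ../../gpt/keyword_extraction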
+per_device_train_batch_size=32
+per_device_eval_batch_size=64
+gradient_accumulation_steps=4
+lr=1e-3
+num_train_epochs=10
+
+python create_data_key2gen.py -t ${task_name} -d ${dataset_name}
+
+python ../run_seq2seq.py \
+ --task_name ${task_name} \
+ --train_file ${train_file} \
+ --validation_file ${validation_file} \
+ --source_column ${source_column} \
+ --target_column ${target_column} \
+ --max_source_length ${max_source_length} \
+ --max_target_length ${max_target_length} \
+ --truncation_side ${truncation_side} \
+ --model_name_or_path ${model_name_or_path} \
+ --do_train \
+ --do_eval \
+ --save_strategy epoch \
+ --evaluation_strategy epoch \
+ --save_total_limit 1 \
+ --prediction_loss_only \
+ --load_best_model_at_end \
+ --cache_dir ${cache_dir} \
+ --output_dir ${output_dir} \
+ --logging_dir ${logging_dir} \
+ --overwrite_output_dir \
+ --preprocessing_num_workers 4 \
+ --per_device_train_batch_size ${per_device_train_batch_size} \
+ --per_device_eval_batch_size ${per_device_eval_batch_size} \
+ --gradient_accumulation_steps ${gradient_accumulation_steps} \
+ --learning_rate ${lr} \
+ --num_train_epochs ${num_train_epochs} \
+ --adafactor \
+ --gradient_checkpointing
+
+python ../run_seq2seq.py \
+ --task_name ${task_name} \
+ --test_file ${test_file} \
+ --source_column ${source_column} \
+ --target_column ${target_column} \
+ --max_source_length ${max_source_length} \
+ --max_target_length ${max_target_length} \
+ --truncation_side ${truncation_side} \
+ --model_name_or_path ${output_dir} \
+ --do_predict \
+ --predict_with_generate \
+ --metric_name_or_path ${metric_name_or_path} \
+ --cache_dir ${cache_dir} \
+ --output_dir ${output_dir} \
+ --logging_dir ${logging_dir} \
+ --overwrite_output_dir \
+ --preprocessing_num_workers 4 \
+ --per_device_train_batch_size ${per_device_train_batch_size} \
+ --per_device_eval_batch_size ${per_device_eval_batch_size} \
+ --gradient_accumulation_steps ${gradient_accumulation_steps} \
+ --learning_rate ${lr} \
+ --num_train_epochs ${num_train_epochs} \
+ --adafactor \
+ --gradient_checkpointing
+
+# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
+
+# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
diff --git a/convlab2/base_models/t5/key2gen/run_wow_fewshot.sh b/convlab2/base_models/t5/key2gen/run_wow_fewshot.sh
new file mode 100644
index 0000000000000000000000000000000000000000..c9cdbe645d771d099403c9f840a08114e005a18d
--- /dev/null
+++ b/convlab2/base_models/t5/key2gen/run_wow_fewshot.sh
@@ -0,0 +1,87 @@
+n_gpus=1
+task_name="wow"
+dataset_name="wow"
+speaker="system"
+ratio=$1
+dial_ids_order=$2
+data_dir="data/${task_name}/${dataset_name}_${ratio}_order${dial_ids_order}"
+output_dir="output/${task_name}/${dataset_name}_${ratio}_order${dial_ids_order}"
+cache_dir="../cache"
+logging_dir="${output_dir}/runs"
+train_file="${data_dir}/train.json"
+validation_file="${data_dir}/validation.json"
+test_file="${data_dir}/test_unseen.json"
+metric_name_or_path="metric.py"
+metric_for_best_model="bleu"
+source_column="context+knowledge"
+target_column="response"
+truncation_side="left"
+max_source_length=1024
+max_target_length=512
+model_name_or_path="t5-small"
+per_device_train_batch_size=32
+per_device_eval_batch_size=64
+gradient_accumulation_steps=4
+lr=1e-3
+num_train_epochs=100
+
+python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -r ${ratio} -o ${dial_ids_order}
+
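+# effective train batch size = per_device_train_batch_size (32) x
+# gradient_accumulation_steps (4) x n_gpus (1) = 128; 100 epochs presumably
+# compensates for the small few-shot training set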
+python ../run_seq2seq.py \
+ --task_name ${task_name} \
+ --train_file ${train_file} \
+ --validation_file ${validation_file} \
+ --source_column ${source_column} \
+ --target_column ${target_column} \
+ --max_source_length ${max_source_length} \
+ --max_target_length ${max_target_length} \
+ --truncation_side ${truncation_side} \
+ --model_name_or_path ${model_name_or_path} \
+ --do_train \
+ --do_eval \
+ --save_strategy epoch \
+ --evaluation_strategy epoch \
+ --save_total_limit 1 \
+ --prediction_loss_only \
+ --load_best_model_at_end \
+ --cache_dir ${cache_dir} \
+ --output_dir ${output_dir} \
+ --logging_dir ${logging_dir} \
+ --overwrite_output_dir \
+ --preprocessing_num_workers 4 \
+ --per_device_train_batch_size ${per_device_train_batch_size} \
+ --per_device_eval_batch_size ${per_device_eval_batch_size} \
+ --gradient_accumulation_steps ${gradient_accumulation_steps} \
+ --learning_rate ${lr} \
+ --num_train_epochs ${num_train_epochs} \
+ --adafactor \
+ --gradient_checkpointing
+
+python ../run_seq2seq.py \
+ --task_name ${task_name} \
+ --test_file ${test_file} \
+ --source_column ${source_column} \
+ --target_column ${target_column} \
+ --max_source_length ${max_source_length} \
+ --max_target_length ${max_target_length} \
+ --truncation_side ${truncation_side} \
+ --model_name_or_path ${output_dir} \
+ --do_predict \
+ --predict_with_generate \
+ --metric_name_or_path ${metric_name_or_path} \
+ --cache_dir ${cache_dir} \
+ --output_dir ${output_dir} \
+ --logging_dir ${logging_dir} \
+ --overwrite_output_dir \
+ --preprocessing_num_workers 4 \
+ --per_device_train_batch_size ${per_device_train_batch_size} \
+ --per_device_eval_batch_size ${per_device_eval_batch_size} \
+ --gradient_accumulation_steps ${gradient_accumulation_steps} \
+ --learning_rate ${lr} \
+ --num_train_epochs ${num_train_epochs} \
+ --adafactor \
+ --gradient_checkpointing
+
+# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
+
+# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
diff --git a/convlab2/base_models/t5/key2gen/run_wow_fewshot_key2gen.sh b/convlab2/base_models/t5/key2gen/run_wow_fewshot_key2gen.sh
new file mode 100644
index 0000000000000000000000000000000000000000..6aaf8ca4dd160f8c108281413441e021b5bd0571
--- /dev/null
+++ b/convlab2/base_models/t5/key2gen/run_wow_fewshot_key2gen.sh
@@ -0,0 +1,87 @@
+n_gpus=1
+task_name="wow"
+dataset_name="wow"
+speaker="system"
+ratio=$1
+dial_ids_order=$2
+data_dir="data/${task_name}/key2gen_${dataset_name}_${ratio}_order${dial_ids_order}"
+output_dir="output/${task_name}/key2gen_${dataset_name}_${ratio}_order${dial_ids_order}"
+cache_dir="../cache"
+logging_dir="${output_dir}/runs"
+train_file="${data_dir}/train.json"
+validation_file="${data_dir}/validation.json"
+test_file="${data_dir}/test_unseen.json"
+metric_name_or_path="metric.py"
+metric_for_best_model="bleu"
+source_column="context+knowledge"
+target_column="response"
+truncation_side="left"
+max_source_length=1024
+max_target_length=512
+model_name_or_path="../../gpt/keyword_extraction/output/key2gen_noisy/gpt/dailydialog+metalwoz+tm1+tm2+tm3+wikidialog"
+per_device_train_batch_size=32
+per_device_eval_batch_size=64
+gradient_accumulation_steps=4
+lr=1e-3
+num_train_epochs=100
+
+python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -r ${ratio} -o ${dial_ids_order} --key2gen
+
+python ../run_seq2seq.py \
+ --task_name ${task_name} \
+ --train_file ${train_file} \
+ --validation_file ${validation_file} \
+ --source_column ${source_column} \
+ --target_column ${target_column} \
+ --max_source_length ${max_source_length} \
+ --max_target_length ${max_target_length} \
+ --truncation_side ${truncation_side} \
+ --model_name_or_path ${model_name_or_path} \
+ --do_train \
+ --do_eval \
+ --save_strategy epoch \
+ --evaluation_strategy epoch \
+ --save_total_limit 1 \
+ --prediction_loss_only \
+ --load_best_model_at_end \
+ --cache_dir ${cache_dir} \
+ --output_dir ${output_dir} \
+ --logging_dir ${logging_dir} \
+ --overwrite_output_dir \
+ --preprocessing_num_workers 4 \
+ --per_device_train_batch_size ${per_device_train_batch_size} \
+ --per_device_eval_batch_size ${per_device_eval_batch_size} \
+ --gradient_accumulation_steps ${gradient_accumulation_steps} \
+ --learning_rate ${lr} \
+ --num_train_epochs ${num_train_epochs} \
+ --adafactor \
+ --gradient_checkpointing
+
+python ../run_seq2seq.py \
+ --task_name ${task_name} \
+ --test_file ${test_file} \
+ --source_column ${source_column} \
+ --target_column ${target_column} \
+ --max_source_length ${max_source_length} \
+ --max_target_length ${max_target_length} \
+ --truncation_side ${truncation_side} \
+ --model_name_or_path ${output_dir} \
+ --do_predict \
+ --predict_with_generate \
+ --metric_name_or_path ${metric_name_or_path} \
+ --cache_dir ${cache_dir} \
+ --output_dir ${output_dir} \
+ --logging_dir ${logging_dir} \
+ --overwrite_output_dir \
+ --preprocessing_num_workers 4 \
+ --per_device_train_batch_size ${per_device_train_batch_size} \
+ --per_device_eval_batch_size ${per_device_eval_batch_size} \
+ --gradient_accumulation_steps ${gradient_accumulation_steps} \
+ --learning_rate ${lr} \
+ --num_train_epochs ${num_train_epochs} \
+ --adafactor \
+ --gradient_checkpointing
+
+# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
+
+# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
diff --git a/convlab2/base_models/t5/key2gen/run_wow_key2gen.sh b/convlab2/base_models/t5/key2gen/run_wow_key2gen.sh
new file mode 100644
index 0000000000000000000000000000000000000000..b5badb7f7fefb979f6d9dea201d87e384a403493
--- /dev/null
+++ b/convlab2/base_models/t5/key2gen/run_wow_key2gen.sh
@@ -0,0 +1,85 @@
+n_gpus=1
+task_name="wow"
+dataset_name="wow"
+speaker="system"
+data_dir="data/${task_name}/key2gen_${dataset_name}"
+output_dir="output/${task_name}/key2gen_${dataset_name}"
+cache_dir="../cache"
+logging_dir="${output_dir}/runs"
+train_file="${data_dir}/train.json"
+validation_file="${data_dir}/validation.json"
+test_file="${data_dir}/test_unseen.json"
+metric_name_or_path="metric.py"
+metric_for_best_model="bleu"
+source_column="context+knowledge"
+target_column="response"
+truncation_side="left"
+max_source_length=1024
+max_target_length=512
+model_name_or_path="../../gpt/keyword_extraction/output/key2gen_noisy/gpt/dailydialog+metalwoz+tm1+tm2+tm3+wikidialog"
+per_device_train_batch_size=32
+per_device_eval_batch_size=64
+gradient_accumulation_steps=4
+lr=1e-3
+num_train_epochs=10
+
+python create_data_key2gen.py -t ${task_name} -d ${dataset_name} --key2gen
+
+python ../run_seq2seq.py \
+ --task_name ${task_name} \
+ --train_file ${train_file} \
+ --validation_file ${validation_file} \
+ --source_column ${source_column} \
+ --target_column ${target_column} \
+ --max_source_length ${max_source_length} \
+ --max_target_length ${max_target_length} \
+ --truncation_side ${truncation_side} \
+ --model_name_or_path ${model_name_or_path} \
+ --do_train \
+ --do_eval \
+ --save_strategy epoch \
+ --evaluation_strategy epoch \
+ --save_total_limit 1 \
+ --prediction_loss_only \
+ --load_best_model_at_end \
+ --cache_dir ${cache_dir} \
+ --output_dir ${output_dir} \
+ --logging_dir ${logging_dir} \
+ --overwrite_output_dir \
+ --preprocessing_num_workers 4 \
+ --per_device_train_batch_size ${per_device_train_batch_size} \
+ --per_device_eval_batch_size ${per_device_eval_batch_size} \
+ --gradient_accumulation_steps ${gradient_accumulation_steps} \
+ --learning_rate ${lr} \
+ --num_train_epochs ${num_train_epochs} \
+ --adafactor \
+ --gradient_checkpointing
+
+python ../run_seq2seq.py \
+ --task_name ${task_name} \
+ --test_file ${test_file} \
+ --source_column ${source_column} \
+ --target_column ${target_column} \
+ --max_source_length ${max_source_length} \
+ --max_target_length ${max_target_length} \
+ --truncation_side ${truncation_side} \
+ --model_name_or_path ${output_dir} \
+ --do_predict \
+ --predict_with_generate \
+ --metric_name_or_path ${metric_name_or_path} \
+ --cache_dir ${cache_dir} \
+ --output_dir ${output_dir} \
+ --logging_dir ${logging_dir} \
+ --overwrite_output_dir \
+ --preprocessing_num_workers 4 \
+ --per_device_train_batch_size ${per_device_train_batch_size} \
+ --per_device_eval_batch_size ${per_device_eval_batch_size} \
+ --gradient_accumulation_steps ${gradient_accumulation_steps} \
+ --learning_rate ${lr} \
+ --num_train_epochs ${num_train_epochs} \
+ --adafactor \
+ --gradient_checkpointing
+
+# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
+
+# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
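+# the two commented post-processing steps above appear to be run manually:
+# merge_predict_res.py folds generated_predictions.json back into the dataset
+# before evaluate_unified_datasets.py scores predictions.json; note that
+# ${context_window_size} is never set in this script, so pass -c explicitly
+# (e.g. -c 100, a placeholder value) when uncommenting them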