Skip to content
Snippets Groups Projects
Commit 5a1ed974 authored by zqwerty's avatar zqwerty
Browse files

add finetune suite and update pre-training scripts

parent fb70aed4
No related branches found
No related tags found
No related merge requests found
Showing
with 961 additions and 1030 deletions
# Run keyword extraction for every listed dataset; abort on the first failure.
set -e
datasets="dailydialog metalwoz tm1 tm2 tm3 sgd multiwoz21"
for dataset_name in ${datasets}; do
    bash get_keywords.sh "${dataset_name}"
done
\ No newline at end of file
# Run prediction with a previously trained key2gen_noisy model on one dataset.
# Usage: bash <this script> <dataset_name>
set -e
n_gpus=2
master_port=23457
task_name="key2gen_noisy"
# Fail fast with a usage message instead of silently building paths that end
# in an empty dataset name when the argument is missing.
dataset_name="${1:?usage: bash <this script> <dataset_name>}"
model_type="gpt"
data_dir="data/${task_name}/${model_type}/${dataset_name}"
output_dir="output/${task_name}/${model_type}/${dataset_name}"
cache_dir="../cache"
logging_dir="${output_dir}/runs"
# train_file / validation_file are defined but not passed to the command below.
train_file="${data_dir}/train.json"
validation_file="${data_dir}/validation.json"
test_file="${data_dir}/test.json"
source_column="source"
target_column="target"
truncation_side="left"
max_source_length=512
max_target_length=128
# Checkpoint directory that is loaded for prediction.
model_name_or_path="output/${task_name}/${model_type}/dailydialog+metalwoz+tm1+tm2+tm3"
per_device_train_batch_size=128
per_device_eval_batch_size=128
gradient_accumulation_steps=2
lr=1e-3
num_train_epochs=3
python -m torch.distributed.launch --master_port "${master_port}" \
    --nproc_per_node "${n_gpus}" ../../t5/run_seq2seq.py \
    --task_name "${task_name}" \
    --test_file "${test_file}" \
    --source_column "${source_column}" \
    --target_column "${target_column}" \
    --max_source_length "${max_source_length}" \
    --max_target_length "${max_target_length}" \
    --truncation_side "${truncation_side}" \
    --model_name_or_path "${model_name_or_path}" \
    --do_predict \
    --predict_with_generate \
    --cache_dir "${cache_dir}" \
    --output_dir "${output_dir}" \
    --logging_dir "${logging_dir}" \
    --overwrite_output_dir \
    --preprocessing_num_workers 16 \
    --per_device_train_batch_size "${per_device_train_batch_size}" \
    --per_device_eval_batch_size "${per_device_eval_batch_size}" \
    --gradient_accumulation_steps "${gradient_accumulation_steps}" \
    --learning_rate "${lr}" \
    --num_train_epochs "${num_train_epochs}" \
    --adafactor \
    --gradient_checkpointing
# Train, evaluate and predict for the key2gen_noisy task on the merged dataset.
# Abort on the first failing command.
set -e
# Number of processes per node and the port handed to torch.distributed.launch.
n_gpus=2
master_port=23457
task_name="key2gen_noisy"
dataset_name="dailydialog+metalwoz+tm1+tm2+tm3+wikidialog"
# model_type is only used to build the data/output paths below.
model_type="gpt"
data_dir="data/${task_name}/${model_type}/${dataset_name}"
output_dir="output/${task_name}/${model_type}/${dataset_name}"
cache_dir="../cache"
logging_dir="${output_dir}/runs"
train_file="${data_dir}/train.json"
validation_file="${data_dir}/validation.json"
test_file="${data_dir}/test.json"
# Names of the JSON fields used as model input and target.
source_column="source"
target_column="target"
truncation_side="left"
max_source_length=512
max_target_length=128
# Starting checkpoint passed to --model_name_or_path.
model_name_or_path="t5-small"
per_device_train_batch_size=128
per_device_eval_batch_size=128
gradient_accumulation_steps=4
lr=1e-3
num_train_epochs=3
# Launch run_seq2seq.py with training, per-epoch evaluation/saving and prediction enabled.
python -m torch.distributed.launch --master_port ${master_port} \
--nproc_per_node ${n_gpus} ../../t5/run_seq2seq.py \
--task_name ${task_name} \
--train_file ${train_file} \
--validation_file ${validation_file} \
--test_file ${test_file} \
--source_column ${source_column} \
--target_column ${target_column} \
--max_source_length ${max_source_length} \
--max_target_length ${max_target_length} \
--truncation_side ${truncation_side} \
--model_name_or_path ${model_name_or_path} \
--do_train \
--do_eval \
--do_predict \
--save_strategy epoch \
--evaluation_strategy epoch \
--load_best_model_at_end \
--prediction_loss_only \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 16 \
--per_device_train_batch_size ${per_device_train_batch_size} \
--per_device_eval_batch_size ${per_device_eval_batch_size} \
--gradient_accumulation_steps ${gradient_accumulation_steps} \
--learning_rate ${lr} \
--num_train_epochs ${num_train_epochs} \
--adafactor \
--gradient_checkpointing
......@@ -4,8 +4,9 @@ master_port=23456
task_name="rg"
dataset_name="dailydialog+metalwoz+tm1+tm2+tm3+sgd+reddit+wikidialog"
model_type="gpt"
model_name="t5-small"
data_dir="data/${task_name}/${model_type}/${dataset_name}"
output_dir="output/${task_name}/${model_type}/${dataset_name}"
output_dir="output/${task_name}/${model_name}/${dataset_name}"
cache_dir="../cache"
logging_dir="${output_dir}/runs"
train_file="${data_dir}/train.json"
......@@ -14,7 +15,7 @@ target_column="target"
truncation_side="left"
max_source_length=512
max_target_length=128
model_name_or_path="t5-small"
model_name_or_path="${model_name}"
per_device_train_batch_size=64
per_device_eval_batch_size=128
gradient_accumulation_steps=1
......
......@@ -4,8 +4,9 @@ master_port=23456
task_name="key2gen+key2gen_noisy"
dataset_name="dailydialog+metalwoz+tm1+tm2+tm3+sgd+reddit+wikidialog"
model_type="gpt"
model_name="t5-small"
data_dir="data/${task_name}/${model_type}/${dataset_name}"
output_dir="output/${task_name}/${model_type}/${dataset_name}"
output_dir="output/${task_name}/${model_name}/${dataset_name}"
cache_dir="../cache"
logging_dir="${output_dir}/runs"
train_file="${data_dir}/train.json"
......@@ -14,7 +15,7 @@ target_column="target"
truncation_side="left"
max_source_length=512
max_target_length=128
model_name_or_path="output/rg/${model_type}/${dataset_name}"
model_name_or_path="output/rg/${model_name}/${dataset_name}"
per_device_train_batch_size=64
per_device_eval_batch_size=128
gradient_accumulation_steps=1
......
import os
import json
from tqdm import tqdm
from convlab.util import load_dataset, load_unified_data, load_nlu_data
def create_nlg_data(dataset, data_dir, args):
    """Serialize system turns with their dialogue acts as JSON-lines files.

    Samples are obtained from ``load_nlu_data`` (system speaker, context
    window of 3). A sample is kept only when both its context and its
    utterance are non-empty. Splits whose name contains ``test`` are written
    to the parent directory of ``data_dir``; other splits go into ``data_dir``.

    :param dataset: unified dataset passed to ``load_nlu_data``
    :param data_dir: output directory, created when missing
    :param args: parsed command-line arguments (not used by this task)
    :return: mapping from split name to the list of serialized lines
    """
    data_by_split = load_nlu_data(dataset, speaker='system', use_context=True, context_window_size=3)
    os.makedirs(data_dir, exist_ok=True)
    for split_name in list(data_by_split.keys()):
        lines = []
        for sample in tqdm(data_by_split[split_name], desc=f'{split_name} sample', leave=False):
            history = [(turn['speaker'], turn['utterance']) for turn in sample['context']]
            reply = sample['utterance']
            if len(history) == 0 or len(reply) == 0:
                continue
            record = {'context': history, 'knowledge': sample['dialogue_acts'], 'response': reply}
            lines.append(json.dumps(record, ensure_ascii=False) + '\n')
        target_dir = os.path.dirname(data_dir) if 'test' in split_name else data_dir
        with open(os.path.join(target_dir, f"{split_name}.json"), "w", encoding='utf-8') as f:
            f.writelines(lines)
        data_by_split[split_name] = lines
    return data_by_split
def create_kvret_data(dataset, data_dir, args):
    """Serialize system turns with their database results as JSON-lines files.

    Samples are obtained from ``load_unified_data`` (system speaker, context
    window of 100). In every database row the domain-specific entity column
    (``event`` / ``poi`` / ``location``) is renamed to ``entity``. A sample is
    kept only when both its context and its utterance are non-empty. Splits
    whose name contains ``test`` are written to the parent directory of
    ``data_dir``; other splits go into ``data_dir``.

    :param dataset: unified dataset passed to ``load_unified_data``
    :param data_dir: output directory, created when missing
    :param args: parsed command-line arguments (not used by this task)
    :return: mapping from split name to the list of serialized lines
    """
    data_by_split = load_unified_data(dataset, speaker='system', utterance=True, db_results=True, use_context=True, context_window_size=100)
    os.makedirs(data_dir, exist_ok=True)
    entity_column_of = {'schedule': 'event', 'navigate': 'poi', 'weather': 'location'}
    for split_name in list(data_by_split.keys()):
        lines = []
        for sample in tqdm(data_by_split[split_name], desc=f'{split_name} sample', leave=False):
            history = [(turn['speaker'], turn['utterance']) for turn in sample['context']]
            reply = sample['utterance']
            if len(history) == 0 or len(reply) == 0:
                continue
            db_results = sample['db_results']
            for domain, rows in db_results.items():
                entity_col = entity_column_of[domain]
                for row in rows:
                    # Normalize the per-domain entity column name in place.
                    row['entity'] = row.pop(entity_col)
            record = {'context': history, 'knowledge': db_results, 'response': reply}
            lines.append(json.dumps(record, ensure_ascii=False) + '\n')
        target_dir = os.path.dirname(data_dir) if 'test' in split_name else data_dir
        with open(os.path.join(target_dir, f"{split_name}.json"), "w", encoding='utf-8') as f:
            f.writelines(lines)
        data_by_split[split_name] = lines
    return data_by_split
def create_personachat_data(dataset, data_dir, args):
    """Serialize persona-grounded system responses as JSON-lines files.

    For every system turn that has a non-empty utterance and at least one
    preceding turn, one line is emitted holding the dialogue history, the
    system persona and the response. Splits whose name contains ``test`` are
    written to the parent directory of ``data_dir``; other splits go into
    ``data_dir``.

    The serialized lines are collected in a fresh dict, so ``dataset`` is left
    untouched and can be reused for another task by the caller.

    :param dataset: mapping from split name to a list of dialogues
    :param data_dir: output directory, created when missing
    :param args: parsed command-line arguments (not used by this task)
    :return: mapping from split name to the list of serialized lines
    """
    os.makedirs(data_dir, exist_ok=True)
    serialized_by_split = {}
    for data_split in dataset.keys():
        data = []
        for dial in tqdm(dataset[data_split], desc=f'{data_split} sample', leave=False):
            knowledge = dial['persona']['system']
            context = []
            for turn in dial['turns']:
                response = turn['utterance']
                if turn['speaker'] == 'system' and len(context) > 0 and len(response) > 0:
                    # json.dumps snapshots the history before the turn is appended below.
                    data.append(json.dumps({'context': context, 'knowledge': knowledge, 'response': response}, ensure_ascii=False)+'\n')
                context.append((turn['speaker'], turn['utterance']))
        if 'test' in data_split:
            file_name = os.path.join(os.path.dirname(data_dir), f"{data_split}.json")
        else:
            file_name = os.path.join(data_dir, f"{data_split}.json")
        with open(file_name, "w", encoding='utf-8') as f:
            f.writelines(data)
        serialized_by_split[data_split] = data
    return serialized_by_split
def create_wow_data(dataset, data_dir, args):
    """Serialize passage-grounded system responses as JSON-lines files.

    For every system turn that has a non-empty utterance and at least one
    preceding turn, one line is emitted holding the dialogue history, the
    turn's ``checked_passage`` (normalized to a list) and the response.
    Splits whose name contains ``test`` are written to the parent directory of
    ``data_dir``; other splits go into ``data_dir``.

    The serialized lines are collected in a fresh dict, so ``dataset`` is left
    untouched and can be reused for another task by the caller.

    :param dataset: mapping from split name to a list of dialogues
    :param data_dir: output directory, created when missing
    :param args: parsed command-line arguments (not used by this task)
    :return: mapping from split name to the list of serialized lines
    """
    os.makedirs(data_dir, exist_ok=True)
    serialized_by_split = {}
    for data_split in dataset.keys():
        data = []
        for dial in tqdm(dataset[data_split], desc=f'{data_split} sample', leave=False):
            context = []
            for turn in dial['turns']:
                response = turn['utterance']
                if turn['speaker'] == 'system' and len(context) > 0 and len(response) > 0:
                    knowledge = turn['checked_passage']
                    # Always store knowledge as a list: None -> [], str -> [str].
                    if knowledge is None:
                        knowledge = []
                    elif isinstance(knowledge, str):
                        knowledge = [knowledge]
                    data.append(json.dumps({'context': context, 'knowledge': knowledge, 'response': response}, ensure_ascii=False)+'\n')
                context.append((turn['speaker'], turn['utterance']))
        if 'test' in data_split:
            file_name = os.path.join(os.path.dirname(data_dir), f"{data_split}.json")
        else:
            file_name = os.path.join(data_dir, f"{data_split}.json")
        with open(file_name, "w", encoding='utf-8') as f:
            f.writelines(data)
        serialized_by_split[data_split] = data
    return serialized_by_split
def create_opendialkg_data(dataset, data_dir, args):
    """Serialize knowledge-graph-grounded system responses as JSON-lines files.

    For every system turn that carries a ``kg_path``, has a non-empty
    utterance and at least one preceding turn, one line is emitted holding the
    dialogue history, the ``kg_path`` triples and the response. Splits whose
    name contains ``test`` are written to the parent directory of
    ``data_dir``; other splits go into ``data_dir``.

    The serialized lines are collected in a fresh dict, so ``dataset`` is left
    untouched and can be reused for another task by the caller.

    :param dataset: mapping from split name to a list of dialogues
    :param data_dir: output directory, created when missing
    :param args: parsed command-line arguments (not used by this task)
    :return: mapping from split name to the list of serialized lines
    """
    os.makedirs(data_dir, exist_ok=True)
    serialized_by_split = {}
    for data_split in dataset.keys():
        data = []
        for dial in tqdm(dataset[data_split], desc=f'{data_split} sample', leave=False):
            context = []
            for turn in dial['turns']:
                response = turn['utterance']
                if turn['speaker'] == 'system' and 'kg_path' in turn and len(context) > 0 and len(response) > 0:
                    knowledge = turn['kg_path']['triples']
                    data.append(json.dumps({'context': context, 'knowledge': knowledge, 'response': response}, ensure_ascii=False)+'\n')
                context.append((turn['speaker'], turn['utterance']))
        if 'test' in data_split:
            file_name = os.path.join(os.path.dirname(data_dir), f"{data_split}.json")
        else:
            file_name = os.path.join(data_dir, f"{data_split}.json")
        with open(file_name, "w", encoding='utf-8') as f:
            f.writelines(data)
        serialized_by_split[data_split] = data
    return serialized_by_split
if __name__ == '__main__':
    # Command-line entry point: build the per-task JSON-lines files for every
    # requested dataset, optionally restricted to a few-shot subset.
    from argparse import ArgumentParser
    parser = ArgumentParser(description="create data for seq2seq training")
    parser.add_argument('--tasks', '-t', metavar='task_name', nargs='*', choices=['nlg', 'kvret', 'opendialkg', 'personachat', 'wow'], help='names of tasks')
    parser.add_argument('--datasets', '-d', metavar='dataset_name', nargs='*', help='names of unified datasets')
    parser.add_argument('--shot', '-s', type=float, default=None, help='how many data is used for training and evaluation, ratio if < 1 else absolute number')
    parser.add_argument('--dial_ids_order', '-o', type=int, default=None, help='which data order is used for experiments')
    args = parser.parse_args()
    print(args)
    for dataset_name in tqdm(args.datasets, desc='datasets'):
        dataset = load_dataset(dataset_name, dial_ids_order=args.dial_ids_order)
        if args.shot:
            if args.shot < 1:
                # Fractional shot: keep that ratio of train/validation dialogues.
                dataset['train'] = dataset['train'][:round(len(dataset['train'])*args.shot)]
                dataset['validation'] = dataset['validation'][:round(len(dataset['validation'])*args.shot)]
            else:
                # Absolute shot: keep the first N train/validation dialogues.
                args.shot = int(args.shot)
                dataset['train'] = dataset['train'][:args.shot]
                dataset['validation'] = dataset['validation'][:args.shot]
        for task_name in tqdm(args.tasks, desc='tasks', leave=False):
            data_dir = os.path.join('data', task_name, (dataset_name if not args.shot else f'{dataset_name}_{args.shot}shot_order{args.dial_ids_order}'))
            # Look the builder up by name instead of eval()-ing a constructed string.
            create_data = globals()[f'create_{task_name}_data']
            create_data(dataset, data_dir, args)
import os
import json
from tqdm import tqdm
from transformers import AutoTokenizer
from convlab.util import load_dataset, load_unified_data, load_nlu_data
from convlab.base_models.t5.nlu.serialization import serialize_dialogue_acts
from collections import Counter
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from string import punctuation
def create_nlg_data(dataset, data_dir, args):
    """Write dialogue-act-grounded generation samples as JSON-lines files.

    Samples are obtained from ``load_nlu_data`` for ``args.speaker`` with
    ``args.context_window_size`` context turns. The dialogue acts are rendered
    either as ``| intent-domain : slot : value | ...`` keyword groups
    (``args.key2gen``) or through ``serialize_dialogue_acts``. Files are
    written to ``data_dir/<speaker>/context_<window>/<split>.json``.

    :param dataset: unified dataset passed to ``load_nlu_data``
    :param data_dir: base output directory
    :param args: parsed arguments; reads ``speaker``, ``context_window_size`` and ``key2gen``
    :return: mapping from split name to the list of serialized lines
    """
    window = args.context_window_size
    data_by_split = load_nlu_data(dataset, speaker=args.speaker, use_context=window>0, context_window_size=window)
    data_dir = os.path.join(data_dir, args.speaker, f'context_{window}')
    os.makedirs(data_dir, exist_ok=True)
    for split_name in list(data_by_split.keys()):
        lines = []
        for sample in tqdm(data_by_split[split_name], desc=f'{split_name} sample', leave=False):
            acts = sample['dialogue_acts']
            if args.key2gen:
                keyword_groups = []
                for da_type in acts:
                    for da in acts[da_type]:
                        intent, domain, slot, value = da['intent'], da['domain'], da['slot'], da.get('value', '')
                        group = [f'{intent}-{domain}']
                        if len(slot) > 0:
                            group.append(slot)
                        if len(value) > 0:
                            group.append(value)
                        keyword_groups.append(group)
                acts_seq = '| {} |'.format(' | '.join([' : '.join(group) for group in keyword_groups]))
            else:
                acts_seq = serialize_dialogue_acts(acts)
            # Without a context window only the empty speaker prompt is emitted.
            history = sample['context'] if window > 0 else []
            turn_lines = [f"{turn['speaker']}: {turn['utterance']}" for turn in history]
            turn_lines.append(f'{sample["speaker"]}: ')
            dialogue = '\n'.join(turn_lines)
            model_input = f'generate a response: grounded knowledge: {acts_seq} context:\n\n{dialogue}'
            lines.append(json.dumps({'context+da': model_input, 'response': sample['utterance']}, ensure_ascii=False) + '\n')
        with open(os.path.join(data_dir, f"{split_name}.json"), "w", encoding='utf-8') as f:
            f.writelines(lines)
        data_by_split[split_name] = lines
    return data_by_split
def create_dart_data(dataset, data_dir, args):
    """Write triple-to-text samples as JSON-lines files.

    Only the first turn of every dialogue is used. With ``args.key2gen`` the
    triple elements are flattened into a keyword list and a synthetic user
    request about the most frequent non-bracketed entity is added; otherwise
    each triple is rendered as ``e1 : r : e2``.

    The serialized lines are collected in a fresh dict, so ``dataset`` is left
    untouched and can be reused for another task by the caller.

    :param dataset: mapping from split name to a list of dialogues
    :param data_dir: output directory, created when missing
    :param args: parsed arguments; reads ``key2gen``
    :return: mapping from split name to the list of serialized lines
    """
    os.makedirs(data_dir, exist_ok=True)
    serialized_by_split = {}
    for data_split in dataset.keys():
        data = []
        for dial in tqdm(dataset[data_split], desc=f'{data_split} sample', leave=False):
            sample = dial['turns'][0]
            triples = sample['tripleset']
            if args.key2gen:
                keywords = [w for triple in triples for w in triple]
                # DONE: try adding prompt, no improvement
                entity_cnt = Counter()
                for triple in triples:
                    e1, r, e2 = triple
                    for e in [e1, e2]:
                        # Entities wrapped in square brackets are not counted.
                        if e.startswith('[') and e.endswith(']'):
                            continue
                        entity_cnt[e] += 1
                assert len(entity_cnt) > 0
                common_entity = entity_cnt.most_common(1)[0][0]
                context = f'{" | ".join(keywords)}\n\ncontext: user: tell me something about {common_entity}. system: '
            else:
                triples = [' : '.join(triple) for triple in triples]
                context = f'{" | ".join(triples)}\n\ncontext: system: '
            data.append(json.dumps({'triples': context, 'text': sample['utterance']}, ensure_ascii=False)+'\n')
        file_name = os.path.join(data_dir, f"{data_split}.json")
        with open(file_name, "w", encoding='utf-8') as f:
            f.writelines(data)
        serialized_by_split[data_split] = data
    return serialized_by_split
def create_commongen_data(dataset, data_dir, args):
    """Write concept-to-text samples as JSON-lines files.

    Only the first turn of every dialogue is used; its concepts are joined
    with ``" | "`` and followed by the system prompt.

    The serialized lines are collected in a fresh dict, so ``dataset`` is left
    untouched and can be reused for another task by the caller.

    :param dataset: mapping from split name to a list of dialogues
    :param data_dir: output directory, created when missing
    :param args: parsed command-line arguments (not used by this task)
    :return: mapping from split name to the list of serialized lines
    """
    os.makedirs(data_dir, exist_ok=True)
    serialized_by_split = {}
    for data_split in dataset.keys():
        data = []
        for dial in tqdm(dataset[data_split], desc=f'{data_split} sample', leave=False):
            sample = dial['turns'][0]
            concepts = sample['concepts']
            context = f'{" | ".join(concepts)}\n\ncontext: system: '
            data.append(json.dumps({'concepts': context, 'text': sample['utterance']}, ensure_ascii=False)+'\n')
        file_name = os.path.join(data_dir, f"{data_split}.json")
        with open(file_name, "w", encoding='utf-8') as f:
            f.writelines(data)
        serialized_by_split[data_split] = data
    return serialized_by_split
def create_kvret_data(dataset, data_dir, args):
    """Write database-grounded generation samples as JSON-lines files.

    Samples are obtained from ``load_unified_data`` (system speaker, context
    window of 100); samples with an empty utterance are skipped. Every
    database row is flattened into ``entity : key : value`` strings, where the
    entity column depends on the domain (``event`` / ``poi`` / ``location``).

    :param dataset: unified dataset passed to ``load_unified_data``
    :param data_dir: output directory, created when missing
    :param args: parsed command-line arguments (not used by this task)
    :return: mapping from split name to the list of serialized lines
    """
    data_by_split = load_unified_data(dataset, speaker='system', utterance=True, db_results=True, use_context=True, context_window_size=100)
    os.makedirs(data_dir, exist_ok=True)
    entity_column_of = {'schedule': 'event', 'navigate': 'poi', 'weather': 'location'}
    for split_name in list(data_by_split.keys()):
        lines = []
        for sample in tqdm(data_by_split[split_name], desc=f'{split_name} sample', leave=False):
            if len(sample['utterance']) == 0:
                continue
            facts = []
            for domain, rows in sample['db_results'].items():
                entity_col = entity_column_of[domain]
                for row in rows:
                    entity = row[entity_col]
                    # One "entity : key : value" string per non-entity column.
                    facts.extend(' : '.join([entity, key, value]) for key, value in row.items() if key != entity_col)
            db_seq = ' |\n'.join(facts)
            turn_lines = [f"{turn['speaker']}: {turn['utterance']}" for turn in sample['context']]
            turn_lines.append(f'{sample["speaker"]}: ')
            dialogue = '\n'.join(turn_lines)
            model_input = f'generate a response: all knowledge:\n\n| {db_seq} | context:\n\n{dialogue}'
            lines.append(json.dumps({'context+db': model_input, 'response': sample['utterance']}, ensure_ascii=False) + '\n')
        with open(os.path.join(data_dir, f"{split_name}.json"), "w", encoding='utf-8') as f:
            f.writelines(lines)
        data_by_split[split_name] = lines
    return data_by_split
def create_personachat_data(dataset, data_dir, args):
    """Write persona-grounded generation samples as JSON-lines files.

    One sample is emitted per system turn. The system persona is rendered
    either as keyword groups per sentence (``args.key2gen``) or as the raw
    sentences joined with ``" | "``.

    The serialized lines are collected in a fresh dict, so ``dataset`` is left
    untouched and can be reused for another task by the caller.

    :param dataset: mapping from split name to a list of dialogues
    :param data_dir: output directory, created when missing
    :param args: parsed arguments; reads ``key2gen``
    :return: mapping from split name to the list of serialized lines
    """
    os.makedirs(data_dir, exist_ok=True)
    stop_words = set(stopwords.words('english')) | set(punctuation)

    def sentence2keywords(sentence):
        """Return runs of adjacent non-stop-word tokens, each run joined by a space."""
        keywords = []
        previous_index = None
        for index, word in enumerate(word_tokenize(sentence)):
            if word.lower() in stop_words:
                continue
            if previous_index is not None and index == previous_index + 1:
                # Directly follows the previous keyword token: extend that run.
                keywords[-1] += ' ' + word
            else:
                keywords.append(word)
            previous_index = index
        return keywords

    serialized_by_split = {}
    for data_split in dataset.keys():
        data = []
        for dial in tqdm(dataset[data_split], desc=f'{data_split} sample', leave=False):
            persona = dial['persona']['system']
            if args.key2gen:
                persona_seq = ' |\n'.join([' : '.join(sentence2keywords(s)) for s in persona])
            else:
                persona_seq = ' | '.join(persona)
            context = []
            for turn in dial['turns']:
                if turn['speaker'] == 'system':
                    context_seq = '\n'.join([f"{t['speaker']}: {t['utterance']}" for t in context]+[f'{turn["speaker"]}: '])
                    context_seq = f'generate a response: all knowledge:\n\n| {persona_seq} | context:\n\n{context_seq}'
                    data.append(json.dumps({'context+persona': context_seq, 'response': turn['utterance']}, ensure_ascii=False)+'\n')
                context.append({'speaker': turn['speaker'], 'utterance': turn['utterance']})
        file_name = os.path.join(data_dir, f"{data_split}.json")
        with open(file_name, "w", encoding='utf-8') as f:
            f.writelines(data)
        serialized_by_split[data_split] = data
    return serialized_by_split
def create_wow_data(dataset, data_dir, args):
    """Write sentence-grounded generation samples as JSON-lines files.

    One sample is emitted per system turn. When the turn has a
    ``checked_sentence`` it is used as grounded knowledge, either verbatim or
    (``args.key2gen``) reduced to keyword groups per sentence; otherwise the
    knowledge part is empty.

    The serialized lines are collected in a fresh dict, so ``dataset`` is left
    untouched and can be reused for another task by the caller.

    :param dataset: mapping from split name to a list of dialogues
    :param data_dir: output directory, created when missing
    :param args: parsed arguments; reads ``key2gen``
    :return: mapping from split name to the list of serialized lines
    """
    os.makedirs(data_dir, exist_ok=True)
    stop_words = set(stopwords.words('english')) | set(punctuation)

    def sentence2keywords(sentence):
        """Return runs of adjacent non-stop-word tokens, each run joined by a space."""
        keywords = []
        previous_index = None
        for index, word in enumerate(word_tokenize(sentence)):
            if word.lower() in stop_words:
                continue
            if previous_index is not None and index == previous_index + 1:
                # Directly follows the previous keyword token: extend that run.
                keywords[-1] += ' ' + word
            else:
                keywords.append(word)
            previous_index = index
        return keywords

    def sentences2keywords_seq(sentences):
        """Render every sentence of the text as a ``" : "``-joined keyword group."""
        return ' |\n'.join([' : '.join(sentence2keywords(sentence)) for sentence in sent_tokenize(sentences)])

    serialized_by_split = {}
    for data_split in dataset.keys():
        data = []
        for dial in tqdm(dataset[data_split], desc=f'{data_split} sample', leave=False):
            context = []
            for turn in dial['turns']:
                if turn['speaker'] == 'system':
                    if turn['checked_sentence']:
                        if args.key2gen:
                            know_seq = f" | {sentences2keywords_seq(turn['checked_sentence'])} |"
                        else:
                            know_seq = turn['checked_sentence']
                    else:
                        know_seq = ''
                    context_seq = '\n'.join([f"{t['speaker']}: {t['utterance']}" for t in context]+[f'{turn["speaker"]}: '])
                    context_seq = f'generate a response: grounded knowledge:\n\n{know_seq} context:\n\n{context_seq}'
                    data.append(json.dumps({'context+knowledge': context_seq, 'response': turn['utterance']}, ensure_ascii=False)+'\n')
                context.append({'speaker': turn['speaker'], 'utterance': turn['utterance']})
        file_name = os.path.join(data_dir, f"{data_split}.json")
        with open(file_name, "w", encoding='utf-8') as f:
            f.writelines(data)
        serialized_by_split[data_split] = data
    return serialized_by_split
def get_max_len(data_by_split, tokenizer):
    """Print the maximum and mean token length of every column, per split.

    :param data_by_split: mapping from split name to a list of JSON-encoded lines
    :param tokenizer: object whose ``tokenize(text)`` returns a sequence of tokens
    :return: None; the statistics are printed to stdout
    """
    for split_name in data_by_split.keys():
        lengths_by_column = {}
        for line in data_by_split[split_name]:
            record = json.loads(line.strip())
            for column, text in record.items():
                lengths_by_column.setdefault(column, []).append(len(tokenizer.tokenize(text)))
        print(f"data split: {split_name}")
        for column, lengths in lengths_by_column.items():
            longest = max(lengths)
            average = round(sum(lengths)/len(lengths), 2)
            print(f'\t{column}\tmax_len: {longest}\tmean_len: {average}')
if __name__ == '__main__':
    # Command-line entry point: build the per-task JSON-lines files for every
    # requested dataset and optionally report tokenized sequence lengths.
    from argparse import ArgumentParser
    parser = ArgumentParser(description="create data for seq2seq training")
    parser.add_argument('--tasks', '-t', metavar='task_name', nargs='*', choices=['nlg', 'dart', 'commongen', 'kvret', 'personachat', 'wow'], help='names of tasks')
    parser.add_argument('--datasets', '-d', metavar='dataset_name', nargs='*', help='names of unified datasets')
    parser.add_argument('--speaker', '-s', type=str, choices=['user', 'system', 'all'], help='speaker(s)')
    parser.add_argument('--context_window_size', '-c', type=int, default=0, help='how many contextual utterances are considered')
    parser.add_argument('--len_tokenizer', '-l', type=str, default=None, help='name or path of tokenizer that used to get seq len')
    parser.add_argument('--ratio', '-r', type=float, default=None, help='how many data is used for training and evaluation')
    parser.add_argument('--dial_ids_order', '-o', type=int, default=None, help='which data order is used for experiments')
    parser.add_argument('--key2gen', '-k', action='store_true', default=False, help='generate data for key2gen models')
    args = parser.parse_args()
    print(args)
    if args.len_tokenizer:
        tokenizer = AutoTokenizer.from_pretrained(args.len_tokenizer)
    for dataset_name in tqdm(args.datasets, desc='datasets'):
        dataset = load_dataset(dataset_name, args.dial_ids_order)
        if args.ratio:
            # Keep only the leading fraction of the train/validation dialogues.
            dataset['train'] = dataset['train'][:round(len(dataset['train'])*args.ratio)]
            dataset['validation'] = dataset['validation'][:round(len(dataset['validation'])*args.ratio)]
        # Directory name shared by all tasks of this dataset.
        dataset_tag = dataset_name if not args.ratio else f'{dataset_name}_{args.ratio}_order{args.dial_ids_order}'
        if args.key2gen:
            dataset_tag = "key2gen_" + dataset_tag
        for task_name in tqdm(args.tasks, desc='tasks', leave=False):
            data_dir = os.path.join('data', task_name, dataset_tag)
            # Look the builder up by name instead of eval()-ing a constructed string.
            create_data = globals()[f'create_{task_name}_data']
            data_by_split = create_data(dataset, data_dir, args)
            if args.len_tokenizer:
                get_max_len(data_by_split, tokenizer)
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Data processing for vanilla generator"""
import json
import datasets
from convlab.base_models.t5.key2gen.features import FEATURES
from copy import deepcopy
class VanillaDataset(datasets.GeneratorBasedBuilder):
    """Dataset for vanilla generator (e.g., t5)"""

    VERSION = datasets.Version("1.18.0")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(name="nlg", version=VERSION, description="DA grounded generation task"),
        datasets.BuilderConfig(name="kvret", version=VERSION, description="KB grounded generation task"),
        datasets.BuilderConfig(name="opendialkg", version=VERSION, description="KG grounded generation task"),
        datasets.BuilderConfig(name="wow", version=VERSION, description="Passage grounded generation task"),
        datasets.BuilderConfig(name="personachat", version=VERSION, description="Persona grounded generation task"),
    ]

    def _info(self):
        """Describe the dataset using the feature schema of the selected config."""
        return datasets.DatasetInfo(
            description=f"Vanilla Dataset for {self.config.description}",
            features=datasets.Features(deepcopy(FEATURES[self.config.name]))
        )

    def _split_generators(self, dl_manager):
        """Create one generator per split present in ``config.data_files``.

        Only the first file listed for each split is read.
        """
        split_names = (
            ("train", datasets.Split.TRAIN),
            ("validation", datasets.Split.VALIDATION),
            ("test", datasets.Split.TEST),
        )
        return [
            datasets.SplitGenerator(
                name=split,
                gen_kwargs={
                    "filepath": self.config.data_files[key][0],
                    "split": key,
                },
            )
            for key, split in split_names
            if key in self.config.data_files
        ]

    def _generate_examples(self, filepath, split):
        """Yield ``(line index, example)`` pairs from a JSON-lines file.

        Each example holds the serialized knowledge plus dialogue context
        (``context+knowledge``), the reference ``response`` and the structured
        ``knowledge``. How the knowledge is serialized depends on the config name.
        """
        with open(filepath, encoding="utf-8") as f:
            for key, row in enumerate(f):
                item = json.loads(row)
                if self.config.name == "nlg":
                    knowledge = item["knowledge"]
                    triples = []
                    for da_type in knowledge:
                        for da in knowledge[da_type]:
                            intent, domain, slot, value = da["intent"], da["domain"], da["slot"], da.get("value", "")
                            # Drop span offsets from the yielded knowledge; either key
                            # may be absent, so removal must not raise.
                            da.pop("start", None)
                            da.pop("end", None)
                            intent_domain = f"{intent}-{domain}"
                            triples.append([intent_domain])
                            if len(slot) > 0:
                                triples[-1].append(slot)
                            if len(value) > 0:
                                triples[-1].append(value)
                    knowledge_seq = "| {} |".format(" | ".join([" : ".join(da_keywords) for da_keywords in triples]))
                elif self.config.name == "kvret":
                    # Every domain is present in the output even when it has no rows.
                    knowledge = {"schedule": [], "weather": [], "navigate": []}
                    triples = []
                    for domain, db_items in item["knowledge"].items():
                        knowledge[domain] = db_items
                        for db_item in db_items:
                            entity = db_item["entity"]
                            for db_key, db_value in db_item.items():
                                if db_key == "entity":
                                    continue
                                triples.append([entity, db_key, db_value])
                    knowledge_seq = "| {} |".format(" | ".join([" : ".join(triple) for triple in triples]))
                elif self.config.name == "opendialkg":
                    knowledge = item["knowledge"]
                    knowledge_seq = "| {} |".format(" | ".join([" : ".join(triple) for triple in item["knowledge"]]))
                elif self.config.name in ["wow", "personachat"]:
                    knowledge = item["knowledge"]
                    try:
                        knowledge_seq = "| {} |".format(" | ".join(item["knowledge"]))
                    except TypeError:
                        # Show the offending value (not an iterable of strings), then re-raise.
                        print([knowledge])
                        raise
                context = "\n".join([f"{turn[0]}: {turn[1]}" for turn in item["context"]]+["system: "])
                context_knowledge = f"generate a response: grounded knowledge: \n\n{knowledge_seq} context:\n\n{context}"
                yield key, {
                    "context+knowledge": context_knowledge,
                    "response": item["response"],
                    "knowledge": knowledge,
                }
from tabulate import tabulate
import os
import json
from tqdm import tqdm
from datasets import load_metric
import numpy as np
def evaluate(filename, metric):
    """
    It reads the predictions, references, and knowledge from a JSON-lines file, and then computes the metric.
    The file is decoded as UTF-8 and blank lines are skipped.
    :param filename: the path to the file containing the predictions
    :param metric: the metric to use for evaluation; its ``compute`` method is called with
        ``predictions``, ``references`` and ``knowledge`` keyword arguments
    :return: The result of the evaluation.
    """
    predictions, references, knowledge = [], [], []
    # Decode explicitly as UTF-8 rather than relying on the platform default encoding.
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # A blank (e.g. trailing) line is not a record.
                continue
            item = json.loads(line)
            predictions.append(item['predictions'])
            references.append(item['response'])
            knowledge.append(item['knowledge'])
    result = metric.compute(predictions=predictions, references=references, knowledge=knowledge)
    return result
def avg_result(results):
    """
    Summarize several runs: for every key of the first result dictionary, format the mean over all
    runs (rounded to 2 decimals) followed by the sample standard deviation in parentheses.
    With a single run the standard deviation is reported as ``None``.
    :param results: a list of dictionaries, each dictionary is the result of a single run of the model
    :return: a dictionary mapping each key to the string ``"<mean>(<std>)"``
    """
    has_spread = len(results) > 1
    summary = {}
    for key in results[0]:
        values = [run[key] for run in results]
        mean = round(np.mean(values), 2)
        # ddof=1: sample standard deviation, undefined for a single run.
        spread = round(np.std(values, ddof=1), 2) if has_spread else None
        summary[key] = f"{mean}({spread})"
    return summary
if __name__ == '__main__':
    # Command-line entry point: evaluate the generated predictions of every
    # model / task / shot combination, average over data orders and write a
    # GitHub-style table to eval_results.txt.
    from argparse import ArgumentParser
    parser = ArgumentParser(description="evaluate generated predictions and tabulate the averaged metrics")
    parser.add_argument("--output_dirs", type=str, nargs='*', required=True)
    parser.add_argument('--tasks', '-t', type=str, nargs='*', choices=['nlg', 'kvret', 'opendialkg', 'personachat', 'wow'], help='names of tasks')
    # NOTE(review): the help text mentions ratios but type=int only accepts absolute numbers.
    parser.add_argument('--shots', '-s', type=int, nargs='*', help='how many data is used for training and evaluation, ratio if < 1 else absolute number')
    parser.add_argument('--dial_ids_orders', '-o', type=int, nargs='*', help='which data order is used for experiments')
    args = parser.parse_args()
    print(args)
    tables = []
    for task_name in tqdm(args.tasks, desc='tasks'):
        metric = load_metric("metric.py", task_name)
        # The nlg task is evaluated on multiwoz21; other tasks share their dataset's name.
        dataset_name = task_name if task_name != "nlg" else "multiwoz21"
        for shot in tqdm(args.shots, desc='shots'):
            for output_dir in tqdm(args.output_dirs, desc='models'):
                model_name = output_dir.split('/')[-1]
                if task_name == "wow":
                    # wow predictions are stored separately in gen_seen / gen_unseen.
                    test_splits = ["_seen", "_unseen"]
                else:
                    test_splits = [""]
                for test_split in test_splits:
                    results = []
                    for dial_ids_order in tqdm(args.dial_ids_orders, desc='dial_ids_orders'):
                        filename = os.path.join(output_dir, task_name, f"{dataset_name}_{shot}shot_order{dial_ids_order}/gen{test_split}/generated_predictions.json")
                        results.append(evaluate(filename, metric))
                    row = {
                        "dataset": f"{task_name}-{shot}shot",
                        "model": f"{model_name}{test_split}",
                        **avg_result(results)
                    }
                    tables.append(row)
    table_text = tabulate(tables, headers='keys', tablefmt='github')
    with open('eval_results.txt', 'w', encoding='utf-8') as f:
        print(table_text, file=f)
import datasets
def _string_value():
    """Return a fresh string-typed ``datasets.Value``."""
    return datasets.Value("string")


def _string_record_sequence(*field_names):
    """Return a ``datasets.Sequence`` of records whose fields, in the given order, are all strings."""
    return datasets.Sequence({field: _string_value() for field in field_names})


def _task_schema(knowledge):
    """Return one task's schema: the two string columns plus the given ``knowledge`` feature."""
    return {
        "context+knowledge": _string_value(),
        "response": _string_value(),
        "knowledge": knowledge,
    }


# Feature schema per task name. Every task has the string columns
# "context+knowledge" and "response"; only the "knowledge" structure differs.
FEATURES = {
    "nlg": _task_schema({
        "categorical": _string_record_sequence("intent", "domain", "slot", "value"),
        "non-categorical": _string_record_sequence("intent", "domain", "slot", "value"),
        # binary entries carry no "value" field
        "binary": _string_record_sequence("intent", "domain", "slot"),
    }),
    "kvret": _task_schema({
        "schedule": _string_record_sequence(
            "entity", "time", "date", "party", "room", "agenda",
        ),
        "weather": _string_record_sequence(
            "entity", "today", "monday", "tuesday", "wednesday",
            "thursday", "friday", "saturday", "sunday",
        ),
        "navigate": _string_record_sequence(
            "entity", "traffic_info", "poi_type", "address", "distance",
        ),
    }),
    # a sequence of string sequences
    "opendialkg": _task_schema(datasets.Sequence(datasets.Sequence(_string_value()))),
    # a flat sequence of strings
    "wow": _task_schema(datasets.Sequence(_string_value())),
    "personachat": _task_schema(datasets.Sequence(_string_value())),
}
\ No newline at end of file
# Few-shot fine-tuning sweep: for every dataset, shot count and dialogue-id
# order this script builds the data, trains ${model_name_or_path}, generates
# predictions on the test file(s), and finally scores all runs with evaluate.py.

# Stop at the first failing command; otherwise a failed data-creation or
# training step would silently flow into inference and the final evaluation.
set -e

n_gpus=2
master_port=23456
cache_dir="../cache"
dataset_path="dataset_vanilla.py"
metric_name_or_path="metric.py"
source_column="context+knowledge"
target_column="response"
truncation_side="left"
max_source_length=512
max_target_length=512
model_name="t5-small"
model_name_or_path="t5-small"
per_device_train_batch_size=64
per_device_eval_batch_size=64
gradient_accumulation_steps=1
num_workers=16
lr=1e-3
num_train_epochs=100
# Swept values; shared by the loops below and by the final evaluate.py call so
# the two cannot drift apart.
shots="50 100 200"
dial_ids_orders="0 1 2 3 4"

# Launch ../run_seq2seq.py on ${n_gpus} processes with the options shared by
# training and inference; run-specific options are given as arguments and
# appended after the shared ones.
# Reads ${task_name} and ${logging_dir} as set by the enclosing loops.
run_seq2seq() {
    python -m torch.distributed.launch --master_port ${master_port} \
        --nproc_per_node ${n_gpus} ../run_seq2seq.py \
        --task_name ${task_name} \
        --dataset_name ${dataset_path} \
        --dataset_config_name ${task_name} \
        --source_column ${source_column} \
        --target_column ${target_column} \
        --max_source_length ${max_source_length} \
        --max_target_length ${max_target_length} \
        --truncation_side ${truncation_side} \
        --overwrite_output_dir \
        --cache_dir ${cache_dir} \
        --logging_dir ${logging_dir} \
        --preprocessing_num_workers ${num_workers} \
        --dataloader_num_workers ${num_workers} \
        --per_device_train_batch_size ${per_device_train_batch_size} \
        --per_device_eval_batch_size ${per_device_eval_batch_size} \
        --gradient_accumulation_steps ${gradient_accumulation_steps} \
        --learning_rate ${lr} \
        --num_train_epochs ${num_train_epochs} \
        --optim adafactor \
        --lr_scheduler_type constant \
        --gradient_checkpointing \
        "$@"
}

for dataset_name in multiwoz21 kvret opendialkg wow personachat
do
    # multiwoz21 is run under the task name "nlg"; every other dataset is its own task
    if [ "${dataset_name}" == "multiwoz21" ]
    then
        task_name="nlg"
    else
        task_name=${dataset_name}
    fi
    for shot in ${shots}
    do
        for dial_ids_order in ${dial_ids_orders}
        do
            python create_data.py -t ${task_name} -d ${dataset_name} -o ${dial_ids_order} -s ${shot}

            data_dir="data/${task_name}/${dataset_name}_${shot}shot_order${dial_ids_order}"
            output_dir="output/${model_name}/${task_name}/${dataset_name}_${shot}shot_order${dial_ids_order}"
            logging_dir="${output_dir}/runs"
            train_file="${data_dir}/train.json"
            validation_file="${data_dir}/validation.json"

            # training
            run_seq2seq \
                --train_file ${train_file} \
                --validation_file ${validation_file} \
                --model_name_or_path ${model_name_or_path} \
                --do_train \
                --do_eval \
                --save_strategy epoch \
                --evaluation_strategy epoch \
                --save_total_limit 1 \
                --prediction_loss_only \
                --load_best_model_at_end \
                --output_dir ${output_dir}

            # inference, using the model saved in ${output_dir}
            if [ "${dataset_name}" == "wow" ]
            then
                # wow has two test files (seen / unseen); as in the original
                # script, no metric options are passed for these runs
                for test_split in seen unseen
                do
                    test_file="data/${task_name}/test_${test_split}.json"
                    gen_output_dir="${output_dir}/gen_${test_split}"
                    run_seq2seq \
                        --test_file ${test_file} \
                        --model_name_or_path ${output_dir} \
                        --do_predict \
                        --predict_with_generate \
                        --output_dir ${gen_output_dir}
                done
            else
                test_file="data/${task_name}/test.json"
                gen_output_dir="${output_dir}/gen"
                run_seq2seq \
                    --metric_name_or_path ${metric_name_or_path} \
                    --metric_config_name ${task_name} \
                    --test_file ${test_file} \
                    --model_name_or_path ${output_dir} \
                    --do_predict \
                    --predict_with_generate \
                    --output_dir ${gen_output_dir}
            fi
        done
    done
done

# evaluation (${shots} / ${dial_ids_orders} are left unquoted on purpose so they split into separate values)
python evaluate.py --output_dirs output/${model_name} -t nlg kvret opendialkg personachat wow -s ${shots} -o ${dial_ids_orders}
\ No newline at end of file
This diff is collapsed.
# Call run_persona_fewshot_key2gen.sh once per (ratio, dial_ids_order) pair,
# passing the two values as its positional arguments.
for ratio in 0.1 0.01; do
    for dial_ids_order in 0 1 2; do
        bash run_persona_fewshot_key2gen.sh "${ratio}" "${dial_ids_order}"
    done
done
\ No newline at end of file
# Call run_wow_fewshot_key2gen.sh once per (ratio, dial_ids_order) pair,
# passing the two values as its positional arguments.
for ratio in 0.1 0.01; do
    for dial_ids_order in 0 1 2; do
        bash run_wow_fewshot_key2gen.sh "${ratio}" "${dial_ids_order}"
    done
done
\ No newline at end of file
# commongen pipeline: create the data, fine-tune ${model_name_or_path}, generate
# predictions with the fine-tuned model, then merge and evaluate the predictions.

# --- task and data locations ---
n_gpus=1
task_name="commongen"
dataset_name="commongen"
speaker="system"
context_window_size=0
data_dir="data/${task_name}/${dataset_name}"
output_dir="output/${task_name}/${dataset_name}"
cache_dir="../cache"
logging_dir="${output_dir}/runs"
train_file="${data_dir}/train.json"
validation_file="${data_dir}/validation.json"
# predictions are generated on the validation file
test_file="${data_dir}/validation.json"
metric_name_or_path="../nlg/nlg_metric.py"
metric_for_best_model="bleu"

# --- input/output columns and lengths ---
source_column="concepts"
target_column="text"
truncation_side="left"
max_source_length=512
max_target_length=512

# --- model and optimisation settings ---
model_name_or_path="t5-small"
per_device_train_batch_size=128
per_device_eval_batch_size=64
gradient_accumulation_steps=4
lr=1e-3
num_train_epochs=10

# step 1: data creation
python create_data_key2gen.py -t "${task_name}" -d "${dataset_name}" -s "${speaker}" -c "${context_window_size}"

# step 2: training
python ../run_seq2seq.py \
    --task_name "${task_name}" \
    --train_file "${train_file}" \
    --validation_file "${validation_file}" \
    --source_column "${source_column}" \
    --target_column "${target_column}" \
    --max_source_length "${max_source_length}" \
    --max_target_length "${max_target_length}" \
    --truncation_side "${truncation_side}" \
    --model_name_or_path "${model_name_or_path}" \
    --do_train \
    --do_eval \
    --save_strategy epoch \
    --evaluation_strategy epoch \
    --save_total_limit 3 \
    --prediction_loss_only \
    --cache_dir "${cache_dir}" \
    --output_dir "${output_dir}" \
    --logging_dir "${logging_dir}" \
    --overwrite_output_dir \
    --preprocessing_num_workers 4 \
    --per_device_train_batch_size "${per_device_train_batch_size}" \
    --per_device_eval_batch_size "${per_device_eval_batch_size}" \
    --gradient_accumulation_steps "${gradient_accumulation_steps}" \
    --learning_rate "${lr}" \
    --num_train_epochs "${num_train_epochs}" \
    --adafactor \
    --gradient_checkpointing

# step 3: prediction with the model written to ${output_dir}
python ../run_seq2seq.py \
    --task_name "${task_name}" \
    --test_file "${test_file}" \
    --source_column "${source_column}" \
    --target_column "${target_column}" \
    --max_source_length "${max_source_length}" \
    --max_target_length "${max_target_length}" \
    --truncation_side "${truncation_side}" \
    --model_name_or_path "${output_dir}" \
    --do_predict \
    --predict_with_generate \
    --metric_name_or_path "${metric_name_or_path}" \
    --cache_dir "${cache_dir}" \
    --output_dir "${output_dir}" \
    --logging_dir "${logging_dir}" \
    --overwrite_output_dir \
    --preprocessing_num_workers 4 \
    --per_device_train_batch_size "${per_device_train_batch_size}" \
    --per_device_eval_batch_size "${per_device_eval_batch_size}" \
    --gradient_accumulation_steps "${gradient_accumulation_steps}" \
    --learning_rate "${lr}" \
    --num_train_epochs "${num_train_epochs}" \
    --adafactor \
    --gradient_checkpointing

# step 4: merge the generated predictions and evaluate them
python ../nlg/merge_predict_res.py -d "${dataset_name}" -s "${speaker}" -c "${context_window_size}" -p "${output_dir}/generated_predictions.json"

python ../../../nlg/evaluate_unified_datasets.py -p "${output_dir}/predictions.json" --dataset_name "${dataset_name}"
# commongen key2gen pipeline: create the key2gen data, fine-tune the checkpoint
# at ${model_name_or_path}, generate predictions with the fine-tuned model,
# then merge and evaluate the predictions.
n_gpus=1
task_name="commongen"
dataset_name="commongen"
speaker="system"
context_window_size=0
data_dir="data/${task_name}/key2gen_${dataset_name}"
output_dir="output/${task_name}/key2gen_${dataset_name}"
cache_dir="../cache"
logging_dir="${output_dir}/runs"
train_file="${data_dir}/train.json"
validation_file="${data_dir}/validation.json"
# predictions are generated on the validation file
test_file="${data_dir}/validation.json"
metric_name_or_path="../nlg/nlg_metric.py"
metric_for_best_model="bleu"
source_column="concepts"
target_column="text"
# The prefix ends with a space, so it must always be expanded inside double
# quotes; unquoted, word splitting would drop the trailing space.
source_prefix="keywords: "
truncation_side="left"
max_source_length=512
max_target_length=512
model_name_or_path="../../gpt/keyword_extraction/output/key2gen_shuffle_noisy/gpt/dailydialog+metalwoz+sgd+tm1+tm2+tm3"
per_device_train_batch_size=128
per_device_eval_batch_size=64
gradient_accumulation_steps=4
lr=1e-3
num_train_epochs=10

# data creation
python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -s ${speaker} -c ${context_window_size} --key2gen

# training
python ../run_seq2seq.py \
    --task_name ${task_name} \
    --train_file ${train_file} \
    --validation_file ${validation_file} \
    --source_column ${source_column} \
    --target_column ${target_column} \
    --source_prefix "${source_prefix}" \
    --max_source_length ${max_source_length} \
    --max_target_length ${max_target_length} \
    --truncation_side ${truncation_side} \
    --model_name_or_path ${model_name_or_path} \
    --do_train \
    --do_eval \
    --save_strategy epoch \
    --evaluation_strategy epoch \
    --save_total_limit 3 \
    --prediction_loss_only \
    --cache_dir ${cache_dir} \
    --output_dir ${output_dir} \
    --logging_dir ${logging_dir} \
    --overwrite_output_dir \
    --preprocessing_num_workers 4 \
    --per_device_train_batch_size ${per_device_train_batch_size} \
    --per_device_eval_batch_size ${per_device_eval_batch_size} \
    --gradient_accumulation_steps ${gradient_accumulation_steps} \
    --learning_rate ${lr} \
    --num_train_epochs ${num_train_epochs} \
    --adafactor \
    --gradient_checkpointing

# prediction with the model written to ${output_dir}; same quoted prefix as in training
python ../run_seq2seq.py \
    --task_name ${task_name} \
    --test_file ${test_file} \
    --source_column ${source_column} \
    --target_column ${target_column} \
    --source_prefix "${source_prefix}" \
    --max_source_length ${max_source_length} \
    --max_target_length ${max_target_length} \
    --truncation_side ${truncation_side} \
    --model_name_or_path ${output_dir} \
    --do_predict \
    --predict_with_generate \
    --metric_name_or_path ${metric_name_or_path} \
    --cache_dir ${cache_dir} \
    --output_dir ${output_dir} \
    --logging_dir ${logging_dir} \
    --overwrite_output_dir \
    --preprocessing_num_workers 4 \
    --per_device_train_batch_size ${per_device_train_batch_size} \
    --per_device_eval_batch_size ${per_device_eval_batch_size} \
    --gradient_accumulation_steps ${gradient_accumulation_steps} \
    --learning_rate ${lr} \
    --num_train_epochs ${num_train_epochs} \
    --adafactor \
    --gradient_checkpointing

# merge the generated predictions and evaluate them
python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json

python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
# dart script. In its current state only the prediction step is active: data
# creation, training, and the merge/evaluate steps are commented out, and
# prediction loads ${model_name_or_path} (t5-small) directly rather than a
# model from ${output_dir}.
n_gpus=1
task_name="dart"
dataset_name="dart"
speaker="system"
context_window_size=0
# NOTE(review): data_dir carries the key2gen_ prefix while output_dir does not - confirm this is intended.
data_dir="data/${task_name}/key2gen_${dataset_name}"
output_dir="output/${task_name}/${dataset_name}"
cache_dir="../cache"
logging_dir="${output_dir}/runs"
# train_file / validation_file are only referenced by the commented-out training command
train_file="${data_dir}/train.json"
validation_file="${data_dir}/validation.json"
test_file="${data_dir}/test.json"
metric_name_or_path="../nlg/nlg_metric.py"
metric_for_best_model="bleu"
source_column="triples"
target_column="text"
truncation_side="left"
max_source_length=512
max_target_length=512
model_name_or_path="t5-small"
per_device_train_batch_size=128
per_device_eval_batch_size=64
gradient_accumulation_steps=4
lr=1e-3
num_train_epochs=10
# Disabled: data creation and training.
# python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -s ${speaker} -c ${context_window_size}
# python ../run_seq2seq.py \
# --task_name ${task_name} \
# --train_file ${train_file} \
# --validation_file ${validation_file} \
# --source_column ${source_column} \
# --target_column ${target_column} \
# --max_source_length ${max_source_length} \
# --max_target_length ${max_target_length} \
# --truncation_side ${truncation_side} \
# --model_name_or_path ${model_name_or_path} \
# --do_train \
# --do_eval \
# --save_strategy epoch \
# --evaluation_strategy epoch \
# --save_total_limit 3 \
# --prediction_loss_only \
# --cache_dir ${cache_dir} \
# --output_dir ${output_dir} \
# --logging_dir ${logging_dir} \
# --overwrite_output_dir \
# --preprocessing_num_workers 4 \
# --per_device_train_batch_size ${per_device_train_batch_size} \
# --per_device_eval_batch_size ${per_device_eval_batch_size} \
# --gradient_accumulation_steps ${gradient_accumulation_steps} \
# --learning_rate ${lr} \
# --num_train_epochs ${num_train_epochs} \
# --adafactor \
# --gradient_checkpointing
# Active: generate predictions for ${test_file} with ${model_name_or_path}; output goes to ${output_dir}.
python ../run_seq2seq.py \
--task_name ${task_name} \
--test_file ${test_file} \
--source_column ${source_column} \
--target_column ${target_column} \
--max_source_length ${max_source_length} \
--max_target_length ${max_target_length} \
--truncation_side ${truncation_side} \
--model_name_or_path ${model_name_or_path} \
--do_predict \
--predict_with_generate \
--metric_name_or_path ${metric_name_or_path} \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_train_batch_size ${per_device_train_batch_size} \
--per_device_eval_batch_size ${per_device_eval_batch_size} \
--gradient_accumulation_steps ${gradient_accumulation_steps} \
--learning_rate ${lr} \
--num_train_epochs ${num_train_epochs} \
--adafactor \
--gradient_checkpointing
# Disabled: merging and evaluating the generated predictions.
# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
# dart key2gen script. In its current state only the prediction step is active:
# data creation, training, and the merge/evaluate steps are commented out, and
# prediction loads the checkpoint at ${model_name_or_path} directly rather than
# a model from ${output_dir}.
n_gpus=1
task_name="dart"
dataset_name="dart"
speaker="system"
context_window_size=0
# NOTE(review): output_dir carries the key2gen_ prefix while data_dir does not - confirm this is intended.
data_dir="data/${task_name}/${dataset_name}"
output_dir="output/${task_name}/key2gen_${dataset_name}"
cache_dir="../cache"
logging_dir="${output_dir}/runs"
# train_file / validation_file are only referenced by the commented-out training command
train_file="${data_dir}/train.json"
validation_file="${data_dir}/validation.json"
test_file="${data_dir}/test.json"
metric_name_or_path="../nlg/nlg_metric.py"
metric_for_best_model="bleu"
source_column="triples"
target_column="text"
# Contains spaces, so it is always passed as "${source_prefix}" (quoted) below.
source_prefix="possible keywords: "
truncation_side="left"
max_source_length=512
max_target_length=512
model_name_or_path="../../gpt/keyword_extraction/output/key2gen_shuffle_noisy/gpt/dailydialog+metalwoz+sgd+tm1+tm2+tm3"
per_device_train_batch_size=128
per_device_eval_batch_size=64
gradient_accumulation_steps=4
lr=1e-3
num_train_epochs=10
# Disabled: data creation and training.
# python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -s ${speaker} -c ${context_window_size} --key2gen
# python ../run_seq2seq.py \
# --task_name ${task_name} \
# --train_file ${train_file} \
# --validation_file ${validation_file} \
# --source_column ${source_column} \
# --target_column ${target_column} \
# --source_prefix "${source_prefix}" \
# --max_source_length ${max_source_length} \
# --max_target_length ${max_target_length} \
# --truncation_side ${truncation_side} \
# --model_name_or_path ${model_name_or_path} \
# --do_train \
# --do_eval \
# --save_strategy epoch \
# --evaluation_strategy epoch \
# --save_total_limit 3 \
# --prediction_loss_only \
# --cache_dir ${cache_dir} \
# --output_dir ${output_dir} \
# --logging_dir ${logging_dir} \
# --overwrite_output_dir \
# --preprocessing_num_workers 4 \
# --per_device_train_batch_size ${per_device_train_batch_size} \
# --per_device_eval_batch_size ${per_device_eval_batch_size} \
# --gradient_accumulation_steps ${gradient_accumulation_steps} \
# --learning_rate ${lr} \
# --num_train_epochs ${num_train_epochs} \
# --adafactor \
# --gradient_checkpointing
# Active: generate predictions for ${test_file} with ${model_name_or_path}; output goes to ${output_dir}.
python ../run_seq2seq.py \
--task_name ${task_name} \
--test_file ${test_file} \
--source_column ${source_column} \
--target_column ${target_column} \
--source_prefix "${source_prefix}" \
--max_source_length ${max_source_length} \
--max_target_length ${max_target_length} \
--truncation_side ${truncation_side} \
--model_name_or_path ${model_name_or_path} \
--do_predict \
--predict_with_generate \
--metric_name_or_path ${metric_name_or_path} \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_train_batch_size ${per_device_train_batch_size} \
--per_device_eval_batch_size ${per_device_eval_batch_size} \
--gradient_accumulation_steps ${gradient_accumulation_steps} \
--learning_rate ${lr} \
--num_train_epochs ${num_train_epochs} \
--adafactor \
--gradient_checkpointing
# Disabled: merging and evaluating the generated predictions.
# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
# kvret pipeline: create the data, fine-tune ${model_name_or_path}, then
# generate predictions on the test file with the fine-tuned model.

# --- task and data locations ---
n_gpus=1
task_name="kvret"
dataset_name="kvret"
speaker="system"
data_dir="data/${task_name}/${dataset_name}"
output_dir="output/${task_name}/${dataset_name}"
cache_dir="../cache"
logging_dir="${output_dir}/runs"
train_file="${data_dir}/train.json"
validation_file="${data_dir}/validation.json"
test_file="${data_dir}/test.json"
metric_name_or_path="../nlg/nlg_metric.py"
metric_for_best_model="bleu"

# --- input/output columns and lengths ---
source_column="context+db"
target_column="response"
truncation_side="left"
max_source_length=1024
max_target_length=512

# --- model and optimisation settings ---
model_name_or_path="t5-small"
per_device_train_batch_size=32
per_device_eval_batch_size=64
gradient_accumulation_steps=4
lr=1e-3
num_train_epochs=10

# step 1: data creation
python create_data_key2gen.py -t "${task_name}" -d "${dataset_name}"

# step 2: training; one checkpoint is kept and the best model is loaded at the end
python ../run_seq2seq.py \
    --task_name "${task_name}" \
    --train_file "${train_file}" \
    --validation_file "${validation_file}" \
    --source_column "${source_column}" \
    --target_column "${target_column}" \
    --max_source_length "${max_source_length}" \
    --max_target_length "${max_target_length}" \
    --truncation_side "${truncation_side}" \
    --model_name_or_path "${model_name_or_path}" \
    --do_train \
    --do_eval \
    --save_strategy epoch \
    --evaluation_strategy epoch \
    --save_total_limit 1 \
    --prediction_loss_only \
    --load_best_model_at_end \
    --cache_dir "${cache_dir}" \
    --output_dir "${output_dir}" \
    --logging_dir "${logging_dir}" \
    --overwrite_output_dir \
    --preprocessing_num_workers 4 \
    --per_device_train_batch_size "${per_device_train_batch_size}" \
    --per_device_eval_batch_size "${per_device_eval_batch_size}" \
    --gradient_accumulation_steps "${gradient_accumulation_steps}" \
    --learning_rate "${lr}" \
    --num_train_epochs "${num_train_epochs}" \
    --adafactor \
    --gradient_checkpointing

# step 3: prediction with the model written to ${output_dir}
python ../run_seq2seq.py \
    --task_name "${task_name}" \
    --test_file "${test_file}" \
    --source_column "${source_column}" \
    --target_column "${target_column}" \
    --max_source_length "${max_source_length}" \
    --max_target_length "${max_target_length}" \
    --truncation_side "${truncation_side}" \
    --model_name_or_path "${output_dir}" \
    --do_predict \
    --predict_with_generate \
    --metric_name_or_path "${metric_name_or_path}" \
    --cache_dir "${cache_dir}" \
    --output_dir "${output_dir}" \
    --logging_dir "${logging_dir}" \
    --overwrite_output_dir \
    --preprocessing_num_workers 4 \
    --per_device_train_batch_size "${per_device_train_batch_size}" \
    --per_device_eval_batch_size "${per_device_eval_batch_size}" \
    --gradient_accumulation_steps "${gradient_accumulation_steps}" \
    --learning_rate "${lr}" \
    --num_train_epochs "${num_train_epochs}" \
    --adafactor \
    --gradient_checkpointing

# Disabled merge/evaluate steps (note: context_window_size is not defined in this script).
# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
# kvret few-shot pipeline: create the data for the given ratio and dialogue-id
# order, fine-tune ${model_name_or_path}, then generate predictions on the test
# file with the fine-tuned model.
#
# Positional arguments:
#   $1  ratio           passed to create_data_key2gen.py as -r
#   $2  dial_ids_order  passed to create_data_key2gen.py as -o
n_gpus=1
task_name="kvret"
dataset_name="kvret"
speaker="system"
# Fail fast with a usage message when an argument is missing; otherwise the
# paths below would silently become e.g. "data/kvret/kvret__order".
ratio=${1:?usage: $0 <ratio> <dial_ids_order>}
dial_ids_order=${2:?usage: $0 <ratio> <dial_ids_order>}
data_dir="data/${task_name}/${dataset_name}_${ratio}_order${dial_ids_order}"
output_dir="output/${task_name}/${dataset_name}_${ratio}_order${dial_ids_order}"
cache_dir="../cache"
logging_dir="${output_dir}/runs"
train_file="${data_dir}/train.json"
validation_file="${data_dir}/validation.json"
test_file="${data_dir}/test.json"
metric_name_or_path="../nlg/nlg_metric.py"
metric_for_best_model="bleu"
source_column="context+db"
target_column="response"
truncation_side="left"
max_source_length=1024
max_target_length=512
model_name_or_path="t5-small"
per_device_train_batch_size=32
per_device_eval_batch_size=64
gradient_accumulation_steps=4
lr=1e-3
num_train_epochs=100

# data creation
python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -r ${ratio} -o ${dial_ids_order}

# training; one checkpoint is kept and the best model is loaded at the end
python ../run_seq2seq.py \
    --task_name ${task_name} \
    --train_file ${train_file} \
    --validation_file ${validation_file} \
    --source_column ${source_column} \
    --target_column ${target_column} \
    --max_source_length ${max_source_length} \
    --max_target_length ${max_target_length} \
    --truncation_side ${truncation_side} \
    --model_name_or_path ${model_name_or_path} \
    --do_train \
    --do_eval \
    --save_strategy epoch \
    --evaluation_strategy epoch \
    --save_total_limit 1 \
    --prediction_loss_only \
    --load_best_model_at_end \
    --cache_dir ${cache_dir} \
    --output_dir ${output_dir} \
    --logging_dir ${logging_dir} \
    --overwrite_output_dir \
    --preprocessing_num_workers 4 \
    --per_device_train_batch_size ${per_device_train_batch_size} \
    --per_device_eval_batch_size ${per_device_eval_batch_size} \
    --gradient_accumulation_steps ${gradient_accumulation_steps} \
    --learning_rate ${lr} \
    --num_train_epochs ${num_train_epochs} \
    --adafactor \
    --gradient_checkpointing

# prediction with the model written to ${output_dir}
python ../run_seq2seq.py \
    --task_name ${task_name} \
    --test_file ${test_file} \
    --source_column ${source_column} \
    --target_column ${target_column} \
    --max_source_length ${max_source_length} \
    --max_target_length ${max_target_length} \
    --truncation_side ${truncation_side} \
    --model_name_or_path ${output_dir} \
    --do_predict \
    --predict_with_generate \
    --metric_name_or_path ${metric_name_or_path} \
    --cache_dir ${cache_dir} \
    --output_dir ${output_dir} \
    --logging_dir ${logging_dir} \
    --overwrite_output_dir \
    --preprocessing_num_workers 4 \
    --per_device_train_batch_size ${per_device_train_batch_size} \
    --per_device_eval_batch_size ${per_device_eval_batch_size} \
    --gradient_accumulation_steps ${gradient_accumulation_steps} \
    --learning_rate ${lr} \
    --num_train_epochs ${num_train_epochs} \
    --adafactor \
    --gradient_checkpointing

# Disabled merge/evaluate steps (note: context_window_size is not defined in this script).
# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment