Unverified commit a74d1579 authored by zhuqi, committed by GitHub

Merge pull request #53 from ConvLab/pre-training

Pre training
parents ee8991e9 4b333b1f
Showing 1146 additions and 588 deletions
import os
import json
from tqdm import tqdm
import re
from convlab2.util import load_dataset
def create_lm_data(dataset, data_dir, args):
data_by_split = dataset
os.makedirs(data_dir, exist_ok=True)
data_splits = data_by_split.keys()
for data_split in data_splits:
data = []
for sample in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False):
if args.model_type == 'dialogpt':
dialogue = ' <|endoftext|> '.join([turn['utterance'] for turn in sample['turns']]) + ' <|endoftext|>'
else:
dialogue = '\n'.join([f"{turn['speaker']}: {turn['utterance']}" for turn in sample['turns']])
data.append(json.dumps({'dialogue': dialogue}, ensure_ascii=False)+'\n')
file_name = os.path.join(data_dir, f"{data_split}.json")
with open(file_name, "w", encoding='utf-8') as f:
f.writelines(data)
if __name__ == '__main__':
from argparse import ArgumentParser
parser = ArgumentParser(description="create data for language model training")
parser.add_argument('--tasks', '-t', metavar='task_name', nargs='*', choices=['lm'], help='names of tasks')
parser.add_argument('--datasets', '-d', metavar='dataset_name', nargs='*', help='names of unified datasets')
parser.add_argument('--model_type', '-m', metavar='model_type', help='type of the language model: gpt, dialogpt, ...')
args = parser.parse_args()
print(args)
for dataset_name in tqdm(args.datasets, desc='datasets'):
dataset = load_dataset(dataset_name)
for task_name in tqdm(args.tasks, desc='tasks', leave=False):
data_dir = os.path.join('data', task_name, dataset_name, args.model_type)
eval(f"create_{task_name}_data")(dataset, data_dir, args)
model_type=dialogpt
dataset_name=multiwoz21
model_name=dialogpt-large
data_dir="data/lm/${dataset_name}/${model_type}"
word_loss_file="${data_dir}/${model_name}_${dataset_name}_word_loss.json"
keywords_num=5
keywords_ratio=1
keywords_th=0
stopwords=True
output_file="${data_dir}/${dataset_name}_keywords_${model_name}_topk_${keywords_num}_ratio_${keywords_ratio}_th_${keywords_th}_stopwords_${stopwords}.json"
python lmloss2keywords.py \
--model_type ${model_type} \
--word_loss_file ${word_loss_file} \
--keywords_num ${keywords_num} \
--keywords_ratio ${keywords_ratio} \
--keywords_th ${keywords_th} \
--stopwords ${stopwords} \
--output_file ${output_file}
\ No newline at end of file
set -e
n_gpus=1
task_name="lm"
dataset_name="multiwoz21"
model_type="dialogpt"
data_dir="data/${task_name}/${dataset_name}/${model_type}"
output_dir="output/${task_name}/${dataset_name}/${model_type}"
cache_dir="../cache"
validation_file="${data_dir}/validation.json"
source_column="dialogue"
max_length=512
model_name_or_path="microsoft/DialoGPT-large"
per_device_eval_batch_size=4
dump_eval_loss_to="${data_dir}/dialogpt-large_${dataset_name}_token_loss.json"
python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --model_type dialogpt
python ../run_clm.py \
--dump_eval_loss_to ${dump_eval_loss_to} \
--model_name_or_path ${model_name_or_path} \
--output_dir ${data_dir} \
--validation_file ${validation_file} \
--source_column ${source_column} \
--max_length ${max_length} \
--do_eval \
--prediction_loss_only \
--cache_dir ${cache_dir} \
--preprocessing_num_workers 4 \
--per_device_eval_batch_size ${per_device_eval_batch_size}
python lmloss2keywords.py --token_loss_file ${dump_eval_loss_to} --model_type ${model_type}
dump_eval_loss_to="${data_dir}/dialogpt-large-mwoz_${dataset_name}_token_loss.json"
python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --model_type dialogpt
python ../run_clm.py \
--dump_eval_loss_to ${dump_eval_loss_to} \
--model_name_or_path ${output_dir} \
--output_dir ${data_dir} \
--validation_file ${validation_file} \
--source_column ${source_column} \
--max_length ${max_length} \
--do_eval \
--prediction_loss_only \
--cache_dir ${cache_dir} \
--preprocessing_num_workers 4 \
--per_device_eval_batch_size ${per_device_eval_batch_size}
python lmloss2keywords.py --token_loss_file ${dump_eval_loss_to} --model_type ${model_type}
model_type="gpt"
data_dir="data/${task_name}/${dataset_name}/${model_type}"
validation_file="${data_dir}/validation.json"
model_name_or_path="gpt2-large"
dump_eval_loss_to="${data_dir}/gpt2-large_${dataset_name}_token_loss.json"
python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --model_type gpt
python ../run_clm.py \
--dump_eval_loss_to ${dump_eval_loss_to} \
--model_name_or_path ${model_name_or_path} \
--output_dir ${data_dir} \
--validation_file ${validation_file} \
--source_column ${source_column} \
--max_length ${max_length} \
--do_eval \
--prediction_loss_only \
--cache_dir ${cache_dir} \
--preprocessing_num_workers 4 \
--per_device_eval_batch_size ${per_device_eval_batch_size}
python lmloss2keywords.py --token_loss_file ${dump_eval_loss_to} --model_type ${model_type}
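As a rough sketch, assuming run_clm.py's --dump_eval_loss_to writes one JSON object per line with parallel token and loss lists (the format lmloss2keywords.py below consumes), a single record in the token loss file would look like this; tokens and losses are invented:

import json

# 'tokens' are GPT-2 BPE pieces, 'losses' the per-token LM losses.
record = {
    "tokens": ["Ġuser", ":", "ĠI", "Ġneed", "Ġa", "Ġche", "ap", "Ġhotel"],
    "losses": [0.0, 0.0, 1.2, 2.5, 0.3, 4.1, 3.9, 2.8],
}
print(json.dumps(record, ensure_ascii=False))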
import json
import json_lines
from pprint import pprint
import os
from tqdm import tqdm
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
def merge_tokens(tokens, losses, loss_merge_func=np.mean):
res = []
i = 0
while i < len(tokens):
token = tokens[i]
loss = losses[i]
if token in ['Ġ', 'Ċ']:
if token == 'Ċ' and i < len(tokens) - 1:
tokens[i+1] = 'Ġ'+tokens[i+1]
i += 1
continue
if token in ['user', 'system', 'Ġuser', 'Ġsystem'] and i < len(tokens)-1 and tokens[i+1] == ':':
if i > 0:
tokens[i+1] = '<|endoftext|>'
i += 1
else:
i += 2
continue
if token.startswith('Ġ'):
# Ġ means space
token = token.replace("Ġ", "")
res.append([token, loss])
elif token == '<|endoftext|>':
res.append([token, loss])
else:
assert 'Ġ' not in token
if len(res) > 0:
res[-1][0] += token
res[-1].append(loss)
else:
res.append([token, loss])
i += 1
if loss_merge_func:
for i in range(len(res)):
res[i] = [res[i][0], loss_merge_func(res[i][1:])]
return res
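A small hand-made example of how merge_tokens recombines GPT-2 BPE pieces ("Ġ" marks a leading space, "Ċ" a newline) and averages their losses; the numbers are invented:

tokens = ['Ġuser', ':', 'ĠI', 'Ġneed', 'Ġa', 'Ġche', 'ap', 'Ġhotel', 'Ċ']
losses = [0.0, 0.0, 1.2, 2.5, 0.3, 4.1, 3.9, 2.8, 0.0]
# The leading 'Ġuser' + ':' speaker prefix is dropped; 'Ġche' + 'ap' is merged back
# into 'cheap' with the mean of its two token losses; the trailing newline is skipped.
print(merge_tokens(tokens, losses))
# -> roughly [['I', 1.2], ['need', 2.5], ['a', 0.3], ['cheap', 4.0], ['hotel', 2.8]]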
def convert_token_loss2word_loss(token_loss_file, loss_merge_func=np.mean):
word_loss_file = os.path.join(os.path.dirname(token_loss_file), token_loss_file.split('/')[-1].replace('token', 'word'))
fin = open(token_loss_file, 'rb')
fout = open(word_loss_file, 'w', encoding='utf-8')
lines = []
for item in tqdm(json_lines.reader(fin)):
tokens, losses = item['tokens'], item['losses']
assert len(tokens) == len(losses)
word2losses = merge_tokens(tokens, losses, loss_merge_func)
lines.append({"words": [x[0] for x in word2losses], "losses": [x[1] for x in word2losses]})
fout.write(json.dumps(lines[-1], ensure_ascii=False)+'\n')
fin.close()
fout.close()
return lines
def main(args):
if not args.word_loss_file:
word_loss_list = convert_token_loss2word_loss(args.token_loss_file)
else:
fin = open(args.word_loss_file, 'rb')
word_loss_list = []
for item in json_lines.reader(fin):
words, losses = item['words'], item['losses']
word_loss_list.append({"words": words, "losses": losses})
fin.close()
if not args.output_file:
return
stop_words = set(stopwords.words('english'))
dialogs = []
for item in word_loss_list:
words = item['words']
losses = item['losses']
turns = []
turn = {'words': [], 'losses': []}
for word, loss in zip(words, losses):
if word == '<|endoftext|>':
# switch turn
turn['utterance'] = ' '.join(turn['words'])
turn['keywords'] = list(zip(turn['words'], turn['losses']))
if args.stopwords:
turn['keywords'] = [x for x in turn['keywords'] if not any([w.lower() in stop_words for w in word_tokenize(x[0])])]
turn['keywords'] = sorted(turn['keywords'], key=lambda x: x[1], reverse=True)
turn['keywords'] = [x for x in turn['keywords'] if x[1] > args.keywords_th][:min(round(args.keywords_ratio*len(turn['keywords'])), args.keywords_num)]
turn.pop('words')
turn.pop('losses')
turns.append(turn)
turn = {'words': [], 'losses': []}
else:
turn['words'].append(word)
turn['losses'].append(loss)
dialogs.append(turns)
json.dump(dialogs, open(args.output_file, "w", encoding='utf-8'), indent=2, ensure_ascii=False)
if __name__ == '__main__':
from argparse import ArgumentParser
parser = ArgumentParser(description="extract keywords according to lm loss")
parser.add_argument('--model_type', '-m', type=str, help='gpt or dialogpt')
parser.add_argument('--token_loss_file', '-t', type=str, help='path to the token loss file that contains two columns: [tokens, losses]')
parser.add_argument('--word_loss_file', '-w', type=str, help='path to the word loss file that contains two columns: [words, losses]')
parser.add_argument('--output_file', '-o', type=str, help='path to the output file')
parser.add_argument('--keywords_num', '-n', type=int, default=100, help='maximum number of words in an utterance that serve as keywords')
parser.add_argument('--keywords_ratio', '-r', type=float, default=1.0, help='maximum ratio of words in an utterance that serve as keywords')
parser.add_argument('--keywords_th', '-th', type=float, default=0., help='loss threshold for the keywords')
parser.add_argument('--stopwords', '-s', type=lambda x: bool(eval(x)), default=True, help='filter out stopwords')
args = parser.parse_args()
print(args)
main(args)
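Based on main() above, the --output_file holds one list of turns per dialogue; each turn keeps its utterance plus the selected (word, loss) pairs after stop-word filtering, the loss threshold, and the top-k/ratio cut. A hand-made excerpt with invented values:

example_output = [
    [   # one dialogue
        {
            "utterance": "I need a cheap hotel .",
            # top keywords_num words above keywords_th, sorted by loss, stop words removed
            "keywords": [["cheap", 4.0], ["hotel", 2.8]],
        },
    ],
]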
import json
def main(args):
filename2data = {f.split('/')[-1]: json.load(open(f)) for f in args.keywords_files}
first_filename = args.keywords_files[0].split('/')[-1]
dialogs = []
for i in range(len(filename2data[first_filename])):
turns = []
for j in range(len(filename2data[first_filename][i])):
utt = filename2data[first_filename][i][j]['utterance']
keywords = {filename.split('_')[2]+'_nonstopword'+filename.split('_')[-1]: ' | '.join([x[0] for x in filename2data[filename][i][j]['keywords']]) for filename in filename2data}
turns.append({
"utterance": utt,
**keywords
})
dialogs.append(turns)
json.dump(dialogs, open(args.output_file, "w", encoding='utf-8'), indent=2, ensure_ascii=False)
if __name__ == '__main__':
from argparse import ArgumentParser
parser = ArgumentParser(description="calculate NLU metrics for unified datasets")
parser.add_argument('--keywords_files', '-f', metavar='keywords_files', nargs='*', help='keywords files')
parser.add_argument('--output_file', '-o', type=str, help='path to the output file')
args = parser.parse_args()
print(args)
main(args)
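Assuming keyword files named as in the scripts above (e.g. multiwoz21_keywords_dialogpt-large_topk_5_..._stopwords_True.json), the merged output pairs each utterance with one ' | '-joined keyword string per input file, keyed by filename.split('_')[2] + '_nonstopword' + filename.split('_')[-1]. An invented record:

merged_turn = {
    "utterance": "I need a cheap hotel .",
    "dialogpt-large_nonstopwordTrue.json": "cheap | hotel",
    "gpt2-large_nonstopwordTrue.json": "need | cheap",
}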
set -e
n_gpus=1
task_name="nlu"
dataset_name="tm1"
speaker="user"
context_window_size=3
data_dir="data/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
output_dir="output/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
task_name="lm"
dataset_name="multiwoz21"
model_type="dialogpt"
data_dir="data/${task_name}/${dataset_name}/${model_type}"
output_dir="output/${task_name}/${dataset_name}/${model_type}"
cache_dir="../cache"
logging_dir="${output_dir}/runs"
train_file="${data_dir}/train.json"
validation_file="${data_dir}/validation.json"
test_file="${data_dir}/test.json"
metric_name_or_path="nlu_metric.py"
metric_for_best_model="overall_f1"
source_column="context"
target_column="dialogue_acts_seq"
model_name_or_path="t5-small"
per_device_train_batch_size=64
per_device_eval_batch_size=64
source_column="dialogue"
max_length=512
model_name_or_path="microsoft/DialoGPT-large"
per_device_train_batch_size=16
per_device_eval_batch_size=16
gradient_accumulation_steps=4
lr=1e-3
num_train_epochs=10
lr=5e-5
num_train_epochs=3
python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --speaker ${speaker} --context_window_size ${context_window_size}
python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --model_type dialogpt
python -m torch.distributed.launch \
--nproc_per_node ${n_gpus} ../run_seq2seq.py \
--task_name ${task_name} \
python ../run_clm.py \
--model_name_or_path ${model_name_or_path} \
--train_file ${train_file} \
--validation_file ${validation_file} \
--source_column ${source_column} \
--target_column ${target_column} \
--model_name_or_path ${model_name_or_path} \
--max_length ${max_length} \
--do_train \
--do_eval \
--save_strategy epoch \
--evaluation_strategy epoch \
--load_best_model_at_end \
--prediction_loss_only \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
@@ -39,30 +39,9 @@ python -m torch.distributed.launch \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_train_batch_size ${per_device_train_batch_size} \
--per_device_eval_batch_size ${per_device_eval_batch_size} \
--gradient_accumulation_steps ${gradient_accumulation_steps} \
--learning_rate ${lr} \
--num_train_epochs ${num_train_epochs} \
--debug underflow_overflow \
--adafactor \
--gradient_checkpointing
python -m torch.distributed.launch \
--nproc_per_node ${n_gpus} ../run_seq2seq.py \
--task_name ${task_name} \
--test_file ${test_file} \
--source_column ${source_column} \
--target_column ${target_column} \
--model_name_or_path ${output_dir} \
--do_predict \
--predict_with_generate \
--metric_name_or_path ${metric_name_or_path} \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_eval_batch_size ${per_device_eval_batch_size}
python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
python ../../../nlu/evaluate_unified_datasets.py -p ${output_dir}/predictions.json
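For the DialoGPT fine-tuning configuration above, the effective batch size works out as follows, assuming a single GPU:

# per_device_train_batch_size * gradient_accumulation_steps * n_gpus
effective_batch_size = 16 * 4 * 1
print(effective_batch_size)  # 64 sequences of up to max_length=512 tokens per optimizer step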
@@ -2,6 +2,7 @@ import os
import json
from tqdm import tqdm
import re
from transformers import AutoTokenizer
from convlab2.util import load_dataset, load_nlu_data, load_dst_data, load_policy_data, load_nlg_data, load_e2e_data, load_rg_data
from convlab2.base_models.t5.nlu.serialization import serialize_dialogue_acts, deserialize_dialogue_acts, equal_da_seq
from convlab2.base_models.t5.dst.serialization import serialize_dialogue_state, deserialize_dialogue_state, equal_state_seq
@@ -15,13 +16,16 @@ def create_rg_data(dataset, data_dir, args):
for data_split in data_splits:
data = []
for sample in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False):
context = ' '.join([f"{turn['speaker']}: {turn['utterance']}" for turn in sample['context']])
response = f"{sample['speaker']}: {sample['utterance']}"
data.append(json.dumps({'context': context, 'response': response}, ensure_ascii=False)+'\n')
if len(sample['context']) == 0:
continue
context = '\n'.join([f"{turn['speaker']}: {turn['utterance']}" for turn in sample['context']]+[f'{sample["speaker"]}: '])
data.append(json.dumps({'context': context, 'response': sample['utterance']}, ensure_ascii=False)+'\n')
file_name = os.path.join(data_dir, f"{data_split}.json")
with open(file_name, "w", encoding='utf-8') as f:
f.writelines(data)
data_by_split[data_split] = data
return data_by_split
def create_nlu_data(dataset, data_dir, args):
data_by_split = load_nlu_data(dataset, speaker=args.speaker, use_context=args.context_window_size>0, context_window_size=args.context_window_size)
@@ -34,7 +38,7 @@ def create_nlu_data(dataset, data_dir, args):
for sample in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False):
response = f"{sample['speaker']}: {sample['utterance']}"
if args.context_window_size>0:
context = ' '.join([f"{turn['speaker']}: {turn['utterance']}" for turn in sample['context']]+[response])
context = '\n'.join([f"{turn['speaker']}: {turn['utterance']}" for turn in sample['context']]+[response])
else:
context = response
dialogue_acts_seq = serialize_dialogue_acts(sample['dialogue_acts'])
@@ -44,6 +48,8 @@ def create_nlu_data(dataset, data_dir, args):
file_name = os.path.join(data_dir, f"{data_split}.json")
with open(file_name, "w", encoding='utf-8') as f:
f.writelines(data)
data_by_split[data_split] = data
return data_by_split
def create_dst_data(dataset, data_dir, args):
data_by_split = load_dst_data(dataset, speaker=args.speaker, use_context=args.context_window_size>0, context_window_size=args.context_window_size)
@@ -56,7 +62,7 @@ def create_dst_data(dataset, data_dir, args):
for sample in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False):
response = f"{sample['speaker']}: {sample['utterance']}"
if args.context_window_size>0:
context = ' '.join([f"{turn['speaker']}: {turn['utterance']}" for turn in sample['context']]+[response])
context = '\n'.join([f"{turn['speaker']}: {turn['utterance']}" for turn in sample['context']]+[response])
else:
context = response
state_seq = serialize_dialogue_state(sample['state'])
@@ -66,6 +72,32 @@ def create_dst_data(dataset, data_dir, args):
file_name = os.path.join(data_dir, f"{data_split}.json")
with open(file_name, "w", encoding='utf-8') as f:
f.writelines(data)
data_by_split[data_split] = data
return data_by_split
def create_nlg_data(dataset, data_dir, args):
data_by_split = load_nlu_data(dataset, speaker=args.speaker, use_context=args.context_window_size>0, context_window_size=args.context_window_size)
data_dir = os.path.join(data_dir, args.speaker, f'context_{args.context_window_size}')
os.makedirs(data_dir, exist_ok=True)
data_splits = data_by_split.keys()
for data_split in data_splits:
data = []
for sample in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False):
dialogue_acts_seq = serialize_dialogue_acts(sample['dialogue_acts'])
if args.context_window_size>0:
context = '\n'.join([f"{turn['speaker']}: {turn['utterance']}" for turn in sample['context']]+[f'{sample["speaker"]}: '])
context = f'{dialogue_acts_seq}\n{context}'
else:
context = f'{dialogue_acts_seq}\n{sample["speaker"]}: '
assert equal_da_seq(sample['dialogue_acts'], dialogue_acts_seq), print(sample['dialogue_acts'], dialogue_acts_seq, deserialize_dialogue_acts(dialogue_acts_seq))
data.append(json.dumps({'context+da': context, 'response': sample['utterance']}, ensure_ascii=False)+'\n')
file_name = os.path.join(data_dir, f"{data_split}.json")
with open(file_name, "w", encoding='utf-8') as f:
f.writelines(data)
data_by_split[data_split] = data
return data_by_split
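A hand-made sketch of one NLG training record produced by create_nlg_data above (dialogue acts serialized in the new format, context joined with newlines, response held separately); the content is invented:

import json

# Hypothetical system-side sample with context_window_size > 0.
nlg_record = {
    'context+da': '[inform][hotel]([price range][cheap])\n'
                  'user: I need a cheap hotel .\n'
                  'system: ',
    'response': 'Okay , I found several cheap hotels for you .',
}
print(json.dumps(nlg_record, ensure_ascii=False))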
def create_goal2dialogue_data(dataset, data_dir, args):
data_by_split = dataset
@@ -76,25 +108,44 @@ def create_goal2dialogue_data(dataset, data_dir, args):
data = []
for sample in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False):
goal = re.sub(r'<.*?>', '', sample['goal']['description'])
dialogue = ' '.join([f"{turn['speaker']}: {turn['utterance']}" for turn in sample['turns']])
dialogue = '\n'.join([f"{turn['speaker']}: {turn['utterance']}" for turn in sample['turns']])
data.append(json.dumps({'goal': goal, 'dialogue': dialogue}, ensure_ascii=False)+'\n')
file_name = os.path.join(data_dir, f"{data_split}.json")
with open(file_name, "w", encoding='utf-8') as f:
f.writelines(data)
data_by_split[data_split] = data
return data_by_split
def get_max_len(data_by_split, tokenizer):
for data_split in data_by_split.keys():
seq_len = {}
for line in data_by_split[data_split]:
item = json.loads(line.strip())
for column, seq in item.items():
seq_len.setdefault(column, [])
seq_len[column].append(len(tokenizer.tokenize(seq)))
print(f"data split: {data_split}")
for column, lens in seq_len.items():
print(f'\t{column}\tmax_len: {max(lens)}\tmean_len: {round(sum(lens)/len(lens),2)}')
if __name__ == '__main__':
from argparse import ArgumentParser
parser = ArgumentParser(description="create data for seq2seq training")
parser.add_argument('--tasks', '-t', metavar='task_name', nargs='*', choices=['rg', 'nlu', 'dst', 'goal2dialogue'], help='names of tasks')
parser.add_argument('--tasks', '-t', metavar='task_name', nargs='*', choices=['rg', 'nlu', 'dst', 'nlg', 'goal2dialogue'], help='names of tasks')
parser.add_argument('--datasets', '-d', metavar='dataset_name', nargs='*', help='names of unified datasets')
parser.add_argument('--speaker', '-s', type=str, choices=['user', 'system', 'all'], help='speaker(s)')
parser.add_argument('--context_window_size', '-c', type=int, default=0, help='how many contextual utterances are considered')
parser.add_argument('--len_tokenizer', '-l', type=str, default=None, help='name or path of the tokenizer used to compute sequence length statistics')
args = parser.parse_args()
print(args)
if args.len_tokenizer:
tokenizer = AutoTokenizer.from_pretrained(args.len_tokenizer)
for dataset_name in tqdm(args.datasets, desc='datasets'):
dataset = load_dataset(dataset_name)
for task_name in tqdm(args.tasks, desc='tasks', leave=False):
data_dir = os.path.join('data', task_name, dataset_name)
eval(f"create_{task_name}_data")(dataset, data_dir, args)
data_by_split = eval(f"create_{task_name}_data")(dataset, data_dir, args)
if args.len_tokenizer:
get_max_len(data_by_split, tokenizer)
def serialize_dialogue_state(state):
state_seqs = []
state_dict = {}
for domain in state:
for slot, value in state[domain].items():
for slot, value in sorted(state[domain].items()):
if len(value) > 0:
state_seqs.append(f'[{domain}][{slot}][{value}]')
return ';'.join(state_seqs)
state_dict.setdefault(f'[{domain}]', [])
state_dict[f'[{domain}]'].append(f'[{slot}][{value}]')
return ';'.join([domain+'('+','.join(slot_values)+')' for domain, slot_values in state_dict.items()])
def deserialize_dialogue_state(state_seq):
state = {}
if len(state_seq) == 0:
return state
state_seqs = state_seq.split('];[')
state_seqs = state_seq.split(']);[') # will consume "])" and "["
for i, state_seq in enumerate(state_seqs):
if len(state_seq) == 0:
if len(state_seq) == 0 or len(state_seq.split(']([')) != 2:
continue
if i == 0:
if state_seq[0] == '[':
state_seq = state_seq[1:]
if i == len(state_seqs) - 1:
if state_seq[-1] == ']':
state_seq = state_seq[:-1]
s = state_seq.split('][')
if len(s) != 3:
if state_seq[-2:] == '])':
state_seq = state_seq[:-2]
try:
domain, slot_values = state_seq.split(']([')
except:
continue
for slot_value in slot_values.split('],['):
try:
slot, value = slot_value.split('][')
except:
continue
domain, slot, value = s
state.setdefault(domain, {})
state[domain][slot] = value
return state
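To make the change above concrete, a hand-made comparison of the old and new state serializations, with invented state values:

# Example belief state with one empty slot value.
example_state = {'hotel': {'price range': 'cheap', 'area': ''}, 'taxi': {'leave at': '17:15'}}
# Old format: one [domain][slot][value] triple per non-empty slot, ';'-joined.
old_seq = '[hotel][price range][cheap];[taxi][leave at][17:15]'
# New format: slot-value pairs grouped under their domain, also ';'-joined.
new_seq = '[hotel]([price range][cheap]);[taxi]([leave at][17:15])'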
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""NLG Metric"""
import datasets
import sacrebleu
# TODO: Add BibTeX citation
_CITATION = """\
@inproceedings{post-2018-call,
title = "A Call for Clarity in Reporting {BLEU} Scores",
author = "Post, Matt",
booktitle = "Proceedings of the Third Conference on Machine Translation: Research Papers",
month = oct,
year = "2018",
address = "Belgium, Brussels",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/W18-6319",
pages = "186--191",
}
"""
_DESCRIPTION = """\
Metric to evaluate text-to-text models on the natural language generation task.
"""
_KWARGS_DESCRIPTION = """
Calculates corpus-bleu4
Args:
predictions: list of predictions to score. Each prediction
should be a string.
references: list of references, one per prediction. Each
reference should be a string.
Returns:
bleu: corpus-bleu score
Examples:
>>> nlg_metric = datasets.load_metric("nlg_metric.py")
>>> predictions = ["hello there general kenobi", "foo bar foobar"]
>>> references = ["hello there kenobi", "foo bar foobar"]
>>> results = nlg_metric.compute(predictions=predictions, references=references)
>>> print(results)
{'bleu': 35.35533905932737}
"""
@datasets.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class NLGMetrics(datasets.Metric):
"""Metric to evaluate text-to-text models on the natural language generation task."""
def _info(self):
return datasets.MetricInfo(
description=_DESCRIPTION,
citation=_CITATION,
inputs_description=_KWARGS_DESCRIPTION,
# This defines the format of each prediction and reference
features=datasets.Features({
'predictions': datasets.Value('string'),
'references': datasets.Value('string'),
})
)
def _compute(self, predictions, references):
"""Returns the scores: bleu"""
bleu = sacrebleu.corpus_bleu(predictions, [references], lowercase=True).score
return {
"bleu": bleu
}
@@ -36,21 +36,15 @@ Returns:
seq_em: sequence exact match
accuracy: dialog acts accuracy
overall_f1: dialog acts overall f1
binary_f1: binary dialog acts f1
categorical_f1: categorical dialog acts f1
non-categorical_f1: non-categorical dialog acts f1
Examples:
>>> nlu_metric = datasets.load_metric("nlu_metric.py")
>>> predictions = ["[binary][thank][general][]", "[non-categorical][inform][taxi][leave at][17:15]"]
>>> references = ["[binary][thank][general][]", "[non-categorical][inform][train][leave at][17:15]"]
>>> predictions = ["[thank][general]{[][]}", "[inform][taxi]{[leave at][17:15]}"]
>>> references = ["[thank][general]{[][]}", "[inform][train]{[leave at][17:15]}"]
>>> results = nlu_metric.compute(predictions=predictions, references=references)
>>> print(results)
{'seq_em': 0.5, 'accuracy': 0.5,
'overall_f1': 0.5, 'overall_precision': 0.5, 'overall_recall': 0.5,
'binary_f1': 1.0, 'binary_precision': 1.0, 'binary_recall': 1.0,
'categorical_f1': 0.0, 'categorical_precision': 0.0, 'categorical_recall': 0.0,
'non-categorical_f1': 0.0, 'non-categorical_precision': 0.0, 'non-categorical_recall': 0.0}
'overall_f1': 0.5, 'overall_precision': 0.5, 'overall_recall': 0.5}
"""
@@ -74,45 +68,33 @@ class NLUMetrics(datasets.Metric):
"""Returns the scores: sequence exact match, dialog acts accuracy and f1"""
seq_em = []
acc = []
f1_metrics = {x: {'TP':0, 'FP':0, 'FN':0} for x in ['overall', 'binary', 'categorical', 'non-categorical']}
f1_metrics = {'TP':0, 'FP':0, 'FN':0}
for prediction, reference in zip(predictions, references):
seq_em.append(prediction.strip()==reference.strip())
pred_da = deserialize_dialogue_acts(prediction)
gold_da = deserialize_dialogue_acts(reference)
flag = True
for da_type in ['binary', 'categorical', 'non-categorical']:
if da_type == 'binary':
predicts = sorted(list({(x['intent'], x['domain'], x['slot']) for x in pred_da[da_type]}))
labels = sorted(list({(x['intent'], x['domain'], x['slot']) for x in gold_da[da_type]}))
pred_da = sorted(list({(da['intent'], da['domain'], da['slot'], ''.join(da.get('value', '').split()).lower()) for da in pred_da}))
gold_da = sorted(list({(da['intent'], da['domain'], da['slot'], ''.join(da.get('value', '').split()).lower()) for da in gold_da}))
acc.append(pred_da==gold_da)
for ele in pred_da:
if ele in gold_da:
f1_metrics['TP'] += 1
else:
predicts = sorted(list({(x['intent'], x['domain'], x['slot'], ''.join(x['value'].split()).lower()) for x in pred_da[da_type]}))
labels = sorted(list({(x['intent'], x['domain'], x['slot'], ''.join(x['value'].split()).lower()) for x in gold_da[da_type]}))
for ele in predicts:
if ele in labels:
f1_metrics['overall']['TP'] += 1
f1_metrics[da_type]['TP'] += 1
else:
f1_metrics['overall']['FP'] += 1
f1_metrics[da_type]['FP'] += 1
for ele in labels:
if ele not in predicts:
f1_metrics['overall']['FN'] += 1
f1_metrics[da_type]['FN'] += 1
flag &= (predicts==labels)
acc.append(flag)
f1_metrics['FP'] += 1
for ele in gold_da:
if ele not in pred_da:
f1_metrics['FN'] += 1
for metric in list(f1_metrics.keys()):
TP = f1_metrics[metric].pop('TP')
FP = f1_metrics[metric].pop('FP')
FN = f1_metrics[metric].pop('FN')
TP = f1_metrics.pop('TP')
FP = f1_metrics.pop('FP')
FN = f1_metrics.pop('FN')
precision = 1.0 * TP / (TP + FP) if TP + FP else 0.
recall = 1.0 * TP / (TP + FN) if TP + FN else 0.
f1 = 2.0 * precision * recall / (precision + recall) if precision + recall else 0.
f1_metrics.pop(metric)
f1_metrics[f'{metric}_f1'] = f1
f1_metrics[f'{metric}_precision'] = precision
f1_metrics[f'{metric}_recall'] = recall
f1_metrics[f'overall_f1'] = f1
f1_metrics[f'overall_precision'] = precision
f1_metrics[f'overall_recall'] = recall
return {
"seq_em": sum(seq_em)/len(seq_em),
n_gpus=1
task_name="nlu"
dataset_name="multiwoz21"
speaker="user"
context_window_size=0
data_dir="data/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
output_dir="output/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
cache_dir="../cache"
logging_dir="${output_dir}/runs"
train_file="${data_dir}/train.json"
validation_file="${data_dir}/validation.json"
test_file="${data_dir}/test.json"
metric_name_or_path="nlu_metric.py"
metric_for_best_model="overall_f1"
source_column="context"
target_column="dialogue_acts_seq"
model_name_or_path="t5-small"
per_device_train_batch_size=128
per_device_eval_batch_size=64
gradient_accumulation_steps=2
lr=1e-3
num_train_epochs=10
python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --speaker ${speaker} --context_window_size ${context_window_size}
python -m torch.distributed.launch \
--nproc_per_node ${n_gpus} ../run_seq2seq.py \
--task_name ${task_name} \
--train_file ${train_file} \
--source_column ${source_column} \
--target_column ${target_column} \
--model_name_or_path ${model_name_or_path} \
--do_train \
--save_strategy epoch \
--prediction_loss_only \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_train_batch_size ${per_device_train_batch_size} \
--gradient_accumulation_steps ${gradient_accumulation_steps} \
--learning_rate ${lr} \
--num_train_epochs ${num_train_epochs} \
--debug underflow_overflow \
--adafactor \
--gradient_checkpointing
python -m torch.distributed.launch \
--nproc_per_node ${n_gpus} ../run_seq2seq.py \
--task_name ${task_name} \
--test_file ${test_file} \
--source_column ${source_column} \
--target_column ${target_column} \
--model_name_or_path ${output_dir} \
--do_predict \
--predict_with_generate \
--metric_name_or_path ${metric_name_or_path} \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_eval_batch_size ${per_device_eval_batch_size}
python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
python ../../../nlu/evaluate_unified_datasets.py -p ${output_dir}/predictions.json
n_gpus=1
task_name="nlu"
dataset_name="multiwoz21"
speaker="user"
context_window_size=3
data_dir="data/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
output_dir="output/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
cache_dir="../cache"
logging_dir="${output_dir}/runs"
train_file="${data_dir}/train.json"
validation_file="${data_dir}/validation.json"
test_file="${data_dir}/test.json"
metric_name_or_path="nlu_metric.py"
metric_for_best_model="overall_f1"
source_column="context"
target_column="dialogue_acts_seq"
model_name_or_path="t5-small"
per_device_train_batch_size=128
per_device_eval_batch_size=64
gradient_accumulation_steps=2
lr=1e-3
num_train_epochs=10
python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --speaker ${speaker} --context_window_size ${context_window_size}
python -m torch.distributed.launch \
--nproc_per_node ${n_gpus} ../run_seq2seq.py \
--task_name ${task_name} \
--train_file ${train_file} \
--source_column ${source_column} \
--target_column ${target_column} \
--model_name_or_path ${model_name_or_path} \
--do_train \
--save_strategy epoch \
--prediction_loss_only \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_train_batch_size ${per_device_train_batch_size} \
--gradient_accumulation_steps ${gradient_accumulation_steps} \
--learning_rate ${lr} \
--num_train_epochs ${num_train_epochs} \
--debug underflow_overflow \
--adafactor \
--gradient_checkpointing
python -m torch.distributed.launch \
--nproc_per_node ${n_gpus} ../run_seq2seq.py \
--task_name ${task_name} \
--test_file ${test_file} \
--source_column ${source_column} \
--target_column ${target_column} \
--model_name_or_path ${output_dir} \
--do_predict \
--predict_with_generate \
--metric_name_or_path ${metric_name_or_path} \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_eval_batch_size ${per_device_eval_batch_size}
python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
python ../../../nlu/evaluate_unified_datasets.py -p ${output_dir}/predictions.json
n_gpus=4
task_name="dst"
dataset_name="multiwoz21"
n_gpus=1
task_name="nlu"
dataset_name=$1
speaker="user"
context_window_size=100
context_window_size=$2
data_dir="data/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
output_dir="output/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
cache_dir="../cache"
@@ -10,28 +10,26 @@ logging_dir="${output_dir}/runs"
train_file="${data_dir}/train.json"
validation_file="${data_dir}/validation.json"
test_file="${data_dir}/test.json"
metric_name_or_path="dst_metric.py"
metric_for_best_model="accuracy"
metric_name_or_path="nlu_metric.py"
metric_for_best_model="overall_f1"
source_column="context"
target_column="state_seq"
target_column="dialogue_acts_seq"
truncation_side="left"
max_source_length=512
max_target_length=512
model_name_or_path="t5-small"
per_device_train_batch_size=64
per_device_train_batch_size=128
per_device_eval_batch_size=64
gradient_accumulation_steps=2
lr=1e-3
num_train_epochs=10
python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --speaker ${speaker} --context_window_size ${context_window_size}
python ../create_data.py -t ${task_name} -d ${dataset_name} -s ${speaker} -c ${context_window_size}
python -m torch.distributed.launch \
--nproc_per_node ${n_gpus} ../run_seq2seq.py \
python ../run_seq2seq.py \
--task_name ${task_name} \
--train_file ${train_file} \
--validation_file ${validation_file} \
--test_file ${test_file} \
--source_column ${source_column} \
--target_column ${target_column} \
--max_source_length ${max_source_length} \
@@ -40,7 +38,6 @@ python -m torch.distributed.launch \
--model_name_or_path ${model_name_or_path} \
--do_train \
--do_eval \
--do_predict \
--save_strategy epoch \
--evaluation_strategy epoch \
--prediction_loss_only \
@@ -58,8 +55,7 @@ python -m torch.distributed.launch \
--adafactor \
--gradient_checkpointing
python -m torch.distributed.launch \
--nproc_per_node ${n_gpus} ../run_seq2seq.py \
python ../run_seq2seq.py \
--task_name ${task_name} \
--test_file ${test_file} \
--source_column ${source_column} \
@@ -76,8 +72,8 @@ python -m torch.distributed.launch \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_eval_batch_size ${per_device_eval_batch_size} \
--per_device_eval_batch_size ${per_device_eval_batch_size}
python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
python ../../../dst/evaluate_unified_datasets.py -p ${output_dir}/predictions.json
python ../../../nlu/evaluate_unified_datasets.py -p ${output_dir}/predictions.json
n_gpus=1
task_name="nlu"
dataset_name="tm1"
speaker="user"
context_window_size=0
data_dir="data/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
output_dir="output/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
cache_dir="../cache"
logging_dir="${output_dir}/runs"
train_file="${data_dir}/train.json"
validation_file="${data_dir}/validation.json"
test_file="${data_dir}/test.json"
metric_name_or_path="nlu_metric.py"
metric_for_best_model="overall_f1"
source_column="context"
target_column="dialogue_acts_seq"
model_name_or_path="t5-small"
per_device_train_batch_size=128
per_device_eval_batch_size=64
gradient_accumulation_steps=2
lr=1e-3
num_train_epochs=10
python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --speaker ${speaker} --context_window_size ${context_window_size}
python -m torch.distributed.launch \
--nproc_per_node ${n_gpus} ../run_seq2seq.py \
--task_name ${task_name} \
--train_file ${train_file} \
--source_column ${source_column} \
--target_column ${target_column} \
--model_name_or_path ${model_name_or_path} \
--do_train \
--save_strategy epoch \
--prediction_loss_only \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_train_batch_size ${per_device_train_batch_size} \
--gradient_accumulation_steps ${gradient_accumulation_steps} \
--learning_rate ${lr} \
--num_train_epochs ${num_train_epochs} \
--debug underflow_overflow \
--adafactor \
--gradient_checkpointing
python -m torch.distributed.launch \
--nproc_per_node ${n_gpus} ../run_seq2seq.py \
--task_name ${task_name} \
--test_file ${test_file} \
--source_column ${source_column} \
--target_column ${target_column} \
--model_name_or_path ${output_dir} \
--do_predict \
--predict_with_generate \
--metric_name_or_path ${metric_name_or_path} \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_eval_batch_size ${per_device_eval_batch_size}
python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
python ../../../nlu/evaluate_unified_datasets.py -p ${output_dir}/predictions.json
n_gpus=1
task_name="nlu"
dataset_name="tm2"
speaker="user"
context_window_size=0
data_dir="data/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
output_dir="output/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
cache_dir="../cache"
logging_dir="${output_dir}/runs"
train_file="${data_dir}/train.json"
validation_file="${data_dir}/validation.json"
test_file="${data_dir}/test.json"
metric_name_or_path="nlu_metric.py"
metric_for_best_model="overall_f1"
source_column="context"
target_column="dialogue_acts_seq"
model_name_or_path="t5-small"
per_device_train_batch_size=128
per_device_eval_batch_size=64
gradient_accumulation_steps=2
lr=1e-3
num_train_epochs=10
python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --speaker ${speaker} --context_window_size ${context_window_size}
python -m torch.distributed.launch \
--nproc_per_node ${n_gpus} ../run_seq2seq.py \
--task_name ${task_name} \
--train_file ${train_file} \
--source_column ${source_column} \
--target_column ${target_column} \
--model_name_or_path ${model_name_or_path} \
--do_train \
--save_strategy epoch \
--prediction_loss_only \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_train_batch_size ${per_device_train_batch_size} \
--gradient_accumulation_steps ${gradient_accumulation_steps} \
--learning_rate ${lr} \
--num_train_epochs ${num_train_epochs} \
--debug underflow_overflow \
--adafactor \
--gradient_checkpointing
python -m torch.distributed.launch \
--nproc_per_node ${n_gpus} ../run_seq2seq.py \
--task_name ${task_name} \
--test_file ${test_file} \
--source_column ${source_column} \
--target_column ${target_column} \
--model_name_or_path ${output_dir} \
--do_predict \
--predict_with_generate \
--metric_name_or_path ${metric_name_or_path} \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_eval_batch_size ${per_device_eval_batch_size}
python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
python ../../../nlu/evaluate_unified_datasets.py -p ${output_dir}/predictions.json
n_gpus=1
task_name="nlu"
dataset_name="tm2"
speaker="user"
context_window_size=3
data_dir="data/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
output_dir="output/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
cache_dir="../cache"
logging_dir="${output_dir}/runs"
train_file="${data_dir}/train.json"
validation_file="${data_dir}/validation.json"
test_file="${data_dir}/test.json"
metric_name_or_path="nlu_metric.py"
metric_for_best_model="overall_f1"
source_column="context"
target_column="dialogue_acts_seq"
model_name_or_path="t5-small"
per_device_train_batch_size=128
per_device_eval_batch_size=64
gradient_accumulation_steps=2
lr=1e-3
num_train_epochs=10
python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --speaker ${speaker} --context_window_size ${context_window_size}
python -m torch.distributed.launch \
--nproc_per_node ${n_gpus} ../run_seq2seq.py \
--task_name ${task_name} \
--train_file ${train_file} \
--source_column ${source_column} \
--target_column ${target_column} \
--model_name_or_path ${model_name_or_path} \
--do_train \
--save_strategy epoch \
--prediction_loss_only \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_train_batch_size ${per_device_train_batch_size} \
--gradient_accumulation_steps ${gradient_accumulation_steps} \
--learning_rate ${lr} \
--num_train_epochs ${num_train_epochs} \
--debug underflow_overflow \
--adafactor \
--gradient_checkpointing
python -m torch.distributed.launch \
--nproc_per_node ${n_gpus} ../run_seq2seq.py \
--task_name ${task_name} \
--test_file ${test_file} \
--source_column ${source_column} \
--target_column ${target_column} \
--model_name_or_path ${output_dir} \
--do_predict \
--predict_with_generate \
--metric_name_or_path ${metric_name_or_path} \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_eval_batch_size ${per_device_eval_batch_size}
python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
python ../../../nlu/evaluate_unified_datasets.py -p ${output_dir}/predictions.json
n_gpus=1
task_name="nlu"
dataset_name="tm3"
speaker="user"
context_window_size=0
data_dir="data/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
output_dir="output/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
cache_dir="../cache"
logging_dir="${output_dir}/runs"
train_file="${data_dir}/train.json"
validation_file="${data_dir}/validation.json"
test_file="${data_dir}/test.json"
metric_name_or_path="nlu_metric.py"
metric_for_best_model="overall_f1"
source_column="context"
target_column="dialogue_acts_seq"
model_name_or_path="t5-small"
per_device_train_batch_size=128
per_device_eval_batch_size=64
gradient_accumulation_steps=2
lr=1e-3
num_train_epochs=10
python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --speaker ${speaker} --context_window_size ${context_window_size}
python -m torch.distributed.launch \
--nproc_per_node ${n_gpus} ../run_seq2seq.py \
--task_name ${task_name} \
--train_file ${train_file} \
--source_column ${source_column} \
--target_column ${target_column} \
--model_name_or_path ${model_name_or_path} \
--do_train \
--save_strategy epoch \
--prediction_loss_only \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_train_batch_size ${per_device_train_batch_size} \
--gradient_accumulation_steps ${gradient_accumulation_steps} \
--learning_rate ${lr} \
--num_train_epochs ${num_train_epochs} \
--debug underflow_overflow \
--adafactor \
--gradient_checkpointing
python -m torch.distributed.launch \
--nproc_per_node ${n_gpus} ../run_seq2seq.py \
--task_name ${task_name} \
--test_file ${test_file} \
--source_column ${source_column} \
--target_column ${target_column} \
--model_name_or_path ${output_dir} \
--do_predict \
--predict_with_generate \
--metric_name_or_path ${metric_name_or_path} \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_eval_batch_size ${per_device_eval_batch_size}
python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
python ../../../nlu/evaluate_unified_datasets.py -p ${output_dir}/predictions.json
n_gpus=1
task_name="nlu"
dataset_name="tm3"
speaker="user"
context_window_size=3
data_dir="data/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
output_dir="output/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
cache_dir="../cache"
logging_dir="${output_dir}/runs"
train_file="${data_dir}/train.json"
validation_file="${data_dir}/validation.json"
test_file="${data_dir}/test.json"
metric_name_or_path="nlu_metric.py"
metric_for_best_model="overall_f1"
source_column="context"
target_column="dialogue_acts_seq"
model_name_or_path="t5-small"
per_device_train_batch_size=128
per_device_eval_batch_size=64
gradient_accumulation_steps=2
lr=1e-3
num_train_epochs=10
python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --speaker ${speaker} --context_window_size ${context_window_size}
python -m torch.distributed.launch \
--nproc_per_node ${n_gpus} ../run_seq2seq.py \
--task_name ${task_name} \
--train_file ${train_file} \
--source_column ${source_column} \
--target_column ${target_column} \
--model_name_or_path ${model_name_or_path} \
--do_train \
--save_strategy epoch \
--prediction_loss_only \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_train_batch_size ${per_device_train_batch_size} \
--gradient_accumulation_steps ${gradient_accumulation_steps} \
--learning_rate ${lr} \
--num_train_epochs ${num_train_epochs} \
--debug underflow_overflow \
--adafactor \
--gradient_checkpointing
python -m torch.distributed.launch \
--nproc_per_node ${n_gpus} ../run_seq2seq.py \
--task_name ${task_name} \
--test_file ${test_file} \
--source_column ${source_column} \
--target_column ${target_column} \
--model_name_or_path ${output_dir} \
--do_predict \
--predict_with_generate \
--metric_name_or_path ${metric_name_or_path} \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_eval_batch_size ${per_device_eval_batch_size}
python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
python ../../../nlu/evaluate_unified_datasets.py -p ${output_dir}/predictions.json
def serialize_dialogue_acts(dialogue_acts):
da_seqs = []
da_dict = {}
for da_type in dialogue_acts:
for da in dialogue_acts[da_type]:
intent, domain, slot = da['intent'], da['domain'], da['slot']
if da_type == 'binary':
da_seq = f'[{da_type}][{intent}][{domain}][{slot}]'
else:
value = da['value']
da_seq = f'[{da_type}][{intent}][{domain}][{slot}][{value}]'
da_seqs.append(da_seq)
return ';'.join(da_seqs)
intent, domain, slot, value = da['intent'], da['domain'], da['slot'], da.get('value', '')
intent_domain = f'[{intent}][{domain}]'
da_dict.setdefault(intent_domain, [])
da_dict[intent_domain].append(f'[{slot}][{value}]')
return ';'.join([intent_domain+'('+','.join(slot_values)+')' for intent_domain, slot_values in da_dict.items()])
def deserialize_dialogue_acts(das_seq):
dialogue_acts = {'binary': [], 'categorical': [], 'non-categorical': []}
dialogue_acts = []
if len(das_seq) == 0:
return dialogue_acts
da_seqs = das_seq.split('];[')
da_seqs = das_seq.split(']);[') # will consume "])" and "["
for i, da_seq in enumerate(da_seqs):
if len(da_seq) == 0:
if len(da_seq) == 0 or len(da_seq.split(']([')) != 2:
continue
if i == 0:
if da_seq[0] == '[':
da_seq = da_seq[1:]
if i == len(da_seqs) - 1:
if da_seq[-1] == ']':
da_seq = da_seq[:-1]
da = da_seq.split('][')
if len(da) == 0:
if da_seq[-2:] == '])':
da_seq = da_seq[:-2]
try:
intent_domain, slot_values = da_seq.split(']([')
intent, domain = intent_domain.split('][')
except:
continue
for slot_value in slot_values.split('],['):
try:
slot, value = slot_value.split('][')
except:
continue
da_type = da[0]
if len(da) == 5 and da_type in ['categorical', 'non-categorical']:
dialogue_acts[da_type].append({'intent': da[1], 'domain': da[2], 'slot': da[3], 'value': da[4]})
elif len(da) == 4 and da_type == 'binary':
dialogue_acts[da_type].append({'intent': da[1], 'domain': da[2], 'slot': da[3]})
else:
# invalid da format, skip
# print(das_seq)
# print(da_seq)
# print()
pass
dialogue_acts.append({'intent': intent, 'domain': domain, 'slot': slot, 'value': value})
return dialogue_acts
def equal_da_seq(dialogue_acts, das_seq):
predict_dialogue_acts = deserialize_dialogue_acts(das_seq)
for da_type in ['binary', 'categorical', 'non-categorical']:
das = sorted([(da['intent'], da['domain'], da['slot'], da.get('value', '')) for da in dialogue_acts[da_type]])
predict_das = sorted([(da['intent'], da['domain'], da['slot'], da.get('value', '')) for da in predict_dialogue_acts[da_type]])
das = sorted([(da['intent'], da['domain'], da['slot'], da.get('value', '')) for da_type in ['binary', 'categorical', 'non-categorical'] for da in dialogue_acts[da_type]])
predict_das = sorted([(da['intent'], da['domain'], da['slot'], da.get('value', '')) for da in predict_dialogue_acts])
if das != predict_das:
return False
return True
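Analogously, a hand-made example of the old and new dialogue act serializations, with invented acts:

# One non-categorical act and one binary act.
# Old format: [da_type][intent][domain][slot][value] per act (no value for binary acts).
old_seq = '[non-categorical][inform][taxi][leave at][17:15];[binary][thank][general][]'
# New format: slot-value pairs grouped under [intent][domain]; binary acts get empty brackets.
new_seq = '[inform][taxi]([leave at][17:15]);[thank][general]([][])'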