Commit 245360f0 authored by zz-jacob

Merge branch 'master' of github.com:ConvLab/ConvLab-3

parents d6295047 2cd7eeea
Showing with 156 additions and 1871 deletions
@@ -66,7 +66,8 @@ convlab/nlu/jointBERT_new/**/output/
convlab/nlu/milu/09*
convlab/nlu/jointBERT/multiwoz/configs/multiwoz_new_usr_context.json
convlab/nlu/milu/multiwoz/configs/system_without_context.jsonnet
convlab/nlu/milu/multiwoz/configs/user_without_context.jsonnet
*.pkl
# test script
*_test.py
@@ -87,7 +88,6 @@ dist
convlab.egg-info
# configs
*experiment*
*pretrained_models*
.ipynb_checkpoints
@@ -102,8 +102,8 @@ convlab/dst/trade/multiwoz_config/
convlab/deploy/bert_multiwoz_all.zip
convlab/deploy/templates/dialog_eg.html
test.py
*convlab/policy/vector/action_dicts
*.egg-info
pre-trained-models/
venv
import os
import json
from tqdm import tqdm
import re
from convlab.util import load_dataset
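# Build language-modeling data from a dataset in the unified format: one JSON line per
# dialogue, either joined with <|endoftext|> (dialogpt) or as "speaker: utterance" lines (gpt).
# Example invocation (cf. the training scripts below):
#   python create_data.py --tasks lm --datasets multiwoz21 --model_type gpt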
def create_lm_data(dataset, data_dir, args):
data_by_split = dataset
os.makedirs(data_dir, exist_ok=True)
data_splits = data_by_split.keys()
for data_split in data_splits:
data = []
for sample in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False):
if args.model_type == 'dialogpt':
dialogue = ' <|endoftext|> '.join([turn['utterance'] for turn in sample['turns']]) + ' <|endoftext|>'
else:
dialogue = '\n'.join([f"{turn['speaker']}: {turn['utterance']}" for turn in sample['turns']])
data.append(json.dumps({'dialogue': dialogue}, ensure_ascii=False)+'\n')
file_name = os.path.join(data_dir, f"{data_split}.json")
with open(file_name, "w", encoding='utf-8') as f:
f.writelines(data)
if __name__ == '__main__':
from argparse import ArgumentParser
parser = ArgumentParser(description="create data for seq2seq training")
parser.add_argument('--tasks', '-t', metavar='task_name', nargs='*', choices=['lm'], help='names of tasks')
parser.add_argument('--datasets', '-d', metavar='dataset_name', nargs='*', help='names of unified datasets')
parser.add_argument('--model_type', '-m', metavar='model_type', help='type of the language model: gpt, dialogpt, ..')
args = parser.parse_args()
print(args)
for dataset_name in tqdm(args.datasets, desc='datasets'):
dataset = load_dataset(dataset_name)
for task_name in tqdm(args.tasks, desc='tasks', leave=False):
data_dir = os.path.join('data', task_name, args.model_type, dataset_name)
eval(f"create_{task_name}_data")(dataset, data_dir, args)
import json
import datasets
from tabulate import tabulate
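# Evaluate keywords-to-response generation from a JSON-lines prediction file
# (fields: "predictions", "target", and optionally "keywords" / "all_keywords"),
# scoring the grounded-keywords, all-keywords, and no-keywords settings separately.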
def main(predict_result):
data = {
"grounded keywords": {
"positive_keywords": [], "negative_keywords": None,
"predictions": [], "references": []
},
"all keywords": {
"positive_keywords": [], "negative_keywords": [],
"predictions": [], "references": []
},
"no keywords": {
"positive_keywords": None, "negative_keywords": None,
"predictions": [], "references": []
}
}
with open(predict_result) as f:
for line in f:
item = json.loads(line)
prediction = item['predictions'].strip()
reference = item['target'].strip()
if 'all_keywords' in item and item['all_keywords']:
sample_type = 'all keywords'
positive_keywords = [k for g in item['keywords'] for k in g]
data[sample_type]["positive_keywords"].append(positive_keywords)
all_keywords = [k for g in item['all_keywords'] for k in g]
for keyword in positive_keywords:
all_keywords.remove(keyword)
data[sample_type]["negative_keywords"].append(all_keywords)
elif 'keywords' in item and item['keywords']:
sample_type = 'grounded keywords'
positive_keywords = [k for g in item['keywords'] for k in g]
data[sample_type]["positive_keywords"].append(positive_keywords)
else:
sample_type = 'no keywords'
data[sample_type]["predictions"].append(prediction)
data[sample_type]["references"].append(reference)
metric = datasets.load_metric('./key2gen_metric.py')
table = []
for sample_type in data:
table.append({'sample_type': sample_type, **metric.compute(**data[sample_type])})
print(tabulate(table, headers='keys', tablefmt='github'))
if __name__ == '__main__':
from argparse import ArgumentParser
parser = ArgumentParser(description="evaluate keywords to response generation performance")
parser.add_argument('--predict_result', '-p', type=str, required=True, help='path to the output file generated_predictions.json')
args = parser.parse_args()
print(args)
main(args.predict_result)
import json
import json_lines
import os
import random
from tqdm import tqdm
from nltk import sent_tokenize
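# Turn keyword-extraction output (keywords_{split}.json) into seq2seq training data for
# the rg / key2gen / key2gen_noisy tasks. Example invocation (cf. the shell script below):
#   python gen_pretraining_data.py -i data/lm/gpt/metalwoz -o data/key2gen/gpt/metalwoz -m key2gen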
def main(args):
random.seed(42)
os.makedirs(args.output_dir, exist_ok=True)
filenames = [os.path.join(args.input_dir, f) for (_, _, fs) in os.walk(args.input_dir) for f in fs if 'keywords' in f]
for filename in filenames:
dataset_name = filename.split('/')[-2]
data_split = filename.split('/')[-1].split('_')[-1].split('.')[0]
output_file = os.path.join(args.output_dir, f"{filename.split('/')[-1].split('_')[-1]}")
print(f'processing {dataset_name}: {filename} => {output_file}')
with open(filename, 'rb') as fin, open(output_file, 'w', encoding='utf-8') as fout:
for dial in tqdm(json_lines.reader(fin)):
context = []
turns_keywords = [turn['keywords'] for turn in dial]
for i, turn in enumerate(dial):
if dataset_name == 'wikidialog':
# skip user turns that were generated by T5 in WikiDialog
speaker = 'user' if i % 2 == 1 else 'system'
else:
speaker = 'user' if i % 2 == 0 else 'system'
utt = turn['utterance']
context_seq = '\n'.join([f"{turn['speaker']}: {turn['utt']}" for turn in context]+[f'{speaker}: '])
context.append({'speaker': speaker, 'utt': utt})
if i == 0 or (dataset_name == 'wikidialog' and speaker == 'user'):
continue
if args.mode == 'rg':
input_seq = f'generate a response: all knowledge: | | context:\n\n{context_seq}'
fout.write(json.dumps({
'dataset': dataset_name,
'source': input_seq,
'target': utt
}, ensure_ascii=False)+'\n')
continue
if args.mode == 'key2gen':
random.shuffle(turn['keywords'])
for j in range(len(turn['keywords'])):
random.shuffle(turn['keywords'][j])
keywords = ' | '.join([' : '.join(sent_keywords) for sent_keywords in turn['keywords']])
input_seq = f'generate a response: grounded knowledge: | {keywords} | context:\n\n{context_seq}'
json2dump = {
'dataset': dataset_name,
'source': input_seq,
'target': utt
}
if data_split == 'validation':
json2dump.update({'keywords': turn['keywords']})
fout.write(json.dumps(json2dump, ensure_ascii=False)+'\n')
continue
if args.mode == 'key2gen_noisy':
if random.random() < 0.8:
possible_keywords_sents = turn['keywords'][:]
else:
possible_keywords_sents = []
num_possible_keywords_turns = min(random.randint(1, 5), len(turns_keywords) - 1)
for turn_keywords in random.sample(turns_keywords[:i] + turns_keywords[i+1:], num_possible_keywords_turns):
possible_keywords_sents.extend(turn_keywords)
random.shuffle(possible_keywords_sents)
possible_keywords = ' | '.join([' : '.join(sent_keywords) for sent_keywords in possible_keywords_sents])
input_seq = f'generate a response: all knowledge: | {possible_keywords} | context:\n\n{context_seq}'
json2dump = {
'dataset': dataset_name,
'source': input_seq,
'target': utt
}
if data_split == 'validation':
json2dump.update({'keywords': turn['keywords'], 'all_keywords': possible_keywords_sents})
fout.write(json.dumps(json2dump, ensure_ascii=False)+'\n')
continue
if __name__ == '__main__':
from argparse import ArgumentParser
parser = ArgumentParser(description="calculate NLU metrics for unified datasets")
parser.add_argument('--input_dir', '-i', type=str, help='path to the input files')
parser.add_argument('--output_dir', '-o', type=str, help='path to the output files')
parser.add_argument('--mode', '-m', type=str, choices=['rg', 'key2gen', 'key2gen_noisy'], help='which task to perform')
args = parser.parse_args()
print(args)
main(args)
# generate data for response generation, key2gen, key2gen_noisy
for task_name in rg key2gen key2gen_noisy
do
dataset_name="dailydialog+metalwoz+tm1+tm2+tm3+sgd+reddit+wikidialog"
names=$(echo ${dataset_name} | tr "+" "\n")
model_type="gpt"
data_dir=data/${task_name}/${model_type}/${dataset_name}
mkdir -p ${data_dir}
train_file="${data_dir}/train.json"
validation_file="${data_dir}/validation.json"
rm -f ${train_file} ${validation_file}
for name in ${names}
do
echo "preprocessing ${name}"
python gen_pretraining_data.py -i data/lm/${model_type}/${name} -o data/${task_name}/${model_type}/${name} -m ${task_name}
if [ "${name}" != "${dataset_name}" ]; then
cat "data/${task_name}/${model_type}/${name}/train.json" >> ${train_file}
cat "data/${task_name}/${model_type}/${name}/validation.json" >> ${validation_file}
fi
done
done
# merge key2gen+key2gen_noisy data
task_name="key2gen+key2gen_noisy"
dataset_name="dailydialog+metalwoz+tm1+tm2+tm3+sgd+reddit+wikidialog"
names=$(echo ${task_name} | tr "+" "\n")
model_type="gpt"
data_dir=data/${task_name}/${model_type}/${dataset_name}
mkdir -p ${data_dir}
train_file="${data_dir}/train.json"
validation_file="${data_dir}/validation.json"
rm -f ${train_file} ${validation_file}
for name in ${names}
do
echo "preprocessing ${name}"
if [ "${name}" != "${task_name}" ]; then
cat "data/${name}/${model_type}/${dataset_name}/train.json" >> ${train_file}
cat "data/${name}/${model_type}/${dataset_name}/validation.json" >> ${validation_file}
fi
done
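# Extract keywords for every dataset from the per-token LM losses (token_loss_{split}.json)
# using lmloss2keywords.py; produces keywords_{split}.json under each data directory.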
task_name="lm"
model_type="gpt"
model_name_or_path="gpt2-large"
keywords_num=100
keywords_ratio=0.3
keywords_loss_th=0
stopwords=True
for dataset_name in dailydialog metalwoz tm1 tm2 tm3 sgd reddit wikidialog
do
data_dir="data/${task_name}/${model_type}/${dataset_name}"
for data_split in validation train
do
token_loss_file="${data_dir}/token_loss_${data_split}.json"
output_file="${data_dir}/keywords_${data_split}.json"
python lmloss2keywords.py \
--model_type ${model_type} \
--model_name_or_path ${model_name_or_path} \
--token_loss_file ${token_loss_file} \
--keywords_num ${keywords_num} \
--keywords_ratio ${keywords_ratio} \
--keywords_loss_th ${keywords_loss_th} \
--stopwords ${stopwords} \
--output_file ${output_file}
done
done
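# Compute per-token LM losses for each dataset with run_clm.py --dump_eval_loss_to,
# writing token_loss_{split}.json files that feed the keyword-extraction step above.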
n_gpus=4
master_port=23456
task_name="lm"
model_type="gpt"
cache_dir="../cache"
source_column="dialogue"
max_length=512
model_name_or_path="gpt2-large"
per_device_eval_batch_size=16
for dataset_name in dailydialog metalwoz tm1 tm2 tm3 sgd reddit wikidialog
do
data_dir="data/${task_name}/${model_type}/${dataset_name}"
output_dir="output/${task_name}/${model_type}/${dataset_name}"
python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --model_type ${model_type}
for data_split in validation train
do
validation_file="${data_dir}/${data_split}.json"
dump_eval_loss_to="${data_dir}/token_loss_${data_split}.json"
rm -f ${dump_eval_loss_to}
python -m torch.distributed.launch --master_port ${master_port} \
--nproc_per_node ${n_gpus} ../run_clm.py \
--dump_eval_loss_to ${dump_eval_loss_to}\
--model_name_or_path ${model_name_or_path} \
--output_dir ${data_dir} \
--validation_file ${validation_file} \
--source_column ${source_column} \
--max_length ${max_length} \
--do_eval \
--cache_dir ${cache_dir} \
--preprocessing_num_workers 4 \
--per_device_eval_batch_size ${per_device_eval_batch_size}
done
done
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""key2gen Metric"""
import datasets
import sacrebleu
# TODO: Add BibTeX citation
_CITATION = """\
@inproceedings{post-2018-call,
title = "A Call for Clarity in Reporting {BLEU} Scores",
author = "Post, Matt",
booktitle = "Proceedings of the Third Conference on Machine Translation: Research Papers",
month = oct,
year = "2018",
address = "Belgium, Brussels",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/W18-6319",
pages = "186--191",
}
"""
_DESCRIPTION = """\
Metric to evaluate text-to-text models on the keywords grounded generation task.
"""
_KWARGS_DESCRIPTION = """
Calculates corpus-bleu4, positive keywords recall, negative keywords recall
Args:
positive_keywords: list of keywords (list of string) in the ground truth references
negative_keywords: list of keywords (list of string) in the random sampled references
predictions: list of predictions to score. Each predictions
should be a string.
references: list of reference for each prediction. Each
reference should be a string.
Returns:
bleu: corpus-bleu score
positive_keywords_recall: how many keywords in the ground truth response are generated, micro-averaged
negative_keywords_recall: how many keywords in the random sampled response are generated, micro-averaged
"""
@datasets.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class Key2GenMetrics(datasets.Metric):
"""Metric to evaluate text-to-text models on the keywords grounded generation task."""
def _info(self):
return datasets.MetricInfo(
description=_DESCRIPTION,
citation=_CITATION,
inputs_description=_KWARGS_DESCRIPTION,
# This defines the format of each prediction and reference
features=datasets.Features({
'predictions': datasets.Value('string'),
'references': datasets.Value('string'),
})
)
def _compute(self, predictions, references, positive_keywords, negative_keywords=None):
"""Returns the scores: bleu, positive_keywords_recall, negative_keywords_recall"""
bleu = sacrebleu.corpus_bleu(predictions, [references], lowercase=True).score
cnt = {'pos': 0, 'neg': 0, 'pos_recall': 0, 'neg_recall': 0}
if positive_keywords:
if not negative_keywords:
negative_keywords = [[]] * len(positive_keywords)
for poskeys, negkeys, prediction in zip(positive_keywords, negative_keywords, predictions):
cnt['pos'] += len(poskeys)
cnt['neg'] += len(negkeys)
prediction = prediction.lower()
for key in poskeys:
key = key.lower()
if key in prediction:
cnt['pos_recall'] += 1
for key in negkeys:
key = key.lower()
if key in prediction:
cnt['neg_recall'] += 1
return {
"bleu": bleu,
"positive_keywords_recall": cnt['pos_recall']/cnt['pos'] if cnt['pos'] > 0 else 0,
"negative_keywords_recall": cnt['neg_recall']/cnt['neg'] if cnt['neg'] > 0 else 0,
}
import json
import json_lines
from pprint import pprint
import os
from tqdm import tqdm
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, PunktSentenceTokenizer
from transformers import GPT2Tokenizer
from string import punctuation
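# Convert per-token LM losses (from run_clm.py --dump_eval_loss_to) into per-word losses
# and extract high-loss words as keywords. Example invocation (cf. the shell script above):
#   python lmloss2keywords.py --model_type gpt --model_name_or_path gpt2-large \
#       --token_loss_file data/lm/gpt/metalwoz/token_loss_validation.json \
#       --keywords_num 100 --keywords_ratio 0.3 --keywords_loss_th 0 --stopwords True \
#       --output_file data/lm/gpt/metalwoz/keywords_validation.json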
def merge_tokens(tokens, losses):
"""Merge tokens into words"""
res = []
i = 0
while i < len(tokens):
token = tokens[i]
loss = losses[i]
if token in ['Ġ', 'Ċ']:
# "Ġ" means " ", "Ċ" means "\n"
if token == 'Ċ' and i < len(tokens) - 1 and not tokens[i+1].startswith('Ġ'):
tokens[i+1] = 'Ġ'+tokens[i+1]
i += 1
continue
if token in ['user', 'system', 'Ġuser', 'Ġsystem'] and i < len(tokens)-1 and tokens[i+1] == ':':
if i > 0:
tokens[i+1] = '<|endoftext|>'
i += 1
else:
i += 2
continue
if token.startswith('Ġ'):
# token = token.replace("Ġ", "")
res.append([[token], [loss]])
elif token == '<|endoftext|>':
res.append([[token], [0.]])
else:
assert 'Ġ' not in token
if len(res) > 0:
res[-1][0].append(token)
res[-1][1].append(loss)
else:
res.append([[token], [loss]])
i += 1
return res
def convert_token_loss2word_loss(token_loss_file):
"""generate a word loss file according to the token loss file"""
word_loss_file = os.path.join(os.path.dirname(token_loss_file), token_loss_file.split('/')[-1].replace('token', 'word'))
fin = open(token_loss_file, 'rb')
fout = open(word_loss_file, 'w', encoding='utf-8')
for item in tqdm(json_lines.reader(fin)):
tokens, losses = item['tokens'], item['losses']
assert len(tokens) == len(losses)
word2losses = merge_tokens(tokens, losses)
fout.write(json.dumps({"words": [x[0] for x in word2losses], "losses": [x[1] for x in word2losses]}, ensure_ascii=False)+'\n')
fin.close()
fout.close()
return word_loss_file
def main(args):
if not args.word_loss_file:
word_loss_file = convert_token_loss2word_loss(args.token_loss_file)
else:
word_loss_file = args.word_loss_file
if not args.output_file:
return
stop_words = set(stopwords.words('english'))
tokenizer = GPT2Tokenizer.from_pretrained(args.model_name_or_path)
sent_tokenizer = PunktSentenceTokenizer()
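# keywords_filter: keep non-stopword, non-punctuation words whose loss exceeds the
# threshold, take the top-k by loss, and group the selected keywords by (turn, sentence).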
def keywords_filter(words, losses):
word_loss_pairs = list(zip(words, losses))
index2keyword = {}
index2turn_sent = {}
num_turns = 0
turns_sent_spans = [list(sent_tokenizer.span_tokenize(utt)) for utt in ''.join(words).strip().split('<|endoftext|>')]
utt = ''
for i, word_loss_pair in enumerate(word_loss_pairs):
if word_loss_pair[0].startswith('<|endoftext|>'):
num_turns += 1
utt = ''
continue
utt += word_loss_pair[0]
words = word_tokenize(word_loss_pair[0])
if args.stopwords and any([w.lower() in stop_words for w in words]):
# skip stopwords
continue
if word_loss_pair[1] <= args.keywords_loss_th:
# skip if loss is too small
continue
# strip punctuation
strip_punctuation = word_loss_pair[0].strip(punctuation).strip()
if len(strip_punctuation) == 0:
# skip punctuation
continue
index2keyword[i] = strip_punctuation
for sent_idx, (sent_start, sent_end) in enumerate(turns_sent_spans[num_turns]):
if len(utt.strip()) <= sent_end:
index2turn_sent[i] = (num_turns, sent_idx)
break
candidate_indexes = list(index2keyword.keys())
topk = min(round(args.keywords_ratio*(len(word_loss_pairs)-num_turns)), args.keywords_num)
topk_indexes = sorted(candidate_indexes, key=lambda x: word_loss_pairs[x][1], reverse=True)[:topk]
topk_indexes = sorted(topk_indexes)
keywords = []
keywords_turn_sent2idx = {}
for i, index in enumerate(topk_indexes):
if i > 0 and index == topk_indexes[i-1] + 1 and \
word_loss_pairs[index][0].strip().startswith(index2keyword[index]) and \
word_loss_pairs[topk_indexes[i-1]][0].strip().endswith(index2keyword[topk_indexes[i-1]]):
keywords[-1]+= ' '+index2keyword[index]
else:
keywords_turn_sent2idx.setdefault(index2turn_sent[index][0], {})
keywords_turn_sent2idx[index2turn_sent[index][0]].setdefault(index2turn_sent[index][1], [])
keywords_turn_sent2idx[index2turn_sent[index][0]][index2turn_sent[index][1]].append(len(keywords))
keywords.append(index2keyword[index])
return keywords, keywords_turn_sent2idx
fin = open(word_loss_file, 'rb')
fout = open(args.output_file, 'w', encoding='utf-8')
for item in tqdm(json_lines.reader(fin)):
words = [tokenizer.convert_tokens_to_string(tokens) for tokens in item['words']]
losses = [np.mean(loss) for loss in item['losses']]
dialog_keywords, keywords_turn_sent2idx = keywords_filter(words, losses)
# print(keywords_turn_sent2idx)
turns = []
turn = {'words': [], 'losses': []}
for i, (word, loss) in enumerate(zip(words, losses)):
if word != '<|endoftext|>':
turn['words'].append(word)
turn['losses'].append(loss)
if word == '<|endoftext|>' or i == len(words) - 1:
# switch turn
turn['utterance'] = ''.join(turn['words']).strip()
# option 1) (disabled): extract keywords according to LM loss within the turn
# keywords, _ = keywords_filter(turn['words'], turn['losses'])
# turn['turn-level_keywords'] = keywords
# option 2) (used): extract keywords according to LM loss over the whole dialog, grouped by sentence
turn['keywords'] = [[dialog_keywords[idx] for idx in k_idxes] for sent_idx, k_idxes in keywords_turn_sent2idx.get(len(turns), {}).items()]
turn.pop('words')
turn.pop('losses')
turns.append(turn)
turn = {'words': [], 'losses': []}
fout.write(json.dumps(turns, ensure_ascii=False)+'\n')
fin.close()
fout.close()
if __name__ == '__main__':
from argparse import ArgumentParser
parser = ArgumentParser(description="extract keywords according to lm loss")
parser.add_argument('--model_type', '-m', type=str, help='gpt or dialogpt')
parser.add_argument('--model_name_or_path', type=str, help='model name or path')
parser.add_argument('--token_loss_file', '-t', type=str, help='path to the token loss file that contains two columns: [tokens, losses]')
parser.add_argument('--word_loss_file', '-w', type=str, help='path to the word loss file that contains two columns: [words, losses]')
parser.add_argument('--output_file', '-o', type=str, help='path to the output file')
parser.add_argument('--keywords_num', '-n', type=int, default=100, help='how many words in an utterance serve as keywords')
parser.add_argument('--keywords_ratio', '-r', type=float, default=1.0, help='how many words (in ratio) in an utterance serve as keywords')
parser.add_argument('--keywords_loss_th', '-th', type=float, default=0., help='loss threshold for the keywords')
parser.add_argument('--stopwords', '-s', type=lambda x: bool(eval(x)), default=True, help='filter out stopwords')
args = parser.parse_args()
print(args)
main(args)
import json
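# Merge several keyword files (e.g., extracted with different settings) into a single
# JSON file so the keywords for each utterance can be compared side by side.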
def main(args):
filename2data = {f.split('/')[-1]: json.load(open(f)) for f in args.keywords_files}
first_filename = args.keywords_files[0].split('/')[-1]
dialogs = []
for i in range(len(filename2data[first_filename])):
turns = []
for j in range(min([len(filename2data[filename][i]) for filename in filename2data])):
utt = filename2data[first_filename][i][j]['utterance']
keywords = {filename.split('_')[3]+'_nonstopword'+filename.split('_')[-1]: ' | '.join(filename2data[filename][i][j]['keywords']) for filename in filename2data}
turns.append({
"utterance": utt,
**keywords
})
dialogs.append(turns)
json.dump(dialogs, open(args.output_file, "w", encoding='utf-8'), indent=2, ensure_ascii=False)
if __name__ == '__main__':
from argparse import ArgumentParser
parser = ArgumentParser(description="calculate NLU metrics for unified datasets")
parser.add_argument('--keywords_files', '-f', metavar='keywords_files', nargs='*', help='keywords files')
parser.add_argument('--output_file', '-o', type=str, help='path to the output file')
args = parser.parse_args()
print(args)
main(args)
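# Fine-tune DialoGPT-large as a dialogue language model on MultiWOZ 2.1:
# create the LM data with create_data.py, then train/evaluate with run_clm.py.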
set -e
n_gpus=1
task_name="lm"
dataset_name="multiwoz21"
model_type="dialogpt"
data_dir="data/${task_name}/${dataset_name}/${model_type}"
output_dir="output/${task_name}/${dataset_name}/${model_type}"
cache_dir="../cache"
logging_dir="${output_dir}/runs"
train_file="${data_dir}/train.json"
validation_file="${data_dir}/validation.json"
test_file="${data_dir}/test.json"
source_column="dialogue"
max_length=512
model_name_or_path="microsoft/DialoGPT-large"
per_device_train_batch_size=16
per_device_eval_batch_size=16
gradient_accumulation_steps=4
lr=5e-5
num_train_epochs=3
python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --model_type ${model_type}
python ../run_clm.py \
--model_name_or_path ${model_name_or_path} \
--train_file ${train_file} \
--validation_file ${validation_file} \
--source_column ${source_column} \
--max_length ${max_length} \
--do_train \
--do_eval \
--save_strategy epoch \
--evaluation_strategy epoch \
--load_best_model_at_end \
--prediction_loss_only \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_train_batch_size ${per_device_train_batch_size} \
--per_device_eval_batch_size ${per_device_eval_batch_size} \
--gradient_accumulation_steps ${gradient_accumulation_steps} \
--learning_rate ${lr} \
--num_train_epochs ${num_train_epochs} \
--gradient_checkpointing
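# Same recipe as above with GPT-2-large as the backbone language model.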
set -e
n_gpus=1
task_name="lm"
dataset_name="multiwoz21"
model_type="gpt"
data_dir="data/${task_name}/${dataset_name}/${model_type}"
output_dir="output/${task_name}/${dataset_name}/${model_type}"
cache_dir="../cache"
logging_dir="${output_dir}/runs"
train_file="${data_dir}/train.json"
validation_file="${data_dir}/validation.json"
test_file="${data_dir}/test.json"
source_column="dialogue"
max_length=512
model_name_or_path="gpt2-large"
per_device_train_batch_size=16
per_device_eval_batch_size=16
gradient_accumulation_steps=4
lr=5e-5
num_train_epochs=3
python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --model_type ${model_type}
python ../run_clm.py \
--model_name_or_path ${model_name_or_path} \
--train_file ${train_file} \
--validation_file ${validation_file} \
--source_column ${source_column} \
--max_length ${max_length} \
--do_train \
--do_eval \
--save_strategy epoch \
--evaluation_strategy epoch \
--load_best_model_at_end \
--prediction_loss_only \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_train_batch_size ${per_device_train_batch_size} \
--per_device_eval_batch_size ${per_device_eval_batch_size} \
--gradient_accumulation_steps ${gradient_accumulation_steps} \
--learning_rate ${lr} \
--num_train_epochs ${num_train_epochs} \
--gradient_checkpointing
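# Train t5-small on the merged key2gen + key2gen_noisy pre-training data,
# starting from the public t5-small checkpoint.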
set -e
n_gpus=8
master_port=23456
task_name="key2gen+key2gen_noisy"
dataset_name="dailydialog+metalwoz+tm1+tm2+tm3+sgd+reddit+wikidialog"
model_type="gpt"
model_name="t5-small"
data_dir="data/${task_name}/${model_type}/${dataset_name}"
output_dir="output/${task_name}/${model_name}/${dataset_name}"
cache_dir="../cache"
logging_dir="${output_dir}/runs"
train_file="${data_dir}/train.json"
source_column="source"
target_column="target"
truncation_side="left"
max_source_length=512
max_target_length=128
model_name_or_path="${model_name}"
per_device_train_batch_size=64
per_device_eval_batch_size=128
gradient_accumulation_steps=1
num_workers=16
lr=1e-3
num_train_epochs=1
python -m torch.distributed.launch --master_port ${master_port} \
--nproc_per_node ${n_gpus} ../../t5/run_seq2seq.py \
--task_name ${task_name} \
--train_file ${train_file} \
--source_column ${source_column} \
--target_column ${target_column} \
--max_source_length ${max_source_length} \
--max_target_length ${max_target_length} \
--truncation_side ${truncation_side} \
--model_name_or_path ${model_name_or_path} \
--do_train \
--save_steps 5000 \
--save_total_limit 1 \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--preprocessing_num_workers ${num_workers} \
--dataloader_num_workers ${num_workers} \
--per_device_train_batch_size ${per_device_train_batch_size} \
--per_device_eval_batch_size ${per_device_eval_batch_size} \
--gradient_accumulation_steps ${gradient_accumulation_steps} \
--learning_rate ${lr} \
--num_train_epochs ${num_train_epochs} \
--optim adafactor \
--lr_scheduler_type constant \
--gradient_checkpointing
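# Train t5-small on the plain response-generation (rg) pre-training data.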
set -e
n_gpus=8
master_port=23456
task_name="rg"
dataset_name="dailydialog+metalwoz+tm1+tm2+tm3+sgd+reddit+wikidialog"
model_type="gpt"
model_name="t5-small"
data_dir="data/${task_name}/${model_type}/${dataset_name}"
output_dir="output/${task_name}/${model_name}/${dataset_name}"
cache_dir="../cache"
logging_dir="${output_dir}/runs"
train_file="${data_dir}/train.json"
source_column="source"
target_column="target"
truncation_side="left"
max_source_length=512
max_target_length=128
model_name_or_path="${model_name}"
per_device_train_batch_size=64
per_device_eval_batch_size=128
gradient_accumulation_steps=1
num_workers=16
lr=1e-3
num_train_epochs=1
python -m torch.distributed.launch --master_port ${master_port} \
--nproc_per_node ${n_gpus} ../../t5/run_seq2seq.py \
--task_name ${task_name} \
--train_file ${train_file} \
--source_column ${source_column} \
--target_column ${target_column} \
--max_source_length ${max_source_length} \
--max_target_length ${max_target_length} \
--truncation_side ${truncation_side} \
--model_name_or_path ${model_name_or_path} \
--do_train \
--save_steps 5000 \
--save_total_limit 1 \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--preprocessing_num_workers ${num_workers} \
--dataloader_num_workers ${num_workers} \
--per_device_train_batch_size ${per_device_train_batch_size} \
--per_device_eval_batch_size ${per_device_eval_batch_size} \
--gradient_accumulation_steps ${gradient_accumulation_steps} \
--learning_rate ${lr} \
--num_train_epochs ${num_train_epochs} \
--optim adafactor \
--lr_scheduler_type constant \
--gradient_checkpointing
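# Continue training on key2gen + key2gen_noisy, initializing from the rg checkpoint
# trained above (output/rg/${model_name}/${dataset_name}).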
set -e
n_gpus=8
master_port=23456
task_name="key2gen+key2gen_noisy"
dataset_name="dailydialog+metalwoz+tm1+tm2+tm3+sgd+reddit+wikidialog"
model_type="gpt"
model_name="t5-small"
data_dir="data/${task_name}/${model_type}/${dataset_name}"
output_dir="output/${task_name}/${model_name}/${dataset_name}"
cache_dir="../cache"
logging_dir="${output_dir}/runs"
train_file="${data_dir}/train.json"
source_column="source"
target_column="target"
truncation_side="left"
max_source_length=512
max_target_length=128
model_name_or_path="output/rg/${model_name}/${dataset_name}"
per_device_train_batch_size=64
per_device_eval_batch_size=128
gradient_accumulation_steps=1
num_workers=16
lr=1e-3
num_train_epochs=1
python -m torch.distributed.launch --master_port ${master_port} \
--nproc_per_node ${n_gpus} ../../t5/run_seq2seq.py \
--task_name ${task_name} \
--train_file ${train_file} \
--source_column ${source_column} \
--target_column ${target_column} \
--max_source_length ${max_source_length} \
--max_target_length ${max_target_length} \
--truncation_side ${truncation_side} \
--model_name_or_path ${model_name_or_path} \
--do_train \
--save_steps 5000 \
--save_total_limit 1 \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--preprocessing_num_workers ${num_workers} \
--dataloader_num_workers ${num_workers} \
--per_device_train_batch_size ${per_device_train_batch_size} \
--per_device_eval_batch_size ${per_device_eval_batch_size} \
--gradient_accumulation_steps ${gradient_accumulation_steps} \
--learning_rate ${lr} \
--num_train_epochs ${num_train_epochs} \
--optim adafactor \
--lr_scheduler_type constant \
--gradient_checkpointing
from transformers import Trainer
from transformers.trainer_utils import EvalLoopOutput, has_length
from transformers.deepspeed import deepspeed_init
from transformers.utils import logging
from transformers.trainer_pt_utils import find_batch_size, nested_concat, nested_numpify, IterableDatasetShard, nested_truncate
from transformers.trainer_utils import EvalPrediction, denumpify_detensorize
import torch
from torch.utils.data import DataLoader
import numpy as np
from typing import List, Optional
import json
logger = logging.get_logger(__name__)
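# DumpTokenLossTrainer extends the HuggingFace Trainer so that, when args.dump_eval_loss_to
# is set, the evaluation loop also writes one JSON line per sample containing the tokens
# and their per-token cross-entropy losses.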
class DumpTokenLossTrainer(Trainer):
def evaluation_loop(
self,
dataloader: DataLoader,
description: str,
prediction_loss_only: Optional[bool] = None,
ignore_keys: Optional[List[str]] = None,
metric_key_prefix: str = "eval",
) -> EvalLoopOutput:
"""
Prediction/evaluation loop, shared by `Trainer.evaluate()` and `Trainer.predict()`.
Works both with or without labels.
"""
args = self.args
prediction_loss_only = args.prediction_loss_only
# if eval is called w/o train init deepspeed here
if args.deepspeed and not self.deepspeed:
# XXX: eval doesn't have `resume_from_checkpoint` arg but we should be able to do eval
# from the checkpoint eventually
deepspeed_engine, _, _ = deepspeed_init(
self, num_training_steps=0, resume_from_checkpoint=None, inference=True
)
self.model = deepspeed_engine.module
self.model_wrapped = deepspeed_engine
self.deepspeed = deepspeed_engine
model = self._wrap_model(self.model, training=False, dataloader=dataloader)
# if full fp16 or bf16 eval is wanted and this ``evaluation`` or ``predict`` isn't called
# while ``train`` is running, cast it to the right dtype first and then put on device
if not self.is_in_train:
if args.fp16_full_eval:
model = model.to(dtype=torch.float16, device=args.device)
elif args.bf16_full_eval:
model = model.to(dtype=torch.bfloat16, device=args.device)
batch_size = self.args.eval_batch_size
logger.info(f"***** Running {description} *****")
if has_length(dataloader):
logger.info(f" Num examples = {self.num_examples(dataloader)}")
else:
logger.info(" Num examples: Unknown")
logger.info(f" Batch size = {batch_size}")
model.eval()
self.callback_handler.eval_dataloader = dataloader
# Do this before wrapping.
eval_dataset = getattr(dataloader, "dataset", None)
if args.past_index >= 0:
self._past = None
# Initialize containers
# losses/preds/labels on GPU/TPU (accumulated for eval_accumulation_steps)
losses_host = None
preds_host = None
labels_host = None
inputs_host = None
# losses/preds/labels on CPU (final containers)
all_losses = None
all_preds = None
all_labels = None
all_inputs = None
# Will be useful when we have an iterable dataset so don't know its length.
if args.dump_eval_loss_to:
writer = open(args.dump_eval_loss_to, "a", encoding='utf-8')
loss_fct = torch.nn.CrossEntropyLoss(reduction='none')
num_sample_to_write = len(eval_dataset)
observed_num_examples = 0
# Main evaluation loop
for step, inputs in enumerate(dataloader):
# Update the observed num examples
observed_batch_size = find_batch_size(inputs)
if observed_batch_size is not None:
observed_num_examples += observed_batch_size
# For batch samplers, batch_size is not known by the dataloader in advance.
if batch_size is None:
batch_size = observed_batch_size
# Prediction step
loss, logits, labels = self.prediction_step(model, inputs, prediction_loss_only, ignore_keys=ignore_keys)
inputs_decode = self._prepare_input(inputs["input_ids"]) if args.include_inputs_for_metrics else None
# Update containers on host
if loss is not None:
losses = self._nested_gather(loss.repeat(batch_size))
losses_host = losses if losses_host is None else torch.cat((losses_host, losses), dim=0)
if labels is not None:
labels = self._pad_across_processes(labels)
labels = self._nested_gather(labels)
# labels_host = labels if labels_host is None else nested_concat(labels_host, labels, padding_index=-100)
if inputs_decode is not None:
inputs_decode = self._pad_across_processes(inputs_decode)
inputs_decode = self._nested_gather(inputs_decode)
inputs_host = (
inputs_decode
if inputs_host is None
else nested_concat(inputs_host, inputs_decode, padding_index=-100)
)
if logits is not None:
logits = self._pad_across_processes(logits)
logits = self._nested_gather(logits)
if self.preprocess_logits_for_metrics is not None:
logits = self.preprocess_logits_for_metrics(logits, labels)
# preds_host = logits if preds_host is None else nested_concat(preds_host, logits, padding_index=-100)
if args.dump_eval_loss_to:
if self.is_world_process_zero() and num_sample_to_write > 0:
assert logits is not None and labels is not None, print('prediction_loss_only', prediction_loss_only)
shift_logits = logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
batch_token_loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
batch_token_loss = batch_token_loss.view(shift_labels.size()).tolist()
labels = labels.tolist()
for i in range(len(labels)):
if num_sample_to_write > 0:
num_sample_to_write -= 1
else:
break
token_ids = [x for x in labels[i] if x != -100]
tokens = self.tokenizer.convert_ids_to_tokens(token_ids)
token_losses = [0] + batch_token_loss[i][:len(token_ids)-1]
writer.write(json.dumps({"tokens": tokens, "losses": token_losses}, ensure_ascii=False)+'\n')
self.control = self.callback_handler.on_prediction_step(args, self.state, self.control)
# Gather all tensors and put them back on the CPU if we have done enough accumulation steps.
if args.eval_accumulation_steps is not None and (step + 1) % args.eval_accumulation_steps == 0:
if losses_host is not None:
losses = nested_numpify(losses_host)
all_losses = losses if all_losses is None else np.concatenate((all_losses, losses), axis=0)
if preds_host is not None:
logits = nested_numpify(preds_host)
all_preds = logits if all_preds is None else nested_concat(all_preds, logits, padding_index=-100)
if inputs_host is not None:
inputs_decode = nested_numpify(inputs_host)
all_inputs = (
inputs_decode
if all_inputs is None
else nested_concat(all_inputs, inputs_decode, padding_index=-100)
)
if labels_host is not None:
labels = nested_numpify(labels_host)
all_labels = (
labels if all_labels is None else nested_concat(all_labels, labels, padding_index=-100)
)
# Set back to None to begin a new accumulation
losses_host, preds_host, inputs_host, labels_host = None, None, None, None
if args.dump_eval_loss_to:
writer.close()
if args.past_index and hasattr(self, "_past"):
# Clean the state at the end of the evaluation loop
delattr(self, "_past")
# Gather all remaining tensors and put them back on the CPU
if losses_host is not None:
losses = nested_numpify(losses_host)
all_losses = losses if all_losses is None else np.concatenate((all_losses, losses), axis=0)
if preds_host is not None:
logits = nested_numpify(preds_host)
all_preds = logits if all_preds is None else nested_concat(all_preds, logits, padding_index=-100)
if inputs_host is not None:
inputs_decode = nested_numpify(inputs_host)
all_inputs = (
inputs_decode if all_inputs is None else nested_concat(all_inputs, inputs_decode, padding_index=-100)
)
if labels_host is not None:
labels = nested_numpify(labels_host)
all_labels = labels if all_labels is None else nested_concat(all_labels, labels, padding_index=-100)
# Number of samples
if has_length(eval_dataset):
num_samples = len(eval_dataset)
# The instance check is weird and does not actually check for the type, but whether the dataset has the right
# methods. Therefore we need to make sure it also has the attribute.
elif isinstance(eval_dataset, IterableDatasetShard) and hasattr(eval_dataset, "num_examples"):
num_samples = eval_dataset.num_examples
else:
if has_length(dataloader):
num_samples = self.num_examples(dataloader)
else: # both len(dataloader.dataset) and len(dataloader) fail
num_samples = observed_num_examples
# Number of losses has been rounded to a multiple of batch_size and in a distributed training, the number of
# samples has been rounded to a multiple of batch_size, so we truncate.
if all_losses is not None:
all_losses = all_losses[:num_samples]
if all_preds is not None:
all_preds = nested_truncate(all_preds, num_samples)
if all_labels is not None:
all_labels = nested_truncate(all_labels, num_samples)
if all_inputs is not None:
all_inputs = nested_truncate(all_inputs, num_samples)
# Metrics!
if self.compute_metrics is not None and all_preds is not None and all_labels is not None:
if args.include_inputs_for_metrics:
metrics = self.compute_metrics(
EvalPrediction(predictions=all_preds, label_ids=all_labels, inputs=all_inputs)
)
else:
metrics = self.compute_metrics(EvalPrediction(predictions=all_preds, label_ids=all_labels))
else:
metrics = {}
# To be JSON-serializable, we need to remove numpy types or zero-d tensors
metrics = denumpify_detensorize(metrics)
if all_losses is not None:
metrics[f"{metric_key_prefix}_loss"] = all_losses.mean().item()
# Prefix all keys with metric_key_prefix + '_'
for key in list(metrics.keys()):
if not key.startswith(f"{metric_key_prefix}_"):
metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key)
return EvalLoopOutput(predictions=all_preds, label_ids=all_labels, metrics=metrics, num_samples=num_samples)
# T5 models
By converting NLP tasks into a text-to-text format, we can use a single model to solve various tasks. Here we use T5 as the backbone model and provide a unified training script, `run_seq2seq.py`, for many tasks. **See `*.sh` under each task directory for usage.**
## Create Data
Currently we support natural language understanding (**NLU**), dialog state tracking (**DST**), natural language generation (**NLG**), response generation (**RG**), and generating a dialog from a user goal (**Goal2Dialogue**). We provide serialization and deserialization methods for dialog acts and state in the unified data format (user goals are already natural language instructions). An example of serialized dialog acts and state:
```
User: I am looking for a cheap restaurant.
System: Is there a particular area of town you prefer?
User: In the centre of town.
User dialog acts: [inform][restaurant]([area][centre])
State: [restaurant]([area][centre],[price range][cheap])
System dialog acts: [recommend][restaurant]([name][Zizzi Cambridge])
System: I would recommend Zizzi Cambridge.
```
Dialogue acts take the form `[intent][domain]([slot][value],...);...` and the state takes the form `[domain]([slot][value],...);...`. Multiple items are separated by a semicolon `;`.
To create data for a specific task, run `create_data.py` with corresponding arguments. For example, create data for single turn NLU on MultiWOZ 2.1:
```bash
python create_data.py --tasks nlu --datasets multiwoz21 --speaker user
```
Note that the script only supports **datasets in the unified format**.
## Training
To train the model, specify arguments such as the data path, learning rate, and number of epochs, and then run `run_seq2seq.py`. See `nlu/run_nlu.sh` for an example.
## Evaluation
The standard evaluation scripts for the NLU, DST, and NLG tasks are located at `../../$task/evaluate_unified_datasets.py`. See `nlu/run_nlu.sh` for an example.
## Trained Models
Trained models and their performance are available on the [Hugging Face Hub](https://huggingface.co/ConvLab). You can try some examples with the hosted inference API.
| Name | Task | Training Dataset |
| ------------------------------------------------------------ | ------------- | ---------------------------- |
| [t5-small-goal2dialogue-multiwoz21](https://huggingface.co/ConvLab/t5-small-goal2dialogue-multiwoz21) | Goal2Dialogue | MultiWOZ 2.1 |
| [t5-small-nlu-multiwoz21](https://huggingface.co/ConvLab/t5-small-nlu-multiwoz21) | NLU | MultiWOZ 2.1 |
| [t5-small-nlu-sgd](https://huggingface.co/ConvLab/t5-small-nlu-sgd) | NLU | SGD |
| [t5-small-nlu-tm1_tm2_tm3](https://huggingface.co/ConvLab/t5-small-nlu-tm1_tm2_tm3) | NLU | TM1+TM2+TM3 |
| [t5-small-nlu-multiwoz21_sgd_tm1_tm2_tm3](https://huggingface.co/ConvLab/t5-small-nlu-multiwoz21_sgd_tm1_tm2_tm3) | NLU | MultiWOZ 2.1+SGD+TM1+TM2+TM3 |
| [t5-small-dst-multiwoz21](https://huggingface.co/ConvLab/t5-small-dst-multiwoz21) | DST | MultiWOZ 2.1 |
| [t5-small-dst-sgd](https://huggingface.co/ConvLab/t5-small-dst-sgd) | DST | SGD |
| [t5-small-dst-tm1_tm2_tm3](https://huggingface.co/ConvLab/t5-small-dst-tm1_tm2_tm3) | DST | TM1+TM2+TM3 |
| [t5-small-dst-multiwoz21_sgd_tm1_tm2_tm3](https://huggingface.co/ConvLab/t5-small-dst-multiwoz21_sgd_tm1_tm2_tm3) | DST | MultiWOZ 2.1+SGD+TM1+TM2+TM3 |
| [t5-small-nlg-multiwoz21](https://huggingface.co/ConvLab/t5-small-nlg-multiwoz21) | NLG | MultiWOZ 2.1 |
| [t5-small-nlg-sgd](https://huggingface.co/ConvLab/t5-small-nlg-sgd) | NLG | SGD |
| [t5-small-nlg-tm1_tm2_tm3](https://huggingface.co/ConvLab/t5-small-nlg-tm1_tm2_tm3) | NLG | TM1+TM2+TM3 |
| [t5-small-nlg-multiwoz21_sgd_tm1_tm2_tm3](https://huggingface.co/ConvLab/t5-small-nlg-multiwoz21_sgd_tm1_tm2_tm3) | NLG | MultiWOZ 2.1+SGD+TM1+TM2+TM3 |
## Interface
To use trained models in a dialog system, import them through:
```python
from convlab.base_models.t5.nlu import T5NLU
from convlab.base_models.t5.dst import T5DST
from convlab.base_models.t5.nlg import T5NLG
# example instantiation
# model_name_or_path can be a model on the Hugging Face Hub or a local path
nlu = T5NLU(speaker='user', context_window_size=0, model_name_or_path='ConvLab/t5-small-nlu-multiwoz21')
```
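A minimal usage sketch (the utterance is illustrative and assumes the standard ConvLab NLU `predict` interface):
```python
# hypothetical example utterance; returns the dialogue acts predicted by the model
dialog_acts = nlu.predict("I am looking for a cheap restaurant in the centre of town.")
print(dialog_acts)
```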
See `nlu/nlu.py`, `dst/dst.py`, `nlg/nlg.py` for example usage.
## Support a New Task
To support a new task, first serialize the model input and output as in `create_data.py`, then train the model with `run_seq2seq.py`. Finally, write an evaluation script for the task or pass `metric_name_or_path` for an existing metric to `run_seq2seq.py`.
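As a rough sketch of the serialization step (the path and strings are placeholders; the `source`/`target` field names match the `source_column`/`target_column` arguments used by the training scripts in this directory):
```python
import json
import os

# hypothetical new task: write one JSON object per line with the text-to-text
# input ("source") and output ("target") consumed by run_seq2seq.py
samples = [{"source": "serialized model input", "target": "desired model output"}]
os.makedirs("data/my_task", exist_ok=True)
with open("data/my_task/train.json", "w", encoding="utf-8") as f:
    for sample in samples:
        f.write(json.dumps(sample, ensure_ascii=False) + "\n")
```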
## Author
Qi Zhu (zhuq96 at gmail dot com)
from convlab.base_models.t5.dst.dst import T5DST