diff --git a/MANIFEST.in b/MANIFEST.in
index b4a333984ad8092f3519ef587aa5ee4c163b10a6..167edb98a7a4ccd01d53c094c16690e794841e91 100755
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,3 +1,4 @@
+include LICENSE.txt
 include README.md
 prune convlab2/*/__pycache__
diff --git a/convlab2/nlg/evaluate_unified_datasets.py b/convlab2/nlg/evaluate_unified_datasets.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb36a82e8bffb45be2a34b5adfc33249719083ed
--- /dev/null
+++ b/convlab2/nlg/evaluate_unified_datasets.py
@@ -0,0 +1,102 @@
+import sys
+from nltk.translate.bleu_score import corpus_bleu
+from nltk.tokenize import word_tokenize
+sys.path.append('../..')
+import json
+from pprint import pprint
+from evaluate_util import GentScorer
+from convlab2.util.unified_datasets_util import load_ontology
+import numpy as np
+
+logger = None
+
+class Logging:
+    def __init__(self, path):
+        file = open(path, 'w+')
+        file.write('')
+        file.close()
+        self.path = path
+
+    def log(self, sent):
+        with open(self.path, 'a') as f:
+            f.write(sent)
+            f.write('\n')
+
+def evaluate(predict_result, ontology):
+    predict_result = json.load(open(predict_result))
+    metrics = {}
+
+    # BLEU Score
+    evaluator = GentScorer()
+    references = []
+    candidates = []
+    for i in range(len(predict_result)):
+        references.append([word_tokenize(predict_result[i]['utterance'])])
+        candidates.append(word_tokenize(predict_result[i]['prediction']))
+    metrics['bleu'] = corpus_bleu(references, candidates)
+
+    # ERROR Rate
+    ## get all values in ontology
+    val2ds_dict = {}
+    for domain_name in ontology['domains']:
+        domain = ontology['domains'][domain_name]
+        for slot_name in domain['slots']:
+            slot = domain['slots'][slot_name]
+            if 'possible_values' not in slot:
+                continue
+            possible_vals = slot['possible_values']
+            if len(possible_vals) > 0:
+                for val in possible_vals:
+                    val2ds_dict[val] = f'{domain_name}-{slot_name}'
+    score_list = []
+    for item in predict_result:
+        da = item['dialogue_acts']
+        utterance = item['prediction']
+        missing_count = 0
+        redundant_count = 0
+        all_count = 0
+        all_values = set()
+        ## missing values
+        for key in da:
+            slot_value = da[key]
+            for triple in slot_value:
+                if 'value' in triple:
+                    value = triple['value']
+                    all_values.add(value)
+                    if value.strip().lower() not in utterance.lower():
+                        missing_count += 1
+                        # logger.log(f"missing: {triple['slot']}-{triple['value']} | {item['prediction']} | {item['utterance']}")
+                    all_count += 1
+        if all_count == 0:
+            continue
+        ## redundant values
+        for val in val2ds_dict:
+            if f' {val.strip().lower()} ' in f' {utterance.strip().lower()} ' and val.strip().lower() not in all_values:
+                wlist = val2ds_dict[val].split('-')
+                domain, slot = wlist[0], wlist[1]
+                if f' {slot.strip().lower()}' in f' {utterance.strip().lower()} ':
+                    redundant_count += 1
+                    # logger.log(f"redundant: {val}/{val2ds_dict[val]} | {item['prediction']} | {item['utterance']}")
+        item_score = float(missing_count + redundant_count) / all_count
+        # logger.log(f"redundant: {redundant_count} | missing_count: {missing_count} | all_count: {all_count}")
+        score_list.append(item_score)
+    metrics['err'] = np.mean(score_list)
+
+    return metrics
+
+
+if __name__ == '__main__':
+    from argparse import ArgumentParser
+
+    parser = ArgumentParser(description="calculate NLG metrics for unified datasets")
+    parser.add_argument('--predict_result', '-p', type=str, required=True,
+                        help='path to the prediction file in the unified data format')
+    parser.add_argument('--dataset_name', type=str, required=True,
+                        help='the name of the dataset to be evaluated')
+    args = parser.parse_args()
+    print(args)
+    ontology = load_ontology(args.dataset_name)
+    # logger = Logging('./evaluate_unified_datasets.log')
+    metrics = evaluate(args.predict_result, ontology)
+    pprint(metrics)
diff --git a/convlab2/nlg/evaluate_util.py b/convlab2/nlg/evaluate_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..f7435b680a1ed6ea3920706e59b4f4a168d41fc7
--- /dev/null
+++ b/convlab2/nlg/evaluate_util.py
@@ -0,0 +1,260 @@
+# Part of the evaluation script is adopted from https://github.com/pengbaolin/SC-GPT.
+import os
+import json
+import sys
+import math
+import operator
+import nltk
+from collections import Counter
+from nltk.util import ngrams
+
+file = open
+class ERRScorer():
+
+    ## Scorer for calculating the slot errors
+    ## it scores utterances one by one
+    ## using two levels of matching
+    ## 1. exact match for categorical values
+    ## 2. multiple keyword matching for binary values
+    ## 3. cannot deal with don't care and none values
+    def __init__(self, detectfile):
+
+        self.detectPairs = []
+        fin = file(detectfile)
+        self.detectPairs = json.load(fin)
+        fin.close()
+
+    def countSlots(self, dataset, reader):
+        count = 0
+        for t in dataset:
+            feat = reader.formatter.format(t[0])[0]
+            c = count
+            for s, v in feat:
+                # skip type token
+                if s == 'type':
+                    continue
+                if v == '_' or v == 'yes' or v == 'none' or v == 'no':
+                    count += 1
+        return count
+
+    def score(self, a, feat, gen):
+        # import pdb
+        # pdb.set_trace()
+        # total slots
+        slot_count = 0
+        # exact match for categorical slots
+        caty_slot_error = 0
+        # for each slot - token pair in the detect pair dict
+        for s, tok in self.detectPairs['general'].items():
+            # token compare to
+            comparetos = ['sv.' + s + '._1', 'sv.' + s + '._2', 'sv.' + s + '._3']
+            # count feature count in da feature
+            fcnt = 0
+            for f in feat:
+                for compareto in comparetos:
+                    if compareto == f: fcnt += 1
+            # count generated semantic tokens
+            gcnt = gen.split().count(tok)
+            # count the slot difference
+            # if fcnt != gcnt:
+            #     caty_slot_error += 1.0
+            caty_slot_error += abs(fcnt - gcnt)
+            # accumulate slot count
+            slot_count += fcnt
+
+        # keyword match for binary slots, only an approximation
+        bnay_slot_error = 0
+        # for each binary slot
+        for s, toks in self.detectPairs['binary'].items():
+            # tokens compare to
+            comparetos = ['sv.' + s + '.yes', 'sv.' + s + '.no',
+                          'sv.' + s + '.dontcare', 'sv.' + s + '.none']
+            # count feature occurrence in da
+            fcnt = 0
+            for f in feat:
+                for compareto in comparetos:
+                    if compareto == f: fcnt += 1
+            # count generated semantic tokens
+            gcnt = sum([gen.split().count(tok) for tok in toks])
+            # count the slot difference
+            bnay_slot_error += abs(fcnt - gcnt)
+            # accumulate slot count
+            slot_count += fcnt
+        # total slot error
+        total_slot_error = caty_slot_error + bnay_slot_error
+        # when ?select/suggest act, only consider categorical errors
+        if a == [4] or a == [14]:
+            # return slot_count, caty_slot_error, caty_slot_error
+            return 0.0, 0.0, 0.0
+        else:
+            return slot_count, total_slot_error, caty_slot_error
+
+
+class BLEUScorer(object):
+    ## BLEU score calculator via GentScorer interface
+    ## it calculates the BLEU-4 by taking the entire corpus in
+    ## Calculate based on multiple candidates against multiple references
+    def __init__(self):
+        pass
+
+    def score(self, parallel_corpus):
+        # ref_ = []
+        # hyp_ = []
+        # for hyps, refs in parallel_corpus:
+        #     ref_.append(refs)
+        #     hyp_.append(hyps[0])
+        # return nltk.translate.bleu_score.corpus_bleu(ref_, hyp_)
+
+        # containers and parameters
+        r, c = 0, 0
+        count = [0, 0, 0, 0]
+        clip_count = [0, 0, 0, 0]
+        weights = [0.25, 0.25, 0.25, 0.25]
+
+        # accumulate ngram statistics
+        for hyps, refs in parallel_corpus:
+            # BLEUscore = nltk.translate.bleu_score.sentence_bleu(refs, hyps[0])
+            # print(hyps, refs, BLEUscore)
+            hyps = [hyp.lower().split() for hyp in hyps]
+            refs = [ref.lower().split() for ref in refs]
+            # compute ngram counts by matching each hypothesis
+            for hyp in hyps:
+                # for each ngram
+                for i in range(4):
+                    # accumulate hyp ngram counts
+                    hypcnts = Counter(ngrams(hyp, i + 1))
+                    cnt = sum(hypcnts.values())
+                    count[i] += cnt
+
+                    # compute clipped counts
+                    max_counts = {}
+                    # compare to each reference
+                    for ref in refs:
+                        # get reference ngrams
+                        refcnts = Counter(ngrams(ref, i + 1))
+                        # for each ngram
+                        for ng in hypcnts:
+                            # clipped counts
+                            max_counts[ng] = max(max_counts.get(ng, 0), refcnts[ng])
+                    # compute clipped counts by clipping the hyp count if necessary
+                    clipcnt = dict((ng, min(count, max_counts[ng])) \
+                                   for ng, count in hypcnts.items())
+                    clip_count[i] += sum(clipcnt.values())
+
+                # accumulate r & c, find best match among all references
+                bestmatch = [1000, 1000]
+                for ref in refs:
+                    if bestmatch[0] == 0: break
+                    # length difference
+                    diff = abs(len(ref) - len(hyp))
+                    # if the current diff is less than the stored one, keep it
+                    if diff < bestmatch[0]:
+                        bestmatch[0] = diff
+                        bestmatch[1] = len(ref)
+                # extract the best length match in references
+                r += bestmatch[1]
+                c += len(hyp)
+
+        # computing bleu score
+        # for numerical stability
+        p0 = 1e-7
+        # brevity penalty
+        bp = 1 if c > r else math.exp(1 - float(r) / float(c))
+        # modified prec.
+        p_ns = [float(clip_count[i]) / float(count[i] + p0) + p0 \
+                for i in range(4)]
+        # weighted prec.
+        s = math.fsum(w * math.log(p_n) \
+                      for w, p_n in zip(weights, p_ns) if p_n)
+        # final bleu score
+        bleu = bp * math.exp(s)
+        return bleu
+
+    def sentence_bleu_4(self, parallel_corpus):
+        # input: single sentence, multiple references
+        count = [0, 0, 0, 0]
+        clip_count = [0, 0, 0, 0]
+        weights = [0.25, 0.25, 0.25, 0.25]
+        r = 0
+        c = 0
+
+        # accumulate ngram statistics
+        for hyps, refs in parallel_corpus:
+            hyps = [hyp.split() for hyp in hyps]
+            refs = [ref.split() for ref in refs]
+            # compute ngram counts by matching each hypothesis
+            for hyp in hyps:
+                # for each ngram
+                for i in range(4):
+                    # accumulate hyp ngram counts
+                    hypcnts = Counter(ngrams(hyp, i + 1))
+                    cnt = sum(hypcnts.values())
+                    count[i] += cnt
+
+                    # compute clipped counts
+                    max_counts = {}
+                    # compare to each reference
+                    for ref in refs:
+                        # get reference ngrams
+                        refcnts = Counter(ngrams(ref, i + 1))
+                        # for each ngram
+                        for ng in hypcnts:
+                            # clipped counts
+                            max_counts[ng] = max(max_counts.get(ng, 0), refcnts[ng])
+                    # compute clipped counts by clipping the hyp count if necessary
+                    clipcnt = dict((ng, min(count, max_counts[ng])) \
+                                   for ng, count in hypcnts.items())
+                    clip_count[i] += sum(clipcnt.values())
+
+                # accumulate r & c, find best match among all references
+                bestmatch = [1000, 1000]
+                for ref in refs:
+                    if bestmatch[0] == 0: break
+                    # length difference
+                    diff = abs(len(ref) - len(hyp))
+                    # if the current diff is less than the stored one, keep it
+                    if diff < bestmatch[0]:
+                        bestmatch[0] = diff
+                        bestmatch[1] = len(ref)
+                # extract the best length match in references
+                r += bestmatch[1]
+                c += len(hyp)
+
+        # for numerical stability
+        p0 = 1e-7
+        # modified brevity penalty
+        bp = math.exp(-abs(1.0 - float(r) / float(c + p0)))
+        # smoothed version of modified prec.
+        p_ns = [0, 0, 0, 0]
+        for i in range(4):
+            if i < 2:  # original version n-gram counts
+                p_ns[i] = float(clip_count[i]) / float(count[i] + p0) + p0
+            else:  # smoothed version of ngram counts
+                smooth_term = 5 * p_ns[i - 1] * p_ns[i - 1] / p_ns[i - 2]
+                p_ns[i] = float(clip_count[i] + smooth_term) / float(count[i] + 5) + p0
+        # weighted prec.
+        s = math.fsum(w * math.log(p_n) for w, p_n in zip(weights, p_ns) if p_n)
+        # final sentence bleu score
+        bleu_hyp = bp * math.exp(s)
+        return bleu_hyp
+
+
+class GentScorer(object):
+    ## main Scorer interface for all scorers
+    ## it can do
+    ## 1. Compute bleu score
+    ## 2. Compute slot error rate
+    ## 3. Detailed illustration of how different splits
+    ##    of data affect performance
+    def __init__(self):
+        self.bleuscorer = BLEUScorer()
+
+    def scoreERR(self, parallel_pairs):
+        """input: [[dialogue_act, utterance], [dialogue_act, utterance], ...]"""
+
+    def scoreBLEU(self, parallel_corpus):
+        return self.bleuscorer.score(parallel_corpus)
+
+    def scoreSBLEU(self, parallel_corpus):
+        return self.bleuscorer.sentence_bleu_4(parallel_corpus)
\ No newline at end of file
diff --git a/convlab2/nlg/scgpt/README.md b/convlab2/nlg/scgpt/README.md
index 7d32ac05e05d008462439486b37c6d4139e18cc4..a925f2fe0d1253ca45667d4310669fa375641cbb 100644
--- a/convlab2/nlg/scgpt/README.md
+++ b/convlab2/nlg/scgpt/README.md
@@ -2,6 +2,12 @@
 This is the implemention of [SC-GPT](https://aclanthology.org/2020.findings-emnlp.17) which is proposed by Peng et al., 2020.
 
+You should first download and unzip the SC-GPT checkpoint:
+```bash
+wget https://bapengstorage.blob.core.windows.net/fileshare/scgpt.tar.gz
+tar -xvf scgpt.tar.gz
+```
+If you want to use this checkpoint, you have to specify its path through the ``--scgpt_model_ckpt_path`` parameter in ``train.sh`` and ``evaluate.sh``.
 
 ## Train
 
@@ -12,9 +18,9 @@
 When using the training code, you may have to adjust the parameters according to your machine configuration. Note that the training code only supports GPU training.
 
-## Test
+## Evaluation
 ```python
-./test.sh
+./evaluate.sh
 ```
 The test code also only supports GPU mode. We will report the BLEU score and ERR score according to the original SC-GPT paper(Peng et al., 2020).
diff --git a/convlab2/nlg/scgpt/evaluate.sh b/convlab2/nlg/scgpt/evaluate.sh
new file mode 100644
index 0000000000000000000000000000000000000000..dfe4435da6a13938f332c3bff5dcbaa1f705ed1d
--- /dev/null
+++ b/convlab2/nlg/scgpt/evaluate.sh
@@ -0,0 +1,4 @@
+CUDA_VISIBLE_DEVICES="5" python -m torch.distributed.launch --nproc_per_node 1 --master_port 3046 main.py \
+--dataset multiwoz21 \
+--scgpt_model_ckpt_path /data/zhangzheng/scgpt \
+--model_path /data/zhangzheng/ConvLab-3/convlab2/nlg/scgpt/saved_model/epoch_4/epoch_4_step8875.pt
\ No newline at end of file
diff --git a/convlab2/nlg/scgpt/main.py b/convlab2/nlg/scgpt/main.py
index 8f45918258e50f29dbbe1e39885b1f614e08cad2..82707914ba1f0e2e439b090aad9d73aa1c4f7860 100644
--- a/convlab2/nlg/scgpt/main.py
+++ b/convlab2/nlg/scgpt/main.py
@@ -2,6 +2,7 @@
 import sys
 sys.path.append('../../..')
 import argparse
+import json
 from tqdm import tqdm
 import torch
 import numpy as np
@@ -31,7 +32,8 @@
 parser.add_argument("--local_rank", default=-1, type=int)
 parser.add_argument('--do_train', action="store_true", help="Whether to run training.")
 parser.add_argument('--dataset', default="multiwoz21", type=str, help="The name of the dataset to be used.")
 parser.add_argument('--model_path', default="", type=str, help="The path of model for testing.")
-parser.add_argument("--max_seq_len", default=256, type=int)
+parser.add_argument('--scgpt_model_ckpt_path', default="", type=str, help="The path of the pretrained SC-GPT checkpoint.")
+parser.add_argument("--max_seq_len", default=128, type=int)
 FLAGS = parser.parse_args()
 local_rank = FLAGS.local_rank
@@ -39,14 +41,25 @@
 torch.cuda.set_device(local_rank)
 dist.init_process_group(backend='nccl')
 
 # TensorBoard
-tb_writer = SummaryWriter('')
+tb_dir = './runs'
+if not os.path.exists(tb_dir):
+    os.mkdir(tb_dir)
+tb_writer = SummaryWriter(tb_dir)
 special_tokens = [START_OF_PRED, END_OF_PRED, SYS_SPEAK, USR_SPEAK]
 ## load model
-tokenizer = GPT2Tokenizer.from_pretrained('./gpt2')
-tokenizer.add_special_tokens({'pad_token': PAD_TOKEN, 'eos_token': END_OF_PRED, 'additional_special_tokens': special_tokens})
-model = GPT2LMHeadModel.from_pretrained('./gpt2').to(local_rank)
-model.resize_token_embeddings(len(tokenizer))
+if FLAGS.scgpt_model_ckpt_path == '':
+    tokenizer = GPT2Tokenizer.from_pretrained('./gpt2')
+    tokenizer.add_special_tokens({'pad_token': PAD_TOKEN, 'eos_token': END_OF_PRED, 'additional_special_tokens': special_tokens})
+    model = GPT2LMHeadModel.from_pretrained('./gpt2').to(local_rank)
+    model.resize_token_embeddings(len(tokenizer))
+else:
+    tokenizer = GPT2Tokenizer.from_pretrained(FLAGS.scgpt_model_ckpt_path)
+    tokenizer.add_special_tokens(
+        {'pad_token': PAD_TOKEN, 'eos_token': END_OF_PRED, 'additional_special_tokens': special_tokens})
+    model = GPT2LMHeadModel.from_pretrained(FLAGS.scgpt_model_ckpt_path).to(local_rank)
+    print('model loaded from ' + FLAGS.scgpt_model_ckpt_path)
+    model.resize_token_embeddings(len(tokenizer))
 
 nll_loss = nn.NLLLoss(reduce=False).to(local_rank)
 ce_loss = nn.CrossEntropyLoss(reduce=False).to(local_rank)
@@ -79,6 +92,7 @@ def pad_collate(batch):
     batch = [item[0] + [START_OF_PRED_ID] + item[1] for item in batch]
     batch = [item[-FLAGS.max_seq_len:] for item in batch]
     max_len = max([len(item) for item in batch])
+    # print('max_len', max_len)
     seq_lens = [len(item) for item in batch]
     split_id = tokenizer._convert_token_to_id_with_added_voc(START_OF_PRED)
     def get_x_len(tokens):
@@ -91,12 +105,15 @@
         return split_idx
     seq_lens_input = [get_x_len(item) for item in batch]
     batch = [item + [pad_token_id]*(max_len-len(item)) for item in batch]
+    # print(batch)
+    # print(seq_lens)
+    # print(seq_lens_input)
     return torch.LongTensor(batch), torch.LongTensor(seq_lens), torch.LongTensor(seq_lens_input)
 
 ## Training Hyper-params
 EPOCH_NUM = 20
-BATCH_SIZE = 20 # real_batch_size = BATCH_SIZE * num_gpu
-VAL_STEP = 300
+BATCH_SIZE = 32 # real_batch_size = BATCH_SIZE * num_gpu
+VAL_STEP = 500
 WARM_STEPS = 250
 if code_test:
     EPOCH_NUM = 2
@@ -138,16 +155,19 @@ def train(model, nlg_data, global_step=0):
             tb_writer.add_scalar(f'Train/PPL', torch.exp(loss).item(), global_step)
             tb_writer.add_scalar(f'Train/Learning Rate', scheduler.get_last_lr()[0], global_step)
 
-            if batch_id % VAL_STEP == 0:
-                model.eval()
-                val_loss = eval(model, val_dataloader)
-                ppl = np.exp(val_loss)
-                tb_writer.add_scalar(f'Val/Loss', val_loss, global_step)
-                tb_writer.add_scalar(f'Val/PPL', ppl, global_step)
-                model.train()
             global_step += 1
         # save the model when each epoch ends
         if dist.get_rank() == 0:
+
+            # validation
+            model.eval()
+            val_loss = eval(model, val_dataloader)
+            ppl = np.exp(val_loss)
+            tb_writer.add_scalar(f'Val/Loss', val_loss, global_step)
+            tb_writer.add_scalar(f'Val/PPL', ppl, global_step)
+            model.train()
+
+            # save model
             save_dir = os.path.join(SAVE_PATH, f'epoch_{epoch}')
             os.makedirs(save_dir, exist_ok=True)
             torch.save(model.module.state_dict(), os.path.join(save_dir, f'epoch_{epoch}_step{global_step}.pt'))
@@ -155,6 +175,7 @@
             torch.save(optimizer.state_dict(), os.path.join(save_dir, 'optimizer.pt'))
             torch.save(scheduler.state_dict(), os.path.join(save_dir, 'scheduler.pt'))
             print(f'Save model checkpoint to [{save_dir}]')
+    tb_writer.flush()
 
 
@@ -212,17 +233,29 @@ def test(model, nlg_data, ontology, model_path):
     print(f'model loaded from [{model_path}]')
     # Load test nlg data
     test_data = nlg_data['test']
-    dialog_acts = [act2str(item['dialogue_acts']) for item in test_data]
-    golden_responses = [item['utterance'] for item in test_data]
+    dialog_acts = [act2str(item['dialogue_acts']).strip() for item in test_data]
+    golden_responses = [item['utterance'].strip() for item in test_data]
     # dialog_acts = dialog_acts[:10]
     # golden_responses = golden_responses[:10]
     outputs = inference_sents(model, dialog_acts)
+    def get_real_output(ipt):
+        if '[start_of_pred]' in ipt:
+            ipt = ipt[ipt.index('[start_of_pred]')+15:].strip()
+        if '[_pad_token_]' in ipt:
+            ipt = ipt[:ipt.index('[_pad_token_]')].strip()
+        return ipt
+    outputs = [get_real_output(item) for item in outputs]
+    output_file = './test_output.json'
     if dist.get_rank() == 0:
-        output_file = './test_output.txt'
         with open(output_file, 'w+') as f:
+            result = []
             for i in range(len(dialog_acts)):
-                f.write(f'{dialog_acts[i]}\n{golden_responses[i]}\n{outputs[i]}\n\n')
-            f.close()
+                result.append({
+                    'dialogue_acts': test_data[i]['dialogue_acts'],
+                    'utterance': test_data[i]['utterance'],
+                    'prediction': outputs[i]
+                })
+            json.dump(result, f, indent=2, ensure_ascii=False)
     evaluator = GentScorer()
     parallel_corpus = []
     # BLEU
@@ -270,6 +303,9 @@ def test(model, nlg_data, ontology, model_path):
         score_list.append(item_score)
     ERR_Score = np.mean(score_list)
     print(f'BLEU: {BLEU_Score}\nERR_Score: {ERR_Score}')
+    # with open(output_file, 'a') as f:
+    #     f.write(f'BLEU: {BLEU_Score}\nERR_Score: {ERR_Score}')
+    #     f.close()
 
 
 if __name__ == '__main__':
diff --git a/convlab2/nlg/scgpt/test.sh b/convlab2/nlg/scgpt/test.sh
deleted file mode 100644
index 96d51bc6429e5d5248be24ab1c3331b0f195349c..0000000000000000000000000000000000000000
--- a/convlab2/nlg/scgpt/test.sh
+++ /dev/null
@@ -1 +0,0 @@
-CUDA_VISIBLE_DEVICES="6" python -m torch.distributed.launch --nproc_per_node 1 --master_port 3046 main.py --dataset multiwoz21
\ No newline at end of file
diff --git a/convlab2/nlg/scgpt/train.sh b/convlab2/nlg/scgpt/train.sh
index 12f190670e835c0d1f8f5df198dade11ac59ce77..d36d1066abec984ca89c203435a5cf7111209c98 100644
--- a/convlab2/nlg/scgpt/train.sh
+++ b/convlab2/nlg/scgpt/train.sh
@@ -1 +1 @@
-CUDA_VISIBLE_DEVICES="1" python -m torch.distributed.launch --nproc_per_node 1 main.py --do_train --dataset multiwoz21
\ No newline at end of file
+CUDA_VISIBLE_DEVICES="5" python -m torch.distributed.launch --nproc_per_node 1 main.py --do_train --dataset multiwoz21 --scgpt_model_ckpt_path /data/zhangzheng/scgpt
\ No newline at end of file
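
Usage sketch (not part of the patch, and the working directory is an assumption): after `main.py` has written `test_output.json` into `convlab2/nlg/scgpt/`, the new unified evaluation script can be run from `convlab2/nlg/`, which matters because it imports `evaluate_util` and appends `../..` to `sys.path`.

```bash
# assumes the ConvLab repository root as the starting directory
cd convlab2/nlg
python evaluate_unified_datasets.py \
    --predict_result scgpt/test_output.json \
    --dataset_name multiwoz21
# prints a dict with the corpus BLEU ('bleu') and slot error rate ('err')
```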