diff --git a/convlab2/nlg/scgpt/README.md b/convlab2/nlg/scgpt/README.md index 5eed2c0fc167cd9ee79d66e3252be25060bb294d..7d32ac05e05d008462439486b37c6d4139e18cc4 100644 --- a/convlab2/nlg/scgpt/README.md +++ b/convlab2/nlg/scgpt/README.md @@ -1,51 +1,42 @@ -# GPT +# SC-GPT -The code derives from [HuggingFace/Transformers](https://github.com/huggingface/transformers). - -## Preprocess - -```python -cd $dataset$ -python preprocess.py -``` +This is the implementation of [SC-GPT](https://aclanthology.org/2020.findings-emnlp.17), proposed by +Peng et al. (2020). ## Train -Fetch and unzip the checkpoint - -``` -wget https://bapengstorage.blob.core.windows.net/fileshare/scgpt.tar.gz -tar -xvf scgpt.tar.gz +```bash +./train.sh ``` +When using the training code, you may have to adjust the parameters +according to your machine configuration. Note that the training code +only supports GPU training. -Then - -``` python -python train.py --output_dir=trained_output --model_type=gpt2 --model_name_or_path=scgpt --do_train --do_eval --eval_data_file=multiwoz/data/test_sys.txt --use_tokenize --train_data_file=multiwoz/data/train_sys.txt --overwrite_output_dir +## Test +```bash +./test.sh ``` +The test code also supports GPU mode only. We report the BLEU score +and the ERR score as defined in the original SC-GPT paper (Peng et al., 2020). -some tricks (optional training argument): -* `--gradient_accumulation_steps xxx` -* `--fp16`, if it's set, you'd better set `--per_gpu_train_batch_size` to be multiple of 8 -* `--max_seq xxx`, it should be larger than the length of the longest sequence. You can set `--max_seq 1024`. The script uses a dynamic sequence length at each training step. -* `--gradient_checkpointing`, it allows larger `per_gpu_train_batch_size` -* `--use_multi_tensor_adamw`, someone says it's a faster optimizer - -distributed data parallel: - - If multiple GPUs are available, you can run `python -m torch.distributed.launch --nproc_per_node CUDA_COUNT train.py ......` - - `CUDA_COUNT` is the number of GPUs. `.....` are arguments of `train.py`. - -## Use - +## NLG Interface +The NLG interface of SC-GPT is implemented in `./scgpt.py`. ```python -python run.py --model_type=gpt2 --model_name_or_path=$save_dir$ --num_samples 5 --input_file=$test_file$ --output_file=$output_file$ --length 100 --stop_token '<|endoftext|>' --batch_size 16 +def generate(self, action) ``` +This class supports both CPU and GPU mode via the +`device` parameter of its constructor.
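+## Example
+The snippet below is a minimal usage sketch of the interface above. The checkpoint path and the
+dialogue act are placeholders: the action must be a dialogue act dict in the unified dataset
+format (the same structure that `util.act2str` consumes).
+```python
+from convlab2.nlg.scgpt.scgpt import SCGPT
+
+# hypothetical checkpoint path produced by ./train.sh
+nlg = SCGPT('multiwoz21', model_path='saved_model/epoch_19/epoch_19_stepXXXX.pt', device='cuda')
+
+# illustrative dialogue act in the unified-dataset format
+action = {
+    'categorical': [],
+    'non-categorical': [{'intent': 'inform', 'domain': 'restaurant', 'slot': 'food', 'value': 'italian'}],
+    'binary': []
+}
+print(nlg.generate(action))
+```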
-## Data Format +## Reference ``` -dialog act seq & user utterance -``` - +@inproceedings{peng-etal-2020-shot, + title = "Few-shot Natural Language Generation for Task-Oriented Dialog", + author = "Peng, Baolin and Zhu, Chenguang and Li, Chunyuan and Li, Xiujun and Li, Jinchao and Zeng, Michael and Gao, Jianfeng", + booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2020", + month = nov, + year = "2020", + publisher = "Association for Computational Linguistics", + pages = "172--182", +} +``` \ No newline at end of file diff --git a/convlab2/nlg/scgpt/decode.py b/convlab2/nlg/scgpt/decode.py deleted file mode 100644 index e95025afdde60d8beb41bac5d2fb038e39357f3d..0000000000000000000000000000000000000000 --- a/convlab2/nlg/scgpt/decode.py +++ /dev/null @@ -1,90 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Created on Sat Apr 4 21:34:38 2020 - -@author: truthless -""" -import numpy as np -import torch - -def set_seed(seed, n_gpu): - np.random.seed(seed) - torch.manual_seed(seed) - if n_gpu > 0: - torch.cuda.manual_seed_all(seed) - - -def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')): - """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering - Args: - logits: logits distribution shape (batch size x vocabulary size) - top_k > 0: keep only top k tokens with highest probability (top-k filtering). - top_p > 0.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering). - Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751) - From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317 - """ - top_k = min(top_k, logits.size(-1)) # Safety check - if top_k > 0: - # Remove all tokens with a probability less than the last token of the top-k - indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None] - logits[indices_to_remove] = filter_value - - if top_p > 0.0: - sorted_logits, sorted_indices = torch.sort(logits, descending=True) - cumulative_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1) - - # Remove tokens with cumulative probability above the threshold - sorted_indices_to_remove = cumulative_probs > top_p - # Shift the indices to the right to keep also the first token above the threshold - sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone() - sorted_indices_to_remove[..., 0] = 0 - - # scatter sorted tensors to original indexing - indices_to_remove = sorted_indices_to_remove.scatter(dim=1, index=sorted_indices, src=sorted_indices_to_remove) - logits[indices_to_remove] = filter_value - return logits - - -def sample_sequence(model, length, context, num_samples=1, temperature=1, top_k=0, top_p=0.0, repetition_penalty=1.0, - is_xlnet=False, is_xlm_mlm=False, xlm_mask_token=None, xlm_lang=None, device='cpu'): - context = torch.tensor(context, dtype=torch.long, device=device) - context = context.unsqueeze(0).repeat(num_samples, 1) - generated = context - with torch.no_grad(): - for _ in range(length): - - inputs = {'input_ids': generated} - if is_xlnet: - # XLNet is a direct (predict same token, not next token) and bi-directional model by default - # => need one additional dummy token in the input (will be masked), attention mask and target mapping (see model docstring) - input_ids = torch.cat((generated, torch.zeros((1, 1), dtype=torch.long, device=device)), dim=1) - perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]), dtype=torch.float, device=device) - perm_mask[:, :, -1] = 1.0 # Previous 
tokens don't see last token - target_mapping = torch.zeros((1, 1, input_ids.shape[1]), dtype=torch.float, device=device) - target_mapping[0, 0, -1] = 1.0 # predict last token - inputs = {'input_ids': input_ids, 'perm_mask': perm_mask, 'target_mapping': target_mapping} - - if is_xlm_mlm and xlm_mask_token: - # XLM MLM models are direct models (predict same token, not next token) - # => need one additional dummy token in the input (will be masked and guessed) - input_ids = torch.cat((generated, torch.full((1, 1), xlm_mask_token, dtype=torch.long, device=device)), dim=1) - inputs = {'input_ids': input_ids} - - if xlm_lang is not None: - inputs["langs"] = torch.tensor([xlm_lang] * inputs["input_ids"].shape[1], device=device).view(1, -1) - - outputs = model(**inputs) # Note: we could also use 'past' with GPT-2/Transfo-XL/XLNet/CTRL (cached hidden-states) - next_token_logits = outputs[0][:, -1, :] / (temperature if temperature > 0 else 1.) - - # repetition penalty from CTRL (https://arxiv.org/abs/1909.05858) - for i in range(num_samples): - for _ in set(generated[i].tolist()): - next_token_logits[i, _] /= repetition_penalty - - filtered_logits = top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p) - if temperature == 0: # greedy sampling: - next_token = torch.argmax(filtered_logits, dim=-1).unsqueeze(-1) - else: - next_token = torch.multinomial(torch.softmax(filtered_logits, dim=-1), num_samples=1) - generated = torch.cat((generated, next_token), dim=1) - return generated diff --git a/convlab2/nlg/scgpt/evaluate.py b/convlab2/nlg/scgpt/evaluate.py new file mode 100644 index 0000000000000000000000000000000000000000..f7435b680a1ed6ea3920706e59b4f4a168d41fc7 --- /dev/null +++ b/convlab2/nlg/scgpt/evaluate.py @@ -0,0 +1,260 @@ +# Part of the evaluation script is adopted from https://github.com/pengbaolin/SC-GPT. +import os +import json +import sys +import math +import operator +import nltk +from collections import Counter +from nltk.util import ngrams + +file = open +class ERRScorer(): + + ## Scorer for calculating the slot errors + ## it scores utterances one by one + ## using two levels of matching + ## 1. exact match for categorical values + ## 2. multiple keyword matching for binary values + ## 3. cannot deal with don't care and none values + def __init__(self, detectfile): + + self.detectPairs = [] + fin = file(detectfile) + self.detectPairs = json.load(fin) + fin.close() + + def countSlots(self, dataset, reader): + count = 0 + for t in dataset: + feat = reader.formatter.format(t[0])[0] + c = count + for s, v in feat: + # skip type token + if s == 'type': + continue + if v == '_' or v == 'yes' or v == 'none' or v == 'no': + count += 1 + return count + + def score(self, a, feat, gen): + # import pdb + # pdb.set_trace() + # total slots + slot_count = 0 + # exact match for categorical slots + caty_slot_error = 0 + # fo each slot - token pair in the detect pair dict + for s, tok in self.detectPairs['general'].items(): + # token compare to + comparetos = ['sv.' + s + '._1', 'sv.' + s + '._2', 'sv.' 
+ s + '._3'] + # count feature count in da feature + fcnt = 0 + for f in feat: + for compareto in comparetos: + if compareto == f: fcnt += 1 + # count generated semantic tokens + gcnt = gen.split().count(tok) + # count the slot difference + # if fcnt!=gcnt: + # caty_slot_error += 1.0 + caty_slot_error += abs(fcnt - gcnt) + # accumulate slot count + slot_count += fcnt + + # key word match for binary slots, only an approximation + bnay_slot_error = 0 + # for each binary slot + for s, toks in self.detectPairs['binary'].items(): + # tokens compare to + comparetos = ['sv.' + s + '.yes', 'sv.' + s + '.no', + 'sv.' + s + '.dontcare', 'sv.' + s + '.none'] + # count feature occurrence in da + fcnt = 0 + for f in feat: + for compareto in comparetos: + if compareto == f: fcnt += 1 + # count generated semantic tokens + gcnt = sum([gen.split().count(tok) for tok in toks]) + # count the slot difference + bnay_slot_error += abs(fcnt - gcnt) + # accumulate slot count + slot_count += fcnt + # total slot error + total_slot_error = caty_slot_error + bnay_slot_error + # when ?select/suggest act, only consider categorical errors + if a == [4] or a == [14]: + # return slot_count, caty_slot_error, caty_slot_error + return 0.0, 0.0, 0.0 + else: + return slot_count, total_slot_error, caty_slot_error + + +class BLEUScorer(object): + ## BLEU score calculator via GentScorer interface + ## it calculates the BLEU-4 by taking the entire corpus in + ## Calulate based multiple candidates against multiple references + def __init__(self): + pass + + def score(self, parallel_corpus): + # ref_ = [] + # hyp_ = [] + # for hyps,refs in parallel_corpus: + # ref_.append(refs) + # hyp_.append(hyps[0]) + # return nltk.translate.bleu_score.corpus_bleu(ref_, hyp_) + # asdf + # containers and parameters + r, c = 0, 0 + count = [0, 0, 0, 0] + clip_count = [0, 0, 0, 0] + weights = [0.25, 0.25, 0.25, 0.25] + + # accumulate ngram statistics + for hyps, refs in parallel_corpus: + # BLEUscore = nltk.translate.bleu_score.sentence_bleu(refs, hyps[0]) + # print(hyps, refs, BLEUscore) + hyps = [hyp.lower().split() for hyp in hyps] + refs = [ref.lower().split() for ref in refs] + # compute ngram counts by matching each hypothesis + for hyp in hyps: + # for each ngram + for i in range(4): + # accumulate hyp ngram counts + hypcnts = Counter(ngrams(hyp, i + 1)) + cnt = sum(hypcnts.values()) + count[i] += cnt + + # compute clipped counts + max_counts = {} + # compare to each reference + for ref in refs: + # get reference ngrams + refcnts = Counter(ngrams(ref, i + 1)) + # for each ngram + for ng in hypcnts: + # clipped counts + max_counts[ng] = max(max_counts.get(ng, 0), refcnts[ng]) + # compute clipped counts by clipping the hyp count if necessary + clipcnt = dict((ng, min(count, max_counts[ng])) \ + for ng, count in hypcnts.items()) + clip_count[i] += sum(clipcnt.values()) + + # accumulate r & c, find best match among all references + bestmatch = [1000, 1000] + for ref in refs: + if bestmatch[0] == 0: break + # length difference + diff = abs(len(ref) - len(hyp)) + # if the current diff less than stored one, change it + if diff < bestmatch[0]: + bestmatch[0] = diff + bestmatch[1] = len(ref) + # extract the best length match in references + r += bestmatch[1] + c += len(hyp) + + # computing bleu score + # for numerical stability + p0 = 1e-7 + # brevity penality + bp = 1 if c > r else math.exp(1 - float(r) / float(c)) + # modified prec. + p_ns = [float(clip_count[i]) / float(count[i] + p0) + p0 \ + for i in range(4)] + # weighted prec. 
+ s = math.fsum(w * math.log(p_n) \ + for w, p_n in zip(weights, p_ns) if p_n) + # final bleu score + bleu = bp * math.exp(s) + return bleu + + def sentence_bleu_4(self, parallel_corpus): + # input : single sentence, multiple references + count = [0, 0, 0, 0] + clip_count = [0, 0, 0, 0] + weights = [0.25, 0.25, 0.25, 0.25] + r = 0 + c = 0 + + # accumulate ngram statistics + for hyps, refs in parallel_corpus: + hyps = [hyp.split() for hyp in hyps] + refs = [ref.split() for ref in refs] + # compute ngram counts by matching each hypothesis + for hyp in hyps: + # for each ngram + for i in range(4): + # accumulate hyp ngram counts + hypcnts = Counter(ngrams(hyp, i + 1)) + cnt = sum(hypcnts.values()) + count[i] += cnt + + # compute clipped counts + max_counts = {} + # compare to each reference + for ref in refs: + # get reference ngrams + refcnts = Counter(ngrams(ref, i + 1)) + # for each ngram + for ng in hypcnts: + # clipped counts + max_counts[ng] = max(max_counts.get(ng, 0), refcnts[ng]) + # compute clipped counts by clipping the hyp count if necessary + clipcnt = dict((ng, min(count, max_counts[ng])) \ + for ng, count in hypcnts.items()) + clip_count[i] += sum(clipcnt.values()) + + # accumulate r & c, find best match among all references + bestmatch = [1000, 1000] + for ref in refs: + if bestmatch[0] == 0: break + # length difference + diff = abs(len(ref) - len(hyp)) + # if the current diff less than stored one, change it + if diff < bestmatch[0]: + bestmatch[0] = diff + bestmatch[1] = len(ref) + # extract the best length match in references + r += bestmatch[1] + c += len(hyp) + + # for numerical stability + p0 = 1e-7 + # modified brevity penality + bp = math.exp(-abs(1.0 - float(r) / float(c + p0))) + # smoothed version of modified prec. + p_ns = [0, 0, 0, 0] + for i in range(4): + if i < 2: # original version n-gram counts + p_ns[i] = float(clip_count[i]) / float(count[i] + p0) + p0 + else: # smoothed version of ngram counts + smooth_term = 5 * p_ns[i - 1] * p_ns[i - 1] / p_ns[i - 2] + p_ns[i] = float(clip_count[i] + smooth_term) / float(count[i] + 5) + p0 + # weighted prec. + s = math.fsum(w * math.log(p_n) for w, p_n in zip(weights, p_ns) if p_n) + # final sentence bleu score + bleu_hyp = bp * math.exp(s) + return bleu_hyp + + +class GentScorer(object): + ## main Scorer interfaces for all scorers + ## it can do + ## 1. Compute bleu score + ## 2. Compute slot error rate + ## 3. 
Detailed illustraction of how differet split + ## of data affect performance + def __init__(self): + self.bleuscorer = BLEUScorer() + + def scoreERR(self, parallel_pairs): + """input: [[dialoge_act, utterance], [dialog_act, utterance], ...]""" + + + def scoreBLEU(self, parallel_corpus): + return self.bleuscorer.score(parallel_corpus) + + def scoreSBLEU(self, parallel_corpus): + return self.bleuscorer.sentence_bleu_4(parallel_corpus) \ No newline at end of file diff --git a/convlab2/nlg/scgpt/main.py b/convlab2/nlg/scgpt/main.py new file mode 100644 index 0000000000000000000000000000000000000000..9f1ed5817d36d12c5f6a69b29c82c63dd585db44 --- /dev/null +++ b/convlab2/nlg/scgpt/main.py @@ -0,0 +1,285 @@ +import sys +sys.path.append('../../..') + +import argparse +from tqdm import tqdm +import torch +import numpy as np +import torch.nn as nn +import torch.nn.functional as F +from transformers import GPT2Tokenizer, GPT2LMHeadModel +from torch.utils.data import DataLoader +from torch.utils.data import Dataset +from torch.utils.tensorboard import SummaryWriter +import os +from transformers import get_linear_schedule_with_warmup + +from convlab2.util.unified_datasets_util import load_dataset, load_nlg_data, load_ontology +from convlab2.nlg.scgpt.util import act2str +from convlab2.nlg.scgpt.model import SCGPTDataset +from evaluate import GentScorer + +import torch.distributed as dist +from torch.nn.parallel import DistributedDataParallel as DDP + +from util import build_mask +from scgpt_special_tokens import * + +code_test = False + +parser = argparse.ArgumentParser() +parser.add_argument("--local_rank", default=-1, type=int) +parser.add_argument('--do_train', action="store_true", help="Whether to run training.") +parser.add_argument('--dataset', default="multiwoz21", type=str, help="The name of the dataset to be used.") +parser.add_argument('--model_path', default="", type=str, help="The path of model for testing.") +parser.add_argument("--max_seq_len", default=256, type=int) +FLAGS = parser.parse_args() +local_rank = FLAGS.local_rank + +torch.cuda.set_device(local_rank) +dist.init_process_group(backend='nccl') + +# TensorBoard +tb_writer = SummaryWriter() + +special_tokens = [START_OF_PRED, END_OF_PRED, SYS_SPEAK, USR_SPEAK] +## load model +tokenizer = GPT2Tokenizer.from_pretrained('./gpt2') +tokenizer.add_special_tokens({'pad_token': PAD_TOKEN, 'eos_token': END_OF_PRED, 'additional_special_tokens': special_tokens}) +model = GPT2LMHeadModel.from_pretrained('./gpt2').to(local_rank) +model.resize_token_embeddings(len(tokenizer)) + +nll_loss = nn.NLLLoss(reduce=False).to(local_rank) +ce_loss = nn.CrossEntropyLoss(reduce=False).to(local_rank) +def cal_loss(input, target, seq_lens, seq_lens_input): + """Only calculate loss on responses, not on dialog act""" + global nll_loss + """Input: [batch, length, vocab]; target: [batch, length]; seq_lens: [batch]""" + log_probs = F.log_softmax(input, dim=-1).transpose(1, 2) + loss = nll_loss(log_probs, target) + # loss = ce_loss(input, target) + mask = build_mask(torch.max(seq_lens).item()-1, seq_lens-1).to(local_rank) + input_mask = build_mask(torch.max(seq_lens).item()-1, seq_lens_input-1).to(local_rank) + output_mask = torch.logical_xor(mask, input_mask) + pad_mask = torch.logical_not(mask) + # masked_loss = loss * output_mask + masked_loss = loss * (output_mask + pad_mask) + mean_loss = torch.sum(masked_loss) / torch.sum(output_mask + pad_mask) + return mean_loss + + +def pad_collate(batch): + """ + Returns: + batch: batch * max_len + seq_lens: the length of 
len(da)+1+len(response) + seq_lens_input: the length of len(da) + """ + START_OF_PRED_ID = tokenizer._convert_token_to_id_with_added_voc(START_OF_PRED) + pad_token_id = tokenizer.pad_token_id + batch = [item[0] + [START_OF_PRED_ID] + item[1] for item in batch] + batch = [item[-FLAGS.max_seq_len:] for item in batch] + max_len = max([len(item) for item in batch]) + seq_lens = [len(item) for item in batch] + split_id = tokenizer._convert_token_to_id_with_added_voc(START_OF_PRED) + def get_x_len(tokens): + """Get the length of dialogue act tokens""" + split_idx = len(tokens) + try: + split_idx = tokens.index(split_id)+1 + except: + pass + return split_idx + seq_lens_input = [get_x_len(item) for item in batch] + batch = [item + [pad_token_id]*(max_len-len(item)) for item in batch] + return torch.LongTensor(batch), torch.LongTensor(seq_lens), torch.LongTensor(seq_lens_input) + +## Training Hyper-params +EPOCH_NUM = 20 +BATCH_SIZE = 20 # real_batch_size = BATCH_SIZE * num_gpu +VAL_STEP = 300 +WARM_STEPS = 250 +if code_test: + EPOCH_NUM = 2 + BATCH_SIZE = 4 + VAL_STEP = 2 + WARM_STEPS = 3 +LR = 5e-5 +TASK_TYPE = 'nlu' # nlu or dst +SAVE_PATH = f'./saved_model' +def train(model, nlg_data, global_step=0): + train_dataset = SCGPTDataset(nlg_data['train'], tokenizer) + train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) + train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, num_workers=2, sampler=train_sampler, collate_fn=pad_collate) + + val_dataset = SCGPTDataset(nlg_data['validation'], tokenizer) + val_sampler = torch.utils.data.distributed.DistributedSampler(val_dataset) + val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, num_workers=2, sampler=val_sampler, collate_fn=pad_collate) + + model = DDP(model, device_ids=[local_rank], output_device=local_rank) + optimizer = torch.optim.AdamW(model.parameters(), lr=LR) + scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=WARM_STEPS, + num_training_steps=len(train_dataloader) * EPOCH_NUM) + model.train() + for epoch in range(EPOCH_NUM): + train_dataloader.sampler.set_epoch(epoch) + for batch_id, (inputs, seq_lens, seq_lens_input) in enumerate(tqdm(train_dataloader, desc=f'EPOCH:[{epoch+1}/{EPOCH_NUM}]')): + inputs = inputs.to(local_rank) + seq_lens = seq_lens.to(local_rank) + seq_lens_input = seq_lens_input.to(local_rank) + + outputs = model(inputs) + preds = outputs[0] + loss = cal_loss(preds[:, :-1, :], inputs[:, 1:], seq_lens, seq_lens_input) + + optimizer.zero_grad() + loss.backward() + optimizer.step() + scheduler.step() + tb_writer.add_scalar(f'Train/loss', loss.item(), global_step) + tb_writer.add_scalar(f'Train/PPL', torch.exp(loss).item(), global_step) + tb_writer.add_scalar(f'Train/Learning Rate', scheduler.get_last_lr()[0], global_step) + + if batch_id % VAL_STEP == 0: + model.eval() + val_loss = eval(model, val_dataloader) + ppl = np.exp(val_loss) + tb_writer.add_scalar(f'Val/Loss', val_loss, global_step) + tb_writer.add_scalar(f'Val/PPL', ppl, global_step) + model.train() + global_step += 1 + # save the model when each epoch ends + if dist.get_rank() == 0: + save_dir = os.path.join(SAVE_PATH, f'epoch_{epoch}') + os.makedirs(save_dir, exist_ok=True) + torch.save(model.module.state_dict(), os.path.join(save_dir, f'epoch_{epoch}_step{global_step}.pt')) + tokenizer.save_pretrained(save_dir) + torch.save(optimizer.state_dict(), os.path.join(save_dir, 'optimizer.pt')) + torch.save(scheduler.state_dict(), os.path.join(save_dir, 'scheduler.pt')) + print(f'Save model 
checkpoint to [{save_dir}]') + tb_writer.flush() + + +def eval(model, loader, use_tqdm=False): + with torch.no_grad(): + loss_list = [] + iter = tqdm(loader, desc='Val') if use_tqdm else loader + for inputs, seq_lens, seq_lens_input in iter: + inputs = inputs.to(local_rank) + seq_lens = seq_lens.to(local_rank) + seq_lens_input = seq_lens_input.to(local_rank) + outputs = model(inputs) + preds = outputs[0] + loss = cal_loss(preds[:, :-1, :], inputs[:, 1:], seq_lens, seq_lens_input) + loss_list.append(loss.item()) + mean_loss = np.mean(loss_list) + return mean_loss + + +def inference_batch(model, sents): + """Inference model given a batch of sents.""" + with torch.no_grad(): + sents = [sent + ' ' + START_OF_PRED for sent in sents] + sent_ids = [tokenizer.encode(sent) for sent in sents] + max_len = max([len(sent) for sent in sent_ids]) + sent_ids = [sent + [tokenizer.pad_token_id]*(max_len-len(sent)) for sent in sent_ids] + inputs = torch.LongTensor(sent_ids).to(local_rank) + model_to_run = model.module if type(model) is DDP else model + outputs = model_to_run.generate(inputs, max_length=FLAGS.max_seq_len, eos_token_id=tokenizer.pad_token_id, + pad_token_id=tokenizer.pad_token_id) # greedy + # outputs = model_to_run.generate(inputs, num_beams=4, max_length=513, eos_token_id=gpt2_tokenizer.eos_token_id, + # pad_token_id=gpt2_tokenizer.pad_token_id) # beam search + output_strs = [tokenizer.decode(item) for item in outputs] + return output_strs + + +def inference_sent(model, sent): + """Inference model given one single sentence.""" + return inference_batch(model, [sent])[0] + + +def inference_sents(model, sents): + """Get the outputs of multiple sentences.""" + outputs = [] + for sent in tqdm(sents, desc='Inference Sentences'): + output = inference_sent(model, sent) + outputs.append(output) + return outputs + + +def test(model, nlg_data, ontology, model_path): + """Run the test with the number of GPUs in the shell script set to 1.""" + model.load_state_dict(torch.load(model_path)) + model.eval() + print(f'model loaded from [{model_path}]') + # sample_file = os.path.join(f'../../../data/dstc2/sample50_{TASK_TYPE}_input_data.txt') + # Load test nlg data + test_data = nlg_data['test'] + dialog_acts = [act2str(item['dialogue_acts']) for item in test_data] + golden_responses = [item['utterance'] for item in test_data] + # dialog_acts = dialog_acts[:10] + # golden_responses = golden_responses[:10] + outputs = inference_sents(model, dialog_acts) + if dist.get_rank() == 0: + output_file = './test_output.txt' + with open(output_file, 'w+') as f: + for i in range(len(dialog_acts)): + f.write(f'{dialog_acts[i]}\n{golden_responses[i]}\n{outputs[i]}\n\n') + f.close() + evaluator = GentScorer() + parallel_corpus = [] + # BLEU + for i in range(len(dialog_acts)): + parallel_corpus.append([[golden_responses[i]], [outputs[i]]]) + BLEU_Score = evaluator.scoreSBLEU(parallel_corpus) + # ERR + ## all values in ontology + val2ds_dict = {} + for domain_name in ontology['domains']: + domain = ontology['domains'][domain_name] + for slot_name in domain['slots']: + slot = domain['slots'][slot_name] + if 'possible_values' not in slot: + continue + possible_vals = slot['possible_values'] + if len(possible_vals) > 0: + for val in possible_vals: + val2ds_dict[val] = f'{domain_name}-{slot_name}' + ## missing values + score_list = [] + for item in test_data: + da = item['dialogue_acts'] + utterance = item['utterance'] + missing_count = 0 + redundant_count = 0 + all_count = 0 + all_values = set() + for key in da: + slot_value = da[key] + for triple in slot_value: + if 'value'
in triple: + value = triple['value'] + all_values.add(value) + if value.strip().lower() not in utterance.lower(): + missing_count += 1 + all_count += 1 + if all_count == 0: + continue + ## redundant values + for val in val2ds_dict: + if f' {val.strip().lower()} ' in f' {utterance.strip().lower()} ' and val.strip().lower() not in all_values: + redundant_count += 1 + item_score = float(missing_count + redundant_count) / all_count + score_list.append(item_score) + ERR_Score = np.mean(score_list) + print(f'BLEU: {BLEU_Score}\nERR_Score: {ERR_Score}') + + +if __name__ == '__main__': + dataset = load_dataset(FLAGS.dataset) + ontology = load_ontology(FLAGS.dataset) + nlg_data = load_nlg_data(dataset) + if FLAGS.do_train: + train(model, nlg_data) + else: + test(model, nlg_data, ontology, FLAGS.model_path) diff --git a/convlab2/nlg/scgpt/model.py b/convlab2/nlg/scgpt/model.py new file mode 100644 index 0000000000000000000000000000000000000000..9a41cfa53e92938d781abc27bc7082c3948b7a0d --- /dev/null +++ b/convlab2/nlg/scgpt/model.py @@ -0,0 +1,28 @@ +from torch.utils.data import Dataset +from util import act2str +from scgpt_special_tokens import * +import torch +import numpy as np + +class SCGPTDataset(Dataset): + def __init__(self, data, tokenizer): + """ + Args: + data: a list of dicts, each with 'dialogue_acts' and 'utterance' fields + tokenizer: GPT2 Tokenizer + """ + self.data = [] + length_list = [] + for item in data: + da, response = item['dialogue_acts'], item['utterance'] + da_tokens = tokenizer.encode(act2str(da)) + response_tokens = tokenizer.encode(response) + length_list.append(len(da_tokens) + len(response_tokens) + 1) + self.data.append([da_tokens, response_tokens]) + print(f'max: {np.max(length_list)}, min: {np.min(length_list)}, median: {np.quantile(length_list, 0.5)}, 0.99: {np.quantile(length_list, 0.99)}') + + def __len__(self): + return len(self.data) + + def __getitem__(self, idx): + return self.data[idx] \ No newline at end of file diff --git a/convlab2/nlg/scgpt/modeling_utils.py b/convlab2/nlg/scgpt/modeling_utils.py deleted file mode 100644 index a8b3f6ddfc6b7347c624446bf7869c67d3064cc1..0000000000000000000000000000000000000000 --- a/convlab2/nlg/scgpt/modeling_utils.py +++ /dev/null @@ -1,53 +0,0 @@ -import warnings -from contextlib import nullcontext -from typing import TYPE_CHECKING -import torch.cuda.amp as amp -import transformers -from transformers import GPT2LMHeadModel - - -# reference: https://pytorch.org/docs/master/notes/amp_examples.html -class AmpGPT2LMHeadModel(GPT2LMHeadModel): - if TYPE_CHECKING: - # For IDE's code hinting - forward = GPT2LMHeadModel.forward - else: - def forward(self, *args, **kwargs): - with amp.autocast(): - return super().forward(*args, **kwargs) - - -def try_enable_gradient_checkpointing(model: "transformers.modeling_utils.PreTrainedModel"): - if model.supports_gradient_checkpointing: - model.gradient_checkpointing_enable() - else: - warnings.warn(f"{type(model)} doesn't support gradient_checkpointing") - - -class AmpHelper: - """ - References: - https://pytorch.org/docs/master/notes/amp_examples.html - """ - def __init__(self, use_amp=True): - self.use_amp = use_amp - self.might_enable_autocast = amp.autocast() if use_amp else nullcontext() - self.scaler = amp.GradScaler() - - def backward(self, loss): - if self.use_amp: - return self.scaler.scale(loss).backward() - else: - return loss.backward() - - def step(self, optimizer): - if self.use_amp: - self.scaler.step(optimizer) - self.scaler.update() - else: - optimizer.step() - - def might_unscale_(self,
optimizer): - if self.use_amp: - # Unscales the gradients of optimizer's assigned params in-place - self.scaler.unscale_(optimizer) \ No newline at end of file diff --git a/convlab2/nlg/scgpt/scgpt.py b/convlab2/nlg/scgpt/scgpt.py new file mode 100644 index 0000000000000000000000000000000000000000..d184abff9c8d7179a907c3084bb2f22b5760dc1e --- /dev/null +++ b/convlab2/nlg/scgpt/scgpt.py @@ -0,0 +1,45 @@ +import sys +sys.path.append('../../..') + +import torch +from transformers import GPT2Tokenizer, GPT2LMHeadModel +from torch.nn.parallel import DistributedDataParallel as DDP + +from convlab2.nlg.nlg import NLG +from util import act2str +from scgpt_special_tokens import * + +special_tokens = [START_OF_PRED, END_OF_PRED, SYS_SPEAK, USR_SPEAK] + +class SCGPT(NLG): + def __init__(self, dataset_name, model_path, device='cpu'): + super(SCGPT, self).__init__() + self.device = device + self.model = GPT2LMHeadModel.from_pretrained('gpt2').to(self.device) + self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2') + self.tokenizer.add_special_tokens({'pad_token': PAD_TOKEN, 'eos_token': END_OF_PRED, + 'additional_special_tokens': special_tokens}) + self.model.resize_token_embeddings(len(self.tokenizer)) + self.model.load_state_dict(torch.load(model_path)) + + + def generate(self, action): + action_str = act2str(action) + output = self._inference_batch([action_str])[0] + return output + + def _inference_batch(self, sents): + with torch.no_grad(): + sents = [sent + ' ' + START_OF_PRED for sent in sents] + sent_ids = [self.tokenizer.encode(sent) for sent in sents] + max_len = max([len(sent) for sent in sent_ids]) + sent_ids = [sent + [self.tokenizer.pad_token_id] * (max_len - len(sent)) for sent in sent_ids] + inputs = torch.LongTensor(sent_ids).to(self.device) + model_to_run = self.model.module if type(self.model) is DDP else self.model + outputs = model_to_run.generate(inputs, max_length=256, + eos_token_id=self.tokenizer.pad_token_id, + pad_token_id=self.tokenizer.pad_token_id) # greedy + # outputs = model_to_run.generate(inputs, num_beams=4, max_length=513, eos_token_id=gpt2_tokenizer.eos_token_id, + # pad_token_id=gpt2_tokenizer.pad_token_id) # beam search + output_strs = [self.tokenizer.decode(item) for item in outputs] + return output_strs \ No newline at end of file diff --git a/convlab2/nlg/scgpt/scgpt_special_tokens.py b/convlab2/nlg/scgpt/scgpt_special_tokens.py new file mode 100644 index 0000000000000000000000000000000000000000..643820dd04e26bde83edddcd4581784577ad3853 --- /dev/null +++ b/convlab2/nlg/scgpt/scgpt_special_tokens.py @@ -0,0 +1,14 @@ +# separator +SYS_SPEAK = '[sys_speak]' +USR_SPEAK = '[usr_speak]' +START_OF_PRED = '[start_of_pred]' +END_OF_PRED = '[end_of_pred]' +PAD_TOKEN = '[_pad_token_]' +START_OF_INTENT = '[start_of_intent]' +END_OF_INTENT = '[end_of_intent]' +START_OF_SLOT = '' + +SPECIAL_TOKENS = [val for name, val in globals().items() if + str.isupper(name) and isinstance(val, str) and val and val[0] == '[' and val[-1] == ']'] + +assert all(token.islower() for token in SPECIAL_TOKENS) \ No newline at end of file diff --git a/convlab2/nlg/scgpt/test.sh b/convlab2/nlg/scgpt/test.sh new file mode 100644 index 0000000000000000000000000000000000000000..96d51bc6429e5d5248be24ab1c3331b0f195349c --- /dev/null +++ b/convlab2/nlg/scgpt/test.sh @@ -0,0 +1 @@ +CUDA_VISIBLE_DEVICES="6" python -m torch.distributed.launch --nproc_per_node 1 --master_port 3046 main.py --dataset multiwoz21 \ No newline at end of file diff --git a/convlab2/nlg/scgpt/train.py b/convlab2/nlg/scgpt/train.py 
deleted file mode 100644 index 0878f31353735ede8b2036ec1f46ef56ce129bed..0000000000000000000000000000000000000000 --- a/convlab2/nlg/scgpt/train.py +++ /dev/null @@ -1,682 +0,0 @@ -from __future__ import absolute_import, division, print_function - -import argparse -import glob -import logging -import os -import pickle -import random -import re -import shutil - -import numpy as np -import torch -from tqdm import tqdm, trange -from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler -from torch.utils.data.distributed import DistributedSampler - -try: - from torch.utils.tensorboard import SummaryWriter -except ImportError: - from tensorboardX import SummaryWriter - -from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup, - BertConfig, BertForMaskedLM, GPT2Config, GPT2LMHeadModel, GPT2Tokenizer, - OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer, GPT2TokenizerFast, - RobertaConfig, RobertaForMaskedLM, RobertaTokenizer, - DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer, BertTokenizer) -from convlab2.nlg.scgpt.modeling_utils import AmpGPT2LMHeadModel, try_enable_gradient_checkpointing, AmpHelper - -logger = logging.getLogger(__name__) - -MODEL_CLASSES = { - 'gpt2': (GPT2Config, GPT2LMHeadModel, GPT2TokenizerFast), - 'openai-gpt': (OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer), - 'bert': (BertConfig, BertForMaskedLM, BertTokenizer), - 'roberta': (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer), - 'distilbert': (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer) -} - - -def closest_multiple_of_8(n): - """ - Returns: - a closest number, which is a multiple of 8 and >= n - """ - return ((n + 7) >> 3) << 3 - - -class TextDataset(Dataset): - def __init__(self, tokenizer, args, file_path='train', block_size=512, max_seq=80): - assert os.path.isfile(file_path) - directory, filename = os.path.split(file_path) - cached_features_file = os.path.join(directory, args.model_name_or_path + '_cached_lm_' + str( - block_size) + '_seqlen_' + str(max_seq) + '_' + filename) - - if os.path.exists(cached_features_file) and not args.overwrite_cache: - logger.info("Loading features from cached file %s", cached_features_file) - with open(cached_features_file, 'rb') as handle: - self.examples = pickle.load(handle) - else: - logger.info("Creating features from dataset file at %s", directory) - - self.examples = [] - - with open(file_path, encoding="utf-8") as f: - if args.text_chunk: - text = f.read() - tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text)) - else: - for line in f: - tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(line.strip() + ' eos')) - self.examples.append(tokenized_text) - - if args.text_chunk: - for i in range(0, len(tokenized_text) - block_size + 1, block_size): # Truncate in block of block_size - self.examples.append(tokenizer.build_inputs_with_special_tokens(tokenized_text[i:i + block_size])) - - # Note that we are loosing the last truncated example here for the sake of simplicity (no padding) - # If your dataset is small, first you should look for a bigger one :-) and second you - # can change this behavior by adding (model specific) padding. 
- - logger.info("Saving features into cached file %s", cached_features_file) - with open(cached_features_file, 'wb') as handle: - pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL) - - def __len__(self): - return len(self.examples) - - def __getitem__(self, item): - return torch.tensor(self.examples[item]) - - -class TextSeqDataset(Dataset): - def __init__(self, tokenizer, args, file_path='train', block_size=512, max_seq=80, separator=' & '): - max_seq = closest_multiple_of_8(max_seq) - assert os.path.isfile(file_path) - directory, filename = os.path.split(file_path) - cached_features_file = os.path.join(directory, args.output_dir.replace(os.sep, '_') + '_cached_lm_' + str( - block_size) + '_seqlen_' + str(max_seq) + '_' + filename) - - if os.path.exists(cached_features_file) and not args.overwrite_cache: - logger.info("Loading features from cached file %s", cached_features_file) - with open(cached_features_file, 'rb') as handle: - self.examples, self.masks, self.labels, self.seq_lengths = pickle.load(handle) - else: - logger.info("Creating features from dataset file at %s", directory) - self.examples = [] - self.labels = [] - self.masks = [] - self.seq_lengths = [] - with open(file_path, encoding="utf-8") as f: - for line in tqdm(f): - line = line.strip() - raw_str = line.lower() # do we need lowercase? - code_str = line.lower().split(separator)[0] + separator - code_str = code_str.strip() - if len(raw_str.split()) > max_seq -1: - raw_str = ' '.join(raw_str.split()[:max_seq -1]) - raw_str += ' ' + tokenizer.eos_token - if args.use_tokenize: - tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(raw_str)) - code_str_len = len(tokenizer.convert_tokens_to_ids(tokenizer.tokenize(code_str))) - else: - tokenized_text = tokenizer.convert_tokens_to_ids(raw_str.split()) - code_str_len = len(tokenizer.convert_tokens_to_ids(code_str.split())) - - label = [-1] * max_seq - label[:len(tokenized_text)] = tokenized_text - mask = [1] * max_seq - - if len(tokenized_text) < max_seq: - self.seq_lengths.append(len(tokenized_text)) - mask[-(max_seq - len(tokenized_text)):] = [0] * (max_seq - len(tokenized_text)) - # label[code_str_len:len(tokenized_text)] = tokenized_text[code_str_len:] - tokenized_text = tokenized_text + [tokenizer.eos_token_id] * (max_seq - len(tokenized_text)) - else: - self.seq_lengths.append(max_seq) - tokenized_text = tokenized_text[:max_seq] - # label[code_str_len:] = tokenized_text[code_str_len:] - - self.examples.append(tokenized_text) - self.masks.append(mask) - self.labels.append(label) - - # Note that we are loosing the last truncated example here for the sake of simplicity (no padding) - # If your dataset is small, first you should look for a bigger one :-) and second you - # can change this behavior by adding (model specific) padding. 
- if args.with_code_loss: - self.labels = self.examples - logger.info("Saving features into cached file %s", cached_features_file) - with open(cached_features_file, 'wb') as handle: - pickle.dump((self.examples, self.masks, self.labels, self.seq_lengths), handle, - protocol=pickle.HIGHEST_PROTOCOL) - - def __len__(self): - return len(self.examples) - - def __getitem__(self, item): - return torch.tensor(self.examples[item]), torch.tensor(self.masks[item]), torch.tensor( - self.labels[item]), torch.tensor(self.seq_lengths[item]) - - -def load_and_cache_examples(args, tokenizer, evaluate=False): - dataset = TextSeqDataset(tokenizer, args, file_path=args.eval_data_file if evaluate else args.train_data_file, - block_size=args.block_size, max_seq=args.max_seq) - return dataset - - -def set_seed(args): - random.seed(args.seed) - np.random.seed(args.seed) - torch.manual_seed(args.seed) - if args.n_gpu > 0: - torch.cuda.manual_seed_all(args.seed) - - -def _rotate_checkpoints(args, checkpoint_prefix, use_mtime=False): - if not args.save_total_limit: - return - if args.save_total_limit <= 0: - return - - # Check if we should delete older checkpoint(s) - glob_checkpoints = glob.glob(os.path.join(args.output_dir, '{}-*'.format(checkpoint_prefix))) - if len(glob_checkpoints) <= args.save_total_limit: - return - - ordering_and_checkpoint_path = [] - for path in glob_checkpoints: - if use_mtime: - ordering_and_checkpoint_path.append((os.path.getmtime(path), path)) - else: - regex_match = re.match('.*{}-([0-9]+)'.format(checkpoint_prefix), path) - if regex_match and regex_match.groups(): - ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path)) - - checkpoints_sorted = sorted(ordering_and_checkpoint_path) - checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted] - number_of_checkpoints_to_delete = max(0, len(checkpoints_sorted) - args.save_total_limit) - checkpoints_to_be_deleted = checkpoints_sorted[:number_of_checkpoints_to_delete] - for checkpoint in checkpoints_to_be_deleted: - logger.info("Deleting older checkpoint [{}] due to args.save_total_limit".format(checkpoint)) - shutil.rmtree(checkpoint) - - -def mask_tokens(inputs, tokenizer, args): - """ Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. 
""" - labels = inputs.clone() - # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa) - probability_matrix = torch.full(labels.shape, args.mlm_probability) - special_tokens_mask = [tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in - labels.tolist()] - probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0) - masked_indices = torch.bernoulli(probability_matrix).bool() - labels[~masked_indices] = -1 # We only compute loss on masked tokens - - # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK]) - indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices - inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token) - - # 10% of the time, we replace masked input tokens with random word - indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced - random_words = torch.randint(len(tokenizer), labels.shape, dtype=torch.long) - inputs[indices_random] = random_words[indices_random] - - # The rest of the time (10% of the time) we keep the masked input tokens unchanged - return inputs, labels - - -def preprocess_batch(inputs, masks, labels, seq_lengths): - """ - The real sequence length of a batch may be shorter than max_seq of the whole dataset. - Remove some padding tokens to accelerate the training process. - And make sure that the sequence length is multiple of 8. - - References: - https://huggingface.co/transformers/performance.html#fp16 - """ - # The gain for FP16 training is that in each of those cases, the training with the flag --fp16 is twice as fast, - # which does require every tensor to have every dimension be a multiple of 8 - # (examples pad the tensors to a sequence length that is a multiple of 8). 
- max_seq_len = seq_lengths.max() - max_seq_len = closest_multiple_of_8(max_seq_len) - return inputs[:, :max_seq_len], masks[:, :max_seq_len], labels[:, :max_seq_len] - - -def train(args, train_dataset, model, tokenizer): - """ Train the model """ - if args.local_rank in [-1, 0]: - tb_writer = SummaryWriter() - - args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) - train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset) - train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) - - if args.max_steps > 0: - t_total = args.max_steps - args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1 - else: - t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs - - # Prepare optimizer and schedule (linear warmup and decay) - no_decay = ['bias', 'LayerNorm.weight'] - optimizer_grouped_parameters = [ - {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], - 'weight_decay': args.weight_decay}, - {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} - ] - optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) - scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, - num_training_steps=t_total) - # https://pytorch.org/docs/master/notes/amp_examples.html - amp_helper = AmpHelper(use_amp=args.fp16) - if args.n_gpu > 1: - model = torch.nn.DataParallel(model) - - # Distributed training - if args.local_rank != -1: - model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], - output_device=args.local_rank, - find_unused_parameters=False) - - # Train! - logger.info("***** Running training *****") - logger.info(" Num examples = %d", len(train_dataset)) - logger.info(" Num Epochs = %d", args.num_train_epochs) - logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) - logger.info(" Total train batch size (w. 
parallel, distributed & accumulation) = %d", - args.train_batch_size * args.gradient_accumulation_steps * ( - torch.distributed.get_world_size() if args.local_rank != -1 else 1)) - logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) - logger.info(" Total optimization steps = %d", t_total) - - global_step = 0 - tr_loss, logging_loss = 0.0, 0.0 - model.zero_grad() - train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) - set_seed(args) # Added here for reproducibility (even between python 2 and 3) - for e in train_iterator: - - # epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) - for step, batch in enumerate(train_dataloader): - # inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch) - logger.info(f" PROGRESS: {float(global_step) / t_total * 100}%") - inputs, masks, labels, seq_lengths = batch - inputs, masks, labels = preprocess_batch(inputs, masks, labels, seq_lengths) # cut seq - # import pdb - # pdb.set_trace() - inputs = inputs.to(args.device) - # masks = masks.to(args.device) - labels = labels.to(args.device) - - model.train() - try: - with amp_helper.might_enable_autocast: - outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels) - loss = outputs[0] # model outputs are always tuple in transformers (see doc) - - if args.n_gpu > 1: - loss = loss.mean() # mean() to average on multi-gpu parallel training - if args.gradient_accumulation_steps > 1: - loss = loss / args.gradient_accumulation_steps - - amp_helper.backward(loss) - except RuntimeError as e: - if 'CUDA out of memory' in str(e): - # if out of memory, we must choose smaller batch_size - print(f'inputs.shape = {inputs.shape}, labels.shape = {labels.shape}') - raise - - tr_loss += loss.item() - if (step + 1) % args.gradient_accumulation_steps == 0: - amp_helper.might_unscale_(optimizer) - torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) - # optimizer.step() - amp_helper.step(optimizer) - scheduler.step() # Update learning rate schedule - model.zero_grad() - global_step += 1 - - if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: - # Log metrics - if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well - results = evaluate(args, model, tokenizer) - for key, value in results.items(): - tb_writer.add_scalar('eval_{}'.format(key), value, global_step) - tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step) - tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step) - logger.info(f" EVALERR: {(tr_loss - logging_loss) / float(args.logging_steps)}") - logging_loss = tr_loss - - if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: - checkpoint_prefix = 'checkpoint' - # Save model checkpoint - output_dir = os.path.join(args.output_dir, '{}-{}'.format(checkpoint_prefix, global_step)) - if not os.path.exists(output_dir): - os.makedirs(output_dir) - model_to_save = model.module if hasattr(model, - 'module') else model # Take care of distributed/parallel training - model_to_save.save_pretrained(output_dir) - tokenizer.save_pretrained(output_dir) - torch.save(args, os.path.join(output_dir, 'training_args.bin')) - logger.info("Saving model checkpoint to %s", output_dir) - - _rotate_checkpoints(args, checkpoint_prefix) - - if 
global_step > args.max_steps > 0: - train_iterator.close() - break - - if args.local_rank in [-1, 0]: - tb_writer.close() - - return global_step, tr_loss / global_step - - -def evaluate(args, model, tokenizer, prefix=""): - # Loop to handle MNLI double evaluation (matched, mis-matched) - eval_output_dir = args.output_dir - - eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True) - - if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]: - os.makedirs(eval_output_dir) - - args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) - # Note that DistributedSampler samples randomly - eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset) - eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size) - - # multi-gpu evaluate - if args.n_gpu > 1 and not (isinstance(model, torch.nn.DataParallel) or - isinstance(model, torch.nn.parallel.DistributedDataParallel)): - # if args.evaluate_during_training, DataParallel is already used - model = torch.nn.DataParallel(model) - - # Eval! - logger.info("***** Running evaluation {} *****".format(prefix)) - logger.info(" Num examples = %d", len(eval_dataset)) - logger.info(" Batch size = %d", args.eval_batch_size) - eval_loss = 0.0 - nb_eval_steps = 0 - model.eval() - - for batch in tqdm(eval_dataloader, desc="Evaluating"): - # inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch) - - inputs, masks, labels, seq_lengths = batch - inputs, masks, labels = preprocess_batch(inputs, masks, labels, seq_lengths) # cut seq - # import pdb - # pdb.set_trace() - inputs = inputs.to(args.device) - masks = masks.to(args.device) - labels = labels.to(args.device) - # inputs = inputs.to(args.device) - # labels = labels.to(args.device) - - with torch.no_grad(): - outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels) - loss = outputs[0] # model outputs are always tuple in transformers (see doc) - eval_loss += loss.mean().item() - nb_eval_steps += 1 - - eval_loss = eval_loss / nb_eval_steps - perplexity = float(np.exp(eval_loss)) - - result = { - "perplexity": perplexity - } - - output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt") - with open(output_eval_file, "w") as writer: - logger.info("***** Eval results {} *****".format(prefix)) - for key in sorted(result.keys()): - logger.info(" %s = %s", key, str(result[key])) - writer.write("%s = %s\n" % (key, str(result[key]))) - - return result - - -def main(): - global AdamW - parser = argparse.ArgumentParser() - - ## Required parameters - parser.add_argument("--train_data_file", default=None, type=str, required=True, - help="The input training data file (a text file).") - parser.add_argument("--output_dir", default=None, type=str, required=True, - help="The output directory where the model predictions and checkpoints will be written.") - - ## Other parameters - parser.add_argument("--eval_data_file", default=None, type=str, - help="An optional input evaluation data file to evaluate the perplexity on (a text file).") - - parser.add_argument("--model_type", default="gpt2", type=str, - help="The model architecture to be fine-tuned.") - parser.add_argument("--model_name_or_path", default="gpt2", type=str, - help="The model checkpoint for weights initialization.") - - parser.add_argument("--mlm", action='store_true', - help="Train with masked-language modeling loss instead of language modeling.") - 
parser.add_argument("--mlm_probability", type=float, default=0.15, - help="Ratio of tokens to mask for masked language modeling loss") - - parser.add_argument("--config_name", default="", type=str, - help="Optional pretrained config name or path if not the same as model_name_or_path") - parser.add_argument("--tokenizer_name", default="", type=str, - help="Optional pretrained tokenizer name or path if not the same as model_name_or_path") - parser.add_argument("--cache_dir", default="", type=str, - help="Optional directory to store the pre-trained models downloaded from s3 (instread of the default one)") - parser.add_argument("--block_size", default=80, type=int, - help="Optional input sequence length after tokenization." - "The training dataset will be truncated in block of this size for training." - "Default to the model max input length for single sentence inputs (take into account special tokens).") - parser.add_argument("--do_train", action='store_true', - help="Whether to run training.") - parser.add_argument("--do_eval", action='store_true', - help="Whether to run eval on the dev set.") - parser.add_argument("--evaluate_during_training", action='store_true', - help="Run evaluation during training at each logging step.") - parser.add_argument("--do_lower_case", action='store_true', - help="Set this flag if you are using an uncased model.") - - parser.add_argument("--per_gpu_train_batch_size", default=1, type=int, - help="Batch size per GPU/CPU for training.") - parser.add_argument("--per_gpu_eval_batch_size", default=1, type=int, - help="Batch size per GPU/CPU for evaluation.") - parser.add_argument('--gradient_accumulation_steps', type=int, default=1, - help="Number of updates steps to accumulate before performing a backward/update pass.") - parser.add_argument("--learning_rate", default=1e-5, type=float, - help="The initial learning rate for Adam.") - parser.add_argument("--weight_decay", default=0.0, type=float, - help="Weight deay if we apply some.") - parser.add_argument("--adam_epsilon", default=1e-8, type=float, - help="Epsilon for Adam optimizer.") - parser.add_argument("--max_grad_norm", default=1.0, type=float, - help="Max gradient norm.") - parser.add_argument("--num_train_epochs", default=5.0, type=float, - help="Total number of training epochs to perform.") - parser.add_argument("--max_steps", default=-1, type=int, - help="If > 0: set total number of training steps to perform. 
Override num_train_epochs.") - parser.add_argument("--warmup_steps", default=0, type=int, - help="Linear warmup over warmup_steps.") - - parser.add_argument('--logging_steps', type=int, default=100, - help="Log every X updates steps.") - parser.add_argument('--save_steps', type=int, default=5000, - help="Save checkpoint every X updates steps.") - parser.add_argument('--save_total_limit', type=int, default=None, - help='Limit the total amount of checkpoints, delete the older checkpoints in the output_dir, does not delete by default') - parser.add_argument("--eval_all_checkpoints", action='store_true', - help="Evaluate all checkpoints starting with the same prefix as model_name_or_path ending and ending with step number") - parser.add_argument("--no_cuda", action='store_true', - help="Avoid using CUDA when available") - parser.add_argument('--overwrite_output_dir', action='store_true', - help="Overwrite the content of the output directory") - parser.add_argument('--overwrite_cache', action='store_true', - help="Overwrite the cached training and evaluation sets") - parser.add_argument('--seed', type=int, default=42, - help="random seed for initialization") - - parser.add_argument('--fp16', action='store_true', - help="Whether to use 16-bit (mixed) precision (through torch.cuda.amp) instead of 32-bit") - parser.add_argument("--local_rank", type=int, default=-1, - help="For distributed training: local_rank") - parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.") - parser.add_argument('--server_port', type=str, default='', help="For distant debugging.") - parser.add_argument('--text_chunk', action='store_true', help="") - parser.add_argument('--use_reverse', action='store_true', help="") - parser.add_argument('--with_code_loss', type=bool, default=True, help="") - parser.add_argument('--use_tokenize', action='store_true', help="") - - parser.add_argument("--max_seq", default=80, type=int, - help="") - parser.add_argument('--gradient_checkpointing', action='store_true', help='enable gradient checkpointing') - parser.add_argument('--use_multi_tensor_adamw', action='store_true', - help='use torch.optim._multi_tensor.AdamW instead of transformers.AdamW') - - args = parser.parse_args() - if args.use_multi_tensor_adamw: - try: - # overwrite the previous imported AdamW - # https://huggingface.co/transformers/performance.html#faster-optimizer - from torch.optim._multi_tensor import AdamW - except ImportError as e: - print(e) - - if args.model_type in ["bert", "roberta", "distilbert"] and not args.mlm: - raise ValueError("BERT and RoBERTa do not have LM heads but masked LM heads. They must be run using the --mlm " - "flag (masked language modeling).") - if args.eval_data_file is None and args.do_eval: - raise ValueError( - "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file " - "or remove the --do_eval argument.") - - if os.path.exists(args.output_dir) and os.listdir( - args.output_dir) and args.do_train and not args.overwrite_output_dir: - raise ValueError( - "Output directory ({}) already exists and is not empty. 
-                args.output_dir))
-
-    # Setup distant debugging if needed
-    if args.server_ip and args.server_port:
-        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
-        import ptvsd
-        print("Waiting for debugger attach")
-        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
-        ptvsd.wait_for_attach()
-
-    # Setup logging before `torch.distributed.init_process_group` is called
-    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
-                        datefmt='%m/%d/%Y %H:%M:%S',
-                        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
-
-    # Setup CUDA, GPU & distributed training
-    if args.local_rank == -1 or args.no_cuda:
-        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
-        args.n_gpu = torch.cuda.device_count()
-    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
-        torch.cuda.set_device(args.local_rank)
-        device = torch.device("cuda", args.local_rank)
-        torch.distributed.init_process_group(backend='nccl')
-        args.n_gpu = 1
-    args.device = device
-    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
-                   args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
-
-    # Set seed
-    set_seed(args)
-
-    # Load pretrained model and tokenizer
-    if args.local_rank not in [-1, 0]:
-        torch.distributed.barrier()  # Barrier to make sure only the first process in distributed training download model & vocab
-
-    if args.fp16:
-        MODEL_CLASSES['gpt2'] = (GPT2Config, AmpGPT2LMHeadModel, GPT2TokenizerFast)
-    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
-    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path,
-                                          cache_dir=args.cache_dir if args.cache_dir else None)
-    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
-    # tokenizer = BertTokenizer(vocab_file='../GPT2-chitchat/vocabulary/vocab_small.txt', eos_token='<T>',
-                                                do_lower_case=args.do_lower_case,
-                                                cache_dir=args.cache_dir if args.cache_dir else None)
-
-    if args.block_size <= 0:
-        args.block_size = tokenizer.max_len_single_sentence  # Our input block size will be the max possible for the model
-    args.block_size = min(args.block_size, tokenizer.max_len_single_sentence)
-    model = model_class.from_pretrained(args.model_name_or_path,
-                                        from_tf=bool('.ckpt' in args.model_name_or_path),
-                                        config=config,
-                                        cache_dir=args.cache_dir if args.cache_dir else None)
-    if model.config.vocab_size != len(tokenizer):
-        logger.info('resize token embeddings, since there may be added tokens.')
-        model.resize_token_embeddings(len(tokenizer))
-    model.to(args.device)
-    if args.gradient_checkpointing:
-        # https://huggingface.co/transformers/performance.html#gradient-checkpointing
-        try_enable_gradient_checkpointing(model)
-
-    if args.local_rank == 0:
-        torch.distributed.barrier()  # End of barrier to make sure only the first process in distributed training download model & vocab
-
-    logger.info("Training/evaluation parameters %s", args)
-
-    # Training
-    if args.do_train:
-        if args.local_rank not in [-1, 0]:
-            torch.distributed.barrier()  # Barrier to make sure only the first process in distributed training process the dataset, and the others will use the cache
-
-        train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False)
-
-        if args.local_rank == 0:
-            torch.distributed.barrier()
-
-        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
-        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
-
-    # Saving best-practices: if you use save_pretrained for the model and tokenizer, you can reload them using from_pretrained()
-    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
-        # Create output directory if needed
-        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
-            os.makedirs(args.output_dir)
-
-        logger.info("Saving model checkpoint to %s", args.output_dir)
-        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
-        # They can then be reloaded using `from_pretrained()`
-        model_to_save = model.module if hasattr(model,
-                                                'module') else model  # Take care of distributed/parallel training
-        model_to_save.save_pretrained(args.output_dir)
-        tokenizer.save_pretrained(args.output_dir)
-
-        # Good practice: save your training arguments together with the trained model
-        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
-
-        # Load a trained model and vocabulary that you have fine-tuned
-        model = model_class.from_pretrained(args.output_dir)
-        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
-        model.to(args.device)
-
-    # Evaluation
-    results = {}
-    if args.do_eval and args.local_rank in [-1, 0]:
-        checkpoints = [args.output_dir]
-        if args.eval_all_checkpoints:
-            checkpoints = list(
-                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
-            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
-        logger.info("Evaluate the following checkpoints: %s", checkpoints)
-        for checkpoint in checkpoints:
-            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
-            prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else ""
-
-            model = model_class.from_pretrained(checkpoint)
-            model.to(args.device)
-            result = evaluate(args, model, tokenizer, prefix=prefix)
-            result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
-            results.update(result)
-    return results
-
-
-if __name__ == "__main__":
-    main()
diff --git a/convlab2/nlg/scgpt/train.sh b/convlab2/nlg/scgpt/train.sh
new file mode 100644
index 0000000000000000000000000000000000000000..12f190670e835c0d1f8f5df198dade11ac59ce77
--- /dev/null
+++ b/convlab2/nlg/scgpt/train.sh
@@ -0,0 +1 @@
+CUDA_VISIBLE_DEVICES="1" python -m torch.distributed.launch --nproc_per_node 1 main.py --do_train --dataset multiwoz21
\ No newline at end of file
diff --git a/convlab2/nlg/scgpt/util.py b/convlab2/nlg/scgpt/util.py
new file mode 100644
index 0000000000000000000000000000000000000000..b0efa589fc485706fa10b9129be8081ddfaf4fdb
--- /dev/null
+++ b/convlab2/nlg/scgpt/util.py
@@ -0,0 +1,94 @@
+import torch
+
+def act2str(act):
+    """Convert unified dataset dialog act dict to string.
+    act:
+    {'categorical': [{'intent': 'inform', 'domain': 'restaurant', 'slot': 'area', 'value': 'north'}],
+     'non-categorical': [{'intent': 'inform', 'domain': 'hotel', 'slot': 'area', 'value': 'north'}],
+     'binary': [{'intent': 'request', 'domain': 'hotel', 'slot': 'area'}]}
+    return:
+    restaurant { inform ( area = north ) } | hotel { inform ( area = north ) @ request ( area ) }
+    """
+    old_format_dict = convert2old_format(act)
+    return dict2seq(old_format_dict)
+
+
+def build_mask(max_len, seq_lens, use_float=False):
+    """
+    Make boolean padding masks from a batch of sequence lengths.
+    e.g., input: max_len=4, seq_lens=[2,3], return: [[1,1,0,0], [1,1,1,0]]
+    Args:
+        max_len (int): maximum sequence length
+        seq_lens (torch.Tensor): (batch)
+    Returns:
+        mask (torch.Tensor): (batch, max_len)
+    """
+    a = torch.arange(max_len)[None, :]
+    b = seq_lens[:, None].cpu()
+    mask = a < b
+    if use_float:
+        mask = mask.float()
+    return mask
+
+
+def convert2old_format(act):
+    """
+    dict: {'categorical': [{'intent': 'request', 'domain': 'hotel', 'slot': 'area'}], 'non-categorical': [...], 'binary': [...]}
+    """
+    new_act = {}
+    for key in act:
+        for item_dic in act[key]:
+            domain = item_dic['domain']
+            if domain not in new_act:
+                new_act[domain] = {}
+            intent = item_dic['intent']
+            if intent not in new_act[domain]:
+                new_act[domain][intent] = []
+            slot = item_dic['slot']
+            if 'value' in item_dic:
+                value = item_dic['value']
+            else:
+                value = None
+            new_act[domain][intent].append([slot, value])
+    return new_act
+
+
+def dict2seq(d):
+    '''
+    dict: [domain: { intent: [slot, value] }]
+    seq: [domain { intent ( slot = value ; ) @ } | ]
+    '''
+    s = ''
+    first_domain = True
+    first_intent = True
+    first_slot = True
+    for domain in d:
+        if not first_domain:
+            s += ' | '
+        s += domain
+        s += ' { '
+        first_domain = False
+        first_intent = True
+        for intent in d[domain]:
+            if not first_intent:
+                s += ' @ '
+            s += intent
+            s += ' ( '
+            first_intent = False
+            first_slot = True
+            for slot, value in d[domain][intent]:
+                if not first_slot:
+                    s += ' ; '
+                s += slot
+                if value:
+                    s += ' = '
+                    s += value
+                first_slot = False
+            s += ' )'
+        s += ' }'
+    return s.lower()
+
+
+if __name__ == '__main__':
+    ipt = {'categorical': [{'intent': 'inform', 'domain': 'restaurant', 'slot': 'area', 'value': 'north'}], 'non-categorical': [{'intent': 'inform', 'domain': 'hotel', 'slot': 'area', 'value': 'north'}], 'binary': [{'intent': 'request', 'domain': 'hotel', 'slot': 'area'}]}
+    print(act2str(ipt))
\ No newline at end of file
diff --git a/convlab2/nlg/scgpt/utils.py b/convlab2/nlg/scgpt/utils.py
deleted file mode 100644
index 7fefc166f096c307590fd5b0478c4db1cf551e7f..0000000000000000000000000000000000000000
--- a/convlab2/nlg/scgpt/utils.py
+++ /dev/null
@@ -1,98 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on Tue Mar 24 18:34:55 2020
-
-@author: truthless
-"""
-
-def tuple2dict(t):
-    '''
-    tuple: [(intent, domain, slot, value)]
-    dict: [domain: { intent: [slot, value] }]
-    '''
-    d = {}
-    for intent, domain, slot, value in t:
-        if domain not in d:
-            d[domain] = {}
-        if intent not in d[domain]:
-            d[domain][intent] = []
-        if slot == 'none' or slot is None:
-            continue
-        d[domain][intent].append([slot, value])
-    return d
-
-def dict2dict(D):
-    '''
-    dict: [domain-intent: [slot, value]]
-    dict: [domain: { intent: [slot, value] }]
-    '''
-    d = {}
-    for domint in D:
-        domain, intent = domint.split('-')
-        if domain not in d:
-            d[domain] = {}
-        if intent not in d[domain]:
-            d[domain][intent] = []
-        for slot, value in D[domint]:
-            if slot == 'none' or slot is None:
-                continue
-            d[domain][intent].append([slot, value])
-    return d
-
-def dict2seq(d):
-    '''
-    dict: [domain: { intent: [slot, value] }]
-    seq: [domain { intent ( slot = value ; ) @ } | ]
-    '''
-    s = ''
-    first_domain = True
-    first_intent = True
-    first_slot = True
-    for domain in d:
-        if not first_domain:
-            s += ' | '
-        s += domain
-        s += ' { '
-        first_domain = False
-        first_intent = True
-        for intent in d[domain]:
-            if not first_intent:
-                s += ' @ '
-            s += intent
-            s += ' ( '
-            first_intent = False
-            first_slot = True
-            for slot, value in d[domain][intent]:
-                if not first_slot:
-                    s += ' ; '
-                s += slot
-                if value:
-                    s += ' = '
-                    s += value
-                first_slot = False
-            s += ' )'
-        s += ' }'
-    return s.lower()
-
-def tuple2seq(t):
-    d = tuple2dict(t)
-    s = dict2seq(d)
-    return s
-
-if __name__ == '__main__':
-    da_tuple = [('Inform', 'Booking', 'none', 'none'), ('Inform', 'Hotel', 'Price', 'cheap'), ('Inform', 'Hotel', 'Choice', '1'), ('Inform', 'Hotel', 'Parking', 'none')]
-    da_dict = tuple2dict(da_tuple)
-    print(da_dict)
-    da_seq = dict2seq(da_dict)
-    print(da_seq)
-
-    da_tuple = [('Request', 'Hotel', 'Address', '?'), ('Request', 'Hotel', 'Area', '?'), ('Inform', 'Attraction', 'Area', 'center'), ('Inform', 'Hotel', 'Price', 'cheap')]
-    da_dict = tuple2dict(da_tuple)
-    print(da_dict)
-    da_seq = dict2seq(da_dict)
-    print(da_seq)
-
-    D = {'Hotel-Inform': [['Price', 'cheap'], ['Type', 'hotel']]}
-    da_dict = dict2dict(D)
-    print(da_dict)
-
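A minimal usage sketch of the helpers added in util.py, assuming the convlab2 package is importable from your environment: act2str linearizes a unified-format dialog act into the flat sequence described in its docstring, and build_mask turns a batch of sequence lengths into a padding mask.

```python
import torch
from convlab2.nlg.scgpt.util import act2str, build_mask

# Unified-format dialog act: categorical/non-categorical entries carry a value,
# binary entries (e.g. requests) do not.
act = {
    'categorical': [{'intent': 'inform', 'domain': 'restaurant', 'slot': 'area', 'value': 'north'}],
    'non-categorical': [{'intent': 'inform', 'domain': 'hotel', 'slot': 'area', 'value': 'north'}],
    'binary': [{'intent': 'request', 'domain': 'hotel', 'slot': 'area'}],
}

# Linearize the act into the sequence format consumed by SC-GPT.
print(act2str(act))
# restaurant { inform ( area = north ) } | hotel { inform ( area = north ) @ request ( area ) }

# Padding mask for two sequences of lengths 2 and 3, padded to length 4.
print(build_mask(max_len=4, seq_lens=torch.tensor([2, 3]), use_float=True))
# tensor([[1., 1., 0., 0.],
#         [1., 1., 1., 0.]])
```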