diff --git a/convlab2/nlg/evaluate.py b/convlab2/nlg/evaluate.py
index 5845f4338987db6161c5697c8e496f276c7393a4..1a4747b7f19a47f2c069e4ba286c9ad16763043b 100755
--- a/convlab2/nlg/evaluate.py
+++ b/convlab2/nlg/evaluate.py
@@ -5,16 +5,173 @@ Usage: python evaluate.py [MultiWOZ] [SCLSTM|TemplateNLG] [usr|sys]
 """
 import json
+import os
 import random
 import sys
+import itertools
 import zipfile
 import numpy
 import torch
 from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
 from pprint import pprint
 from tqdm import tqdm
 
+def slot_error(dialog_acts, utts):
+    hallucination = []
+    hallucinate = 0
+    missing = 0
+    total = 0
+
+    for acts, utt in zip(dialog_acts, utts):
+        for act in acts:
+            tmp_act = [x.lower() for x in act]
+            tmp_utt = utt.lower()
+            i, d, s, v = tmp_act
+            if i == 'inform':
+                total = total + 1
+                if not (v in tmp_utt):
+                    missing = missing + 1
+    return missing, total
+
+def fine_SER(dialog_acts, utts):
+    path = os.path.dirname(os.path.abspath(__file__))
+    path = os.path.join(path, 'template', 'multiwoz', 'label_maps.json')
+    with open(path, 'r') as mapping_file:
+        mappings = json.load(mapping_file)
+
+    path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+    path = os.path.join(path, 'data', 'multiwoz', 'ontology_nlg_eval.json')
+    with open(path, 'r') as entity_file:
+        possible_entity = json.load(entity_file)
+
+    entity_list = []
+    for key in possible_entity.keys():
+        entity_list = entity_list + possible_entity[key]
+
+    hallucinate = 0
+    missing = 0
+    total = 0
+
+    unk_token_count = 0
+    missing_dialogs = []
+    hallucination_dialogs = []
+
+    slot_span = []
+    domain_span = []
+
+    for acts, utt in zip(dialog_acts, utts):
+        hallucination_flag = False
+        tmp_utt = utt.lower()
+        origin_utt = utt.lower()
+        legal_act_flag = False
+
+        for act in acts:
+            missing_fact = None
+            missing_flag = False
+            tmp_act = [x.lower() for x in act]
+            i, d, s, v = tmp_act
+
+            if not (d in domain_span):
+                domain_span.append(d)
+            if not (s in slot_span):
+                slot_span.append(s)
+            # initialize all possible span keywords
+
+            if i in ['inform', 'recommend', 'offerbook', 'offerbooked', 'book', 'select']:
+                legal_act_flag = True
+                total = total + 1
+                if not (v in origin_utt) and v != 'none':
+                    exist_flag = False
+                    try:
+                        synonyms = mappings[v]
+                        for item in synonyms:
+                            if item in origin_utt:
+                                exist_flag = True
+                                tmp_utt = tmp_utt.replace(item, '')
+                                tmp_utt = tmp_utt.replace(s, '')
+                                # remove the matched span for hallucination detection
+                    except KeyError:
+                        pass
+                    if i in ['offerbook', 'offerbooked'] and v == 'none':
+                        if 'book' in origin_utt:
+                            exist_flag = True
+                            tmp_utt = tmp_utt.replace('book', '')
+                    if i in ['inform', 'recommend'] and v == 'none':
+                        if d in origin_utt:
+                            exist_flag = True
+                            tmp_utt = tmp_utt.replace(d, '')
+                    if exist_flag == False:
+                        missing_flag = True
+                        missing_fact = v
+                else:
+                    tmp_utt = tmp_utt.replace(v, '')
+                    tmp_utt = tmp_utt.replace(s, '')
+
+                if s in origin_utt:
+                    missing_flag = False
+                if s == 'booking' and ('book' in origin_utt or 'reserv' in origin_utt):
+                    missing_flag = False
+
+            elif i == 'request':
+                legal_act_flag = True
+                total = total + 1
+                if s == 'depart' or s == 'dest' or s == 'area':
+                    if not ('where' in origin_utt):
+                        if s in origin_utt:
+                            tmp_utt = tmp_utt.replace(s, '')
+                        else:
+                            missing_flag = True
+                            missing_fact = s
+                elif s == 'leave' or s == 'arrive':
+                    if not ('when' in origin_utt):
+                        if not ('what' in origin_utt and 'time' in origin_utt):
+                            missing_flag = True
+                            missing_fact = s
+                        else:
+                            tmp_utt = tmp_utt.replace('time', '')
+                else:
+                    tmp_utt = tmp_utt.replace(s, '')
+                    tmp_utt = tmp_utt.replace(d, '')
+
+                if s in origin_utt:
+                    missing_flag = False
+                if s == 'booking' and ('book' in origin_utt or 'reserv' in origin_utt):
+                    missing_flag = False
+
+            try:
+                tmp_utt = tmp_utt.replace(d, '')
+                tmp_utt = tmp_utt.replace(s, '')
+                if 'arrive' in s or 'leave' in s:
+                    tmp_utt = tmp_utt.replace('time', '')
+            except:
+                pass
+
+            if missing_flag == True:
+                missing = missing + 1
+                missing_dialogs.append(missing_fact)
+                missing_dialogs.append(acts)
+                missing_dialogs.append(utt)
+
+        for keyword in slot_span + entity_list:
+            if keyword in tmp_utt and len(keyword) >= 4 and legal_act_flag == True:
+                hallucination_flag = True
+                hallucinate = hallucinate + 1
+                hallucination_dialogs.append(keyword)
+                hallucination_dialogs.append(acts)
+                hallucination_dialogs.append(tmp_utt)
+                hallucination_dialogs.append(utt)
+                break
+
+    return missing, hallucinate, total, hallucination_dialogs, missing_dialogs
+
+
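[Review note] A minimal sanity check for `fine_SER`, assuming the patched `convlab2.nlg.evaluate` module is importable and the `label_maps.json` / `ontology_nlg_eval.json` files added later in this patch are in place; the act quadruples follow the `[intent, domain, slot, value]` layout used throughout the patch:

```python
# Hypothetical smoke test for fine_SER (not part of the patch).
from convlab2.nlg.evaluate import fine_SER

dialog_acts = [[["Inform", "Hotel", "Area", "centre"]]]  # one turn with one act
utts = ["it is located in the centre of town ."]         # the value is realized
missing, hallucinate, total, hal_log, miss_log = fine_SER(dialog_acts, utts)
print(missing, total)  # expected: 0 missing out of 1 countable act
```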
 def get_bleu4(dialog_acts, golden_utts, gen_utts):
     das2utts = {}
     for das, utt, gen in zip(dialog_acts, golden_utts, gen_utts):
@@ -55,36 +212,52 @@ if __name__ == '__main__':
     numpy.random.seed(seed)
     torch.manual_seed(seed)
 
-    if len(sys.argv) != 4:
+    if len(sys.argv) < 4:
         print("usage:")
         print("\t python evaluate.py dataset model role")
         print("\t dataset=MultiWOZ, CrossWOZ, or Camrest")
-        print("\t model=SCLSTM, or TemplateNLG")
+        print("\t model=SCLSTM, SCLSTM_NoUNK, SCGPT or TemplateNLG")
         print("\t role=usr/sys")
+        print("\t [Optional] model_file")
         sys.exit()
     dataset_name = sys.argv[1]
     model_name = sys.argv[2]
     role = sys.argv[3]
+    model_file = sys.argv[4] if len(sys.argv) >= 5 else None
     if dataset_name == 'MultiWOZ':
         if model_name == 'SCLSTM':
             from convlab2.nlg.sclstm.multiwoz import SCLSTM
             if role == 'usr':
-                model = SCLSTM(is_user=True, use_cuda=True)
+                model = SCLSTM(is_user=True, use_cuda=True, unk_suppress=False)
+            elif role == 'sys':
+                model = SCLSTM(is_user=False, use_cuda=True, unk_suppress=False)
+        elif model_name == 'SCLSTM_NoUNK':
+            from convlab2.nlg.sclstm.multiwoz import SCLSTM
+            if role == 'usr':
+                model = SCLSTM(is_user=True, use_cuda=True, unk_suppress=True)
             elif role == 'sys':
-                model = SCLSTM(is_user=False, use_cuda=True)
+                model = SCLSTM(is_user=False, use_cuda=True, unk_suppress=True)
         elif model_name == 'TemplateNLG':
             from convlab2.nlg.template.multiwoz import TemplateNLG
             if role == 'usr':
                 model = TemplateNLG(is_user=True)
             elif role == 'sys':
                 model = TemplateNLG(is_user=False)
+        elif model_name == 'SCGPT':
+            from convlab2.nlg.scgpt.multiwoz import SCGPT
+            if model_file is not None:
+                print(f"load model at {model_file}")
+            if role == 'usr':
+                model = SCGPT(model_file, is_user=True)
+            elif role == 'sys':
+                model = SCGPT(model_file, is_user=False)
         else:
-            raise Exception("Available models: SCLSTM, TEMPLATE")
+            raise Exception("Available models: SCLSTM, SCLSTM_NoUNK, SCGPT, TemplateNLG")
 
         from convlab2.util.dataloader.module_dataloader import SingleTurnNLGDataloader
         from convlab2.util.dataloader.dataset_dataloader import MultiWOZDataloader
         dataloader = SingleTurnNLGDataloader(dataset_dataloader=MultiWOZDataloader())
-        data = dataloader.load_data(data_key='test', role=role)['test']
+        data = dataloader.load_data(data_key='all', role=role, session_id=True)['test']
 
         dialog_acts = []
         golden_utts = []
         gen_utts = []
@@ -93,17 +266,51 @@ if __name__ == '__main__':
 
         sen_num = 0
 
+        # sys.stdout = open(sys.argv[2] + '-' + sys.argv[3] + '-' + 'evaluate_logs_neo.txt', 'w')
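[Review note] With the extended argument handling above, typical invocations would look like this (the checkpoint path is illustrative):

```
python evaluate.py MultiWOZ TemplateNLG usr
python evaluate.py MultiWOZ SCGPT sys /path/to/scgpt_checkpoint
```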
+        assert 'utterance' in data and 'dialog_act' in data and 'session_id' in data
+        assert len(data['utterance']) == len(data['dialog_act']) == len(data['session_id'])
+
+        # Turns during the same session should be contiguous, so we can call init_session at the first turn of a new session.
+        # This is necessary for SCGPT, but unnecessary for SCLSTM and TemplateNLG.
+        is_first_turn = []
+        for _, iterator in itertools.groupby(data['session_id']):
+            is_first_turn.append(True)
+            next(iterator)
+            is_first_turn.extend(False for _ in iterator)
         for i in tqdm(range(len(data['utterance']))):
+            if is_first_turn[i]:
+                model.init_session()
             dialog_acts.append(data['dialog_act'][i])
             golden_utts.append(data['utterance'][i])
             gen_utts.append(model.generate(data['dialog_act'][i]))
+            # print(dialog_acts[-1])
+            # print(golden_utts[-1])
+            # print(gen_utts[-1])
+
+        print("Calculate SER for golden responses")
+        missing, hallucinate, total, hallucination_dialogs, missing_dialogs = fine_SER(dialog_acts, golden_utts)
+        print("Golden response Missing acts: {}, Total acts: {}, Hallucinations {}, SER {}".format(missing, total, hallucinate, missing/total))
+
+        print("Calculate SER")
+        missing, hallucinate, total, hallucination_dialogs, missing_dialogs = fine_SER(dialog_acts, gen_utts)
+        # with open('{}-{}-genutt_neo.txt'.format(sys.argv[2], sys.argv[3]), mode='wt', encoding='utf-8') as gen_diag:
+        #     for x in gen_utts:
+        #         gen_diag.writelines(str(x)+'\n')
 
-        bleu4 = get_bleu4(dialog_acts, golden_utts, gen_utts)
+        # with open('{}-{}-hallucinate_neo.txt'.format(sys.argv[2], sys.argv[3]), mode='wt', encoding='utf-8') as hal_diag:
+        #     for x in hallucination_dialogs:
+        #         hal_diag.writelines(str(x)+'\n')
+
+        # with open('{}-{}-missing_neo.txt'.format(sys.argv[2], sys.argv[3]), mode='wt', encoding='utf-8') as miss_diag:
+        #     for x in missing_dialogs:
+        #         miss_diag.writelines(str(x)+'\n')
+        print("{} Missing acts: {}, Total acts: {}, Hallucinations {}, SER {}".format(sys.argv[2], missing, total, hallucinate, missing/total))
 
         print("Calculate bleu-4")
+        bleu4 = get_bleu4(dialog_acts, golden_utts, gen_utts)
         print("BLEU-4: %.4f" % bleu4)
 
-        print('Model on {} sentences role={}'.format(len(data['utterance']), role))
+        # sys.stdout.close()
     else:
         raise Exception("currently supported dataset: MultiWOZ")
diff --git a/convlab2/nlg/scgpt/README.md b/convlab2/nlg/scgpt/README.md
index 8a6a47c9c85ca0633dc8001db09b401a53996d13..5eed2c0fc167cd9ee79d66e3252be25060bb294d 100644
--- a/convlab2/nlg/scgpt/README.md
+++ b/convlab2/nlg/scgpt/README.md
@@ -21,9 +21,22 @@ tar -xvf scgpt.tar.gz
 Then
 ``` python
-python train.py --output_dir=$output_dir$ --model_type=scgpt --model_name_or_path=gpt2 --do_train --do_eval --eval_data_file=$test_file$ --overwrite_cache --use_tokenize --train_data_file=$train_file$ --overwrite_output_dir
+python train.py --output_dir=trained_output --model_type=gpt2 --model_name_or_path=scgpt --do_train --do_eval --eval_data_file=multiwoz/data/test_sys.txt --use_tokenize --train_data_file=multiwoz/data/train_sys.txt --overwrite_output_dir
 ```
 
+Some useful optional training arguments:
+* `--gradient_accumulation_steps xxx`
+* `--fp16`: if set, `--per_gpu_train_batch_size` should be a multiple of 8
+* `--max_seq xxx`: should be larger than the length of the longest sequence, e.g. `--max_seq 1024`; the script uses a dynamic sequence length at each training step
+* `--gradient_checkpointing`: allows a larger `per_gpu_train_batch_size`
+* `--use_multi_tensor_adamw`: reportedly a faster optimizer
+
+Distributed data parallel:
+
+If multiple GPUs are available, you can run `python -m torch.distributed.launch --nproc_per_node CUDA_COUNT train.py ...`, where `CUDA_COUNT` is the number of GPUs and `...` stands for the usual `train.py` arguments.
+
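[Review note] A concrete DDP invocation implied by the README text above, assuming 4 GPUs and the system-side data files:

```
python -m torch.distributed.launch --nproc_per_node 4 train.py --output_dir=trained_output --model_type=gpt2 --model_name_or_path=scgpt --do_train --do_eval --eval_data_file=multiwoz/data/test_sys.txt --use_tokenize --train_data_file=multiwoz/data/train_sys.txt --overwrite_output_dir
```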
 
 ## Use
 
 ```python
diff --git a/convlab2/nlg/scgpt/modeling_utils.py b/convlab2/nlg/scgpt/modeling_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..a8b3f6ddfc6b7347c624446bf7869c67d3064cc1
--- /dev/null
+++ b/convlab2/nlg/scgpt/modeling_utils.py
@@ -0,0 +1,53 @@
+import warnings
+from contextlib import nullcontext
+from typing import TYPE_CHECKING
+import torch.cuda.amp as amp
+import transformers
+from transformers import GPT2LMHeadModel
+
+
+# reference: https://pytorch.org/docs/master/notes/amp_examples.html
+class AmpGPT2LMHeadModel(GPT2LMHeadModel):
+    if TYPE_CHECKING:
+        # For the IDE's code hinting
+        forward = GPT2LMHeadModel.forward
+    else:
+        def forward(self, *args, **kwargs):
+            with amp.autocast():
+                return super().forward(*args, **kwargs)
+
+
+def try_enable_gradient_checkpointing(model: "transformers.modeling_utils.PreTrainedModel"):
+    if model.supports_gradient_checkpointing:
+        model.gradient_checkpointing_enable()
+    else:
+        warnings.warn(f"{type(model)} doesn't support gradient_checkpointing")
+
+
+class AmpHelper:
+    """
+    References:
+        https://pytorch.org/docs/master/notes/amp_examples.html
+    """
+    def __init__(self, use_amp=True):
+        self.use_amp = use_amp
+        self.might_enable_autocast = amp.autocast() if use_amp else nullcontext()
+        self.scaler = amp.GradScaler()
+
+    def backward(self, loss):
+        if self.use_amp:
+            return self.scaler.scale(loss).backward()
+        else:
+            return loss.backward()
+
+    def step(self, optimizer):
+        if self.use_amp:
+            self.scaler.step(optimizer)
+            self.scaler.update()
+        else:
+            optimizer.step()
+
+    def might_unscale_(self, optimizer):
+        if self.use_amp:
+            # Unscales the gradients of optimizer's assigned params in-place
+            self.scaler.unscale_(optimizer)
\ No newline at end of file
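[Review note] A minimal sketch of how `AmpHelper` is meant to wrap one training step, mirroring its use in `train.py` further down; any `nn.Module` works, and a CUDA device is assumed:

```python
# Hypothetical smoke test for AmpHelper (not part of the patch).
import torch
from convlab2.nlg.scgpt.modeling_utils import AmpHelper

model = torch.nn.Linear(4, 1).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
amp_helper = AmpHelper(use_amp=True)

x, y = torch.randn(8, 4).cuda(), torch.randn(8, 1).cuda()
with amp_helper.might_enable_autocast:    # autocast is active only when use_amp=True
    loss = torch.nn.functional.mse_loss(model(x), y)
amp_helper.backward(loss)                 # scales the loss before backward when AMP is on
amp_helper.might_unscale_(optimizer)      # unscale so gradient clipping sees true values
torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
amp_helper.step(optimizer)                # scaler.step + scaler.update (or a plain step)
```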
diff --git a/convlab2/nlg/scgpt/multiwoz/preprocess.py b/convlab2/nlg/scgpt/multiwoz/preprocess.py
index 27c5e9005f3ee36bf7108d8477050a61fd164d6e..d7a47bd2bf7a9dc1772bc861afc63beb6c3ccd8f 100644
--- a/convlab2/nlg/scgpt/multiwoz/preprocess.py
+++ b/convlab2/nlg/scgpt/multiwoz/preprocess.py
@@ -1,12 +1,12 @@
 # -*- coding: utf-8 -*-
 """
 Created on Mon Sep 14 11:38:53 2020
-
 @author: truthless
 """
 import os
 import json
+from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser
 from convlab2.nlg.scgpt.utils import dict2dict, dict2seq
 import zipfile
@@ -15,65 +15,6 @@ def read_zipped_json(filepath, filename):
     archive = zipfile.ZipFile(filepath, 'r')
     return json.load(archive.open(filename))
 
-cur_dir = os.path.dirname(os.path.abspath(__file__))
-data_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(
-    cur_dir)))), 'data/multiwoz/')
-
-keys = ['train', 'val', 'test']
-data = {}
-for key in keys:
-    data_key = read_zipped_json(os.path.join(data_dir, key + '.json.zip'), key + '.json')
-    print('load {}, size {}'.format(key, len(data_key)))
-    data = dict(data, **data_key)
-
-with open(os.path.join(data_dir, 'valListFile'), 'r') as f:
-    val_list = f.read().splitlines()
-with open(os.path.join(data_dir, 'testListFile'), 'r') as f:
-    test_list = f.read().splitlines()
-
-results = {}
-results_val = {}
-results_test = {}
-
-for title, sess in data.items():
-    logs = sess['log']
-    turns = []
-    turn = {'turn':0, 'sys':'', 'sys_da':''}
-    current_domain = None
-    for i, diag in enumerate(logs):
-        text = diag['text']
-        da = diag['dialog_act']
-        span = diag['span_info']
-        if i % 2 == 0:
-            turn['usr'] = text
-            if current_domain:
-                da = eval(str(da).replace('Booking', current_domain))
-                span = eval(str(span).replace('Booking', current_domain))
-            turn['usr_da'] = da
-            turn['usr_span'] = span
-            turns.append(turn)
-        else:
-            turn = {'turn': i//2 +1}
-            turn['sys'] = text
-            turn['sys_da'] = da
-            turn['sys_span'] = span
-        for key in da:
-            domain = key.split('-')[0]
-            if domain not in ['general', 'Booking']:
-                current_domain = domain
-    title = title
-    if title in val_list:
-        current = results_val
-    elif title in test_list:
-        current = results_test
-    else:
-        current = results
-    current[title] = turns
-
-results = eval(str(results).replace(" n't", " not"))
-results_val = eval(str(results_val).replace(" n't", " not"))
-results_test = eval(str(results_test).replace(" n't", " not"))
-
 def init_domain():
     return {'Attraction':False,
             'Hospital':False,
@@ -83,25 +24,105 @@ def init_domain():
             'Taxi':False,
             'Train':False}
 
-def write_file(name, data):
+def write_file(name, data, role='usr'):
     with open(f'{name}.txt', 'w', encoding='utf-8') as f:
         for ID in data:
             sess = data[ID]
             sess_domains = init_domain()
             for turn in sess:
-                if not turn['usr_da']:
-                    continue
-                turn['usr_da'] = eval(str(turn['usr_da']).replace('Bus','Train'))
-                da_seq = dict2seq(dict2dict(turn['usr_da'])).replace('&', 'and')
-                domains = set([key.split('-')[0] for key in turn['usr_da'].keys()])
+                if role == 'usr':
+                    if not turn['usr_da']:
+                        continue
+                    turn['usr_da'] = eval(str(turn['usr_da']).replace('Bus','Train'))
+                    da_seq = dict2seq(dict2dict(turn['usr_da'])).replace('&', 'and')
+                    domains = set([key.split('-')[0] for key in turn['usr_da'].keys()])
+                elif role == 'sys':
+                    if not turn['sys_da']:
+                        continue
+                    turn['sys_da'] = eval(str(turn['sys_da']).replace('Bus','Train'))
+                    da_seq = dict2seq(dict2dict(turn['sys_da'])).replace('&', 'and')
+                    domains = set([key.split('-')[0] for key in turn['sys_da'].keys()])
+                else:
+                    raise NameError('Invalid Role: Select usr/sys.')
                 for domain in domains:
                     if domain not in ['general', 'Booking'] and not sess_domains[domain]:
                         da_seq = da_seq.replace(domain.lower(), domain.lower()+' *', 1)
                         sess_domains[domain] = True
-                da_uttr = turn['usr'].replace(' bus ', ' train ').replace('&', 'and')
+                if role == 'usr':
+                    da_uttr = turn['usr'].replace(' bus ', ' train ').replace('&', 'and')
+                elif role == 'sys':
+                    da_uttr = turn['sys'].replace(' bus ', ' train ').replace('&', 'and')
                 f.write(f'{da_seq} & {da_uttr}\n')
 
-if not os.path.exists(os.path.join(cur_dir,'data')):
-    os.makedirs(os.path.join(cur_dir, 'data'))
-write_file(os.path.join(cur_dir, 'data/train'), dict(results, **results_val))
-write_file(os.path.join(cur_dir, 'data/test'), results_test)
+
+if __name__ == '__main__':
+    parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
+    parser.add_argument('--role', type=str, default='usr')
+    args = parser.parse_args()
+
+    cur_dir = os.path.dirname(os.path.abspath(__file__))
+    data_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(
+        cur_dir)))), 'data/multiwoz/')
+
+    keys = ['train', 'val', 'test']
+    data = {}
+    for key in keys:
+        data_key = read_zipped_json(os.path.join(data_dir, key + '.json.zip'), key + '.json')
+        print('load {}, size {}'.format(key, len(data_key)))
+        data = dict(data, **data_key)
+
+    with open(os.path.join(data_dir, 'valListFile'), 'r') as f:
+        val_list = f.read().splitlines()
+    with open(os.path.join(data_dir, 'testListFile'), 'r') as f:
+        test_list = f.read().splitlines()
+
+    results = {}
+    results_val = {}
+    results_test = {}
+
+    for title, sess in data.items():
+        logs = sess['log']
+        turns = []
+        turn = {'turn': 0, 'sys': '', 'sys_da': '', 'usr': '', 'usr_da': ''}
+        current_domain = None
+        for i, diag in enumerate(logs):
+            text = diag['text']
+            da = diag['dialog_act']
+            span = diag['span_info']
+            if current_domain:
+                da = eval(str(da).replace('Booking', current_domain))
+                span = eval(str(span).replace('Booking', current_domain))
+            if i % 2 == 0:
+                turn['usr'] = text
+                turn['usr_da'] = da
+                turn['usr_span'] = span
+                turns.append(turn)
+            else:
+                turn = {'turn': i//2 + 1, 'sys': '', 'sys_da': '', 'usr': '', 'usr_da': ''}
+                turn['sys'] = text
+                turn['sys_da'] = da
+                turn['sys_span'] = span
+                for key in da:
+                    domain = key.split('-')[0]
+                    if domain not in ['general', 'Booking']:
+                        current_domain = domain
+        else:
+            if args.role == 'sys':
+                turns.append(turn)
+        if title in val_list:
+            current = results_val
+        elif title in test_list:
+            current = results_test
+        else:
+            current = results
+        current[title] = turns
+
+    results = eval(str(results).replace(" n't", " not"))
+    results_val = eval(str(results_val).replace(" n't", " not"))
+    results_test = eval(str(results_test).replace(" n't", " not"))
+
+    if not os.path.exists(os.path.join(cur_dir,'data')):
+        os.makedirs(os.path.join(cur_dir, 'data'))
+    write_file(os.path.join(cur_dir, f'data/train_{args.role}'), dict(results, **results_val), role=args.role)
+    write_file(os.path.join(cur_dir, f'data/test_{args.role}'), results_test, role=args.role)
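[Review note] With the new `--role` flag, regenerating both data variants would look like:

```
python preprocess.py --role usr   # writes data/train_usr.txt and data/test_usr.txt
python preprocess.py --role sys   # writes data/train_sys.txt and data/test_sys.txt
```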
diff --git a/convlab2/nlg/scgpt/multiwoz/scgpt.py b/convlab2/nlg/scgpt/multiwoz/scgpt.py
index 18f599a18bdd463c72e0c308764988b877165949..78f16f6e0b8562c7118a2a6118f0eb5b3287c828 100644
--- a/convlab2/nlg/scgpt/multiwoz/scgpt.py
+++ b/convlab2/nlg/scgpt/multiwoz/scgpt.py
@@ -2,6 +2,7 @@ import torch
 import numpy as np
 import os
 import zipfile
+from copy import deepcopy
 from transformers import GPT2LMHeadModel, GPT2Tokenizer
 from convlab2.nlg.scgpt.utils import tuple2seq
@@ -10,23 +11,31 @@ from convlab2.nlg.nlg import NLG
 from convlab2.util.file_util import cached_path
 
 MAX_LENGTH = int(10000)  # Hardcoded max length to avoid infinite loop
-DEFAULT_DIRECTORY = os.path.join(os.path.dirname(os.path.abspath(__file__)), "models")
-DEFAULT_ARCHIVE_FILE = os.path.join(DEFAULT_DIRECTORY, "nlg-gpt-multiwoz.zip")
 
 class SCGPT(NLG):
-    def __init__(self,
-                 archive_file=DEFAULT_ARCHIVE_FILE,
-                 use_cuda=True,
-                 is_user=False,
-                 model_file='https://convlab.blob.core.windows.net/convlab-2/nlg-gpt-multiwoz.zip'):
+    def __init__(self, model_file=None,
+                 use_cuda=True, is_user=False):
+        # If no model file is given, fall back to the default checkpoint
+        if not model_file:
+            if is_user:
+                model_file = 'https://convlab.blob.core.windows.net/convlab-2/nlg-gpt-multiwoz.zip'
+            else:
+                model_file = 'https://zenodo.org/record/5767426/files/neo_scgpt_system.zip'
+
+        # Load from file/url
         model_dir = os.path.dirname(os.path.abspath(__file__))
-        if not os.path.isfile(archive_file):
-            archive_file = cached_path(model_file)
-        archive = zipfile.ZipFile(archive_file, 'r')
+        if not os.path.isfile(model_file):
+            model_file = cached_path(model_file)
+        if not os.path.isdir(model_file):
+            archive = zipfile.ZipFile(model_file, 'r')
             archive.extractall(model_dir)
-
-        self.model_name_or_path = os.path.join(model_dir, 'multiwoz')
+            # Get the extracted model directory
+            model_file = archive.filelist[0].filename.replace('/', '')
+            self.model_name_or_path = os.path.join(model_dir, model_file)
+        else:
+            self.model_name_or_path = model_file
+
         self.length = 50
         self.num_samples = 5
         self.temperature = 1.0
@@ -34,6 +43,7 @@ class SCGPT(NLG):
         self.top_k = 50
         self.top_p = 0.9
         self.seed = 42
+        self.is_user = is_user
         self.stop_token = '<|endoftext|>'
 
         self.device = torch.device("cuda" if torch.cuda.is_available() and use_cuda else "cpu")
@@ -51,7 +61,9 @@ class SCGPT(NLG):
             self.length = self.model.config.max_position_embeddings  # No generation bigger than model size
         elif self.length < 0:
             self.length = self.MAX_LENGTH  # avoid infinite loop
-
+
+        self.init_session()
+
     def init_session(self):
         self.sess_domains = {'Attraction':False,
                              'Hospital':False,
@@ -59,14 +71,34 @@ class SCGPT(NLG):
                              'Police':False,
                              'Restaurant':False,
                              'Taxi':False,
-                             'Train':False}
+                             'Train':False,}
+        self.cur_domain = None
+        # if not self.is_user:
+        #     self.sess_domains['Booking'] = False
 
     def generate(self, meta):
+        # some dialog acts in the test data are empty
+        if not meta:
+            return 'No user action'
+
+        meta = deepcopy(meta)
+        for list_ in meta:
+            domain = list_[1]
+            if domain not in ('general', 'Booking'):
+                self.cur_domain = domain
+        for i, list_ in enumerate(meta):
+            list_ = list(list_)
+            if list_[1] == 'Booking':
+                if self.cur_domain is not None:
+                    list_[1] = self.cur_domain
+                    meta[i] = list_
+                else:
+                    print('`cur_domain` is None, but there is `Booking` in dialog action.')
         raw_text = tuple2seq(meta)
         domains = set([item[1] for item in meta])
         for domain in domains:
-            if domain != 'general' and not self.sess_domains[domain]:
+            if domain not in ('general', 'Booking') and not self.sess_domains[domain]:
                 raw_text = raw_text.replace(domain.lower(), domain.lower()+ ' *', 1)
                 self.sess_domains[domain] = True
         context_tokens = self.tokenizer.encode(raw_text, add_special_tokens=False)
@@ -88,4 +120,4 @@ class SCGPT(NLG):
             text = text.split('& ')[-1]
             text = text[: text.find(self.stop_token) if self.stop_token else None]
 
-        return text
\ No newline at end of file
+        return text
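[Review note] A minimal sketch of the patched wrapper's new defaults (the system checkpoint is downloaded from the Zenodo URL above on first use; acts use the `[intent, domain, slot, value]` layout):

```python
# Hypothetical usage of the patched SCGPT wrapper (not part of the patch).
from convlab2.nlg.scgpt.multiwoz import SCGPT

nlg = SCGPT(is_user=False)   # model_file=None falls back to the default system checkpoint
nlg.init_session()           # resets sess_domains and cur_domain between dialogs
print(nlg.generate([["Inform", "Hotel", "Area", "centre"]]))
```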
diff --git a/convlab2/nlg/scgpt/train.py b/convlab2/nlg/scgpt/train.py
index 775688bbd63e116da42d5f02ecb78930c823a229..0878f31353735ede8b2036ec1f46ef56ce129bed 100644
--- a/convlab2/nlg/scgpt/train.py
+++ b/convlab2/nlg/scgpt/train.py
@@ -9,33 +9,28 @@ import random
 import re
 import shutil
-import sys
-
 import numpy as np
 import torch
+from tqdm import tqdm, trange
 from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler
 from torch.utils.data.distributed import DistributedSampler
 try:
     from torch.utils.tensorboard import SummaryWriter
-except:
+except ImportError:
     from tensorboardX import SummaryWriter
-from tqdm import tqdm, trange
-
 from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup,
-                          BertConfig, BertForMaskedLM, BertTokenizer,
-                          GPT2Config, GPT2LMHeadModel, GPT2Tokenizer,
-                          OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer,
-                          RobertaConfig, RobertaForMaskedLM, RobertaTokenizer,
-                          DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer, BertTokenizer)
-
+                          BertConfig, BertForMaskedLM, GPT2Config, GPT2LMHeadModel, GPT2Tokenizer,
+                          OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer, GPT2TokenizerFast,
+                          RobertaConfig, RobertaForMaskedLM, RobertaTokenizer,
+                          DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer, BertTokenizer)
+from convlab2.nlg.scgpt.modeling_utils import AmpGPT2LMHeadModel, try_enable_gradient_checkpointing, AmpHelper
 
 logger = logging.getLogger(__name__)
-
 MODEL_CLASSES = {
-    'gpt2': (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer),
+    'gpt2': (GPT2Config, GPT2LMHeadModel, GPT2TokenizerFast),
     'openai-gpt': (OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
     'bert': (BertConfig, BertForMaskedLM, BertTokenizer),
     'roberta': (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer),
@@ -43,11 +38,20 @@ MODEL_CLASSES = {
 }
 
 
+def closest_multiple_of_8(n):
+    """
+    Returns:
+        the smallest multiple of 8 that is >= n
+    """
+    return ((n + 7) >> 3) << 3
+
+
 class TextDataset(Dataset):
     def __init__(self, tokenizer, args, file_path='train', block_size=512, max_seq=80):
         assert os.path.isfile(file_path)
         directory, filename = os.path.split(file_path)
-        cached_features_file = os.path.join(directory, args.model_name_or_path + '_cached_lm_' + str(block_size) + '_seqlen_' + str(max_seq) + '_' + filename)
+        cached_features_file = os.path.join(directory, args.model_name_or_path + '_cached_lm_' + str(
+            block_size) + '_seqlen_' + str(max_seq) + '_' + filename)
 
         if os.path.exists(cached_features_file) and not args.overwrite_cache:
             logger.info("Loading features from cached file %s", cached_features_file)
@@ -68,12 +72,11 @@ class TextDataset(Dataset):
                     self.examples.append(tokenized_text)
 
                 if args.text_chunk:
-                    for i in range(0, len(tokenized_text)-block_size+1, block_size): # Truncate in block of block_size
-                        self.examples.append(tokenizer.build_inputs_with_special_tokens(tokenized_text[i:i+block_size]))
+                    for i in range(0, len(tokenized_text) - block_size + 1, block_size):  # Truncate in block of block_size
+                        self.examples.append(tokenizer.build_inputs_with_special_tokens(tokenized_text[i:i + block_size]))
 
-            # Note that we are loosing the last truncated example here for the sake of simplicity (no padding)
-            # If your dataset is small, first you should loook for a bigger one :-) and second you
+            # If your dataset is small, first you should look for a bigger one :-) and second you
             # can change this behavior by adding (model specific) padding.
logger.info("Saving features into cached file %s", cached_features_file) @@ -86,26 +89,30 @@ class TextDataset(Dataset): def __getitem__(self, item): return torch.tensor(self.examples[item]) + class TextSeqDataset(Dataset): - def __init__(self, tokenizer, args, file_path='train', block_size=512, max_seq=80, seperator=' & '): + def __init__(self, tokenizer, args, file_path='train', block_size=512, max_seq=80, separator=' & '): + max_seq = closest_multiple_of_8(max_seq) assert os.path.isfile(file_path) directory, filename = os.path.split(file_path) - cached_features_file = os.path.join(directory, args.output_dir.replace(os.sep, '_') + '_cached_lm_' + str(block_size) + '_seqlen_' + str(max_seq) + '_' + filename) + cached_features_file = os.path.join(directory, args.output_dir.replace(os.sep, '_') + '_cached_lm_' + str( + block_size) + '_seqlen_' + str(max_seq) + '_' + filename) if os.path.exists(cached_features_file) and not args.overwrite_cache: logger.info("Loading features from cached file %s", cached_features_file) with open(cached_features_file, 'rb') as handle: - self.examples = pickle.load(handle) + self.examples, self.masks, self.labels, self.seq_lengths = pickle.load(handle) else: logger.info("Creating features from dataset file at %s", directory) self.examples = [] self.labels = [] self.masks = [] + self.seq_lengths = [] with open(file_path, encoding="utf-8") as f: - for line in f: - line = line.strip() - raw_str = line.lower() - code_str = line.lower().split(seperator)[0] + seperator + for line in tqdm(f): + line = line.strip() + raw_str = line.lower() # do we need lowercase? + code_str = line.lower().split(separator)[0] + separator code_str = code_str.strip() if len(raw_str.split()) > max_seq -1: raw_str = ' '.join(raw_str.split()[:max_seq -1]) @@ -118,40 +125,44 @@ class TextSeqDataset(Dataset): code_str_len = len(tokenizer.convert_tokens_to_ids(code_str.split())) label = [-1] * max_seq - label[:len(tokenized_text)] = tokenized_text + label[:len(tokenized_text)] = tokenized_text mask = [1] * max_seq - if len(tokenized_text) < max_seq: + self.seq_lengths.append(len(tokenized_text)) mask[-(max_seq - len(tokenized_text)):] = [0] * (max_seq - len(tokenized_text)) # label[code_str_len:len(tokenized_text)] = tokenized_text[code_str_len:] - tokenized_text = tokenized_text + [0] * (max_seq - len(tokenized_text)) + tokenized_text = tokenized_text + [tokenizer.eos_token_id] * (max_seq - len(tokenized_text)) else: + self.seq_lengths.append(max_seq) tokenized_text = tokenized_text[:max_seq] - # label[code_str_len:] = tokenized_text[code_str_len:] - + # label[code_str_len:] = tokenized_text[code_str_len:] + self.examples.append(tokenized_text) self.masks.append(mask) self.labels.append(label) # Note that we are loosing the last truncated example here for the sake of simplicity (no padding) - # If your dataset is small, first you should loook for a bigger one :-) and second you + # If your dataset is small, first you should look for a bigger one :-) and second you # can change this behavior by adding (model specific) padding. 
if args.with_code_loss: self.labels = self.examples logger.info("Saving features into cached file %s", cached_features_file) with open(cached_features_file, 'wb') as handle: - pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL) + pickle.dump((self.examples, self.masks, self.labels, self.seq_lengths), handle, + protocol=pickle.HIGHEST_PROTOCOL) def __len__(self): return len(self.examples) def __getitem__(self, item): - return torch.tensor(self.examples[item]), torch.tensor(self.masks[item]), torch.tensor(self.labels[item]) + return torch.tensor(self.examples[item]), torch.tensor(self.masks[item]), torch.tensor( + self.labels[item]), torch.tensor(self.seq_lengths[item]) def load_and_cache_examples(args, tokenizer, evaluate=False): - dataset = TextSeqDataset(tokenizer, args, file_path=args.eval_data_file if evaluate else args.train_data_file, block_size=args.block_size, max_seq=args.max_seq) + dataset = TextSeqDataset(tokenizer, args, file_path=args.eval_data_file if evaluate else args.train_data_file, + block_size=args.block_size, max_seq=args.max_seq) return dataset @@ -197,7 +208,8 @@ def mask_tokens(inputs, tokenizer, args): labels = inputs.clone() # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa) probability_matrix = torch.full(labels.shape, args.mlm_probability) - special_tokens_mask = [tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()] + special_tokens_mask = [tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in + labels.tolist()] probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0) masked_indices = torch.bernoulli(probability_matrix).bool() labels[~masked_indices] = -1 # We only compute loss on masked tokens @@ -215,6 +227,23 @@ def mask_tokens(inputs, tokenizer, args): return inputs, labels +def preprocess_batch(inputs, masks, labels, seq_lengths): + """ + The real sequence length of a batch may be shorter than max_seq of the whole dataset. + Remove some padding tokens to accelerate the training process. + And make sure that the sequence length is multiple of 8. + + References: + https://huggingface.co/transformers/performance.html#fp16 + """ + # The gain for FP16 training is that in each of those cases, the training with the flag --fp16 is twice as fast, + # which does require every tensor to have every dimension be a multiple of 8 + # (examples pad the tensors to a sequence length that is a multiple of 8). 
+ max_seq_len = seq_lengths.max() + max_seq_len = closest_multiple_of_8(max_seq_len) + return inputs[:, :max_seq_len], masks[:, :max_seq_len], labels[:, :max_seq_len] + + def train(args, train_dataset, model, tokenizer): """ Train the model """ if args.local_rank in [-1, 0]: @@ -233,27 +262,23 @@ def train(args, train_dataset, model, tokenizer): # Prepare optimizer and schedule (linear warmup and decay) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ - {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, + {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], + 'weight_decay': args.weight_decay}, {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} - ] + ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) - scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) - if args.fp16: - try: - from apex import amp - except ImportError: - raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") - model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) - model.resize_token_embeddings(len(tokenizer)) - # multi-gpu training (should be after apex fp16 initialization) + scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, + num_training_steps=t_total) + # https://pytorch.org/docs/master/notes/amp_examples.html + amp_helper = AmpHelper(use_amp=args.fp16) if args.n_gpu > 1: model = torch.nn.DataParallel(model) - # Distributed training (should be after apex fp16 initialization) + # Distributed training if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank, - find_unused_parameters=True) + find_unused_parameters=False) # Train! logger.info("***** Running training *****") @@ -261,7 +286,8 @@ def train(args, train_dataset, model, tokenizer): logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info(" Total train batch size (w. 
parallel, distributed & accumulation) = %d", - args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)) + args.train_batch_size * args.gradient_accumulation_steps * ( + torch.distributed.get_world_size() if args.local_rank != -1 else 1)) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) @@ -271,12 +297,13 @@ def train(args, train_dataset, model, tokenizer): train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) set_seed(args) # Added here for reproducibility (even between python 2 and 3) for e in train_iterator: - + # epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(train_dataloader): # inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch) - logger.info(f" PROGRESS: {float(global_step)/t_total*100}%") - inputs, masks, labels = batch + logger.info(f" PROGRESS: {float(global_step) / t_total * 100}%") + inputs, masks, labels, seq_lengths = batch + inputs, masks, labels = preprocess_batch(inputs, masks, labels, seq_lengths) # cut seq # import pdb # pdb.set_trace() inputs = inputs.to(args.device) @@ -284,27 +311,29 @@ def train(args, train_dataset, model, tokenizer): labels = labels.to(args.device) model.train() - outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels) - loss = outputs[0] # model outputs are always tuple in transformers (see doc) - - if args.n_gpu > 1: - loss = loss.mean() # mean() to average on multi-gpu parallel training - if args.gradient_accumulation_steps > 1: - loss = loss / args.gradient_accumulation_steps - - if args.fp16: - with amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward() - else: - loss.backward() + try: + with amp_helper.might_enable_autocast: + outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels) + loss = outputs[0] # model outputs are always tuple in transformers (see doc) + + if args.n_gpu > 1: + loss = loss.mean() # mean() to average on multi-gpu parallel training + if args.gradient_accumulation_steps > 1: + loss = loss / args.gradient_accumulation_steps + + amp_helper.backward(loss) + except RuntimeError as e: + if 'CUDA out of memory' in str(e): + # if out of memory, we must choose smaller batch_size + print(f'inputs.shape = {inputs.shape}, labels.shape = {labels.shape}') + raise tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: - if args.fp16: - torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm) - else: - torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) - optimizer.step() + amp_helper.might_unscale_(optimizer) + torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) + # optimizer.step() + amp_helper.step(optimizer) scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 @@ -317,7 +346,7 @@ def train(args, train_dataset, model, tokenizer): tb_writer.add_scalar('eval_{}'.format(key), value, global_step) tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step) tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step) - logger.info(f" EVALERR: {(tr_loss - logging_loss)/float(args.logging_steps)}") + logger.info(f" EVALERR: {(tr_loss - logging_loss) / float(args.logging_steps)}") 
logging_loss = tr_loss if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: @@ -326,7 +355,8 @@ def train(args, train_dataset, model, tokenizer): output_dir = os.path.join(args.output_dir, '{}-{}'.format(checkpoint_prefix, global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) - model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training + model_to_save = model.module if hasattr(model, + 'module') else model # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, 'training_args.bin')) @@ -334,12 +364,9 @@ def train(args, train_dataset, model, tokenizer): _rotate_checkpoints(args, checkpoint_prefix) - # if args.max_steps > 0 and global_step > args.max_steps: - # epoch_iterator.close() - # break - if args.max_steps > 0 and global_step > args.max_steps: - train_iterator.close() - break + if global_step > args.max_steps > 0: + train_iterator.close() + break if args.local_rank in [-1, 0]: tb_writer.close() @@ -362,7 +389,9 @@ def evaluate(args, model, tokenizer, prefix=""): eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size) # multi-gpu evaluate - if args.n_gpu > 1: + if args.n_gpu > 1 and not (isinstance(model, torch.nn.DataParallel) or + isinstance(model, torch.nn.parallel.DistributedDataParallel)): + # if args.evaluate_during_training, DataParallel is already used model = torch.nn.DataParallel(model) # Eval! @@ -376,9 +405,10 @@ def evaluate(args, model, tokenizer, prefix=""): for batch in tqdm(eval_dataloader, desc="Evaluating"): # inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch) - inputs, masks, labels = batch - # import pdb - # pdb.set_trace() + inputs, masks, labels, seq_lengths = batch + inputs, masks, labels = preprocess_batch(inputs, masks, labels, seq_lengths) # cut seq + # import pdb + # pdb.set_trace() inputs = inputs.to(args.device) masks = masks.to(args.device) labels = labels.to(args.device) @@ -387,12 +417,12 @@ def evaluate(args, model, tokenizer, prefix=""): with torch.no_grad(): outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels) - lm_loss = outputs[0] - eval_loss += lm_loss.mean().item() + loss = outputs[0] # model outputs are always tuple in transformers (see doc) + eval_loss += loss.mean().item() nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps - perplexity = torch.exp(torch.tensor(eval_loss)) + perplexity = float(np.exp(eval_loss)) result = { "perplexity": perplexity @@ -409,6 +439,7 @@ def evaluate(args, model, tokenizer, prefix=""): def main(): + global AdamW parser = argparse.ArgumentParser() ## Required parameters @@ -489,10 +520,7 @@ def main(): help="random seed for initialization") parser.add_argument('--fp16', action='store_true', - help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit") - parser.add_argument('--fp16_opt_level', type=str, default='O1', - help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." 
- "See details at https://nvidia.github.io/apex/amp.html") + help="Whether to use 16-bit (mixed) precision (through torch.cuda.amp) instead of 32-bit") parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.") @@ -504,18 +532,32 @@ def main(): parser.add_argument("--max_seq", default=80, type=int, help="") + parser.add_argument('--gradient_checkpointing', action='store_true', help='enable gradient checkpointing') + parser.add_argument('--use_multi_tensor_adamw', action='store_true', + help='use torch.optim._multi_tensor.AdamW instead of transformers.AdamW') args = parser.parse_args() + if args.use_multi_tensor_adamw: + try: + # overwrite the previous imported AdamW + # https://huggingface.co/transformers/performance.html#faster-optimizer + from torch.optim._multi_tensor import AdamW + except ImportError as e: + print(e) if args.model_type in ["bert", "roberta", "distilbert"] and not args.mlm: raise ValueError("BERT and RoBERTa do not have LM heads but masked LM heads. They must be run using the --mlm " "flag (masked language modeling).") if args.eval_data_file is None and args.do_eval: - raise ValueError("Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file " - "or remove the --do_eval argument.") + raise ValueError( + "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file " + "or remove the --do_eval argument.") - if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir: - raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir)) + if os.path.exists(args.output_dir) and os.listdir( + args.output_dir) and args.do_train and not args.overwrite_output_dir: + raise ValueError( + "Output directory ({}) already exists and is not empty. 
Use --overwrite_output_dir to overcome.".format( + args.output_dir)) # Setup distant debugging if needed if args.server_ip and args.server_port: @@ -525,6 +567,11 @@ def main(): ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() + # Setup logging before `torch.distributed.init_process_group` is called + logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', + datefmt='%m/%d/%Y %H:%M:%S', + level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) + # Setup CUDA, GPU & distributed training if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") @@ -535,13 +582,8 @@ def main(): torch.distributed.init_process_group(backend='nccl') args.n_gpu = 1 args.device = device - - # Setup logging - logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt = '%m/%d/%Y %H:%M:%S', - level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", - args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) + args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) # Set seed set_seed(args) @@ -550,14 +592,16 @@ def main(): if args.local_rank not in [-1, 0]: torch.distributed.barrier() # Barrier to make sure only the first process in distributed training download model & vocab + if args.fp16: + MODEL_CLASSES['gpt2'] = (GPT2Config, AmpGPT2LMHeadModel, GPT2TokenizerFast) config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, cache_dir=args.cache_dir if args.cache_dir else None) tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, - #tokenizer = BertTokenizer(vocab_file='../GPT2-chitchat/vocabulary/vocab_small.txt', eos_token='<T>', + # tokenizer = BertTokenizer(vocab_file='../GPT2-chitchat/vocabulary/vocab_small.txt', eos_token='<T>', do_lower_case=args.do_lower_case, cache_dir=args.cache_dir if args.cache_dir else None) - + if args.block_size <= 0: args.block_size = tokenizer.max_len_single_sentence # Our input block size will be the max possible for the model args.block_size = min(args.block_size, tokenizer.max_len_single_sentence) @@ -565,7 +609,13 @@ def main(): from_tf=bool('.ckpt' in args.model_name_or_path), config=config, cache_dir=args.cache_dir if args.cache_dir else None) + if model.config.vocab_size != len(tokenizer): + logger.info('resize token embeddings, since there may be added tokens.') + model.resize_token_embeddings(len(tokenizer)) model.to(args.device) + if args.gradient_checkpointing: + # https://huggingface.co/transformers/performance.html#gradient-checkpointing + try_enable_gradient_checkpointing(model) if args.local_rank == 0: torch.distributed.barrier() # End of barrier to make sure only the first process in distributed training download model & vocab @@ -585,7 +635,6 @@ def main(): global_step, tr_loss = train(args, train_dataset, model, tokenizer) logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) - # Saving best-practices: if you use save_pretrained for the model and tokenizer, you can reload them using from_pretrained() if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): # Create output 
directory if needed @@ -595,7 +644,8 @@ def main(): logger.info("Saving model checkpoint to %s", args.output_dir) # Save a trained model, configuration and tokenizer using `save_pretrained()`. # They can then be reloaded using `from_pretrained()` - model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training + model_to_save = model.module if hasattr(model, + 'module') else model # Take care of distributed/parallel training model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) @@ -607,25 +657,24 @@ def main(): tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) model.to(args.device) - # Evaluation results = {} if args.do_eval and args.local_rank in [-1, 0]: checkpoints = [args.output_dir] if args.eval_all_checkpoints: - checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True))) + checkpoints = list( + os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True))) logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging logger.info("Evaluate the following checkpoints: %s", checkpoints) for checkpoint in checkpoints: global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else "" prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else "" - + model = model_class.from_pretrained(checkpoint) model.to(args.device) result = evaluate(args, model, tokenizer, prefix=prefix) result = dict((k + '_{}'.format(global_step), v) for k, v in result.items()) results.update(result) - return results diff --git a/convlab2/nlg/template/multiwoz/label_maps.json b/convlab2/nlg/template/multiwoz/label_maps.json new file mode 100644 index 0000000000000000000000000000000000000000..4735bf321052957a6fb9367e6662dd7839711aaf --- /dev/null +++ b/convlab2/nlg/template/multiwoz/label_maps.json @@ -0,0 +1,798 @@ +{ + "guesthouse": [ + "guest house", + "guest houses" + ], + "hotel": [ + "hotels" + ], + "centre": [ + "center", + "downtown" + ], + "north": [ + "northern", + "northside", + "northend" + ], + "east": [ + "eastern", + "eastside", + "eastend" + ], + "west": [ + "western", + "westside", + "westend" + ], + "south": [ + "southern", + "southside", + "southend" + ], + "cheap": [ + "inexpensive", + "lower price", + "lower range", + "cheaply", + "cheaper", + "cheapest", + "very affordable" + ], + "moderate": [ + "moderately", + "reasonable", + "reasonably", + "affordable", + "mid range", + "mid-range", + "priced moderately", + "decently priced", + "mid price", + "mid-price", + "mid priced", + "mid-priced", + "middle price", + "medium price", + "medium priced", + "not too expensive", + "not too cheap" + ], + "expensive": [ + "high end", + "high-end", + "high class", + "high-class", + "high scale", + "high-scale", + "high price", + "high priced", + "higher price", + "fancy", + "upscale", + "nice", + "expensively", + "luxury" + ], + "0": [ + "zero" + ], + "1": [ + "one", + "just me", + "for me", + "myself", + "alone", + "me" + ], + "2": [ + "two" + ], + "3": [ + "three" + ], + "4": [ + "four" + ], + "5": [ + "five" + ], + "6": [ + "six" + ], + "7": [ + "seven" + ], + "8": [ + "eight" + ], + "9": [ + "nine" + ], + "10": [ + "ten" + ], + "11": [ + "eleven" + ], + "12": [ + "twelve" + ], + "architecture": [ + "architectural", + "architecturally", + "architect" + ], + "boat": [ + "boating", + "boats", + "camboats" + ], + "camboats": [ + 
"boating", + "boat", + "boats" + ], + "cinema": [ + "cinemas", + "movie", + "films", + "film" + ], + "college": [ + "colleges" + ], + "concerthall": [ + "concert hall", + "concert halls", + "concerthalls", + "concerts", + "concert" + ], + "entertainment": [ + "entertaining" + ], + "gastropub": [ + "gastropubs" + ], + "mutliple sports": [ + "multiple sport", + "multiple sports", + "multi sport", + "multi sports", + "sports", + "sporting" + ], + "museum": [ + "museums", + "gallery", + "galleries" + ], + "nightclub": [ + "night clubs", + "night club", + "nightclubs", + "club", + "clubs" + ], + "park": [ + "parks" + ], + "swimmingpool": [ + "swimming pool", + "swimming", + "pool", + "pools", + "swimmingpool", + "water", + "swim" + ], + "theatre": [ + "theater", + "theatres", + "theaters" + ], + "abbey pool and astroturf pitch": [ + "abbey pool and astroturf", + "abbey pool" + ], + "adc theatre": [ + "adc theater", + "adc" + ], + "addenbrookes hospital": [ + "addenbrooke's hospital" + ], + "cafe jello gallery": [ + "cafe jello" + ], + "cambridge and county folk museum": [ + "cambridge and country folk museum", + "county folk museum" + ], + "cambridge arts theatre": [ + "cambridge arts theater" + ], + "cambridge book and print gallery": [ + "book and print gallery" + ], + "cambridge contemporary art": [ + "cambridge contemporary art museum", + "contemporary art museum" + ], + "the cambridge corn exchange": [ + "cambridge corn exchange" + ], + "cambridge museum of technology": [ + "museum of technology" + ], + "the cambridge punter": [ + "cambridge punter", + "cambridge punters" + ], + "cambridge university botanic gardens": [ + "cambridge university botanical gardens", + "cambridge university botanical garden", + "cambridge university botanic garden", + "cambridge botanic gardens", + "cambridge botanical gardens", + "cambridge botanic garden", + "cambridge botanical garden", + "botanic gardens", + "botanical gardens", + "botanic garden", + "botanical garden" + ], + "cherry hinton hall and grounds": [ + "cherry hinton hall" + ], + "cherry hinton water play": [ + "cherry hinton water play park" + ], + "cineworld cinema": [ + "cineworld" + ], + "clare hall": [ + "clair hall" + ], + "the fez club": [ + "fez club" + ], + "jesus green outdoor pool": [ + "jesus green" + ], + "kings hedges learner pool": [ + "king's hedges learner pool", + "king hedges learner pool" + ], + "mumford theatre": [ + "mumford theater" + ], + "museum of archaelogy and anthropology": [ + "museum of archaeology and anthropology", + "museum of archaelogy", + "museum of archaeology" + ], + "riverboat georgina": [ + "riverboat" + ], + "saint barnabas press gallery": [ + "saint barbabas" + ], + "scott polar museum": [ + "scott polar" + ], + "scudamores punting co": [ + "scudamore's punting co", + "scudamores punting", + "scudamore's punting", + "scudamores", + "scudamore's", + "scudamore" + ], + "soul tree nightclub": [ + "soul tree night club", + "soul tree", + "soultree" + ], + "the man on the moon": [ + "man on the moon" + ], + "the junction": [ + "junction theatre", + "junction theater" + ], + "old schools": [ + "old school" + ], + "vue cinema": [ + "vue" + ], + "wandlebury country park": [ + "the wandlebury" + ], + "whipple museum of the history of science": [ + "whipple museum", + "history of science museum" + ], + "restaurant alimentum": [ + "alimentum" + ], + "bedouin": [ + "the bedouin" + ], + "bloomsbury restaurant": [ + "bloomsbury" + ], + "caffe uno": [ + "cafe uno", + "caffee uno" + ], + "cambridge lodge restaurant": [ 
+ "cambridge lodge" + ], + "chiquito restaurant bar": [ + "chiquito restaurant", + "chiquito" + ], + "city stop restaurant": [ + "city stop" + ], + "clowns cafe": [ + "clown's cafe" + ], + "the cow pizza kitchen and bar": [ + "cow pizza kitchen and bar", + "cow pizza" + ], + "darrys cookhouse and wine shop": [ + "darry's cookhouse and wine shop", + "darry's cookhouse", + "darrys cookhouse" + ], + "de luca cucina and bar": [ + "de luca cucina and bar riverside brasserie", + "luca cucina and bar", + "de luca cucina", + "luca cucina" + ], + "da vinci pizzeria": [ + "da vinci pizza", + "da vinci" + ], + "don pasquale pizzeria": [ + "don pasquale pizza", + "don pasquale", + "pasquale pizzeria", + "pasquale pizza" + ], + "efes restaurant": [ + "efes" + ], + "fitzbillies restaurant": [ + "fitzbillies" + ], + "frankie and bennys": [ + "frankie and benny's" + ], + "funky fun house": [ + "funky" + ], + "the gardenia": [ + "gardenia" + ], + "grafton hotel restaurant": [ + "the grafton hotel", + "grafton hotel" + ], + "hotel du vin and bistro": [ + "hotel du vin", + "du vin" + ], + "kohinoor": [ + "the kohinoor" + ], + "lan hong house": [ + "lan hong", + "ian hong house", + "ian hong" + ], + "lovell lodge": [ + "lovell", + "the lovell lodge" + ], + "mahal of cambridge": [ + "mahal" + ], + "maharajah tandoori restaurant": [ + "maharajah tandoori", + "the maharajah tandoor" + ], + "meze bar restaurant": [ + "the meze bar", + "meze bar" + ], + "michaelhouse cafe": [ + "michael house cafe" + ], + "midsummer house restaurant": [ + "midsummer house" + ], + "the missing sock": [ + "missing sock" + ], + "nandos": [ + "nando's city centre", + "nando's city center", + "nandos city centre", + "nandos city center", + "nando's" + ], + "nandos city centre": [ + "nando's city centre", + "nando's city center", + "nandos city center", + "nando's", + "nandos" + ], + "the oak bistro": [ + "oak bistro" + ], + "restaurant one seven": [ + "one seven" + ], + "the river bar steakhouse and grill": [ + "river bar steakhouse and grill", + "the river bar steakhouse", + "river bar steakhouse" + ], + "pipasha restaurant": [ + "pipasha" + ], + "pizza hut city centre": [ + "pizza hut city center" + ], + "pizza hut fen ditton": [ + "pizza hut fenditton", + "pizza express fen ditton" + ], + "restaurant two two": [ + "two two", + "restaurant 22" + ], + "saffron brasserie": [ + "saffron" + ], + "saint johns chop house": [ + "saint john's chop house", + "st john's chop house", + "st johns chop house" + ], + "sesame restaurant and bar": [ + "sesame restaurant", + "sesame" + ], + "shanghai family restaurant": [ + "shanghai" + ], + "sitar tandoori": [ + "sitar" + ], + "the slug and lettuce": [ + "slug and lettuce" + ], + "saint johns chop house": [ + "st johns chop house", + "st john's chop house", + "saint johns chop house" + ], + "stazione restaurant and coffee bar": [ + "stazione restaurant", + "stazione" + ], + "thanh binh": [ + "thanh", + "binh" + ], + "the hotpot": [ + "the hotspot", + "hotpot", + "hotspot" + ], + "the lucky star": [ + "lucky star" + ], + "peking restaurant": [ + "the peking restaurant" + ], + "the varsity restaurant": [ + "varsity restaurant", + "the varsity", + "varsity" + ], + "zizzi cambridge": [ + "zizzi" + ], + "asian oriental": [ + "asian", + "oriental" + ], + "australian": [ + "australasian" + ], + "barbeque": [ + "barbecue", + "bbq" + ], + "corsica": [ + "corsican" + ], + "indian": [ + "tandoori" + ], + "italian": [ + "pizza", + "pizzeria" + ], + "japanese": [ + "sushi" + ], + "latin american": [ + 
"latin-american", + "latin" + ], + "malaysian": [ + "malay" + ], + "middle eastern": [ + "middle-eastern" + ], + "modern american": [ + "american modern", + "american" + ], + "modern european": [ + "european modern", + "european" + ], + "north american": [ + "north-american", + "american" + ], + "portuguese": [ + "portugese" + ], + "seafood": [ + "sea food" + ], + "singaporean": [ + "singapore" + ], + "steakhouse": [ + "steak house", + "steak" + ], + "the americas": [ + "american", + "americas" + ], + "a and b guest house": [ + "a & b guest house", + "a and b", + "a & b" + ], + "acorn guest house": [ + "the acorn guest house", + "acorn" + ], + "alexander bed and breakfast": [ + "alexander" + ], + "allenbell": [ + "the allenbell" + ], + "alpha-milton guest house": [ + "the alpha-milton", + "alpha-milton" + ], + "arbury lodge guesthouse": [ + "arbury lodge guest house", + "arbury lodge", + "arbury" + ], + "archway house": [ + "archway" + ], + "ashley hotel": [ + "the ashley hotel", + "ashley" + ], + "aylesbray lodge guest house": [ + "aylesbray lodge", + "aylesbray", + "alesbray lodge guest house", + "alyesbray lodge hotel" + ], + "bridge guest house": [ + "bridge house" + ], + "the cambridge belfry": [ + "cambridge belfry", + "belfry hotel", + "belfry" + ], + "carolina bed and breakfast": [ + "carolina" + ], + "city centre north b and b": [ + "city centre north bed and breakfast", + "city centre north", + "north b and b" + ], + "el shaddai": [ + "el shaddia guest house", + "el shaddai guest house", + "el shaddia" + ], + "express by holiday inn cambridge": [ + "express by holiday inn", + "holiday inn cambridge", + "holiday inn" + ], + "finches bed and breakfast": [ + "finches" + ], + "gonville hotel": [ + "gonville" + ], + "hamilton lodge": [ + "the hamilton lodge", + "hamilton" + ], + "hobsons house": [ + "hobson's house", + "hobson's" + ], + "huntingdon marriott hotel": [ + "huntington marriott hotel", + "huntington marriot hotel", + "huntingdon marriot hotel", + "huntington marriott", + "huntingdon marriott", + "huntington marriot", + "huntingdon marriot", + "huntington", + "huntingdon", + "marriott hotel", + "marriott" + ], + "kirkwood house": [ + "kirkwood" + ], + "the lensfield hotel": [ + "lensfield hotel", + "lensfield" + ], + "leverton house": [ + "leverton" + ], + "rosa's bed and breakfast": [ + "rosas bed and breakfast", + "rosa's", + "rosas" + ], + "university arms hotel": [ + "university arms" + ], + "warkworth house": [ + "warkworth hotel", + "warkworth" + ], + "worth house": [ + "the worth house", + "warkworth house", + "warkworth" + ], + "birmingham new street": [ + "birmingham new street train station" + ], + "birmingham new street train station": [ + "birmingham new street" + ], + "bishops stortford": [ + "bishops stortford train station" + ], + "bishops stortford train station": [ + "bishops stortford" + ], + "broxbourne": [ + "broxbourne train station" + ], + "broxbourne train station": [ + "broxbourne" + ], + "cambridge": [ + "cambridge train station" + ], + "cambridge train station": [ + "cambridge" + ], + "ely": [ + "ely train station" + ], + "ely train station": [ + "ely" + ], + "kings lynn": [ + "king's lynn", + "king's lynn train station", + "kings lynn train station" + ], + "kings lynn train station": [ + "kings lynn", + "king's lynn", + "king's lynn train station" + ], + "leicester": [ + "leicester train station" + ], + "leicester train station": [ + "leicester" + ], + "london kings cross": [ + "kings cross", + "king's cross", + "london king's cross", + "kings 
cross train station", + "king's cross train station", + "london king's cross train station", + "london kings cross train station" + ], + "london kings cross train station": [ + "kings cross", + "king's cross", + "london king's cross", + "london kings cross", + "kings cross train station", + "king's cross train station", + "london king's cross train station" + ], + "london liverpool street": [ + "london liverpool", + "liverpool street", + "london liverpool train station", + "liverpool street train station", + "london liverpool street train station" + ], + "london liverpool street train station": [ + "london liverpool", + "liverpool street", + "london liverpool street", + "london liverpool train station", + "liverpool street train station" + ], + "norwich": [ + "norwich train station" + ], + "norwich train station": [ + "norwich" + ], + "peterborough": [ + "peterborough train station" + ], + "peterborough train station": [ + "peterborough" + ], + "stansted airport": [ + "stansted airport train station" + ], + "stansted airport train station": [ + "stansted airport" + ], + "stevenage": [ + "stevenage train station" + ], + "stevenage train station": [ + "stevenage" + ] +} \ No newline at end of file diff --git a/data/multiwoz/ontology_nlg_eval.json b/data/multiwoz/ontology_nlg_eval.json new file mode 100644 index 0000000000000000000000000000000000000000..1dbfbc97edf683419fada8f32199cf1e8f53c7db --- /dev/null +++ b/data/multiwoz/ontology_nlg_eval.json @@ -0,0 +1,1884 @@ +{ + "hotel-price range": [ + "cheap", + "do n't care", + "moderate", + "expensive" + ], + "hotel-type": [ + "hotel", + "guest house" + ], + "hotel-parking": [ + "yes", + "do n't care", + "no", + "free" + ], + "hotel-book stay": [ + "2", + "1", + "4", + "5", + "3", + "8", + "7", + "6", + "six" + ], + "hotel-book day": [ + "tuesday", + "friday", + "monday", + "wednesday", + "saturday", + "thursday", + "sunday" + ], + "hotel-book people": [ + "6", + "1", + "3", + "5", + "4", + "2", + "8", + "7", + "six", + "3." 
+ ],
+ "hotel-area": [
+ "east",
+ "north",
+ "centre",
+ "south",
+ "west",
+ "do n't care",
+ "southern aylesbray",
+ "cambridge",
+ "el shaddai",
+ "stevenage",
+ "place to be a guesthouse",
+ "peterborough",
+ "bishops stortford",
+ "cheap"
+ ],
+ "hotel-stars": [
+ "4",
+ "2",
+ "0",
+ "3",
+ "do n't care",
+ "1",
+ "5"
+ ],
+ "hotel-internet": [
+ "yes",
+ "do n't care",
+ "no"
+ ],
+ "train-destination": [
+ "bishops stortford",
+ "cambridge",
+ "ely",
+ "peterborough",
+ "norwich",
+ "london liverpool street",
+ "leicester",
+ "kings lynn",
+ "broxbourne",
+ "birmingham new street",
+ "london kings cross",
+ "stansted airport",
+ "stevenage",
+ "lecester",
+ "bishop stortford",
+ "north",
+ "willi",
+ "curry prince",
+ "bournemouth",
+ "centre",
+ "bishops storford",
+ "city centre north",
+ "taj tandoori",
+ "petersborough",
+ "huntington marriott",
+ "hol",
+ "gonv",
+ "liverpool",
+ "huntingdon marriott hotel",
+ "gourmet burger kitchen",
+ "the copper kettle"
+ ],
+ "train-day": [
+ "friday",
+ "wednesday",
+ "monday",
+ "saturday",
+ "thursday",
+ "tuesday",
+ "sunday",
+ "do n't care",
+ "train"
+ ],
+ "train-departure": [
+ "cambridge",
+ "birmingham new street",
+ "ely",
+ "norwich",
+ "bishops stortford",
+ "peterborough",
+ "stevenage",
+ "broxbourne",
+ "london liverpool street",
+ "leicester",
+ "stansted airport",
+ "kings lynn",
+ "london kings cross",
+ "birmingham",
+ "do n't care",
+ "saint johns",
+ "wandlebury country park",
+ "liecester",
+ "panahar",
+ "cineworld",
+ "stansted",
+ "el shaddai",
+ "lon",
+ "cafe uno",
+ "leicaster",
+ "city hall",
+ "rosa's bed and breakfast",
+ "norwhich",
+ "cam",
+ "brookshite",
+ "bro",
+ "cambrid",
+ "arbu",
+ "aylesbray lodge guest",
+ "alpha-milton",
+ "london",
+ "hamilton lodge",
+ "duxford",
+ "camboats"
+ ],
+ "train-arrive by": [
+ "19:45",
+ "20:45",
+ "11:30",
+ "14:45",
+ "08:15",
+ "13:45",
+ "08:00",
+ "08:45",
+ "11:00",
+ "11:45",
+ "08:30",
+ "10:45",
+ "05:00",
+ "20:00",
+ "15:15",
+ "11:15",
+ "19:00",
+ "20:15",
+ "16:45",
+ "12:45",
+ "10:15",
+ "21:00",
+ "15:00",
+ "17:15",
+ "14:30",
+ "20:30",
+ "18:15",
+ "12:15",
+ "18:45",
+ "12:00",
+ "10:00",
+ "21:45",
+ "16:15",
+ "14:00",
+ "18:30",
+ "17:45",
+ "13:30",
+ "19:30",
+ "12:30",
+ "09:15",
+ "09:00",
+ "16:00",
+ "23:00",
+ "10:30",
+ "09:30",
+ "16:30",
+ "18:00",
+ "09:45",
+ "15:45",
+ "19:15",
+ "18:23",
+ "13:00",
+ "17:30",
+ "15:30",
+ "21:15",
+ "13:15",
+ "21:30",
+ "17:00",
+ "8:00",
+ "14:15",
+ "do n't care",
+ "thursday",
+ "05:30",
+ "16:06",
+ "11:52",
+ "6:55",
+ "18:07",
+ "09:06",
+ "07:15",
+ "06:01",
+ "13:52",
+ "22:07",
+ "19:57",
+ "08:44",
+ "06:30",
+ "10:08",
+ "08:07",
+ "07:30",
+ "19:58",
+ "10:43",
+ "9:30",
+ "13:38",
+ "15:06",
+ "07:35",
+ "01:00",
+ "13:56",
+ "12:07",
+ "15:07",
+ "11:24",
+ "22:00",
+ "14:43",
+ "19:27",
+ "10:32",
+ "1100",
+ "06:15",
+ "23:30",
+ "15:01",
+ "16:32",
+ "05:15",
+ "13:32",
+ "07:45",
+ "05:45",
+ "09;45",
+ "18:32",
+ "23:27",
+ "06:45",
+ "19:51",
+ "tuesday",
+ "21:06",
+ "15:24",
+ "17:23",
+ "09:32"
+ ],
+ "train-book people": [
+ "1",
+ "5",
+ "8",
+ "6",
+ "2",
+ "7",
+ "4",
+ "3",
+ "0",
+ "9",
+ "15",
+ "`1",
+ "10"
+ ],
+ "train-leave at": [
+ "21:15",
+ "12:45",
+ "19:45",
+ "14:00",
+ "15:15",
+ "10:00",
+ "09:29",
+ "10:15",
+ "09:15",
+ "19:15",
+ "11:15",
+ "17:45",
+ "14:30",
+ "20:00",
+ "after 9:15 am",
+ "09:30",
+ "08:15",
+ "20:15",
+ "10:45",
+ "14:45",
+ "17:00",
+ "21:00",
+ "17:30",
+ "11:30",
+ "13:45",
+ "12:15",
+ "08:45",
+ "11:45",
+ "09:00",
"18:45", + "05:15", + "18:00", + "16:00", + "11:00", + "05:00", + "08:00", + "18:30", + "21:45", + "16:30", + "14:15", + "16:15", + "10:32", + "12:30", + "13:00", + "15:32", + "13:30", + "02:00", + "08:30", + "15:00", + "10:30", + "15:45", + "09:45", + "21:30", + "05:59", + "06:00", + "18:15", + "bishops stortford", + "07:45", + "05:45", + "after 11:30", + "12:00", + "13:15", + "20:45", + "16:45", + "19:00", + "15:30", + "17:15", + "05:30", + "do n't care", + "18:40", + "21:50", + "19:30", + "06:45", + "06:30", + "18:06", + "22:00", + "0", + "14:21", + "13:11", + "21:07", + "11:35", + "20:30", + "monday", + "leicester", + "07:21", + "05:16", + "8:30", + "after 8:30", + "05:01", + "10:36", + "20:36", + "21:39", + "17:59", + "18:31", + "13:35", + "after 19:30", + "09:17", + "07:15", + "morning", + "09:59", + "07:00", + "saturday", + "07:54", + "09:54", + "08:16", + "15:24", + "15:17", + "9:30", + "13:39", + "15:11", + "13:01", + "05:17", + "10:34", + "10:16", + "11:39", + "12:06", + "11:50", + "13:54", + "8:45", + "13:29", + "19:11", + "14:48", + "11:21", + "12:32", + "after 16:30", + "15:16", + "11:48", + "18:11", + "18:46", + "02:30", + "22:01", + "19:21", + "thursday", + "08:01", + "15:29", + "08:11", + "20:40", + "13:17", + "15:01", + "14:34", + "08:32" + ], + "attraction-area": [ + "centre", + "west", + "north", + "south", + "east", + "do n't care", + "cambridge" + ], + "restaurant-food": [ + "turkish", + "indian", + "chinese", + "seafood", + "italian", + "british", + "australasian", + "australian", + "asian oriental", + "thai", + "vegetarian", + "modern european", + "gastropub", + "south indian", + "european", + "portuguese", + "swiss", + "crossover", + "catalan", + "french", + "do n't care", + "mexican", + "welsh", + "korean", + "tuscan", + "new zealand", + "molecular gastronomy", + "eritrean", + "british food", + "the americas", + "north american", + "spanish", + "barbeque", + "persian", + "greek", + "lebanese", + "vietnamese", + "belgian", + "creative", + "jamaican", + "scottish", + "cuban", + "japanese", + "sri lankan", + "light bites", + "moroccan", + "latin american", + "african", + "basque", + "modern global", + "halal", + "mediterranean", + "bistro", + "international", + "unusual", + "north indian", + "modern eclectic", + "danish", + "afghan", + "world", + "northern european", + "german", + "cantonese", + "irish", + "romanian", + "russian", + "english", + "corsica", + "steakhouse", + "hungarian", + "singaporean", + "austrian", + "venetian", + "ital", + "polynesian", + "kosher", + "swedish", + "scandinavian", + "modern american", + "christmas", + "malaysian", + "north african", + "brazilian", + "canapes", + "caribbean", + "south african", + "traditional", + "indonesian", + "middle eastern", + "fusion", + "polish", + "asian", + "not mentionedc", + "afternoon tea", + "eastern european", + "panasian", + "gastro pub", + "american", + "pizza", + "modern european", + "modern english" + ], + "restaurant-price range": [ + "expensive", + "cheap", + "moderate", + "do n't care" + ], + "restaurant-area": [ + "centre", + "south", + "north", + "east", + "west" + ], + "attraction-name": [ + "all saints church", + "funky fun house", + "christ's college", + "the man on the moon", + "milton country park", + "wandlebury country park", + "king's college", + "lynne strover gallery", + "saint john's college", + "broughton house gallery", + "old schools", + "museum of classical archaeology", + "adc theatre", + "emmanuel college", + "byard art", + "soul tree nightclub", + "club salsa", + "cherry hinton 
hall and grounds", + "museum of archaelogy and anthropology", + "castle galleries", + "nusha", + "regency gallery", + "whipple museum of the history of science", + "churchill college", + "holy trinity church", + "cambridge artworks", + "cambridge arts theatre", + "scott polar museum", + "do n't care", + "scudamores punting co", + "clare college", + "saint barnabas press gallery", + "cambridge and county folk museum", + "kettle's yard", + "no specific location", + "whale of a time", + "vue cinema", + "cambridge book and print gallery", + "ballare", + "cafe jello gallery", + "riverboat georgina", + "the fitzwilliam museum", + "sidney sussex college", + "saint catharine's college", + "great saint mary's church", + "cherry hinton water play", + "primavera", + "cineworld cinema", + "magdalene college", + "kambar", + "parkside pools", + "the fez club", + "cambridge university botanic gardens", + "cambridge museum of technology", + "downing college", + "clare hall", + "cafe jello", + "little saint mary's church", + "people's portraits exhibition at girton college", + "abbey pool and astroturf pitch", + "corpus christi", + "tenpin", + "the cambridge punter", + "queens' college", + "fez club", + "mumford theatre", + "the cherry hinton village centre", + "jesus green outdoor pool", + "trinity college", + "gonville and caius college", + "camboats", + "cambridge", + "williams art and antiques", + "kings hedges learner pool", + "cambridge contemporary art", + "the cambridge corn exchange", + "the place", + "ruskin gallery", + "sheep's green and lammas land park fen causeway", + "gallery at twelve a high street", + "carol", + "swimming pool", + "kirkwood", + "pembroke college", + "univ", + "the junction", + "jesus college", + "fitzwilliam museum", + "college", + "pizz", + "worth house", + "free", + "aut", + "man on the moon", + "all saint's church", + "hughes hall", + "city", + "christ college", + "north", + "by", + "autu", + "garde", + "caffe uno", + "cambridge belf", + "old school", + "museum", + "love", + "cambridge corn exchange", + "college in the north", + "boating", + "older churches", + "cambridge punter", + "queen's college", + "architectural churches", + "saint john's college.", + "parkside", + "thanh", + "the mumford theatre", + "fitzbillies", + "home from home", + "da v", + "cambridge temporary art", + "centre", + "fitzwilliam", + "pizza", + "scott polar", + "da vinci pizzeria", + "broughton", + "contemporary art museum", + "belf", + "school", + "corn cambridge exchange", + "bangkok city", + "bed" + ], + "restaurant-name": [ + "meze bar restaurant", + "indian", + "pizza hut city centre", + "the good luck chinese food takeaway", + "caffe uno", + "the gardenia", + "the oak bistro", + "do n't care", + "sala thong", + "thanh binh", + "riverside brasserie", + "cambri", + "pizza express", + "yippee noodle bar", + "curry prince", + "midsummer house restaurant", + "restaurant alimentum", + "nandos city centre", + "chiquito restaurant bar", + "maharajah tandoori restaurant", + "yu garden", + "bangkok city", + "copper kettle", + "backstreet bistro", + "the golden curry", + "don pasquale pizzeria", + "sesame restaurant and bar", + "charlie", + "the cow pizza kitchen and bar", + "india house", + "loch fyne", + "eraina", + "royal spice", + "prezzo", + "curry king", + "the nirala", + "curry garden", + "zizzi cambridge", + "da vinci pizzeria", + "jinling noodle bar", + "la raza", + "cotto", + "efes restaurant", + "taj tandoori", + "golden wok", + "charlie chan", + "kohinoor", + "bedouin", + "the cambridge 
chop house", + "stazione restaurant and coffee bar", + "graffiti", + "pizza hut", + "la mimosa", + "city stop", + "grafton hotel restaurant", + "pizza hut fen ditton", + "frankie and bennys", + "rajmahal", + "rice boat", + "the missing sock", + "the varsity restaurant", + "panahar", + "nandos", + "sitar tandoori", + "oak bistro", + "scudamores punt", + "lovel", + "anatolia", + "clowns cafe", + "gourmet burger kitchen", + "tandoori palace", + "ali baba", + "darrys cookhouse and wine shop", + "hakka", + "peking restaurant", + "de luca cucina and bar", + "the slug and lettuce", + "city stop restaurant", + "kymmoy", + "cambridge lodge restaurant", + "tandoori", + "bloomsbury restaurant", + "ugly duckling", + "hk fusion", + "pizza hut cherry hinton", + "fitzbillies restaurant", + "hotel du vin and bistro", + "restaurant two two", + "dojo noodle bar", + "the copper kettle", + "michaelhouse cafe", + "restaurant one seven", + "the hotpot", + "royal standard", + "the river bar steakhouse and grill", + "pipasha restaurant", + "golden curry", + "saigon city", + "pizza express fen ditton", + "little seoul", + "meghna", + "saffron brasserie", + "j restaurant", + "la margherita", + "the lucky star", + "lan hong house", + "hotpot", + "the gandhi", + "cocum", + "golden house", + "la tasca", + "shanghai family restaurant", + "worth house", + "wagamama", + "galleria", + "travellers rest", + "mahal of cambridge", + "archway", + "molecular gastonomy", + "european", + "saint johns chop house", + "anatolia and efes restaurant", + "shiraz restaurant", + "nirala", + "not metioned", + "cott", + "cambridge chop house", + "bridge", + "lucky star", + "clu", + "tang chinese", + "golden house golden house", + "rice house", + "limehouse", + "clowns", + "restaurant", + "parkside pools", + "the dojo noodle bar", + "nusha", + "hobson house", + "curry queen", + "el shaddai", + "old school", + "cam", + "gardenia", + "fin", + "efes", + "slug and lettuce", + "camboats", + "missing sock", + "grafton", + "nus", + "cambridge lodge", + "fitzbillies", + "hamilton lodge", + "gastropub", + "funky", + "cow pizza", + "ashley", + "ros", + "hobso", + "kitchen and bar", + "cityr", + "pipasha", + "seasame restaurant and bar", + "the alex", + "hu", + "one seven", + "shanghi family restaurant", + "cambridge be", + "dif", + "margherita", + "bri", + "india", + "adden", + "ian hong house" + ], + "attraction-type": [ + "museum", + "entertainment", + "college", + "nightclub", + "swimming pool", + "multiple sports", + "architecture", + "cinema", + "boat", + "theatre", + "concert hall", + "park", + "do n't care", + "local site", + "hotspot", + "church", + "special" + ], + "hospital-department": [ + "paediatric clinic", + "do n't care", + "transitional care", + "acute medical assessment unit", + "neurology", + "oral and maxillofacial surgery and ent", + "infectious diseases", + "intermediate dependancy area", + "plastic and vascular surgery plastics", + "cardiology and coronary care unit", + "transplant high dependency unit", + "emergency department", + "john farman intensive care unit", + "neurology neurosurgery", + "cambridge eye unit", + "cardiology", + "respiratory medicine", + "uro", + "urology", + "surgery", + "medical decisions unit", + "gener", + "hepatobillary and gastrointestinal surgery regional referral centre", + "gastroenterology", + "haematology and haematological oncology", + "acute medicine for the elderly", + "paediatric day unit", + "oncology", + "psychiatry", + "trauma high dependency unit", + "children's surgical and medicine", 
+ "infusion services", + "trauma and orthopaedics", + "teenage cancer trust unit", + "diabetes and endocrinology", + "medicine for the elderly", + "clinical decisions unit", + "children's oncology and haematology", + "neurosciences critical care unit", + "coronary care unit", + "clinical research facility", + "antenatal", + "hepatology", + "gynaecology", + "neurosciences", + "paediatric intensive care unit", + "haematology day unit", + "neonatal unit", + "accident", + "haematology", + "inpatient occupational therapy" + ], + "hotel-name": [ + "city centre north b and b", + "express by holiday inn cambridge", + "el shaddai", + "a and b guest house", + "arbury lodge guest house", + "university arms hotel", + "kirkwood house", + "autumn house", + "gonville hotel", + "leverton house", + "acorn guest house", + "the cambridge belfry", + "ashley hotel", + "the lensfield hotel", + "finches bed and breakfast", + "worth house", + "wandlebury coutn", + "allenbell", + "rosa's bed and breakfast", + "home from home", + "avalon", + "alpha-milton guest house", + "alexander bed and breakfast", + "cityroomz", + "limehouse", + "archway house", + "warkworth house", + "lovell lodge", + "do n't care", + "aylesbray lodge guest house", + "carolina bed and breakfast", + "huntingdon marriott hotel", + "hobsons house", + "hamilton lodge", + "whale", + "cambridge belfry", + "bridge guest house", + "gonville", + "cambridge", + "acorn house", + "city center north", + "caffe", + "express by", + "clare", + "acorn", + "sou", + "north b and b", + "lensfield", + "city stop rest", + "kirkwood", + "levert", + "anatolia", + "huntingdon marriot hotel", + "cherr", + "huntingd", + "nusha", + "aylesbray lodge", + "royal spice", + "la margherit", + "hotel", + "gall", + "city roomz", + "alexander", + "caridge belfrey", + "nus", + "alexander bed & breakfast", + "the allenbell.", + "the allenbell", + "gra", + "cambridg", + "lensfield hotel", + "tandoori palace", + "huntingdon hotel", + "autumn house.", + "doubletree by hilton cambridge" + ], + "taxi-leave at": [ + "02:45", + "24:45", + "08:45", + "16:15", + "04:45", + "24:00", + "12:30", + "12:00", + "02:30", + "19:00", + "19:45", + "13:00", + "22:15", + "13:45", + "15:15", + "01:30", + "16:45", + "07:15", + "20:00", + "12:15", + "21:00", + "17:15", + "08:00", + "10:30", + "21:15", + "15:00", + "05:30", + "13:15", + "01:15", + "06:15", + "01:00", + "18:30", + "06:45", + "06:00", + "07:45", + "06:30", + "17:45", + "18:15", + "19:30", + "24:30", + "21:30", + "03:45", + "21:25", + "14:30", + "17:00", + "14:00", + "04:30", + "23:30", + "03:30", + "20:15", + "24:15", + "02:00", + "07:00", + "23:00", + "08:15", + "22:00", + "10:15", + "16:30", + "02:15", + "18:00", + "14:15", + "10:00", + "17:30", + "15:45", + "05:45", + "19:15", + "20:45", + "23:45", + "15:30", + "13:30", + "11:45", + "05:00", + "03:15", + "03:20", + "07:30", + "23:15", + "16:00", + "09:30", + "after 2:30", + "11:30", + "09:45", + "09:15", + "09:00", + "15:25", + "08:30", + "14:45", + "04:00", + "11:00", + "22:45", + "11:15", + "03:00", + "04:15", + "1615", + "22:30", + "3:45", + "21:45", + "after 11:45", + "05:15", + "8:15", + "01:45", + "12:45", + "10:45", + "20:30", + "3:15", + "18:45", + "thursday", + "9:00", + "21:04", + "9:15", + "6:00", + "10:50", + "after 15:45", + "1:15", + "300", + "monday", + "friday", + "1" + ], + "taxi-destination": [ + "express by holiday inn cambridge", + "dojo noodle bar", + "wandlebury country park", + "leverton house", + "the cambridge chop house", + "saint john's college", + "galleria", + 
"churchill college", + "royal spice", + "all saints church", + "nandos city centre", + "cote", + "chiquito restaurant bar", + "saint johns chop house", + "the nirala", + "tang chinese", + "the junction", + "bangkok city", + "cambridge university botanic gardens", + "the lucky star", + "london liverpool street train station", + "la raza", + "soul tree nightclub", + "the cow pizza kitchen and bar", + "yippee noodle bar", + "christ's college", + "el shaddai", + "castle galleries", + "saint catharine's college", + "riverside brasserie", + "fitzbillies restaurant", + "cambridge", + "camb", + "the golden curry", + "wagamama", + "jinling noodle bar", + "maharajah tandoori restaurant", + "little saint mary's church", + "camboats", + "cotto", + "nusha", + "acorn guest house", + "frankie and bennys", + "jesus green outdoor pool", + "aylesbray lodge guest house", + "golden wok", + "charlie chan", + "country folk museum", + "huntingdon marriott hotel", + "meze bar restaurant", + "vue cinema", + "adc theatre", + "rice house", + "graffiti", + "saigon city", + "city stop restaurant", + "carolina bed and breakfast", + "cafe jello gallery", + "little seoul", + "primavera", + "kohinoor", + "the place", + "loch fyne", + "rice boat", + "kings hedges learner pool", + "the cambridge belfry", + "cambridge artworks", + "pizza hut cherry hinton", + "cambridge train station", + "sitar tandoori", + "the gardenia", + "don pasquale pizzeria", + "milton country park", + "hamilton lodge", + "hobsons house", + "kirkwood house", + "archway house", + "the fitzwilliam museum", + "alexander bed and breakfast", + "cambridge museum of technology", + "midsummer house restaurant", + "allenbell", + "hakka", + "old schools", + "ashley hotel", + "de luca cucina and bar", + "cambridge lodge restaurant", + "efes restaurant", + "bedouin", + "club salsa", + "bloomsbury restaurant", + "hk fusion", + "rajmahal", + "lynne strover gallery", + "abbey pool and astroturf pitch", + "taj tandoori", + "royal standard", + "a and b guest house", + "parkside police station", + "curry garden", + "scudamores punting co", + "the lensfield hotel", + "the fez club", + "university arms hotel", + "la margherita", + "hotel du vin and bistro", + "pizza express fen ditton", + "hughes hall", + "gonville hotel", + "the gandhi", + "sidney sussex college", + "tenpin", + "backstreet bistro", + "nandos", + "ballare", + "ali baba", + "india house", + "peking restaurant", + "meghna", + "the river bar steakhouse and grill", + "the cambridge punter", + "pipasha restaurant", + "prezzo", + "whale of a time", + "finches bed and breakfast", + "avalon", + "curry prince", + "da vinci pizzeria", + "corpus christi", + "holy trinity church", + "the varsity restaurant", + "saffron brasserie", + "downing college", + "sesame restaurant and bar", + "cambridge contemporary art", + "addenbrookes hospital", + "williams art and antiques", + "pembroke college", + "clare hall", + "cambridge arts theatre", + "shanghai family restaurant", + "king's college", + "the cherry hinton village centre", + "cambridge and county folk museum", + "the hotpot", + "bridge guest house", + "ugly duckling", + "cocum", + "stazione restaurant and coffee bar", + "gourmet burger kitchen", + "la tasca", + "shiraz restaurant", + "byard art", + "sheep's green and lammas land park fen causeway", + "caffe uno", + "broughton house gallery", + "the ghandi", + "kambar", + "the copper kettle", + "yu garden", + "trinity college", + "birmingham new street train station", + "funky fun house", + "eraina", + "man on the 
moon concert hall", + "tandoori palace", + "magdalene college", + "worth house", + "mumford theatre", + "pizza express", + "ely train station", + "lan hong house", + "mahal of cambridge", + "pizza hut city centre", + "golden house", + "alpha-milton guest house", + "regency gallery", + "darrys cookhouse and wine shop", + "cityroomz", + "queen's college", + "restaurant one seven", + "arbury lodge guesthouse", + "the cambridge corn exchange", + "the oak bistro", + "cambridge county fair next to the city tourist museum", + "riverboat georgina", + "stansted airport train station", + "grafton hotel restaurant", + "thanh binh", + "autumn house", + "restaurant two two", + "jesus college", + "london kings cross train station", + "city centre north b and b", + "cherry hinton water play", + "travellers rest", + "the man on the moon", + "rosa's bed and breakfast", + "the good luck chinese food takeaway", + "curry king", + "cambridge book and print gallery", + "st johns chop house", + "cherry hinton hall and grounds", + "anatolia", + "saint barnabas press gallery", + "j restaurant", + "parkside pools", + "kymmoy", + "scott polar museum", + "cineworld cinema", + "michaelhouse cafe", + "cambr", + "restaurant alimentum", + "zizzi cambridge", + "not museum of archaeology and anthropologymentioned", + "gonville and caius college", + "lovell lodge", + "fitzwilliam museum", + "sleeperz hotel", + "the slug and lettuce", + "whipple museum of the history of science", + "broxbourne train station", + "home from home", + "museum of archaelogy and anthropology", + "tandori in cambridge", + "gastropub", + "the anatolia", + "kettle's yard", + "nirala", + "leicester train station", + "cambridge road church of christ", + "warkworth house", + "megna", + "grou", + "sala thong", + "gallery at twelve a high street", + "maharajah tandoori restaurant4", + "pizza hut fen ditton", + "museum of classical archaeology", + "the regent street city center", + "gandhi", + "emmanuel college", + "tranh binh", + "wankworth hotel", + "kambur", + "the missing sock", + "panahar", + "limehouse", + "finders corner newmarket road", + "people's portraits exhibition at girton college", + "station road", + "la mimosa", + "clowns cafe", + "bishops stortford train station", + "hotel", + "nil", + "kings college", + "restaurant", + "attraction", + "stevenage train station", + "the galleria", + "queens' college", + "great saint mary's church", + "theathre", + "ruskin gallery", + "saint barnabas", + "peterborough train station", + "cambridge artw2orks", + "acorn house", + "clare college", + "shiraz.", + "riverboat georginawd", + "mic", + "the gallery at twelve", + "the soul tree", + "finches" + ], + "taxi-departure": [ + "christ's college", + "kirkwood house", + "bridge guest house", + "cineworld cinema", + "gonville hotel", + "ashley hotel", + "arbury lodge guesthouse", + "worth house", + "city centre north b and b", + "the lensfield hotel", + "funky fun house", + "riverboat georgina", + "backstreet bistro", + "the junction", + "express by holiday inn cambridge", + "all saints church", + "sala thong", + "byard art", + "london kings cross train station", + "la raza", + "alexander bed and breakfast", + "cityroomz", + "hamilton lodge", + "alpha-milton guest house", + "el shaddai", + "restaurant alimentum", + "allenbell", + "churchill college", + "clare college", + "parkside pools", + "cambridge arts theatre", + "frankie and bennys", + "huntingdon marriott hotel", + "university arms hotel", + "acorn guest house", + "soul tree nightclub", + "ali baba", + 
"tandoori palace", + "saigon city", + "zizzi cambridge", + "wagamama", + "the cambridge belfry", + "milton country park", + "a and b guest house", + "city stop restaurant", + "grafton hotel restaurant", + "the man on the moon", + "king's college", + "great saint mary's church", + "the oak bistro", + "club salsa", + "loch fyne", + "the place", + "la mimosa", + "la margherita", + "restaurant two two", + "queens' college", + "wandlebury country park", + "kymmoy", + "magdalene college", + "royal standard", + "kings lynn train station", + "the fez club", + "jesus green outdoor pool", + "vue cinema", + "little saint mary's church", + "avalon", + "aylesbray lodge guest house", + "chinese city centre", + "thanh binh", + "yu garden", + "finches bed and breakfast", + "hakka", + "lovell lodge", + "kettle's yard", + "bloomsbury restaurant", + "museum of classical archaeology", + "cherry hinton water play", + "camboats", + "pizza hut cherry hinton", + "abbey pool and astroturf pitch", + "pizza hut", + "taj tandoori", + "kings hedges learner pool", + "nandos city centre", + "hobsons house", + "carolina bed and breakfast", + "the golden curry", + "rosa's bed and breakfast", + "the river bar steakhouse and grill", + "cafe jello gallery", + "the cambridge punter", + "the byard art museum", + "archway house", + "ugly duckling", + "cherry hinton hall and grounds", + "addenbrookes hospital", + "graffiti", + "cambridge artworks", + "broughton house gallery", + "the missing sock", + "autumn house", + "stazione restaurant and coffee bar", + "whale of a time", + "cambridge lodge restaurant", + "saint johns chop house", + "prezzo", + "rice boat", + "lan hong house", + "leverton house", + "kambar", + "hotel", + "museum of archaelogy and anthropology", + "saint catharine's college", + "la tasca", + "cambridge", + "downing college", + "cocum", + "mumford theatre", + "cote", + "golden wok", + "cambridge university botanic gardens", + "charlie chan", + "corpus christi", + "old schools", + "cambridge belfry", + "jinling noodle bar", + "birmingham new street train station", + "meze bar restaurant", + "warkworth house", + "pizza express", + "gonville and caius college", + "cambridge contemporary art", + "primavera", + "adc theatre", + "bedouin", + "trinity college", + "nusha", + "the good luck chinese food takeaway", + "travellers rest", + "restaurant one seven", + "pizza hut fen ditton", + "cambridge and county folk museum", + "curry prince", + "the nirala", + "pembroke college", + "golden house", + "castle galleries", + "the fitzwilliam museum", + "galleria", + "jesus college", + "curry garden", + "anatolia", + "the gandhi", + "nandos", + "st. 
john's college", + "clowns cafe", + "dojo noodle bar", + "panahar", + "whipple museum of the history of science", + "royal spice", + "clare hall", + "gallery at twelve a high street", + "pizza hut city centre", + "bishops stortford train station", + "ruskin gallery", + "sheep's green and lammas land park fen causeway", + "home from home", + "parkside police station", + "scudamores punting co", + "the cherry hinton village centre", + "sesame restaurant and bar", + "broxbourne train station", + "sidney sussex college", + "chiquito restaurant bar", + "cambridge train station", + "saffron brasserie", + "pizza express fen ditton", + "tenpin", + "scott polar museum", + "rice house", + "caffee uno", + "cotto", + "corpus cristi", + "the varsity restaurant", + "limehouse", + "ely train station", + "hotel du vin and bistro", + "meghna", + "the gardenia", + "emmanuel college", + "saint barnabas press gallery", + "rajmahal", + "lynne strover gallery", + "riverside brasserie", + "hughes hall", + "cherry hinton village center", + "the lucky star", + "bangkok city", + "kohinoor", + "don pasquale pizzeria", + "fitzbillies restaurant", + "acorn house", + "curry queen", + "ballare", + "the cambridge corn exchange", + "lens", + "cambridge museum of technology", + "the cambridge chop house", + "peterborough train station", + "london liverpool street", + "the copper kettle", + "holy trinity church", + "the slug and lettuce", + "williams art and antiques", + "the avalon", + "junction theatre", + "yippee noodle bar", + "da vinci pizzeria", + "maharajah tandoori restaurant", + "darrys cookhouse and wine shop", + "michaelhouse cafe", + "tang chinese", + "peking restaurant", + "saint john's college", + "aylesbray lodge", + "the alexander bed and breakfast", + "cambridge belfy", + "kings college", + "eraina", + "people's portraits exhibition at girton college", + "gonville", + "caffe uno", + "the cow pizza kitchen and bar", + "mahal of cambridge", + "lovell ldoge", + "alyesbray lodge hotel", + "cambridge chop house", + "j restaurant", + "172 chestertown road", + "little seoul", + "stevenage train station", + "downing street", + "gourmet burger kitchen", + "cambridge towninfo centre", + "citiroomz", + "de luca cucina and bar", + "london liverpool street train station", + "cinema", + "museum", + "shiraz restaurant", + "clair hall", + "sitar tandoori", + "cambridge book and print gallery", + "norwich train station", + "home", + "regency gallery", + "nstaot mentioned", + "new england", + "the hotpot", + "park", + "07:15", + "the allenbell" + ], + "restaurant-book time": [ + "19:45", + "13:45", + "10:15", + "19:15", + "11:30", + "10:30", + "18:45", + "13:30", + "15:00", + "11:45", + "12:00", + "15:15", + "16:45", + "15:45", + "17:15", + "19:30", + "14:00", + "10:45", + "17:30", + "16:30", + "17:00", + "12:30", + "18:15", + "18:00", + "20:15", + "12:45", + "14:15", + "13:15", + "10:00", + "16:00", + "19:00", + "12:15", + "11:00", + "11:15", + "15:30", + "14:30", + "18:30", + "14:45", + "17:45", + "09:15", + "09:45", + "16:15", + "13:00", + "20:00", + "21:00", + "20:30", + "20:45", + "1545", + "1745", + "09:00", + "not given", + "do n't care", + "13:10", + "21:45", + "08:45", + "09:30" + ], + "restaurant-book day": [ + "thursday", + "wednesday", + "friday", + "monday", + "sunday", + "saturday", + "tuesday", + "thur", + "not given" + ], + "restaurant-book people": [ + "2", + "3", + "1", + "5", + "6", + "4", + "7", + "8" + ], + "taxi-arrive by": [ + "19:15", + "15:45", + "17:15", + "17:30", + "17:00", + "12:30", + "19:30", + 
"12:45", + "18:00", + "13:30", + "13:15", + "17:45", + "11:15", + "14:30", + "14:45", + "11:45", + "09:15", + "16:45", + "16:15", + "01:30", + "12:15", + "06:45", + "01:00", + "09:30", + "12:00", + "11:30", + "18:30", + "15:30", + "18:45", + "14:00", + "21:00", + "15:00", + "08:15", + "10:15", + "19:00", + "20:15", + "11:00", + "15:15", + "05:15", + "16:30", + "21:30", + "08:00", + "20:00", + "20:30", + "20:45", + "10:45", + "13:00", + "09:00", + "21:45", + "16:00", + "19:45", + "16;15", + "6:00", + "10:30", + "23:15", + "13:45", + "14:15", + "18:15", + "04:00", + "24:45", + "07:00", + "04:15", + "23:00", + "3:00", + "02:45", + "10:00", + "07:15", + "24:15", + "04:45", + "01:15", + "06:30", + "23:30", + "06:15", + "21:15", + "07:30", + "02:00", + "22:30", + "24:00", + "02:15", + "1:00", + "03:15", + "5:30", + "05:00", + "1145", + "03:30", + "01:45", + "22:15", + "03:45", + "09:45", + "03:00", + "06:00", + "05:30", + "17:05", + "1730", + "02:30", + "22:00", + "04:30", + "1700", + "24:30", + "15:23", + "08:30" + ], + "bus-departure": [ + "cambridge" + ], + "bus-destination": [ + "london kings cross", + "bishops stortford", + "cambridge", + "kohinoor" + ], + "bus-leaveAt": [ + "21:45" + ], + "bus-day": [ + "wednesday" + ] +}