"""Convert the original MultiWOZ 2.1 release into the unified dataset format.

Running this module as a script reads ``MultiWOZ_2.1/`` (extracting
``MultiWOZ_2.1.zip`` first if the directory is missing), writes
``dummy_data.json`` and ``data.zip`` into the current directory and then
removes both the extracted original data and the temporary ``data/`` folder.
"""
import copy
import json
import os
import re
from collections import Counter
from pprint import pprint
from shutil import copy2, rmtree
from zipfile import ZipFile, ZIP_DEFLATED

from nltk.tokenize import TreebankWordTokenizer, PunktSentenceTokenizer
from tqdm import tqdm

from data.unified_datasets.multiwoz21.booking_remapper import BookingActRemapper

# Value sets shared by several categorical slots.
_AREAS = ("centre", "east", "north", "south", "west")
_DAYS = ("monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday")
_YES_NO_FREE = ("free", "no", "yes")


def _slot(description, possible_values=None):
    """Build one ontology slot entry.

    A slot is categorical exactly when ``possible_values`` is given; every
    entry carries all three keys so the dumped ontology has a uniform schema.
    Each call returns a fresh list, so entries never share a values list.
    """
    return {
        "description": description,
        "is_categorical": possible_values is not None,
        "possible_values": list(possible_values) if possible_values is not None else [],
    }


init_ontology = {
    "domains": {  # descriptions are adapted from multiwoz22, but is_categorical may be different
        "attraction": {
            "description": "find an attraction",
            "slots": {
                "area": _slot("area to search for attractions", _AREAS),
                "name": _slot("name of the attraction"),
                "type": _slot("type of the attraction", [
                    "architecture", "boat", "cinema", "college", "concerthall",
                    "entertainment", "museum", "multiple sports", "nightclub",
                    "park", "swimmingpool", "theatre",
                ]),
                "entrance fee": _slot("how much is the entrance fee"),
                "open hours": _slot("open hours of the attraction"),
                "address": _slot("address of the attraction"),
                "phone": _slot("phone number of the attraction"),
                "postcode": _slot("postcode of the attraction"),
                "choice": _slot("number of attractions that meet the requirement"),
            },
        },
        "hotel": {
            "description": "find and book a hotel",
            "slots": {
                "internet": _slot("whether the hotel has internet", _YES_NO_FREE),
                "parking": _slot("whether the hotel has parking", _YES_NO_FREE),
                "area": _slot("area or place of the hotel", _AREAS),
                "stars": _slot("star rating of the hotel", ["0", "1", "2", "3", "4", "5"]),
                "price range": _slot("price budget of the hotel", ["expensive", "cheap", "moderate"]),
                # The slot enumerates its values, so it is treated as categorical
                # like every other slot that lists possible values.
                "type": _slot("what is the type of the hotel", ["guesthouse", "hotel"]),
                "name": _slot("name of the hotel"),
                "book people": _slot("number of people for the hotel booking"),
                "book stay": _slot("length of stay at the hotel"),
                "book day": _slot("day of the hotel booking", _DAYS),
                "phone": _slot("phone number of the hotel"),
                "postcode": _slot("postcode of the hotel"),
                "address": _slot("address of the hotel"),
                "ref": _slot("reference number of the hotel booking"),
                "choice": _slot("number of hotels that meet the requirement"),
            },
        },
        "taxi": {
            "description": "rent taxi to travel",
            "slots": {
                "destination": _slot("destination of taxi"),
                "departure": _slot("departure location of taxi"),
                "leave at": _slot("leaving time of taxi"),
                "arrive by": _slot("arrival time of taxi"),
                "phone": _slot("phone number of the taxi"),
                "type": _slot("car type of the taxi"),
            },
        },
        "restaurant": {
            "description": "find and book a restaurant",
            "slots": {
                "price range": _slot("price budget for the restaurant", ["cheap", "expensive", "moderate"]),
                "area": _slot("area or place of the restaurant", _AREAS),
                "food": _slot("the cuisine of the restaurant"),
                "name": _slot("name of the restaurant"),
                "address": _slot("address of the restaurant"),
                "postcode": _slot("postcode of the restaurant"),
                "phone": _slot("phone number of the restaurant"),
                "book people": _slot("number of people for the restaurant booking"),
                "book time": _slot("time of the restaurant booking"),
                "book day": _slot("day of the restaurant booking", _DAYS),
                "ref": _slot("reference number of the restaurant booking"),
                "choice": _slot("number of restaurants that meet the requirement"),
            },
        },
        "train": {
            "description": "find a train to travel",
            "slots": {
                "destination": _slot("destination of the train"),
                "arrive by": _slot("arrival time of the train"),
                "departure": _slot("departure location of the train"),
                "leave at": _slot("leaving time for the train"),
                "duration": _slot("duration of the travel"),
                "book people": _slot("number of people booking for train"),
                "day": _slot("day of the train", _DAYS),
                "ref": _slot("reference number of the train booking"),
                "price": _slot("price of the train ticket"),
                "train id": _slot("id of the train"),
                "choice": _slot("number of trains that meet the requirement"),
            },
        },
        "police": {
            "description": "find a police station for help",
            "slots": {
                "name": _slot("name of the police station"),
                "address": _slot("address of the police station"),
                "postcode": _slot("postcode of the police station"),
                "phone": _slot("phone number of the police station"),
            },
        },
        "hospital": {
            "description": "find a hospital for help",
            "slots": {
                "department": _slot("specific department of the hospital"),
                "address": _slot("address of the hospital"),
                "phone": _slot("phone number of the hospital"),
                "postcode": _slot("postcode of the hospital"),
            },
        },
        "booking": {
            "description": "booking for taxi, restaurant, hotel, train, etc.",
            "slots": {
                "day": _slot("day of the booking", _DAYS),
                "time": _slot("time of the booking"),
                "book people": _slot("number of people for the booking"),
                "book stay": _slot("length of stay at the hotel"),
                "name": _slot("name of the booked entity"),
                "ref": _slot("reference number of the booking"),
            },
        },
        "general": {
            "description": "general domain without slots",
            "slots": {},
        },
    },
    "intents": {
        "inform": {"description": "inform the value of a slot"},
        "request": {"description": "ask for the value of a slot"},
        "nobook": {"description": "inform the user that the booking is failed"},
        "reqmore": {"description": "ask the user for more instructions"},
        "book": {"description": "book something for the user"},
        "bye": {"description": "say goodbye to the user and end the conversation"},
        "thank": {"description": "thanks for the help"},
        "welcome": {"description": "you're welcome"},
        "greet": {"description": "express greeting"},
        "recommend": {"description": "recommend a choice to the user"},
        "select": {"description": "provide several choices for the user"},
        "offerbook": {"description": "ask the user if he or she needs booking"},
        "offerbooked": {"description": "provide information about the booking"},
        "nooffer": {"description": "inform the user that there is no result satisfies user requirements"},
    },
    "binary_dialogue_acts": set(),  # filled from data by convert_da()
    "state": {
        "attraction": {
            "type": "",
            "name": "",
            "area": "",
        },
        "hotel": {
            "name": "",
            "area": "",
            "parking": "",
            "price range": "",
            "stars": "",
            "internet": "",
            "type": "",
            "book stay": "",
            "book day": "",
            "book people": "",
        },
        "restaurant": {
            "food": "",
            "price range": "",
            "name": "",
            "area": "",
            "book time": "",
            "book day": "",
            "book people": "",
        },
        "taxi": {
            "leave at": "",
            "destination": "",
            "departure": "",
            "arrive by": "",
        },
        "train": {
            "leave at": "",
            "destination": "",
            "day": "",
            "arrive by": "",
            "departure": "",
            "book people": "",
        },
    },
}

# Original slot name -> ontology slot name. String values apply to every
# domain; the nested dicts (keyed by domain) hold domain-specific renames and
# are consulted only when the global table has no entry for the slot.
slot_name_map = {
    'addr': "address",
    'post': "postcode",
    'pricerange': "price range",
    'arrive': "arrive by",
    'arriveby': "arrive by",
    'leave': "leave at",
    'leaveat': "leave at",
    'depart': "departure",
    'dest': "destination",
    'fee': "entrance fee",
    'open': 'open hours',
    'car': "type",
    'car type': "type",
    'ticket': 'price',
    'trainid': 'train id',
    'id': 'train id',
    'people': 'book people',
    'stay': 'book stay',
    'none': '',
    'attraction': {
        'price': 'entrance fee'
    },
    'hospital': {},
    'hotel': {
        'day': 'book day',
        'price': "price range"
    },
    'restaurant': {
        'day': 'book day',
        'time': 'book time',
        'price': "price range"
    },
    'taxi': {},
    'train': {
        'day': 'day',
        'time': "duration"
    },
    'police': {},
    'booking': {}
}

digit2word = {
    '0': 'zero', '1': 'one', '2': 'two', '3': 'three', '4': 'four', '5': 'five',
    '6': 'six', '7': 'seven', '8': 'eight', '9': 'nine', '10': 'ten'
}

# Statistics about how non-categorical values were aligned to the utterance.
cnt_domain_slot = Counter()

# (pattern, replacement) pairs applied in order to every utterance.
_CONTRACTION_FIXES = (
    (" Im ", " I'm "),
    (" im ", " i'm "),
    (r"^Im ", "I'm "),
    (r"^im ", "i'm "),
    ("theres", "there's"),
    ("dont", "don't"),
    ("whats", "what's"),
    ('thats', "that's"),
)


def normalize_domain_slot_value(domain, slot, value):
    """Map an original (domain, slot, value) triple onto the ontology.

    :param domain: original domain name (any case)
    :param slot: original slot name (any case)
    :param value: original value; surrounding whitespace is stripped
    :return: ``(domain, slot, value)`` with lower-cased domain, the ontology
        slot name (``''`` for the original ``none`` slot) and the value, where
        "don't care" spellings become ``'dontcare'`` and ``'?'``, ``'none'``,
        ``'not mentioned'`` become ``''``
    :raises Exception: if the domain or the slot is not in the ontology
    """
    domain = domain.lower()
    slot = slot.lower()
    value = value.strip()
    if value in ['do nt care', "do n't care"]:
        value = 'dontcare'
    if value in ['?', 'none', 'not mentioned']:
        value = ""

    if domain not in init_ontology['domains']:
        raise Exception(f'{domain} not in ontology')
    domain_slots = init_ontology['domains'][domain]['slots']
    if slot not in domain_slots:
        # global renames take precedence over the domain-specific ones
        if slot in slot_name_map:
            slot = slot_name_map[slot]
        elif slot in slot_name_map[domain]:
            slot = slot_name_map[domain][slot]
        else:
            raise Exception(f'{domain}-{slot} not in ontology')
    assert slot == '' or slot in domain_slots, f'{(domain, slot, value)} not in ontology'
    return domain, slot, value


def _find_char_span(utt, value, span, tokens, token_spans):
    """Locate ``value`` (already lower-cased) in ``utt`` near the annotated word span.

    Returns ``(start_char, end_char)`` or ``None`` when no span is found.
    Updates ``cnt_domain_slot`` with the kind of match that succeeded.
    """
    if not (span and tokens and span[0] <= span[1]):
        return None

    # use original span annotation, but tokenizations are different
    start_word, end_word = span
    if end_word >= len(tokens):
        # due to different tokenization, sometimes will out of index
        delta = end_word - len(tokens) + 1
        # clamp so that a span longer than the utterance cannot wrap around
        # to the end of token_spans through a negative index
        start_word = max(start_word - delta, 0)
        end_word -= delta
    start_char, end_char = token_spans[start_word][0], token_spans[end_word][1]
    value_span = utt[start_char:end_char].lower()

    if value_span == value:
        cnt_domain_slot['span match'] += 1
        return start_char, end_char
    if value.isdigit() and value in digit2word and digit2word[value] == value_span:
        # !!!CHANGE VALUE: value is digit but value span is word
        cnt_domain_slot['digit value match'] += 1
        return start_char, end_char
    if ''.join(value.split()) == ''.join(value_span.split()):
        # !!!CHANGE VALUE: equal when remove blank
        cnt_domain_slot['remove blank'] += 1
        return start_char, end_char
    if value in value_span:
        # the value is a substring of the annotated span: shrink the span
        start_char += value_span.index(value)
        end_char = start_char + len(value)
        assert utt[start_char:end_char].lower() == value, f'{[value, utt[start_char:end_char], utt]}'
        cnt_domain_slot['value in span'] += 1
        return start_char, end_char
    if ':' in value and value == '0' + value_span:
        # !!!CHANGE VALUE: time x:xx == 0x:xx
        cnt_domain_slot['x:xx == 0x:xx'] += 1
        return start_char, end_char

    # span mismatch, search near 1-2 words
    for window in range(1, 3):
        start = max(0, start_word - window)
        end = min(len(token_spans) - 1, end_word + window)
        large_span = utt[token_spans[start][0]:token_spans[end][1]].lower()
        if value in large_span:
            start_char = token_spans[start][0] + large_span.index(value)
            end_char = start_char + len(value)
            assert utt[start_char:end_char].lower() == value, f'{[value, utt[start_char:end_char], utt]}'
            cnt_domain_slot[f'window={window}'] += 1
            return start_char, end_char
    # still not found
    return None


def convert_da(da_dict, utt, sent_tokenizer, word_tokenizer):
    """Convert MultiWOZ dialogue acts to the unified format and update the ontology.

    :param da_dict: dict[(intent, domain, slot, value)] = [word_start, word_end]
        (an empty list when the act has no span annotation)
    :param utt: user or system utterance
    :param sent_tokenizer: object providing ``tokenize`` and ``span_tokenize``,
        used to split ``utt`` into sentences
    :param word_tokenizer: object providing ``tokenize`` and ``span_tokenize``,
        used to split each sentence into tokens
    :return: dict with the keys ``'categorical'``, ``'non-categorical'`` and
        ``'binary'``, each a list of act dicts. Non-categorical acts whose
        value could be located get ``'start'``/``'end'`` character offsets and
        their ``'value'`` replaced by the exact utterance text.

    Side effects: binary acts are added to
    ``init_ontology['binary_dialogue_acts']`` and ``cnt_domain_slot`` is updated.
    """
    converted_da = {
        'categorical': [],
        'non-categorical': [],
        'binary': []
    }
    sentences = sent_tokenizer.tokenize(utt)
    sent_spans = sent_tokenizer.span_tokenize(utt)
    tokens = [token for sent in sentences for token in word_tokenizer.tokenize(sent)]
    # word spans are relative to their sentence; shift them to utterance offsets
    token_spans = [
        (sent_span[0] + token_span[0], sent_span[0] + token_span[1])
        for sent, sent_span in zip(sentences, sent_spans)
        for token_span in word_tokenizer.span_tokenize(sent)
    ]

    for (intent, domain, slot, value), span in da_dict.items():
        act = {
            'intent': intent,
            'domain': domain,
            'slot': slot,
            'value': value
        }
        if intent == 'request' or slot == '' or value == '':
            # binary dialog acts
            init_ontology['binary_dialogue_acts'].add((intent, domain, slot, value,))
            converted_da['binary'].append(act)
            continue
        if init_ontology['domains'][domain]['slots'][slot]['is_categorical']:
            # categorical dialog acts
            converted_da['categorical'].append(act)
            continue

        # non-categorical dialog acts
        converted_da['non-categorical'].append(act)
        # correct some value and try to give char level span
        char_span = _find_char_span(utt, value.lower(), span, tokens, token_spans)
        if char_span is None:
            cnt_domain_slot['no span'] += 1
            continue
        start_char, end_char = char_span
        act['value'] = utt[start_char:end_char]
        act['start'] = start_char
        act['end'] = end_char
        cnt_domain_slot['have span'] += 1

    return converted_da


def _fix_contractions(text):
    """Correct some grammar errors in the text, mainly following `tokenization.md` in MultiWOZ_2.1."""
    for pattern, replacement in _CONTRACTION_FIXES:
        text = re.sub(pattern, replacement, text)
    return text


def _read_id_list(path):
    """Return the set of whitespace-separated dialogue ids stored in ``path``."""
    with open(path, encoding='utf-8') as f:
        return set(f.read().split())


def _extract_goal(ori_goal):
    """Build the unified user goal and the list of domains it involves."""
    cur_domains = []
    goal = {
        'description': '. '.join(ori_goal['message']),
        'inform': {},
        'request': {}
    }
    for k, v in ori_goal.items():
        if len(v) != 0 and k in init_ontology['domains']:
            cur_domains.append(k)
            goal['inform'][k] = {}
            goal['request'][k] = {}
            for attr in ['fail_info', 'info', 'fail_book', 'book']:
                if attr in v:
                    for slot, value in v[attr].items():
                        if 'invalid' in slot:
                            continue
                        domain, slot, value = normalize_domain_slot_value(k, slot, value)
                        if slot in goal['inform'][domain]:
                            # several constraints for one slot are joined with '|'
                            goal['inform'][domain][slot] += '|' + value
                        else:
                            goal['inform'][domain][slot] = value
            if 'reqt' in v:
                for slot in v['reqt']:
                    domain, slot, _ = normalize_domain_slot_value(k, slot, '')
                    goal['request'][domain][slot] = ''
    return goal, cur_domains


def _extract_state(turn_state):
    """Convert the ``metadata`` of a system turn into (belief state, booked info)."""
    cur_state = copy.deepcopy(init_ontology['state'])
    booked = {}
    for domain in turn_state:
        if domain not in cur_state:
            continue
        for subdomain in ['semi', 'book']:
            for slot, value in turn_state[domain][subdomain].items():
                if slot == 'ticket':
                    continue
                if slot == 'booked':
                    assert domain in init_ontology['domains']
                    booked[domain] = value
                    continue
                _, slot, value = normalize_domain_slot_value(domain, slot, value)
                cur_state[domain][slot] = value
    return cur_state, booked


def preprocess():
    """Convert MultiWOZ 2.1 in the current directory to the unified format.

    Writes ``dummy_data.json`` (first 10 dialogues) and ``data.zip``
    (``dialogues.json``, ``ontology.json`` and the copied files whose name
    contains ``db``), then deletes ``MultiWOZ_2.1/`` and the temporary
    ``data/`` directory.

    :return: ``(dialogues, init_ontology)``; ``init_ontology`` is the module
        level dict, whose ``'binary_dialogue_acts'`` entry has been replaced
        by a sorted list of act dicts
    :raises FileNotFoundError: if neither ``MultiWOZ_2.1/`` nor
        ``MultiWOZ_2.1.zip`` exists
    """
    original_data_dir = 'MultiWOZ_2.1'
    new_data_dir = 'data'

    if not os.path.exists(original_data_dir):
        original_data_zip = 'MultiWOZ_2.1.zip'
        if not os.path.exists(original_data_zip):
            raise FileNotFoundError(f'cannot find original data {original_data_zip} in multiwoz21/, should manually download MultiWOZ_2.1.zip from https://github.com/budzianowski/multiwoz/blob/master/data/MultiWOZ_2.1.zip')
        with ZipFile(original_data_zip) as archive:
            archive.extractall()

    os.makedirs(new_data_dir, exist_ok=True)
    for filename in os.listdir(original_data_dir):
        if 'db' in filename:
            copy2(f'{original_data_dir}/{filename}', new_data_dir)

    with open(f'{original_data_dir}/data.json', encoding='utf-8') as f:
        original_data = json.load(f)
    val_list = _read_id_list(f'{original_data_dir}/valListFile.txt')
    test_list = _read_id_list(f'{original_data_dir}/testListFile.txt')

    dataset = 'multiwoz21'
    splits = ['train', 'validation', 'test']
    dialogues_by_split = {split: [] for split in splits}
    sent_tokenizer = PunktSentenceTokenizer()
    word_tokenizer = TreebankWordTokenizer()
    booking_remapper = BookingActRemapper()

    for ori_dialog_id, ori_dialog in tqdm(original_data.items()):
        if ori_dialog_id in val_list:
            split = 'validation'
        elif ori_dialog_id in test_list:
            split = 'test'
        else:
            split = 'train'
        dialogue_id = f'{dataset}-{split}-{len(dialogues_by_split[split])}'

        # get user goal and involved domains
        goal, cur_domains = _extract_goal(ori_dialog['goal'])

        dialogue = {
            'dataset': dataset,
            'data_split': split,
            'dialogue_id': dialogue_id,
            'original_id': ori_dialog_id,
            'domains': cur_domains,  # will be updated by dialog_acts and state
            'goal': goal,
            'turns': []
        }

        booking_remapper.reset()

        for turn_id, turn in enumerate(ori_dialog['log']):
            utt = _fix_contractions(turn['text'])
            # turns alternate, starting with the user
            speaker = 'user' if turn_id % 2 == 0 else 'system'

            das = turn.get('dialog_act', [])
            spans = turn.get('span_info', [])

            if speaker == 'system':
                das, spans = booking_remapper.remap(turn_id, ori_dialog['log'])

            # transform DA
            da_dict = {}
            for domain_intent in das:
                domain, intent = domain_intent.lower().split('-')
                assert intent in init_ontology['intents'], f'{ori_dialog_id}:{turn_id}:da\t{intent} not in ontology'
                for ori_slot, value in das[domain_intent]:
                    domain, slot, value = normalize_domain_slot_value(domain, ori_slot, value)
                    if domain not in cur_domains:
                        # update original cur_domains
                        cur_domains.append(domain)
                    da_dict[(intent, domain, slot, value,)] = []

            for span in spans:
                domain_intent, ori_slot, value, start_word, end_word = span
                domain, intent = domain_intent.lower().split('-')
                domain, slot, value = normalize_domain_slot_value(domain, ori_slot, value)
                assert (intent, domain, slot, value,) in da_dict
                da_dict[(intent, domain, slot, value,)] = [start_word, end_word]

            # will also update ontology
            dialogue_acts = convert_da(da_dict, utt, sent_tokenizer, word_tokenizer)

            dialogue['turns'].append({
                'speaker': speaker,
                'utterance': utt,
                'utt_idx': turn_id,
                'dialogue_acts': dialogue_acts,
            })

            if speaker == 'system':
                # add state to last user turn
                # add empty db_results
                cur_state, booked = _extract_state(turn['metadata'])
                dialogue['turns'][-2]['state'] = cur_state
                dialogue['turns'][-1]['db_results'] = {}
                dialogue['turns'][-1]['booked'] = booked
        dialogues_by_split[split].append(dialogue)
    # pprint(cnt_domain_slot.most_common())

    dialogues = []
    for split in splits:
        dialogues += dialogues_by_split[split]
    # a set is not JSON serializable: dump the binary acts as a sorted list of dicts
    init_ontology['binary_dialogue_acts'] = [
        {'intent': bda[0], 'domain': bda[1], 'slot': bda[2], 'value': bda[3]}
        for bda in sorted(init_ontology['binary_dialogue_acts'])
    ]

    # every output file is closed (and therefore flushed) before it is zipped
    outputs = (
        ('dummy_data.json', dialogues[:10]),
        (f'{new_data_dir}/dialogues.json', dialogues),
        (f'{new_data_dir}/ontology.json', init_ontology),
    )
    for path, payload in outputs:
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(payload, f, indent=2, ensure_ascii=False)

    with ZipFile('data.zip', 'w', ZIP_DEFLATED) as zf:
        for filename in os.listdir(new_data_dir):
            zf.write(f'{new_data_dir}/{filename}')
    rmtree(original_data_dir)
    rmtree(new_data_dir)
    return dialogues, init_ontology


if __name__ == '__main__':
    preprocess()