Select Git revision
DataValidationTest.mch
-
Michael Leuschel authored
Signed-off-by:
Michael Leuschel <leuschel@uni-duesseldorf.de>
Michael Leuschel authoredSigned-off-by:
Michael Leuschel <leuschel@uni-duesseldorf.de>
Code owners
Assign users and groups as approvers for specific file changes. Learn more.
preprocess.py 35.00 KiB
import copy
import re
from zipfile import ZipFile, ZIP_DEFLATED
from shutil import copy2, rmtree
import json
import os
from tqdm import tqdm
from collections import Counter
from pprint import pprint
from nltk.tokenize import TreebankWordTokenizer, PunktSentenceTokenizer
from data.unified_datasets.multiwoz21.booking_remapper import BookingActRemapper
# Value lists shared by several slots; `_slot` copies them so no two slots alias one list.
_AREAS = ("centre", "east", "north", "south", "west")
_WEEKDAYS = ("monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday")
_FREE_NO_YES = ("free", "no", "yes")


def _slot(description, is_categorical=False, possible_values=()):
    """Build one slot entry of the ontology.

    Every slot gets the same three keys (in this order), so code that reads
    ``possible_values`` never has to special-case a slot that lacks it.

    :param description: human readable description of the slot
    :param is_categorical: whether the slot takes values from a closed set
    :param possible_values: iterable of allowed values; copied into a new list
    :return: dict with keys ``description``, ``is_categorical``, ``possible_values``
    """
    return {
        "description": description,
        "is_categorical": is_categorical,
        "possible_values": list(possible_values),
    }


# Initial ontology: domains with their slots, intents, the (initially empty) set of
# binary dialogue acts that is filled while converting the data, and the empty
# belief-state template that is deep-copied for every system turn.
init_ontology = {
    "domains": {  # descriptions are adapted from multiwoz22, but is_categorical may be different
        "attraction": {
            "description": "find an attraction",
            "slots": {
                "area": _slot("area to search for attractions", True, _AREAS),
                "name": _slot("name of the attraction"),
                "type": _slot("type of the attraction", True, (
                    "architecture",
                    "boat",
                    "cinema",
                    "college",
                    "concerthall",
                    "entertainment",
                    "museum",
                    "multiple sports",
                    "nightclub",
                    "park",
                    "swimmingpool",
                    "theatre",
                )),
                "entrance fee": _slot("how much is the entrance fee"),
                "open hours": _slot("open hours of the attraction"),
                "address": _slot("address of the attraction"),
                "phone": _slot("phone number of the attraction"),
                "postcode": _slot("postcode of the attraction"),
                "choice": _slot("number of attractions that meet the requirement"),
            },
        },
        "hotel": {
            "description": "find and book a hotel",
            "slots": {
                "internet": _slot("whether the hotel has internet", True, _FREE_NO_YES),
                "parking": _slot("whether the hotel has parking", True, _FREE_NO_YES),
                "area": _slot("area or place of the hotel", True, _AREAS),
                "stars": _slot("star rating of the hotel", True, ("0", "1", "2", "3", "4", "5")),
                "price range": _slot("price budget of the hotel", True, ("expensive", "cheap", "moderate")),
                # NOTE(review): non-categorical although possible values are listed -- confirm this is intended
                "type": _slot("what is the type of the hotel", False, ("guesthouse", "hotel")),
                "name": _slot("name of the hotel"),
                "book people": _slot("number of people for the hotel booking"),
                "book stay": _slot("length of stay at the hotel"),
                "book day": _slot("day of the hotel booking", True, _WEEKDAYS),
                "phone": _slot("phone number of the hotel"),
                "postcode": _slot("postcode of the hotel"),
                "address": _slot("address of the hotel"),
                "ref": _slot("reference number of the hotel booking"),
                "choice": _slot("number of hotels that meet the requirement"),
            },
        },
        "taxi": {
            "description": "rent taxi to travel",
            "slots": {
                "destination": _slot("destination of taxi"),
                "departure": _slot("departure location of taxi"),
                "leave at": _slot("leaving time of taxi"),
                "arrive by": _slot("arrival time of taxi"),
                "phone": _slot("phone number of the taxi"),
                "type": _slot("car type of the taxi"),
            },
        },
        "restaurant": {
            "description": "find and book a restaurant",
            "slots": {
                "price range": _slot("price budget for the restaurant", True, ("cheap", "expensive", "moderate")),
                "area": _slot("area or place of the restaurant", True, _AREAS),
                "food": _slot("the cuisine of the restaurant"),
                "name": _slot("name of the restaurant"),
                "address": _slot("address of the restaurant"),
                "postcode": _slot("postcode of the restaurant"),
                "phone": _slot("phone number of the restaurant"),
                "book people": _slot("number of people for the restaurant booking"),
                "book time": _slot("time of the restaurant booking"),
                "book day": _slot("day of the restaurant booking", True, _WEEKDAYS),
                "ref": _slot("reference number of the restaurant booking"),
                "choice": _slot("number of restaurants that meet the requirement"),
            },
        },
        "train": {
            "description": "find a train to travel",
            "slots": {
                "destination": _slot("destination of the train"),
                "arrive by": _slot("arrival time of the train"),
                "departure": _slot("departure location of the train"),
                "leave at": _slot("leaving time for the train"),
                "duration": _slot("duration of the travel"),
                "book people": _slot("number of people booking for train"),
                "day": _slot("day of the train", True, _WEEKDAYS),
                "ref": _slot("reference number of the train booking"),
                "price": _slot("price of the train ticket"),
                "train id": _slot("id of the train"),
                "choice": _slot("number of trains that meet the requirement"),
            },
        },
        "police": {
            "description": "find a police station for help",
            "slots": {
                "name": _slot("name of the police station"),
                "address": _slot("address of the police station"),
                "postcode": _slot("postcode of the police station"),
                "phone": _slot("phone number of the police station"),
            },
        },
        "hospital": {
            "description": "find a hospital for help",
            "slots": {
                "department": _slot("specific department of the hospital"),
                "address": _slot("address of the hospital"),
                "phone": _slot("phone number of the hospital"),
                "postcode": _slot("postcode of the hospital"),
            },
        },
        "booking": {
            "description": "booking for taxi, restaurant, hotel, train, etc.",
            "slots": {
                "day": _slot("day of the booking", True, _WEEKDAYS),
                "time": _slot("time of the booking"),
                "book people": _slot("number of people for the booking"),
                "book stay": _slot("length of stay at the hotel"),
                "name": _slot("name of the booked entity"),
                "ref": _slot("reference number of the booking"),
            },
        },
        "general": {
            "description": "general domain without slots",
            "slots": {},
        },
    },
    "intents": {
        "inform": {"description": "inform the value of a slot"},
        "request": {"description": "ask for the value of a slot"},
        "nobook": {"description": "inform the user that the booking is failed"},
        "reqmore": {"description": "ask the user for more instructions"},
        "book": {"description": "book something for the user"},
        "bye": {"description": "say goodbye to the user and end the conversation"},
        "thank": {"description": "thanks for the help"},
        "welcome": {"description": "you're welcome"},
        "greet": {"description": "express greeting"},
        "recommend": {"description": "recommend a choice to the user"},
        "select": {"description": "provide several choices for the user"},
        "offerbook": {"description": "ask the user if he or she needs booking"},
        "offerbooked": {"description": "provide information about the booking"},
        "nooffer": {"description": "inform the user that there is no result satisfies user requirements"},
    },
    "binary_dialogue_acts": set(),  # from data
    "state": {
        "attraction": {
            "type": "",
            "name": "",
            "area": "",
        },
        "hotel": {
            "name": "",
            "area": "",
            "parking": "",
            "price range": "",
            "stars": "",
            "internet": "",
            "type": "",
            "book stay": "",
            "book day": "",
            "book people": "",
        },
        "restaurant": {
            "food": "",
            "price range": "",
            "name": "",
            "area": "",
            "book time": "",
            "book day": "",
            "book people": "",
        },
        "taxi": {
            "leave at": "",
            "destination": "",
            "departure": "",
            "arrive by": "",
        },
        "train": {
            "leave at": "",
            "destination": "",
            "day": "",
            "arrive by": "",
            "departure": "",
            "book people": "",
        },
    },
}
# Aliases from raw MultiWOZ slot names to ontology slot names.
# String values are aliases that apply in every domain; dict values (keyed by
# domain name) hold aliases that only apply inside that domain.
slot_name_map = {
    # --- aliases shared by all domains ---
    "addr": "address",
    "post": "postcode",
    "pricerange": "price range",
    "arrive": "arrive by",
    "arriveby": "arrive by",
    "leave": "leave at",
    "leaveat": "leave at",
    "depart": "departure",
    "dest": "destination",
    "fee": "entrance fee",
    "open": "open hours",
    "car": "type",
    "car type": "type",
    "ticket": "price",
    "trainid": "train id",
    "id": "train id",
    "people": "book people",
    "stay": "book stay",
    "none": "",
    # --- aliases scoped to a single domain ---
    "attraction": {
        "price": "entrance fee",
    },
    "hospital": {},
    "hotel": {
        "day": "book day",
        "price": "price range",
    },
    "restaurant": {
        "day": "book day",
        "time": "book time",
        "price": "price range",
    },
    "taxi": {},
    "train": {
        "day": "day",
        "time": "duration",
    },
    "police": {},
    "booking": {},
}
# Spelled-out form of the numbers 0-10, keyed by their decimal string.
digit2word = {
    str(number): word
    for number, word in enumerate((
        'zero', 'one', 'two', 'three', 'four', 'five',
        'six', 'seven', 'eight', 'nine', 'ten',
    ))
}
# Running statistics about how dialogue-act values were aligned to the utterance.
cnt_domain_slot = Counter()
def normalize_domain_slot_value(domain, slot, value):
    """Normalize a raw (domain, slot, value) triple to the ontology's naming.

    The domain and slot are lower-cased and the value is stripped. The values
    ``'do nt care'`` / ``"do n't care"`` become ``'dontcare'``; ``'?'``,
    ``'none'`` and ``'not mentioned'`` become the empty string. A slot that is
    not a slot of the domain is translated through ``slot_name_map``: first
    through the aliases shared by all domains, then through the aliases scoped
    to this domain.

    :param domain: domain name (any case); must be a key of ``init_ontology['domains']``
    :param slot: raw slot name (any case)
    :param value: raw value string
    :return: tuple ``(domain, slot, value)``; ``slot`` may be ``''`` (alias of ``'none'``)
    :raises Exception: if the domain is unknown or the slot cannot be mapped
    :raises AssertionError: if the mapped slot is not a slot of the domain
    """
    domain = domain.lower()
    slot = slot.lower()
    value = value.strip()
    if value in ['do nt care', "do n't care"]:
        value = 'dontcare'
    if value in ['?', 'none', 'not mentioned']:
        value = ""
    if domain not in init_ontology['domains']:
        raise Exception(f'{domain} not in ontology')
    domain_slots = init_ontology['domains'][domain]['slots']
    if slot not in domain_slots:
        shared_alias = slot_name_map.get(slot)
        # not every ontology domain has its own alias table (e.g. "general")
        domain_aliases = slot_name_map.get(domain)
        if isinstance(shared_alias, str):
            # only string entries are aliases; dict entries are the per-domain
            # tables and must not be mistaken for a slot name
            slot = shared_alias
        elif isinstance(domain_aliases, dict) and slot in domain_aliases:
            slot = domain_aliases[slot]
        else:
            raise Exception(f'{domain}-{slot} not in ontology')
    assert slot == '' or slot in domain_slots, f'{(domain, slot, value)} not in ontology'
    return domain, slot, value
def convert_da(da_dict, utt, sent_tokenizer, word_tokenizer):
    """Convert multiwoz dialogue acts to the required format and update the ontology.

    Each act is put into exactly one of three lists:

    * ``binary``: intent is ``request`` or the slot or value is empty; the act is
      also added to ``init_ontology['binary_dialogue_acts']``.
    * ``categorical``: the slot is marked ``is_categorical`` in the ontology.
    * ``non-categorical``: everything else. For these the function tries to locate
      the value in ``utt``; on success the item's ``value`` is replaced by the exact
      utterance text and ``start`` / ``end`` character offsets are added.

    Alignment statistics are accumulated in the module-level ``cnt_domain_slot``.

    :param da_dict: dict[(intent, domain, slot, value)] = [word_start, word_end]
        (an empty list when there is no span annotation)
    :param utt: user or system utt
    :param sent_tokenizer: object providing ``tokenize`` and ``span_tokenize`` for sentences
    :param word_tokenizer: object providing ``span_tokenize`` for the words of a sentence
    :return: dict with the keys ``categorical``, ``non-categorical`` and ``binary``
    """
    converted_da = {
        'categorical': [],
        'non-categorical': [],
        'binary': []
    }
    sentences = sent_tokenizer.tokenize(utt)
    sent_spans = sent_tokenizer.span_tokenize(utt)
    # character span of every word token, shifted from sentence-relative to utterance-relative
    token_spans = [
        (sent_span[0] + token_span[0], sent_span[0] + token_span[1])
        for sent, sent_span in zip(sentences, sent_spans)
        for token_span in word_tokenizer.span_tokenize(sent)
    ]
    num_tokens = len(token_spans)

    for (intent, domain, slot, value), span in da_dict.items():
        if intent == 'request' or slot == '' or value == '':
            # binary dialog acts
            init_ontology['binary_dialogue_acts'].add((intent, domain, slot, value,))
            converted_da['binary'].append({
                'intent': intent,
                'domain': domain,
                'slot': slot,
                'value': value
            })
        elif init_ontology['domains'][domain]['slots'][slot]['is_categorical']:
            # categorical dialog acts
            converted_da['categorical'].append({
                'intent': intent,
                'domain': domain,
                'slot': slot,
                'value': value
            })
        else:
            # non-categorical dialog acts
            converted_da['non-categorical'].append({
                'intent': intent,
                'domain': domain,
                'slot': slot,
                'value': value
            })
            # correct some value and try to give char level span
            match = False
            value = value.lower()
            # without any token there is nothing to align the word span to
            if span and span[0] <= span[1] and num_tokens:
                # use original span annotation, but tokenizations are different
                start_word, end_word = span
                if end_word >= num_tokens:
                    # due to different tokenization, the annotation can point past the
                    # last token: shift the window back so that it ends on the last token
                    delta = end_word - num_tokens + 1
                    # clamp: a negative index would silently wrap to the end of the list
                    start_word = max(0, start_word - delta)
                    end_word -= delta
                start_char, end_char = token_spans[start_word][0], token_spans[end_word][1]
                value_span = utt[start_char:end_char].lower()
                match = True
                if value_span == value:
                    cnt_domain_slot['span match'] += 1
                elif value.isdigit() and value in digit2word and digit2word[value] == value_span:
                    # !!!CHANGE VALUE: value is digit but value span is word
                    cnt_domain_slot['digit value match'] += 1
                elif ''.join(value.split()) == ''.join(value_span.split()):
                    # !!!CHANGE VALUE: equal when remove blank
                    cnt_domain_slot['remove blank'] += 1
                elif value in value_span:
                    # value in value_span: narrow the span down to the value itself
                    start_char += value_span.index(value)
                    end_char = start_char + len(value)
                    assert utt[start_char:end_char].lower() == value, f'{[value, utt[start_char:end_char], utt]}'
                    cnt_domain_slot['value in span'] += 1
                elif ':' in value and value == '0'+value_span:
                    # !!!CHANGE VALUE: time x:xx == 0x:xx
                    cnt_domain_slot['x:xx == 0x:xx'] += 1
                else:
                    # span mismatch, search near 1-2 words
                    for window in range(1, 3):
                        start = max(0, start_word-window)
                        end = min(num_tokens-1, end_word+window)
                        large_span = utt[token_spans[start][0]:token_spans[end][1]].lower()
                        if value in large_span:
                            start_char = token_spans[start][0] + large_span.index(value)
                            end_char = start_char + len(value)
                            assert utt[start_char:end_char].lower() == value, f'{[value, utt[start_char:end_char], utt]}'
                            cnt_domain_slot[f'window={window}'] += 1
                            break
                    else:
                        # still not found
                        match = False

            if match:
                # store the value exactly as it is written in the utterance
                converted_da['non-categorical'][-1]['value'] = utt[start_char:end_char]
                converted_da['non-categorical'][-1]['start'] = start_char
                converted_da['non-categorical'][-1]['end'] = end_char
                cnt_domain_slot['have span'] += 1
            else:
                cnt_domain_slot['no span'] += 1
    return converted_da
def _write_json(obj, path):
    """Write obj to path as indented UTF-8 JSON; the file is closed on return."""
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(obj, f, indent=2, ensure_ascii=False)


def preprocess():
    """Convert the original MultiWOZ 2.1 data in the working directory to the unified format.

    Steps:

    1. Extract ``MultiWOZ_2.1.zip`` if the ``MultiWOZ_2.1`` directory does not exist.
    2. Copy every file whose name contains ``db`` into the ``data`` directory.
    3. Convert each dialogue: goal, per-turn dialogue acts (via ``convert_da``),
       belief state (attached to the user turn preceding a system turn) and the
       ``booked`` information (attached to the system turn).
    4. Write ``dummy_data.json`` (first 10 dialogues), ``data/dialogues.json`` and
       ``data/ontology.json``, pack the ``data`` directory into ``data.zip`` and
       remove both the ``MultiWOZ_2.1`` and ``data`` directories.

    Side effect: the module-level ``init_ontology`` is updated, and its
    ``binary_dialogue_acts`` entry is replaced by a sorted list of dicts.

    :return: tuple ``(dialogues, init_ontology)``; dialogues are ordered train,
        validation, test
    :raises FileNotFoundError: if neither the data directory nor the zip archive exists
    """
    original_data_dir = 'MultiWOZ_2.1'
    new_data_dir = 'data'

    if not os.path.exists(original_data_dir):
        original_data_zip = 'MultiWOZ_2.1.zip'
        if not os.path.exists(original_data_zip):
            raise FileNotFoundError(f'cannot find original data {original_data_zip} in multiwoz21/, should manually download MultiWOZ_2.1.zip from https://github.com/budzianowski/multiwoz/blob/master/data/MultiWOZ_2.1.zip')
        with ZipFile(original_data_zip) as archive:
            archive.extractall()

    os.makedirs(new_data_dir, exist_ok=True)
    for filename in os.listdir(original_data_dir):
        if 'db' in filename:
            copy2(f'{original_data_dir}/{filename}', new_data_dir)

    with open(f'{original_data_dir}/data.json') as f:
        original_data = json.load(f)
    with open(f'{original_data_dir}/valListFile.txt') as f:
        val_list = set(f.read().split())
    with open(f'{original_data_dir}/testListFile.txt') as f:
        test_list = set(f.read().split())

    dataset = 'multiwoz21'
    splits = ['train', 'validation', 'test']
    dialogues_by_split = {split: [] for split in splits}
    sent_tokenizer = PunktSentenceTokenizer()
    word_tokenizer = TreebankWordTokenizer()
    booking_remapper = BookingActRemapper()

    for ori_dialog_id, ori_dialog in tqdm(original_data.items()):
        # dialogues listed in neither the validation nor the test file are training data
        if ori_dialog_id in val_list:
            split = 'validation'
        elif ori_dialog_id in test_list:
            split = 'test'
        else:
            split = 'train'
        dialogue_id = f'{dataset}-{split}-{len(dialogues_by_split[split])}'

        # get user goal and involved domains
        cur_domains = []
        goal = {
            'description': '. '.join(ori_dialog['goal']['message']),
            'inform': {},
            'request': {}
        }
        for k, v in ori_dialog['goal'].items():
            if len(v) != 0 and k in init_ontology['domains']:
                cur_domains.append(k)
                goal['inform'][k] = {}
                goal['request'][k] = {}
                for attr in ['fail_info', 'info', 'fail_book', 'book']:
                    if attr in v:
                        for slot, value in v[attr].items():
                            if 'invalid' in slot:
                                continue
                            domain, slot, value = normalize_domain_slot_value(k, slot, value)
                            if slot in goal['inform'][domain]:
                                # several constraints on one slot are joined with '|'
                                goal['inform'][domain][slot] += '|'+value
                            else:
                                goal['inform'][domain][slot] = value
                if 'reqt' in v:
                    for slot in v['reqt']:
                        domain, slot, _ = normalize_domain_slot_value(k, slot, '')
                        goal['request'][domain][slot] = ''

        dialogue = {
            'dataset': dataset,
            'data_split': split,
            'dialogue_id': dialogue_id,
            'original_id': ori_dialog_id,
            'domains': cur_domains,  # will be updated by dialog_acts and state
            'goal': goal,
            'turns': []
        }

        booking_remapper.reset()

        for turn_id, turn in enumerate(ori_dialog['log']):
            # correct some grammar errors in the text, mainly following `tokenization.md` in MultiWOZ_2.1
            text = turn['text']
            text = re.sub(" Im ", " I'm ", text)
            text = re.sub(" im ", " i'm ", text)
            text = re.sub(r"^Im ", "I'm ", text)
            text = re.sub(r"^im ", "i'm ", text)
            text = re.sub("theres", "there's", text)
            text = re.sub("dont", "don't", text)
            text = re.sub("whats", "what's", text)
            text = re.sub('thats', "that's", text)
            utt = text
            # turns alternate, starting with the user
            speaker = 'user' if turn_id % 2 == 0 else 'system'

            das = turn.get('dialog_act', [])
            spans = turn.get('span_info', [])

            if speaker == 'system':
                # system acts and spans are taken from the booking remapper instead
                das, spans = booking_remapper.remap(turn_id, ori_dialog['log'])

            da_dict = {}
            # transform DA
            for Domain_Intent in das:
                domain, intent = Domain_Intent.lower().split('-')
                assert intent in init_ontology['intents'], f'{ori_dialog_id}:{turn_id}:da\t{intent} not in ontology'
                for Slot, value in das[Domain_Intent]:
                    domain, slot, value = normalize_domain_slot_value(domain, Slot, value)
                    if domain not in cur_domains:
                        # update original cur_domains
                        cur_domains.append(domain)
                    da_dict[(intent, domain, slot, value,)] = []

            for span in spans:
                Domain_Intent, Slot, value, start_word, end_word = span
                domain, intent = Domain_Intent.lower().split('-')
                domain, slot, value = normalize_domain_slot_value(domain, Slot, value)
                assert (intent, domain, slot, value,) in da_dict
                da_dict[(intent, domain, slot, value,)] = [start_word, end_word]

            dialogue_acts = convert_da(da_dict, utt, sent_tokenizer, word_tokenizer)  # will also update ontology

            dialogue['turns'].append({
                'speaker': speaker,
                'utterance': utt,
                'utt_idx': turn_id,
                'dialogue_acts': dialogue_acts,
            })

            if speaker == 'system':
                # add state to last user turn
                # add empty db_results
                turn_state = turn['metadata']
                cur_state = copy.deepcopy(init_ontology['state'])
                booked = {}
                for domain in turn_state:
                    if domain not in cur_state:
                        continue
                    for subdomain in ['semi', 'book']:
                        for slot, value in turn_state[domain][subdomain].items():
                            if slot == 'ticket':
                                continue
                            elif slot == 'booked':
                                # kept separately on the system turn, not part of the state
                                assert domain in init_ontology['domains']
                                booked[domain] = value
                                continue
                            _, slot, value = normalize_domain_slot_value(domain, slot, value)
                            cur_state[domain][slot] = value
                dialogue['turns'][-2]['state'] = cur_state
                dialogue['turns'][-1]['db_results'] = {}
                dialogue['turns'][-1]['booked'] = booked
        dialogues_by_split[split].append(dialogue)
    # pprint(cnt_domain_slot.most_common())

    dialogues = []
    for split in splits:
        dialogues += dialogues_by_split[split]

    # a set is not JSON serializable: store the binary acts as a sorted list of dicts
    init_ontology['binary_dialogue_acts'] = [{'intent': bda[0], 'domain': bda[1], 'slot': bda[2], 'value': bda[3]} for bda in sorted(init_ontology['binary_dialogue_acts'])]

    # every file is closed (and therefore flushed) before the directory is zipped
    _write_json(dialogues[:10], 'dummy_data.json')
    _write_json(dialogues, f'{new_data_dir}/dialogues.json')
    _write_json(init_ontology, f'{new_data_dir}/ontology.json')
    with ZipFile('data.zip', 'w', ZIP_DEFLATED) as zf:
        for filename in os.listdir(new_data_dir):
            zf.write(f'{new_data_dir}/{filename}')
    rmtree(original_data_dir)
    rmtree(new_data_dir)
    return dialogues, init_ontology
# Run the conversion only when this file is executed as a script.
if __name__ == '__main__':
    preprocess()