Skip to content
Snippets Groups Projects
Select Git revision
  • cb4dfcdfd991e12d18cfa88461e0b4b495cc81a4
  • master default protected
  • btypes-fixes
  • freetypes
  • embedded-codegen
  • rust-support
  • z3solver
7 results

DataValidationTest.mch

Blame
  • Code owners
    Assign users and groups as approvers for specific file changes. Learn more.
    preprocess.py 35.00 KiB
    import copy
    import re
    from zipfile import ZipFile, ZIP_DEFLATED
    from shutil import copy2, rmtree
    import json
    import os
    from tqdm import tqdm
    from collections import Counter
    from pprint import pprint
    from nltk.tokenize import TreebankWordTokenizer, PunktSentenceTokenizer
    from data.unified_datasets.multiwoz21.booking_remapper import BookingActRemapper
    
    # Seed ontology for the converted dataset.
    #   "domains": per-domain slot definitions; every slot carries the same three
    #              keys ("description", "is_categorical", "possible_values").
    #   "intents": the dialogue-act intents with a short description each.
    #   "binary_dialogue_acts": starts empty; convert_da() adds
    #              (intent, domain, slot, value) tuples to it while processing data.
    #   "state":   per-domain slot names with empty-string values
    #              (presumably the empty belief-state template -- confirm against its users).
    init_ontology = {
        "domains": {  # descriptions are adapted from multiwoz22, but is_categorical may be different
            "attraction": {
                "description": "find an attraction",
                "slots": {
                    "area": {
                        "description": "area to search for attractions",
                        "is_categorical": True,
                        "possible_values": ["centre", "east", "north", "south", "west"]
                    },
                    "name": {
                        "description": "name of the attraction",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "type": {
                        "description": "type of the attraction",
                        "is_categorical": True,
                        "possible_values": [
                            "architecture", "boat", "cinema", "college",
                            "concerthall", "entertainment", "museum", "multiple sports",
                            "nightclub", "park", "swimmingpool", "theatre"
                        ]
                    },
                    "entrance fee": {
                        "description": "how much is the entrance fee",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "open hours": {
                        "description": "open hours of the attraction",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "address": {
                        "description": "address of the attraction",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "phone": {
                        "description": "phone number of the attraction",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "postcode": {
                        "description": "postcode of the attraction",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "choice": {
                        "description": "number of attractions that meet the requirement",
                        "is_categorical": False,
                        "possible_values": []
                    }
                }
            },
            "hotel": {
                "description": "find and book a hotel",
                "slots": {
                    "internet": {
                        "description": "whether the hotel has internet",
                        "is_categorical": True,
                        "possible_values": ["free", "no", "yes"]
                    },
                    "parking": {
                        "description": "whether the hotel has parking",
                        "is_categorical": True,
                        "possible_values": ["free", "no", "yes"]
                    },
                    "area": {
                        "description": "area or place of the hotel",
                        "is_categorical": True,
                        "possible_values": ["centre", "east", "north", "south", "west"]
                    },
                    "stars": {
                        "description": "star rating of the hotel",
                        "is_categorical": True,
                        "possible_values": ["0", "1", "2", "3", "4", "5"]
                    },
                    "price range": {
                        "description": "price budget of the hotel",
                        "is_categorical": True,
                        "possible_values": ["expensive", "cheap", "moderate"]
                    },
                    # NOTE(review): this is the only slot that is marked non-categorical
                    # yet lists possible values -- confirm which of the two is intended.
                    "type": {
                        "description": "what is the type of the hotel",
                        "is_categorical": False,
                        "possible_values": ["guesthouse", "hotel"]
                    },
                    "name": {
                        "description": "name of the hotel",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "book people": {
                        "description": "number of people for the hotel booking",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "book stay": {
                        "description": "length of stay at the hotel",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "book day": {
                        "description": "day of the hotel booking",
                        "is_categorical": True,
                        "possible_values": [
                            "monday", "tuesday", "wednesday", "thursday",
                            "friday", "saturday", "sunday"
                        ]
                    },
                    "phone": {
                        "description": "phone number of the hotel",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "postcode": {
                        "description": "postcode of the hotel",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "address": {
                        "description": "address of the hotel",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "ref": {
                        "description": "reference number of the hotel booking",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "choice": {
                        "description": "number of hotels that meet the requirement",
                        "is_categorical": False,
                        "possible_values": []
                    }
                }
            },
            "taxi": {
                "description": "rent taxi to travel",
                "slots": {
                    "destination": {
                        "description": "destination of taxi",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "departure": {
                        "description": "departure location of taxi",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "leave at": {
                        "description": "leaving time of taxi",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "arrive by": {
                        "description": "arrival time of taxi",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "phone": {
                        "description": "phone number of the taxi",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "type": {
                        "description": "car type of the taxi",
                        "is_categorical": False,
                        "possible_values": []
                    }
                }
            },
            "restaurant": {
                "description": "find and book a restaurant",
                "slots": {
                    "price range": {
                        "description": "price budget for the restaurant",
                        "is_categorical": True,
                        "possible_values": ["cheap", "expensive", "moderate"]
                    },
                    "area": {
                        "description": "area or place of the restaurant",
                        "is_categorical": True,
                        "possible_values": ["centre", "east", "north", "south", "west"]
                    },
                    "food": {
                        "description": "the cuisine of the restaurant",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "name": {
                        "description": "name of the restaurant",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "address": {
                        "description": "address of the restaurant",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "postcode": {
                        "description": "postcode of the restaurant",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "phone": {
                        "description": "phone number of the restaurant",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "book people": {
                        "description": "number of people for the restaurant booking",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "book time": {
                        "description": "time of the restaurant booking",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "book day": {
                        "description": "day of the restaurant booking",
                        "is_categorical": True,
                        "possible_values": [
                            "monday", "tuesday", "wednesday", "thursday",
                            "friday", "saturday", "sunday"
                        ]
                    },
                    "ref": {
                        "description": "reference number of the restaurant booking",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "choice": {
                        "description": "number of restaurants that meet the requirement",
                        "is_categorical": False,
                        "possible_values": []
                    }
                }
            },
            "train": {
                "description": "find a train to travel",
                "slots": {
                    "destination": {
                        "description": "destination of the train",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "arrive by": {
                        "description": "arrival time of the train",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "departure": {
                        "description": "departure location of the train",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "leave at": {
                        "description": "leaving time for the train",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "duration": {
                        "description": "duration of the travel",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "book people": {
                        "description": "number of people booking for train",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "day": {
                        "description": "day of the train",
                        "is_categorical": True,
                        "possible_values": [
                            "monday", "tuesday", "wednesday", "thursday",
                            "friday", "saturday", "sunday"
                        ]
                    },
                    "ref": {
                        "description": "reference number of the train booking",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "price": {
                        "description": "price of the train ticket",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "train id": {
                        "description": "id of the train",
                        "is_categorical": False,
                        # Was missing; every slot must expose the same three keys.
                        "possible_values": []
                    },
                    "choice": {
                        "description": "number of trains that meet the requirement",
                        "is_categorical": False,
                        "possible_values": []
                    }
                }
            },
            "police": {
                "description": "find a police station for help",
                "slots": {
                    "name": {
                        "description": "name of the police station",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "address": {
                        "description": "address of the police station",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "postcode": {
                        "description": "postcode of the police station",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "phone": {
                        "description": "phone number of the police station",
                        "is_categorical": False,
                        "possible_values": []
                    }
                }
            },
            "hospital": {
                "description": "find a hospital for help",
                "slots": {
                    "department": {
                        "description": "specific department of the hospital",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "address": {
                        "description": "address of the hospital",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "phone": {
                        "description": "phone number of the hospital",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "postcode": {
                        "description": "postcode of the hospital",
                        "is_categorical": False,
                        "possible_values": []
                    }
                }
            },
            "booking": {
                "description": "booking for taxi, restaurant, hotel, train, etc.",
                "slots": {
                    "day": {
                        "description": "day of the booking",
                        "is_categorical": True,
                        "possible_values": [
                            "monday", "tuesday", "wednesday", "thursday",
                            "friday", "saturday", "sunday"
                        ]
                    },
                    "time": {
                        "description": "time of the booking",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "book people": {
                        "description": "number of people for the booking",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "book stay": {
                        "description": "length of stay at the hotel",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "name": {
                        "description": "name of the booked entity",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "ref": {
                        "description": "reference number of the booking",
                        "is_categorical": False,
                        "possible_values": []
                    }
                }
            },
            "general": {
                "description": "general domain without slots",
                "slots": {}
            }
        },
        "intents": {
            "inform": {
                "description": "inform the value of a slot"
            },
            "request": {
                "description": "ask for the value of a slot"
            },
            "nobook": {
                "description": "inform the user that the booking is failed"
            },
            "reqmore": {
                "description": "ask the user for more instructions"
            },
            "book": {
                "description": "book something for the user"
            },
            "bye": {
                "description": "say goodbye to the user and end the conversation"
            },
            "thank": {
                "description": "thanks for the help"
            },
            "welcome": {
                "description": "you're welcome"
            },
            "greet": {
                "description": "express greeting"
            },
            "recommend": {
                "description": "recommend a choice to the user"
            },
            "select": {
                "description": "provide several choices for the user"
            },
            "offerbook": {
                "description": "ask the user if he or she needs booking"
            },
            "offerbooked": {
                "description": "provide information about the booking"
            },
            "nooffer": {
                "description": "inform the user that there is no result satisfies user requirements"
            }
        },
        "binary_dialogue_acts": set(),  # from data
        "state": {
            "attraction": {
                "type": "",
                "name": "",
                "area": ""
            },
            "hotel": {
                "name": "",
                "area": "",
                "parking": "",
                "price range": "",
                "stars": "",
                "internet": "",
                "type": "",
                "book stay": "",
                "book day": "",
                "book people": ""
            },
            "restaurant": {
                "food": "",
                "price range": "",
                "name": "",
                "area": "",
                "book time": "",
                "book day": "",
                "book people": ""
            },
            "taxi": {
                "leave at": "",
                "destination": "",
                "departure": "",
                "arrive by": ""
            },
            "train": {
                "leave at": "",
                "destination": "",
                "day": "",
                "arrive by": "",
                "departure": "",
                "book people": ""
            }
        }
    }
    
    # Alias table consulted by normalize_domain_slot_value() when a raw slot name
    # is not already an ontology slot of its domain.
    # String values are aliases tried first, for any domain; dict values are
    # per-domain alias tables, keyed by domain name, tried second.
    slot_name_map = {
        # --- aliases shared by all domains ---
        'addr': 'address',
        'post': 'postcode',
        'pricerange': 'price range',
        'arrive': 'arrive by',
        'arriveby': 'arrive by',
        'leave': 'leave at',
        'leaveat': 'leave at',
        'depart': 'departure',
        'dest': 'destination',
        'fee': 'entrance fee',
        'open': 'open hours',
        'car': 'type',
        'car type': 'type',
        'ticket': 'price',
        'trainid': 'train id',
        'id': 'train id',
        'people': 'book people',
        'stay': 'book stay',
        'none': '',  # maps to the empty slot name
        # --- aliases that only hold inside one domain ---
        'attraction': {
            'price': 'entrance fee',
        },
        'hospital': {},
        'hotel': {
            'day': 'book day',
            'price': 'price range',
        },
        'restaurant': {
            'day': 'book day',
            'time': 'book time',
            'price': 'price range',
        },
        'taxi': {},
        'train': {
            'day': 'day',
            'time': 'duration',
        },
        'police': {},
        'booking': {},
    }
    
    # Spelled-out English words for the digit strings '0' .. '10'; convert_da()
    # uses it to accept a span such as "two" for an annotated value of "2".
    digit2word = {
        str(number): word
        for number, word in enumerate((
            'zero', 'one', 'two', 'three', 'four', 'five',
            'six', 'seven', 'eight', 'nine', 'ten',
        ))
    }

    # Running tallies of how convert_da() resolved (or failed to resolve) value spans.
    cnt_domain_slot = Counter()
    
    def normalize_domain_slot_value(domain, slot, value):
        '''
        normalize a raw (domain, slot, value) triple to the names used in init_ontology

        The domain and slot are lower-cased and the value is stripped. The
        "don't care" spellings become 'dontcare'; '?', 'none' and 'not mentioned'
        become the empty string. A slot that is not already a slot of the domain
        is resolved through slot_name_map: first the aliases shared by all
        domains, then the aliases of this domain.

        :param domain: domain name (any case)
        :param slot: raw slot name (any case)
        :param value: raw value string
        :return: (domain, slot, value) after normalization; slot may be ''
        :raises Exception: domain is not in the ontology, or the slot is neither
            an ontology slot of the domain nor a known alias
        :raises AssertionError: the alias resolves to a non-empty name that is
            not a slot of the domain
        '''
        domain = domain.lower()
        slot = slot.lower()
        value = value.strip()
        if value in ['do nt care', "do n't care"]:
            value = 'dontcare'
        if value in ['?', 'none', 'not mentioned']:
            value = ""
        if domain not in init_ontology['domains']:
            raise Exception(f'{domain} not in ontology')
        domain_slots = init_ontology['domains'][domain]['slots']
        if slot not in domain_slots:
            # slot_name_map mixes string aliases with per-domain dicts keyed by
            # domain name; only a string entry is a usable shared alias.
            shared_alias = slot_name_map.get(slot)
            # Not every ontology domain has an alias table (e.g. 'general').
            domain_aliases = slot_name_map.get(domain, {})
            if isinstance(shared_alias, str):
                slot = shared_alias
            elif slot in domain_aliases:
                slot = domain_aliases[slot]
            else:
                raise Exception(f'{domain}-{slot} not in ontology')
        assert slot == '' or slot in domain_slots, f'{(domain, slot, value)} not in ontology'
        return domain, slot, value
    
    def convert_da(da_dict, utt, sent_tokenizer, word_tokenizer):
        '''
        split multiwoz dialogue acts into binary / categorical / non-categorical
        groups, update the ontology's binary acts, and try to attach a char-level
        span to every non-categorical act

        :param da_dict: dict[(intent, domain, slot, value)] = [word_start, word_end]
        :param utt: user or system utt
        :param sent_tokenizer: object with tokenize(utt) and span_tokenize(utt)
        :param word_tokenizer: object with tokenize(sent) and span_tokenize(sent)
        :return: dict with the lists 'categorical', 'non-categorical' and 'binary';
            a non-categorical act whose value was located in utt also carries
            'start' / 'end' and has its 'value' replaced by utt[start:end]
        '''
        converted_da = {group: [] for group in ('categorical', 'non-categorical', 'binary')}

        sentences = sent_tokenizer.tokenize(utt)
        sent_spans = sent_tokenizer.span_tokenize(utt)

        # Only the number of tokens is used below (to clamp annotated word indices).
        tokens = []
        for sentence in sentences:
            tokens.extend(word_tokenizer.tokenize(sentence))

        # Word spans are relative to their sentence, so shift each by the
        # sentence start to get offsets into utt.
        # NOTE: token text is not checked against utt[start:end]; that sanity
        # check was disabled in the original code.
        token_spans = []
        for sentence, sent_span in zip(sentences, sent_spans):
            for word_span in word_tokenizer.span_tokenize(sentence):
                token_spans.append((sent_span[0] + word_span[0], sent_span[0] + word_span[1]))

        for (intent, domain, slot, value), span in da_dict.items():
            act = {
                'intent': intent,
                'domain': domain,
                'slot': slot,
                'value': value
            }

            # binary dialog acts: requests and acts without a slot or value
            if intent == 'request' or slot == '' or value == '':
                init_ontology['binary_dialogue_acts'].add((intent, domain, slot, value))
                converted_da['binary'].append(act)
                continue

            # categorical dialog acts
            if init_ontology['domains'][domain]['slots'][slot]['is_categorical']:
                converted_da['categorical'].append(act)
                continue

            # non-categorical dialog acts: keep the act, then look for its span
            converted_da['non-categorical'].append(act)
            target = value.lower()
            located = False
            if span and span[0] <= span[1]:
                # Start from the annotated word span; tokenization may differ
                # from the one used for the annotation.
                first_word, last_word = span
                if last_word >= len(tokens):
                    # Annotation points past our last token: slide the window back.
                    overshoot = last_word - len(tokens) + 1
                    first_word -= overshoot
                    last_word -= overshoot
                start_char, end_char = token_spans[first_word][0], token_spans[last_word][1]
                annotated = utt[start_char:end_char].lower()
                located = True
                outcome = None  # counter key describing how the span was accepted
                if annotated == target:
                    outcome = 'span match'
                elif target.isdigit() and digit2word.get(target) == annotated:
                    # value is a digit but the utterance spells the word
                    outcome = 'digit value match'
                elif ''.join(target.split()) == ''.join(annotated.split()):
                    # equal once whitespace is ignored
                    outcome = 'remove blank'
                elif target in annotated:
                    # value is a substring of the span: shrink the span to it
                    start_char += annotated.index(target)
                    end_char = start_char + len(target)
                    assert utt[start_char:end_char].lower() == target, f'{[target, utt[start_char:end_char], utt]}'
                    outcome = 'value in span'
                elif ':' in target and target == '0' + annotated:
                    # time written x:xx in the utterance, 0x:xx in the annotation
                    outcome = 'x:xx == 0x:xx'
                else:
                    # span mismatch: widen the window by one, then two words per side
                    located = False
                    for window in (1, 2):
                        left = max(0, first_word - window)
                        right = min(len(token_spans) - 1, last_word + window)
                        widened = utt[token_spans[left][0]:token_spans[right][1]].lower()
                        if target in widened:
                            start_char = token_spans[left][0] + widened.index(target)
                            end_char = start_char + len(target)
                            assert utt[start_char:end_char].lower() == target, f'{[target, utt[start_char:end_char], utt]}'
                            outcome = f'window={window}'
                            located = True
                            break
                if outcome is not None:
                    cnt_domain_slot[outcome] += 1

            if located:
                # Store the surface form from the utterance together with its offsets.
                act['value'] = utt[start_char:end_char]
                act['start'] = start_char
                act['end'] = end_char
                cnt_domain_slot['have span'] += 1
            else:
                cnt_domain_slot['no span'] += 1
        return converted_da
    
    def preprocess():
        """Convert the raw MultiWOZ 2.1 release into the unified dialogue format.

        The function works relative to the current working directory:

        1. Makes sure ``MultiWOZ_2.1/`` exists, extracting ``MultiWOZ_2.1.zip``
           when the directory is missing.
        2. Copies every file whose name contains ``db`` into ``data/``.
        3. Reads ``data.json`` plus the validation/test id lists and converts
           each dialogue (goal, turns, dialogue acts, belief state, booked
           entities).
        4. Writes ``dummy_data.json`` (first 10 dialogues),
           ``data/dialogues.json`` and ``data/ontology.json``, packs ``data/``
           into ``data.zip`` and finally removes both ``MultiWOZ_2.1/`` and
           ``data/``.

        The module-level ``init_ontology`` is modified in place: its
        ``binary_dialogue_acts`` entry is replaced by a sorted list of dicts
        (``convert_da`` is also expected to update the ontology while turns
        are converted).

        Returns:
            A ``(dialogues, init_ontology)`` tuple, where ``dialogues`` lists
            the train, validation and test dialogues in that order.

        Raises:
            FileNotFoundError: if neither ``MultiWOZ_2.1/`` nor
                ``MultiWOZ_2.1.zip`` can be found.
        """
        global init_ontology, cnt_domain_slot

        original_data_dir = 'MultiWOZ_2.1'
        new_data_dir = 'data'

        if not os.path.exists(original_data_dir):
            original_data_zip = 'MultiWOZ_2.1.zip'
            if not os.path.exists(original_data_zip):
                raise FileNotFoundError(f'cannot find original data {original_data_zip} in multiwoz21/, should manually download MultiWOZ_2.1.zip from https://github.com/budzianowski/multiwoz/blob/master/data/MultiWOZ_2.1.zip')
            # Close the archive deterministically once extraction is done.
            with ZipFile(original_data_zip) as archive:
                archive.extractall()

        os.makedirs(new_data_dir, exist_ok=True)
        for filename in os.listdir(original_data_dir):
            if 'db' in filename:
                copy2(f'{original_data_dir}/{filename}', new_data_dir)

        # Read the inputs with an explicit encoding and close the handles right away.
        with open(f'{original_data_dir}/data.json', encoding='utf-8') as f:
            original_data = json.load(f)
        with open(f'{original_data_dir}/valListFile.txt', encoding='utf-8') as f:
            val_list = set(f.read().split())
        with open(f'{original_data_dir}/testListFile.txt', encoding='utf-8') as f:
            test_list = set(f.read().split())

        dataset = 'multiwoz21'
        splits = ['train', 'validation', 'test']
        dialogues_by_split = {split: [] for split in splits}
        sent_tokenizer = PunktSentenceTokenizer()
        word_tokenizer = TreebankWordTokenizer()
        booking_remapper = BookingActRemapper()

        # Grammar fixes applied to every utterance, mainly following
        # `tokenization.md` in MultiWOZ_2.1. They are applied in this order and
        # compiled once here instead of once per turn.
        text_fixes = (
            (re.compile(" Im "), " I'm "),
            (re.compile(" im "), " i'm "),
            (re.compile(r"^Im "), "I'm "),
            (re.compile(r"^im "), "i'm "),
            (re.compile("theres"), "there's"),
            (re.compile("dont"), "don't"),
            (re.compile("whats"), "what's"),
            (re.compile('thats'), "that's"),
        )

        for ori_dialog_id, ori_dialog in tqdm(original_data.items()):
            if ori_dialog_id in val_list:
                split = 'validation'
            elif ori_dialog_id in test_list:
                split = 'test'
            else:
                split = 'train'
            dialogue_id = f'{dataset}-{split}-{len(dialogues_by_split[split])}'

            # get user goal and involved domains
            cur_domains = []
            goal = {
                'description': '. '.join(ori_dialog['goal']['message']),
                'inform': {},
                'request': {}
            }
            for k, v in ori_dialog['goal'].items():
                if len(v) != 0 and k in init_ontology['domains']:
                    cur_domains.append(k)
                    goal['inform'][k] = {}
                    goal['request'][k] = {}
                    for attr in ['fail_info', 'info', 'fail_book', 'book']:
                        if attr in v:
                            for slot, value in v[attr].items():
                                if 'invalid' in slot:
                                    continue
                                domain, slot, value = normalize_domain_slot_value(k, slot, value)
                                # A slot seen under several attrs keeps all values, '|'-separated.
                                if slot in goal['inform'][domain]:
                                    goal['inform'][domain][slot] += '|'+value
                                else:
                                    goal['inform'][domain][slot] = value
                    if 'reqt' in v:
                        for slot in v['reqt']:
                            domain, slot, _ = normalize_domain_slot_value(k, slot, '')
                            goal['request'][domain][slot] = ''

            dialogue = {
                'dataset': dataset,
                'data_split': split,
                'dialogue_id': dialogue_id,
                'original_id': ori_dialog_id,
                # Same list object as cur_domains: extended in place while
                # the dialogue acts below are converted.
                'domains': cur_domains,
                'goal': goal,
                'turns': []
            }

            booking_remapper.reset()
            for turn_id, turn in enumerate(ori_dialog['log']):
                utt = turn['text']
                for pattern, replacement in text_fixes:
                    utt = pattern.sub(replacement, utt)
                # Turns alternate, starting with the user.
                speaker = 'user' if turn_id % 2 == 0 else 'system'

                if speaker == 'system':
                    # System acts/spans come from the booking remapper rather
                    # than from the raw annotation.
                    das, spans = booking_remapper.remap(turn_id, ori_dialog['log'])
                else:
                    das = turn.get('dialog_act', [])
                    spans = turn.get('span_info', [])

                da_dict = {}
                # transform DA
                for Domain_Intent in das:
                    domain, intent = Domain_Intent.lower().split('-')
                    assert intent in init_ontology['intents'], f'{ori_dialog_id}:{turn_id}:da\t{intent} not in ontology'
                    for Slot, value in das[Domain_Intent]:
                        domain, slot, value = normalize_domain_slot_value(domain, Slot, value)
                        if domain not in cur_domains:
                            # update original cur_domains
                            cur_domains.append(domain)
                        da_dict[(intent, domain, slot, value,)] = []

                # Attach word-level span boundaries to the matching dialogue acts.
                for span in spans:
                    Domain_Intent, Slot, value, start_word, end_word = span
                    domain, intent = Domain_Intent.lower().split('-')
                    domain, slot, value = normalize_domain_slot_value(domain, Slot, value)
                    assert (intent, domain, slot, value,) in da_dict
                    da_dict[(intent, domain, slot, value,)] = [start_word, end_word]

                dialogue_acts = convert_da(da_dict, utt, sent_tokenizer, word_tokenizer) # will also update ontology

                dialogue['turns'].append({
                    'speaker': speaker,
                    'utterance': utt,
                    'utt_idx': turn_id,
                    'dialogue_acts': dialogue_acts,
                })

                if speaker == 'system':
                    # add state to last user turn
                    # add empty db_results
                    turn_state = turn['metadata']
                    cur_state = copy.deepcopy(init_ontology['state'])
                    booked = {}
                    for domain in turn_state:
                        if domain not in cur_state:
                            continue
                        for subdomain in ['semi', 'book']:
                            for slot, value in turn_state[domain][subdomain].items():
                                if slot == 'ticket':
                                    continue
                                elif slot == 'booked':
                                    assert domain in init_ontology['domains']
                                    booked[domain] = value
                                    continue
                                _, slot, value = normalize_domain_slot_value(domain, slot, value)
                                cur_state[domain][slot] = value
                    dialogue['turns'][-2]['state'] = cur_state
                    dialogue['turns'][-1]['db_results'] = {}
                    dialogue['turns'][-1]['booked'] = booked
            dialogues_by_split[split].append(dialogue)
        # pprint(cnt_domain_slot.most_common())
        dialogues = []
        for split in splits:
            dialogues += dialogues_by_split[split]
        init_ontology['binary_dialogue_acts'] = [{'intent':bda[0],'domain':bda[1],'slot':bda[2],'value':bda[3]} for bda in sorted(init_ontology['binary_dialogue_acts'])]

        # Close every output file before zipping so the archive never picks up
        # partially flushed JSON.
        with open('dummy_data.json', 'w', encoding='utf-8') as f:
            json.dump(dialogues[:10], f, indent=2, ensure_ascii=False)
        with open(f'{new_data_dir}/dialogues.json', 'w', encoding='utf-8') as f:
            json.dump(dialogues, f, indent=2, ensure_ascii=False)
        with open(f'{new_data_dir}/ontology.json', 'w', encoding='utf-8') as f:
            json.dump(init_ontology, f, indent=2, ensure_ascii=False)
        with ZipFile('data.zip', 'w', ZIP_DEFLATED) as zf:
            for filename in os.listdir(new_data_dir):
                zf.write(f'{new_data_dir}/{filename}')
        # Only data.zip and dummy_data.json are kept on disk.
        rmtree(original_data_dir)
        rmtree(new_data_dir)
        return dialogues, init_ontology
    
    # Script entry point: run the whole conversion only when this file is
    # executed directly, not when it is imported.
    if __name__ == '__main__':
        preprocess()