Skip to content
Snippets Groups Projects
Select Git revision
  • fe0b3bc853abe3ab39219652166b51bca70349c2
  • master default protected
2 results

NEWS.Rd

Blame
  • Code owners
    Assign users and groups as approvers for specific file changes. Learn more.
    preprocess.py 46.34 KiB
    import copy
    import re
    from zipfile import ZipFile, ZIP_DEFLATED
    from shutil import copy2, rmtree
    import json
    import os
    from tqdm import tqdm
    from collections import Counter
    from pprint import pprint
    from nltk.tokenize import TreebankWordTokenizer, PunktSentenceTokenizer
    
    # Static ontology used by this preprocessing script. It has four sections:
    #   * "domains": per domain, a description plus its slots; each slot carries a
    #     description, an "is_categorical" flag and a "possible_values" list
    #     (empty for every non-categorical slot except hotel "type").
    #   * "intents": a description for every intent name.
    #   * "state": per domain, the state slots, all initialised to "".
    #   * "dialogue_acts": empty "categorical" / "non-categorical" / "binary"
    #     buckets (presumably filled while processing the data — not visible here).
    ontology = {
        "domains": {  # descriptions are adapted from multiwoz22, but is_categorical may be different
            "attraction": {
                "description": "find an attraction",
                "slots": {
                    "area": {
                        "description": "area to search for attractions",
                        "is_categorical": True,
                        "possible_values": [
                            "centre",
                            "east",
                            "north",
                            "south",
                            "west"
                        ]
                    },
                    "name": {
                        "description": "name of the attraction",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "type": {
                        "description": "type of the attraction",
                        "is_categorical": True,
                        "possible_values": [
                            "architecture",
                            "boat",
                            "cinema",
                            "college",
                            "concerthall",
                            "entertainment",
                            "museum",
                            "multiple sports",
                            "nightclub",
                            "park",
                            "swimmingpool",
                            "theatre"
                        ]
                    },
                    "entrance fee": {
                        "description": "how much is the entrance fee",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "open hours": {
                        "description": "open hours of the attraction",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "address": {
                        "description": "address of the attraction",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "phone": {
                        "description": "phone number of the attraction",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "postcode": {
                        "description": "postcode of the attraction",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "choice": {
                        "description": "number of attractions that meet the requirement",
                        "is_categorical": False,
                        "possible_values": []
                    }
                }
            },
            "hotel": {
                "description": "find and book a hotel",
                "slots": {
                    "internet": {
                        "description": "whether the hotel has internet",
                        "is_categorical": True,
                        "possible_values": [
                            "free",
                            "no",
                            "yes"
                        ]
                    },
                    "parking": {
                        "description": "whether the hotel has parking",
                        "is_categorical": True,
                        "possible_values": [
                            "free",
                            "no",
                            "yes"
                        ]
                    },
                    "area": {
                        "description": "area or place of the hotel",
                        "is_categorical": True,
                        "possible_values": [
                            "centre",
                            "east",
                            "north",
                            "south",
                            "west"
                        ]
                    },
                    "stars": {
                        "description": "star rating of the hotel",
                        "is_categorical": True,
                        "possible_values": [
                            "0",
                            "1",
                            "2",
                            "3",
                            "4",
                            "5"
                        ]
                    },
                    "price range": {
                        "description": "price budget of the hotel",
                        "is_categorical": True,
                        "possible_values": [
                            "expensive",
                            "cheap",
                            "moderate"
                        ]
                    },
                    # NOTE(review): flagged non-categorical although possible_values
                    # is non-empty — confirm this is intended.
                    "type": {
                        "description": "what is the type of the hotel",
                        "is_categorical": False,
                        "possible_values": [
                            "guesthouse",
                            "hotel"
                        ]
                    },
                    "name": {
                        "description": "name of the hotel",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "book people": {
                        "description": "number of people for the hotel booking",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "book stay": {
                        "description": "length of stay at the hotel",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "book day": {
                        "description": "day of the hotel booking",
                        "is_categorical": True,
                        "possible_values": [
                            "monday",
                            "tuesday",
                            "wednesday",
                            "thursday",
                            "friday",
                            "saturday",
                            "sunday"
                        ]
                    },
                    "phone": {
                        "description": "phone number of the hotel",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "postcode": {
                        "description": "postcode of the hotel",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "address": {
                        "description": "address of the hotel",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "ref": {
                        "description": "reference number of the hotel booking",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "choice": {
                        "description": "number of hotels that meet the requirement",
                        "is_categorical": False,
                        "possible_values": []
                    }
                }
            },
            "taxi": {
                "description": "rent taxi to travel",
                "slots": {
                    "destination": {
                        "description": "destination of taxi",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "departure": {
                        "description": "departure location of taxi",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "leave at": {
                        "description": "leaving time of taxi",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "arrive by": {
                        "description": "arrival time of taxi",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "phone": {
                        "description": "phone number of the taxi",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "type": {
                        "description": "car type of the taxi",
                        "is_categorical": False,
                        "possible_values": []
                    }
                }
            },
            "restaurant": {
                "description": "find and book a restaurant",
                "slots": {
                    "price range": {
                        "description": "price budget for the restaurant",
                        "is_categorical": True,
                        "possible_values": [
                            "cheap",
                            "expensive",
                            "moderate"
                        ]
                    },
                    "area": {
                        "description": "area or place of the restaurant",
                        "is_categorical": True,
                        "possible_values": [
                            "centre",
                            "east",
                            "north",
                            "south",
                            "west"
                        ]
                    },
                    "food": {
                        "description": "the cuisine of the restaurant",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "name": {
                        "description": "name of the restaurant",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "address": {
                        "description": "address of the restaurant",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "postcode": {
                        "description": "postcode of the restaurant",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "phone": {
                        "description": "phone number of the restaurant",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "book people": {
                        "description": "number of people for the restaurant booking",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "book time": {
                        "description": "time of the restaurant booking",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "book day": {
                        "description": "day of the restaurant booking",
                        "is_categorical": True,
                        "possible_values": [
                            "monday",
                            "tuesday",
                            "wednesday",
                            "thursday",
                            "friday",
                            "saturday",
                            "sunday"
                        ]
                    },
                    "ref": {
                        "description": "reference number of the restaurant booking",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "choice": {
                        "description": "number of restaurants that meet the requirement",
                        "is_categorical": False,
                        "possible_values": []
                    }
                }
            },
            "train": {
                "description": "find a train to travel",
                "slots": {
                    "destination": {
                        "description": "destination of the train",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "arrive by": {
                        "description": "arrival time of the train",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "departure": {
                        "description": "departure location of the train",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "leave at": {
                        "description": "leaving time for the train",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "duration": {
                        "description": "duration of the travel",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "book people": {
                        "description": "number of people booking for train",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "day": {
                        "description": "day of the train",
                        "is_categorical": True,
                        "possible_values": [
                            "monday",
                            "tuesday",
                            "wednesday",
                            "thursday",
                            "friday",
                            "saturday",
                            "sunday"
                        ]
                    },
                    "ref": {
                        "description": "reference number of the train booking",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "price": {
                        "description": "price of the train ticket",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    # NOTE(review): the only slot without a "possible_values" key —
                    # confirm that consumers of the ontology tolerate its absence.
                    "train id": {
                        "description": "id of the train",
                        "is_categorical": False
                    },
                    "choice": {
                        "description": "number of trains that meet the requirement",
                        "is_categorical": False,
                        "possible_values": []
                    }
                }
            },
            "police": {
                "description": "find a police station for help",
                "slots": {
                    "name": {
                        "description": "name of the police station",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "address": {
                        "description": "address of the police station",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "postcode": {
                        "description": "postcode of the police station",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "phone": {
                        "description": "phone number of the police station",
                        "is_categorical": False,
                        "possible_values": []
                    }
                }
            },
            "hospital": {
                "description": "find a hospital for help",
                "slots": {
                    "department": {
                        "description": "specific department of the hospital",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "address": {
                        "description": "address of the hospital",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "phone": {
                        "description": "phone number of the hospital",
                        "is_categorical": False,
                        "possible_values": []
                    },
                    "postcode": {
                        "description": "postcode of the hospital",
                        "is_categorical": False,
                        "possible_values": []
                    }
                }
            },
            "general": {
                "description": "general domain without slots",
                "slots": {}
            }
        },
        "intents": {
            "inform": {
                "description": "inform the value of a slot"
            },
            "request": {
                "description": "ask for the value of a slot"
            },
            "nobook": {
                "description": "inform the user that the booking is failed"
            },
            "reqmore": {
                "description": "ask the user for more instructions"
            },
            "book": {
                "description": "book something for the user"
            },
            "bye": {
                "description": "say goodbye to the user and end the conversation"
            },
            "thank": {
                "description": "thanks for the help"
            },
            "welcome": {
                "description": "you're welcome"
            },
            "greet": {
                "description": "express greeting"
            },
            "recommend": {
                "description": "recommend a choice to the user"
            },
            "select": {
                "description": "provide several choices for the user"
            },
            "offerbook": {
                "description": "ask the user if he or she needs booking"
            },
            "offerbooked": {
                "description": "provide information about the booking"
            },
            "nooffer": {
                "description": "inform the user that there is no result satisfies user requirements"
            }
        },
        "state": {
            "attraction": {
                "type": "",
                "name": "",
                "area": ""
            },
            "hotel": {
                "name": "",
                "area": "",
                "parking": "",
                "price range": "",
                "stars": "",
                "internet": "",
                "type": "",
                "book stay": "",
                "book day": "",
                "book people": ""
            },
            "restaurant": {
                "food": "",
                "price range": "",
                "name": "",
                "area": "",
                "book time": "",
                "book day": "",
                "book people": ""
            },
            "taxi": {
                "leave at": "",
                "destination": "",
                "departure": "",
                "arrive by": ""
            },
            "train": {
                "leave at": "",
                "destination": "",
                "day": "",
                "arrive by": "",
                "departure": "",
                "book people": ""
            },
            "hospital": {
                "department": ""
            }
        },
        "dialogue_acts": {
            "categorical": {},
            "non-categorical": {},
            "binary": {}
        }
    }
    
    # Maps lower-cased short slot names from the raw annotation to the slot names
    # used in ``ontology``. Top-level string entries are domain-independent; the
    # nested dicts (keyed by lower-cased domain name) hold mappings that only apply
    # inside that domain. ``ontology_check`` looks a slot up at the top level first
    # and falls back to the per-domain dict.
    slot_name_map = {
        'addr': "address",
        'post': "postcode",
        'pricerange': "price range",
        'arrive': "arrive by",
        'arriveby': "arrive by",
        'leave': "leave at",
        'leaveat': "leave at",
        'depart': "departure",
        'dest': "destination",
        'fee': "entrance fee",
        'open': 'open hours',
        'car': "type",
        'car type': "type",
        'ticket': 'price',
        'trainid': 'train id',
        'id': 'train id',
        'people': 'book people',
        'stay': 'book stay',
        # the placeholder slot 'none' maps to the empty slot name
        'none': '',
        # domain-specific overrides start here
        'attraction': {
            'price': 'entrance fee'
        },
        'hospital': {},
        'hotel': {
            'day': 'book day', 'price': "price range"
        },
        'restaurant': {
            'day': 'book day', 'time': 'book time', 'price': "price range"
        },
        'taxi': {},
        'train': {
            'day': 'day', 'time': "duration"
        },
        'police': {},
        'booking': {}
    }
    
    # Maps ontology slot names to capitalised short slot names (e.g. 'address' ->
    # 'Addr'). The nested 'taxi' dict holds entries specific to the taxi domain.
    # Declared as a global inside ``reverse_da``; how it is consulted there is
    # outside the visible part of the file.
    # Note that both 'book time' and 'duration' map to 'Time'.
    reverse_da_slot_name_map = {
        'address': 'Addr',
        'postcode': 'Post',
        'price range': 'Price',
        'arrive by': 'Arrive',
        'leave at': 'Leave',
        'departure': 'Depart',
        'destination': 'Dest',
        'entrance fee': 'Fee',
        'open hours': 'Open',
        'price': 'Ticket',
        'train id': 'Id',
        'book people': 'People',
        'book stay': 'Stay',
        'book day': 'Day',
        'book time': 'Time',
        'duration': 'Time',
        'taxi': {
            'type': 'Car',
            'phone': 'Phone'
        }
    }
    
    # English word for each number string from '0' to '10'.
    digit2word = {
        '0': 'zero', '1': 'one', '2': 'two', '3': 'three', '4': 'four', '5': 'five',
        '6': 'six', '7': 'seven', '8': 'eight', '9': 'nine', '10': 'ten'
    }
    
    # Module-level counter; presumably tallies domain/slot occurrences during
    # preprocessing — its use is not visible in this part of the file, TODO confirm.
    cnt_domain_slot = Counter()
    
    
    class BookingActRemapper:
        """Remap the dialogue acts and span annotations of system turns.

        The instance carries per-dialogue context: the domains most recently seen
        in user acts, the domains most recently seen in system acts, and the
        domains for which a booking has already been observed. Call ``reset``
        to clear that context.
        """

        def __init__(self, ontology):
            self.ontology = ontology
            self.reset()

        def reset(self):
            """Clear all per-dialogue context."""
            self.current_domains_user = []
            self.current_domains_system = []
            self.booked_domains = []

        def retrieve_current_domain_from_user(self, turn_id, ori_dialog):
            """Update the user-side domain context from the turn before ``turn_id``.

            Returns a pair ``(keyword_domains_user, next_user_domains)``: the
            domains named in the previous turn's text, and the domains of the
            acts of the turn that follows ``turn_id``.
            """
            user_turn = ori_dialog[turn_id - 1]

            user_acts = user_turn.get('dialog_act', [])
            keyword_domains_user = get_keyword_domains(user_turn)
            act_domains = get_current_domains_from_act(user_acts)
            # keep the previous context when this turn names no domain
            if act_domains:
                self.current_domains_user = act_domains
            next_user_domains = get_next_user_act_domains(ori_dialog, turn_id)

            return keyword_domains_user, next_user_domains

        def retrieve_current_domain_from_system(self, turn_id, ori_dialog):
            """Update the system-side domain context from the turn ``turn_id``.

            Returns a pair ``(keyword_domains_system, booked_domain_current)``:
            the domains named in the turn's text, and the domain that became
            booked in this turn (``None`` if there is none).
            """
            system_turn = ori_dialog[turn_id]

            system_acts = system_turn.get('dialog_act', [])
            keyword_domains_system = get_keyword_domains(system_turn)
            act_domains = get_current_domains_from_act(system_acts)
            # keep the previous context when this turn names no domain
            if act_domains:
                self.current_domains_system = act_domains
            booked_domain_current = self.check_domain_booked(system_turn)

            return keyword_domains_system, booked_domain_current

        def remap(self, turn_id, ori_dialog):
            """Return the remapped ``(dialog_act, span_info)`` of turn ``turn_id``.

            The domain context is refreshed first. A turn without labelled
            dialogue acts is returned unchanged.
            """
            keyword_domains_user, next_user_domains = self.retrieve_current_domain_from_user(turn_id, ori_dialog)
            keyword_domains_system, booked_domain_current = self.retrieve_current_domain_from_system(turn_id, ori_dialog)

            turn = ori_dialog[turn_id]
            dialog_acts = turn.get('dialog_act', [])
            spans = turn.get('span_info', [])

            # nothing to remap when no dialogue act is labelled
            if not dialog_acts:
                return dialog_acts, spans

            # identical domain hints are used for the acts and for the spans
            domain_hints = (self.current_domains_user, booked_domain_current, keyword_domains_user,
                            keyword_domains_system, self.current_domains_system, next_user_domains,
                            self.ontology)

            flat_acts = flatten_acts(dialog_acts)
            flat_spans = flatten_span_acts(spans)
            new_acts, _ = remap_acts(flat_acts, *domain_hints)
            new_spans, _ = remap_acts(flat_spans, *domain_hints)

            return deflat_acts(new_acts), deflat_span_acts(new_spans)

        def check_domain_booked(self, turn):
            """Return the capitalised domain newly booked in ``turn``, else ``None``.

            Every domain whose ``metadata[domain]["book"]["booked"]`` is truthy
            and that was not recorded before is added to ``self.booked_domains``;
            if several qualify, the last one iterated is returned.
            """
            metadata = turn['metadata']
            newly_booked = None
            for domain in metadata:
                has_booking = metadata[domain]["book"]["booked"]
                if not has_booking or domain in self.booked_domains:
                    continue
                newly_booked = domain.capitalize()
                self.booked_domains.append(domain)
            return newly_booked
    
    
    def get_keyword_domains(turn):
        """Return the domains whose name occurs in ``turn['text']``.

        Only "Hotel", "Restaurant" and "Train" are checked, in that order, using
        a case-insensitive substring match.
        """
        lowered_text = turn['text'].lower()
        return [name for name in ("Hotel", "Restaurant", "Train") if name.lower() in lowered_text]
    
    
    def get_current_domains_from_act(dialog_acts):
        """Return the distinct domains of the given ``"Domain-Intent"`` keys.

        Domains are listed in order of first appearance; the domains "general"
        and "Booking" are ignored.
        """
        domains = []
        for act_key in dialog_acts:
            # every key must consist of exactly one domain and one intent
            domain, _intent = act_key.split('-')
            if domain in ("general", "Booking") or domain in domains:
                continue
            domains.append(domain)
        return domains
    
    
    def get_next_user_act_domains(ori_dialog, turn_id):
        """Return the domains addressed by the turn that follows ``turn_id``.

        Args:
            ori_dialog: sequence of turn dicts of one dialogue.
            turn_id: index of the current (system) turn.

        Returns:
            The domains found in ``ori_dialog[turn_id + 1]['dialog_act']``, or an
            empty list when there is no following turn or it carries no
            ``'dialog_act'`` entry.
        """
        try:
            next_user_act = ori_dialog[turn_id + 1]['dialog_act']
        except (IndexError, KeyError):
            # the system turn is the last turn of the dialogue, or the next turn
            # is not annotated; anything else is a real error and must surface
            return []
        return get_current_domains_from_act(next_user_act)
    
    
    def flatten_acts(dialog_acts):
        """Expand a ``{"Domain-Intent": [[slot, value], ...]}`` mapping into tuples.

        Returns a list of ``(domain, intent, slot, value)`` tuples, one per
        slot-value pair, in the iteration order of the input.
        """
        flat = []
        for act_key in dialog_acts:
            domain, intent = act_key.split('-')
            flat.extend((domain, intent, pair[0], pair[1]) for pair in dialog_acts[act_key])
        return flat
    
    
    def flatten_span_acts(span_acts):
        """Convert span annotations into ``(domain, intent, slot, rest)`` tuples.

        Each span act is a sequence starting with ``"Domain-Intent"`` and the slot
        name; everything after those two items is kept together as ``rest``.
        """
        def _split(span_act):
            domain, intent = span_act[0].split("-")
            return domain, intent, span_act[1], span_act[2:]

        return [_split(span_act) for span_act in span_acts]
    
    
    def deflat_acts(flattened_acts):
        """Group ``(domain, intent, slot, value)`` tuples by ``"Domain-Intent"``.

        Returns a dict mapping each ``"Domain-Intent"`` key to its list of
        ``[slot, value]`` pairs, preserving the input order.
        """
        grouped = {}
        for domain, intent, slot, value in flattened_acts:
            grouped.setdefault(f"{domain}-{intent}", []).append([slot, value])
        return grouped
    
    
    def deflat_span_acts(flattened_acts):
        """Rebuild span annotations from ``(domain, intent, slot, value)`` tuples.

        Each result is ``["Domain-Intent", slot, *value]``. Tuples whose value is
        the string ``'none'`` are dropped.
        """
        return [
            [f"{domain}-{intent}", slot, *value]
            for domain, intent, slot, value in flattened_acts
            if value != 'none'
        ]
    
    
    def remap_acts(flattened_acts, current_domains, booked_domain=None, keyword_domains_user=None,
                   keyword_domains_system=None, current_domain_system=None, next_user_domain=None, ontology=None):
        """Remap ``Booking`` acts (and a few taxi/train acts) onto concrete domains.

        Args:
            flattened_acts: iterable of ``(domain, intent, slot, value)`` tuples.
            current_domains: list of candidate domains for the ``Booking`` acts.
            booked_domain: domain that was just booked, if any.
            keyword_domains_user: domains named in the user's text, or ``None``.
            keyword_domains_system: domains named in the system's text, or ``None``.
            current_domain_system: domains of the latest system acts, or ``None``.
            next_user_domain: domains of the next user acts, or ``None``.
            ontology: ontology passed on to ``ontology_check``.

        Returns:
            ``(remapped_acts, error)`` — the list of remapped act tuples and the
            number of acts that could not be processed.

        When ``current_domains`` does not hold exactly one domain, the first
        usable hint is taken instead, in the order: booked domain, user keywords,
        system keywords, system act domains, next user act domains. A list hint
        is usable only if it holds exactly one domain. Hints left at ``None``
        count as "no hint".
        """
        error = 0
        remapped_acts = []

        # if there is more than one current domain or none at all, we try to get the domain differently
        if len(current_domains) != 1:
            fallbacks = (
                [booked_domain] if booked_domain else [],
                keyword_domains_user or [],
                keyword_domains_system or [],
                current_domain_system or [],
                next_user_domain or [],
            )
            for candidate in fallbacks:
                if len(candidate) == 1:
                    current_domains = candidate
                    break

        # Intent a Booking act receives once it is attached to a concrete domain:
        # according to the data, Booking-Book (without Ref) is really an inform and
        # Booking-Inform is really a booking offer; NoBook/Request keep their intent.
        booking_intent_map = {
            "Book": "Inform",
            "Inform": "OfferBook",
            "NoBook": "NoBook",
            "Request": "Request",
        }

        for act in flattened_acts:
            # best effort: an act that cannot be processed is reported and counted
            # (e.g. malformed tuple, no candidate domain, domain unknown to the ontology)
            try:
                domain, intent, slot, value = act
                act_key = f"{domain}-{intent}-{slot}"
                if act_key == "Booking-Book-Ref":
                    # the reference number means the booking happened: emit the Book
                    # act, plus the reference if the domain has such a slot
                    potential_domain = current_domains[0]
                    remapped_acts.append((potential_domain, "Book", "none", "none"))
                    if ontology_check(potential_domain, slot, ontology):
                        remapped_acts.append((potential_domain, "Inform", "Ref", value))
                elif domain == "Booking" and intent in booking_intent_map:
                    # slots the target domain does not know are dropped
                    potential_domain = current_domains[0]
                    if ontology_check(potential_domain, slot, ontology):
                        remapped_acts.append((potential_domain, booking_intent_map[intent], slot, value))
                elif act_key == "Taxi-Inform-Car":
                    # taxi-inform-car actually triggers the booking and informs on a car
                    remapped_acts.append((domain, "Book", "none", "none"))
                    remapped_acts.append((domain, intent, slot, value))
                elif act_key in ("Train-Inform-Ref", "Train-OfferBooked-Ref"):
                    # train-inform/offerbooked-ref actually triggers the booking and informs on the reference number
                    remapped_acts.append((domain, "Book", "none", "none"))
                    remapped_acts.append((domain, "Inform", slot, value))
                elif domain == "Train" and intent == "OfferBooked":
                    # any other offerbooked slot is actually an inform act
                    remapped_acts.append((domain, "Inform", slot, value))
                else:
                    remapped_acts.append(act)
            except Exception as e:
                print("Error detected:", e)
                error += 1

        return remapped_acts, error
    
    
    def ontology_check(domain_, slot_, init_ontology):
        """Tell whether a (domain, slot) pair resolves to a slot of the ontology.

        Both names are lower-cased first. When the slot is not listed for the
        domain as written, it is translated through the module-level
        ``slot_name_map`` (flat entries first, then the entries nested under the
        domain) and looked up once more.

        :param domain_: domain name, any casing
        :param slot_: slot name, any casing
        :param init_ontology: ontology dict, read as ``init_ontology['domains'][domain]['slots']``
        :return: True if the (possibly renamed) slot is a slot of the domain
        """
        domain_key = domain_.lower()
        slot_key = slot_.lower()
        known_slots = init_ontology['domains'][domain_key]['slots']

        # fast path: the slot is already spelled the way the ontology spells it
        if slot_key in known_slots:
            return True

        if slot_key in slot_name_map:
            slot_key = slot_name_map[slot_key]
        elif slot_key in slot_name_map[domain_key]:
            slot_key = slot_name_map[domain_key][slot_key]
        return slot_key in known_slots
    
    
    def reverse_da(dialogue_acts):
        """Rebuild MultiWOZ-style dialogue acts from the converted representation.

        :param dialogue_acts: dict mapping an act type to a list of act dicts, each
            holding 'intent', 'domain', 'slot' and optionally 'value'
        :return: dict mapping 'Domain-Intent' to a list of ``[Slot, value]`` pairs
        """
        # intents whose original spelling is not what str.capitalize() would give
        mixed_case_intents = {'nooffer': 'NoOffer', 'nobook': 'NoBook', 'offerbook': 'OfferBook'}

        das = {}
        for acts in dialogue_acts.values():
            for act in acts:
                intent = act['intent']
                domain = act['domain']
                slot = act['slot']
                value = act.get('value', '')

                if domain == 'general':
                    # the general domain keeps domain and intent exactly as given
                    key = '-'.join((domain, intent))
                else:
                    key = '-'.join((domain.capitalize(),
                                    mixed_case_intents.get(intent, intent.capitalize())))
                pairs = das.setdefault(key, [])

                # flat renames take precedence over the per-domain ones
                if slot in reverse_da_slot_name_map:
                    original_slot = reverse_da_slot_name_map[slot]
                elif domain in reverse_da_slot_name_map and slot in reverse_da_slot_name_map[domain]:
                    original_slot = reverse_da_slot_name_map[domain][slot]
                else:
                    original_slot = slot.capitalize()

                # an empty value becomes '?' for requests and 'none' otherwise
                if value == '':
                    value = '?' if intent == 'request' else 'none'
                if original_slot == '':
                    original_slot = 'none'
                pairs.append([original_slot, value])
        return das
    
    
    def normalize_domain_slot_value(domain, slot, value):
        """Map a raw (domain, slot, value) triple onto the ontology's naming.

        The domain and slot are lower-cased and the value is stripped. "Don't
        care" spellings become ``'dontcare'``; ``'?'``, ``'none'`` and
        ``'not mentioned'`` become the empty string. A slot that is not listed for
        the domain is renamed through the module-level ``slot_name_map`` (flat
        entries first, then the entries nested under the domain).

        :param domain: domain name, any casing
        :param slot: slot name, any casing
        :param value: raw value string
        :return: tuple ``(domain, slot, value)`` after normalization
        :raises Exception: if the domain is not in the ontology, or the slot is
            neither in the ontology nor in ``slot_name_map``
        """
        value_aliases = {
            'do nt care': 'dontcare',
            "do n't care": 'dontcare',
            '?': '',
            'none': '',
            'not mentioned': '',
        }
        domain, slot, value = domain.lower(), slot.lower(), value.strip()
        value = value_aliases.get(value, value)

        if domain not in ontology['domains']:
            raise Exception(f'{domain} not in ontology')

        domain_slots = ontology['domains'][domain]['slots']
        if slot not in domain_slots:
            if slot in slot_name_map:
                slot = slot_name_map[slot]
            elif slot in slot_name_map[domain]:
                slot = slot_name_map[domain][slot]
            else:
                raise Exception(f'{domain}-{slot} not in ontology')
        # a renamed slot must be either empty or a real slot of the domain
        assert slot == '' or slot in domain_slots, f'{(domain, slot, value)} not in ontology'
        return domain, slot, value
    
    
    def convert_da(da_dict, utt, sent_tokenizer, word_tokenizer):
        '''
        Convert the dialogue acts of one utterance into the unified format and,
        for non-categorical acts, try to locate the value inside the utterance.

        :param da_dict: dict[(intent, domain, slot, value)] = [word_start, word_end];
            the span list is empty when no span annotation exists for the act
        :param utt: user or system utt
        :param sent_tokenizer: object providing ``tokenize`` and ``span_tokenize`` over the utterance
        :param word_tokenizer: object providing ``tokenize`` and ``span_tokenize`` over one sentence
        :return: dict with the lists 'categorical', 'non-categorical' and 'binary';
            non-categorical acts whose value was located additionally carry the
            character offsets 'start' and 'end', and their 'value' is replaced by
            the matching slice of ``utt``
        '''
        global ontology, digit2word, cnt_domain_slot

        converted_da = {
            'categorical': [],
            'non-categorical': [],
            'binary': []
        }
        sentences = sent_tokenizer.tokenize(utt)
        sent_spans = sent_tokenizer.span_tokenize(utt)
        tokens = [token for sent in sentences for token in word_tokenizer.tokenize(sent)]
        # word spans are computed per sentence, then shifted by the sentence start
        # so that they index into the whole utterance
        token_spans = [(sent_span[0] + token_span[0], sent_span[0] + token_span[1]) for sent, sent_span in
                       zip(sentences, sent_spans) for token_span in word_tokenizer.span_tokenize(sent)]
        # assert len(tokens) == len(token_spans)
        # for token, span in zip(tokens, token_spans):
        #     if utt[span[0]:span[1]] != '"':
        #         assert utt[span[0]:span[1]] == token

        for (intent, domain, slot, value), span in da_dict.items():
            if intent == 'request' or slot == '' or value == '':
                # binary dialog acts: they carry no value, so only intent/domain/slot are kept
                assert value == ''
                converted_da['binary'].append({
                    'intent': intent,
                    'domain': domain,
                    'slot': slot
                })
            elif ontology['domains'][domain]['slots'][slot]['is_categorical']:
                # categorical dialog acts: the value is kept as annotated, no span is searched
                converted_da['categorical'].append({
                    'intent': intent,
                    'domain': domain,
                    'slot': slot,
                    'value': value
                })
            else:
                # non-categorical dialog acts: appended first, then patched in place
                # (via [-1]) below once a character span has been found
                converted_da['non-categorical'].append({
                    'intent': intent,
                    'domain': domain,
                    'slot': slot,
                    'value': value
                })
                # correct some value and try to give char level span
                match = False
                value = value.lower()
                if span and span[0] <= span[1]:
                    # use original span annotation, but tokenizations are different
                    start_word, end_word = span
                    if end_word >= len(tokens):
                        # due to different tokenization the annotated span can run past the
                        # last token; shift it left so that end_word is the last token
                        # NOTE(review): start_word can become negative here, in which case
                        # the indexing below counts from the end of token_spans - confirm
                        # this is acceptable
                        delta = end_word - len(tokens) + 1
                        start_word -= delta
                        end_word -= delta
                    start_char, end_char = token_spans[start_word][0], token_spans[end_word][1]
                    value_span = utt[start_char:end_char].lower()
                    # optimistic default; reset to False only when every strategy below fails
                    match = True
                    if value_span == value:
                        cnt_domain_slot['span match'] += 1
                    elif value.isdigit() and value in digit2word and digit2word[value] == value_span:
                        # !!!CHANGE VALUE: value is digit but value span is word
                        cnt_domain_slot['digit value match'] += 1
                    elif ''.join(value.split()) == ''.join(value_span.split()):
                        # !!!CHANGE VALUE: equal when remove blank
                        cnt_domain_slot['remove blank'] += 1
                    elif value in value_span:
                        # value in value_span: narrow the span down to the value itself
                        start_char += value_span.index(value)
                        end_char = start_char + len(value)
                        assert utt[start_char:end_char].lower() == value, f'{[value, utt[start_char:end_char], utt]}'
                        cnt_domain_slot['value in span'] += 1
                    elif ':' in value and value == '0' + value_span:
                        # !!!CHANGE VALUE: time x:xx == 0x:xx
                        cnt_domain_slot['x:xx == 0x:xx'] += 1
                    else:
                        # span mismatch, search near 1-2 words: widen the span by one,
                        # then two tokens on each side (clamped to the utterance)
                        for window in range(1, 3):
                            start = max(0, start_word - window)
                            end = min(len(token_spans) - 1, end_word + window)
                            large_span = utt[token_spans[start][0]:token_spans[end][1]].lower()
                            if value in large_span:
                                start_char = token_spans[start][0] + large_span.index(value)
                                end_char = start_char + len(value)
                                assert utt[
                                       start_char:end_char].lower() == value, f'{[value, utt[start_char:end_char], utt]}'
                                cnt_domain_slot[f'window={window}'] += 1
                                break
                        else:
                            # still not found (the loop finished without a break)
                            match = False

                if match:
                    # store the surface form taken from the utterance plus its character offsets
                    converted_da['non-categorical'][-1]['value'] = utt[start_char:end_char]
                    converted_da['non-categorical'][-1]['start'] = start_char
                    converted_da['non-categorical'][-1]['end'] = end_char
                    cnt_domain_slot['have span'] += 1
                else:
                    cnt_domain_slot['no span'] += 1
        return converted_da
    
    
    def preprocess():
        """Convert the original MultiWOZ 2.1 release into the unified data format.

        Reads ``MultiWOZ_2.1/`` (extracting ``MultiWOZ_2.1.zip`` first when the
        directory is absent), converts every dialogue, then writes
        ``dummy_data.json`` (the first ten dialogues), ``data/ontology.json`` and
        ``data/dialogues.json``. The ``data`` directory is packed into
        ``data.zip``, after which both ``MultiWOZ_2.1`` and ``data`` are removed.

        Every file and archive is opened in a ``with`` block, so the JSON output
        is flushed and closed before it is added to ``data.zip``.

        :return: tuple ``(dialogues, ontology)``; dialogues are ordered train,
            validation, test
        :raises FileNotFoundError: if neither the data directory nor the zip exists
        """
        global ontology, cnt_domain_slot

        original_data_dir = 'MultiWOZ_2.1'
        new_data_dir = 'data'

        if not os.path.exists(original_data_dir):
            original_data_zip = 'MultiWOZ_2.1.zip'
            if not os.path.exists(original_data_zip):
                raise FileNotFoundError(
                    f'cannot find original data {original_data_zip} in multiwoz21/, should manually download MultiWOZ_2.1.zip from https://github.com/budzianowski/multiwoz/blob/master/data/MultiWOZ_2.1.zip')
            with ZipFile(original_data_zip) as archive:
                archive.extractall()

        # copy the database files next to the converted data
        os.makedirs(new_data_dir, exist_ok=True)
        for filename in os.listdir(original_data_dir):
            if 'db' in filename:
                copy2(f'{original_data_dir}/{filename}', new_data_dir)

        with open(f'{original_data_dir}/data.json') as data_file:
            original_data = json.load(data_file)
        with open(f'{original_data_dir}/valListFile.txt') as list_file:
            val_list = set(list_file.read().split())
        with open(f'{original_data_dir}/testListFile.txt') as list_file:
            test_list = set(list_file.read().split())

        dataset = 'multiwoz21'
        splits = ['train', 'validation', 'test']
        dialogues_by_split = {split: [] for split in splits}
        sent_tokenizer = PunktSentenceTokenizer()
        word_tokenizer = TreebankWordTokenizer()
        booking_remapper = BookingActRemapper(ontology)
        for ori_dialog_id, ori_dialog in tqdm(original_data.items()):
            # dialogues listed in neither split file belong to the training set
            if ori_dialog_id in val_list:
                split = 'validation'
            elif ori_dialog_id in test_list:
                split = 'test'
            else:
                split = 'train'
            dialogue_id = f'{dataset}-{split}-{len(dialogues_by_split[split])}'

            # get user goal and involved domains
            cur_domains = []
            goal = {
                'description': '. '.join(ori_dialog['goal']['message']),
                'inform': {},
                'request': {}
            }
            for k, v in ori_dialog['goal'].items():
                if len(v) != 0 and k in ontology['domains']:
                    cur_domains.append(k)
                    goal['inform'][k] = {}
                    goal['request'][k] = {}
                    for attr in ['fail_info', 'info', 'fail_book', 'book']:
                        if attr in v:
                            for slot, value in v[attr].items():
                                if 'invalid' in slot:
                                    continue
                                domain, slot, value = normalize_domain_slot_value(k, slot, value)
                                # several constraints on the same slot are joined with '|'
                                if slot in goal['inform'][domain]:
                                    goal['inform'][domain][slot] += '|' + value
                                else:
                                    goal['inform'][domain][slot] = value
                    if 'reqt' in v:
                        for slot in v['reqt']:
                            domain, slot, _ = normalize_domain_slot_value(k, slot, '')
                            goal['request'][domain][slot] = ''

            dialogue = {
                'dataset': dataset,
                'data_split': split,
                'dialogue_id': dialogue_id,
                'original_id': ori_dialog_id,
                'domains': cur_domains,  # will be updated by dialog_acts and state
                'goal': goal,
                'turns': []
            }

            booking_remapper.reset()
            belief_domains = ['attraction', 'restaurant', 'train', 'hotel', 'taxi', 'hospital']
            entity_booked_dict = {domain: False for domain in belief_domains}
            for turn_id, turn in enumerate(ori_dialog['log']):
                # correct some grammar errors in the text, mainly following `tokenization.md` in MultiWOZ_2.1
                text = turn['text']
                text = re.sub(" Im ", " I'm ", text)
                text = re.sub(" im ", " i'm ", text)
                text = re.sub(r"^Im ", "I'm ", text)
                text = re.sub(r"^im ", "i'm ", text)
                text = re.sub("theres", "there's", text)
                text = re.sub("dont", "don't", text)
                text = re.sub("whats", "what's", text)
                text = re.sub('thats', "that's", text)
                utt = text
                # even turn indices are treated as user turns, odd ones as system turns
                speaker = 'user' if turn_id % 2 == 0 else 'system'

                das = turn.get('dialog_act', [])
                spans = turn.get('span_info', [])

                if speaker == 'system':
                    das, spans = booking_remapper.remap(turn_id, ori_dialog['log'])

                da_dict = {}
                # transform DA
                for Domain_Intent in das:
                    domain, intent = Domain_Intent.lower().split('-')
                    assert intent in ontology['intents'], f'{ori_dialog_id}:{turn_id}:da\t{intent} not in ontology'
                    for Slot, value in das[Domain_Intent]:
                        domain, slot, value = normalize_domain_slot_value(domain, Slot, value)
                        if domain not in cur_domains:
                            # update original cur_domains
                            cur_domains.append(domain)
                        da_dict[(intent, domain, slot, value,)] = []

                # attach the word-level span annotation to the act it belongs to
                for span in spans:
                    Domain_Intent, Slot, value, start_word, end_word = span
                    domain, intent = Domain_Intent.lower().split('-')
                    domain, slot, value = normalize_domain_slot_value(domain, Slot, value)
                    assert (intent, domain, slot, value,) in da_dict
                    da_dict[(intent, domain, slot, value,)] = [start_word, end_word]

                dialogue_acts = convert_da(da_dict, utt, sent_tokenizer, word_tokenizer)

                dialogue['turns'].append({
                    'speaker': speaker,
                    'utterance': utt,
                    'utt_idx': turn_id,
                    'dialogue_acts': dialogue_acts,
                })

                # add to dialogue_acts dictionary in the ontology
                for da_type in dialogue_acts:
                    for da in dialogue_acts[da_type]:
                        da_key = (da['intent'], da['domain'], da['slot'])
                        ontology["dialogue_acts"][da_type].setdefault(da_key, {})
                        ontology["dialogue_acts"][da_type][da_key][speaker] = True

                if speaker == 'system':
                    # add state to last user turn
                    # add empty db_results
                    turn_state = turn['metadata']
                    cur_state = copy.deepcopy(ontology['state'])
                    booked = {}
                    for domain in turn_state:
                        if domain not in cur_state:
                            continue
                        for subdomain in ['semi', 'book']:
                            for slot, value in turn_state[domain][subdomain].items():
                                if slot == 'ticket':
                                    continue
                                elif slot == 'booked':
                                    # booking results are kept on the system turn, not in the state
                                    assert domain in ontology['domains']
                                    booked[domain] = value
                                    continue
                                _, slot, value = normalize_domain_slot_value(domain, slot, value)
                                cur_state[domain][slot] = value
                    dialogue['turns'][-2]['state'] = cur_state
                    entity_booked_dict, booked = fix_entity_booked_info(entity_booked_dict, booked)
                    dialogue['turns'][-1]['booked'] = booked
            dialogues_by_split[split].append(dialogue)
        # pprint(cnt_domain_slot.most_common())
        dialogues = []
        for split in splits:
            dialogues += dialogues_by_split[split]
        # tuple keys cannot be dumped as JSON, so each act is turned into a sorted string entry
        for da_type in ontology['dialogue_acts']:
            ontology["dialogue_acts"][da_type] = sorted([str(
                {'user': speakers.get('user', False), 'system': speakers.get('system', False), 'intent': da[0],
                 'domain': da[1], 'slot': da[2]}) for da, speakers in ontology["dialogue_acts"][da_type].items()])

        with open('dummy_data.json', 'w', encoding='utf-8') as out_file:
            json.dump(dialogues[:10], out_file, indent=2, ensure_ascii=False)
        with open(f'{new_data_dir}/ontology.json', 'w', encoding='utf-8') as out_file:
            json.dump(ontology, out_file, indent=2, ensure_ascii=False)
        with open(f'{new_data_dir}/dialogues.json', 'w', encoding='utf-8') as out_file:
            json.dump(dialogues, out_file, indent=2, ensure_ascii=False)
        with ZipFile('data.zip', 'w', ZIP_DEFLATED) as zf:
            for filename in os.listdir(new_data_dir):
                zf.write(f'{new_data_dir}/{filename}')
        rmtree(original_data_dir)
        rmtree(new_data_dir)
        return dialogues, ontology
    
    
    def fix_entity_booked_info(entity_booked_dict, booked):
        """Blank out the first non-empty booking entry seen for each domain.

        For every domain not yet flagged in ``entity_booked_dict``, a truthy
        ``booked[domain]`` flags the domain and is replaced by an empty list.
        Domains that are already flagged are left untouched. Both dicts are
        modified in place.

        :param entity_booked_dict: dict mapping domain to a bool flag
        :param booked: dict mapping domain to the turn's booking entries
        :return: the same two dicts, ``(entity_booked_dict, booked)``
        """
        for domain, already_flagged in entity_booked_dict.items():
            if already_flagged:
                continue
            if booked[domain]:
                # only the value of an existing key changes, so iteration stays valid
                entity_booked_dict[domain] = True
                booked[domain] = []
        return entity_booked_dict, booked
    
    
    # Run the whole preprocessing pipeline when this file is executed as a script.
    if __name__ == '__main__':
        preprocess()