diff --git a/data/unified_datasets/README.md b/data/unified_datasets/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a22a057e191e4bc2b75c0290e2d9cc8a09c23ffc --- /dev/null +++ b/data/unified_datasets/README.md @@ -0,0 +1,561 @@ +# Unified data format with example + +Under `data/unified_datasets` directory. + +single turn->dialogue with one turn + +Each dataset have at least 4 files: + +- `README.md`: dataset description and the main changes from original data to processed data. +- `preprocess.py`: python script that preprocess the data. By running `python preprocess.py` we can get the following two files. The structure `preprocess.py` should be: + +```python +def preprocess(): + pass +if __name__ == '__main__': + preprocess() +``` + +- `ontology.json`: dataset ontology, contains descriptions, state definition, etc. +- `data.json.zip`: contains `data.json`. + +### README + +- Data source: publication, original data download link, etc. +- Data description: + - Annotations: whether have dialogue act, belief state annotation. + - Statistics: \# domains, # dialogues, \# utterances, Avg. turns, Avg. tokens (split by space), etc. +- Main changes from original data to processed data. + +### Ontology + +`ontology.json`: a *dict* containing: + +- `domains`: (*dict*) descriptions for domains, slots. Must contains all slots in the state and non-binary dialogue acts. + - `$domain_name`: (*dict*) + - `description`: (*str*) description for this domain. + - `slots`: (*dict*) + - `$slot_name`: (*dict*) + - `description`: (*str*) description for this slot. + - `is_categorical`: (*bool*) categorical slot or not. + - `possible_values`: (*list*) List of possible values the slot can take. If the slot is a categorical slot, it is a complete list of all the possible values. If the slot is a non categorical slot, it is either an empty list or a small sample of all the values taken by the slot. + +- `intents`: (*dict*) descriptions for intents. 
+ - `$intent_name`: (*dict*) + - `description`: (*str*) description for this intent. +- `binary_dialogue_act`: (*list* of *dict*) special dialogue acts whose value may not be present in the utterance, e.g. request the address of a hotel. + - `{"intent": (str), "domain": (str), "slot": (str), "value": (str)}`. domain, slot, value may be empty. +- `state`: (*dict*) belief state of all domains. + - `$domain_name`: (*dict*) + - `$slot_name: ""`: slot with empty value. Note that this slot set is a subset of the slots defined under `domains`. + +### Dialogues + +`data.json`: a *list* of dialogues containing: + +- `dataset`: (*str*) dataset name, must be one of ['schema', 'multiwoz', 'camrest', 'woz', ...], and be the same as the current dataset. +- `data_split`: (*str*) in [train, val, test]. +- `dialogue_id`: (*str*) the dataset name as prefix, followed by `_` and a counter, e.g. `schema_535`. +- `domains`: (*list*) domains in this dialogue. +- `turns`: (*list* of *dict*) + - `speaker`: (*str*) "user" or "system". **The dialogue starts and ends with a user turn**, and "user" and "system" turns alternate. + - `utterance`: (*str*) sentence. + - `utt_idx`: (*int*) index of this turn; `turns[utt_idx]` gives the current turn. + - `dialogue_act`: (*dict*) + - `categorical`: (*list* of *dict*) for categorical slots. + - `{"intent": (str), "domain": (str), "slot": (str), "value": (str)}`. Value sets are defined in the ontology. + - `non-categorical` (*list* of *dict*) for non-categorical slots. + - `{"intent": (str), "domain": (str), "slot": (str), "value": (str), "start": (int), "end": (int)}`. `start` and `end` are character indexes for the value span. + - `binary` (*list* of *dict*) for binary dialogue acts in ontology. + - `{"intent": (str), "domain": (str), "slot": (str), "value": (str)}`. Possible dialogue acts are listed in the `ontology['binary_dialogue_act']`. + - `state`: (*dict*, optional, user side) the full state structure is given in `ontology['state']`. + - `$domain_name`: (*dict*) contains all slots in this domain. 
+ - `$slot_name`: (*str*) value for this slot. + - `state_update`: (*dict*, optional, user side) records the difference of states between the current turn and the last turn. + - `categorical`: (*list* of *dict*) for categorical slots. + - `{"domain": (str), "slot": (str), "value": (str)}`. Value sets are defined in the ontology (**dontcare** may not be included). + - `non-categorical` (*list* of *dict*) for non-categorical slots. + - `{"domain": (str), "slot": (str), "value": (str), "utt_idx": (int), "start": (int), "end": (int)}`. `utt_idx` is the utterance index of the value. `start` and `end` are character indexes for the value span in the current turn. `turn[utt_idx]['utterance'][start:end]` gives the value. + +Other attributes are optional. + +Run `python evaluate.py $dataset` to check the validation of processed dataset. + +## Example of Schema Dataset + +```json + { + "dataset": "schema", + "data_split": "train", + "dialogue_id": "schema_535", + "original_id": "5_00022", + "domains": [ + "event_2" + ], + "turns": [ + { + "speaker": "user", + "utterance": "I feel like going out to do something in Oakland. 
I've heard the Raiders Vs Bengals game should be good.", + "utt_idx": 0, + "dialogue_act": { + "binary": [ + { + "intent": "inform_intent", + "domain": "event_2", + "slot": "intent", + "value": "geteventdates" + } + ], + "categorical": [], + "non-categorical": [ + { + "intent": "inform", + "domain": "event_2", + "slot": "event_name", + "value": "raiders vs bengals", + "start": 65, + "end": 83 + }, + { + "intent": "inform", + "domain": "event_2", + "slot": "city", + "value": "oakland", + "start": 41, + "end": 48 + } + ] + }, + "state": { + "event_2": { + "event_type": "", + "category": "", + "event_name": "raiders vs bengals", + "date": "", + "time": "", + "number_of_tickets": "", + "city": "oakland", + "venue": "", + "venue_address": "" + } + }, + "state_update": { + "categorical": [], + "non-categorical": [ + { + "domain": "event_2", + "slot": "city", + "value": "oakland", + "utt_idx": 0, + "start": 41, + "end": 48 + }, + { + "domain": "event_2", + "slot": "event_name", + "value": "raiders vs bengals", + "utt_idx": 0, + "start": 65, + "end": 83 + } + ] + } + }, + { + "speaker": "system", + "utterance": "The Raiders Vs Bengals game is at Oakland-Alameda County Coliseum today.", + "utt_idx": 1, + "dialogue_act": { + "binary": [], + "categorical": [], + "non-categorical": [ + { + "intent": "offer", + "domain": "event_2", + "slot": "date", + "value": "today", + "start": 66, + "end": 71 + }, + { + "intent": "offer", + "domain": "event_2", + "slot": "event_name", + "value": "raiders vs bengals", + "start": 4, + "end": 22 + }, + { + "intent": "offer", + "domain": "event_2", + "slot": "venue", + "value": "oakland-alameda county coliseum", + "start": 34, + "end": 65 + } + ] + } + }, + { + "speaker": "user", + "utterance": "What time does it start?", + "utt_idx": 2, + "dialogue_act": { + "binary": [ + { + "intent": "request", + "domain": "event_2", + "slot": "time", + "value": "" + } + ], + "categorical": [], + "non-categorical": [] + }, + "state": { + "event_2": { + 
"event_type": "", + "category": "", + "event_name": "raiders vs bengals", + "date": "", + "time": "", + "number_of_tickets": "", + "city": "oakland", + "venue": "", + "venue_address": "" + } + }, + "state_update": { + "categorical": [], + "non-categorical": [] + } + }, + { + "speaker": "system", + "utterance": "It starts at 7 pm.", + "utt_idx": 3, + "dialogue_act": { + "binary": [], + "categorical": [], + "non-categorical": [ + { + "intent": "inform", + "domain": "event_2", + "slot": "time", + "value": "7 pm", + "start": 13, + "end": 17 + } + ] + } + }, + { + "speaker": "user", + "utterance": "That sounds fine.", + "utt_idx": 4, + "dialogue_act": { + "binary": [ + { + "intent": "select", + "domain": "event_2", + "slot": "", + "value": "" + } + ], + "categorical": [], + "non-categorical": [] + }, + "state": { + "event_2": { + "event_type": "", + "category": "", + "event_name": "raiders vs bengals", + "date": "today", + "time": "", + "number_of_tickets": "", + "city": "oakland", + "venue": "", + "venue_address": "" + } + }, + "state_update": { + "categorical": [], + "non-categorical": [ + { + "domain": "event_2", + "slot": "date", + "value": "today", + "utt_idx": 1, + "start": 66, + "end": 71 + } + ] + } + }, + { + "speaker": "system", + "utterance": "Do you want to get tickets for it?", + "utt_idx": 5, + "dialogue_act": { + "binary": [ + { + "intent": "offer_intent", + "domain": "event_2", + "slot": "intent", + "value": "buyeventtickets" + } + ], + "categorical": [], + "non-categorical": [] + } + }, + { + "speaker": "user", + "utterance": "Yes, can you buy 3 tickets for me?", + "utt_idx": 6, + "dialogue_act": { + "binary": [ + { + "intent": "affirm_intent", + "domain": "event_2", + "slot": "", + "value": "" + } + ], + "categorical": [ + { + "intent": "inform", + "domain": "event_2", + "slot": "number_of_tickets", + "value": "3" + } + ], + "non-categorical": [] + }, + "state": { + "event_2": { + "event_type": "", + "category": "", + "event_name": "raiders vs 
bengals", + "date": "today", + "time": "", + "number_of_tickets": "3", + "city": "oakland", + "venue": "", + "venue_address": "" + } + }, + "state_update": { + "categorical": [ + { + "domain": "event_2", + "slot": "number_of_tickets", + "value": "3" + } + ], + "non-categorical": [] + } + }, + { + "speaker": "system", + "utterance": "Sure. I will go ahead and buy 3 tickets for the Raiders Vs Bengals game in Oakland today. Is that right?", + "utt_idx": 7, + "dialogue_act": { + "binary": [], + "categorical": [ + { + "intent": "confirm", + "domain": "event_2", + "slot": "number_of_tickets", + "value": "3" + } + ], + "non-categorical": [ + { + "intent": "confirm", + "domain": "event_2", + "slot": "event_name", + "value": "raiders vs bengals", + "start": 48, + "end": 66 + }, + { + "intent": "confirm", + "domain": "event_2", + "slot": "date", + "value": "today", + "start": 83, + "end": 88 + }, + { + "intent": "confirm", + "domain": "event_2", + "slot": "city", + "value": "oakland", + "start": 75, + "end": 82 + } + ] + } + }, + { + "speaker": "user", + "utterance": "Yes, that's good. What's the address?", + "utt_idx": 8, + "dialogue_act": { + "binary": [ + { + "intent": "request", + "domain": "event_2", + "slot": "venue_address", + "value": "" + }, + { + "intent": "affirm", + "domain": "", + "slot": "", + "value": "" + } + ], + "categorical": [], + "non-categorical": [] + }, + "state": { + "event_2": { + "event_type": "", + "category": "", + "event_name": "raiders vs bengals", + "date": "today", + "time": "", + "number_of_tickets": "3", + "city": "oakland", + "venue": "", + "venue_address": "" + } + }, + "state_update": { + "categorical": [], + "non-categorical": [] + } + }, + { + "speaker": "system", + "utterance": "The game is at 7000 Coliseum Way. 
I've bought the tickets.", + "utt_idx": 9, + "dialogue_act": { + "binary": [ + { + "intent": "notify_success", + "domain": "event_2", + "slot": "", + "value": "" + } + ], + "categorical": [], + "non-categorical": [ + { + "intent": "inform", + "domain": "event_2", + "slot": "venue_address", + "value": "7000 coliseum way", + "start": 15, + "end": 32 + } + ] + } + }, + { + "speaker": "user", + "utterance": "Thanks! That's all.", + "utt_idx": 10, + "dialogue_act": { + "binary": [ + { + "intent": "thank_you", + "domain": "", + "slot": "", + "value": "" + } + ], + "categorical": [], + "non-categorical": [] + }, + "state": { + "event_2": { + "event_type": "", + "category": "", + "event_name": "raiders vs bengals", + "date": "today", + "time": "", + "number_of_tickets": "3", + "city": "oakland", + "venue": "", + "venue_address": "" + } + }, + "state_update": { + "categorical": [], + "non-categorical": [] + } + }, + { + "speaker": "system", + "utterance": "Need help with anything else?", + "utt_idx": 11, + "dialogue_act": { + "binary": [ + { + "intent": "req_more", + "domain": "", + "slot": "", + "value": "" + } + ], + "categorical": [], + "non-categorical": [] + } + }, + { + "speaker": "user", + "utterance": "No, thank you.", + "utt_idx": 12, + "dialogue_act": { + "binary": [ + { + "intent": "negate", + "domain": "", + "slot": "", + "value": "" + }, + { + "intent": "thank_you", + "domain": "", + "slot": "", + "value": "" + } + ], + "categorical": [], + "non-categorical": [] + }, + "state": { + "event_2": { + "event_type": "", + "category": "", + "event_name": "raiders vs bengals", + "date": "today", + "time": "", + "number_of_tickets": "3", + "city": "oakland", + "venue": "", + "venue_address": "" + } + }, + "state_update": { + "categorical": [], + "non-categorical": [] + } + } + ] + } +``` + diff --git a/data/unified_datasets/camrest/README.md b/data/unified_datasets/camrest/README.md new file mode 100644 index 
0000000000000000000000000000000000000000..ae06dbe9dd5cd62c980f6b9bc731a1542bfd1aad --- /dev/null +++ b/data/unified_datasets/camrest/README.md @@ -0,0 +1,24 @@ +# README + +## Features + +- Annotations: dialogue act, character-level span for non-categorical slots. + +Statistics: + +| | \# dialogues | \# utterances | avg. turns | avg. tokens | \# domains | +| ----- | ------------ | ------------- | ---------- | ----------- | ---------- | +| train | 406 | 2936 | 7.23 | 11.36 | 1 | +| dev | 135 | 941 | 6.97 | 11.99 | 1 | +| test | 135 | 935 | 6.93 | 11.87 | 1 | + +## Main changes + +- domain is set to **restaurant** +- ignore some rare pair +- 3 values are not found in original utterances +- **dontcare** values in non-categorical slots are calculated in `evaluate.py` so `da_match` in evaluation is lower than actual number. + +## Original data + +camrest used in convlab2, included in `data/` path \ No newline at end of file diff --git a/data/unified_datasets/camrest/data.zip b/data/unified_datasets/camrest/data.zip new file mode 100644 index 0000000000000000000000000000000000000000..b0d3db9f816f377f431e33d4a43ab0b9eb668f2a Binary files /dev/null and b/data/unified_datasets/camrest/data.zip differ diff --git a/data/unified_datasets/camrest/ontology.json b/data/unified_datasets/camrest/ontology.json new file mode 100644 index 0000000000000000000000000000000000000000..b5617f7ef4d441674a361a280acdf3123468d53a --- /dev/null +++ b/data/unified_datasets/camrest/ontology.json @@ -0,0 +1,122 @@ +{ + "domains": { + "restaurant": { + "description": "find a restaurant to eat", + "slots": { + "food": { + "description": "food type the restaurant serves", + "is_categorical": false, + "possible_values": [] + }, + "area": { + "description": "area where the restaurant is located", + "is_categorical": true, + "possible_values": [ + "north", + "east", + "west", + "south", + "centre" + ] + }, + "name": { + "description": "name of the restaurant", + "is_categorical": false, + 
"possible_values": [] + }, + "pricerange": { + "description": "price range of the restaurant", + "is_categorical": true, + "possible_values": [ + "cheap", + "moderate", + "expensive" + ] + }, + "phone": { + "description": "phone number of the restaurant", + "is_categorical": false, + "possible_values": [] + }, + "address": { + "description": "exact location of the restaurant", + "is_categorical": false, + "possible_values": [] + }, + "postcode": { + "description": "postal code of the restaurant", + "is_categorical": false, + "possible_values": [] + } + } + } + }, + "intents": { + "inform": { + "description": "inform user of value of a slot" + }, + "request": { + "description": "ask for value of a slot" + }, + "nooffer": { + "description": "inform user that no restaurant matches his request" + } + }, + "binary_dialogue_act": [ + { + "intent": "request", + "domain": "restaurant", + "slot": "food", + "value": "" + }, + { + "intent": "request", + "domain": "restaurant", + "slot": "address", + "value": "" + }, + { + "intent": "nooffer", + "domain": "restaurant", + "slot": "", + "value": "" + }, + { + "intent": "request", + "domain": "restaurant", + "slot": "area", + "value": "" + }, + { + "intent": "request", + "domain": "restaurant", + "slot": "phone", + "value": "" + }, + { + "intent": "request", + "domain": "restaurant", + "slot": "pricerange", + "value": "" + }, + { + "intent": "request", + "domain": "restaurant", + "slot": "postcode", + "value": "" + }, + { + "intent": "request", + "domain": "restaurant", + "slot": "name", + "value": "" + } + ], + "state": { + "restaurant": { + "pricerange": "", + "area": "", + "food": "" + } + } +} \ No newline at end of file diff --git a/data/unified_datasets/camrest/original_data.zip b/data/unified_datasets/camrest/original_data.zip new file mode 100644 index 0000000000000000000000000000000000000000..ab07af894f6508a78e7baf21d914978fe19e16a3 Binary files /dev/null and b/data/unified_datasets/camrest/original_data.zip differ diff 
self_dir = os.path.dirname(os.path.abspath(__file__))

# Slots whose values come from a closed vocabulary; every other slot is
# treated as non-categorical (free text, located by character span).
cat_slot_values = {
    'area': ['north', 'east', 'west', 'south', 'centre'],
    'pricerange': ['cheap', 'moderate', 'expensive']
}

# Human-readable descriptions copied verbatim into ontology.json.
camrest_desc = {
    'restaurant': {
        'domain': 'find a restaurant to eat',
        'food': 'food type the restaurant serves',
        'area': 'area where the restaurant is located',
        'name': 'name of the restaurant',
        'pricerange': 'price range of the restaurant',
        'phone': 'phone number of the restaurant',
        'address': 'exact location of the restaurant',
        'postcode': 'postal code of the restaurant',
    },
    'intents': {
        'inform': 'inform user of value of a slot',
        'request': 'ask for value of a slot',
        'nooffer': 'inform user that no restaurant matches his request'
    }
}

all_slots = ['food', 'area', 'name', 'pricerange', 'phone', 'address', 'postcode']


def _binary_act(intent, slot=''):
    """Build a fresh binary dialogue act (empty value) for the restaurant domain."""
    return {'intent': intent, 'domain': 'restaurant', 'slot': slot, 'value': ''}


def convert_da(utt, da, all_intent, all_binary_das):
    """Convert one original dialogue-act annotation to the unified format.

    Args:
        utt: the (already lower-cased) utterance the acts belong to.
        da: mapping ``intent -> list of (slot, value)`` pairs.
        all_intent: running list of intents seen so far; extended in place.
        all_binary_das: running list of distinct binary acts; extended in place.

    Returns:
        dict with the keys ``binary``, ``categorical`` and ``non-categorical``.
        Non-categorical acts carry ``start``/``end`` character offsets only
        when the value occurs literally in ``utt``.
    """
    converted_da = {
        'binary': [],
        'categorical': [],
        'non-categorical': []
    }

    def add_binary(intent, slot=''):
        # Separate dict objects are stored in the turn and in the global list
        # so that later mutation of one cannot leak into the other.
        converted_da['binary'].append(_binary_act(intent, slot))
        act = _binary_act(intent, slot)
        if act not in all_binary_das:
            all_binary_das.append(act)

    for _intent, svs in da.items():
        if _intent not in all_intent:
            all_intent.append(_intent)

        if _intent == 'nooffer':
            # nooffer carries no slot information.
            add_binary(_intent)
            continue

        for s, v in svs:
            s = s.lower()
            v = v.lower()
            # Normalise case first so that e.g. "Dont Care" is also recognised.
            if 'care' in v:
                v = 'dontcare'

            if _intent == 'request':
                add_binary(_intent, s)
                continue

            if s in cat_slot_values:
                assert v in cat_slot_values[s] + ['dontcare'], \
                    'unexpected value {!r} for categorical slot {!r}'.format(v, s)
                converted_da['categorical'].append({
                    'intent': _intent,
                    'domain': 'restaurant',
                    'slot': s,
                    'value': v
                })
                continue

            # non-categorical: try to locate the value in the utterance
            start_ch = utt.find(v)
            if start_ch == -1:
                # Value is not literally present (e.g. dontcare): keep the act
                # but without a span.
                converted_da['non-categorical'].append({
                    'intent': _intent,
                    'domain': 'restaurant',
                    'slot': s,
                    'value': v,
                })
                continue

            end_ch = start_ch + len(v)
            converted_da['non-categorical'].append({
                'intent': _intent,
                'domain': 'restaurant',
                'slot': s,
                'value': utt[start_ch:end_ch],
                'start': start_ch,
                'end': end_ch
            })

    return converted_da


def convert_state(state, state_slots):
    """Convert the original SLU annotation of a user turn into a belief state.

    Args:
        state: list of dicts with the keys ``act`` and ``slots`` (a list of
            ``(slot, value)`` pairs); only ``inform`` acts are used.
        state_slots: slots that make up the belief state.

    Returns:
        ``{'restaurant': {slot: value}}`` with ``''`` for unfilled slots.

    Raises:
        RuntimeError: if an informed slot is a known slot (``all_slots``) but
            is not part of ``state_slots``.
    """
    ret_state = {'restaurant': {k: '' for k in state_slots}}
    for da in state:
        if da['act'] != 'inform':
            continue

        for s, v in da['slots']:
            s = s.lower()
            v = v.lower()

            if s not in all_slots:
                logging.info('state slot {} not in all_slots!'.format(s))
                continue

            if s not in state_slots:
                # Previously a bare `raise`, which only produced
                # "No active exception to reraise".
                raise RuntimeError(
                    'state slot {!r} is not one of the tracked state slots {}'.format(s, list(state_slots)))

            ret_state['restaurant'][s] = v

    return ret_state


def get_state_update(prev_state, cur_state, prev_turns, cur_user_da, dialog_id):
    """Compute the state difference between two consecutive user turns.

    Args:
        prev_state: ``{slot: value}`` after the previous user turn.
        cur_state: ``{slot: value}`` after the current user turn.
        prev_turns: converted turns preceding the current user turn; its
            length is the utterance index of the current turn.
        cur_user_da: non-categorical dialogue acts of the current user turn.
        dialog_id: unused; kept for interface compatibility.

    Returns:
        dict with ``categorical`` and ``non-categorical`` update lists. A
        non-categorical update carries ``utt_idx``/``start``/``end`` when the
        value was located in the current user utterance or in the preceding
        system utterance.
    """
    state_update = {'categorical': [], 'non-categorical': []}
    prev_sys_da = prev_turns[-1]['dialogue_act']['non-categorical'] if prev_turns else []

    def spanned(slot, value, utt_idx, source_da):
        return {
            'domain': 'restaurant',
            'slot': slot,
            'value': value,
            'utt_idx': utt_idx,
            'start': source_da['start'],
            'end': source_da['end']
        }

    for s, v in cur_state.items():
        unchanged = s in prev_state and prev_state[s] == v
        if unchanged or v == '':
            continue

        if s in cat_slot_values:
            assert v in cat_slot_values[s] + ['dontcare'], \
                'unexpected value {!r} for categorical slot {!r}'.format(v, s)
            state_update['categorical'].append({
                'domain': 'restaurant',
                'slot': s,
                'value': v,
            })
            continue

        # non-categorical slot: first look in the current user acts
        found = False
        for _usr_da in cur_user_da:
            if _usr_da['slot'] != s or _usr_da['value'] != v:
                continue
            found = True
            if v != 'dontcare' and 'start' in _usr_da:
                state_update['non-categorical'].append(spanned(s, v, len(prev_turns), _usr_da))
            else:
                state_update['non-categorical'].append({
                    'domain': 'restaurant',
                    'slot': s,
                    'value': v,
                })
        if found:
            continue

        # ... then in the preceding system turn (only acts that have a span)
        for _sys_da in prev_sys_da:
            if _sys_da['slot'] == s and _sys_da['value'] == v and 'start' in _sys_da:
                state_update['non-categorical'].append(spanned(s, v, len(prev_turns) - 1, _sys_da))
                found = True

        if not found:
            state_update['non-categorical'].append({
                'domain': 'restaurant',
                'slot': s,
                'value': v
            })

    return state_update


def preprocess():
    """Convert the original CamRest data into the unified format.

    If ``data.zip`` and ``ontology.json`` already exist next to this script
    they are loaded and returned; otherwise ``original_data.zip`` is
    extracted, converted, and both files are written.

    Returns:
        ``(all_data, ontology)``: the list of converted dialogues and the
        ontology dict.

    Raises:
        FileNotFoundError: if ``original_data.zip`` is missing.
    """
    original_zipped_path = os.path.join(self_dir, 'original_data.zip')
    data_zip_path = os.path.join(self_dir, 'data.zip')
    ontology_path = os.path.join(self_dir, 'ontology.json')

    if not os.path.exists(original_zipped_path):
        raise FileNotFoundError(original_zipped_path)

    # Both checks are relative to this script's directory; the second check
    # used to look in the current working directory instead.
    if os.path.exists(data_zip_path) and os.path.exists(ontology_path):
        all_data = read_zipped_json(data_zip_path, 'data.json')
        with open(ontology_path, 'r') as f:
            new_ont = json.load(f)
        return all_data, new_ont

    with zipfile.ZipFile(original_zipped_path, 'r') as archive:
        archive.extractall(self_dir)

    all_data = []
    all_intent = []
    all_binary_das = []
    all_state_slots = ['pricerange', 'area', 'food']

    data_splits = ['train', 'val', 'test']
    extract_dir = os.path.join(self_dir, 'original_data')

    dialog_id = 1
    for data_split in data_splits:
        with open(os.path.join(extract_dir, '{}.json'.format(data_split))) as f:
            data = json.load(f)

        for d in data:
            converted_dialogue = {
                'dataset': 'camrest',
                'data_split': data_split,
                'dialogue_id': 'camrest_' + str(dialog_id),
                'original_id': d['dialogue_id'],
                'domains': ['restaurant'],
                'turns': []
            }
            turns = converted_dialogue['turns']

            prev_state = {'restaurant': {}}
            for turn in d['dial']:
                usr_text = turn['usr']['transcript'].lower()
                usr_da = turn['usr']['dialog_act']

                sys_text = turn['sys']['sent'].lower()
                sys_da = turn['sys']['dialog_act']

                cur_state = convert_state(turn['usr']['slu'], all_state_slots)
                cur_user_da = convert_da(usr_text, usr_da, all_intent, all_binary_das)

                usr_turn = {
                    'utt_idx': len(turns),
                    'speaker': 'user',
                    'utterance': usr_text,
                    'dialogue_act': cur_user_da,
                    'state': copy.deepcopy(cur_state),
                    'state_update': get_state_update(
                        prev_state['restaurant'], cur_state['restaurant'], turns,
                        cur_user_da['non-categorical'], converted_dialogue['dialogue_id'])
                }

                sys_turn = {
                    'utt_idx': len(turns) + 1,
                    'speaker': 'system',
                    'utterance': sys_text,
                    'dialogue_act': convert_da(sys_text, sys_da, all_intent, all_binary_das),
                }

                prev_state = copy.deepcopy(cur_state)

                turns.append(usr_turn)
                turns.append(sys_turn)

            # The unified format requires a dialogue to end with a user turn.
            # The emptiness guard avoids an IndexError on an empty dialogue.
            if turns and turns[-1]['speaker'] == 'system':
                turns.pop(-1)
            all_data.append(converted_dialogue)
            dialog_id += 1

    # NOTE(review): data.json is written to the current working directory
    # because write_zipped_json is handed the bare file name — presumably it
    # reads that path relative to the CWD; confirm against convlab2.util.file_util.
    with open('data.json', 'w') as f:
        json.dump(all_data, f, indent=4)  # closed (flushed) before it is zipped
    write_zipped_json(data_zip_path, 'data.json')
    os.remove('data.json')

    new_ont = {
        'domains': {},
        'intents': {},
        'binary_dialogue_act': [],
        'state': {}
    }

    new_ont['state']['restaurant'] = {ss: '' for ss in all_state_slots}
    new_ont['binary_dialogue_act'].extend(all_binary_das)

    for intent in all_intent:
        new_ont['intents'][intent] = {'description': camrest_desc['intents'][intent]}

    new_ont['domains']['restaurant'] = {
        'description': camrest_desc['restaurant']['domain'],
        'slots': {}
    }
    for s in all_slots:
        new_ont['domains']['restaurant']['slots'][s] = {
            "description": camrest_desc['restaurant'][s],
            "is_categorical": s in cat_slot_values,
            "possible_values": cat_slot_values[s] if s in cat_slot_values else []
        }
    with open(ontology_path, 'w') as f:
        json.dump(new_ont, f, indent=4)

    return all_data, new_ont


if __name__ == '__main__':
    preprocess()
+ } + } + } + }, + "intents": { + intent name: { + "description": intent description + } + }, + "binary_dialogue_act": { + [ + { + "intent": intent name, + "domain": domain name + "slot": slot name, + "value": some value + } + ] + } + "state": { + domain name: { + slot name: "" + } + } + } + """ + global special_values + + ontology_file = os.path.join(f'{name}', 'ontology.json') + assert os.path.exists(ontology_file), f'ontology file should named {ontology_file}' + ontology = json.load(open(ontology_file)) + + # record issues in ontology + descriptions = { + # if each domain has a description + "domains": True, + "slots": True, + "intents": True, + } + for domain_name, domain in ontology['domains'].items(): + if not domain['description']: + descriptions["domains"] = False + # if not domain_name in ontology['state']: + # print(f"domain '{domain_name}' not found in state") + for slot_name, slot in domain["slots"].items(): + if not slot["description"]: + descriptions["slots"] = False + if slot["is_categorical"]: + assert slot["possible_values"] + slot['possible_values'] = list(map(str.lower, slot['possible_values'])) + for value in special_values: + assert value not in slot['possible_values'], f'ONTOLOGY\tspecial value `{value}` should not present in possible values' + + for intent_name, intent in ontology["intents"].items(): + if not intent["description"]: + descriptions["intents"] = False + + binary_dialogue_acts = set() + for bda in ontology['binary_dialogue_act']: + assert bda['intent'] is None or bda["intent"] in ontology['intents'], f'ONTOLOGY\tintent undefined intent in binary dialog act: {bda}' + binary_dialogue_acts.add(tuple(bda.values())) + ontology['bda_set'] = binary_dialogue_acts + + assert 'state' in ontology, 'ONTOLOGY\tno state' + redundant_value = False + for domain_name, domain in ontology['state'].items(): + assert domain_name in ontology['domains'] + for slot_name, value in domain.items(): + assert slot_name in 
def check_data(name, ontology):
    """Validate ``<name>/data.zip`` against the unified format and print statistics.

    Every dialogue is checked for: dataset name, split, id format and
    uniqueness, declared domains, turn order (user first and last, strictly
    alternating speakers), dialogue acts against the ontology, and, for user
    turns, that applying ``state_update`` turn by turn reproduces the
    annotated ``state``. System turns must carry neither ``state`` nor
    ``state_update``.

    Args:
        name: dataset name; also the directory (relative to the current
            working directory) that contains ``data.zip``.
        ontology: ontology dict; must contain ``domains``, ``intents``,
            ``state`` and ``bda_set`` (a set of binary dialogue acts stored as
            tuples of their values).

    Returns:
        None. Results are printed. Returns early if ``data.zip`` is missing.

    Raises:
        AssertionError: when a format check fails.
        Exception: when a dialogue id is not of the form ``<name>_<int>``.
    """
    from zipfile import ZipFile

    data_file = os.path.join(f'{name}', 'data.zip')
    if not os.path.exists(data_file):
        print('cannot find data.zip')
        return

    print('loading data')
    with ZipFile(data_file) as archive:
        with archive.open('data.json', 'r') as f:
            data = json.load(f)

    all_id = set()
    splits = ['train', 'val', 'test']
    da_values = 0
    da_matches = 0
    state_values = 0
    state_matches = 0
    stat_keys = ['dialogues', 'utterances', 'tokens', 'domains']
    stat = {split: {key: 0 for key in stat_keys} for split in splits}

    for dialogue in data:
        dialogue_id = dialogue['dialogue_id']
        assert isinstance(dialogue_id, str), '`dialogue_id` is expected to be str type'

        assert dialogue['dataset'] == name, f'{dialogue_id}\tinconsistent dataset name: {dialogue["dataset"]}'

        split = dialogue['data_split']
        assert split in splits, f'unknown split: `{split}`'
        cur_stat = stat[split]
        cur_stat['dialogues'] += 1
        try:
            prefix, num = dialogue_id.split('_')
            assert prefix == name
            int(num)  # try converting to int
        except (ValueError, AssertionError) as err:
            # Only the failures this check can produce are caught; a bare
            # `except:` would also swallow KeyboardInterrupt.
            print(f'{dialogue_id}\twrong dialogue id format: {dialogue_id}')
            raise Exception(f'{dialogue_id}\twrong dialogue id format: {dialogue_id}') from err
        assert dialogue_id not in all_id, f'multiple dialogue id: {dialogue_id}'
        all_id.add(dialogue_id)

        cur_domains = dialogue['domains']
        assert isinstance(cur_domains, list), f'{dialogue_id}\t`domains` is expected to be list type, '
        assert len(set(cur_domains)) == len(cur_domains), f'{dialogue_id}\trepeated domains'
        cur_stat['domains'] += len(cur_domains)
        cur_domains = set(cur_domains)
        for domain_name in cur_domains:
            assert domain_name in ontology['domains'], f'{dialogue_id}\tundefined current domain: {domain_name}'

        turns = dialogue['turns']
        cur_stat['utterances'] += len(turns)
        assert turns, f'{dialogue_id}\tempty turn'

        assert turns[0]['speaker'] == 'user', f'{dialogue_id}\tnot start with user role'
        if ontology['state']:
            # update cur_state with state_update every turn, and compare it with state annotation
            cur_state = {
                domain_name: deepcopy(ontology['state'][domain_name]) for domain_name in cur_domains
            }

        # check domain-slot-value
        # prefix: error prefix
        def check_dsv(domain_name, slot_name, value, categorical, prefix):
            assert domain_name in cur_domains or domain_name == 'booking', f'{prefix}\t{domain_name} not presented in current domains'
            domain = ontology['domains'][domain_name]
            assert slot_name in domain['slots'], f'{prefix}\t{slot_name} not presented in domain {domain_name}'
            slot = domain['slots'][slot_name]
            if categorical:
                assert slot['is_categorical'], f'{prefix}\t{domain_name}-{slot_name} is not categorical'
                value = value.lower()
                assert value in special_values or value in slot['possible_values'], f'{prefix}\t`{value}` not presented in possible values of' \
                                                                                   f' {domain_name}-{slot_name}: {slot["possible_values"]}'
            else:
                assert not slot['is_categorical'], f'{prefix}\t{domain_name}-{slot_name} is not non-categorical'

        def check_da(da, categorical, turn_id):
            assert da['intent'] in ontology['intents'], f'{dialogue_id}:{turn_id}\tundefined intent {da["intent"]}'
            check_dsv(da['domain'], da['slot'], da['value'], categorical, f'{dialogue_id}:{turn_id}:da')

        def apply_update(update, categorical, turn_id):
            domain_name = update['domain']
            slot_name = update['slot']
            value = update['value']
            check_dsv(domain_name, slot_name, value, categorical, f'{dialogue_id}:{turn_id}:state_update')
            cur_state[domain_name][slot_name] = value

        # check dialog act
        for turn_id, turn in enumerate(turns):
            assert turn['speaker'] in ['user', 'system'], f'{dialogue_id}:{turn_id}\tunknown speaker value: {turn["speaker"]}'
            assert turn_id == turn['utt_idx'], f'{dialogue_id}:{turn_id}\twrong utt_idx'
            if turn_id > 0:
                assert turns[turn_id - 1]['speaker'] != turn['speaker'], f'{dialogue_id}:{turn_id}\tuser and system should speak alternatively'

            utterance = turn['utterance']
            cur_stat['tokens'] += len(utterance.strip().split(' '))
            dialogue_acts = turn['dialogue_act']

            for da in dialogue_acts['categorical']:
                check_da(da, True, turn_id)
            for da in dialogue_acts['non-categorical']:
                check_da(da, False, turn_id)
                # values only match after .strip() in some case, it's the issue of pre-processing
                if da['value'] not in special_values:
                    da_values += 1
                    assert ('start' in da) == ('end' in da), \
                        f'{dialogue_id}:{turn_id}\tstart and end field in da should both present or neither not present'
                    if 'start' in da:
                        value = utterance[da['start']:da['end']]
                        if da['value'].lower() == value.lower():
                            da_matches += 1

            for da in dialogue_acts['binary']:
                assert tuple(da.values()) in ontology['bda_set'], f'{dialogue_id}:{turn_id}\tbinary dialog act {da} not present in ontology'
                # do not check domain-slot-value in binary dialogue acts

            if turn['speaker'] == 'user':
                assert 'state' in turn and 'state_update' in turn, f"{dialogue_id}:{turn_id}\tstate and state_update must present in user's role"
                state_update = turn['state_update']

                if ontology['state']:
                    for update in state_update['categorical']:
                        apply_update(update, True, turn_id)
                    for update in state_update['non-categorical']:
                        apply_update(update, False, turn_id)
                        value = update['value']
                        if value not in special_values:
                            state_values += 1
                            if 'utt_idx' in update:
                                span = turns[update['utt_idx']]['utterance'][update['start']:update['end']]
                                # case-insensitive on both sides, like the dialogue-act match above
                                if span.lower() == value.lower():
                                    state_matches += 1
                                else:
                                    print('value in utt:\t', span.strip())
                                    print('value in state:\t', update['value'])

                    assert cur_state == turn['state'], f'{dialogue_id}:{turn_id}:state_update incorrect state or state update calculation'

            else:
                # Neither key may appear on a system turn. The previous
                # condition (`or 'state_update' in turn`) accepted any system
                # turn that carried a state_update.
                assert 'state' not in turn and 'state_update' not in turn, f"{dialogue_id}:{turn_id}\tstate or state_update cannot present in system's role"

        assert turns[-1]['speaker'] == 'user', f'{dialogue_id} dialog must end with user role'

    if da_values:
        print('da values match rate:    {:.3f}'.format(da_matches * 100 / da_values))
    if state_values:
        print('state values match rate: {:.3f}'.format(state_matches * 100 / state_values))

    stat['all'] = {key: sum(stat[split][key] for split in splits) for key in stat_keys}

    for split in splits + ['all']:
        cur_stat = stat[split]
        if cur_stat['dialogues']:
            cur_stat['avg_utt'] = round(cur_stat['utterances'] / cur_stat['dialogues'], 2)
            cur_stat['avg_tokens'] = round(cur_stat['tokens'] / cur_stat['utterances'], 2)
            cur_stat['avg_domains'] = round(cur_stat.pop('domains') / cur_stat['dialogues'], 2)
        else:
            # splits without dialogues are left out of the report
            del stat[split]
    print(f'domains: {len(ontology["domains"])}')
    print(json.dumps(stat, indent=4))
list(filter(os.path.isdir, os.listdir())) + else: + datasets = args.datasets + if not datasets: + print('no dataset specified') + parser.print_help() + exit(1) + + print('datasets to be evaluated:', datasets) + + fail = [] + + for name in datasets: + try: + print('') + if not os.path.isdir(name): + print(f'dataset {name} not found') + continue + + print(f'checking {name}') + preprocess_file = os.path.join(f'{name}', 'preprocess.py') + if not os.path.exists(preprocess_file): + print('no preprocess.py') + if args.preprocess: + print(f'skip evaluation of {name}') + continue + if args.preprocess: + print('pre-processing') + + os.chdir(name) + import importlib + preprocess = importlib.import_module(f'{name}.preprocess') + preprocess.preprocess() + os.chdir('..') + + ontology = check_ontology(name) + check_data(name, ontology) + except Exception as e: + if args.no_int: + fail.append(name) + else: + raise e + + if not fail: + print('all datasets passed test') + else: + print('failed dataset(s):', fail) diff --git a/data/unified_datasets/frames/README.md b/data/unified_datasets/frames/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bd1fc1ba33b1f19b8dc41ca6b361eb2478564cb7 --- /dev/null +++ b/data/unified_datasets/frames/README.md @@ -0,0 +1,23 @@ +# README + +## Features + +- Annotations: dialogue act, character-level span for non-categorical slots. + +Statistics: + +| | \# dialogues | \# utterances | avg. turns | avg. 
tokens | \# domains | +| ----- | ------------ | ------------- | ---------- | ----------- | ---------- | +| train | 1369 | 19445 | 14.2 | 12.71 | 1 | + +## Main changes + +- domain is set to **travel** +- slot-value pair changes: intent-book => book-"True", action-book => booked-"True" +- ignore some rare pairs +- state and state update are not annotated +- span info is from string matching, covering 96.4% of non-categorical values + +## Original data + +https://www.microsoft.com/en-us/research/project/frames-dataset/#!download \ No newline at end of file diff --git a/data/unified_datasets/frames/data.zip b/data/unified_datasets/frames/data.zip new file mode 100644 index 0000000000000000000000000000000000000000..d6b73c808d8682571e2cd060a2937c34abd06e4e Binary files /dev/null and b/data/unified_datasets/frames/data.zip differ diff --git a/data/unified_datasets/frames/ontology.json b/data/unified_datasets/frames/ontology.json new file mode 100644 index 0000000000000000000000000000000000000000..4101d15b77135162e4cad5af5d433a434ad5c61d --- /dev/null +++ b/data/unified_datasets/frames/ontology.json @@ -0,0 +1,1488 @@ +{ + "domains": { + "travel": { + "description": "Book a vacation package containing round-trip flights and a hotel.", + "slots": { + "dst_city": { + "description": "Destination city", + "is_categorical": false, + "possible_values": [] + }, + "or_city": { + "description": "Origin city", + "is_categorical": false, + "possible_values": [] + }, + "str_date": { + "description": "Start date for the trip", + "is_categorical": false, + "possible_values": [] + }, + "n_adults": { + "description": "Number of adults", + "is_categorical": false, + "possible_values": [] + }, + "budget": { + "description": "The amount of money that the user has available to spend for the trip.", + "is_categorical": false, + "possible_values": [] + }, + "flex": { + "description": "Boolean value indicating whether the constraints are flexible", + "is_categorical": true, + "possible_values": [ +
"false", + "true" + ] + }, + "duration": { + "description": "Duration of the trip", + "is_categorical": false, + "possible_values": [] + }, + "ref_anaphora": { + "description": "Words used to refer to a frame", + "is_categorical": false, + "possible_values": [] + }, + "price": { + "description": "Price of the trip including flights and hotel", + "is_categorical": false, + "possible_values": [] + }, + "end_date": { + "description": "End date for the trip", + "is_categorical": false, + "possible_values": [] + }, + "max_duration": { + "description": "Maximum number of days for the trip", + "is_categorical": false, + "possible_values": [] + }, + "name": { + "description": "Name of the hotel", + "is_categorical": false, + "possible_values": [] + }, + "category": { + "description": "Rating of the hotel (in number of stars)", + "is_categorical": false, + "possible_values": [] + }, + "wifi": { + "description": "Boolean value indicating whether or not the hotel offers free wifi", + "is_categorical": true, + "possible_values": [ + "true", + "false" + ] + }, + "dep_time_or": { + "description": "Time of departure from origin city", + "is_categorical": false, + "possible_values": [] + }, + "n_children": { + "description": "Number of children", + "is_categorical": false, + "possible_values": [] + }, + "gst_rating": { + "description": "Rating of the hotel by guests (in number of stars)", + "is_categorical": false, + "possible_values": [] + }, + "parking": { + "description": "Boolean value indicating whether or not the hotel offers free parking", + "is_categorical": true, + "possible_values": [ + "true", + "false" + ] + }, + "arr_time_or": { + "description": "Time of arrival to origin city", + "is_categorical": false, + "possible_values": [] + }, + "breakfast": { + "description": "Boolean value indicating whether or not the hotel offers free breakfast", + "is_categorical": true, + "possible_values": [ + "true", + "false" + ] + }, + "count": { + "description": "Number of different 
packages", + "is_categorical": false, + "possible_values": [] + }, + "seat": { + "description": "Seat type (economy or business)", + "is_categorical": false, + "possible_values": [] + }, + "count_name": { + "description": "Number of different hotels", + "is_categorical": false, + "possible_values": [] + }, + "budget_ok": { + "description": "Boolean value indicating whether the package fits the budget", + "is_categorical": true, + "possible_values": [ + "true" + ] + }, + "arr_time_dst": { + "description": "Time of arrival to destination", + "is_categorical": false, + "possible_values": [] + }, + "dep_time_dst": { + "description": "Time of departure from destination", + "is_categorical": false, + "possible_values": [] + }, + "gym": { + "description": "Boolean value indicating whether or not the hotel offers gym", + "is_categorical": true, + "possible_values": [ + "true" + ] + }, + "spa": { + "description": "Boolean value indicating whether or not the hotel offers spa", + "is_categorical": true, + "possible_values": [ + "true", + "false" + ] + }, + "downtown": { + "description": "Boolean value indicating whether or not the hotel is in the heart of the city", + "is_categorical": true, + "possible_values": [ + "true", + "false" + ] + }, + "count_dst_city": { + "description": "Number of destination cities", + "is_categorical": false, + "possible_values": [] + }, + "min_duration": { + "description": "Minimum number of days for the trip", + "is_categorical": false, + "possible_values": [] + }, + "airport": { + "description": "Boolean value indicating whether or not the hotel is in the vicinity of an airport", + "is_categorical": true, + "possible_values": [ + "true", + "false" + ] + }, + "beach": { + "description": "Boolean value indicating whether or not the hotel is in the vicinity of a beach", + "is_categorical": true, + "possible_values": [ + "true", + "false" + ] + }, + "museum": { + "description": "Boolean value indicating whether or not the hotel is in the vicinity 
of a museum", + "is_categorical": true, + "possible_values": [ + "true", + "false" + ] + }, + "theatre": { + "description": "Boolean value indicating whether or not the hotel is in the vicinity of a theatre", + "is_categorical": true, + "possible_values": [ + "true" + ] + }, + "park": { + "description": "Boolean value indicating whether or not the hotel is in the vicinity of a park", + "is_categorical": true, + "possible_values": [ + "true", + "false" + ] + }, + "market": { + "description": "Boolean value indicating whether or not the hotel is in the vicinity of a market", + "is_categorical": true, + "possible_values": [ + "true" + ] + }, + "shopping": { + "description": "Boolean value indicating whether or not the hotel is in the vicinity of a shopping center", + "is_categorical": true, + "possible_values": [ + "true", + "false" + ] + }, + "university": { + "description": "Boolean value indicating whether or not the hotel is in the vicinity of an university", + "is_categorical": true, + "possible_values": [ + "true" + ] + }, + "mall": { + "description": "Boolean value indicating whether or not the hotel is in the vicinity of a mall", + "is_categorical": true, + "possible_values": [ + "true" + ] + }, + "palace": { + "description": "Boolean value indicating whether or not the hotel is in the vicinity of a palace", + "is_categorical": true, + "possible_values": [ + "true" + ] + }, + "cathedral": { + "description": "Boolean value indicating whether or not the hotel is in the vicinity of a cathedral", + "is_categorical": true, + "possible_values": [ + "true" + ] + } + } + } + }, + "intents": { + "inform": { + "description": "Inform a slot value" + }, + "suggest": { + "description": "Suggest a slot value or package that does not match the user's constraints" + }, + "no_result": { + "description": "Tell the user that the database returned no results" + }, + "negate": { + "description": "Negate something said by the other speaker" + }, + "switch_frame": { + "description": 
"Switch to a frame" + }, + "request": { + "description": "Ask for the value of a particular slot" + }, + "affirm": { + "description": "Affirm something said by the other speaker" + }, + "offer": { + "description": "Offer a package to the user" + }, + "request_alts": { + "description": "Ask for other possibilities" + }, + "request_compare": { + "description": "Ask the wizard to compare packages" + }, + "confirm": { + "description": "Ask the other speaker to confirm a given slot value" + }, + "hearmore": { + "description": "Ask the user if she'd like to hear more about a given package" + }, + "moreinfo": { + "description": "Ask for more information on a given set of results" + } + }, + "binary_dialogue_act": [ + { + "intent": "affirm", + "domain": "travel", + "slot": "book", + "value": "true" + }, + { + "intent": "affirm", + "domain": "travel", + "slot": "booked", + "value": "true" + }, + { + "intent": "affirm", + "domain": "travel", + "slot": "str_date", + "value": "" + }, + { + "intent": "affirm", + "domain": "travel", + "slot": "end_date", + "value": "" + }, + { + "intent": "affirm", + "domain": "travel", + "slot": "n_adults", + "value": "" + }, + { + "intent": "affirm", + "domain": "travel", + "slot": "budget", + "value": "" + }, + { + "intent": "affirm", + "domain": "travel", + "slot": "duration", + "value": "" + }, + { + "intent": "affirm", + "domain": "travel", + "slot": "dst_city", + "value": "" + }, + { + "intent": "affirm", + "domain": "travel", + "slot": "name", + "value": "" + }, + { + "intent": "affirm", + "domain": "travel", + "slot": "seat", + "value": "" + }, + { + "intent": "affirm", + "domain": "travel", + "slot": "count", + "value": "" + }, + { + "intent": "affirm", + "domain": "travel", + "slot": "count_name", + "value": "" + }, + { + "intent": "affirm", + "domain": "travel", + "slot": "category", + "value": "" + }, + { + "intent": "affirm", + "domain": "travel", + "slot": "n_children", + "value": "" + }, + { + "intent": "affirm", + "domain": 
"travel", + "slot": "amenities", + "value": "true" + }, + { + "intent": "affirm", + "domain": "travel", + "slot": "breakfast", + "value": "" + }, + { + "intent": "affirm", + "domain": "travel", + "slot": "gst_rating", + "value": "" + }, + { + "intent": "affirm", + "domain": "travel", + "slot": "max_duration", + "value": "" + }, + { + "intent": "affirm", + "domain": "travel", + "slot": "or_city", + "value": "" + }, + { + "intent": "affirm", + "domain": "travel", + "slot": "price", + "value": "" + }, + { + "intent": "confirm", + "domain": "travel", + "slot": "str_date", + "value": "" + }, + { + "intent": "confirm", + "domain": "travel", + "slot": "end_date", + "value": "" + }, + { + "intent": "confirm", + "domain": "travel", + "slot": "or_city", + "value": "dontcare" + }, + { + "intent": "confirm", + "domain": "travel", + "slot": "book", + "value": "true" + }, + { + "intent": "confirm", + "domain": "travel", + "slot": "n_adults", + "value": "" + }, + { + "intent": "confirm", + "domain": "travel", + "slot": "dst_city", + "value": "" + }, + { + "intent": "confirm", + "domain": "travel", + "slot": "booked", + "value": "true" + }, + { + "intent": "confirm", + "domain": "travel", + "slot": "budget", + "value": "" + }, + { + "intent": "confirm", + "domain": "travel", + "slot": "max_duration", + "value": "" + }, + { + "intent": "confirm", + "domain": "travel", + "slot": "price", + "value": "" + }, + { + "intent": "confirm", + "domain": "travel", + "slot": "dst_city", + "value": "dontcare" + }, + { + "intent": "confirm", + "domain": "travel", + "slot": "budget", + "value": "dontcare" + }, + { + "intent": "hearmore", + "domain": "travel", + "slot": "amenities", + "value": "" + }, + { + "intent": "hearmore", + "domain": "travel", + "slot": "price", + "value": "" + }, + { + "intent": "inform", + "domain": "travel", + "slot": "book", + "value": "true" + }, + { + "intent": "inform", + "domain": "travel", + "slot": "budget", + "value": "" + }, + { + "intent": "inform", + "domain": 
"travel", + "slot": "amenities", + "value": "dontcare" + }, + { + "intent": "inform", + "domain": "travel", + "slot": "booked", + "value": "true" + }, + { + "intent": "inform", + "domain": "travel", + "slot": "str_date", + "value": "dontcare" + }, + { + "intent": "inform", + "domain": "travel", + "slot": "budget", + "value": "dontcare" + }, + { + "intent": "inform", + "domain": "travel", + "slot": "end_date", + "value": "dontcare" + }, + { + "intent": "inform", + "domain": "travel", + "slot": "max_duration", + "value": "dontcare" + }, + { + "intent": "inform", + "domain": "travel", + "slot": "dst_city", + "value": "dontcare" + }, + { + "intent": "inform", + "domain": "travel", + "slot": "dst_city", + "value": "" + }, + { + "intent": "inform", + "domain": "travel", + "slot": "str_date", + "value": "" + }, + { + "intent": "inform", + "domain": "travel", + "slot": "end_date", + "value": "" + }, + { + "intent": "inform", + "domain": "travel", + "slot": "or_city", + "value": "dontcare" + }, + { + "intent": "inform", + "domain": "travel", + "slot": "n_adults", + "value": "" + }, + { + "intent": "inform", + "domain": "travel", + "slot": "n_children", + "value": "" + }, + { + "intent": "inform", + "domain": "travel", + "slot": "dep_time_or", + "value": "" + }, + { + "intent": "inform", + "domain": "travel", + "slot": "arr_time_dst", + "value": "" + }, + { + "intent": "inform", + "domain": "travel", + "slot": "dep_time_dst", + "value": "" + }, + { + "intent": "inform", + "domain": "travel", + "slot": "arr_time_or", + "value": "" + }, + { + "intent": "inform", + "domain": "travel", + "slot": "category", + "value": "dontcare" + }, + { + "intent": "inform", + "domain": "travel", + "slot": "name", + "value": "" + }, + { + "intent": "inform", + "domain": "travel", + "slot": "n_adults", + "value": "dontcare" + }, + { + "intent": "inform", + "domain": "travel", + "slot": "max_duration", + "value": "" + }, + { + "intent": "inform", + "domain": "travel", + "slot": "spa", + "value": 
"dontcare" + }, + { + "intent": "inform", + "domain": "travel", + "slot": "amenities", + "value": "" + }, + { + "intent": "inform", + "domain": "travel", + "slot": "vicinity", + "value": "" + }, + { + "intent": "inform", + "domain": "travel", + "slot": "vicinity", + "value": "dontcare" + }, + { + "intent": "inform", + "domain": "travel", + "slot": "vicinity", + "value": "false" + }, + { + "intent": "inform", + "domain": "travel", + "slot": "amenities", + "value": "false" + }, + { + "intent": "inform", + "domain": "travel", + "slot": "category", + "value": "" + }, + { + "intent": "inform", + "domain": "travel", + "slot": "parking", + "value": "dontcare" + }, + { + "intent": "inform", + "domain": "travel", + "slot": "price", + "value": "" + }, + { + "intent": "inform", + "domain": "travel", + "slot": "duration", + "value": "" + }, + { + "intent": "inform", + "domain": "travel", + "slot": "or_city", + "value": "" + }, + { + "intent": "negate", + "domain": "travel", + "slot": "budget", + "value": "" + }, + { + "intent": "negate", + "domain": "travel", + "slot": "dst_city", + "value": "" + }, + { + "intent": "negate", + "domain": "travel", + "slot": "str_date", + "value": "" + }, + { + "intent": "negate", + "domain": "travel", + "slot": "end_date", + "value": "" + }, + { + "intent": "negate", + "domain": "travel", + "slot": "price", + "value": "" + }, + { + "intent": "negate", + "domain": "travel", + "slot": "or_city", + "value": "" + }, + { + "intent": "negate", + "domain": "travel", + "slot": "airport", + "value": "" + }, + { + "intent": "negate", + "domain": "travel", + "slot": "booked", + "value": "true" + }, + { + "intent": "negate", + "domain": "travel", + "slot": "n_adults", + "value": "" + }, + { + "intent": "negate", + "domain": "travel", + "slot": "book", + "value": "true" + }, + { + "intent": "negate", + "domain": "travel", + "slot": "shopping", + "value": "" + }, + { + "intent": "negate", + "domain": "travel", + "slot": "duration", + "value": "" + }, + { + 
"intent": "negate", + "domain": "travel", + "slot": "arr_time_or", + "value": "" + }, + { + "intent": "negate", + "domain": "travel", + "slot": "or_city", + "value": "dontcare" + }, + { + "intent": "negate", + "domain": "travel", + "slot": "gst_rating", + "value": "" + }, + { + "intent": "negate", + "domain": "travel", + "slot": "category", + "value": "" + }, + { + "intent": "no_result", + "domain": "travel", + "slot": "str_date", + "value": "" + }, + { + "intent": "no_result", + "domain": "travel", + "slot": "dst_city", + "value": "" + }, + { + "intent": "no_result", + "domain": "travel", + "slot": "end_date", + "value": "" + }, + { + "intent": "no_result", + "domain": "travel", + "slot": "or_city", + "value": "" + }, + { + "intent": "no_result", + "domain": "travel", + "slot": "dep_time_or", + "value": "" + }, + { + "intent": "no_result", + "domain": "travel", + "slot": "dep_time_dst", + "value": "" + }, + { + "intent": "no_result", + "domain": "travel", + "slot": "budget", + "value": "" + }, + { + "intent": "no_result", + "domain": "travel", + "slot": "max_duration", + "value": "" + }, + { + "intent": "no_result", + "domain": "travel", + "slot": "museum", + "value": "" + }, + { + "intent": "no_result", + "domain": "travel", + "slot": "n_adults", + "value": "" + }, + { + "intent": "no_result", + "domain": "travel", + "slot": "n_children", + "value": "" + }, + { + "intent": "no_result", + "domain": "travel", + "slot": "downtown", + "value": "" + }, + { + "intent": "offer", + "domain": "travel", + "slot": "n_adults", + "value": "" + }, + { + "intent": "offer", + "domain": "travel", + "slot": "n_children", + "value": "" + }, + { + "intent": "offer", + "domain": "travel", + "slot": "str_date", + "value": "" + }, + { + "intent": "offer", + "domain": "travel", + "slot": "end_date", + "value": "" + }, + { + "intent": "offer", + "domain": "travel", + "slot": "name", + "value": "" + }, + { + "intent": "offer", + "domain": "travel", + "slot": "price", + "value": "" + }, + 
{ + "intent": "offer", + "domain": "travel", + "slot": "budget", + "value": "" + }, + { + "intent": "offer", + "domain": "travel", + "slot": "dst_city", + "value": "" + }, + { + "intent": "offer", + "domain": "travel", + "slot": "duration", + "value": "" + }, + { + "intent": "offer", + "domain": "travel", + "slot": "amenities", + "value": "" + }, + { + "intent": "offer", + "domain": "travel", + "slot": "vicinity", + "value": "false" + }, + { + "intent": "request", + "domain": "travel", + "slot": "duration", + "value": "" + }, + { + "intent": "request", + "domain": "travel", + "slot": "price", + "value": "" + }, + { + "intent": "request", + "domain": "travel", + "slot": "amenities", + "value": "" + }, + { + "intent": "request", + "domain": "travel", + "slot": "vicinity", + "value": "" + }, + { + "intent": "request", + "domain": "travel", + "slot": "str_date", + "value": "" + }, + { + "intent": "request", + "domain": "travel", + "slot": "n_adults", + "value": "" + }, + { + "intent": "request", + "domain": "travel", + "slot": "n_children", + "value": "" + }, + { + "intent": "request", + "domain": "travel", + "slot": "budget", + "value": "" + }, + { + "intent": "request", + "domain": "travel", + "slot": "or_city", + "value": "" + }, + { + "intent": "request", + "domain": "travel", + "slot": "name", + "value": "" + }, + { + "intent": "request", + "domain": "travel", + "slot": "category", + "value": "" + }, + { + "intent": "request", + "domain": "travel", + "slot": "breakfast", + "value": "" + }, + { + "intent": "request", + "domain": "travel", + "slot": "end_date", + "value": "" + }, + { + "intent": "request", + "domain": "travel", + "slot": "max_duration", + "value": "" + }, + { + "intent": "request", + "domain": "travel", + "slot": "seat", + "value": "" + }, + { + "intent": "request", + "domain": "travel", + "slot": "dst_city", + "value": "" + }, + { + "intent": "request", + "domain": "travel", + "slot": "count_dst_city", + "value": "" + }, + { + "intent": "request", 
+ "domain": "travel", + "slot": "count_name", + "value": "" + }, + { + "intent": "request", + "domain": "travel", + "slot": "dep_time_dst", + "value": "" + }, + { + "intent": "request", + "domain": "travel", + "slot": "dep_time_or", + "value": "" + }, + { + "intent": "request", + "domain": "travel", + "slot": "arr_time_dst", + "value": "" + }, + { + "intent": "request", + "domain": "travel", + "slot": "arr_time_or", + "value": "" + }, + { + "intent": "request", + "domain": "travel", + "slot": "wifi", + "value": "" + }, + { + "intent": "request", + "domain": "travel", + "slot": "gym", + "value": "" + }, + { + "intent": "request", + "domain": "travel", + "slot": "flex", + "value": "" + }, + { + "intent": "request", + "domain": "travel", + "slot": "parking", + "value": "" + }, + { + "intent": "request", + "domain": "travel", + "slot": "spa", + "value": "" + }, + { + "intent": "request", + "domain": "travel", + "slot": "gst_rating", + "value": "" + }, + { + "intent": "request", + "domain": "travel", + "slot": "count", + "value": "" + }, + { + "intent": "request", + "domain": "travel", + "slot": "beach", + "value": "" + }, + { + "intent": "request", + "domain": "travel", + "slot": "park", + "value": "" + }, + { + "intent": "request", + "domain": "travel", + "slot": "museum", + "value": "" + }, + { + "intent": "request", + "domain": "travel", + "slot": "downtown", + "value": "" + }, + { + "intent": "request", + "domain": "travel", + "slot": "airport", + "value": "" + }, + { + "intent": "request", + "domain": "travel", + "slot": "mall", + "value": "" + }, + { + "intent": "request", + "domain": "travel", + "slot": "shopping", + "value": "" + }, + { + "intent": "request", + "domain": "travel", + "slot": "budget", + "value": "dontcare" + }, + { + "intent": "request", + "domain": "travel", + "slot": "university", + "value": "" + }, + { + "intent": "request", + "domain": "travel", + "slot": "palace", + "value": "" + }, + { + "intent": "request_alts", + "domain": "travel", + 
"slot": "dst_city", + "value": "" + }, + { + "intent": "request_alts", + "domain": "travel", + "slot": "name", + "value": "" + }, + { + "intent": "request_alts", + "domain": "travel", + "slot": "or_city", + "value": "" + }, + { + "intent": "request_alts", + "domain": "travel", + "slot": "dep_time_dst", + "value": "" + }, + { + "intent": "request_alts", + "domain": "travel", + "slot": "amenities", + "value": "" + }, + { + "intent": "request_alts", + "domain": "travel", + "slot": "str_date", + "value": "" + }, + { + "intent": "request_alts", + "domain": "travel", + "slot": "end_date", + "value": "" + }, + { + "intent": "request_alts", + "domain": "travel", + "slot": "vicinity", + "value": "" + }, + { + "intent": "request_compare", + "domain": "travel", + "slot": "str_date", + "value": "" + }, + { + "intent": "request_compare", + "domain": "travel", + "slot": "category", + "value": "" + }, + { + "intent": "request_compare", + "domain": "travel", + "slot": "amenities", + "value": "" + }, + { + "intent": "request_compare", + "domain": "travel", + "slot": "vicinity", + "value": "" + }, + { + "intent": "request_compare", + "domain": "travel", + "slot": "wifi", + "value": "" + }, + { + "intent": "request_compare", + "domain": "travel", + "slot": "parking", + "value": "" + }, + { + "intent": "request_compare", + "domain": "travel", + "slot": "breakfast", + "value": "" + }, + { + "intent": "request_compare", + "domain": "travel", + "slot": "seat", + "value": "" + }, + { + "intent": "request_compare", + "domain": "travel", + "slot": "price", + "value": "" + }, + { + "intent": "request_compare", + "domain": "travel", + "slot": "gst_rating", + "value": "" + }, + { + "intent": "request_compare", + "domain": "travel", + "slot": "dst_city", + "value": "" + }, + { + "intent": "request_compare", + "domain": "travel", + "slot": "name", + "value": "" + }, + { + "intent": "request_compare", + "domain": "travel", + "slot": "end_date", + "value": "" + }, + { + "intent": 
"request_compare", + "domain": "travel", + "slot": "dep_time_or", + "value": "" + }, + { + "intent": "request_compare", + "domain": "travel", + "slot": "arr_time_dst", + "value": "" + }, + { + "intent": "request_compare", + "domain": "travel", + "slot": "dep_time_dst", + "value": "" + }, + { + "intent": "request_compare", + "domain": "travel", + "slot": "arr_time_or", + "value": "" + }, + { + "intent": "request_compare", + "domain": "travel", + "slot": "budget", + "value": "" + }, + { + "intent": "request_compare", + "domain": "travel", + "slot": "duration", + "value": "" + }, + { + "intent": "request_compare", + "domain": "travel", + "slot": "beach", + "value": "" + }, + { + "intent": "suggest", + "domain": "travel", + "slot": "dst_city", + "value": "" + }, + { + "intent": "suggest", + "domain": "travel", + "slot": "str_date", + "value": "" + }, + { + "intent": "suggest", + "domain": "travel", + "slot": "end_date", + "value": "" + }, + { + "intent": "suggest", + "domain": "travel", + "slot": "or_city", + "value": "" + }, + { + "intent": "suggest", + "domain": "travel", + "slot": "booked", + "value": "true" + }, + { + "intent": "suggest", + "domain": "travel", + "slot": "budget", + "value": "" + }, + { + "intent": "suggest", + "domain": "travel", + "slot": "budget", + "value": "dontcare" + }, + { + "intent": "suggest", + "domain": "travel", + "slot": "name", + "value": "" + }, + { + "intent": "suggest", + "domain": "travel", + "slot": "price", + "value": "" + }, + { + "intent": "suggest", + "domain": "travel", + "slot": "amenities", + "value": "" + }, + { + "intent": "suggest", + "domain": "travel", + "slot": "vicinity", + "value": "" + }, + { + "intent": "suggest", + "domain": "travel", + "slot": "museum", + "value": "" + }, + { + "intent": "suggest", + "domain": "travel", + "slot": "park", + "value": "" + }, + { + "intent": "suggest", + "domain": "travel", + "slot": "beach", + "value": "" + }, + { + "intent": "suggest", + "domain": "travel", + "slot": "n_adults", 
+ "value": "" + }, + { + "intent": "suggest", + "domain": "travel", + "slot": "n_children", + "value": "" + }, + { + "intent": "suggest", + "domain": "travel", + "slot": "market", + "value": "" + }, + { + "intent": "suggest", + "domain": "travel", + "slot": "or_city", + "value": "dontcare" + }, + { + "intent": "switch_frame", + "domain": "travel", + "slot": "museum", + "value": "" + }, + { + "intent": "switch_frame", + "domain": "travel", + "slot": "str_date", + "value": "" + }, + { + "intent": "switch_frame", + "domain": "travel", + "slot": "end_date", + "value": "" + }, + { + "intent": "switch_frame", + "domain": "travel", + "slot": "gst_rating", + "value": "" + } + ], + "state": {} +} \ No newline at end of file diff --git a/data/unified_datasets/frames/original_data.zip b/data/unified_datasets/frames/original_data.zip new file mode 100644 index 0000000000000000000000000000000000000000..07ca92735ee93112b830e6181aa9832a091698d7 Binary files /dev/null and b/data/unified_datasets/frames/original_data.zip differ diff --git a/data/unified_datasets/frames/preprocess.py b/data/unified_datasets/frames/preprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..64a61f0dcae4d14e5dcc4e8e377bc5d7699a7eeb --- /dev/null +++ b/data/unified_datasets/frames/preprocess.py @@ -0,0 +1,341 @@ +import zipfile +import json +import os +from pprint import pprint +from copy import deepcopy +from collections import Counter +from tqdm import tqdm +from convlab2.util.file_util import read_zipped_json, write_zipped_json +import re +self_dir = os.path.dirname(os.path.abspath(__file__)) + + +intent2des = { + "inform": "Inform a slot value", + "offer": "Offer a package to the user", + "request": "Ask for the value of a particular slot", + "switch_frame": "Switch to a frame", + "suggest": "Suggest a slot value or package that does not match the user's constraints", + "no_result": "Tell the user that the database returned no results", + "thankyou": "Thank the other 
speaker", + "sorry": "Apologize to the user", + "greeting": "Greet the other speaker", + "affirm": "Affirm something said by the other speaker", + "negate": "Negate something said by the other speaker", + "confirm": "Ask the other speaker to confirm a given slot value", + "moreinfo": "Ask for more information on a given set of results", + "goodbye": "Say goodbye to the other speaker", + "request_alts": "Ask for other possibilities", + "request_compare": "Ask the wizard to compare packages", + "hearmore": "Ask the user if she'd like to hear more about a given package", + "you_are_welcome": "Tell the user she is welcome", + "canthelp": "Tell the user you cannot answer her request", + "reject": "Tell the user you did not understand what she meant" +} + +slot2des = { + "book": "Find a trip to book", + "dst_city": "Destination city", + "or_city": "Origin city", + "str_date": "Start date for the trip", + "n_adults": "Number of adults", + "budget": "The amount of money that the user has available to spend for the trip.", + "end_date": "End date for the trip", + "flex": "Boolean value indicating whether the constraints are flexible", + "duration": "Duration of the trip", + "ref_anaphora": "Words used to refer to a frame", + "price": "Price of the trip including flights and hotel", + "max_duration": "Maximum number of days for the trip", + "amenities": "Number of amenities", + "vicinity": "Vicinity of the hotel", + "name": "Name of the hotel", + "category": "Rating of the hotel (in number of stars)", + "wifi": "Boolean value indicating whether or not the hotel offers free wifi", + "booked": "Booked a trip", + "dep_time_or": "Time of departure from origin city", + "n_children": "Number of children", + "gst_rating": "Rating of the hotel by guests (in number of stars)", + "parking": "Boolean value indicating whether or not the hotel offers free parking", + "arr_time_or": "Time of arrival to origin city", + "breakfast": "Boolean value indicating whether or not the hotel offers 
free breakfast", + "count": "Number of different packages", + "seat": "Seat type (economy or business)", + "count_name": "Number of different hotels", + "count_dst_city": "Number of destination cities", + "budget_ok": "Boolean value indicating whether the package fits the budget", + "arr_time_dst": "Time of arrival to destination", + "dep_time_dst": "Time of departure from destination", + "gym": "Boolean value indicating whether or not the hotel offers gym", + "spa": "Boolean value indicating whether or not the hotel offers spa", + "downtown": "Boolean value indicating whether or not the hotel is in the heart of the city", + "min_duration": "Minimum number of days for the trip", + "airport": "Boolean value indicating whether or not the hotel is in the vicinity of an airport", + "beach": "Boolean value indicating whether or not the hotel is in the vicinity of a beach", + "museum": "Boolean value indicating whether or not the hotel is in the vicinity of a museum", + "theatre": "Boolean value indicating whether or not the hotel is in the vicinity of a theatre", + "park": "Boolean value indicating whether or not the hotel is in the vicinity of a park", + "market": "Boolean value indicating whether or not the hotel is in the vicinity of a market", + "shopping": "Boolean value indicating whether or not the hotel is in the vicinity of a shopping center", + "university": "Boolean value indicating whether or not the hotel is in the vicinity of an university", + "mall": "Boolean value indicating whether or not the hotel is in the vicinity of a mall", + "palace": "Boolean value indicating whether or not the hotel is in the vicinity of a palace", + "cathedral": "Boolean value indicating whether or not the hotel is in the vicinity of a cathedral", + "no_result": "Boolean value indicating whether there is no result match user's constraints" +} + + +def get_slot_type(slot): + if slot in {'book', 'booked', 'vicinity', 'amenities'}: + return 'binary' + elif slot in {'dst_city', 
'or_city', 'str_date', 'end_date', 'duration', 'min_duration', 'max_duration', + 'dep_time_or', 'arr_time_or', 'arr_time_dst', 'dep_time_dst', 'n_adults', 'n_children', 'budget', + 'price', 'ref_anaphora', 'name', 'category', 'gst_rating', + 'count', 'count_name', 'count_dst_city', 'seat'}: + return 'non-categorical' + elif slot in {'budget_ok', 'flex', 'wifi', 'parking', 'breakfast', 'gym', 'spa', 'downtown', 'airport', 'beach', + 'museum', 'theatre', 'park', 'market', 'shopping', 'university', 'mall', 'palace', 'cathedral'}: + return 'categorical' + else: + return None + + +digit2word = { + '0': 'zero', '1': 'one', '2': 'two', '3': 'three', '4': 'four', '5': 'five', + '6': 'six', '7': 'seven', '8': 'eight', '9': 'nine', '10': 'ten' +} + +match = { + '0': 0, + '1': 0, + '>1': 0, +} + + +def pharse_in_sen(phrase, sen): + ''' + match value in the sentence + :param phrase: str + :param sen: str + :return: start, end if matched, else None, None + ''' + assert isinstance(phrase, str) + pw = '(^|[\s,\.:\?!-])(?P<v>{})([\s,\.:\?!-]|$)' + pn = '(^|[\s\?!-]|\D[,\.:])(?P<v>{})($|[\s\?!-]|[,\.:]\D|[,\.:]$)' + if phrase.isdigit(): + pattern = pn + else: + pattern = pw + p = re.compile(pattern.format(re.escape(phrase)), re.I) + m = re.search(p, sen) + if m: + num = len(re.findall(p, sen)) + if num > 1: + match['>1'] += 1 + else: + match['1'] += 1 + return m.span('v'), num + if phrase.isdigit() and phrase in digit2word: + phrase = digit2word[phrase] + p = re.compile(pw.format(re.escape(phrase)), re.I) + m = re.search(p, sen) + if m: + num = len(re.findall(p, sen)) + if num > 1: + match['>1'] += 1 + else: + match['1'] += 1 + return m.span('v'), num + match['0'] += 1 + return (None, None), 0 + + +def iter_over_acts(acts): + for act in acts: + intent = act['name'] + for arg in act['args']: + k = arg['key'] + if k == 'id': + continue + elif k in ['ref', 'read', 'write']: + assert isinstance(arg['val'], list) + for frame in arg['val']: + for kv in frame['annotations']: + if 
kv['key'] in ('ref', 'read', 'write'): + print(kv, frame) + assert False + yield intent, kv['key'], kv.get('val') + else: + yield intent, k, arg.get('val', None) + + +def normalize_da(intent, slot, value, utterance): + if slot == 'intent': + slot = 'book' + elif slot == 'action': + slot = 'booked' + elif slot not in slot2des: + # ignore some rare slot + return None, None + + if slot in ['book', 'booked']: + slot_type = 'binary' + return slot_type, { + "intent": intent, + "domain": 'travel', + "slot": slot, + "value": 'True', + } + elif value is None or value == '': + slot_type = 'binary' + return slot_type, { + "intent": intent, + "domain": 'travel', + "slot": slot, + "value": '', + } + elif value == '-1': + slot_type = 'binary' + return slot_type, { + "intent": intent, + "domain": 'travel', + "slot": slot, + "value": 'dontcare', + } + elif isinstance(value, str): + slot_type = get_slot_type(slot) + assert slot_type == 'non-categorical' + (start, end), num = pharse_in_sen(value, utterance) + if not num: + if slot == 'gst_rating' and pharse_in_sen(' / '.join(value.split('/')), utterance)[1]: + value = ' / '.join(value.split('/')) + elif 'a. m' in value and pharse_in_sen(value.replace('a. m', 'a.m'), utterance)[1]: + value = value.replace('a. m', 'a.m') + elif 'p. m' in value and pharse_in_sen(value.replace('p. m', 'p.m'), utterance)[1]: + value = value.replace('p. 
m', 'p.m') + elif slot == 'price' and pharse_in_sen(value.replace('USD', ' USD'), utterance)[1]: + value = value.replace('USD', ' USD') + else: + # few wrong annotation + return None, None + (start, end), num = pharse_in_sen(value, utterance) + assert num, print(value, utterance) + if not num: + return None, None + # return None, None + return slot_type, { + "intent": intent, + "domain": 'travel', + "slot": slot, + "value": utterance[start:end], + "start": start, + "end": end + } + elif isinstance(value, bool): + slot_type = get_slot_type(slot) + value = str(value) + assert slot_type == 'categorical' or slot_type == 'binary', print(slot, value) + return slot_type, { + "intent": intent, + "domain": 'travel', + "slot": slot, + "value": value, + } + else: + assert 0 + + +def preprocess(): + processed_dialogue = [] + ontology = {'domains': {'travel': + {"description": "Book a vacation package containing round-trip flights and a hotel.", + "slots": {}}}, + 'intents': {}, + 'binary_dialogue_act': [], + 'state': {}} + original_zipped_path = os.path.join(self_dir, 'original_data.zip') + new_dir = os.path.join(self_dir, 'original_data') + if not os.path.exists(original_zipped_path): + raise FileNotFoundError(original_zipped_path) + if not os.path.exists(os.path.join(self_dir, 'data.zip')) or not os.path.exists(os.path.join(self_dir, 'ontology.json')): + print('unzip to', new_dir) + print('This may take several minutes') + archive = zipfile.ZipFile(original_zipped_path, 'r') + archive.extractall(new_dir) + data = json.load(open(os.path.join(new_dir, 'frames.json'))) + # json.dump(data, open(os.path.join(new_dir, 'original_data.json'), 'w'), indent=2) + cnt = 1 + for d in tqdm(data, desc='dialogue'): + dialogue = { + "dataset": 'frames', + "data_split": 'train', + "dialogue_id": 'frames_' + str(cnt), + "original_id": d['id'], + "user_id": d['user_id'], + "system_id": d['wizard_id'], + "userSurveyRating": d['labels']['userSurveyRating'], + "wizardSurveyTaskSuccessful": 
d['labels']['wizardSurveyTaskSuccessful'], + "domains": ['travel'], + "turns": [] + } + # state = deepcopy(ontology['state']['travel']) + for utt_idx, t in enumerate(d['turns']): + speaker = 'system' if t['author']=='wizard' else t['author'] + turn = { + 'speaker': speaker, + 'utterance': t['text'], + 'utt_idx': utt_idx, + 'dialogue_act': { + 'binary': [], + 'categorical': [], + 'non-categorical': [], + }, + } + for intent, slot, value in iter_over_acts(t['labels']['acts']): + da_type, da = normalize_da(intent, slot, value, t['text']) + if da is not None: + da['value'] = da['value'].lower() + turn['dialogue_act'][da_type].append(da) + slot = da['slot'] + value = da['value'] + if da_type == 'binary': + if da not in ontology['binary_dialogue_act']: + ontology['binary_dialogue_act'].append(da) + else: + ontology['domains']['travel']['slots'].setdefault(slot, { + "description": slot2des[slot], + "is_categorical": da_type=='categorical', + "possible_values": [] + }) + if da_type == 'categorical' \ + and value not in ontology['domains']['travel']['slots'][slot]['possible_values']: + ontology['domains']['travel']['slots'][slot]['possible_values'].append(value) + ontology['intents'].setdefault(intent, { + "description": intent2des[intent] + }) + # state + if speaker == 'user': + turn['state'] = {} + turn['state_update'] = { + 'categorical': [], + 'non-categorical': [], + } + dialogue['turns'].append(deepcopy(turn)) + cnt += 1 + if len(dialogue['turns']) % 2 == 0: + dialogue['turns'] = dialogue['turns'][:-1] + processed_dialogue.append(deepcopy(dialogue)) + ontology['binary_dialogue_act'] = sorted(ontology['binary_dialogue_act'], key=lambda x: x['intent']) + json.dump(ontology, open(os.path.join(self_dir, 'ontology.json'), 'w'), indent=2) + json.dump(processed_dialogue, open('data.json', 'w'), indent=2) + write_zipped_json(os.path.join(self_dir, 'data.zip'), 'data.json') + os.remove('data.json') + else: + # read from file + processed_dialogue = 
read_zipped_json(os.path.join(self_dir, 'data.zip'), 'data.json') + ontology = json.load(open(os.path.join(self_dir, 'ontology.json'))) + return processed_dialogue, ontology + + +if __name__ == '__main__': + preprocess() + print(match) # {'0': 271, '1': 29333, '>1': 806} diff --git a/data/unified_datasets/metalwoz/README.md b/data/unified_datasets/metalwoz/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e49e76f5a938161fa1f39f5f762173f5841aabe6 --- /dev/null +++ b/data/unified_datasets/metalwoz/README.md @@ -0,0 +1,17 @@ +# README + +## Features + +No sentence-level annotation. Only annotate domain. + +Statistics: + +| | \# dialogues | \# utterances | avg. turns | avg. tokens | \# domains | +| ----- | ------------ | ------------- | ---------- | ----------- | ---------- | +| train | 37884 | 362450 | 9.57 | 7.66 | - | +| test | 2319 | 21949 | 9.46 | 8.23 | - | + + +## Original data + +- https://www.microsoft.com/en-us/research/project/metalwoz/ diff --git a/data/unified_datasets/metalwoz/data.zip b/data/unified_datasets/metalwoz/data.zip new file mode 100644 index 0000000000000000000000000000000000000000..ccd14f97ff4f8be6244d1634f33ca3b6ecda6dc8 Binary files /dev/null and b/data/unified_datasets/metalwoz/data.zip differ diff --git a/data/unified_datasets/metalwoz/metalwoz-test-v1.zip b/data/unified_datasets/metalwoz/metalwoz-test-v1.zip new file mode 100644 index 0000000000000000000000000000000000000000..040153c440767b113a805e68d7e9851e2e066c06 Binary files /dev/null and b/data/unified_datasets/metalwoz/metalwoz-test-v1.zip differ diff --git a/data/unified_datasets/metalwoz/metalwoz-v1.zip b/data/unified_datasets/metalwoz/metalwoz-v1.zip new file mode 100644 index 0000000000000000000000000000000000000000..eff7551b322eb7c3428d706fa660fce9c776aced Binary files /dev/null and b/data/unified_datasets/metalwoz/metalwoz-v1.zip differ diff --git a/data/unified_datasets/metalwoz/ontology.json b/data/unified_datasets/metalwoz/ontology.json new 
file mode 100644 index 0000000000000000000000000000000000000000..099a53528f47e5962b053c649d3e4a56cf0819c8 --- /dev/null +++ b/data/unified_datasets/metalwoz/ontology.json @@ -0,0 +1,211 @@ +{ + "domains": { + "AGREEMENT_BOT": { + "description": "", + "slots": {} + }, + "ALARM_SET": { + "description": "", + "slots": {} + }, + "APARTMENT_FINDER": { + "description": "", + "slots": {} + }, + "APPOINTMENT_REMINDER": { + "description": "", + "slots": {} + }, + "AUTO_SORT": { + "description": "", + "slots": {} + }, + "BANK_BOT": { + "description": "", + "slots": {} + }, + "BUS_SCHEDULE_BOT": { + "description": "", + "slots": {} + }, + "CATALOGUE_BOT": { + "description": "", + "slots": {} + }, + "CHECK_STATUS": { + "description": "", + "slots": {} + }, + "CITY_INFO": { + "description": "", + "slots": {} + }, + "CONTACT_MANAGER": { + "description": "", + "slots": {} + }, + "DECIDER_BOT": { + "description": "", + "slots": {} + }, + "EDIT_PLAYLIST": { + "description": "", + "slots": {} + }, + "EVENT_RESERVE": { + "description": "", + "slots": {} + }, + "GAME_RULES": { + "description": "", + "slots": {} + }, + "GEOGRAPHY": { + "description": "", + "slots": {} + }, + "GUINESS_CHECK": { + "description": "", + "slots": {} + }, + "HOME_BOT": { + "description": "", + "slots": {} + }, + "HOW_TO_BASIC": { + "description": "", + "slots": {} + }, + "INSURANCE": { + "description": "", + "slots": {} + }, + "LIBRARY_REQUEST": { + "description": "", + "slots": {} + }, + "LOOK_UP_INFO": { + "description": "", + "slots": {} + }, + "MAKE_RESTAURANT_RESERVATIONS": { + "description": "", + "slots": {} + }, + "MOVIE_LISTINGS": { + "description": "", + "slots": {} + }, + "MUSIC_SUGGESTER": { + "description": "", + "slots": {} + }, + "NAME_SUGGESTER": { + "description": "", + "slots": {} + }, + "ORDER_PIZZA": { + "description": "", + "slots": {} + }, + "PET_ADVICE": { + "description": "", + "slots": {} + }, + "PHONE_PLAN_BOT": { + "description": "", + "slots": {} + }, + "PHONE_SETTINGS": { + 
"description": "", + "slots": {} + }, + "PLAY_TIMES": { + "description": "", + "slots": {} + }, + "POLICY_BOT": { + "description": "", + "slots": {} + }, + "PRESENT_IDEAS": { + "description": "", + "slots": {} + }, + "PROMPT_GENERATOR": { + "description": "", + "slots": {} + }, + "QUOTE_OF_THE_DAY_BOT": { + "description": "", + "slots": {} + }, + "RESTAURANT_PICKER": { + "description": "", + "slots": {} + }, + "SCAM_LOOKUP": { + "description": "", + "slots": {} + }, + "SHOPPING": { + "description": "", + "slots": {} + }, + "SKI_BOT": { + "description": "", + "slots": {} + }, + "SPORTS_INFO": { + "description": "", + "slots": {} + }, + "STORE_DETAILS": { + "description": "", + "slots": {} + }, + "TIME_ZONE": { + "description": "", + "slots": {} + }, + "UPDATE_CALENDAR": { + "description": "", + "slots": {} + }, + "UPDATE_CONTACT": { + "description": "", + "slots": {} + }, + "WEATHER_CHECK": { + "description": "", + "slots": {} + }, + "WEDDING_PLANNER": { + "description": "", + "slots": {} + }, + "WHAT_IS_IT": { + "description": "", + "slots": {} + }, + "BOOKING_FLIGHT": { + "description": "", + "slots": {} + }, + "HOTEL_RESERVE": { + "description": "", + "slots": {} + }, + "TOURISM": { + "description": "", + "slots": {} + }, + "VACATION_IDEAS": { + "description": "", + "slots": {} + } + }, + "intents": {}, + "binary_dialogue_act": [], + "state": {} +} \ No newline at end of file diff --git a/data/unified_datasets/metalwoz/preprocess.py b/data/unified_datasets/metalwoz/preprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..c075e7d4541169a7fd9c611503d6ed9d8ae69817 --- /dev/null +++ b/data/unified_datasets/metalwoz/preprocess.py @@ -0,0 +1,89 @@ +import json +import os +from zipfile import ZipFile, ZIP_DEFLATED + +import json_lines + + +dataset = 'metalwoz' +self_dir = os.path.dirname(os.path.abspath(__file__)) +DATA_PATH = os.path.join(os.path.dirname(os.path.dirname(self_dir)), 'data') +# origin_data_dir = os.path.join(DATA_PATH, dataset) 
+origin_data_dir = self_dir + + +def preprocess(): + ontology = { + 'domains': {}, + 'intents': {}, + 'binary_dialogue_act': [], + 'state': {} + } + + def process_dialog(ori_dialog, split, dialog_id): + domain = ori_dialog['domain'] + ontology['domains'][domain] = { + 'description': "", + 'slots': {} + } + dialog = { + "dataset": dataset, + "data_split": split, + "dialogue_id": f'{dataset}_{dialog_id}', + "original_id": ori_dialog['id'], + "domains": [domain], + } + turns = [] + # starts with system + for utt_idx, utt in enumerate(ori_dialog['turns'][1:]): + turn = { + 'utt_idx': utt_idx, + 'utterance': utt, + 'dialogue_act': { + 'categorical': [], + 'non-categorical': [], + 'binary': [], + }, + } + if utt_idx % 2 == 0: + turn['speaker'] = 'user' + turn['state'] = {} + turn['state_update'] = { + 'categorical': [], + 'non-categorical': [], + } + else: + turn['speaker'] = 'system' + turns.append(turn) + if turns[-1]['speaker'] == 'system': + turns.pop() + + dialog['turns'] = turns + return dialog + + dialog_id = 0 + data = [] + with ZipFile(os.path.join(origin_data_dir, 'metalwoz-v1.zip')) as zipfile: + for path in zipfile.namelist(): + if path.startswith('dialogues'): + for dialog in json_lines.reader(zipfile.open(path)): + data.append(process_dialog(dialog, 'train', dialog_id)) + dialog_id += 1 + + ZipFile(os.path.join(origin_data_dir, 'metalwoz-test-v1.zip')).extract('dstc8_metalwoz_heldout.zip') + with ZipFile(os.path.join('dstc8_metalwoz_heldout.zip')) as zipfile: + for path in zipfile.namelist(): + if path.startswith('dialogues'): + for dialog in json_lines.reader(zipfile.open(path)): + data.append(process_dialog(dialog, 'test', dialog_id)) + dialog_id += 1 + os.remove('dstc8_metalwoz_heldout.zip') + + json.dump(ontology, open(os.path.join(self_dir, 'ontology.json'), 'w')) + json.dump(data, open('data.json', 'w'), indent=4) + ZipFile(os.path.join(self_dir, 'data.zip'), 'w', ZIP_DEFLATED).write('data.json') + os.remove('data.json') + + +if __name__ == 
'__main__': + preprocess() diff --git a/data/unified_datasets/multiwoz21/README.md b/data/unified_datasets/multiwoz21/README.md new file mode 100644 index 0000000000000000000000000000000000000000..803ebf3ad3ee6c4b6aaf710a739f9518bf5d5321 --- /dev/null +++ b/data/unified_datasets/multiwoz21/README.md @@ -0,0 +1,31 @@ +# README + +## Features + +- Annotations: dialogue act, character-level span for non-categorical slots. state and state updates. + +Statistics: + +| | \# dialogues | \# utterances | avg. turns | avg. tokens | \# domains | +| ----- | ------------ | ------------- | ---------- | ----------- | ---------- | +| train | 8434 | 105066 | 12.46 | 17.27 | 7 | +| dev | 999 | 13731 | 13.74 | 17.72 | 7 | +| test | 1000 | 13744 | 13.74 | 17.67 | 7 | + + +## Main changes + +- only keep 5 domains in state annotations and dialog acts. +- `pricerange`, `area`, `day`, `internet`, `parking`, `stars` are considered categorical slots. +- punctuation marks are split from their previous tokens. e.g `I want to find a hotel. -> + I want to find a hotel .` + +Run `evaluate.py`: + +da values match rate: 97.944 +state values match rate: 66.017 + +### original data + +- from [multiwoz](https://github.com/budzianowski/multiwoz) repo. 
+ diff --git a/data/unified_datasets/multiwoz21/data.zip b/data/unified_datasets/multiwoz21/data.zip new file mode 100644 index 0000000000000000000000000000000000000000..7f60019dfc4a68728474baf9e8d3d73305b46082 Binary files /dev/null and b/data/unified_datasets/multiwoz21/data.zip differ diff --git a/data/unified_datasets/multiwoz21/ontology.json b/data/unified_datasets/multiwoz21/ontology.json new file mode 100644 index 0000000000000000000000000000000000000000..b976054f3b6dc47c327c75f545549dfe3563ad01 --- /dev/null +++ b/data/unified_datasets/multiwoz21/ontology.json @@ -0,0 +1,1895 @@ +{ + "domains": { + "taxi": { + "description": "taxi information query system", + "slots": { + "destination": { + "description": "destination of taxi", + "is_categorical": false, + "possible_values": [] + }, + "departure": { + "description": "departure location of taxi", + "is_categorical": false, + "possible_values": [] + }, + "leaveAt": { + "description": "leaving time of taxi", + "is_categorical": false, + "possible_values": [] + }, + "arriveBy": { + "description": "arrival time of taxi", + "is_categorical": false, + "possible_values": [] + }, + "taxi_phone": { + "description": "taxi phone number", + "is_categorical": false, + "possible_values": [] + }, + "taxi_types": { + "description": "taxi type", + "is_categorical": false, + "possible_values": [] + } + } + }, + "restaurant": { + "description": "restaurant information query system", + "slots": { + "food": { + "description": "food type for the restaurant", + "is_categorical": false, + "possible_values": [] + }, + "name": { + "description": "name of the restaurant", + "is_categorical": false, + "possible_values": [] + }, + "pricerange": { + "description": "price budget for the restaurant", + "is_categorical": true, + "possible_values": [ + "not moderate", + "ch", + "high - end", + "luxury options", + "priced cheaply", + "moderate", + "various", + "all price ranges", + "cheap price range", + "low cost", + "assorted", + "cheap 
priced", + "moderate or expensive priced", + "expensive price", + "your", + "most expensive", + "moderate and expensive", + "reasonable", + "moderately expensive", + "moderate|cheap", + "fairly expensive", + "upscale", + "quite inexpensive", + "boderate", + "affordable", + "expinsive", + "they vary", + "more moderate", + "expensive / upscale", + "high class", + "moderately price", + "reasonably priced", + "a little expensive but worth it", + "middle", + "any price point", + "priced moderately", + "cherap", + "moderately prices", + "cheap to expensive", + "or otherwise", + "inexpensive", + "cheaply priced", + "adforable", + "that range", + "great prices", + "rather expensive", + "expensive side", + "moderate range", + "cheaply - priced", + "more expensive", + "varying price", + "similar price range", + "this price range", + "expensive", + "expensive or moderate", + "not cheap", + "the least expensive", + "pretty expensive", + "high end", + "not expensive", + "less expensive", + "moderatley priced", + "most affordable price range", + "budget - friendly", + "expensive and moderately priced", + "moderatre", + "cheap range", + "or the moderately priced", + "fairly cheap", + "not too expensive", + "east", + "fairly inexpensive", + "regardless of price", + "north", + "mostly expensive and moderately priced", + "cheap", + "budget conscious", + "same price range", + "very expensive", + "not too pricey", + "any price range", + "quite expensive", + "that price range", + "your price range", + "every price point", + "expensively priced", + "varying price range", + "not - so - expensive", + "all", + "vietnamese", + "any", + "moderately - priced", + "that price", + "moderate pricing", + "do nt care", + "modest", + "pricey", + "expensive but worth every penny", + "cheaper", + "pretty cheap", + "moderate price", + "ranging from cheap to expensive", + "quite low", + "moderate priced", + "centre", + "the same", + "expensive range", + "relatively cheap", + "cheap or expensive", + 
"epensive", + "moderate and one in the cheap range", + "budget friendly", + "fine", + "on the cheap side", + "expensive price range", + "mostly expensive", + "moderately priced", + "relatively expensive", + "moderately", + "moderatly", + "on the pricey side", + "low priced", + "expensively", + "moderate price range", + "moderatly priced", + "do n't care" + ] + }, + "address": { + "description": "exact location of the restaurant", + "is_categorical": false, + "possible_values": [] + }, + "area": { + "description": "area or place of the restaurant", + "is_categorical": true, + "possible_values": [ + "near the centre", + "east area", + "northern parts of town", + "any part of town", + "west cambridge", + "town of centre", + "centre of cambridge", + "north or centre", + "westside", + "city centre", + "the west end", + "the south of town", + "the city center", + "elsewhere in the city", + "town center", + "the west side", + "south of cambridge", + "south area of town", + "here", + "north side of town", + "on the east", + "throughout the city", + "south area", + "centre city", + "city", + "in the north", + "not", + "east side", + "not in the west", + "here in the city", + "the south area of town", + "that part of town", + "the east", + "west of town", + "centere", + "in the city .", + "south part of time", + "mexican", + "the east side", + "south part of town", + "west part of town .", + "southern", + "any where in the city", + "west area", + "north cambridge", + "the south area of the city", + "almost every area of town", + "ely", + "the north end", + "south cambridge", + "the are", + "east side of town", + "southside", + "east side of the city", + "by the airport", + "around there", + "all of cambridge", + "the town centre", + "nearby", + "town centre", + "the center part of town", + "other parts of town", + "not in the south of town", + "chesterton", + "the centre of town", + "east area of town", + "the east area", + "southend", + "ctre", + "same area", + "anywhere in 
cambridge", + "north side of cambridge", + "same area as the park", + "south side of town", + "the centre area", + "the same area as the botanic gardens", + "south part of the city", + "the area you have chosen", + "in the centre", + "northern part of cambridge", + "east section", + "east|south", + "the east part of town", + "this town", + "the same area", + "west area of town", + "town centre area", + "the southern area", + "northern part", + "southern area", + "the east side of town", + "expensive", + "east section of town", + "same", + "in cambridge", + "north side of chersteron", + "towns centre", + "in town", + "west side", + "in the east", + "centrally", + "west side area", + "all over town", + "centre region", + "anywhere in town", + "closely located", + "west part of town", + "cambridge", + "downtown", + "south end", + "close it city centre", + "in the city centre", + "close to the hotel", + "east", + "north and west", + "west end of town", + "the south part of town", + "the north side", + "any area", + "the area of west", + "center of the town", + "the area", + "the centre part of town", + "north", + "wet part of town", + "other part of the town", + "cheap", + "north part of town", + "centre area of the town", + "cetre", + "in town at all", + "central", + "east part of town", + "near the center of town", + "the city centre", + "north end", + "east of town", + "centreof", + "west", + "the north side of town", + "in the west", + "same side of town as your hotel", + "the west of town", + "centre area of town", + "centrem", + "east of cambridge", + "center of town", + "anywhere", + "throughout the area", + "central area", + "south of town", + "the west area", + "the center of town", + "centre part of town", + "the center", + "north side", + "the south", + "the west side of town", + "west part of the city", + "center", + "any", + "the north", + "east end", + "the west part of town", + "west end", + "center cambridge", + "the centre area of town", + "centre 
area", + "northside", + "centre area of the city", + "center area of town", + "centrally located", + "the south side", + "around the college", + "do nt care", + "thai", + "the west", + "south side", + "cambridge centre", + "town", + "west of cambridge", + "near clare hall", + "south of town .", + "n the centre", + "centre of the city", + "the north area", + "that", + "this area", + "central region of town", + "centre", + "north of town", + "these areas", + "the same", + "south part", + "that area of town", + "in the city", + "that area", + "city center", + "south", + "close to your location", + "north area of town", + "all over cambridge", + "the south side of town", + "within this area", + "near centre", + "the centre", + "centre of town .", + "the north part of town", + "the north side .", + "same area as the hotel", + "and centre", + "west side of town", + "westies", + "around town", + "east part of tow", + "eastside", + "close to that area", + "all over the city", + "south side of the town", + "centre cambridge", + "do n't care", + "north area", + "centre of town" + ] + }, + "postcode": { + "description": "postcode of the restaurant", + "is_categorical": false, + "possible_values": [] + }, + "phone": { + "description": "restaurant phone number", + "is_categorical": false, + "possible_values": [] + }, + "people": { + "description": "number of people booking the restaurant", + "is_categorical": false, + "possible_values": [] + }, + "time": { + "description": "time of the restaurant booking", + "is_categorical": false, + "possible_values": [] + }, + "day": { + "description": "day of the restaurant booking", + "is_categorical": true, + "possible_values": [ + "saturday", + "wednesday", + "tuesday", + "w", + "saturday|thursday", + "monday", + "friday", + "thursday", + "sunday", + "sunday|thursday" + ] + }, + "choice": { + "description": "number of restaurants meeting requests of user", + "is_categorical": false, + "possible_values": [] + } + } + }, + "attraction": { 
+ "description": "an entertainment that is offered to the public", + "slots": { + "address": { + "description": "details of where the attraction is", + "is_categorical": false, + "possible_values": [] + }, + "postcode": { + "description": "postcode of the attraction", + "is_categorical": false, + "possible_values": [] + }, + "phone": { + "description": "phone number of the attraction", + "is_categorical": false, + "possible_values": [] + }, + "entrance fee": { + "description": "the fee charged for admission to the attraction", + "is_categorical": false, + "possible_values": [] + }, + "type": { + "description": "type of the attraction", + "is_categorical": false, + "possible_values": [] + }, + "area": { + "description": "area or place of the attraction", + "is_categorical": true, + "possible_values": [ + "east area", + "near the centre", + "west cambridge", + "centre of cambridge", + "other areas", + "westside", + "the west - side", + "city centre", + "central district", + "near the restaurant", + "whole of cambridge", + "the west end", + "entre", + "various", + "northern area", + "town center", + "west of your city", + "the west side", + "south of cambridge", + "west part of cambridge", + "norwich", + "throughout the city", + "centre city", + "south area", + "city", + "very close in the same area", + "centre by galleria", + "east side", + "northern", + "that part of town", + "the east", + "same general area", + "went side of town", + "southern cambridge", + "the east side", + "south part of town", + "in that area", + "to the south", + "clifton way", + "there", + "west area", + "in the south", + "center are", + "south cambridge", + "ely", + "east side of town", + "central cambridge", + "south park of town", + "the south area", + "centre area of cambridge", + "west area of the city", + "several different parts of town", + "cambridge 's centre", + "nearby", + "town centre", + "the center part of town", + "near the hotel", + "museum", + "the area you 're looking for", 
+ "other parts of town", + "most of them are in the center", + "the centre of town", + "northern area of town", + "east area of town", + "north of the city", + "different area", + "same area", + "east section", + "south side of town", + "the centre area", + "in the centre", + "in the center", + "western part of town", + "centre area .", + "south part of cambridge", + "the same road", + "cetnre of town", + "the same area", + "town centre area", + "the east part of town", + "center area", + "west area of town", + "cenre", + "center of town .", + "cambridge leisure park", + "church area", + "that area .", + "near ely", + "in that side", + "east cambridge", + "same", + "in cambridge", + "towns centre", + "that side of town", + "another area", + "in town", + "west side", + "west end of the city", + "the centry area", + "in the east", + "we", + "all over town", + "all around the city", + "city centre .", + "center of cambridge", + "centre region", + "west part of town", + "cambridge", + "cent", + "western part of the town", + "downtown", + "south end", + "east", + "this side of town", + "same area as hotel", + "center of the town", + "west end of town", + "the area", + "eat", + "northern cambridge", + "north", + "same part of town as your restaurant", + "the centre part of town", + "north part of town", + "central", + "close to the center of town", + "east part of town", + "same part", + "western cambridge", + "the city centre", + "east of town", + "west", + "the eastside", + "north in milton", + "the west of town", + "centre area of town", + "ce", + "east of cambridge", + "the east of the town", + "close to the restaurant", + "the west area of town", + "center of town", + "the south near your hotel", + "south of town", + "the west area", + "centre part of town", + "the center", + "north side", + "north section of cambridge", + "the north of the city", + "all of the other areas", + "the south", + "all", + "the west side of town", + "west part of the city", + "center", + 
"the north", + "your desired location", + "wet end", + "center part of town", + "cb30aq", + "west end", + "in the area", + "centre area", + "centrally located", + "the south side", + "city 's centre", + "do nt care", + "the west", + "east end of town", + "every area except the north", + "south side", + "centre of the city", + "that", + "this area", + "centre", + "north of town", + "right in the center of town", + "city cenre", + "centre of town ?", + "that area", + "city center", + "south", + "same area as tandoori palace", + "centre|west", + "central zone", + "the centre", + "the center area", + "west side of town", + "western area of town", + "the center of the park", + "west side of the city", + "eastside", + "northend", + "the area you are looking for", + "all over the city", + "western", + "on the centre", + "museums", + "north area", + "centre of town", + "center of the city" + ] + }, + "name": { + "description": "name of the attraction", + "is_categorical": false, + "possible_values": [] + }, + "pricerange": { + "description": "the price range for the attraction, from cheap to expensive", + "is_categorical": true, + "possible_values": [ + "free", + "no entrance fee", + "entrance fee", + "unaware of what their entrance fee is", + "not listed", + "cheap", + "we do n't have any information", + "2 pounds", + "do n't see a price listed", + "from free to 2 pounds 50", + "4 pounds", + "moderately priced", + "neither prices are listed", + "expensive", + "5 pounds", + "they do n't have the entrance fee posted", + "free admission", + "not sure of the fee", + "5 pound entrance fee", + "do n't have information", + "3.50 pounds" + ] + }, + "choice": { + "description": "number of attractions matching requests of user", + "is_categorical": false, + "possible_values": [] + } + } + }, + "booking": { + "description": "to arrange with a taxi, restaurant, train, etc.", + "slots": { + "time": { + "description": "time for an order", + "is_categorical": false, + "possible_values": 
[] + }, + "day": { + "description": "day for an order, from monday to sunday", + "is_categorical": true, + "possible_values": [ + "saturday", + "wednesday", + "saturday night thru monday", + "vmhj6y3i", + "6", + "weds", + "tuesday through friday", + "tuesday instead of thursday", + "2", + "that", + "tues", + "sunday 18/06/2017", + "tuesday 's", + "today", + "tonight", + "that time frame", + "tuesday", + "at that time", + "1", + "this evening", + "sunday and monday", + "the same day", + "sundar", + "monday", + "friday", + "thursday", + "sunday", + "we d" + ] + }, + "stay": { + "description": "for how long the user wish to be at a place", + "is_categorical": false, + "possible_values": [] + }, + "people": { + "description": "how many person the order is for", + "is_categorical": false, + "possible_values": [] + }, + "name": { + "description": "name of the ordered place", + "is_categorical": false, + "possible_values": [] + }, + "Ref": { + "description": "reference number of the order", + "is_categorical": false, + "possible_values": [] + } + } + }, + "train": { + "description": "query and order a train", + "slots": { + "destination": { + "description": "destination of the train", + "is_categorical": false, + "possible_values": [] + }, + "arriveBy": { + "description": "arrival time of the train", + "is_categorical": false, + "possible_values": [] + }, + "departure": { + "description": "departure location of the train", + "is_categorical": false, + "possible_values": [] + }, + "duration": { + "description": "the length of time the train trip lasts", + "is_categorical": false, + "possible_values": [] + }, + "people": { + "description": "number of people booking for train", + "is_categorical": false, + "possible_values": [] + }, + "day": { + "description": "day of the train", + "is_categorical": true, + "possible_values": [ + "cambridge", + "fr", + "saturday", + "wednesday", + "13:07", + "everday", + "weds", + "sat", + "sundays", + "train", + "that", + "between friday 
and wednesday", + "all week", + "this day", + "every day", + "tuesday", + "saturdays", + "tr2519", + "every", + "other days are available", + "mondays", + "we", + "monday", + "friday", + "thursday", + "frday", + "sunday", + "daily", + "that day", + "fiday" + ] + }, + "Ref": { + "description": "reference number of the order", + "is_categorical": false, + "possible_values": [] + }, + "leaveAt": { + "description": "leaving time for the train", + "is_categorical": false, + "possible_values": [] + }, + "price": { + "description": "price for the train ticket", + "is_categorical": false, + "possible_values": [] + }, + "choice": { + "description": "number of trains that meets requests of the user", + "is_categorical": false, + "possible_values": [] + } + } + }, + "hotel": { + "description": "to query hotel information and place an order", + "slots": { + "internet": { + "description": "internet option at the hotel", + "is_categorical": true, + "possible_values": [ + "free internet", + "free", + "y", + "do nt care", + "yes", + "none", + "no", + "4" + ] + }, + "area": { + "description": "area or place of the hotel", + "is_categorical": true, + "possible_values": [ + "near the centre", + "any part of town", + "east area", + "different parts of the city", + "north end of the city", + "centre of cambridge", + "near the restaurant", + "city centre", + "throughout cambridge", + "different part of town", + "eastern part of the city", + "the west end", + "various", + "northern area", + "the northern part of town", + "northern part of town", + "the west side", + "on the west", + "south area of town", + "centrally - located", + "north side of town", + "central location", + "centra", + "throughout the city", + "not in the south", + "south area", + "centre city", + "in the north", + "east side", + "northern side of town", + "different parts of town", + "you are interested in", + "northern", + "that part of town", + "west of town", + "the east", + "all areas", + "all parts of the city", 
+ "the east side", + "south part of town", + "south end of the city", + "there", + "west area", + "north cambridge", + "the north end", + "south cambridge", + "east side of town", + "southside", + "the south area", + "or west ?", + "another area of town", + "all of cambridge", + "the town centre", + "east near other shops and boutiques", + "town centre", + "on the west side", + "all over", + "the centre of the city", + "other parts of town", + "the centre of town", + "east area of town", + "the east area", + "the westside", + "southend", + "west|centre", + "northe part of town", + "anywhere in cambridge", + "different area", + "same area", + "south side of town", + "the centre area", + "on the south", + "in the centre", + "in the center", + "wast", + "all across town", + "south end of town", + "except in the north", + "and east", + "place to be a guesthouse", + "the same area", + "other parts of the city", + "the east side of town", + "somewhere else", + "and in the centre", + "east section of town", + "east cambridge", + "across cambridge", + "north of cambridge", + "next door", + "except in the east", + "in cambridge", + "north part of the city", + "that side of town", + "south cambridge area", + "in town", + "west side", + "west end of the city", + "we", + "centrally", + "west side near the restaurant", + "all over town", + "southern part of town", + "all around the city", + "west part of town", + "cambridge", + "downtown", + "south end", + "east", + "east and the north .", + "norht", + "the north side", + "any area", + "center of the town", + "the area", + "eat", + "northern cambridge", + "north", + "cheap", + "north part of town", + "centre part", + "east part of town", + "western cambridge", + "north end of town", + "north end", + "east of town", + "several areas of town", + "west", + "eastern", + "in the west", + "everywhere but the city centre", + "centre area of town", + "center of town", + "west part", + "the town center", + "south of town", + "the center 
of town", + "the west area", + "centre part of town", + "nborth", + "north side", + "the north of the city", + "the south", + "west part of the city", + "center", + "the north", + "west end", + "same side of town", + "in the area", + "any part of the city", + "centre area", + "not to far from the restaurant", + "northside", + "on the eastside", + "close to where you 'll be dining", + "the east end of town", + "various parts of the city", + "west areas of town", + "centrally located", + "the south side", + "north part of town .", + "do nt care", + "the west", + "same area as the restaurant", + "that vicinity", + "nearby the restaurant", + "south side", + "east part of time", + "centre of the city", + "south closer to the museum", + "north park of town", + "centre", + "north of town", + "north and centre", + "the same", + "on the north", + "that area of town", + "that area", + "in the city", + "el shaddai", + "city center", + "south", + "the city 's south side", + "north area of town", + "near the museum", + "eastern cambridge", + "near centre", + "the easy", + "a different area", + "the centre", + "the north part of town", + "west side of town", + "across town", + "eastside", + "northend", + "north par of town", + "the centre of cambridge", + "all over the city", + "north location", + "centre cambridge", + "that region", + "north area", + "centre of town" + ] + }, + "stars": { + "description": "star rating of the hotel", + "is_categorical": true, + "possible_values": [ + "4|5", + "several", + "0 to 4", + "no star rating", + "do nt care", + "one", + "does not show", + "foru", + "does not have", + "different star ratings", + "unrated", + "2", + "four starts", + "4-star", + "four - star", + "3|4", + "yes", + "three", + "four", + "lower", + "four stars", + "zero", + "ranging from 2 - 4 stars", + "two", + "1", + "3", + "four star", + "0-star", + "five", + "drop the star rating", + "2-star", + "no", + "3-star", + "5-star", + "0", + "1-star", + "not rated", + "5", + "not 
as fancy", + "4" + ] + }, + "parking": { + "description": "parking facility at the hotel", + "is_categorical": true, + "possible_values": [ + "n", + "free", + "do nt care", + "yes", + "none", + "no", + "free parking" + ] + }, + "phone": { + "description": "hotel phone number", + "is_categorical": false, + "possible_values": [] + }, + "name": { + "description": "name of the hotel", + "is_categorical": false, + "possible_values": [] + }, + "pricerange": { + "description": "price budget of the hotel", + "is_categorical": true, + "possible_values": [ + "cheapest", + "economically priced", + "free", + "all different price ranges", + "more moderately priced", + "expensively - priced", + "ca n't view that information", + "moderate", + "super cheap", + "low cost", + "cheap price range", + "lower", + "affordable pricing", + "moderate to cheap", + "cheap priced", + "inexpensively - priced", + "mostly in the cheap to moderate price range", + "moderately pricing", + "guesthouses", + "moderate|cheap", + "moderately priceed", + "very cheap", + "even cheaper", + "upscale", + "other ranges", + "moderate or cheap", + "cheaply", + "affordable", + "different price range", + "does not say", + "moderately price", + "expensive or cheap", + "priced moderately", + "moderatly price", + "moderate price point", + "moderately prices", + "cheap to expensive", + "on the more expensive side", + "inexpensive", + "the cheapest", + "cheaply priced", + "that range", + "rather expensive", + "your price", + "moderate range", + "cheaply - priced", + "more expensive", + "expensive", + "much cheaper", + "cheap side", + "slightly more expensive", + "$100", + "cheaper than the others", + "pretty expensive", + "not expensive", + "cheaply prices", + "moderatley priced", + "modrate", + "cheap range", + "espensive", + "chear", + "fairly cheap", + "oderately priced", + "moderate in price", + "different", + "very affordable", + "your chosen", + "unfortunately do not have the price", + "north", + "less costly", + 
"different price ranges", + "cheap", + "very inexpensive", + "2", + "moderately priced .", + "moderately to expensively priced", + "economical", + "same price range", + "moderate to cheap range", + "cheap to moderate", + "quite expensive", + "that price range", + "cheap|moderate", + "your price range", + "varying price ranges", + "expensive to moderate", + "expensively priced", + "epxensive", + "or expensive ?", + "all", + "higher price range", + "any", + "moderately - priced", + "hotel", + "moderate pricing", + "do nt care", + "lower end", + "quite cheap", + "more budget - friendly", + "cheaper", + "fairly cheap compared to other hotels", + "pretty cheap", + "moderate price", + "that", + "moderate priced", + "centre", + "the same", + "expensive range", + "relatively cheap", + "moderate prices", + "hotels", + "cheap>moderate", + "a little pricey", + "on the cheap side", + "moderately - priced or cheap", + "expensive price range", + "moderately priced", + "same price", + "moderately", + "budget - priced", + "moderately pried", + "moderate price range", + "great", + "moderatly priced", + "extremely reasonable", + "cheap or moderate", + "cheaper side" + ] + }, + "people": { + "description": "number of people for the hotel booking", + "is_categorical": false, + "possible_values": [] + }, + "postcode": { + "description": "postcode of the hotel", + "is_categorical": false, + "possible_values": [] + }, + "address": { + "description": "exact location of the hotel", + "is_categorical": false, + "possible_values": [] + }, + "type": { + "description": "what is the type of the hotel", + "is_categorical": false, + "possible_values": [] + }, + "stay": { + "description": "length of stay at the hotel", + "is_categorical": false, + "possible_values": [] + }, + "day": { + "description": "day of the hotel booking", + "is_categorical": true, + "possible_values": [ + "next friday", + "saturday", + "wednesday", + "tuesday", + "saturday|tuesday", + "t", + "sunday>monday", + 
"friday>tuesday", + "monday", + "friday", + "thursday", + "monday<thursday", + "sunday", + "wednesday|friday", + "monda" + ] + }, + "choice": { + "description": "number of hotels that meets requests of the user", + "is_categorical": false, + "possible_values": [] + } + } + }, + "police": { + "description": "find police stations", + "slots": { + "address": { + "description": "exact location of the police station", + "is_categorical": false, + "possible_values": [] + }, + "postcode": { + "description": "postcode of the police station", + "is_categorical": false, + "possible_values": [] + }, + "phone": { + "description": "police station phone number", + "is_categorical": false, + "possible_values": [] + } + } + } + }, + "intents": { + "inform": { + "description": "inform user of value for a certain slot" + }, + "request": { + "description": "ask for value of a slot" + }, + "nobook": { + "description": "inform user of booking failure" + }, + "reqmore": { + "description": "ask user for more instructions" + }, + "book": { + "description": "place an order for user" + }, + "bye": { + "description": "end a conversation and say goodbye to user" + }, + "thank": { + "description": "express gratitude" + }, + "welcome": { + "description": "welcome" + }, + "recommend": { + "description": "recommend a choice for user request" + }, + "offerbook": { + "description": "offer to place an order for user" + }, + "offerbooked": { + "description": "inform user that an order is succussful" + }, + "greet": { + "description": "express greeting" + }, + "nooffer": { + "description": "inform user that no options matches user request" + }, + "select": { + "description": "provide several choices for user to choose from" + } + }, + "binary_dialogue_act": [ + { + "intent": "request", + "domain": "hotel", + "slot": "area", + "value": "" + }, + { + "intent": "inform", + "domain": "booking", + "slot": "", + "value": "" + }, + { + "intent": "request", + "domain": "booking", + "slot": "stay", + "value": 
"" + }, + { + "intent": "request", + "domain": "booking", + "slot": "day", + "value": "" + }, + { + "intent": "reqmore", + "domain": "", + "slot": "", + "value": "" + }, + { + "intent": "bye", + "domain": "", + "slot": "", + "value": "" + }, + { + "intent": "thank", + "domain": "", + "slot": "", + "value": "" + }, + { + "intent": "welcome", + "domain": "", + "slot": "", + "value": "" + }, + { + "intent": "request", + "domain": "hotel", + "slot": "pricerange", + "value": "" + }, + { + "intent": "request", + "domain": "train", + "slot": "leaveAt", + "value": "" + }, + { + "intent": "offerbook", + "domain": "train", + "slot": "", + "value": "" + }, + { + "intent": "request", + "domain": "train", + "slot": "duration", + "value": "" + }, + { + "intent": "request", + "domain": "train", + "slot": "price", + "value": "" + }, + { + "intent": "request", + "domain": "train", + "slot": "departure", + "value": "" + }, + { + "intent": "request", + "domain": "train", + "slot": "day", + "value": "" + }, + { + "intent": "request", + "domain": "taxi", + "slot": "leaveAt", + "value": "" + }, + { + "intent": "request", + "domain": "taxi", + "slot": "arriveBy", + "value": "" + }, + { + "intent": "greet", + "domain": "", + "slot": "", + "value": "" + }, + { + "intent": "request", + "domain": "hotel", + "slot": "address", + "value": "" + }, + { + "intent": "request", + "domain": "train", + "slot": "destination", + "value": "" + }, + { + "intent": "request", + "domain": "train", + "slot": "arriveBy", + "value": "" + }, + { + "intent": "request", + "domain": "hotel", + "slot": "internet", + "value": "" + }, + { + "intent": "request", + "domain": "hotel", + "slot": "parking", + "value": "" + }, + { + "intent": "inform", + "domain": "attraction", + "slot": "", + "value": "" + }, + { + "intent": "request", + "domain": "attraction", + "slot": "area", + "value": "" + }, + { + "intent": "request", + "domain": "attraction", + "slot": "type", + "value": "" + }, + { + "intent": "request", + 
"domain": "attraction", + "slot": "address", + "value": "" + }, + { + "intent": "request", + "domain": "restaurant", + "slot": "food", + "value": "" + }, + { + "intent": "request", + "domain": "restaurant", + "slot": "area", + "value": "" + }, + { + "intent": "request", + "domain": "restaurant", + "slot": "phone", + "value": "" + }, + { + "intent": "request", + "domain": "restaurant", + "slot": "postcode", + "value": "" + }, + { + "intent": "request", + "domain": "restaurant", + "slot": "address", + "value": "" + }, + { + "intent": "request", + "domain": "attraction", + "slot": "postcode", + "value": "" + }, + { + "intent": "request", + "domain": "attraction", + "slot": "phone", + "value": "" + }, + { + "intent": "nobook", + "domain": "booking", + "slot": "", + "value": "" + }, + { + "intent": "inform", + "domain": "train", + "slot": "", + "value": "" + }, + { + "intent": "request", + "domain": "train", + "slot": "people", + "value": "" + }, + { + "intent": "request", + "domain": "booking", + "slot": "people", + "value": "" + }, + { + "intent": "request", + "domain": "hotel", + "slot": "stars", + "value": "" + }, + { + "intent": "request", + "domain": "hotel", + "slot": "type", + "value": "" + }, + { + "intent": "select", + "domain": "hotel", + "slot": "", + "value": "" + }, + { + "intent": "request", + "domain": "hotel", + "slot": "postcode", + "value": "" + }, + { + "intent": "request", + "domain": "attraction", + "slot": "entrance fee", + "value": "" + }, + { + "intent": "request", + "domain": "restaurant", + "slot": "pricerange", + "value": "" + }, + { + "intent": "request", + "domain": "train", + "slot": "Ref", + "value": "" + }, + { + "intent": "request", + "domain": "booking", + "slot": "time", + "value": "" + }, + { + "intent": "book", + "domain": "booking", + "slot": "", + "value": "" + }, + { + "intent": "request", + "domain": "hotel", + "slot": "phone", + "value": "" + }, + { + "intent": "request", + "domain": "attraction", + "slot": "pricerange", + 
"value": "" + }, + { + "intent": "inform", + "domain": "hotel", + "slot": "", + "value": "" + }, + { + "intent": "select", + "domain": "train", + "slot": "", + "value": "" + }, + { + "intent": "select", + "domain": "restaurant", + "slot": "", + "value": "" + }, + { + "intent": "inform", + "domain": "restaurant", + "slot": "", + "value": "" + }, + { + "intent": "inform", + "domain": "taxi", + "slot": "", + "value": "" + }, + { + "intent": "request", + "domain": "taxi", + "slot": "departure", + "value": "" + }, + { + "intent": "request", + "domain": "restaurant", + "slot": "name", + "value": "" + }, + { + "intent": "nooffer", + "domain": "attraction", + "slot": "", + "value": "" + }, + { + "intent": "request", + "domain": "taxi", + "slot": "destination", + "value": "" + }, + { + "intent": "nooffer", + "domain": "hotel", + "slot": "", + "value": "" + }, + { + "intent": "select", + "domain": "attraction", + "slot": "", + "value": "" + }, + { + "intent": "request", + "domain": "taxi", + "slot": "taxi_phone", + "value": "" + }, + { + "intent": "request", + "domain": "taxi", + "slot": "taxi_types", + "value": "" + }, + { + "intent": "request", + "domain": "attraction", + "slot": "name", + "value": "" + }, + { + "intent": "nooffer", + "domain": "restaurant", + "slot": "", + "value": "" + }, + { + "intent": "nooffer", + "domain": "train", + "slot": "", + "value": "" + }, + { + "intent": "request", + "domain": "hotel", + "slot": "name", + "value": "" + }, + { + "intent": "recommend", + "domain": "hotel", + "slot": "", + "value": "" + }, + { + "intent": "recommend", + "domain": "attraction", + "slot": "", + "value": "" + }, + { + "intent": "offerbooked", + "domain": "train", + "slot": "", + "value": "" + }, + { + "intent": "recommend", + "domain": "restaurant", + "slot": "", + "value": "" + } + ], + "state": { + "hotel": { + "name": "", + "area": "", + "parking": "", + "pricerange": "", + "stars": "", + "internet": "", + "type": "", + "stay": "", + "day": "", + "people": "" + 
}, + "train": { + "leaveAt": "", + "destination": "", + "day": "", + "arriveBy": "", + "departure": "", + "people": "" + }, + "taxi": { + "leaveAt": "", + "destination": "", + "departure": "", + "arriveBy": "" + }, + "restaurant": { + "food": "", + "pricerange": "", + "name": "", + "area": "", + "time": "", + "day": "", + "people": "" + }, + "attraction": { + "type": "", + "name": "", + "area": "" + } + } +} \ No newline at end of file diff --git a/data/unified_datasets/multiwoz21/original_data.zip b/data/unified_datasets/multiwoz21/original_data.zip new file mode 100644 index 0000000000000000000000000000000000000000..b27361772be980013a94c91898fb61e3e122ba8e Binary files /dev/null and b/data/unified_datasets/multiwoz21/original_data.zip differ diff --git a/data/unified_datasets/multiwoz21/preprocess.py b/data/unified_datasets/multiwoz21/preprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..5ca81e2e399d8a816e3be6d4ae33babfc2f1597e --- /dev/null +++ b/data/unified_datasets/multiwoz21/preprocess.py @@ -0,0 +1,1431 @@ +import copy +import re +import zipfile +import json +import os +from tqdm import tqdm +import sys +import difflib +from fuzzywuzzy import fuzz +sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) + +from convlab2.util.file_util import read_zipped_json, write_zipped_json +import logging + + +logging.basicConfig(level=logging.INFO) +self_dir = (os.path.abspath(os.getcwd())) + +REF_SYS_DA = { + 'Attraction': { + 'Addr': "address", 'Area': "area", 'Choice': "choice", + 'Fee': "entrance fee", 'Name': "name", 'Phone': "phone", + 'Post': "postcode", 'Price': "pricerange", 'Type': "type", + 'none': None, 'Open': None + }, + 'Hospital': { + 'Department': 'department', 'Addr': 'address', 'Post': 'postcode', + 'Phone': 'phone', 'none': None + }, + 'Booking': { + 'Day': 'day', 'Name': 'name', 'People': 'people', + 'Ref': 'Ref', 'Stay': 'stay', 'Time': 'time', + 'none': None + }, + 'Hotel': { + 
'Addr': "address", 'Area': "area", 'Choice': "choice", + 'Internet': "internet", 'Name': "name", 'Parking': "parking", + 'Phone': "phone", 'Post': "postcode", 'Price': "pricerange", + 'Stars': "stars", 'Type': "type", 'Stay': 'stay', 'Day': 'day', 'People': 'people', + 'none': None + }, + 'Restaurant': { + 'Addr': "address", 'Area': "area", 'Choice': "choice", + 'Name': "name", 'Food': "food", 'Phone': "phone", + 'Post': "postcode", 'Price': "pricerange", + 'Time': 'time', 'Day': 'day', 'People': 'people', + 'none': None + }, + 'Taxi': { + 'Arrive': "arriveBy", 'Car': "taxi_types", 'Depart': "departure", + 'Dest': "destination", 'Leave': "leaveAt", 'Phone': "taxi_phone", + 'none': None + }, + 'Train': { + 'Arrive': "arriveBy", 'Choice': "choice", 'Day': "day", + 'Depart': "departure", 'Dest': "destination", + 'Leave': "leaveAt", 'People': "people", 'Ref': "Ref", + 'Time': "duration", 'none': None, 'Ticket': 'price', + }, + 'Police': { + 'Addr': "address", 'Post': "postcode", 'Phone': "phone", 'none': None + }, +} + +# taxi restaurant attraction train +slot_to_type = { + 'taxi-destination': 'non', + 'taxi-departure': 'non', + 'taxi-leaveAt': 'non', + 'taxi-arriveBy': 'non', + 'restaurant-food': 'non', + 'restaurant-name': 'non', + 'attraction-address': 'non', + 'attraction-postcode': 'non', + 'restaurant-pricerange': 'cat', + 'restaurant-address': 'non', + 'restaurant-area': 'cat', + 'restaurant-postcode': 'non', + 'attraction-phone': 'non', + 'attraction-entrance fee': 'non', + 'booking-time': 'non', + 'booking-day': 'cat', + 'attraction-type': 'non', + 'attraction-area': 'cat', + 'train-destination': 'non', + 'train-arriveBy': 'non', + 'train-departure': 'non', + 'hotel-internet': 'cat', + 'hotel-area': 'cat', + 'booking-stay': 'non', + 'booking-people': 'non', + 'train-duration': 'non', + 'train-people': 'non', + 'train-day': 'cat', + 'train-Ref': 'non', + 'hotel-stars': 'cat', + 'train-leaveAt': 'non', + 'train-price': 'non', + 'hotel-parking': 'cat', + 
'hotel-phone': 'non', + 'hotel-name': 'non', + 'hotel-pricerange': 'cat', + 'hotel-people': 'non', + 'restaurant-phone': 'non', + 'hotel-postcode': 'non', + 'hotel-address': 'non', + 'attraction-name': 'non', + 'hotel-type': 'non', + 'restaurant-people': 'non', + 'train-choice': 'non', + 'attraction-pricerange': 'cat', + 'hotel-stay': 'non', + 'booking-name': 'non', + 'booking-Ref': 'non', + 'restaurant-time': 'non', + 'restaurant-day': 'cat', + 'hotel-day': 'cat', + 'hotel-choice': 'non', + 'restaurant-choice': 'non', + 'attraction-choice': 'non', + 'taxi-taxi_phone': 'non', + 'taxi-taxi_types': 'non', + 'police-address': 'non', + 'police-postcode': 'non', + 'police-phone': 'non' +} + +state_cat_slot_value_dict = { + "hotel-pricerange": { + "cheap": 735, + "moderate": 1063, + "expensive": 594, + }, + "hotel-parking": { + "yes": 1809, + "no": 126, + "free": 4, + }, + "hotel-day": { + "tuesday": 385, + "wednesday": 410, + "monday": 365, + "saturday": 407, + "friday": 393, + "thursday": 384, + "sunday": 369, + }, + "train-day": { + "wednesday": 533, + "monday": 533, + "saturday": 543, + "thursday": 547, + "friday": 563, + "tuesday": 553, + "sunday": 613, + }, + "hotel-stars": { + "4": 1263, + "2": 193, + "0": 201, + "3": 401, + "5": 45, + "1": 45, + }, + "hotel-internet": { + "yes": 1841, + "no": 79, + "free": 2 + }, + "hotel-area": { + "east": 416, + "north": 717, + "centre": 538, + "south": 289, + "west": 316, + }, + "attraction-area": { + "centre": 1290, + "west": 332, + "north": 155, + "south": 240, + "east": 272, + }, + "restaurant-pricerange": { + "expensive": 1477, + "cheap": 758, + "moderate": 1028, + }, + "restaurant-area": { + "centre": 1745, + "south": 398, + "north": 390, + "east": 360, + "west": 423, + }, + "restaurant-day": { + "thursday": 362, + "wednesday": 412, + "friday": 395, + "monday": 383, + "sunday": 399, + "saturday": 421, + "tuesday": 350, + } +} + + +synonyms = [ + ["el shaddia guesthouse", "el shaddai"], + [ "peterborough", "peterbourgh"], 
+ ["night club", "nightclub", 'nightclubs'], + ["boat", "boating"], + ["portugese", "portuguese"], + ["guesthouse", "guest house"], + ["seafood", "sea food"], + ["christ 's college", "christ college"], + ["huntingdon marriott hotel"] +] + +state_cat_slot_ds = [k for k, v in slot_to_type.items() if v == 'cat'] + +da_cat_slot_values = { + # 'hotel-stay': ['1', '2', '3', '4', '5'], + 'hotel-internet': ['free', 'no', 'none', 'yes'], + 'hotel-parking': ['free', 'no', 'none', 'yes'] +} + +state_cat_slot_values = {} + +multiwoz_desc = { + 'taxi': { + 'domain': 'taxi information query system', + 'taxi_phone': 'taxi phone number', + 'taxi_types': 'taxi type', + }, + 'restaurant': { + 'domain': 'restaurant information query system', + 'address': 'exact location of the restaurant', + 'postcode': 'postcode of the restaurant', + 'phone': 'restaurant phone number', + 'choice': 'number of restaurants meeting requests of user', + }, + 'attraction': { + 'domain': 'an entertainment that is offered to the public', + 'address': 'details of where the attraction is', + 'postcode': 'postcode of the attraction', + 'phone': 'phone number of the attraction', + 'entrance fee': 'the fee charged for admission to the attraction', + 'pricerange': 'the price range for the attraction, from cheap to expensive', + 'choice': 'number of attractions matching requests of user' + }, + 'booking': { + 'domain': 'to arrange with a taxi, restaurant, train, etc.', + 'time': 'time for an order', + 'day': 'day for an order, from monday to sunday', + 'stay': 'for how long the user wish to be at a place', + 'people': 'how many person the order is for', + 'name': 'name of the ordered place', + 'Ref': 'reference number of the order' + }, + 'train': { + 'domain': 'query and order a train', + 'duration': 'the length of time the train trip lasts', + 'Ref': 'reference number of the order', + 'price': 'price for the train ticket', + 'choice': 'number of trains that meets requests of the user', + }, + 'hotel': { + 
# Spelled-out forms of the small numbers that may appear as words in utterances.
digit2word = {
    '0': 'zero', '1': 'one', '2': 'two', '3': 'three', '4': 'four', '5': 'five',
    '6': 'six', '7': 'seven', '8': 'eight', '9': 'nine', '10': 'ten'
}


def pharse_in_sen(phrase, sen):
    '''
    Locate a value inside a sentence (case-insensitive).

    A value only counts as matched when it is delimited on both sides (string
    boundary, whitespace or punctuation), so 'west' is not found inside
    'northwest'.  For a digit string that has a spelled-out form in
    ``digit2word`` the word form is searched first and the digit itself is
    the fallback.

    :param phrase: str, value to look for
    :param sen: str, sentence to search in
    :return: ``((start, end), num)`` where ``start``/``end`` are the character
        span of the first match and ``num`` is the number of non-overlapping
        matches of the pattern that succeeded; ``((None, None), 0)`` if the
        value does not occur.
    '''
    assert isinstance(phrase, str)
    # Raw strings: '\s', '\.', '\?' and '\D' are regex escapes, not string escapes.
    # pw delimits ordinary words/phrases.
    pw = r'(^|[\s,\.:\?!-])(?P<v>{})([\s,\.:\?!-]|$)'
    # pn delimits numbers: ',', '.' and ':' only count as a boundary when the
    # character on their far side is not a digit, so '12' is not matched
    # inside '12:30' or '3.12'.
    pn = r'(^|[\s\?!-]|\D[,\.:])(?P<v>{})($|[\s\?!-]|[,\.:]\D|[,\.:]$)'

    is_number = phrase.isdigit()
    attempts = []
    if is_number and phrase in digit2word:
        # Search the spelled-out form first, but keep the digit string itself
        # untouched so that the numeric fallback below still looks for it.
        attempts.append((pw, digit2word[phrase]))
    attempts.append((pn if is_number else pw, phrase))

    for template, text in attempts:
        p = re.compile(template.format(re.escape(text)), re.I)
        m = p.search(sen)
        if m:
            return m.span('v'), len(p.findall(sen))
    return (None, None), 0


def update_state(state, update):
    '''
    Merge ``update`` into ``state`` in place.

    :param state: dict mapping service name -> dict of slot values; modified
    :param update: dict with the same layout holding the new slot values
    :return: None
    '''
    for service, service_update in update.items():
        if service not in state:
            # Deep copy so that later changes to ``state`` never leak back
            # into the ``update`` object.
            state[service] = copy.deepcopy(service_update)
        else:
            state[service].update(service_update)
thank, bye + if Domain == 'general': + assert S == 'none' + assert v == 'none' + converted_dict = { + 'intent': Act.lower(), + 'domain': '', + 'slot': '', + 'value': '' + } + converted_da['binary'].append(converted_dict) + + if converted_dict not in binary_ont: + binary_ont.append(converted_dict) + continue + + + + try: + reformated_slot = REF_SYS_DA[Domain][S] + except: + # print('44444444444444444444444444444444') + # print(Domain, S) + # logging.info('slot not in REF_SYS_DA, drop') + continue + + # if slot is None, da should be converted into binary + if reformated_slot is None: + if not (S == 'none' and v == 'none'): + # mainly for `Open` slot + # print('11111111111111111111') + # print(Domain_Act, S, v) + continue + # Booking-Inform none none + # Police-Inform none none + # Train-OfferBook none none + converted_dict = { + 'intent': Act.lower(), + 'domain': Domain.lower(), + 'slot': '', + 'value': '' + } + converted_da['binary'].append(converted_dict) + if converted_dict not in binary_ont: + binary_ont.append(converted_dict) + continue + + reformated_domain_slot = Domain.lower() + '-' + reformated_slot + + if Act.lower() == 'request': + converted_dict = { + 'intent': 'request', + 'domain': Domain.lower(), + 'slot': reformated_slot, + 'value': '' + } + converted_da['binary'].append(converted_dict) + + if converted_dict not in binary_ont: + binary_ont.append(converted_dict) + continue + + # vs = da_dict[(Domain_Act, S)]['values'] + + if reformated_domain_slot in slot_to_type and slot_to_type[reformated_domain_slot] == 'cat': + origin_v = v + v = v.lower() + # if reformated_domain_slot in cat_slot_proj: + # v = cat_slot_proj[reformated_domain_slot][v] + if reformated_domain_slot not in da_cat_slot_values: + da_cat_slot_values[reformated_domain_slot] = [] + # if v not in cat_slot_values[reformated_domain_slot]: + da_cat_slot_values[reformated_domain_slot].append(v) + converted_da['categorical'].append({ + 'intent': Act.lower(), + 'domain': Domain.lower(), + 'slot': 
reformated_slot, + 'value': v + }) + if 'start_word' in da_dict[(Domain_Act, S, origin_v)]: + start_ws = da_dict[(Domain_Act, S, origin_v)]['start_word'] + end_ws = da_dict[(Domain_Act, S, origin_v)]['end_word'] + utt_list = utt.split() + for start_w, end_w in zip(start_ws, end_ws): + if start_w > len(utt_list) or end_w > len(utt_list): + continue + start_ch = 0 + for i in range(start_w): + start_ch += len(utt_list[i]) + 1 + end_ch = start_ch + for i in range(start_w, end_w): + end_ch += len(utt_list[i]) + 1 + try: + end_ch += len(utt_list[end_w]) + except: + print(utt_list, start_w, end_w) + if not utt[start_ch: end_ch] == origin_v: + # print('2222222222222222222222222') + # print('\n'.join([v, utt[start_ch: end_ch - 1]])) + continue + + else: + converted_da['categorical'][-1].update({ + 'start': start_ch, + 'end': end_ch + }) + break + + else: + if 'start_word' not in da_dict[(Domain_Act, S, v)]: + # todo no span annotation + converted_da['non-categorical'].append({ + 'intent': Act.lower(), + 'domain': Domain.lower(), + 'slot': reformated_slot, + 'value': v + }) + continue + + start_ws = da_dict[(Domain_Act, S, v)]['start_word'] + end_ws = da_dict[(Domain_Act, S, v)]['end_word'] + utt_list = utt.split() + found = True + for start_w, end_w in zip(start_ws, end_ws): + if start_w > len(utt_list) or end_w > len(utt_list): + continue + start_ch = 0 + for i in range(start_w): + start_ch += len(utt_list[i]) + 1 + end_ch = start_ch + for i in range(start_w, end_w): + end_ch += len(utt_list[i]) + 1 + try: + end_ch += len(utt_list[end_w]) + except: + print(utt_list, start_w, end_w, v) + if not utt[start_ch: end_ch] == v: + # print('2222222222222222222222222') + # print('\n'.join([v, utt[start_ch: end_ch - 1]])) + continue + + else: + found = True + converted_da['non-categorical'].append({ + 'intent': Act.lower(), + 'domain': Domain.lower(), + 'slot': reformated_slot, + 'value': v, + 'start': start_ch, + 'end': end_ch + }) + break + + if not found: + 
converted_da['non-categorical'].append({ + 'intent': Act.lower(), + 'domain': Domain.lower(), + 'slot': reformated_slot, + 'value': v + }) + return converted_da + + +def get_state_update(prev_state, cur_state, dialog, did, tid, utt, coref_dict, slot_notfound_dict, da_cat_slot_values): + prev_turns = dialog['turns'] + state_update = {'categorical': [], 'non-categorical': []} + notfoundnum = 0 + total_value = 0 + + diff_state = {} + if prev_state is None: + diff_state = {domain: {slot: value for slot, value in cur_state[domain].items() if value != ''} for domain in + cur_state} + else: + assert len(prev_state) == len(cur_state), print(prev_state, cur_state) + for domain, domain_state in prev_state.items(): + if domain not in diff_state: + diff_state[domain] = {} + for slot, value in domain_state.items(): + if value != cur_state[domain][slot]: + # assert len(cur_state[domain][slot]) > 0, print(did, tid, domain, slot, utt) + diff_state[domain][slot] = cur_state[domain][slot] + + ret_diff_state = copy.deepcopy(diff_state) + + + + for domain in diff_state: + for slot in diff_state[domain]: + + total_value += 1 + fix_or = False + if '|' in diff_state[domain][slot]: + value = diff_state[domain][slot].split('|')[0] + else: + value = diff_state[domain][slot] + + # if dialog['original_id'] == 'PMUL2512' and tid == 17 and value == '02:45': + # value = '2:45' + + value_list = [value] + for _synonyms in synonyms: + if value in _synonyms: + value_list = _synonyms + + value_list.extend(get_time_variants(value)) + value_list.extend(get_genitive_variants(value)) + value_list.extend(get_bb_variants(value)) + + if value.endswith(' restaurant'): + value_list.append(value.split(' restaurant')[0]) + if value.endswith(' hotel'): + value_list.append(value.split(' hotel')[0]) + found = False + for value in value_list: + # categorical slots + if slot in ['internet', 'parking', 'pricerange', 'day', 'area', 'stars']: + reformated_domain_slot = '-'.join([domain, slot]) + if 
reformated_domain_slot in state_cat_slot_value_dict and (value in state_cat_slot_value_dict[reformated_domain_slot] or value in ['dontcare', '', 'none', 'not mentioned']): + state_update['categorical'].append({ + 'domain': domain, + 'slot': slot, + 'value': diff_state[domain][slot] + }) + if domain + '-' + slot not in da_cat_slot_values: + da_cat_slot_values[domain + '-' + slot] = [diff_state[domain][slot]] + da_cat_slot_values[domain + '-' + slot].append(diff_state[domain][slot]) + if value != diff_state[domain][slot]: + state_update['categorical'][-1].update({'fixed_value': value}) + ret_diff_state[domain][slot] = value + else : + for _turn in prev_turns[::-1]: + found = False + for da in _turn['dialogue_act']['categorical']: + if da['value'] == value: + if 'start' in da: + state_update['categorical'][-1].update({ + 'utt_idx': _turn['utt_idx'], + 'start': da['start'], + 'end': da['end'], + 'from': 'prev_da_span' + }) + found = True + break + if found: + break + else: + state_update['categorical'].append({ + 'domain': domain, + 'slot': slot, + 'value': diff_state[domain][slot], + 'fixed_value': 'not found' + }) + if domain + '-' + slot not in da_cat_slot_values: + da_cat_slot_values[domain + '-' + slot] = [diff_state[domain][slot]] + da_cat_slot_values[domain + '-' + slot].append(diff_state[domain][slot]) + ret_diff_state[domain][slot] = 'not found' + notfoundnum += 1 + # reformated_domain_slot = '-'.join([domain, slot] + found = True + break + + # process value ---> none + assert value not in ['none', 'not mentioned'] + if value in ['', 'dontcare']: + # if reformated_domain_slot not in state_cat_slot_values: + # state_cat_slot_values[reformated_domain_slot] = [] + # # if v not in cat_slot_values[reformated_domain_slot]: + # state_cat_slot_values[reformated_domain_slot].append(value) + state_update['non-categorical'].append({ + 'domain': domain, + 'slot': slot, + 'value': diff_state[domain][slot] + }) + found = True + break + + # first look for values in 
coref_dict + for _Domain_Act, _Slot, _value in coref_dict: + _domain, _act = _Domain_Act.lower().split('-') + _slot = _Slot.lower() + _coref_value = coref_dict[(_Domain_Act, _Slot, _value)]['coref_value'] + if _coref_value == '': + continue + _coref_turn = coref_dict[(_Domain_Act, _Slot, _value)]['turn'] + if _coref_turn == -1: + continue + _coref_pos = coref_dict[(_Domain_Act, _Slot, _value)]['pos'] + if _coref_pos == '': + continue + _utt = coref_dict[(_Domain_Act, _Slot, _value)]['utt'] + if _domain == domain and _slot == slot and value == _coref_value: + + start_w, end_w = [int(p) for p in _coref_pos.split('-')] + utt_list = _utt.split() + start_ch = 0 + for i in range(start_w): + start_ch += len(utt_list[i]) + 1 + end_ch = start_ch + for i in range(start_w, end_w + 1): + end_ch += len(utt_list[i]) + 1 + end_ch -= 1 + + if not _utt[start_ch: end_ch] == _coref_value: + # print(111111111111111111111111111111111) + # print(_utt[start_ch: end_ch], _coref_value) + continue + + state_update['non-categorical'].append({ + 'domain': domain, + 'slot': slot, + 'value': diff_state[domain][slot], + 'from': 'coref', + 'utt_idx': _coref_turn, + 'start': start_ch, + 'end': end_ch + }) + if value != diff_state[domain][slot]: + state_update['categorical'][-1].update({'fixed_value': value}) + ret_diff_state[domain][slot] = value + found = True + + if found: + break + + # from da annotation + for _turn in prev_turns[::-1]: + for da in _turn['dialogue_act']['non-categorical']: + # if da['domain'] == domain and da['slot'] == slot and fuzz.ratio(da['value'], value) > 85: + # if not da['value'] == value: + # print(1111111111111111) + # print(value, da['value']) + + if fuzz.ratio(da['value'], value) > 85: + + if 'start' in da: + found = True + state_update['non-categorical'].append({ + 'domain': domain, + 'slot': slot, + # 'value': da['value'], + 'value': diff_state[domain][slot], + 'utt_idx': _turn['utt_idx'], + 'start': da['start'], + 'end': da['end'], + 'from': 'prev_da_span' + }) + 
if value != diff_state[domain][slot]: + state_update['non-categorical'][-1].update({'fixed_value': value}) + ret_diff_state[domain][slot] = value + if da['value'] != value: + state_update['non-categorical'][-1].update({'fixed_value':da['value']}) + ret_diff_state[domain][slot] = da['value'] + + break + if found: + break + + if found: + break + + # from utterance + for _turn in prev_turns[::-1]: + _utt = _turn['utterance'] + (start, end), num = pharse_in_sen(str(value), _utt) + if num: + assert value.lower() == _utt[start:end].lower() \ + or digit2word[value].lower() == _utt[start:end].lower() + found = True + state_update['non-categorical'].append({ + 'domain': domain, + 'slot': slot, + 'value': diff_state[domain][slot], + # 'value': _utt[start:end].lower(), + # 'fixed_value': _utt[start:end].lower(), + 'from': 'prev_utt', + 'utt_idx': _turn['utt_idx'], + 'start': start, + 'end': end + }) + if value != diff_state[domain][slot]: + state_update['non-categorical'][-1].update({'fixed_value': value}) + ret_diff_state[domain][slot] = value + if value != _utt[start:end].lower(): + state_update['non-categorical'][-1].update({'fixed_value': _utt[start:end].lower()}) + ret_diff_state[domain][slot] = _utt[start:end].lower() + found = True + break + if found: + break + + # from utterance + if not value.isdigit(): + for _turn in prev_turns[::-1]: + _utt = _turn['utterance'] + + s = difflib.SequenceMatcher(None, _utt, value) + matches = s.get_matching_blocks() + + for i, j, n in matches: + possible_value = _utt[i: i+len(value)] + + if i+ len(value) < len(_utt) and _utt[i+len(value)] not in [ ' ', ',', '.', '?', '!', '/'] : + possible_value += _utt[i+len(value):].split()[0] + + if possible_value.startswith('th '): + possible_value = possible_value[3:] + i += 3 + if i > 0 and _utt[i-1] not in [ ' ', ',', '.', '?', '!', '/']: + # cut first incomplete word + if len(possible_value.split()) > 1: + i += len(possible_value.split()[0]) + 1 + possible_value = ' 
def merge_data_annotation():
    '''
    Load the raw dialogues from ``original_data/data.json`` under ``self_dir``.

    Despite its name, this function does not merge anything: the parsed JSON is
    returned exactly as loaded.

    :return: the parsed content of ``data.json``
    '''
    extract_dir = os.path.join(self_dir, 'original_data')
    # extract_dir already starts with self_dir, so it is not prefixed a second
    # time; the resulting path is identical whenever self_dir is absolute.
    data_path = os.path.join(extract_dir, 'data.json')
    # Context manager so the file handle is closed once parsing is done.
    with open(data_path) as data_file:
        data21 = json.load(data_file)
    return data21
['test'] + extract_dir = os.path.join(self_dir, 'original_data') + num_train_dialogue = 0 + num_train_utt = 0 + + num_match_error_da_span = 0 + + if not os.path.exists('data.zip') or not os.path.exists('ontology.json'): + # for data_split in data_splits: + # data_zip_file = os.path.join(self_dir, 'original_data.zip') + # if not os.path.exists(data_zip_file): + # raise FileNotFoundError(data_zip_file) + + # logging.info('unzip multiwoz data to {}'.format(extract_dir)) + # archive = zipfile.ZipFile(data_zip_file, 'r') + # archive.extractall(extract_dir) + + data = merge_data_annotation() + # exit() + # data = json.load(open(os.path.join(self_dir, extract_dir, 'data_meta_fixed.json'))) + train_list = open(os.path.join(self_dir, extract_dir, 'trainListFile')).read().split() + val_list = open(os.path.join(self_dir, extract_dir, 'valListFile')).read().split() + test_list = open(os.path.join(self_dir, extract_dir, 'testListFile')).read().split() + + total_not_found_slot = 0 + total_slot = 0 + total_turn = 0 + total_not_found_turn = 0 + total_not_found_state = 0 + + slot_notfound_dict = {} + + dialog_idx = 0 + for dialog_id, dialog in tqdm(data.items()): + + acc_not_found_flag = False + + coref_dict = {} + + data_split = None + for _split in data_splits: + if dialog_id.strip('.json') in eval(_split + '_list'): + data_split = _split + break + # assert data_split is not None + # if data_split != 'test': + # continue + if data_split is None: + continue + + if data_split == 'train': + num_train_dialogue += len(data) + + dialog_idx += 1 + # if dialog_idx > 10: + # break + converted_dialogue = { + 'dataset': 'multiwoz21', + 'data_split': data_split, + 'dialogue_id': 'multiwoz21_' + str(dialog_idx), + 'original_id': dialog_id, + 'domains': [d for d in dialog['goal'] if + len(dialog['goal'][d]) != 0 and d in multiwoz_desc and d not in ['police', 'hospital', 'bus']], + 'turns': [], + } + + if data_split == 'train': + num_train_utt += len(dialog['log']) + + prev_state = None + 
accum_fixed_state = {} + for turn_id, turn in enumerate(dialog['log']): + + utt = turn['text'].lower() + # for several wrong words + utt = utt.replace('seeuni', 'see uni') + + utt = ' '.join(utt.split()) + utt = utt.replace(' im ', ' i\'m ') + utt = utt.replace(' dont ', ' don\'t ') + utt = utt.replace(' thats ', ' that\'s ') + utt = utt.replace('idon\'t', ' i don\'t ') + utt = utt.replace('wedon\'t ', 'we don\'t ') + utt = utt.replace('id be ', 'i\'d be ') + # utt = utt.replace('cambridgethat\'svery ', 'cambridge that\'s very') + utt = re.sub(r'^im ', 'i\'m ', utt) + utt = re.sub(r'^whats ', 'what\'s ', utt) + utt = re.sub(r'^id ', 'i\'d ', utt) + utt = re.sub(r'^thats ', 'that\'s ', utt) + + utt = re.sub( r'([a-zA-Z0-9])([,.!\'-\?"~])', r'\1 \2', utt) + utt = re.sub(r'([,.!\'-\?"~])([a-zA-Z0-9])', r'\1 \2', utt) + + das = turn.get('dialog_act', []) + role = 'user' if turn_id % 2 == 0 else 'system' + spans = turn.get('span_info', []) + + da_dict = {} + for Domain_Act in das: + Domain = Domain_Act.split('-')[0] + if Domain.lower() not in converted_dialogue['domains'] and Domain.lower() not in ['general', 'booking']: + continue + + Svs = das[Domain_Act] + for S, v in Svs: + v = v.lower() + if v.startswith('th '): + # print(v) + v = v[3:] + if v.startswith('he '): + # print(v) + v = v[3:] + + if (Domain_Act, S, v) not in da_dict: + da_dict[(Domain_Act, S, v)] = {} + + for span in spans: + Domain_Act, S, v, start_word, end_word = span + v = v.lower() + if not (Domain_Act, S, v) in da_dict: + # logging.info('span da annotation not found in multiwoz da label') + # logging.info(dialog_id, turn_id) + # logging.info((Domain_Act, S, v)) + # logging.info(da_dict) + num_match_error_da_span += 1 + else: + if v.startswith('th '): + # print(v) + v = v[3:] + start_word += 3 + if v.startswith('he '): + # print(v) + v = v[3:] + start_word += 3 + + if 'start_word' not in da_dict[(Domain_Act, S, v)]: + da_dict[(Domain_Act, S, v)]['start_word'] = [] + da_dict[(Domain_Act, S, 
v)]['end_word'] = [] + + da_dict[(Domain_Act, S, v)]['start_word'].append(start_word) + da_dict[(Domain_Act, S, v)]['end_word'].append(end_word) + + converted_turn = { + 'utt_idx': turn_id, + 'speaker': role, + 'utterance': utt, + 'dialogue_act': convert_da(utt, da_dict, binary_ont, intent_ont, dialog_id, turn_id, da_cat_slot_values), + } + + # for state annotations + if role == 'system': + turn_state = turn['metadata'] + cur_state = {} + for domain in turn_state: + if domain in ['police', 'hospital', 'bus']: + continue + if domain not in converted_dialogue['domains']: + continue + cur_state[domain] = {} + for subdomain in ['semi', 'book']: + for slot in turn_state[domain][subdomain]: + if slot == 'booked': + continue + if slot == 'ticket': # or (domain == 'train' and slot == 'people'): + # for cases where domain slot exists in REF but not in state + # because of check in evaluate.py + continue + + else: + fixed_slot = slot + state_ds = domain + '-' + fixed_slot + if state_ds not in slot_to_type: + logging.info('state slot not defined in da list') + logging.info(state_ds) + if turn_state[domain][subdomain][slot] in ['', [], 'not mentioned', 'none']: + cur_state[domain][fixed_slot] = "" + else: + if turn_state[domain][subdomain][slot].startswith('th '): + # print('state') + # print(turn_state[domain][subdomain][slot]) + turn_state[domain][subdomain][slot] = turn_state[domain][subdomain][slot][3:] + if turn_state[domain][subdomain][slot].startswith('he '): + # print('state') + # print(turn_state[domain][subdomain][slot]) + turn_state[domain][subdomain][slot] = turn_state[domain][subdomain][slot][3:] + + cur_state[domain][fixed_slot] = turn_state[domain][subdomain][slot] + + if domain not in state_ont: + state_ont[domain] = [] + if fixed_slot not in state_ont[domain]: + state_ont[domain].append(fixed_slot) + + if domain == 'train' and 'people' not in cur_state[domain]: + cur_state[domain]['people'] = '' + # if len(converted_turn['state'][domain]) == 0: + # 
converted_turn['state'].pop(domain) + if len(converted_dialogue['turns']) > 0: + # move state from system side to user side + converted_dialogue['turns'][-1]['state'] = copy.deepcopy(cur_state) + + # for state update annotations + state_update, _notfoundslot, _totalslot, ret_diff_state = get_state_update(prev_state, cur_state, converted_dialogue, + dialog_id, turn_id, turn['text'], coref_dict, + slot_notfound_dict, da_cat_slot_values) + + update_state(accum_fixed_state, ret_diff_state) + for domain in accum_fixed_state: + for slot in accum_fixed_state[domain]: + assert isinstance(accum_fixed_state[domain][slot], str), print(accum_fixed_state[domain][slot]) + + if _notfoundslot == 0: + # for slot in state_update['categorical']: + # assert 'fixed_value' not in slot + for slot in state_update['non-categorical']: + if slot['value'] not in ['', 'dontcare']: + assert 'utt_idx' in slot + + else: + flag = False + for slot in state_update['categorical']: + if 'fixed_value' in slot: + flag = True + break + for slot in state_update['non-categorical']: + if 'utt_idx' not in slot: + flag = True + break + assert flag, print(flag, state_update['non-categorical']) + + total_turn += 1 + total_slot += _totalslot + total_not_found_slot += _notfoundslot + total_not_found_turn += 1 if _notfoundslot > 0 else 0 + if _notfoundslot > 0: + acc_not_found_flag = True + if acc_not_found_flag: + total_not_found_state += 1 + + coref_dict = {} + converted_dialogue['turns'][-1]['state_update'] = copy.deepcopy(state_update) + converted_dialogue['turns'][-1]['fixed_state'] = copy.deepcopy(accum_fixed_state) + if 'state' not in converted_dialogue['turns'][-1]: + converted_dialogue['turns'][-1]['state'] = {} + prev_state = copy.deepcopy(cur_state) + + converted_dialogue['turns'].append(converted_turn) + + if 'coreference' in turn: + for Domain_Act in turn['coreference']: + for Slot, value, coref, coref_turn, coref_pos in turn['coreference'][Domain_Act]: + value = value.lower() + 
coref_dict[(Domain_Act, Slot, value)] = {'turn': coref_turn, 'pos': coref_pos, + 'coref_value': coref, + 'utt': converted_dialogue['turns'][coref_turn][ + 'utterance']} + + check_spans(converted_dialogue) + # postprocess_update_spans(converted_dialogue) + if converted_dialogue['turns'][-1]['speaker'] == 'system': + converted_dialogue['turns'].pop(-1) + all_data.append(converted_dialogue) + + print('total_turn', total_turn) + print('total_not_found_turn', total_not_found_turn) + print('total_slot', total_slot) + print('total_not_found_slot', total_not_found_slot) + print('total_not_found_state', total_not_found_state) + print(slot_notfound_dict) + from collections import Counter + # print({k : dict(Counter(v)) for k, v in cat_slot_values.items()}) + json.dump({k : dict(Counter(v)) for k, v in state_cat_slot_values.items()}, open(os.path.join(self_dir, 'cat_slot_values.json'), 'w'), indent=4) + cat_slot_values = {k: list(set(v)) for k, v in state_cat_slot_values.items()} + da_cat_slot_values = {k: list(set(v)) for k, v in da_cat_slot_values.items()} + + json.dump(all_data, open('data.json', 'w'), indent=4) + write_zipped_json(os.path.join(self_dir, './data.zip'), 'data.json') + os.remove('data.json') + + new_ont = { + 'domains': {}, + 'intents': {}, + 'binary_dialogue_act': {} + } + + for d_s in slot_to_type: + d, s = d_s.split('-') + if d not in new_ont['domains']: + new_ont['domains'][d] = { + 'description': multiwoz_desc[d]['domain'], + 'slots': {} + } + domain_ont = new_ont['domains'][d] + assert s not in domain_ont + domain_ont['slots'][s] = { + 'description': multiwoz_desc[d][s] if s in multiwoz_desc[d] else '', + 'is_categorical': d_s in state_cat_slot_ds, + 'possible_values': da_cat_slot_values[d_s] if d_s in state_cat_slot_ds else [] + } + domain_ont['slots'][s]['possible_values'] = [_ for _ in domain_ont['slots'][s]['possible_values'] if _ not in ['dontcare', '']] + + new_ont['state'] = {} + # print(state_cat_slot_value_dict) + print(state_ont) + for d in 
state_ont: + new_ont['state'][d] = {} + for s in state_ont[d]: + d_s = '-'.join([d, s]) + new_ont['state'][d][s] = '' + + new_ont['intents'] = {i: {'description': multiwoz_desc['intents'][i]} for i in intent_ont} + new_ont['binary_dialogue_act'] = binary_ont + + slot_desc = json.load(open(os.path.join(self_dir, extract_dir, './slot_descriptions.json'))) + for domain_slot in slot_desc: + _domain, _slot = domain_slot.split('-') + _desc = slot_desc[domain_slot][0] + if _slot == 'arriveby': + _slot = 'arriveBy' + elif _slot == 'leaveat': + _slot = 'leaveAt' + if 'book' in _slot: + _slot = _slot.replace('book ', '') + if not _domain in new_ont['state']: + # logging.info('domain {} not in state domains'.format(_domain)) + continue + if _domain in new_ont['domains'] and _slot in new_ont['domains'][_domain]['slots']: + new_ont['domains'][_domain]['slots'][_slot]['description'] = _desc + if not _slot in new_ont['state'][_domain]: + logging.info('domain {} slot {} not in state'.format(_domain, _slot)) + continue + # new_ont['state'][_domain][_slot] = "" + assert _domain in new_ont['domains'], print(_domain) + assert _slot in new_ont['domains'][_domain]['slots'] + + logging.info('num_match_error_da_span {}'.format(num_match_error_da_span)) + json.dump(new_ont, open(os.path.join(self_dir, './ontology.json'), 'w'), indent=4) + + else: + all_data = read_zipped_json(os.path.join(self_dir, './data.zip'), 'data.json') + new_ont = json.load(open(os.path.join(self_dir, './ontology.json'), 'r')) + logging.info('# dialogue: {}, # turn: {}'.format(num_train_dialogue, num_train_utt)) + return all_data, new_ont + + +# def postprocess_update_spans(dialog): +# changed_utt_idx_and_position = {} +# for turn in dialog['turns']: +# if turn['speaker'] != 'user': +# continue +# changed = False +# for _update in turn['state_update']['non-categorical']: +# if 'utt_idx' in _update: +# utt_idx = _update['utt_idx'] +# start = _update['start'] +# end = _update['end'] +# +# # assume at most one word 
changes for every utterance +# if turn['utt_idx'] not in changed_utt_idx_and_position: +# if utt_idx == turn['utt_idx'] and start-1 > -1 and turn['utterance'][start-1] not in [' ']: +# changed_utt_idx_and_position[turn['utt_idx']] = start +# print('=======================') +# print(dialog['original_id']) +# print(turn['utterance']) +# print(json.dumps(_update, indent=2)) +# print(turn['utterance'][start: end]) +# turn['utterance'] = turn['utterance'][:start] + ' ' + turn['utterance'][start:] +# print(turn['utterance']) +# _update['start'] += 1 +# _update['end'] += 1 +# changed = True +# if utt_idx not in changed_utt_idx_and_position: +# continue +# else: +# value = _update['fixed_value'] if 'fixed_value' in _update and _update['fixed_value'] != 'not found' else _update['value'] +# if start >= changed_utt_idx_and_position[utt_idx]: +# if dialog['turns'][utt_idx]['utterance'][_update['start']: _update['end']] != value: +# assert dialog['turns'][utt_idx]['utterance'][_update['start']+1: _update['end']+1] == value, print(dialog['turns'][utt_idx]['utterance'], dialog['turns'][utt_idx]['utterance'][_update['start']+1: _update['end']+1]) +# _update['start'] += 1 +# _update['end'] += 1 +# elif start < changed_utt_idx_and_position[utt_idx] < end: +# if dialog['turns'][utt_idx]['utterance'][_update['start']: _update['end']] != value: +# assert (dialog['turns'][utt_idx]['utterance'][_update['start']: _update['end']+1]).replace(' ', '') == value.replace(' ', ''), print(dialog['turns'][utt_idx]['utterance'], dialog['turns'][utt_idx]['utterance'][_update['start']: _update['end']+1], value) +# print('fix') +# print(_update) +# _update['end'] += 1 +# _update['fixed_value'] = turn['utterance'][_update['start']: _update['end'] + 1].strip() +# print(_update) +# if changed: +# for _update in turn['state_update']['non-categorical']: +# if 'utt_idx' in _update: +# utt_idx = _update['utt_idx'] +# start = _update['start'] +# end = _update['end'] +# +# if utt_idx not in 
def get_time_variants(time_text):
    '''
    Build alternative spellings of the first clock time found in a text.

    Only the first ``H:MM`` / ``HH:MM`` occurrence (optionally followed by
    am/pm/AM/PM) is used. The variants cover 12-hour and 24-hour renderings,
    the form without a leading zero and the form without the colon.

    :param time_text: str that may contain a clock time
    :return: list of distinct variant strings, starting with ``time_text``
        itself; an empty list when no clock time is found
    '''
    pattern_time = r'(\d{1,2}:\d{2})(\s)?(am|pm|AM|PM)?'
    match_times = re.findall(pattern_time, time_text)
    if not match_times:
        return []
    clock, _, suffix = match_times[0]
    suffix = suffix.lower()
    hour_text, minute = clock.split(':')
    hour = int(hour_text)

    variants = [time_text]
    if suffix == 'am':
        # 4:00am -> 4:00
        variants.append(clock)
        if len(clock) == 4:
            # 4:00 -> 04:00
            variants.append('0' + clock)
    elif suffix == 'pm':
        # 4:00pm -> 16:00, but 12:30pm stays 12:30 (never 24:30).
        hour_24 = hour + 12 if hour < 12 else hour
        variants.append('{}:{}'.format(hour_24, minute))
    else:
        # 24-hour input -> 12-hour spellings; noon and later are "pm",
        # and hours 0 / 12 are written as 12.
        hour_12 = hour % 12 or 12
        meridiem = 'pm' if hour >= 12 else 'am'
        stem = '{}:{}'.format(hour_12, minute)
        variants.extend([stem + meridiem, stem + ' ' + meridiem, stem])
        if minute == '00':
            # 16:00 -> 4pm / 4 pm / 4
            variants.extend([
                str(hour_12) + meridiem,
                str(hour_12) + ' ' + meridiem,
                str(hour_12),
            ])
    if len(clock) == 5 and clock[0] == '0':
        # 04:00 -> 4:00
        variants.append(clock[1:])
    # 04:00 -> 0400
    variants.append(clock.replace(':', ''))

    # Drop repeated spellings while keeping the first-seen order.
    unique_variants = []
    for variant in variants:
        if variant not in unique_variants:
            unique_variants.append(variant)
    return unique_variants
return ret_list + + +def check_spans(dialog): + for turn in dialog['turns']: + if turn['speaker'] != 'user': + continue + for _update in turn['state_update']['non-categorical']: + if 'utt_idx' in _update: + value = _update['fixed_value'] if 'fixed_value' in _update and _update[ + 'fixed_value'] != 'not found' else _update['value'] + utt_idx = _update['utt_idx'] + start = _update['start'] + end = _update['end'] + assert dialog['turns'][utt_idx]['utterance'][start:end] == value, print(dialog['turns'][utt_idx]['utterance'], dialog['turns'][utt_idx]['utterance'][start:end]) + + + +def get_bb_variants(value): + ret_list = [] + if 'bed and breakfast' in value: + ret_list.append(value.replace('bed and breakfast', 'b & b')) + return ret_list + +if __name__ == '__main__': + preprocess(da_cat_slot_values, state_cat_slot_values) \ No newline at end of file diff --git a/data/unified_datasets/multiwoz22/README.md b/data/unified_datasets/multiwoz22/README.md new file mode 100644 index 0000000000000000000000000000000000000000..52db0dd3d3e5e16946d770bc3484d17b50b0dc5c --- /dev/null +++ b/data/unified_datasets/multiwoz22/README.md @@ -0,0 +1,34 @@ +# README + +## Features + +- Annotations: dialogue act, character-level span for non-categorical slots. state and state updates. + +Statistics: + +| | \# dialogues | \# utterances | avg. turns | avg. tokens | \# domains | +| ----- | ------------ | ------------- | ---------- | ----------- | ---------- | +| train | 8434 | 105066 | 12.46 | 17.27 | 7 | +| dev | 999 | 13731 | 13.74 | 17.72 | 7 | +| train | 1000 | 13744 | 13.74 | 17.67 | 7 | + + +## Main changes + +- only keep 5 domains in state annotations and dialog acts. +- `pricerange`, `area`, `day`, `internet`, `parking`, `stars` are considered categorical slots. +- punctuation marks are split from their previous tokens. e.g `I want to find a hotel. 
-> + I want to find a hotel .` + +Run `evaluate.py`: + +da values match rate: 97.944 +state values match rate: 66.945 + +### original data + +- from [multiwoz](https://github.com/budzianowski/multiwoz) repo. +- original multiwoz2.2 dataset gives slot value in List format. We take the first value +in each slot list as ground-truth value. + + diff --git a/data/unified_datasets/multiwoz22/data.zip b/data/unified_datasets/multiwoz22/data.zip new file mode 100644 index 0000000000000000000000000000000000000000..92d82a3382228455b61a4117413731d17ee7add2 Binary files /dev/null and b/data/unified_datasets/multiwoz22/data.zip differ diff --git a/data/unified_datasets/multiwoz22/ontology.json b/data/unified_datasets/multiwoz22/ontology.json new file mode 100644 index 0000000000000000000000000000000000000000..d3cc3c187cc961994cfb477f61a11694c6f156fd --- /dev/null +++ b/data/unified_datasets/multiwoz22/ontology.json @@ -0,0 +1,1879 @@ +{ + "domains": { + "taxi": { + "description": "taxi information query system", + "slots": { + "destination": { + "description": "destination of taxi", + "is_categorical": false, + "possible_values": [] + }, + "departure": { + "description": "departure location of taxi", + "is_categorical": false, + "possible_values": [] + }, + "leaveAt": { + "description": "leaving time of taxi", + "is_categorical": false, + "possible_values": [] + }, + "arriveBy": { + "description": "arrival time of taxi", + "is_categorical": false, + "possible_values": [] + }, + "taxi_phone": { + "description": "taxi phone number", + "is_categorical": false, + "possible_values": [] + }, + "taxi_types": { + "description": "taxi type", + "is_categorical": false, + "possible_values": [] + } + } + }, + "restaurant": { + "description": "restaurant information query system", + "slots": { + "food": { + "description": "food type for the restaurant", + "is_categorical": false, + "possible_values": [] + }, + "name": { + "description": "name of the restaurant", + "is_categorical": false, + 
"possible_values": [] + }, + "pricerange": { + "description": "price budget for the restaurant", + "is_categorical": true, + "possible_values": [ + "varying price", + "every price point", + "a little expensive but worth it", + "assorted", + "most affordable price range", + "the least expensive", + "same price range", + "similar price range", + "priced cheaply", + "varying price range", + "any price point", + "cheap priced", + "moderate price range", + "moderate", + "any", + "rather expensive", + "that price range", + "on the pricey side", + "on the cheap side", + "various", + "your price range", + "expensively priced", + "expensive or moderate", + "moderatley priced", + "moderatre", + "more expensive", + "adforable", + "fine", + "not moderate", + "moderately expensive", + "relatively cheap", + "cheap or expensive", + "moderately prices", + "expensive side", + "ranging from cheap to expensive", + "very expensive", + "ch", + "cherap", + "do nt care", + "low cost", + "do n't care", + "most expensive", + "or the moderately priced", + "moderate and expensive", + "that price", + "expensively", + "reasonable", + "not cheap", + "moderately priced", + "all price ranges", + "or otherwise", + "upscale", + "expensive / upscale", + "fairly expensive", + "great prices", + "quite low", + "moderatly", + "moderate or expensive priced", + "cheaply priced", + "the same", + "expensive range", + "moderate priced", + "this price range", + "pricey", + "moderately - priced", + "reasonably priced", + "high - end", + "cheaper", + "expensive and moderately priced", + "pretty expensive", + "modest", + "they vary", + "luxury options", + "fairly inexpensive", + "pretty cheap", + "expensive but worth every penny", + "moderate and one in the cheap range", + "not expensive", + "that range", + "low priced", + "relatively expensive", + "cheap to expensive", + "mostly expensive and moderately priced", + "not - so - expensive", + "quite inexpensive", + "inexpensive", + "your", + "budget - friendly", + 
"high class", + "expinsive", + "fairly cheap", + "expensive price range", + "any price range", + "moderate pricing", + "high end", + "epensive", + "vietnamese", + "cheap price range", + "mostly expensive", + "less expensive", + "moderate range", + "more moderate", + "priced moderately", + "expensive price", + "centre", + "expensive", + "middle", + "boderate", + "moderate price", + "moderately", + "moderatly priced", + "cheap range", + "regardless of price", + "moderately price", + "north", + "not too expensive", + "not too pricey", + "affordable", + "all", + "east", + "quite expensive", + "cheaply - priced", + "budget conscious", + "cheap", + "budget friendly" + ] + }, + "address": { + "description": "exact location of the restaurant", + "is_categorical": false, + "possible_values": [] + }, + "area": { + "description": "area or place of the restaurant", + "is_categorical": true, + "possible_values": [ + "center of town", + "west part of town .", + "closely located", + "south cambridge", + "cetre", + "the city centre", + "on the east", + "any part of town", + "centre area of the city", + "the north", + "near centre", + "center", + "central region of town", + "not", + "east area of town", + "north side of cambridge", + "north of town", + "centre area", + "north part of town", + "west part of town", + "the south area of the city", + "here", + "south part", + "west", + "in town", + "in the centre", + "the southern area", + "south side", + "west area", + "anywhere", + "west cambridge", + "north area of town", + "the east side of town", + "any", + "south area of town", + "centre cambridge", + "centre of town", + "this town", + "the east part of town", + "cambridge", + "northern part", + "eastside", + "centrally", + "around town", + "close to your location", + "westies", + "the north area", + "thai", + "centre of town .", + "in town at all", + "in the west", + "mexican", + "not in the west", + "the town centre", + "other part of the town", + "the centre", + "southern 
area", + "the west", + "any area", + "southend", + "south of cambridge", + "east section of town", + "the north side", + "the east", + "the north side .", + "elsewhere in the city", + "south of town .", + "the west side of town", + "the centre area of town", + "the center part of town", + "northside", + "do nt care", + "that area", + "the are", + "do n't care", + "north end", + "in the city", + "centrem", + "the centre area", + "south of town", + "the east side", + "centre city", + "the south of town", + "the north end", + "centre region", + "the south side of town", + "east part of tow", + "the center", + "the same area", + "east side", + "town", + "any where in the city", + "throughout the area", + "in the city .", + "town centre", + "same side of town as your hotel", + "the south area of town", + "and centre", + "east area", + "north area", + "the west end", + "north cambridge", + "the area you have chosen", + "west side", + "east of town", + "central", + "east part of town", + "that part of town", + "city center", + "the same", + "the south side", + "city", + "by the airport", + "downtown", + "the center of town", + "chesterton", + "east section", + "southern", + "south part of time", + "ely", + "within this area", + "the north part of town", + "north or centre", + "the west side", + "all over the city", + "center of the town", + "cambridge centre", + "wet part of town", + "center area of town", + "west end of town", + "city centre", + "the area", + "all of cambridge", + "towns centre", + "in the north", + "west part of the city", + "north side of town", + "the south part of town", + "centre of the city", + "the east area", + "here in the city", + "that area of town", + "east side of the city", + "centre of cambridge", + "the city center", + "almost every area of town", + "around there", + "north side of chersteron", + "in cambridge", + "west area of town", + "the area of west", + "town centre area", + "nearby", + "same area as the hotel", + "south part of 
town", + "not in the south of town", + "centreof", + "ctre", + "anywhere in cambridge", + "this area", + "town of centre", + "town center", + "in the east", + "east of cambridge", + "southside", + "all over town", + "south side of town", + "other parts of town", + "northern part of cambridge", + "the same area as the botanic gardens", + "center cambridge", + "south area", + "south", + "near the centre", + "east end", + "throughout the city", + "the centre of town", + "centere", + "near clare hall", + "close to that area", + "in the city centre", + "n the centre", + "the centre part of town", + "the west of town", + "around the college", + "centre area of town", + "centre", + "these areas", + "south end", + "close to the hotel", + "expensive", + "south part of the city", + "close it city centre", + "west of cambridge", + "same area as the park", + "east side of town", + "that", + "west end", + "the south", + "all over cambridge", + "north", + "anywhere in town", + "centre area of the town", + "the west part of town", + "same area", + "south side of the town", + "the west area", + "near the center of town", + "centre part of town", + "northern parts of town", + "west side of town", + "east", + "westside", + "west of town", + "the north side of town", + "north and west", + "centrally located", + "west side area", + "same", + "central area", + "cheap", + "north side" + ] + }, + "postcode": { + "description": "postcode of the restaurant", + "is_categorical": false, + "possible_values": [] + }, + "phone": { + "description": "restaurant phone number", + "is_categorical": false, + "possible_values": [] + }, + "people": { + "description": "number of people booking the restaurant", + "is_categorical": false, + "possible_values": [] + }, + "time": { + "description": "time of the restaurant booking", + "is_categorical": false, + "possible_values": [] + }, + "day": { + "description": "day of the restaurant booking", + "is_categorical": true, + "possible_values": [ + "w", + 
"thursday", + "monday", + "friday", + "saturday", + "tuesday", + "wednesday", + "sunday" + ] + }, + "choice": { + "description": "number of restaurants meeting requests of user", + "is_categorical": false, + "possible_values": [] + } + } + }, + "attraction": { + "description": "an entertainment that is offered to the public", + "slots": { + "address": { + "description": "details of where the attraction is", + "is_categorical": false, + "possible_values": [] + }, + "postcode": { + "description": "postcode of the attraction", + "is_categorical": false, + "possible_values": [] + }, + "phone": { + "description": "phone number of the attraction", + "is_categorical": false, + "possible_values": [] + }, + "entrance fee": { + "description": "the fee charged for admission to the attraction", + "is_categorical": false, + "possible_values": [] + }, + "type": { + "description": "type of the attraction", + "is_categorical": false, + "possible_values": [] + }, + "area": { + "description": "area or place of the attraction", + "is_categorical": true, + "possible_values": [ + "center of town", + "that side of town", + "south cambridge", + "the city centre", + "north in milton", + "western part of the town", + "western area of town", + "whole of cambridge", + "the south near your hotel", + "the north", + "your desired location", + "center", + "cenre", + "east area of town", + "north part of town", + "centre area", + "north of town", + "west part of town", + "near ely", + "center part of town", + "western", + "west", + "in town", + "in the centre", + "south side", + "west cambridge", + "northern area", + "west area", + "the north of the city", + "centre area of cambridge", + "western cambridge", + "western part of town", + "centre of town", + "the east part of town", + "cambridge", + "various", + "west area of the city", + "eastside", + "eat", + "in the south", + "cetnre of town", + "right in the center of town", + "northern", + "different area", + "another area", + "centre area .", 
+ "centre by galleria", + "same general area", + "the centre", + "clifton way", + "the west", + "east cambridge", + "close to the restaurant", + "this side of town", + "the east of the town", + "south of cambridge", + "most of them are in the center", + "the east", + "same area as tandoori palace", + "went side of town", + "the west side of town", + "do nt care", + "close to the center of town", + "that area", + "central zone", + "the centre area", + "south of town", + "near the restaurant", + "the east side", + "centre city", + "centre region", + "center of town .", + "center area", + "all of the other areas", + "the same area", + "the center", + "east side", + "central cambridge", + "town centre", + "center are", + "south park of town", + "center of cambridge", + "east area", + "northern area of town", + "north area", + "the west end", + "east of town", + "to the south", + "west side", + "central", + "that part of town", + "east part of town", + "city center", + "west side of the city", + "cambridge 's centre", + "the south side", + "city cenre", + "city", + "downtown", + "in that area", + "cb30aq", + "east section", + "ely", + "the area you 're looking for", + "the west side", + "all over the city", + "center of the town", + "the eastside", + "in the area", + "other areas", + "the west area of town", + "in the center", + "west end of town", + "city centre", + "city centre .", + "entre", + "the area", + "towns centre", + "northend", + "west part of the city", + "the center area", + "near the hotel", + "centre of town ?", + "south part of cambridge", + "that area .", + "centre of the city", + "southern cambridge", + "museum", + "centre of cambridge", + "all around the city", + "cambridge leisure park", + "several different parts of town", + "city 's centre", + "center of the city", + "in cambridge", + "church area", + "museums", + "west end of the city", + "west area of town", + "same part", + "town centre area", + "nearby", + "south part of town", + "same part of 
town as your restaurant", + "the same road", + "this area", + "in the east", + "town center", + "there", + "same area as hotel", + "ce", + "east of cambridge", + "south side of town", + "other parts of town", + "all over town", + "central district", + "the west - side", + "the center of the park", + "south area", + "south", + "near the centre", + "throughout the city", + "the centre of town", + "the south area", + "in that side", + "very close in the same area", + "cent", + "wet end", + "the centre part of town", + "west of your city", + "the west of town", + "we", + "north section of cambridge", + "centre area of town", + "centre", + "norwich", + "south end", + "every area except the north", + "the centry area", + "east side of town", + "that", + "northern cambridge", + "west end", + "the south", + "north of the city", + "north", + "east end of town", + "the center part of town", + "on the centre", + "same area", + "the west area", + "centre part of town", + "west side of town", + "all", + "east", + "westside", + "centrally located", + "same", + "the area you are looking for", + "west part of cambridge", + "north side" + ] + }, + "name": { + "description": "name of the attraction", + "is_categorical": false, + "possible_values": [] + }, + "pricerange": { + "description": "the price range for the attraction, from cheap to expensive", + "is_categorical": true, + "possible_values": [ + "neither prices are listed", + "from free to 2 pounds 50", + "entrance fee", + "5 pounds", + "4 pounds", + "unaware of what their entrance fee is", + "do n't see a price listed", + "expensive", + "they do n't have the entrance fee posted", + "no entrance fee", + "free", + "5 pound entrance fee", + "2 pounds", + "not sure of the fee", + "we do n't have any information", + "free admission", + "do n't have information", + "3.50 pounds", + "moderately priced", + "cheap", + "not listed" + ] + }, + "choice": { + "description": "number of attractions matching requests of user", + 
"is_categorical": false, + "possible_values": [] + } + } + }, + "booking": { + "description": "to arrange with a taxi, restaurant, train, etc.", + "slots": { + "time": { + "description": "time for an order", + "is_categorical": false, + "possible_values": [] + }, + "day": { + "description": "day for an order, from monday to sunday", + "is_categorical": true, + "possible_values": [ + "tues", + "monday", + "saturday night thru monday", + "tuesday", + "tuesday through friday", + "tuesday instead of thursday", + "tuesday 's", + "sunday", + "sundar", + "sunday 18/06/2017", + "tonight", + "this evening", + "sunday and monday", + "vmhj6y3i", + "at that time", + "today", + "friday", + "saturday", + "that", + "thursday", + "we d", + "weds", + "2", + "the same day", + "6", + "1", + "wednesday", + "that time frame" + ] + }, + "stay": { + "description": "for how long the user wish to be at a place", + "is_categorical": false, + "possible_values": [] + }, + "people": { + "description": "how many person the order is for", + "is_categorical": false, + "possible_values": [] + }, + "name": { + "description": "name of the ordered place", + "is_categorical": false, + "possible_values": [] + }, + "Ref": { + "description": "reference number of the order", + "is_categorical": false, + "possible_values": [] + } + } + }, + "train": { + "description": "query and order a train", + "slots": { + "destination": { + "description": "destination of the train", + "is_categorical": false, + "possible_values": [] + }, + "arriveBy": { + "description": "arrival time of the train", + "is_categorical": false, + "possible_values": [] + }, + "departure": { + "description": "departure location of the train", + "is_categorical": false, + "possible_values": [] + }, + "duration": { + "description": "the length of time the train trip lasts", + "is_categorical": false, + "possible_values": [] + }, + "people": { + "description": "number of people booking for train", + "is_categorical": false, + 
"possible_values": [] + }, + "day": { + "description": "day of the train", + "is_categorical": true, + "possible_values": [ + "monday", + "tuesday", + "tr2519", + "sunday", + "between friday and wednesday", + "sundays", + "every", + "daily", + "mondays", + "sat", + "we", + "all week", + "every day", + "friday", + "saturday", + "that", + "train", + "other days are available", + "this day", + "frday", + "cambridge", + "thursday", + "weds", + "saturdays", + "that day", + "13:07", + "fiday", + "everday", + "wednesday", + "fr" + ] + }, + "Ref": { + "description": "reference number of the order", + "is_categorical": false, + "possible_values": [] + }, + "leaveAt": { + "description": "leaving time for the train", + "is_categorical": false, + "possible_values": [] + }, + "price": { + "description": "price for the train ticket", + "is_categorical": false, + "possible_values": [] + }, + "choice": { + "description": "number of trains that meets requests of the user", + "is_categorical": false, + "possible_values": [] + } + } + }, + "hotel": { + "description": "to query hotel information and place an order", + "slots": { + "internet": { + "description": "internet option at the hotel", + "is_categorical": true, + "possible_values": [ + "yes", + "4", + "do nt care", + "no", + "free", + "y", + "free internet", + "none" + ] + }, + "area": { + "description": "area or place of the hotel", + "is_categorical": true, + "possible_values": [ + "center of town", + "that side of town", + "south cambridge", + "several areas of town", + "any part of the city", + "any part of town", + "the east end of town", + "a different area", + "the north", + "different part of town", + "north end of the city", + "center", + "and east", + "near centre", + "norht", + "northern side of town", + "east area of town", + "somewhere else", + "north of town", + "centre area", + "north part of town", + "west part of town", + "east and the north .", + "next door", + "all over", + "west", + "in town", + "in the 
centre", + "the town center", + "west part", + "south side", + "west area", + "northern area", + "the north of the city", + "north area of town", + "western cambridge", + "the east side of town", + "different parts of town", + "south area of town", + "centre cambridge", + "centre of town", + "cambridge", + "various", + "on the west side", + "north end of town", + "eastside", + "centrally", + "near the museum", + "eat", + "el shaddai", + "on the south", + "close to where you 'll be dining", + "in the west", + "northern", + "the town centre", + "different area", + "the northern part of town", + "the centre", + "the west", + "east cambridge", + "any area", + "southend", + "east section of town", + "the north side", + "southern part of town", + "the east", + "south end of town", + "west side near the restaurant", + "northside", + "do nt care", + "that area", + "north end", + "in the city", + "the centre area", + "south of town", + "near the restaurant", + "the east side", + "the city 's south side", + "centre city", + "the north end", + "north park of town", + "all areas", + "all parts of the city", + "south cambridge area", + "other parts of the city", + "that region", + "the same area", + "centra", + "east side", + "town centre", + "and in the centre", + "same area as the restaurant", + "north part of the city", + "across cambridge", + "east area", + "north area", + "the west end", + "north cambridge", + "east of town", + "west side", + "everywhere but the city centre", + "east part of town", + "that part of town", + "except in the north", + "city center", + "across town", + "west areas of town", + "the south side", + "the same", + "north part of town .", + "north location", + "different parts of the city", + "on the west", + "north of cambridge", + "downtown", + "the center of town", + "the westside", + "the north part of town", + "all over the city", + "the west side", + "center of the town", + "that vicinity", + "on the north", + "in the area", + "in the center", 
+ "city centre", + "the easy", + "northe part of town", + "the area", + "all of cambridge", + "in the north", + "northend", + "or west ?", + "west part of the city", + "north side of town", + "same side of town", + "centre of the city", + "the east area", + "except in the east", + "place to be a guesthouse", + "that area of town", + "centre of cambridge", + "all around the city", + "in cambridge", + "the centre of cambridge", + "west end of the city", + "centre part", + "north par of town", + "central location", + "on the eastside", + "south part of town", + "eastern part of the city", + "the centre of the city", + "east part of time", + "anywhere in cambridge", + "there", + "north and centre", + "south side of town", + "all over town", + "other parts of town", + "southside", + "nborth", + "you are interested in", + "south area", + "south", + "near the centre", + "throughout the city", + "the centre of town", + "the south area", + "south closer to the museum", + "we", + "not in the south", + "centre area of town", + "centre", + "wast", + "south end", + "eastern cambridge", + "various parts of the city", + "east side of town", + "northern cambridge", + "eastern", + "west end", + "the south", + "north", + "east near other shops and boutiques", + "same area", + "the west area", + "centre part of town", + "all across town", + "west side of town", + "throughout cambridge", + "another area of town", + "south end of the city", + "east", + "northern part of town", + "nearby the restaurant", + "west of town", + "centrally located", + "centrally - located", + "cheap", + "not to far from the restaurant", + "north side" + ] + }, + "stars": { + "description": "star rating of the hotel", + "is_categorical": true, + "possible_values": [ + "3-star", + "3", + "does not have", + "foru", + "1-star", + "not as fancy", + "four star", + "yes", + "four starts", + "do nt care", + "five", + "one", + "no star rating", + "two", + "5-star", + "several", + "0 to 4", + "four - star", + "four 
stars", + "no", + "0", + "0-star", + "2-star", + "5", + "drop the star rating", + "different star ratings", + "ranging from 2 - 4 stars", + "2", + "lower", + "4-star", + "4", + "three", + "zero", + "1", + "does not show", + "unrated", + "not rated", + "four" + ] + }, + "parking": { + "description": "parking facility at the hotel", + "is_categorical": true, + "possible_values": [ + "yes", + "free parking", + "do nt care", + "no", + "free", + "n", + "none" + ] + }, + "phone": { + "description": "hotel phone number", + "is_categorical": false, + "possible_values": [] + }, + "name": { + "description": "name of the hotel", + "is_categorical": false, + "possible_values": [] + }, + "pricerange": { + "description": "price budget of the hotel", + "is_categorical": true, + "possible_values": [ + "different", + "different price range", + "cheapest", + "budget - priced", + "cheaper than the others", + "same price range", + "cheap priced", + "moderate price range", + "extremely reasonable", + "slightly more expensive", + "other ranges", + "same price", + "moderate", + "any", + "moderatly price", + "rather expensive", + "that price range", + "on the cheap side", + "your price range", + "lower", + "expensively priced", + "does not say", + "moderatley priced", + "cheaply", + "more expensive", + "expensive to moderate", + "moderately pried", + "great", + "relatively cheap", + "moderately prices", + "moderately priceed", + "unfortunately do not have the price", + "very cheap", + "do nt care", + "moderately priced .", + "low cost", + "all different price ranges", + "varying price ranges", + "or expensive ?", + "very inexpensive", + "moderate to cheap", + "even cheaper", + "moderate in price", + "less costly", + "super cheap", + "moderately priced", + "cheap side", + "higher price range", + "upscale", + "espensive", + "more moderately priced", + "cheaper side", + "cheaply priced", + "the same", + "expensive range", + "moderate priced", + "your chosen", + "economical", + "moderately - 
priced", + "different price ranges", + "modrate", + "your price", + "more budget - friendly", + "quite cheap", + "mostly in the cheap to moderate price range", + "cheaper", + "economically priced", + "lower end", + "affordable pricing", + "pretty expensive", + "cheap to moderate", + "oderately priced", + "epxensive", + "cheap or moderate", + "pretty cheap", + "very affordable", + "free", + "not expensive", + "expensive or cheap", + "that range", + "much cheaper", + "cheap to expensive", + "2", + "unknown", + "inexpensive", + "fairly cheap", + "ca n't view that information", + "expensive price range", + "a little pricey", + "chear", + "moderately to expensively priced", + "moderate pricing", + "moderately pricing", + "moderate prices", + "on the more expensive side", + "cheap price range", + "the cheapest", + "moderate range", + "priced moderately", + "centre", + "expensive", + "inexpensively - priced", + "guesthouses", + "moderate price", + "moderately", + "moderate price point", + "that", + "moderatly priced", + "expensively - priced", + "cheap range", + "moderate to cheap range", + "moderately price", + "north", + "affordable", + "moderate or cheap", + "fairly cheap compared to other hotels", + "all", + "quite expensive", + "cheaply - priced", + "cheaply prices", + "moderately - priced or cheap", + "cheap", + "hotel", + "hotels" + ] + }, + "people": { + "description": "number of people for the hotel booking", + "is_categorical": false, + "possible_values": [] + }, + "postcode": { + "description": "postcode of the hotel", + "is_categorical": false, + "possible_values": [] + }, + "address": { + "description": "exact location of the hotel", + "is_categorical": false, + "possible_values": [] + }, + "type": { + "description": "what is the type of the hotel", + "is_categorical": false, + "possible_values": [] + }, + "stay": { + "description": "length of stay at the hotel", + "is_categorical": false, + "possible_values": [] + }, + "day": { + "description": "day of the 
hotel booking", + "is_categorical": true, + "possible_values": [ + "thursday", + "monday", + "t", + "friday", + "next friday", + "saturday", + "tuesday", + "monda", + "wednesday", + "sunday" + ] + }, + "choice": { + "description": "number of hotels that meets requests of the user", + "is_categorical": false, + "possible_values": [] + } + } + }, + "police": { + "description": "find police stations", + "slots": { + "address": { + "description": "exact location of the police station", + "is_categorical": false, + "possible_values": [] + }, + "postcode": { + "description": "postcode of the police station", + "is_categorical": false, + "possible_values": [] + }, + "phone": { + "description": "police station phone number", + "is_categorical": false, + "possible_values": [] + } + } + } + }, + "intents": { + "inform": { + "description": "inform user of value for a certain slot" + }, + "request": { + "description": "ask for value of a slot" + }, + "recommend": { + "description": "recommend a choice for user request" + }, + "book": { + "description": "place an order for user" + }, + "nobook": { + "description": "inform user of booking failure" + }, + "thank": { + "description": "express gratitude" + }, + "welcome": { + "description": "welcome" + }, + "bye": { + "description": "end a conversation and say goodbye to user" + }, + "reqmore": { + "description": "ask user for more instructions" + }, + "select": { + "description": "provide several choices for user to choose from" + }, + "nooffer": { + "description": "inform user that no options matches user request" + }, + "greet": { + "description": "express greeting" + }, + "offerbook": { + "description": "offer to place an order for user" + }, + "offerbooked": { + "description": "inform user that an order is successful" + } + }, + "binary_dialogue_act": [ + { + "intent": "request", + "domain": "restaurant", + "slot": "area", + "value": "" + }, + { + "intent": "inform", + "domain": "booking", + "slot": "", + "value": "" + }, + {
+ "intent": "request", + "domain": "booking", + "slot": "people", + "value": "" + }, + { + "intent": "request", + "domain": "booking", + "slot": "stay", + "value": "" + }, + { + "intent": "nobook", + "domain": "booking", + "slot": "", + "value": "" + }, + { + "intent": "request", + "domain": "hotel", + "slot": "area", + "value": "" + }, + { + "intent": "thank", + "domain": "", + "slot": "", + "value": "" + }, + { + "intent": "welcome", + "domain": "", + "slot": "", + "value": "" + }, + { + "intent": "request", + "domain": "restaurant", + "slot": "food", + "value": "" + }, + { + "intent": "request", + "domain": "restaurant", + "slot": "pricerange", + "value": "" + }, + { + "intent": "request", + "domain": "booking", + "slot": "day", + "value": "" + }, + { + "intent": "request", + "domain": "hotel", + "slot": "stars", + "value": "" + }, + { + "intent": "bye", + "domain": "", + "slot": "", + "value": "" + }, + { + "intent": "request", + "domain": "hotel", + "slot": "pricerange", + "value": "" + }, + { + "intent": "inform", + "domain": "restaurant", + "slot": "", + "value": "" + }, + { + "intent": "reqmore", + "domain": "", + "slot": "", + "value": "" + }, + { + "intent": "request", + "domain": "hotel", + "slot": "phone", + "value": "" + }, + { + "intent": "request", + "domain": "hotel", + "slot": "parking", + "value": "" + }, + { + "intent": "request", + "domain": "hotel", + "slot": "address", + "value": "" + }, + { + "intent": "select", + "domain": "hotel", + "slot": "", + "value": "" + }, + { + "intent": "nooffer", + "domain": "restaurant", + "slot": "", + "value": "" + }, + { + "intent": "inform", + "domain": "hotel", + "slot": "", + "value": "" + }, + { + "intent": "inform", + "domain": "taxi", + "slot": "", + "value": "" + }, + { + "intent": "request", + "domain": "taxi", + "slot": "leaveAt", + "value": "" + }, + { + "intent": "request", + "domain": "hotel", + "slot": "postcode", + "value": "" + }, + { + "intent": "request", + "domain": "taxi", + "slot": 
"taxi_phone", + "value": "" + }, + { + "intent": "request", + "domain": "booking", + "slot": "time", + "value": "" + }, + { + "intent": "request", + "domain": "taxi", + "slot": "arriveBy", + "value": "" + }, + { + "intent": "request", + "domain": "restaurant", + "slot": "name", + "value": "" + }, + { + "intent": "request", + "domain": "restaurant", + "slot": "phone", + "value": "" + }, + { + "intent": "request", + "domain": "restaurant", + "slot": "postcode", + "value": "" + }, + { + "intent": "request", + "domain": "restaurant", + "slot": "address", + "value": "" + }, + { + "intent": "request", + "domain": "taxi", + "slot": "taxi_types", + "value": "" + }, + { + "intent": "request", + "domain": "taxi", + "slot": "departure", + "value": "" + }, + { + "intent": "book", + "domain": "booking", + "slot": "", + "value": "" + }, + { + "intent": "greet", + "domain": "", + "slot": "", + "value": "" + }, + { + "intent": "request", + "domain": "hotel", + "slot": "internet", + "value": "" + }, + { + "intent": "nooffer", + "domain": "hotel", + "slot": "", + "value": "" + }, + { + "intent": "request", + "domain": "hotel", + "slot": "name", + "value": "" + }, + { + "intent": "request", + "domain": "taxi", + "slot": "destination", + "value": "" + }, + { + "intent": "select", + "domain": "restaurant", + "slot": "", + "value": "" + }, + { + "intent": "request", + "domain": "hotel", + "slot": "type", + "value": "" + }, + { + "intent": "recommend", + "domain": "hotel", + "slot": "", + "value": "" + }, + { + "intent": "request", + "domain": "train", + "slot": "day", + "value": "" + }, + { + "intent": "request", + "domain": "train", + "slot": "departure", + "value": "" + }, + { + "intent": "offerbook", + "domain": "train", + "slot": "", + "value": "" + }, + { + "intent": "request", + "domain": "train", + "slot": "destination", + "value": "" + }, + { + "intent": "select", + "domain": "train", + "slot": "", + "value": "" + }, + { + "intent": "request", + "domain": "train", + "slot": 
"people", + "value": "" + }, + { + "intent": "request", + "domain": "train", + "slot": "leaveAt", + "value": "" + }, + { + "intent": "request", + "domain": "train", + "slot": "duration", + "value": "" + }, + { + "intent": "request", + "domain": "train", + "slot": "price", + "value": "" + }, + { + "intent": "nooffer", + "domain": "train", + "slot": "", + "value": "" + }, + { + "intent": "request", + "domain": "train", + "slot": "arriveBy", + "value": "" + }, + { + "intent": "request", + "domain": "train", + "slot": "Ref", + "value": "" + }, + { + "intent": "inform", + "domain": "train", + "slot": "", + "value": "" + }, + { + "intent": "offerbooked", + "domain": "train", + "slot": "", + "value": "" + }, + { + "intent": "recommend", + "domain": "restaurant", + "slot": "", + "value": "" + }, + { + "intent": "inform", + "domain": "attraction", + "slot": "", + "value": "" + }, + { + "intent": "request", + "domain": "attraction", + "slot": "type", + "value": "" + }, + { + "intent": "request", + "domain": "attraction", + "slot": "phone", + "value": "" + }, + { + "intent": "request", + "domain": "attraction", + "slot": "address", + "value": "" + }, + { + "intent": "request", + "domain": "attraction", + "slot": "entrance fee", + "value": "" + }, + { + "intent": "select", + "domain": "attraction", + "slot": "", + "value": "" + }, + { + "intent": "request", + "domain": "attraction", + "slot": "area", + "value": "" + }, + { + "intent": "request", + "domain": "attraction", + "slot": "postcode", + "value": "" + }, + { + "intent": "request", + "domain": "attraction", + "slot": "name", + "value": "" + }, + { + "intent": "request", + "domain": "attraction", + "slot": "pricerange", + "value": "" + }, + { + "intent": "nooffer", + "domain": "attraction", + "slot": "", + "value": "" + }, + { + "intent": "recommend", + "domain": "attraction", + "slot": "", + "value": "" + } + ], + "state": { + "hotel": { + "area": "", + "internet": "", + "name": "", + "parking": "", + "pricerange": "", + 
"stars": "", + "type": "", + "day": "", + "people": "", + "stay": "" + }, + "restaurant": { + "area": "", + "food": "", + "name": "", + "pricerange": "", + "day": "", + "people": "", + "time": "" + }, + "taxi": { + "arriveBy": "", + "departure": "", + "destination": "", + "leaveAt": "" + }, + "train": { + "arriveBy": "", + "day": "", + "departure": "", + "destination": "", + "leaveAt": "", + "people": "" + }, + "attraction": { + "area": "", + "name": "", + "type": "" + } + } +} \ No newline at end of file diff --git a/data/unified_datasets/multiwoz22/original_data.zip b/data/unified_datasets/multiwoz22/original_data.zip new file mode 100644 index 0000000000000000000000000000000000000000..6fdaeb71ece04f47b6d1b0aa61e1b1ef6a1940fc Binary files /dev/null and b/data/unified_datasets/multiwoz22/original_data.zip differ diff --git a/data/unified_datasets/multiwoz22/preprocess.py b/data/unified_datasets/multiwoz22/preprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..521facf953fd4a4ed4222e4ce9084a0a90a1b518 --- /dev/null +++ b/data/unified_datasets/multiwoz22/preprocess.py @@ -0,0 +1,1431 @@ +import copy +import re +import zipfile +import json +import os +from tqdm import tqdm +import sys +import difflib +from fuzzywuzzy import fuzz +sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) + +from convlab2.util.file_util import read_zipped_json, write_zipped_json +import logging + + +logging.basicConfig(level=logging.INFO) +self_dir = (os.path.abspath(os.getcwd())) + +REF_SYS_DA = { + 'Attraction': { + 'Addr': "address", 'Area': "area", 'Choice': "choice", + 'Fee': "entrance fee", 'Name': "name", 'Phone': "phone", + 'Post': "postcode", 'Price': "pricerange", 'Type': "type", + 'none': None, 'Open': None + }, + 'Hospital': { + 'Department': 'department', 'Addr': 'address', 'Post': 'postcode', + 'Phone': 'phone', 'none': None + }, + 'Booking': { + 'Day': 'day', 'Name': 'name', 'People': 'people', + 'Ref': 
'Ref', 'Stay': 'stay', 'Time': 'time', + 'none': None + }, + 'Hotel': { + 'Addr': "address", 'Area': "area", 'Choice': "choice", + 'Internet': "internet", 'Name': "name", 'Parking': "parking", + 'Phone': "phone", 'Post': "postcode", 'Price': "pricerange", + 'Stars': "stars", 'Type': "type", 'Stay': 'stay', 'Day': 'day', 'People': 'people', + 'none': None + }, + 'Restaurant': { + 'Addr': "address", 'Area': "area", 'Choice': "choice", + 'Name': "name", 'Food': "food", 'Phone': "phone", + 'Post': "postcode", 'Price': "pricerange", + 'Time': 'time', 'Day': 'day', 'People': 'people', + 'none': None + }, + 'Taxi': { + 'Arrive': "arriveBy", 'Car': "taxi_types", 'Depart': "departure", + 'Dest': "destination", 'Leave': "leaveAt", 'Phone': "taxi_phone", + 'none': None + }, + 'Train': { + 'Arrive': "arriveBy", 'Choice': "choice", 'Day': "day", + 'Depart': "departure", 'Dest': "destination", + 'Leave': "leaveAt", 'People': "people", 'Ref': "Ref", + 'Time': "duration", 'none': None, 'Ticket': 'price', + }, + 'Police': { + 'Addr': "address", 'Post': "postcode", 'Phone': "phone", 'none': None + }, +} + +# taxi restaurant attraction train +slot_to_type = { + 'taxi-destination': 'non', + 'taxi-departure': 'non', + 'taxi-leaveAt': 'non', + 'taxi-arriveBy': 'non', + 'restaurant-food': 'non', + 'restaurant-name': 'non', + 'attraction-address': 'non', + 'attraction-postcode': 'non', + 'restaurant-pricerange': 'cat', + 'restaurant-address': 'non', + 'restaurant-area': 'cat', + 'restaurant-postcode': 'non', + 'attraction-phone': 'non', + 'attraction-entrance fee': 'non', + 'booking-time': 'non', + 'booking-day': 'cat', + 'attraction-type': 'non', + 'attraction-area': 'cat', + 'train-destination': 'non', + 'train-arriveBy': 'non', + 'train-departure': 'non', + 'hotel-internet': 'cat', + 'hotel-area': 'cat', + 'booking-stay': 'non', + 'booking-people': 'non', + 'train-duration': 'non', + 'train-people': 'non', + 'train-day': 'cat', + 'train-Ref': 'non', + 'hotel-stars': 'cat', + 
'train-leaveAt': 'non', + 'train-price': 'non', + 'hotel-parking': 'cat', + 'hotel-phone': 'non', + 'hotel-name': 'non', + 'hotel-pricerange': 'cat', + 'hotel-people': 'non', + 'restaurant-phone': 'non', + 'hotel-postcode': 'non', + 'hotel-address': 'non', + 'attraction-name': 'non', + 'hotel-type': 'non', + 'restaurant-people': 'non', + 'train-choice': 'non', + 'attraction-pricerange': 'cat', + 'hotel-stay': 'non', + 'booking-name': 'non', + 'booking-Ref': 'non', + 'restaurant-time': 'non', + 'restaurant-day': 'cat', + 'hotel-day': 'cat', + 'hotel-choice': 'non', + 'restaurant-choice': 'non', + 'attraction-choice': 'non', + 'taxi-taxi_phone': 'non', + 'taxi-taxi_types': 'non', + 'police-address': 'non', + 'police-postcode': 'non', + 'police-phone': 'non' +} + +state_cat_slot_value_dict = { + "hotel-pricerange": { + "cheap": 735, + "moderate": 1063, + "expensive": 594, + }, + "hotel-parking": { + "yes": 1809, + "no": 126, + "free": 4, + }, + "hotel-day": { + "tuesday": 385, + "wednesday": 410, + "monday": 365, + "saturday": 407, + "friday": 393, + "thursday": 384, + "sunday": 369, + }, + "train-day": { + "wednesday": 533, + "monday": 533, + "saturday": 543, + "thursday": 547, + "friday": 563, + "tuesday": 553, + "sunday": 613, + }, + "hotel-stars": { + "4": 1263, + "2": 193, + "0": 201, + "3": 401, + "5": 45, + "1": 45, + }, + "hotel-internet": { + "yes": 1841, + "no": 79, + "free": 2 + }, + "hotel-area": { + "east": 416, + "north": 717, + "centre": 538, + "south": 289, + "west": 316, + }, + "attraction-area": { + "centre": 1290, + "west": 332, + "north": 155, + "south": 240, + "east": 272, + }, + "restaurant-pricerange": { + "expensive": 1477, + "cheap": 758, + "moderate": 1028, + }, + "restaurant-area": { + "centre": 1745, + "south": 398, + "north": 390, + "east": 360, + "west": 423, + }, + "restaurant-day": { + "thursday": 362, + "wednesday": 412, + "friday": 395, + "monday": 383, + "sunday": 399, + "saturday": 421, + "tuesday": 350, + } +} + + +synonyms = [ + 
["el shaddia guesthouse", "el shaddai"], + [ "peterborough", "peterbourgh"], + ["night club", "nightclub", 'nightclubs'], + ["boat", "boating"], + ["portugese", "portuguese"], + ["guesthouse", "guest house"], + ["seafood", "sea food"], + ["christ 's college", "christ college"], + ["huntingdon marriott hotel"] +] + +state_cat_slot_ds = [k for k, v in slot_to_type.items() if v == 'cat'] + +da_cat_slot_values = { + # 'hotel-stay': ['1', '2', '3', '4', '5'], + 'hotel-internet': ['free', 'no', 'none', 'yes'], + 'hotel-parking': ['free', 'no', 'none', 'yes'] +} + +state_cat_slot_values = {} + +multiwoz_desc = { + 'taxi': { + 'domain': 'taxi information query system', + 'taxi_phone': 'taxi phone number', + 'taxi_types': 'taxi type', + }, + 'restaurant': { + 'domain': 'restaurant information query system', + 'address': 'exact location of the restaurant', + 'postcode': 'postcode of the restaurant', + 'phone': 'restaurant phone number', + 'choice': 'number of restaurants meeting requests of user', + }, + 'attraction': { + 'domain': 'an entertainment that is offered to the public', + 'address': 'details of where the attraction is', + 'postcode': 'postcode of the attraction', + 'phone': 'phone number of the attraction', + 'entrance fee': 'the fee charged for admission to the attraction', + 'pricerange': 'the price range for the attraction, from cheap to expensive', + 'choice': 'number of attractions matching requests of user' + }, + 'booking': { + 'domain': 'to arrange with a taxi, restaurant, train, etc.', + 'time': 'time for an order', + 'day': 'day for an order, from monday to sunday', + 'stay': 'for how long the user wish to be at a place', + 'people': 'how many person the order is for', + 'name': 'name of the ordered place', + 'Ref': 'reference number of the order' + }, + 'train': { + 'domain': 'query and order a train', + 'duration': 'the length of time the train trip lasts', + 'Ref': 'reference number of the order', + 'price': 'price for the train ticket', + 'choice': 
'number of trains that meets requests of the user', + }, + 'hotel': { + 'domain': 'to query hotel information and place an order', + 'address': 'exact location of the hotel', + 'postcode': 'postcode of the hotel', + 'phone': 'hotel phone number', + 'choice': 'number of hotels that meets requests of the user', + }, + 'police': { + 'domain': 'find police stations', + 'address': 'exact location of the police station', + 'postcode': 'postcode of the police station', + 'phone': 'police station phone number', + }, + 'intents': { + 'inform': 'inform user of value for a certain slot', + 'request': 'ask for value of a slot', + 'nobook': 'inform user of booking failure', + 'reqmore': 'ask user for more instructions', + 'book': 'place an order for user', + 'bye': 'end a conversation and say goodbye to user', + 'thank': 'express gratitude', + 'welcome': 'welcome', + 'offerbooked': 'inform user that an order is succussful', + 'recommend': 'recommend a choice for user request', + 'greet': 'express greeting', + 'nooffer': 'inform user that no options matches user request', + 'offerbook': 'offer to place an order for user', + 'select': 'provide several choices for user to choose from', + } +} + +digit2word = { + '0': 'zero', '1': 'one', '2': 'two', '3': 'three', '4': 'four', '5': 'five', + '6': 'six', '7': 'seven', '8': 'eight', '9': 'nine', '10': 'ten' +} + + +def pharse_in_sen(phrase, sen): + ''' + match value in the sentence + :param phrase: str + :param sen: str + :return: start, end if matched, else None, None + ''' + assert isinstance(phrase, str) + pw = '(^|[\s,\.:\?!-])(?P<v>{})([\s,\.:\?!-]|$)' + pn = '(^|[\s\?!-]|\D[,\.:])(?P<v>{})($|[\s\?!-]|[,\.:]\D|[,\.:]$)' + + if phrase.isdigit() and phrase in digit2word: + phrase = digit2word[phrase] + p = re.compile(pw.format(re.escape(phrase)), re.I) + m = re.search(p, sen) + if m: + num = len(re.findall(p, sen)) + # if num > 1: + # match['>1'] += 1 + # else: + # match['1'] += 1 + return m.span('v'), num + # match['0'] += 1 + if 
phrase.isdigit(): + pattern = pn + else: + pattern = pw + p = re.compile(pattern.format(re.escape(phrase)), re.I) + m = re.search(p, sen) + if m: + num = len(re.findall(p, sen)) + # if num > 1: + # match['>1'] += 1 + # else: + # match['1'] += 1 + return m.span('v'), num + return (None, None), 0 + + + + + +def update_state(state, update): + # print('======================') + # print(state) + # print(update) + # print('======================') + + for service, service_update in update.items(): + if service not in state: + state[service] = copy.deepcopy(service_update) + else: + state[service].update(update[service]) + + +def convert_da(utt, da_dict, binary_ont, intent_ont, did, tid, da_cat_slot_values): + ''' + convert multiwoz dialogue acts to required format + :param utt: user or system utt + :param da_dict: multiwoz da + :param binary_ont: binary ontology + :param intent_ont: intent ontology + :return: + ''' + converted_da = { + 'categorical': [], + 'non-categorical': [], + 'binary': [] + } + + for Domain_Act, S, v in da_dict: + Domain, Act = Domain_Act.split('-') + if Domain.lower() in ['police', 'hospital', 'bus']: + continue + + if Act.lower() not in intent_ont: + intent_ont[Act.lower()] = {} + + # general domain is converted to empty domain. e.g. 
thank, bye + if Domain == 'general': + assert S == 'none' + assert v == 'none' + converted_dict = { + 'intent': Act.lower(), + 'domain': '', + 'slot': '', + 'value': '' + } + converted_da['binary'].append(converted_dict) + + if converted_dict not in binary_ont: + binary_ont.append(converted_dict) + continue + + + + try: + reformated_slot = REF_SYS_DA[Domain][S] + except: + # print('44444444444444444444444444444444') + # print(Domain, S) + # logging.info('slot not in REF_SYS_DA, drop') + continue + + # if slot is None, da should be converted into binary + if reformated_slot is None: + if not (S == 'none' and v == 'none'): + # mainly for `Open` slot + # print('11111111111111111111') + # print(Domain_Act, S, v) + continue + # Booking-Inform none none + # Police-Inform none none + # Train-OfferBook none none + converted_dict = { + 'intent': Act.lower(), + 'domain': Domain.lower(), + 'slot': '', + 'value': '' + } + converted_da['binary'].append(converted_dict) + if converted_dict not in binary_ont: + binary_ont.append(converted_dict) + continue + + reformated_domain_slot = Domain.lower() + '-' + reformated_slot + + if Act.lower() == 'request': + converted_dict = { + 'intent': 'request', + 'domain': Domain.lower(), + 'slot': reformated_slot, + 'value': '' + } + converted_da['binary'].append(converted_dict) + + if converted_dict not in binary_ont: + binary_ont.append(converted_dict) + continue + + # vs = da_dict[(Domain_Act, S)]['values'] + + if reformated_domain_slot in slot_to_type and slot_to_type[reformated_domain_slot] == 'cat': + origin_v = v + v = v.lower() + # if reformated_domain_slot in cat_slot_proj: + # v = cat_slot_proj[reformated_domain_slot][v] + if reformated_domain_slot not in da_cat_slot_values: + da_cat_slot_values[reformated_domain_slot] = [] + # if v not in cat_slot_values[reformated_domain_slot]: + da_cat_slot_values[reformated_domain_slot].append(v) + converted_da['categorical'].append({ + 'intent': Act.lower(), + 'domain': Domain.lower(), + 'slot': 
reformated_slot, + 'value': v + }) + if 'start_word' in da_dict[(Domain_Act, S, origin_v)]: + start_ws = da_dict[(Domain_Act, S, origin_v)]['start_word'] + end_ws = da_dict[(Domain_Act, S, origin_v)]['end_word'] + utt_list = utt.split() + for start_w, end_w in zip(start_ws, end_ws): + if start_w > len(utt_list) or end_w > len(utt_list): + continue + start_ch = 0 + for i in range(start_w): + start_ch += len(utt_list[i]) + 1 + end_ch = start_ch + for i in range(start_w, end_w): + end_ch += len(utt_list[i]) + 1 + try: + end_ch += len(utt_list[end_w]) + except: + print(utt_list, start_w, end_w) + if not utt[start_ch: end_ch] == origin_v: + # print('2222222222222222222222222') + # print('\n'.join([v, utt[start_ch: end_ch - 1]])) + continue + + else: + converted_da['categorical'][-1].update({ + 'start': start_ch, + 'end': end_ch + }) + break + + else: + if 'start_word' not in da_dict[(Domain_Act, S, v)]: + # todo no span annotation + converted_da['non-categorical'].append({ + 'intent': Act.lower(), + 'domain': Domain.lower(), + 'slot': reformated_slot, + 'value': v + }) + continue + + start_ws = da_dict[(Domain_Act, S, v)]['start_word'] + end_ws = da_dict[(Domain_Act, S, v)]['end_word'] + utt_list = utt.split() + found = True + for start_w, end_w in zip(start_ws, end_ws): + if start_w > len(utt_list) or end_w > len(utt_list): + continue + start_ch = 0 + for i in range(start_w): + start_ch += len(utt_list[i]) + 1 + end_ch = start_ch + for i in range(start_w, end_w): + end_ch += len(utt_list[i]) + 1 + try: + end_ch += len(utt_list[end_w]) + except: + print(utt_list, start_w, end_w, v) + if not utt[start_ch: end_ch] == v: + # print('2222222222222222222222222') + # print('\n'.join([v, utt[start_ch: end_ch - 1]])) + continue + + else: + found = True + converted_da['non-categorical'].append({ + 'intent': Act.lower(), + 'domain': Domain.lower(), + 'slot': reformated_slot, + 'value': v, + 'start': start_ch, + 'end': end_ch + }) + break + + if not found: + 
converted_da['non-categorical'].append({ + 'intent': Act.lower(), + 'domain': Domain.lower(), + 'slot': reformated_slot, + 'value': v + }) + return converted_da + + +def get_state_update(prev_state, cur_state, dialog, did, tid, utt, coref_dict, slot_notfound_dict, da_cat_slot_values): + prev_turns = dialog['turns'] + state_update = {'categorical': [], 'non-categorical': []} + notfoundnum = 0 + total_value = 0 + + diff_state = {} + if prev_state is None: + diff_state = {domain: {slot: value for slot, value in cur_state[domain].items() if value != ''} for domain in + cur_state} + else: + assert len(prev_state) == len(cur_state), print(prev_state, cur_state) + for domain, domain_state in prev_state.items(): + if domain not in diff_state: + diff_state[domain] = {} + for slot, value in domain_state.items(): + if value != cur_state[domain][slot]: + # assert len(cur_state[domain][slot]) > 0, print(did, tid, domain, slot, utt) + diff_state[domain][slot] = cur_state[domain][slot] + + ret_diff_state = copy.deepcopy(diff_state) + + + + for domain in diff_state: + for slot in diff_state[domain]: + + total_value += 1 + fix_or = False + if '|' in diff_state[domain][slot]: + value = diff_state[domain][slot].split('|')[0] + else: + value = diff_state[domain][slot] + + # if dialog['original_id'] == 'PMUL2512' and tid == 17 and value == '02:45': + # value = '2:45' + + value_list = [value] + for _synonyms in synonyms: + if value in _synonyms: + value_list = _synonyms + + value_list.extend(get_time_variants(value)) + value_list.extend(get_genitive_variants(value)) + value_list.extend(get_bb_variants(value)) + + if value.endswith(' restaurant'): + value_list.append(value.split(' restaurant')[0]) + if value.endswith(' hotel'): + value_list.append(value.split(' hotel')[0]) + found = False + for value in value_list: + # categorical slots + if slot in ['internet', 'parking', 'pricerange', 'day', 'area', 'stars']: + reformated_domain_slot = '-'.join([domain, slot]) + if 
reformated_domain_slot in state_cat_slot_value_dict and (value in state_cat_slot_value_dict[reformated_domain_slot] or value in ['dontcare', '', 'none', 'not mentioned']): + state_update['categorical'].append({ + 'domain': domain, + 'slot': slot, + 'value': diff_state[domain][slot] + }) + if domain + '-' + slot not in da_cat_slot_values: + da_cat_slot_values[domain + '-' + slot] = [diff_state[domain][slot]] + da_cat_slot_values[domain + '-' + slot].append(diff_state[domain][slot]) + if value != diff_state[domain][slot]: + state_update['categorical'][-1].update({'fixed_value': value}) + ret_diff_state[domain][slot] = value + else : + for _turn in prev_turns[::-1]: + found = False + for da in _turn['dialogue_act']['categorical']: + if da['value'] == value: + if 'start' in da: + state_update['categorical'][-1].update({ + 'utt_idx': _turn['utt_idx'], + 'start': da['start'], + 'end': da['end'], + 'from': 'prev_da_span' + }) + found = True + break + if found: + break + else: + state_update['categorical'].append({ + 'domain': domain, + 'slot': slot, + 'value': diff_state[domain][slot], + 'fixed_value': 'not found' + }) + if domain + '-' + slot not in da_cat_slot_values: + da_cat_slot_values[domain + '-' + slot] = [diff_state[domain][slot]] + da_cat_slot_values[domain + '-' + slot].append(diff_state[domain][slot]) + ret_diff_state[domain][slot] = 'not found' + notfoundnum += 1 + # reformated_domain_slot = '-'.join([domain, slot] + found = True + break + + # process value ---> none + assert value not in ['none', 'not mentioned'] + if value in ['', 'dontcare']: + # if reformated_domain_slot not in state_cat_slot_values: + # state_cat_slot_values[reformated_domain_slot] = [] + # # if v not in cat_slot_values[reformated_domain_slot]: + # state_cat_slot_values[reformated_domain_slot].append(value) + state_update['non-categorical'].append({ + 'domain': domain, + 'slot': slot, + 'value': diff_state[domain][slot] + }) + found = True + break + + # first look for values in 
coref_dict + for _Domain_Act, _Slot, _value in coref_dict: + _domain, _act = _Domain_Act.lower().split('-') + _slot = _Slot.lower() + _coref_value = coref_dict[(_Domain_Act, _Slot, _value)]['coref_value'] + if _coref_value == '': + continue + _coref_turn = coref_dict[(_Domain_Act, _Slot, _value)]['turn'] + if _coref_turn == -1: + continue + _coref_pos = coref_dict[(_Domain_Act, _Slot, _value)]['pos'] + if _coref_pos == '': + continue + _utt = coref_dict[(_Domain_Act, _Slot, _value)]['utt'] + if _domain == domain and _slot == slot and value == _coref_value: + + start_w, end_w = [int(p) for p in _coref_pos.split('-')] + utt_list = _utt.split() + start_ch = 0 + for i in range(start_w): + start_ch += len(utt_list[i]) + 1 + end_ch = start_ch + for i in range(start_w, end_w + 1): + end_ch += len(utt_list[i]) + 1 + end_ch -= 1 + + if not _utt[start_ch: end_ch] == _coref_value: + # print(111111111111111111111111111111111) + # print(_utt[start_ch: end_ch], _coref_value) + continue + + state_update['non-categorical'].append({ + 'domain': domain, + 'slot': slot, + 'value': diff_state[domain][slot], + 'from': 'coref', + 'utt_idx': _coref_turn, + 'start': start_ch, + 'end': end_ch + }) + if value != diff_state[domain][slot]: + state_update['categorical'][-1].update({'fixed_value': value}) + ret_diff_state[domain][slot] = value + found = True + + if found: + break + + # from da annotation + for _turn in prev_turns[::-1]: + for da in _turn['dialogue_act']['non-categorical']: + # if da['domain'] == domain and da['slot'] == slot and fuzz.ratio(da['value'], value) > 85: + # if not da['value'] == value: + # print(1111111111111111) + # print(value, da['value']) + + if fuzz.ratio(da['value'], value) > 85: + + if 'start' in da: + found = True + state_update['non-categorical'].append({ + 'domain': domain, + 'slot': slot, + # 'value': da['value'], + 'value': diff_state[domain][slot], + 'utt_idx': _turn['utt_idx'], + 'start': da['start'], + 'end': da['end'], + 'from': 'prev_da_span' + }) + 
if value != diff_state[domain][slot]: + state_update['non-categorical'][-1].update({'fixed_value': value}) + ret_diff_state[domain][slot] = value + if da['value'] != value: + state_update['non-categorical'][-1].update({'fixed_value':da['value']}) + ret_diff_state[domain][slot] = da['value'] + + break + if found: + break + + if found: + break + + # from utterance + for _turn in prev_turns[::-1]: + _utt = _turn['utterance'] + (start, end), num = pharse_in_sen(str(value), _utt) + if num: + assert value.lower() == _utt[start:end].lower() \ + or digit2word[value].lower() == _utt[start:end].lower() + found = True + state_update['non-categorical'].append({ + 'domain': domain, + 'slot': slot, + 'value': diff_state[domain][slot], + # 'value': _utt[start:end].lower(), + # 'fixed_value': _utt[start:end].lower(), + 'from': 'prev_utt', + 'utt_idx': _turn['utt_idx'], + 'start': start, + 'end': end + }) + if value != diff_state[domain][slot]: + state_update['non-categorical'][-1].update({'fixed_value': value}) + ret_diff_state[domain][slot] = value + if value != _utt[start:end].lower(): + state_update['non-categorical'][-1].update({'fixed_value': _utt[start:end].lower()}) + ret_diff_state[domain][slot] = _utt[start:end].lower() + found = True + break + if found: + break + + # from utterance + if not value.isdigit(): + for _turn in prev_turns[::-1]: + _utt = _turn['utterance'] + + s = difflib.SequenceMatcher(None, _utt, value) + matches = s.get_matching_blocks() + + for i, j, n in matches: + possible_value = _utt[i: i+len(value)] + + if i+ len(value) < len(_utt) and _utt[i+len(value)] not in [ ' ', ',', '.', '?', '!', '/'] : + possible_value += _utt[i+len(value):].split()[0] + + if possible_value.startswith('th '): + possible_value = possible_value[3:] + i += 3 + if i > 0 and _utt[i-1] not in [ ' ', ',', '.', '?', '!', '/']: + # cut first incomplete word + if len(possible_value.split()) > 1: + i += len(possible_value.split()[0]) + 1 + possible_value = ' 
'.join(possible_value.split()[1:]) + + + # prepend first incomplete word + # possible_value = _utt[:i].split()[-1] + possible_value + # i -= len(_utt[:i].split()[-1]) + + + if fuzz.token_sort_ratio(value, possible_value) > 92 or possible_value.startswith('ashley hotel and lovell lodge') : + found = True + + state_update['non-categorical'].append({ + 'domain': domain, + 'slot': slot, + 'value': diff_state[domain][slot], + # 'value': possible_value, + # 'fixed_value': possible_value, + 'from':'prev_utt', + 'utt_idx': _turn['utt_idx'], + 'start': i, + 'end': i+len(possible_value) + }) + if value != diff_state[domain][slot]: + state_update['non-categorical'][-1].update({'fixed_value': value}) + ret_diff_state[domain][slot] = value + if possible_value != value: + state_update['non-categorical'][-1].update({'fixed_value': possible_value}) + ret_diff_state[domain][slot] = possible_value + break + # assert _utt[i:i+len(possible_value)] == possible_value, print(_utt, _utt[i:i+len(possible_value)], possible_value) + # break + # if not possible_value == value: + # print(3333333333333333) + # print(value) + # print(possible_value) + if found: + break + if found: + break + + if found: + break + if not found: + # print('3333333333333333333') + # print(did, tid) + # print(domain, slot, value) + # print([_t['utterance'] for _t in prev_turns]) + # assert slot not in ['internet', 'parking', 'pricerange', 'day', 'area', 'stars'] + + if (domain, slot) not in slot_notfound_dict: + slot_notfound_dict[(domain, slot)] = 1 + else: + slot_notfound_dict[(domain, slot)] += 1 + state_update['non-categorical'].append({ + 'domain': domain, + 'slot': slot, + 'value': diff_state[domain][slot], + 'fixed_value': 'not found' + }) + ret_diff_state[domain][slot] = 'not found' + notfoundnum += 1 + return state_update, notfoundnum, total_value, ret_diff_state + + +def merge_data_annotation(): + extract_dir = os.path.join(self_dir, 'original_data') + data21 = json.load(open(os.path.join(self_dir, 
def _normalize_utterance(text):
    """Lower-case *text*, repair a few known typos and space out punctuation."""
    utt = text.lower()
    # for several wrong words
    utt = utt.replace('seeuni', 'see uni')

    # collapse every whitespace run into a single space
    utt = ' '.join(utt.split())
    utt = utt.replace(' im ', ' i\'m ')
    utt = utt.replace(' dont ', ' don\'t ')
    utt = utt.replace(' thats ', ' that\'s ')
    utt = utt.replace('idon\'t', ' i don\'t ')
    utt = utt.replace('wedon\'t ', 'we don\'t ')
    utt = utt.replace('id be ', 'i\'d be ')
    utt = re.sub(r'^im ', 'i\'m ', utt)
    utt = re.sub(r'^whats ', 'what\'s ', utt)
    utt = re.sub(r'^id ', 'i\'d ', utt)
    utt = re.sub(r'^thats ', 'that\'s ', utt)

    # NOTE(review): inside the character class, `'-\?` is a *range* from "'"
    # to "?" and therefore also matches digits, ":" and other characters in
    # between.  Left byte-identical so the generated data does not change;
    # confirm whether a literal "-" was intended.
    utt = re.sub(r'([a-zA-Z0-9])([,.!\'-\?"~])', r'\1 \2', utt)
    utt = re.sub(r'([,.!\'-\?"~])([a-zA-Z0-9])', r'\1 \2', utt)
    return utt


def preprocess(da_cat_slot_values, state_cat_slot_values):
    """Convert the original annotations into the unified format.

    When ``data.zip`` or ``ontology.json`` is missing next to this script the
    data is rebuilt from ``original_data`` and both files (plus
    ``cat_slot_values.json``) are written; otherwise the cached files are
    loaded.

    Args:
        da_cat_slot_values: dict mapping a ``domain-slot`` key to the list of
            values observed for it in dialogue acts.  It is passed to
            ``convert_da`` / ``get_state_update`` and later used to fill
            ``possible_values`` of categorical slots in the ontology.
        state_cat_slot_values: dict mapping a key to the list of values
            observed in states; its value counts are dumped to
            ``cat_slot_values.json``.

    Returns:
        tuple: ``(all_data, new_ont)`` - the list of converted dialogues and
        the ontology dict.
    """
    all_data = []
    binary_ont = []
    intent_ont = {}
    state_ont = {}

    data_splits = ['train', 'val', 'test']
    extract_dir = os.path.join(self_dir, 'original_data')
    num_train_dialogue = 0
    num_train_utt = 0

    num_match_error_da_span = 0

    # The cache is read from / written to self_dir, so it must also be looked
    # up there rather than in the current working directory.
    data_zip_path = os.path.join(self_dir, './data.zip')
    ontology_path = os.path.join(self_dir, './ontology.json')

    if not os.path.exists(data_zip_path) or not os.path.exists(ontology_path):
        data = merge_data_annotation()

        # One set of dialogue ids per split (O(1) membership, no eval()).
        split_ids = {}
        for split_name, list_file in (('train', 'trainListFile'),
                                      ('val', 'valListFile'),
                                      ('test', 'testListFile')):
            with open(os.path.join(extract_dir, list_file)) as f:
                split_ids[split_name] = set(f.read().split())

        total_not_found_slot = 0
        total_slot = 0
        total_turn = 0
        total_not_found_turn = 0
        total_not_found_state = 0

        slot_notfound_dict = {}

        dialog_idx = 0
        for dialog_id, dialog in tqdm(data.items()):

            acc_not_found_flag = False

            coref_dict = {}

            # str.strip('.json') strips a *character set*; remove the suffix
            # explicitly instead.
            if dialog_id.endswith('.json'):
                bare_dialog_id = dialog_id[:-len('.json')]
            else:
                bare_dialog_id = dialog_id

            data_split = None
            for _split in data_splits:
                if bare_dialog_id in split_ids[_split]:
                    data_split = _split
                    break
            if data_split is None:
                # dialogue does not belong to any split list
                continue

            if data_split == 'train':
                # count this dialogue once (was: += len(data))
                num_train_dialogue += 1

            dialog_idx += 1
            converted_dialogue = {
                'dataset': 'multiwoz22',
                'data_split': data_split,
                'dialogue_id': 'multiwoz22_' + str(dialog_idx),
                'original_id': dialog_id,
                'domains': [d for d in dialog['goal'] if
                            len(dialog['goal'][d]) != 0 and d in multiwoz_desc and d not in ['police', 'hospital', 'bus']],
                'turns': [],
            }

            if data_split == 'train':
                num_train_utt += len(dialog['log'])

            prev_state = None
            accum_fixed_state = {}
            for turn_id, turn in enumerate(dialog['log']):

                utt = _normalize_utterance(turn['text'])

                das = turn.get('dialog_act', [])
                # even turns are user turns, odd turns are system turns
                role = 'user' if turn_id % 2 == 0 else 'system'
                spans = turn.get('span_info', [])

                # (Domain-Act, slot, value) -> {} / {'start_word': [...], 'end_word': [...]}
                da_dict = {}
                for domain_act in das:
                    da_domain = domain_act.split('-')[0].lower()
                    if da_domain not in converted_dialogue['domains'] and da_domain not in ['general', 'booking']:
                        continue

                    for S, v in das[domain_act]:
                        v = v.lower()
                        # drop leftover fragments of a leading "the "
                        if v.startswith('th '):
                            v = v[3:]
                        if v.startswith('he '):
                            v = v[3:]

                        if (domain_act, S, v) not in da_dict:
                            da_dict[(domain_act, S, v)] = {}

                for span in spans:
                    domain_act, S, v, start_word, end_word = span
                    v = v.lower()
                    if (domain_act, S, v) not in da_dict:
                        # span annotation without a matching dialogue act
                        num_match_error_da_span += 1
                        continue

                    if v.startswith('th '):
                        v = v[3:]
                        start_word += 3
                    if v.startswith('he '):
                        v = v[3:]
                        start_word += 3

                    span_entry = da_dict[(domain_act, S, v)]
                    if 'start_word' not in span_entry:
                        span_entry['start_word'] = []
                        span_entry['end_word'] = []

                    span_entry['start_word'].append(start_word)
                    span_entry['end_word'].append(end_word)

                converted_turn = {
                    'utt_idx': turn_id,
                    'speaker': role,
                    'utterance': utt,
                    'dialogue_act': convert_da(utt, da_dict, binary_ont, intent_ont, dialog_id, turn_id, da_cat_slot_values),
                }

                # for state annotations
                if role == 'system':
                    turn_state = turn['metadata']
                    cur_state = {}
                    for domain in turn_state:
                        if domain in ['police', 'hospital', 'bus']:
                            continue
                        if domain not in converted_dialogue['domains']:
                            continue
                        cur_state[domain] = {}
                        for subdomain in ['semi', 'book']:
                            for slot in turn_state[domain][subdomain]:
                                if slot == 'booked':
                                    continue
                                if slot == 'ticket':
                                    # for cases where domain slot exists in REF but not in state
                                    # because of check in evaluate.py
                                    continue

                                fixed_slot = slot
                                state_ds = domain + '-' + fixed_slot
                                if state_ds not in slot_to_type:
                                    logging.info('state slot not defined in da list')
                                    logging.info(state_ds)

                                slot_value = turn_state[domain][subdomain][slot]
                                if slot_value in ['', [], 'not mentioned', 'none']:
                                    cur_state[domain][fixed_slot] = ""
                                else:
                                    if slot_value.startswith('th '):
                                        slot_value = slot_value[3:]
                                    if slot_value.startswith('he '):
                                        slot_value = slot_value[3:]
                                    # keep the source annotation in sync, as before
                                    turn_state[domain][subdomain][slot] = slot_value
                                    cur_state[domain][fixed_slot] = slot_value

                                if domain not in state_ont:
                                    state_ont[domain] = []
                                if fixed_slot not in state_ont[domain]:
                                    state_ont[domain].append(fixed_slot)

                        if domain == 'train' and 'people' not in cur_state[domain]:
                            cur_state[domain]['people'] = ''

                    if len(converted_dialogue['turns']) > 0:
                        # move state from system side to user side
                        converted_dialogue['turns'][-1]['state'] = copy.deepcopy(cur_state)

                    # for state update annotations
                    state_update, _notfoundslot, _totalslot, ret_diff_state = get_state_update(
                        prev_state, cur_state, converted_dialogue,
                        dialog_id, turn_id, turn['text'], coref_dict,
                        slot_notfound_dict, da_cat_slot_values)

                    update_state(accum_fixed_state, ret_diff_state)
                    for domain in accum_fixed_state:
                        for slot in accum_fixed_state[domain]:
                            assert isinstance(accum_fixed_state[domain][slot], str), \
                                repr(accum_fixed_state[domain][slot])

                    if _notfoundslot == 0:
                        for slot in state_update['non-categorical']:
                            if slot['value'] not in ['', 'dontcare']:
                                assert 'utt_idx' in slot
                    else:
                        # at least one update must be marked as fixed / not located
                        flag = False
                        for slot in state_update['categorical']:
                            if 'fixed_value' in slot:
                                flag = True
                                break
                        for slot in state_update['non-categorical']:
                            if 'utt_idx' not in slot:
                                flag = True
                                break
                        assert flag, repr(state_update['non-categorical'])

                    total_turn += 1
                    total_slot += _totalslot
                    total_not_found_slot += _notfoundslot
                    total_not_found_turn += 1 if _notfoundslot > 0 else 0
                    if _notfoundslot > 0:
                        acc_not_found_flag = True
                    if acc_not_found_flag:
                        total_not_found_state += 1

                    coref_dict = {}
                    converted_dialogue['turns'][-1]['state_update'] = copy.deepcopy(state_update)
                    converted_dialogue['turns'][-1]['fixed_state'] = copy.deepcopy(accum_fixed_state)
                    if 'state' not in converted_dialogue['turns'][-1]:
                        converted_dialogue['turns'][-1]['state'] = {}
                    prev_state = copy.deepcopy(cur_state)

                converted_dialogue['turns'].append(converted_turn)

                if 'coreference' in turn:
                    for domain_act in turn['coreference']:
                        for Slot, value, coref, coref_turn, coref_pos in turn['coreference'][domain_act]:
                            value = value.lower()
                            coref_dict[(domain_act, Slot, value)] = {
                                'turn': coref_turn,
                                'pos': coref_pos,
                                'coref_value': coref,
                                'utt': converted_dialogue['turns'][coref_turn]['utterance'],
                            }

            check_spans(converted_dialogue)
            # a dialogue must end with a user turn
            if converted_dialogue['turns'][-1]['speaker'] == 'system':
                converted_dialogue['turns'].pop(-1)
            all_data.append(converted_dialogue)

        print('total_turn', total_turn)
        print('total_not_found_turn', total_not_found_turn)
        print('total_slot', total_slot)
        print('total_not_found_slot', total_not_found_slot)
        print('total_not_found_state', total_not_found_state)
        print(slot_notfound_dict)
        from collections import Counter
        with open(os.path.join(self_dir, 'cat_slot_values.json'), 'w') as f:
            json.dump({k: dict(Counter(v)) for k, v in state_cat_slot_values.items()}, f, indent=4)
        da_cat_slot_values = {k: list(set(v)) for k, v in da_cat_slot_values.items()}

        # data.json is a temporary file that only exists to be zipped
        with open('data.json', 'w') as f:
            json.dump(all_data, f, indent=4)
        write_zipped_json(data_zip_path, 'data.json')
        os.remove('data.json')

        new_ont = {
            'domains': {},
            'intents': {},
            'binary_dialogue_act': {}
        }

        for d_s in slot_to_type:
            d, s = d_s.split('-')
            if d not in new_ont['domains']:
                new_ont['domains'][d] = {
                    'description': multiwoz_desc[d]['domain'],
                    'slots': {}
                }
            domain_ont = new_ont['domains'][d]
            # every slot may be defined only once per domain
            assert s not in domain_ont['slots'], d_s
            possible_values = da_cat_slot_values[d_s] if d_s in state_cat_slot_ds else []
            domain_ont['slots'][s] = {
                'description': multiwoz_desc[d][s] if s in multiwoz_desc[d] else '',
                'is_categorical': d_s in state_cat_slot_ds,
                'possible_values': [_ for _ in possible_values if _ not in ['dontcare', '']]
            }

        new_ont['state'] = {}
        print(state_ont)
        for d in state_ont:
            new_ont['state'][d] = {}
            for s in state_ont[d]:
                new_ont['state'][d][s] = ''

        new_ont['intents'] = {i: {'description': multiwoz_desc['intents'][i]} for i in intent_ont}
        new_ont['binary_dialogue_act'] = binary_ont

        with open(os.path.join(extract_dir, './slot_descriptions.json')) as f:
            slot_desc = json.load(f)
        for domain_slot in slot_desc:
            _domain, _slot = domain_slot.split('-')
            _desc = slot_desc[domain_slot][0]
            # map the description file's slot names onto the names used here
            if _slot == 'arriveby':
                _slot = 'arriveBy'
            elif _slot == 'leaveat':
                _slot = 'leaveAt'
            if 'book' in _slot:
                _slot = _slot.replace('book ', '')
            if _domain not in new_ont['state']:
                continue
            if _domain in new_ont['domains'] and _slot in new_ont['domains'][_domain]['slots']:
                new_ont['domains'][_domain]['slots'][_slot]['description'] = _desc
            if _slot not in new_ont['state'][_domain]:
                logging.info('domain {} slot {} not in state'.format(_domain, _slot))
                continue
            assert _domain in new_ont['domains'], _domain
            assert _slot in new_ont['domains'][_domain]['slots']

        logging.info('num_match_error_da_span {}'.format(num_match_error_da_span))
        with open(ontology_path, 'w') as f:
            json.dump(new_ont, f, indent=4)

    else:
        all_data = read_zipped_json(data_zip_path, 'data.json')
        with open(ontology_path, 'r') as f:
            new_ont = json.load(f)
    logging.info('# dialogue: {}, # turn: {}'.format(num_train_dialogue, num_train_utt))
    return all_data, new_ont
changes for every utterance +# if turn['utt_idx'] not in changed_utt_idx_and_position: +# if utt_idx == turn['utt_idx'] and start-1 > -1 and turn['utterance'][start-1] not in [' ']: +# changed_utt_idx_and_position[turn['utt_idx']] = start +# print('=======================') +# print(dialog['original_id']) +# print(turn['utterance']) +# print(json.dumps(_update, indent=2)) +# print(turn['utterance'][start: end]) +# turn['utterance'] = turn['utterance'][:start] + ' ' + turn['utterance'][start:] +# print(turn['utterance']) +# _update['start'] += 1 +# _update['end'] += 1 +# changed = True +# if utt_idx not in changed_utt_idx_and_position: +# continue +# else: +# value = _update['fixed_value'] if 'fixed_value' in _update and _update['fixed_value'] != 'not found' else _update['value'] +# if start >= changed_utt_idx_and_position[utt_idx]: +# if dialog['turns'][utt_idx]['utterance'][_update['start']: _update['end']] != value: +# assert dialog['turns'][utt_idx]['utterance'][_update['start']+1: _update['end']+1] == value, print(dialog['turns'][utt_idx]['utterance'], dialog['turns'][utt_idx]['utterance'][_update['start']+1: _update['end']+1]) +# _update['start'] += 1 +# _update['end'] += 1 +# elif start < changed_utt_idx_and_position[utt_idx] < end: +# if dialog['turns'][utt_idx]['utterance'][_update['start']: _update['end']] != value: +# assert (dialog['turns'][utt_idx]['utterance'][_update['start']: _update['end']+1]).replace(' ', '') == value.replace(' ', ''), print(dialog['turns'][utt_idx]['utterance'], dialog['turns'][utt_idx]['utterance'][_update['start']: _update['end']+1], value) +# print('fix') +# print(_update) +# _update['end'] += 1 +# _update['fixed_value'] = turn['utterance'][_update['start']: _update['end'] + 1].strip() +# print(_update) +# if changed: +# for _update in turn['state_update']['non-categorical']: +# if 'utt_idx' in _update: +# utt_idx = _update['utt_idx'] +# start = _update['start'] +# end = _update['end'] +# +# if utt_idx not in 
def get_time_variants(time_text):
    """Return alternative spellings of the first clock time in *time_text*.

    The first ``H:MM`` / ``HH:MM`` occurrence (optionally followed by
    ``am``/``pm``) is used.  The returned list starts with *time_text* itself
    and is followed by other ways of writing the same time (12h <-> 24h,
    with/without leading zero, with/without ``:``).

    Args:
        time_text: string that may contain a clock time.

    Returns:
        list of str; empty when no time pattern is found.
    """
    pattern_time = r'(\d{1,2}:\d{2})(\s)?(am|pm|AM|PM)?'
    match_times = re.findall(pattern_time, time_text)
    if not match_times:
        return []
    clock, _, meridiem = match_times[0]
    meridiem = meridiem.lower()
    hour_text, minute = clock.split(':')
    hour = int(hour_text)

    value_list = [time_text]
    if meridiem == 'am':
        # 4:00am -> 4:00
        value_list.append(clock)
        if len(clock) == 4:
            # 4:00 -> 04:00
            value_list.append('0' + clock)
        if hour == 12:
            # 12:15am is 00:15 on the 24-hour clock
            value_list.append('00:' + minute)
            value_list.append('0:' + minute)
    elif meridiem == 'pm':
        # 4:00pm -> 16:00; 12:30pm stays 12:30 (was wrongly turned into 24:30)
        value_list.append(str(hour % 12 + 12) + ':' + minute)
    else:
        # no am/pm marker: treat the time as 24-hour; 12:xx is noon, i.e. pm
        suffix = 'pm' if hour >= 12 else 'am'
        hour12 = str(hour - 12 if hour > 12 else hour)
        value_list.append(hour12 + ':' + minute + suffix)
        value_list.append(hour12 + ':' + minute + ' ' + suffix)
        value_list.append(hour12 + ':' + minute)
        if minute == '00':
            # 16:00 -> 4pm / 4 pm / 4
            value_list.append(hour12 + suffix)
            value_list.append(hour12 + ' ' + suffix)
            value_list.append(hour12)
        if len(clock) == 5 and clock[0] == '0':
            # 04:00 -> 4:00
            value_list.append(clock[1:])
        # 16:00 -> 1600
        value_list.append(''.join(clock.split(':')))

    return value_list


def get_genitive_variants(value):
    """Return genitive spellings of *value* (``johns x`` -> ``john 's x`` / ``john's x``).

    Every ``s`` that is directly followed by whitespace is rewritten, once as
    ``" 's"`` and once as ``"'s"``.  Only results that differ from *value*
    are returned, so the list is empty when nothing matched.
    """
    ret_list = []
    # NOTE(review): the leading lookahead `(?=\w)` is always satisfied by the
    # following "s" itself; a lookbehind may have been intended.  Kept as-is.
    value_genitive_format = r"(?=\w)s(?=\s)"
    value_pattern = re.compile(value_genitive_format)

    for replacement in (" 's", "'s"):
        span_genitive_value = re.sub(value_pattern, replacement, value)
        if span_genitive_value != value:
            ret_list.append(span_genitive_value)
    return ret_list


def check_spans(dialog):
    """Verify the character spans of non-categorical state updates.

    For every user turn, each non-categorical update that carries ``utt_idx``
    must satisfy ``utterance[start:end] == value`` for the referenced turn,
    where ``value`` is ``fixed_value`` when present and not ``'not found'``,
    otherwise ``value``.

    Raises:
        AssertionError: if a span does not reproduce its value.
    """
    for turn in dialog['turns']:
        if turn['speaker'] != 'user':
            continue
        for _update in turn['state_update']['non-categorical']:
            if 'utt_idx' not in _update:
                continue
            if 'fixed_value' in _update and _update['fixed_value'] != 'not found':
                value = _update['fixed_value']
            else:
                value = _update['value']
            utterance = dialog['turns'][_update['utt_idx']]['utterance']
            span_text = utterance[_update['start']:_update['end']]
            if span_text != value:
                # explicit raise: keeps the check active under `python -O`
                raise AssertionError(
                    'span {!r} does not match value {!r} in utterance {!r}'.format(
                        span_text, value, utterance))


def get_bb_variants(value):
    """Return *value* with ``bed and breakfast`` abbreviated to ``b & b``.

    The list is empty when *value* does not contain ``bed and breakfast``.
    """
    ret_list = []
    if 'bed and breakfast' in value:
        ret_list.append(value.replace('bed and breakfast', 'b & b'))
    return ret_list


if __name__ == '__main__':
    preprocess(da_cat_slot_values, state_cat_slot_values)
-> I want to find a hotel.` + +Run `evaluate.py`: + +da values match rate: 98.798 +state values match rate: 89.185 + +### original data + +- from [multiwoz-coref](https://github.com/lexmen318/MultiWOZ-coref) repo. +- slot description by multiwoz2.2 +- some hand-written descriptions. + + diff --git a/data/unified_datasets/multiwoz23/data.zip b/data/unified_datasets/multiwoz23/data.zip new file mode 100644 index 0000000000000000000000000000000000000000..54d08b07f6ca6f70fdfe39f9bdfda3d4564c1e16 Binary files /dev/null and b/data/unified_datasets/multiwoz23/data.zip differ diff --git a/data/unified_datasets/multiwoz23/ontology.json b/data/unified_datasets/multiwoz23/ontology.json new file mode 100644 index 0000000000000000000000000000000000000000..3d2c0a8dcc659e287ee88c95cf65ee9f7b3b3e8c --- /dev/null +++ b/data/unified_datasets/multiwoz23/ontology.json @@ -0,0 +1,1983 @@ +{ + "domains": { + "taxi": { + "description": "taxi information query system", + "slots": { + "destination": { + "description": "destination of taxi", + "is_categorical": false, + "possible_values": [] + }, + "departure": { + "description": "departure location of taxi", + "is_categorical": false, + "possible_values": [] + }, + "leaveAt": { + "description": "leaving time of taxi", + "is_categorical": false, + "possible_values": [] + }, + "arriveBy": { + "description": "arrival time of taxi", + "is_categorical": false, + "possible_values": [] + }, + "taxi_phone": { + "description": "taxi phone number", + "is_categorical": false, + "possible_values": [] + }, + "taxi_types": { + "description": "taxi type", + "is_categorical": false, + "possible_values": [] + } + } + }, + "restaurant": { + "description": "restaurant information query system", + "slots": { + "food": { + "description": "food type for the restaurant", + "is_categorical": false, + "possible_values": [] + }, + "name": { + "description": "name of the restaurant", + "is_categorical": false, + "possible_values": [] + }, + "pricerange": { + 
"description": "price budget for the restaurant", + "is_categorical": true, + "possible_values": [ + "budget conscious", + "same area and price range", + "expensive side", + "mostly expensive", + "fairly expensive", + "all price ranges", + "that range", + "vietnamese", + "same area and price range as the hotel", + "middle", + "or otherwise", + "inexpensive", + "on the pricey side", + "ranging from cheap to expensive", + "moderatly priced", + "cherap", + "centre", + "more moderate", + "north", + "quite low", + "this price range", + "same price range", + "moderatley priced", + "varying price range", + "great prices", + "fine", + "expensive price", + "more expensive", + "upscale", + "any price range", + "cheapish", + "moderatre", + "modest", + "pricey", + "same price", + "fairly cheap", + "relatively cheap", + "cheap priced", + "moderate pricing", + "expensive price range", + "cheaply priced", + "east", + "expensive range", + "quite inexpensive", + "same area and price", + "moderately prices", + "or the moderately priced", + "expensive but worth every penny", + "assorted", + "any", + "any price", + "on the cheap side", + "do n't care", + "budget friendly", + "most affordable price range", + "low cost", + "moderate|cheap", + "pretty cheap", + "a little expensive but worth it", + "epensive", + "not too pricey", + "moderate priced", + "not - so - expensive", + "no particular", + "your", + "cheaper", + "priced cheaply", + "expensive priced", + "fairly inexpensive", + "moderate and one in the cheap range", + "reasonable", + "expensive(no", + "cheap price range", + "expinsive", + "mostly expensive and moderately priced", + "quite expensive", + "very expensive", + "that price", + "cheap range", + "boderate", + "reasonably priced", + "rather expensive", + "any price point", + "moderate or expensive priced", + "all", + "luxury options", + "regardless of price", + "expensive / upscale", + "moderate price", + "similar price range", + "pretty expensive", + "same area and 
pricerange", + "that price range", + "moderate range", + "moderately expensive", + "moderately", + "not", + "moderatly", + "cheaply", + "does n't matter", + "budget - friendly", + "cheapt", + "cheapest", + "various", + "not moderate", + "expensive or moderate", + "not cheap", + "expensively priced", + "expensive and moderately priced", + "not really", + "high class", + "low priced", + "your price range", + "varying price", + "do not care", + "not too expensive", + "adforable", + "they vary", + "none", + "moderate price range", + "high - end", + "moderately price", + "not expensive", + "relatively expensive", + "cheap to expensive", + "affordable", + "cheaply - priced", + "expensive", + "moderate", + "moderately - priced", + "priced moderately", + "same area and price range as my hotel", + "moderate and expensive", + "less expensive", + "expensively", + "high end", + "most expensive", + "same pricerange", + "does not matter", + "do not have a preference", + "cheap or expensive", + "the least expensive", + "and", + "moderately priced", + "cheap", + "every price point", + "the same" + ] + }, + "address": { + "description": "exact location of the restaurant", + "is_categorical": false, + "possible_values": [] + }, + "area": { + "description": "area or place of the restaurant", + "is_categorical": true, + "possible_values": [ + "east area of town", + "the city centre", + "that part of town", + "centre city", + "west of town", + "center", + "close to the hotel", + "in cambridge", + "ctre", + "south side of the town", + "close to your location", + "town centre area", + "any area", + "east side of the city", + "that area of town", + "west part of town .", + "on the east", + "north of town", + "n the centre", + "that area", + "east part of town", + "the west of town", + "centre", + "west of cambridge", + "north", + "south area", + "all of cambridge", + "centre of town", + "same area as the park", + "centrally located", + "the north", + "these areas", + "all of", + "north or 
centre", + "downtown", + "centre area", + "westside", + "the east part of town", + "throughout the area", + "near clare hall", + "the south", + "the east", + "the north area", + "the south side", + "east section of town", + "by the airport", + "the south area of the city", + "the west side of town", + "the south part of town", + "centre region", + "centre area of the city", + "close it city centre", + "mexican", + "centre of the city", + "anywhere in the city", + "central", + "north side of town", + "near the center of town", + "north and west", + "east", + "the south area of town", + "here", + "west area", + "in town at all", + "the east side", + "centrally", + "the centre area of town", + "around town", + "east side", + "centre of cambridge", + "in the north", + "centreof", + "central region of town", + "any", + "the west area", + "the west part of town", + "west part of the city", + "same side", + "ely", + "other part of the town", + "the north part of town", + "do n't care", + "same area", + "anywhere in town", + "same price range and area", + "north end", + "the north end", + "here in the city", + "same side of town as your hotel", + "the area you have chosen", + "center of the town", + "the north side of town", + "near the centre", + "the same area as the botanic gardens", + "southern", + "north part of town", + "this town", + "central area", + "east of cambridge", + "east part of tow", + "do n't have a preference", + "northern parts of town", + "south side of town", + "centre cambridge", + "anywhere", + "south part of time", + "north area of town", + "the center", + "all over the city", + "any where in the city", + "the town centre", + "southern area", + "the north side .", + "north area", + "in the city .", + "in the city centre", + "the city center", + "south side", + "east end", + "near centre", + "same part of town", + "anywhere in cambridge", + "town center", + "the center of town", + "south of cambridge", + "west side of town", + "that", + "thai", + 
"in the west", + "south area of town", + "northern part", + "does", + "the east side of town", + "northern part of cambridge", + "same location", + "centre area of the town", + "same", + "southend", + "center of town", + "the south of town", + "north side", + "the centre area", + "south of town .", + "west cambridge", + "the are", + "in town", + "do nt care", + "cambridge centre", + "not", + "in the city", + "the east area", + "city", + "the centre of town", + "does n't matter", + "the centre", + "the area", + "centre of town .", + "cetre", + "towns centre", + "west end of town", + "around the college", + "the west side", + "town", + "west", + "wet part of town", + "eastside", + "centrem", + "the southern area", + "north cambridge", + "in the east", + "east area", + "the south side of town", + "westies", + "chesterton", + "elsewhere in the city", + "not in the west", + "any part of town", + "other parts of town", + "the same area", + "town of centre", + "centere", + "same areas", + "town centre", + "east side of town", + "close to that area", + "none", + "center cambridge", + "city center", + "southside", + "same side of town", + "not in the south of town", + "north side of chersteron", + "around there", + "west part of town", + "the north side", + "south end", + "the area of west", + "northern", + "centre part of town", + "center area of town", + "city centre", + "restaurants in the city", + "west area of town", + "expensive", + "all over cambridge", + "south part of the city", + "east section", + "this area", + "within this area", + "eastern", + "nearby", + "west side area", + "centre-", + "the west", + "south part", + "throughout the city", + "south of town", + "same area as the hotel", + "closely located", + "south cambridge", + "and centre", + "the center part of town", + "west side", + "does not matter", + "east of town", + "almost every area of town", + "north side of cambridge", + "south part of town", + "all over town", + "same area of town", + 
"northside", + "cambridge", + "south", + "the west end", + "cheap", + "the centre part of town", + "in the centre", + "centre area of town", + "east|south", + "the same", + "west end" + ] + }, + "postcode": { + "description": "postcode of the restaurant", + "is_categorical": false, + "possible_values": [] + }, + "phone": { + "description": "restaurant phone number", + "is_categorical": false, + "possible_values": [] + }, + "people": { + "description": "number of people booking the restaurant", + "is_categorical": false, + "possible_values": [] + }, + "time": { + "description": "time of the restaurant booking", + "is_categorical": false, + "possible_values": [] + }, + "day": { + "description": "day of the restaurant booking", + "is_categorical": true, + "possible_values": [ + "sunday|thursday", + "sunday", + "friday", + "saturday", + "saturday|thursday", + "wednesday", + "monday", + "thursday", + "same day", + "tuesday", + "same group and day" + ] + }, + "choice": { + "description": "number of restaurants meeting requests of user", + "is_categorical": false, + "possible_values": [] + } + } + }, + "attraction": { + "description": "an entertainment that is offered to the public", + "slots": { + "address": { + "description": "details of where the attraction is", + "is_categorical": false, + "possible_values": [] + }, + "postcode": { + "description": "postcode of the attraction", + "is_categorical": false, + "possible_values": [] + }, + "phone": { + "description": "phone number of the attraction", + "is_categorical": false, + "possible_values": [] + }, + "entrance fee": { + "description": "the fee charged for admission to the attraction", + "is_categorical": false, + "possible_values": [] + }, + "type": { + "description": "type of the attraction", + "is_categorical": false, + "possible_values": [] + }, + "area": { + "description": "area or place of the attraction", + "is_categorical": true, + "possible_values": [ + "east area of town", + "close to the center of town", + 
"the city centre", + "every area except the north", + "that part of town", + "centre city", + "center", + "center area", + "in cambridge", + "near ely", + "all of the other areas", + "town centre area", + "south part of cambridge", + "the east of the town", + "the center of the park", + "center of the city", + "west of your city", + "the area you 're looking for", + "north of town", + "cenre", + "that area", + "east part of town", + "the west of town", + "centre area of cambridge", + "centre", + "north section of cambridge", + "north", + "the south area", + "south area", + "near the hotel", + "west area of the city", + "centre of town", + "the north of the city", + "centrally located", + "the north", + "central district", + "northern cambridge", + "city 's centre", + "the center area", + "southern cambridge", + "northend", + "downtown", + "centre area", + "westside", + "the east part of town", + "north of the city", + "the south", + "the east", + "museums", + "the south side", + "the west side of town", + "centre region", + "centre of the city", + "east end of town", + "church area", + "central", + "east", + "west area", + "the west area of town", + "the east side", + "same general area", + "city cenre", + "east side", + "centre of cambridge", + "centre|west", + "any", + "west part of the city", + "the west area", + "same side", + "the eastside", + "all around the city", + "do n't care", + "same area", + "in the south", + "there", + "the area you are looking for", + "close to the restaurant", + "another area", + "center of the town", + "very close in the same area", + "near the centre", + "central zone", + "southern", + "north part of town", + "does n't really matter", + "east of cambridge", + "do n't have a preference", + "that side of town", + "western part of the town", + "the south near your hotel", + "north in milton", + "south side of town", + "eat", + "cetnre of town", + "the center", + "all over the city", + "cambridge leisure park", + "north area", + "most 
of them are in the center", + "different area", + "south side", + "that area .", + "museum", + "town center", + "same part of town", + "western cambridge", + "south of cambridge", + "west side of town", + "that", + "in that side", + "center of cambridge", + "does", + "whole of cambridge", + "all", + "to the south", + "same location", + "same", + "center part of town", + "center of town", + "norwich", + "city centre .", + "north side", + "the centre area", + "the centry area", + "west cambridge", + "west end of the city", + "your desired location", + "in town", + "entre", + "in the center", + "not", + "centre of town ?", + "city", + "on the centre", + "the centre of town", + "does n't matter", + "went side of town", + "the centre", + "various", + "the area", + "towns centre", + "right in the center of town", + "in the area", + "west end of town", + "central cambridge", + "the west side", + "eastside", + "west", + "this side of town", + "west side of the city", + "near the restaurant", + "cb30aq", + "center of town .", + "not really", + "in the east", + "same area as tandoori palace", + "west part of cambridge", + "centre by galleria", + "east area", + "wet end", + "do not care", + "western", + "northern area of town", + "western area of town", + "other parts of town", + "the same area", + "town centre", + "the same road", + "east side of town", + "center are", + "none", + "city center", + "centre area .", + "in that area", + "clifton way", + "west part of town", + "south end", + "western part of town", + "several different parts of town", + "northern", + "centre part of town", + "city centre", + "west area of town", + "east section", + "this area", + "nearby", + "the west", + "same part of town as your restaurant", + "throughout the city", + "south park of town", + "south of town", + "other areas", + "south cambridge", + "the west - side", + "the center part of town", + "west side", + "does not matter", + "east of town", + "do not have a preference", + "same area of 
town", + "south part of town", + "all over town", + "cambridge", + "south", + "the west end", + "same part", + "the centre part of town", + "in the centre", + "centre area of town", + "east cambridge", + "northern area", + "cambridge 's centre", + "west end" + ] + }, + "name": { + "description": "name of the attraction", + "is_categorical": false, + "possible_values": [] + }, + "pricerange": { + "description": "the price range for the attraction, from cheap to expensive", + "is_categorical": true, + "possible_values": [ + "5 pound entrance fee", + "unaware of what their entrance fee is", + "2 pounds", + "we do n't have any information", + "4 pounds", + "no entrance fee", + "do n't have information", + "5 pounds", + "do n't see a price listed", + "expensive", + "neither prices are listed", + "not listed", + "entrance fee", + "free", + "free admission", + "3.50 pounds", + "not sure of the fee", + "moderately priced", + "cheap", + "from free to 2 pounds 50", + "they do n't have the entrance fee posted" + ] + }, + "choice": { + "description": "number of attractions matching requests of user", + "is_categorical": false, + "possible_values": [] + } + } + }, + "booking": { + "description": "to arrange with a taxi, restaurant, train, etc.", + "slots": { + "time": { + "description": "time for an order", + "is_categorical": false, + "possible_values": [] + }, + "day": { + "description": "day for an order, from monday to sunday", + "is_categorical": true, + "possible_values": [ + "one", + "sundar", + "2", + "6", + "tonight", + "that time frame", + "that", + "this evening", + "saturday night thru monday", + "we d", + "sunday", + "wednesday", + "same day", + "at that time", + "tuesday", + "sunday and monday", + "friday", + "vmhj6y3i", + "1", + "monday", + "tues", + "the same day", + "sunday 18/06/2017", + "tuesday instead of thursday", + "vmhj6y3i.", + "weds", + "tuesday 's", + "saturday", + "today", + "thursday", + "tuesday through friday" + ] + }, + "stay": { + "description": 
"for how long the user wish to be at a place", + "is_categorical": false, + "possible_values": [] + }, + "people": { + "description": "how many person the order is for", + "is_categorical": false, + "possible_values": [] + }, + "name": { + "description": "name of the ordered place", + "is_categorical": false, + "possible_values": [] + }, + "Ref": { + "description": "reference number of the order", + "is_categorical": false, + "possible_values": [] + } + } + }, + "train": { + "description": "query and order a train", + "slots": { + "destination": { + "description": "destination of the train", + "is_categorical": false, + "possible_values": [] + }, + "arriveBy": { + "description": "arrival time of the train", + "is_categorical": false, + "possible_values": [] + }, + "departure": { + "description": "departure location of the train", + "is_categorical": false, + "possible_values": [] + }, + "duration": { + "description": "the length of time the train trip lasts", + "is_categorical": false, + "possible_values": [] + }, + "people": { + "description": "number of people booking for train", + "is_categorical": false, + "possible_values": [] + }, + "day": { + "description": "day of the train", + "is_categorical": true, + "possible_values": [ + "everday", + "this day", + "every day", + "13:07", + "sundays", + "none", + "that", + "thursdays", + "sunday", + "frday", + "wednesday", + "same day", + "between friday and wednesday", + "tuesday", + "tr2519", + "sat", + "all week", + "friday", + "saturdays", + "monday", + "weds", + "that day", + "every", + "saturday", + "other days are available", + "thursday", + "fiday", + "cambridge", + "daily", + "saturday-", + "mondays" + ] + }, + "Ref": { + "description": "reference number of the order", + "is_categorical": false, + "possible_values": [] + }, + "leaveAt": { + "description": "leaving time for the train", + "is_categorical": false, + "possible_values": [] + }, + "price": { + "description": "price for the train ticket", + 
"is_categorical": false, + "possible_values": [] + }, + "choice": { + "description": "number of trains that meets requests of the user", + "is_categorical": false, + "possible_values": [] + } + } + }, + "hotel": { + "description": "to query hotel information and place an order", + "slots": { + "internet": { + "description": "internet option at the hotel", + "is_categorical": true, + "possible_values": [ + "no", + "yes", + "none", + "free" + ] + }, + "area": { + "description": "area or place of the hotel", + "is_categorical": true, + "possible_values": [ + "east area of town", + "north location", + "that part of town", + "centre city", + "west of town", + "center", + "east and the north .", + "in cambridge", + "southern part of town", + "the city 's south side", + "other parts of the city", + "close to where you 'll be dining", + "east near other shops and boutiques", + "any area", + "that area of town", + "same area as the restaurant", + "north end of the city", + "centrally - located", + "north of town", + "northern side of town", + "that area", + "east part of town", + "centre", + "on the west", + "north", + "the south area", + "south area", + "all of cambridge", + "eastern part of the city", + "centre of town", + "east part of time", + "the north of the city", + "centrally located", + "on the south", + "the north", + "northern cambridge", + "all over", + "northend", + "downtown", + "centre area", + "westside", + "northern part of town", + "least", + "the east", + "the south side", + "east section of town", + "the south", + "south end of the city", + "across town", + "centre of the city", + "or west ?", + "centra", + "south closer to the museum", + "north side of town", + "east", + "el shaddai", + "different part of town", + "west area", + "all areas", + "on the west side", + "nearby the restaurant", + "the east side", + "same general area", + "you are interested in", + "centrally", + "north park of town", + "east side", + "in the north", + "centre of cambridge", 
+ "not to far from the restaurant", + "any", + "west part of the city", + "the west area", + "centre area of town", + "centre part", + "that vicinity", + "the north part of town", + "all around the city", + "do n't care", + "same area", + "the north end", + "north part of the city", + "north end", + "there", + "and in the centre", + "north and centre", + "center of the town", + "near the centre", + "west side near the restaurant", + "that region", + "southern", + "north part of town", + "except in the east", + "another area of town", + "near the museum", + "nborth", + "do n't have a preference", + "no particular", + "that side of town", + "south side of town", + "centre cambridge", + "eat", + "north area of town", + "the", + "all over the city", + "north par of town", + "north part of town .", + "next door", + "the town centre", + "different area", + "north area", + "south side", + "any part of the city", + "near centre", + "west part", + "everywhere but the city centre", + "same part of town", + "anywhere in cambridge", + "western cambridge", + "the center of town", + "north of cambridge", + "west side of town", + "in the west", + "south area of town", + "the east side of town", + "wast", + "southend", + "center of town", + "west areas of town", + "north side", + "on the eastside", + "the centre area", + "west end of the city", + "central location", + "in town", + "in the center", + "not", + "the centre of the city", + "in the city", + "the east area", + "the easy", + "the centre of cambridge", + "the centre of town", + "does n't matter", + "various parts of the city", + "several areas of town", + "norht", + "the centre", + "various", + "the area", + "not in the south", + "west|centre", + "in the area", + "on the north", + "the west side", + "eastside", + "west", + "near the restaurant", + "north cambridge", + "not really", + "south cambridge area", + "throughout cambridge", + "any part of town", + "western", + "other parts of town", + "the same area", + "town 
centre", + "east side of town", + "none", + "do n't really care", + "city center", + "southside", + "and east", + "all across town", + "south end of town", + "different parts of the city", + "same side of town", + "west part of town", + "the north side", + "south end", + "except in the north", + "northern", + "centre part of town", + "city centre", + "eastern cambridge", + "the town center", + "eastern", + "the east end of town", + "all parts of the city", + "different parts of town", + "the west", + "throughout the city", + "north end of town", + "south of town", + "a different area", + "south cambridge", + "the northern part of town", + "west side", + "across cambridge", + "does not matter", + "east of town", + "do not have a preference", + "all over town", + "south part of town", + "same area of town", + "somewhere else", + "northside", + "the westside", + "cambridge", + "south", + "the west end", + "northe part of town", + "cheap", + "in the centre", + "east area", + "east cambridge", + "northern area", + "the same", + "west end" + ] + }, + "stars": { + "description": "star rating of the hotel", + "is_categorical": true, + "possible_values": [ + "4|5", + "one", + "2", + "four - star", + "unrated", + "does not show", + "3|4", + "none", + "different star ratings", + "four", + "4", + "not as fancy", + "3", + "yes", + "drop the star rating", + "four starts", + "no star rating", + "3-star", + "0 to 4", + "5", + "2-stars", + "does not have", + "1", + "two", + "lower", + "2-star", + "1-star", + "4-stars", + "ranging from 2 - 4 stars", + "three", + "2-starred", + "foru", + "0-star", + "3-stars", + "0", + "4-star", + "no", + "several", + "zero", + "five", + "5-star", + "not rated" + ] + }, + "parking": { + "description": "parking facility at the hotel", + "is_categorical": true, + "possible_values": [ + "no", + "yes", + "none", + "free" + ] + }, + "phone": { + "description": "hotel phone number", + "is_categorical": false, + "possible_values": [] + }, + "name": { + 
"description": "name of the hotel", + "is_categorical": false, + "possible_values": [] + }, + "pricerange": { + "description": "price budget of the hotel", + "is_categorical": true, + "possible_values": [ + "moderately pricing", + "same area and price range", + "different price ranges", + "that range", + "moderate to cheap range", + "cheaply prices", + "or expensive ?", + "all different price ranges", + "inexpensive", + "cheap|moderate", + "affordable pricing", + "is moderately priced", + "your chosen", + "moderatly priced", + "different", + "the cheapest", + "centre", + "moderate prices", + "same part of town and price range as the restaurant", + "cheaper side", + "north", + "moderately priced .", + "more budget - friendly", + "lower", + "cheap > moderate", + "same price range", + "moderatley priced", + "cheap>moderate", + "more expensive", + "upscale", + "free", + "different price range", + "very affordable", + "moderate to cheap", + "moderately to expensively priced", + "modrate", + "same price", + "fairly cheap", + "super cheap", + "relatively cheap", + "cheap priced", + "expensive price range", + "moderate pricing", + "your price", + "cheaply priced", + "expensive range", + "same area and price range as the restaurant", + "same area and price range as my restaurant", + "moderately prices", + "even cheaper", + "moderate or cheap", + "moderately - priced or cheap", + "higher price range", + "any", + "oderately priced", + "other ranges", + "lower end", + "great", + "extremely reasonable", + "on the cheap side", + "budget - priced", + "do n't care", + "low cost", + "varying price ranges", + "moderate|cheap", + "pretty cheap", + "quite cheap", + "moderate priced", + "cheap to moderate", + "no particular", + "cheaper", + "very inexpensive", + "does not say", + "epxensive", + "expensively - priced", + "cheap price range", + "expensive or cheap", + "on the more expensive side", + "more moderately priced", + "hotel", + "moderate in price", + "quite expensive", + "cheap 
range", + "that", + "slightly more expensive", + "moderately priceed", + "rather expensive", + "all", + "does", + "moderate price", + "chear", + "pretty expensive", + "moderatly price", + "that price range", + "moderate range", + "much cheaper", + "a little pricey", + "cheap side", + "moderately", + "not", + "mostly in the cheap to moderate price range", + "economically priced", + "cheaply", + "does n't matter", + "cheapest", + "range", + "fairly cheap compared to other hotels", + "expensively priced", + "very cheap", + "not really", + "your price range", + "do not care", + "none", + "less costly", + "moderate price range", + "inexpensively - priced", + "moderately price", + "$ 100", + "espensive", + "not expensive", + "cheap to expensive", + "cheap or moderate", + "cheaply - priced", + "affordable", + "moderate price point", + "guesthouses", + "expensive", + "priced moderately", + "moderate", + "moderately - priced", + "same general price", + "cheaper than the others", + "moderately pried", + "expensive to moderate", + "unfortunately do not have the price", + "ca n't view that information", + "$100", + "hotels", + "same pricerange", + "does not matter", + "moderately priced", + "economical", + "cheap", + "the same" + ] + }, + "people": { + "description": "number of people for the hotel booking", + "is_categorical": false, + "possible_values": [] + }, + "postcode": { + "description": "postcode of the hotel", + "is_categorical": false, + "possible_values": [] + }, + "address": { + "description": "exact location of the hotel", + "is_categorical": false, + "possible_values": [] + }, + "type": { + "description": "what is the type of the hotel", + "is_categorical": false, + "possible_values": [] + }, + "stay": { + "description": "length of stay at the hotel", + "is_categorical": false, + "possible_values": [] + }, + "day": { + "description": "day of the hotel booking", + "is_categorical": true, + "possible_values": [ + "thursday--", + "wednesday|friday", + "none", + 
"friday > tuesday", + "sunday", + "same", + "wednesday", + "same day", + "tuesday", + "monday < thursday", + "friday", + "saturday|tuesday", + "friday>tuesday", + "monday", + "monday<thursday", + "tuesday/", + "saturday", + "thursday", + "sunday>monday", + "sunday > monday" + ] + }, + "choice": { + "description": "number of hotels that meets requests of the user", + "is_categorical": false, + "possible_values": [] + } + } + }, + "police": { + "description": "find police stations", + "slots": { + "address": { + "description": "exact location of the police station", + "is_categorical": false, + "possible_values": [] + }, + "postcode": { + "description": "postcode of the police station", + "is_categorical": false, + "possible_values": [] + }, + "phone": { + "description": "police station phone number", + "is_categorical": false, + "possible_values": [] + } + } + } + }, + "intents": { + "inform": { + "description": "inform user of value for a certain slot" + }, + "request": { + "description": "ask for value of a slot" + }, + "nobook": { + "description": "inform user of booking failure" + }, + "reqmore": { + "description": "ask user for more instructions" + }, + "book": { + "description": "place an order for user" + }, + "bye": { + "description": "end a conversation and say goodbye to user" + }, + "thank": { + "description": "express gratitude" + }, + "welcome": { + "description": "welcome" + }, + "offerbook": { + "description": "offer to place an order for user" + }, + "offerbooked": { + "description": "inform user that an order is succussful" + }, + "recommend": { + "description": "recommend a choice for user request" + }, + "greet": { + "description": "express greeting" + }, + "nooffer": { + "description": "inform user that no options matches user request" + }, + "select": { + "description": "provide several choices for user to choose from" + } + }, + "binary_dialogue_act": [ + { + "intent": "request", + "domain": "hotel", + "slot": "area", + "value": "" + }, + { + 
"intent": "inform", + "domain": "booking", + "slot": "", + "value": "" + }, + { + "intent": "request", + "domain": "booking", + "slot": "stay", + "value": "" + }, + { + "intent": "request", + "domain": "booking", + "slot": "day", + "value": "" + }, + { + "intent": "reqmore", + "domain": "", + "slot": "", + "value": "" + }, + { + "intent": "bye", + "domain": "", + "slot": "", + "value": "" + }, + { + "intent": "thank", + "domain": "", + "slot": "", + "value": "" + }, + { + "intent": "welcome", + "domain": "", + "slot": "", + "value": "" + }, + { + "intent": "request", + "domain": "train", + "slot": "departure", + "value": "" + }, + { + "intent": "request", + "domain": "train", + "slot": "day", + "value": "" + }, + { + "intent": "offerbook", + "domain": "train", + "slot": "", + "value": "" + }, + { + "intent": "greet", + "domain": "", + "slot": "", + "value": "" + }, + { + "intent": "request", + "domain": "hotel", + "slot": "pricerange", + "value": "" + }, + { + "intent": "request", + "domain": "hotel", + "slot": "address", + "value": "" + }, + { + "intent": "request", + "domain": "train", + "slot": "destination", + "value": "" + }, + { + "intent": "request", + "domain": "train", + "slot": "leaveAt", + "value": "" + }, + { + "intent": "request", + "domain": "train", + "slot": "arriveBy", + "value": "" + }, + { + "intent": "request", + "domain": "train", + "slot": "duration", + "value": "" + }, + { + "intent": "request", + "domain": "train", + "slot": "price", + "value": "" + }, + { + "intent": "request", + "domain": "hotel", + "slot": "internet", + "value": "" + }, + { + "intent": "request", + "domain": "hotel", + "slot": "parking", + "value": "" + }, + { + "intent": "inform", + "domain": "attraction", + "slot": "", + "value": "" + }, + { + "intent": "request", + "domain": "attraction", + "slot": "area", + "value": "" + }, + { + "intent": "request", + "domain": "attraction", + "slot": "type", + "value": "" + }, + { + "intent": "request", + "domain": "attraction", + 
"slot": "address", + "value": "" + }, + { + "intent": "request", + "domain": "restaurant", + "slot": "food", + "value": "" + }, + { + "intent": "request", + "domain": "restaurant", + "slot": "area", + "value": "" + }, + { + "intent": "request", + "domain": "restaurant", + "slot": "phone", + "value": "" + }, + { + "intent": "request", + "domain": "restaurant", + "slot": "postcode", + "value": "" + }, + { + "intent": "request", + "domain": "restaurant", + "slot": "address", + "value": "" + }, + { + "intent": "request", + "domain": "train", + "slot": "Ref", + "value": "" + }, + { + "intent": "request", + "domain": "attraction", + "slot": "postcode", + "value": "" + }, + { + "intent": "request", + "domain": "attraction", + "slot": "phone", + "value": "" + }, + { + "intent": "nobook", + "domain": "booking", + "slot": "", + "value": "" + }, + { + "intent": "inform", + "domain": "train", + "slot": "", + "value": "" + }, + { + "intent": "request", + "domain": "train", + "slot": "people", + "value": "" + }, + { + "intent": "request", + "domain": "booking", + "slot": "people", + "value": "" + }, + { + "intent": "request", + "domain": "hotel", + "slot": "stars", + "value": "" + }, + { + "intent": "request", + "domain": "hotel", + "slot": "type", + "value": "" + }, + { + "intent": "select", + "domain": "hotel", + "slot": "", + "value": "" + }, + { + "intent": "request", + "domain": "hotel", + "slot": "postcode", + "value": "" + }, + { + "intent": "request", + "domain": "attraction", + "slot": "entrance fee", + "value": "" + }, + { + "intent": "request", + "domain": "restaurant", + "slot": "pricerange", + "value": "" + }, + { + "intent": "request", + "domain": "booking", + "slot": "time", + "value": "" + }, + { + "intent": "request", + "domain": "hotel", + "slot": "phone", + "value": "" + }, + { + "intent": "request", + "domain": "attraction", + "slot": "pricerange", + "value": "" + }, + { + "intent": "inform", + "domain": "hotel", + "slot": "", + "value": "" + }, + { + 
"intent": "request", + "domain": "attraction", + "slot": "name", + "value": "" + }, + { + "intent": "nooffer", + "domain": "attraction", + "slot": "", + "value": "" + }, + { + "intent": "select", + "domain": "train", + "slot": "", + "value": "" + }, + { + "intent": "select", + "domain": "restaurant", + "slot": "", + "value": "" + }, + { + "intent": "select", + "domain": "attraction", + "slot": "", + "value": "" + }, + { + "intent": "inform", + "domain": "restaurant", + "slot": "", + "value": "" + }, + { + "intent": "inform", + "domain": "taxi", + "slot": "", + "value": "" + }, + { + "intent": "request", + "domain": "taxi", + "slot": "departure", + "value": "" + }, + { + "intent": "request", + "domain": "taxi", + "slot": "destination", + "value": "" + }, + { + "intent": "request", + "domain": "taxi", + "slot": "taxi_types", + "value": "" + }, + { + "intent": "book", + "domain": "booking", + "slot": "", + "value": "" + }, + { + "intent": "request", + "domain": "taxi", + "slot": "taxi_phone", + "value": "" + }, + { + "intent": "request", + "domain": "taxi", + "slot": "arriveBy", + "value": "" + }, + { + "intent": "request", + "domain": "taxi", + "slot": "leaveAt", + "value": "" + }, + { + "intent": "nooffer", + "domain": "restaurant", + "slot": "", + "value": "" + }, + { + "intent": "nooffer", + "domain": "train", + "slot": "", + "value": "" + }, + { + "intent": "nooffer", + "domain": "hotel", + "slot": "", + "value": "" + }, + { + "intent": "request", + "domain": "hotel", + "slot": "name", + "value": "" + }, + { + "intent": "recommend", + "domain": "hotel", + "slot": "", + "value": "" + }, + { + "intent": "recommend", + "domain": "attraction", + "slot": "", + "value": "" + }, + { + "intent": "request", + "domain": "restaurant", + "slot": "name", + "value": "" + }, + { + "intent": "offerbooked", + "domain": "train", + "slot": "", + "value": "" + }, + { + "intent": "recommend", + "domain": "restaurant", + "slot": "", + "value": "" + } + ], + "state": { + "hotel": { + 
"name": "", + "area": "", + "parking": "", + "pricerange": "", + "stars": "", + "internet": "", + "type": "", + "stay": "", + "day": "", + "people": "" + }, + "train": { + "leaveAt": "", + "destination": "", + "day": "", + "arriveBy": "", + "departure": "", + "people": "" + }, + "restaurant": { + "food": "", + "pricerange": "", + "name": "", + "area": "", + "time": "", + "day": "", + "people": "" + }, + "attraction": { + "type": "", + "name": "", + "area": "" + }, + "taxi": { + "leaveAt": "", + "destination": "", + "departure": "", + "arriveBy": "" + } + } +} \ No newline at end of file diff --git a/data/unified_datasets/multiwoz23/original_data.zip b/data/unified_datasets/multiwoz23/original_data.zip new file mode 100644 index 0000000000000000000000000000000000000000..68bffe4bfedf2ecc860c5e204323a91e8a6882c8 Binary files /dev/null and b/data/unified_datasets/multiwoz23/original_data.zip differ diff --git a/data/unified_datasets/multiwoz23/preprocess.py b/data/unified_datasets/multiwoz23/preprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..d293e7f9c0f2c78f60750f858a34e76d2100ecb4 --- /dev/null +++ b/data/unified_datasets/multiwoz23/preprocess.py @@ -0,0 +1,1421 @@ +import copy +import re +import zipfile +import json +import os +from tqdm import tqdm +import sys +import difflib +from fuzzywuzzy import fuzz +sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) + +from convlab2.util.file_util import read_zipped_json, write_zipped_json +import logging + + +logging.basicConfig(level=logging.INFO) +self_dir = (os.path.abspath(os.getcwd())) + +REF_SYS_DA = { + 'Attraction': { + 'Addr': "address", 'Area': "area", 'Choice': "choice", + 'Fee': "entrance fee", 'Name': "name", 'Phone': "phone", + 'Post': "postcode", 'Price': "pricerange", 'Type': "type", + 'none': None, 'Open': None + }, + 'Hospital': { + 'Department': 'department', 'Addr': 'address', 'Post': 'postcode', + 'Phone': 'phone', 'none': None + 
}, + 'Booking': { + 'Day': 'day', 'Name': 'name', 'People': 'people', + 'Ref': 'Ref', 'Stay': 'stay', 'Time': 'time', + 'none': None + }, + 'Hotel': { + 'Addr': "address", 'Area': "area", 'Choice': "choice", + 'Internet': "internet", 'Name': "name", 'Parking': "parking", + 'Phone': "phone", 'Post': "postcode", 'Price': "pricerange", + 'Stars': "stars", 'Type': "type", 'Stay': 'stay', 'Day': 'day', 'People': 'people', + 'none': None + }, + 'Restaurant': { + 'Addr': "address", 'Area': "area", 'Choice': "choice", + 'Name': "name", 'Food': "food", 'Phone': "phone", + 'Post': "postcode", 'Price': "pricerange", + 'Time': 'time', 'Day': 'day', 'People': 'people', + 'none': None + }, + 'Taxi': { + 'Arrive': "arriveBy", 'Car': "taxi_types", 'Depart': "departure", + 'Dest': "destination", 'Leave': "leaveAt", 'Phone': "taxi_phone", + 'none': None + }, + 'Train': { + 'Arrive': "arriveBy", 'Choice': "choice", 'Day': "day", + 'Depart': "departure", 'Dest': "destination", + 'Leave': "leaveAt", 'People': "people", 'Ref': "Ref", + 'Time': "duration", 'none': None, 'Ticket': 'price', + }, + 'Police': { + 'Addr': "address", 'Post': "postcode", 'Phone': "phone", 'none': None + }, +} + +# taxi restaurant attraction train +slot_to_type = { + 'taxi-destination': 'non', + 'taxi-departure': 'non', + 'taxi-leaveAt': 'non', + 'taxi-arriveBy': 'non', + 'restaurant-food': 'non', + 'restaurant-name': 'non', + 'attraction-address': 'non', + 'attraction-postcode': 'non', + 'restaurant-pricerange': 'cat', + 'restaurant-address': 'non', + 'restaurant-area': 'cat', + 'restaurant-postcode': 'non', + 'attraction-phone': 'non', + 'attraction-entrance fee': 'non', + 'booking-time': 'non', + 'booking-day': 'cat', + 'attraction-type': 'non', + 'attraction-area': 'cat', + 'train-destination': 'non', + 'train-arriveBy': 'non', + 'train-departure': 'non', + 'hotel-internet': 'cat', + 'hotel-area': 'cat', + 'booking-stay': 'non', + 'booking-people': 'non', + 'train-duration': 'non', + 'train-people': 'non', + 
'train-day': 'cat', + 'train-Ref': 'non', + 'hotel-stars': 'cat', + 'train-leaveAt': 'non', + 'train-price': 'non', + 'hotel-parking': 'cat', + 'hotel-phone': 'non', + 'hotel-name': 'non', + 'hotel-pricerange': 'cat', + 'hotel-people': 'non', + 'restaurant-phone': 'non', + 'hotel-postcode': 'non', + 'hotel-address': 'non', + 'attraction-name': 'non', + 'hotel-type': 'non', + 'restaurant-people': 'non', + 'train-choice': 'non', + 'attraction-pricerange': 'cat', + 'hotel-stay': 'non', + 'booking-name': 'non', + 'booking-Ref': 'non', + 'restaurant-time': 'non', + 'restaurant-day': 'cat', + 'hotel-day': 'cat', + 'hotel-choice': 'non', + 'restaurant-choice': 'non', + 'attraction-choice': 'non', + 'taxi-taxi_phone': 'non', + 'taxi-taxi_types': 'non', + 'police-address': 'non', + 'police-postcode': 'non', + 'police-phone': 'non' +} + +state_cat_slot_value_dict = { + "hotel-pricerange": { + "cheap": 735, + "moderate": 1063, + "expensive": 594, + }, + "hotel-parking": { + "yes": 1809, + "no": 126, + "free": 4, + }, + "hotel-day": { + "tuesday": 385, + "wednesday": 410, + "monday": 365, + "saturday": 407, + "friday": 393, + "thursday": 384, + "sunday": 369, + }, + "train-day": { + "wednesday": 533, + "monday": 533, + "saturday": 543, + "thursday": 547, + "friday": 563, + "tuesday": 553, + "sunday": 613, + }, + "hotel-stars": { + "4": 1263, + "2": 193, + "0": 201, + "3": 401, + "5": 45, + "1": 45, + }, + "hotel-internet": { + "yes": 1841, + "no": 79, + "free": 2 + }, + "hotel-area": { + "east": 416, + "north": 717, + "centre": 538, + "south": 289, + "west": 316, + }, + "attraction-area": { + "centre": 1290, + "west": 332, + "north": 155, + "south": 240, + "east": 272, + }, + "restaurant-pricerange": { + "expensive": 1477, + "cheap": 758, + "moderate": 1028, + }, + "restaurant-area": { + "centre": 1745, + "south": 398, + "north": 390, + "east": 360, + "west": 423, + }, + "restaurant-day": { + "thursday": 362, + "wednesday": 412, + "friday": 395, + "monday": 383, + "sunday": 
399, + "saturday": 421, + "tuesday": 350, + } +} + + +synonyms = [ + ["el shaddia guesthouse", "el shaddai"], + [ "peterborough", "peterbourgh"], + ["night club", "nightclub", 'nightclubs'], + ["boat", "boating"], + ["portugese", "portuguese"], + ["guesthouse", "guest house"], + ["seafood", "sea food"], + ["christ 's college", "christ college"], + ["huntingdon marriott hotel"] +] + +state_cat_slot_ds = [k for k, v in slot_to_type.items() if v == 'cat'] + +da_cat_slot_values = { + # 'hotel-stay': ['1', '2', '3', '4', '5'], + 'hotel-internet': ['free', 'no', 'none', 'yes'], + 'hotel-parking': ['free', 'no', 'none', 'yes'], +} + +state_cat_slot_values = {} + +multiwoz_desc = { + 'taxi': { + 'domain': 'taxi information query system', + 'taxi_phone': 'taxi phone number', + 'taxi_types': 'taxi type', + }, + 'restaurant': { + 'domain': 'restaurant information query system', + 'address': 'exact location of the restaurant', + 'postcode': 'postcode of the restaurant', + 'phone': 'restaurant phone number', + 'choice': 'number of restaurants meeting requests of user', + }, + 'attraction': { + 'domain': 'an entertainment that is offered to the public', + 'address': 'details of where the attraction is', + 'postcode': 'postcode of the attraction', + 'phone': 'phone number of the attraction', + 'entrance fee': 'the fee charged for admission to the attraction', + 'pricerange': 'the price range for the attraction, from cheap to expensive', + 'choice': 'number of attractions matching requests of user' + }, + 'booking': { + 'domain': 'to arrange with a taxi, restaurant, train, etc.', + 'time': 'time for an order', + 'day': 'day for an order, from monday to sunday', + 'stay': 'for how long the user wish to be at a place', + 'people': 'how many person the order is for', + 'name': 'name of the ordered place', + 'Ref': 'reference number of the order' + }, + 'train': { + 'domain': 'query and order a train', + 'duration': 'the length of time the train trip lasts', + 'Ref': 'reference number 
# spelled-out forms of the small numerals that annotators often write as words
digit2word = {
    '0': 'zero', '1': 'one', '2': 'two', '3': 'three', '4': 'four', '5': 'five',
    '6': 'six', '7': 'seven', '8': 'eight', '9': 'nine', '10': 'ten'
}


def pharse_in_sen(phrase, sen):
    '''
    locate a value inside a sentence (case-insensitive, delimiter-bounded)

    A numeral listed in ``digit2word`` is first looked up in its spelled-out
    form ('5' -> 'five'); when that fails the literal numeral is tried as
    well, so both "five tickets" and "5 tickets" are found.

    :param phrase: str, the value to look for
    :param sen: str, the sentence to search
    :return: ((start, end), num) where start/end are the character span of the
             first match and num is the number of non-overlapping matches;
             ((None, None), 0) if the value does not occur
    '''
    assert isinstance(phrase, str)
    # words: bounded by sentence edge, whitespace or common punctuation
    pw = r'(^|[\s,\.:\?!-])(?P<v>{})([\s,\.:\?!-]|$)'
    # numerals: stricter, so that a digit inside a time or a decimal
    # ("10:15", "3.5") is not mistaken for a standalone number
    pn = r'(^|[\s\?!-]|\D[,\.:])(?P<v>{})($|[\s\?!-]|[,\.:]\D|[,\.:]$)'

    # (text, template) pairs in priority order; the original phrase is kept
    # intact so the literal numeral can still be searched after the word form
    attempts = []
    if phrase.isdigit() and phrase in digit2word:
        attempts.append((digit2word[phrase], pw))
    attempts.append((phrase, pn if phrase.isdigit() else pw))

    for text, template in attempts:
        pattern = re.compile(template.format(re.escape(text)), re.I)
        m = pattern.search(sen)
        if m:
            return m.span('v'), len(pattern.findall(sen))
    return (None, None), 0


def update_state(state, update):
    '''
    merge a per-domain state update into an accumulated state, in place

    :param state: dict, domain -> {slot: value}; modified in place
    :param update: dict, domain -> {slot: value} with the new values
    :return: None
    '''
    for service, service_update in update.items():
        if service in state:
            state[service].update(service_update)
        else:
            # copy so later edits of `update` cannot leak into `state`
            state[service] = copy.deepcopy(service_update)
thank, bye + if Domain == 'general': + assert S == 'none' + assert v == 'none' + converted_dict = { + 'intent': Act.lower(), + 'domain': '', + 'slot': '', + 'value': '' + } + converted_da['binary'].append(converted_dict) + + if converted_dict not in binary_ont: + binary_ont.append(converted_dict) + continue + + + + try: + reformated_slot = REF_SYS_DA[Domain][S] + except: + # print('44444444444444444444444444444444') + # print(Domain, S) + # logging.info('slot not in REF_SYS_DA, drop') + continue + + # if slot is None, da should be converted into binary + if reformated_slot is None: + if not (S == 'none' and v == 'none'): + # mainly for `Open` slot + # print('11111111111111111111') + # print(Domain_Act, S, v) + continue + # Booking-Inform none none + # Police-Inform none none + # Train-OfferBook none none + converted_dict = { + 'intent': Act.lower(), + 'domain': Domain.lower(), + 'slot': '', + 'value': '' + } + converted_da['binary'].append(converted_dict) + if converted_dict not in binary_ont: + binary_ont.append(converted_dict) + continue + + reformated_domain_slot = Domain.lower() + '-' + reformated_slot + + if Act.lower() == 'request': + converted_dict = { + 'intent': 'request', + 'domain': Domain.lower(), + 'slot': reformated_slot, + 'value': '' + } + converted_da['binary'].append(converted_dict) + + if converted_dict not in binary_ont: + binary_ont.append(converted_dict) + continue + + # vs = da_dict[(Domain_Act, S)]['values'] + + if reformated_domain_slot in slot_to_type and slot_to_type[reformated_domain_slot] == 'cat': + origin_v = v + v = v.lower() + # if reformated_domain_slot in cat_slot_proj: + # v = cat_slot_proj[reformated_domain_slot][v] + if reformated_domain_slot not in da_cat_slot_values: + da_cat_slot_values[reformated_domain_slot] = [] + # if v not in cat_slot_values[reformated_domain_slot]: + da_cat_slot_values[reformated_domain_slot].append(v) + converted_da['categorical'].append({ + 'intent': Act.lower(), + 'domain': Domain.lower(), + 'slot': 
reformated_slot, + 'value': v + }) + if 'start_word' in da_dict[(Domain_Act, S, origin_v)]: + start_ws = da_dict[(Domain_Act, S, origin_v)]['start_word'] + end_ws = da_dict[(Domain_Act, S, origin_v)]['end_word'] + utt_list = utt.split() + for start_w, end_w in zip(start_ws, end_ws): + if start_w > len(utt_list) or end_w > len(utt_list): + continue + start_ch = 0 + for i in range(start_w): + start_ch += len(utt_list[i]) + 1 + end_ch = start_ch + for i in range(start_w, end_w): + end_ch += len(utt_list[i]) + 1 + try: + end_ch += len(utt_list[end_w]) + except: + print(utt_list, start_w, end_w) + if not utt[start_ch: end_ch] == origin_v: + # print('2222222222222222222222222') + # print('\n'.join([v, utt[start_ch: end_ch - 1]])) + continue + + else: + converted_da['categorical'][-1].update({ + 'start': start_ch, + 'end': end_ch + }) + break; + + else: + if 'start_word' not in da_dict[(Domain_Act, S, v)]: + # todo no span annotation + converted_da['non-categorical'].append({ + 'intent': Act.lower(), + 'domain': Domain.lower(), + 'slot': reformated_slot, + 'value': v + }) + continue + + start_ws = da_dict[(Domain_Act, S, v)]['start_word'] + end_ws = da_dict[(Domain_Act, S, v)]['end_word'] + utt_list = utt.split() + found = True + for start_w, end_w in zip(start_ws, end_ws): + if start_w > len(utt_list) or end_w > len(utt_list): + continue + start_ch = 0 + for i in range(start_w): + start_ch += len(utt_list[i]) + 1 + end_ch = start_ch + for i in range(start_w, end_w): + end_ch += len(utt_list[i]) + 1 + try: + end_ch += len(utt_list[end_w]) + except: + print(utt_list, start_w, end_w) + if not utt[start_ch: end_ch] == v: + # print('2222222222222222222222222') + # print('\n'.join([v, utt[start_ch: end_ch - 1]])) + continue + + else: + found = True + converted_da['non-categorical'].append({ + 'intent': Act.lower(), + 'domain': Domain.lower(), + 'slot': reformated_slot, + 'value': v, + 'start': start_ch, + 'end': end_ch + }) + break + + if not found: + 
converted_da['non-categorical'].append({ + 'intent': Act.lower(), + 'domain': Domain.lower(), + 'slot': reformated_slot, + 'value': v + }) + return converted_da + + +def get_state_update(prev_state, cur_state, dialog, did, tid, utt, coref_dict, slot_notfound_dict, da_cat_slot_values): + prev_turns = dialog['turns'] + state_update = {'categorical': [], 'non-categorical': []} + notfoundnum = 0 + total_value = 0 + + diff_state = {} + if prev_state is None: + diff_state = {domain: {slot: value for slot, value in cur_state[domain].items() if value != ''} for domain in + cur_state} + else: + assert len(prev_state) == len(cur_state), print(prev_state, cur_state) + for domain, domain_state in prev_state.items(): + if domain not in diff_state: + diff_state[domain] = {} + for slot, value in domain_state.items(): + if value != cur_state[domain][slot]: + # assert len(cur_state[domain][slot]) > 0, print(did, tid, domain, slot, utt) + diff_state[domain][slot] = cur_state[domain][slot] + + ret_diff_state = copy.deepcopy(diff_state) + + + + for domain in diff_state: + for slot in diff_state[domain]: + + total_value += 1 + fix_or = False + if '|' in diff_state[domain][slot]: + value = diff_state[domain][slot].split('|')[0] + else: + value = diff_state[domain][slot] + + # if dialog['original_id'] == 'PMUL2512' and tid == 17 and value == '02:45': + # value = '2:45' + + value_list = [value] + for _synonyms in synonyms: + if value in _synonyms: + value_list = _synonyms + + value_list.extend(get_time_variants(value)) + value_list.extend(get_genitive_variants(value)) + value_list.extend(get_bb_variants(value)) + + if value.endswith(' restaurant'): + value_list.append(value.split(' restaurant')[0]) + if value.endswith(' hotel'): + value_list.append(value.split(' hotel')[0]) + found = False + for value in value_list: + # categorical slots + if slot in ['internet', 'parking', 'pricerange', 'day', 'area', 'stars']: + reformated_domain_slot = '-'.join([domain, slot]) + if 
reformated_domain_slot in state_cat_slot_value_dict and (value in state_cat_slot_value_dict[reformated_domain_slot] or value in ['dontcare', '', 'none', 'not mentioned']): + state_update['categorical'].append({ + 'domain': domain, + 'slot': slot, + 'value': diff_state[domain][slot] + }) + if domain + '-' + slot not in da_cat_slot_values: + da_cat_slot_values[domain + '-' + slot] = [diff_state[domain][slot]] + da_cat_slot_values[domain + '-' + slot].append(diff_state[domain][slot]) + if value != diff_state[domain][slot]: + state_update['categorical'][-1].update({'fixed_value': value}) + ret_diff_state[domain][slot] = value + else : + for _turn in prev_turns[::-1]: + found = False + for da in _turn['dialogue_act']['categorical']: + if da['value'] == value: + if 'start' in da: + state_update['categorical'][-1].update({ + 'utt_idx': _turn['utt_idx'], + 'start': da['start'], + 'end': da['end'], + 'from': 'prev_da_span' + }) + found = True + break + if found: + break + else: + state_update['categorical'].append({ + 'domain': domain, + 'slot': slot, + 'value': diff_state[domain][slot], + 'fixed_value': 'not found' + }) + if domain + '-' + slot not in da_cat_slot_values: + da_cat_slot_values[domain + '-' + slot] = [] + da_cat_slot_values[domain + '-' + slot].append(diff_state[domain][slot]) + ret_diff_state[domain][slot] = 'not found' + notfoundnum += 1 + # reformated_domain_slot = '-'.join([domain, slot] + found = True + break + + # process value ---> none + assert value not in ['none', 'not mentioned'] + if value in ['', 'dontcare']: + # if reformated_domain_slot not in state_cat_slot_values: + # state_cat_slot_values[reformated_domain_slot] = [] + # # if v not in cat_slot_values[reformated_domain_slot]: + # state_cat_slot_values[reformated_domain_slot].append(value) + state_update['non-categorical'].append({ + 'domain': domain, + 'slot': slot, + 'value': diff_state[domain][slot] + }) + found = True + break + + # first look for values in coref_dict + for _Domain_Act, 
_Slot, _value in coref_dict: + _domain, _act = _Domain_Act.lower().split('-') + _slot = _Slot.lower() + _coref_value = coref_dict[(_Domain_Act, _Slot, _value)]['coref_value'] + if _coref_value == '': + continue + _coref_turn = coref_dict[(_Domain_Act, _Slot, _value)]['turn'] + if _coref_turn == -1: + continue + _coref_pos = coref_dict[(_Domain_Act, _Slot, _value)]['pos'] + if _coref_pos == '': + continue + _utt = coref_dict[(_Domain_Act, _Slot, _value)]['utt'] + if _domain == domain and _slot == slot and value == _coref_value: + + start_w, end_w = [int(p) for p in _coref_pos.split('-')] + utt_list = _utt.split() + start_ch = 0 + for i in range(start_w): + start_ch += len(utt_list[i]) + 1 + end_ch = start_ch + for i in range(start_w, end_w + 1): + end_ch += len(utt_list[i]) + 1 + end_ch -= 1 + + if not _utt[start_ch: end_ch] == _coref_value: + # print(111111111111111111111111111111111) + # print(_utt[start_ch: end_ch], _coref_value) + continue + + state_update['non-categorical'].append({ + 'domain': domain, + 'slot': slot, + 'value': diff_state[domain][slot], + 'from': 'coref', + 'utt_idx': _coref_turn, + 'start': start_ch, + 'end': end_ch + }) + if value != diff_state[domain][slot]: + state_update['categorical'][-1].update({'fixed_value': value}) + ret_diff_state[domain][slot] = value + found = True + + if found: + break + + # from da annotation + for _turn in prev_turns[::-1]: + for da in _turn['dialogue_act']['non-categorical']: + # if da['domain'] == domain and da['slot'] == slot and fuzz.ratio(da['value'], value) > 85: + # if not da['value'] == value: + # print(1111111111111111) + # print(value, da['value']) + + if fuzz.ratio(da['value'], value) > 85: + + if 'start' in da: + found = True + state_update['non-categorical'].append({ + 'domain': domain, + 'slot': slot, + # 'value': da['value'], + 'value': diff_state[domain][slot], + 'utt_idx': _turn['utt_idx'], + 'start': da['start'], + 'end': da['end'], + 'from': 'prev_da_span' + }) + if value != 
diff_state[domain][slot]: + state_update['non-categorical'][-1].update({'fixed_value': value}) + ret_diff_state[domain][slot] = value + if da['value'] != value: + state_update['non-categorical'][-1].update({'fixed_value':da['value']}) + ret_diff_state[domain][slot] = da['value'] + + break + if found: + break + + if found: + break + + # from utterance + for _turn in prev_turns[::-1]: + _utt = _turn['utterance'] + (start, end), num = pharse_in_sen(str(value), _utt) + if num: + assert value.lower() == _utt[start:end].lower() \ + or digit2word[value].lower() == _utt[start:end].lower() + found = True + state_update['non-categorical'].append({ + 'domain': domain, + 'slot': slot, + 'value': diff_state[domain][slot], + # 'value': _utt[start:end].lower(), + # 'fixed_value': _utt[start:end].lower(), + 'from': 'prev_utt', + 'utt_idx': _turn['utt_idx'], + 'start': start, + 'end': end + }) + if value != diff_state[domain][slot]: + state_update['non-categorical'][-1].update({'fixed_value': value}) + ret_diff_state[domain][slot] = value + if value != _utt[start:end].lower(): + state_update['non-categorical'][-1].update({'fixed_value': _utt[start:end].lower()}) + ret_diff_state[domain][slot] = _utt[start:end].lower() + found = True + break + if found: + break + + # from utterance + if not value.isdigit(): + for _turn in prev_turns[::-1]: + _utt = _turn['utterance'] + + s = difflib.SequenceMatcher(None, _utt, value) + matches = s.get_matching_blocks() + + for i, j, n in matches: + possible_value = _utt[i: i+len(value)] + + if i+ len(value) < len(_utt) and _utt[i+len(value)] not in [ ' ', ',', '.', '?', '!', '/'] : + possible_value += _utt[i+len(value):].split()[0] + + if possible_value.startswith('th '): + possible_value = possible_value[3:] + i += 3 + if i > 0 and _utt[i-1] not in [ ' ', ',', '.', '?', '!', '/']: + # cut first incomplete word + if len(possible_value.split()) > 1: + i += len(possible_value.split()[0]) + 1 + possible_value = ' '.join(possible_value.split()[1:]) + + 
+ # prepend first incomplete word + # possible_value = _utt[:i].split()[-1] + possible_value + # i -= len(_utt[:i].split()[-1]) + + + if fuzz.token_sort_ratio(value, possible_value) > 92 or possible_value.startswith('ashley hotel and lovell lodge') : + found = True + + state_update['non-categorical'].append({ + 'domain': domain, + 'slot': slot, + 'value': diff_state[domain][slot], + # 'value': possible_value, + # 'fixed_value': possible_value, + 'from':'prev_utt', + 'utt_idx': _turn['utt_idx'], + 'start': i, + 'end': i+len(possible_value) + }) + if value != diff_state[domain][slot]: + state_update['non-categorical'][-1].update({'fixed_value': value}) + ret_diff_state[domain][slot] = value + if possible_value != value: + state_update['non-categorical'][-1].update({'fixed_value': possible_value}) + ret_diff_state[domain][slot] = possible_value + break + # assert _utt[i:i+len(possible_value)] == possible_value, print(_utt, _utt[i:i+len(possible_value)], possible_value) + # break + # if not possible_value == value: + # print(3333333333333333) + # print(value) + # print(possible_value) + if found: + break + if found: + break + + if found: + break + if not found: + # print('3333333333333333333') + # print(did, tid) + # print(domain, slot, value) + # print([_t['utterance'] for _t in prev_turns]) + # assert slot not in ['internet', 'parking', 'pricerange', 'day', 'area', 'stars'] + + if (domain, slot) not in slot_notfound_dict: + slot_notfound_dict[(domain, slot)] = 1 + else: + slot_notfound_dict[(domain, slot)] += 1 + state_update['non-categorical'].append({ + 'domain': domain, + 'slot': slot, + 'value': diff_state[domain][slot], + 'fixed_value': 'not found' + }) + ret_diff_state[domain][slot] = 'not found' + notfoundnum += 1 + return state_update, notfoundnum, total_value, ret_diff_state + + +def merge_data_annotation(): + extract_dir = os.path.join(self_dir, 'original_data') + data25 = json.load(open(os.path.join(self_dir, extract_dir, 'data_meta_fixed.json'))) + # 
data21_train = json.load(open(os.path.join(self_dir, extract_dir, 'train.json'))) + # data21_val = json.load(open(os.path.join(self_dir, extract_dir, 'val.json'))) + # data21_test = json.load(open(os.path.join(self_dir, extract_dir, 'test.json'))) + # data21 = {} + # data21.update(data21_train) + # data21.update(data21_val) + # data21.update(data21_test) + + # update_from_25_cnt = 0 + # total_turn = 0 + # for dial_id, dialog in data21.items(): + # dial_id = dial_id + '.json' + # assert dial_id in data25 + # for i, _turn in enumerate(dialog['log']): + # total_turn += 1 + # if _turn['text'] == data25[dial_id]['log'][i]['text']: + # _turn['span_info'].extend(copy.deepcopy(data25[dial_id]['log'][i]['span_info'])) + # # _turn['span_info'] = list(set(_turn['span_info'])) + # # _turn['dialog_act'].update(copy.deepcopy(data25[dial_id]['log'][i]['dialog_act'])) + # for Domain_Intent in data25[dial_id]['log'][i]['dialog_act']: + # if Domain_Intent in _turn['dialog_act']: + # _turn['dialog_act'][Domain_Intent].extend(data25[dial_id]['log'][i]['dialog_act'][Domain_Intent]) + # else: + # _turn['dialog_act'][Domain_Intent] = copy.deepcopy(data25[dial_id]['log'][i]['dialog_act'][Domain_Intent]) + # # _turn['dialog_act'][Domain_Intent] = list(set(_turn['dialog_act'][Domain_Intent])) + # if 'coreference' in data25[dial_id]['log'][i]: + # _turn['coreference'] = copy.deepcopy(data25[dial_id]['log'][i]['coreference']) + # update_from_25_cnt += 1 + # else: + # # print('==============multiwoz21=================') + # # print(_turn['text']) + # # print('==============multiwoz25=================') + # # print(data25[dial_id]['log'][i]['text']) + # continue + + # print('{}/{} turns update from multiwoz25 data'.format(update_from_25_cnt, total_turn)) + return data25 + + +def preprocess(da_cat_slot_values, state_cat_slot_values): + all_data = [] + binary_ont = [] + intent_ont = {} + state_ont = {} + + data_splits = ['train', 'val', 'test'] + # data_splits = ['test'] + extract_dir = 
os.path.join(self_dir, 'original_data') + num_train_dialogue = 0 + num_train_utt = 0 + + num_match_error_da_span = 0 + + if not os.path.exists('data.zip') or not os.path.exists('ontology.json'): + # for data_split in data_splits: + data_zip_file = os.path.join(self_dir, 'original_data.zip') + if not os.path.exists(data_zip_file): + raise FileNotFoundError(data_zip_file) + + logging.info('unzip multiwoz data to {}'.format(extract_dir)) + archive = zipfile.ZipFile(data_zip_file, 'r') + archive.extractall(extract_dir) + + data = merge_data_annotation() + # exit() + # data = json.load(open(os.path.join(self_dir, extract_dir, 'data_meta_fixed.json'))) + train_list = open(os.path.join(self_dir, extract_dir, 'trainListFile')).read().split() + val_list = open(os.path.join(self_dir, extract_dir, 'valListFile')).read().split() + test_list = open(os.path.join(self_dir, extract_dir, 'testListFile')).read().split() + + total_not_found_slot = 0 + total_slot = 0 + total_turn = 0 + total_not_found_turn = 0 + total_not_found_state = 0 + + slot_notfound_dict = {} + + dialog_idx = 0 + for dialog_id, dialog in tqdm(data.items()): + + acc_not_found_flag = False + + coref_dict = {} + + data_split = None + for _split in data_splits: + if dialog_id.strip('.json') in eval(_split + '_list'): + data_split = _split + break + # assert data_split is not None + # if data_split != 'test': + # continue + + if data_split == 'train': + num_train_dialogue += len(data) + + dialog_idx += 1 + # if dialog_idx > 10: + # break + converted_dialogue = { + 'dataset': 'multiwoz23', + 'data_split': data_split, + 'dialogue_id': 'multiwoz23_' + str(dialog_idx), + 'original_id': dialog_id, + 'domains': [d for d in dialog['goal'] if + len(dialog['goal'][d]) != 0 and d in multiwoz_desc and d not in ['police', 'hospital', 'bus']], + 'turns': [], + } + + if data_split == 'train': + num_train_utt += len(dialog['log']) + + prev_state = None + accum_fixed_state = {} + for turn_id, turn in enumerate(dialog['log']): + + 
utt = turn['text'].lower() + # for several wrong words + utt = utt.replace('seeuni', 'see uni') + + utt = ' '.join(utt.split()) + das = turn['dialog_act'] + role = 'user' if turn_id % 2 == 0 else 'system' + spans = turn['span_info'] + + da_dict = {} + for Domain_Act in das: + Domain = Domain_Act.split('-')[0] + if Domain.lower() not in converted_dialogue['domains'] and Domain.lower() not in ['general', 'booking']: + continue + + Svs = das[Domain_Act] + for S, v in Svs: + v = v.lower() + if v.startswith('th '): + # print(v) + v = v[3:] + if v.startswith('he '): + # print(v) + v = v[3:] + + if (Domain_Act, S, v) not in da_dict: + da_dict[(Domain_Act, S, v)] = {} + + for span in spans: + Domain_Act, S, v, start_word, end_word = span + v = v.lower() + if not (Domain_Act, S, v) in da_dict: + # logging.info('span da annotation not found in multiwoz da label') + # logging.info(dialog_id, turn_id) + # logging.info((Domain_Act, S, v)) + # logging.info(da_dict) + num_match_error_da_span += 1 + else: + if v.startswith('th '): + # print(v) + v = v[3:] + start_word += 3 + if v.startswith('he '): + # print(v) + v = v[3:] + start_word += 3 + + if 'start_word' not in da_dict[(Domain_Act, S, v)]: + da_dict[(Domain_Act, S, v)]['start_word'] = [] + da_dict[(Domain_Act, S, v)]['end_word'] = [] + + da_dict[(Domain_Act, S, v)]['start_word'].append(start_word) + da_dict[(Domain_Act, S, v)]['end_word'].append(end_word) + + converted_turn = { + 'utt_idx': turn_id, + 'speaker': role, + 'utterance': utt, + 'dialogue_act': convert_da(utt, da_dict, binary_ont, intent_ont, dialog_id, turn_id, da_cat_slot_values), + } + + # for state annotations + if role == 'system': + turn_state = turn['metadata'] + cur_state = {} + for domain in turn_state: + if domain in ['police', 'hospital', 'bus']: + continue + if domain not in converted_dialogue['domains']: + continue + cur_state[domain] = {} + for subdomain in ['semi', 'book']: + for slot in turn_state[domain][subdomain]: + if slot == 'booked': + 
continue + if slot == 'ticket': # or (domain == 'train' and slot == 'people'): + # for cases where domain slot exists in REF but not in state + # because of check in evaluate.py + continue + + else: + fixed_slot = slot + state_ds = domain + '-' + fixed_slot + if state_ds not in slot_to_type: + logging.info('state slot not defined in da list') + logging.info(state_ds) + if turn_state[domain][subdomain][slot] in ['', [], 'not mentioned', 'none']: + cur_state[domain][fixed_slot] = "" + else: + if turn_state[domain][subdomain][slot].startswith('th '): + # print('state') + # print(turn_state[domain][subdomain][slot]) + turn_state[domain][subdomain][slot] = turn_state[domain][subdomain][slot][3:] + if turn_state[domain][subdomain][slot].startswith('he '): + # print('state') + # print(turn_state[domain][subdomain][slot]) + turn_state[domain][subdomain][slot] = turn_state[domain][subdomain][slot][3:] + + cur_state[domain][fixed_slot] = turn_state[domain][subdomain][slot] + + if domain not in state_ont: + state_ont[domain] = [] + if fixed_slot not in state_ont[domain]: + state_ont[domain].append(fixed_slot) + + if domain == 'train' and 'people' not in cur_state[domain]: + cur_state[domain]['people'] = '' + # if len(converted_turn['state'][domain]) == 0: + # converted_turn['state'].pop(domain) + if len(converted_dialogue['turns']) > 0: + # move state from system side to user side + converted_dialogue['turns'][-1]['state'] = copy.deepcopy(cur_state) + + # for state update annotations + state_update, _notfoundslot, _totalslot, ret_diff_state = get_state_update(prev_state, cur_state, converted_dialogue, + dialog_id, turn_id, turn['text'], coref_dict, + slot_notfound_dict, da_cat_slot_values) + + update_state(accum_fixed_state, ret_diff_state) + for domain in accum_fixed_state: + for slot in accum_fixed_state[domain]: + assert isinstance(accum_fixed_state[domain][slot], str), print(accum_fixed_state[domain][slot]) + + if _notfoundslot == 0: + # for slot in 
state_update['categorical']: + # assert 'fixed_value' not in slot + for slot in state_update['non-categorical']: + if slot['value'] not in ['', 'dontcare']: + assert 'utt_idx' in slot + + else: + flag = False + for slot in state_update['categorical']: + if 'fixed_value' in slot: + flag = True + break + for slot in state_update['non-categorical']: + if 'utt_idx' not in slot: + flag = True + break + assert flag, print(flag, state_update['non-categorical']) + + total_turn += 1 + total_slot += _totalslot + total_not_found_slot += _notfoundslot + total_not_found_turn += 1 if _notfoundslot > 0 else 0 + if _notfoundslot > 0: + acc_not_found_flag = True + if acc_not_found_flag: + total_not_found_state += 1 + + coref_dict = {} + converted_dialogue['turns'][-1]['state_update'] = copy.deepcopy(state_update) + converted_dialogue['turns'][-1]['fixed_state'] = copy.deepcopy(accum_fixed_state) + if 'state' not in converted_dialogue['turns'][-1]: + converted_dialogue['turns'][-1]['state'] = {} + prev_state = copy.deepcopy(cur_state) + + converted_dialogue['turns'].append(converted_turn) + + if 'coreference' in turn: + for Domain_Act in turn['coreference']: + for Slot, value, coref, coref_turn, coref_pos in turn['coreference'][Domain_Act]: + value = value.lower() + coref_dict[(Domain_Act, Slot, value)] = {'turn': coref_turn, 'pos': coref_pos, + 'coref_value': coref, + 'utt': converted_dialogue['turns'][coref_turn][ + 'utterance']} + + check_spans(converted_dialogue) + postprocess_update_spans(converted_dialogue) + if converted_dialogue['turns'][-1]['speaker'] == 'system': + converted_dialogue['turns'].pop(-1) + all_data.append(converted_dialogue) + + print('total_turn', total_turn) + print('total_not_found_turn', total_not_found_turn) + print('total_slot', total_slot) + print('total_not_found_slot', total_not_found_slot) + print('total_not_found_state', total_not_found_state) + print(slot_notfound_dict) + from collections import Counter + # print({k : dict(Counter(v)) for k, v in 
cat_slot_values.items()}) + json.dump({k : dict(Counter(v)) for k, v in state_cat_slot_values.items()}, open(os.path.join(self_dir, 'cat_slot_values.json'), 'w'), indent=4) + cat_slot_values = {k: list(set(v)) for k, v in state_cat_slot_values.items()} + da_cat_slot_values = {k: list(set(v)) for k, v in da_cat_slot_values.items()} + + json.dump(all_data, open('data.json', 'w'), indent=4) + write_zipped_json(os.path.join(self_dir, './data.zip'), 'data.json') + os.remove('data.json') + + new_ont = { + 'domains': {}, + 'intents': {}, + 'binary_dialogue_act': {} + } + + for d_s in slot_to_type: + d, s = d_s.split('-') + if d not in new_ont['domains']: + new_ont['domains'][d] = { + 'description': multiwoz_desc[d]['domain'], + 'slots': {} + } + domain_ont = new_ont['domains'][d] + assert s not in domain_ont + domain_ont['slots'][s] = { + 'description': multiwoz_desc[d][s] if s in multiwoz_desc[d] else '', + 'is_categorical': d_s in state_cat_slot_ds, + 'possible_values': da_cat_slot_values[d_s] if d_s in state_cat_slot_ds else [] + } + domain_ont['slots'][s]['possible_values'] = [_ for _ in domain_ont['slots'][s]['possible_values'] if _ not in ['dontcare', '']] + + new_ont['state'] = {} + # print(state_cat_slot_value_dict) + print(state_ont) + for d in state_ont: + new_ont['state'][d] = {} + for s in state_ont[d]: + d_s = '-'.join([d, s]) + new_ont['state'][d][s] = '' + # new_ont['state'][d][s] = { + # 'description': multiwoz_desc[d][s] if s in multiwoz_desc[d] else '', + # 'is_categorical': d_s in state_cat_slot_value_dict, + # 'possible_values': list(state_cat_slot_value_dict[d_s].keys()) if d_s in state_cat_slot_value_dict else [] + # } + # new_ont['state'][d][s]['possible_values'] = [_ for _ in new_ont['state'][d][s]['possible_values'] if + # _ != 'dontcare'] + + new_ont['intents'] = {i: {'description': multiwoz_desc['intents'][i]} for i in intent_ont} + new_ont['binary_dialogue_act'] = binary_ont + + slot_desc = json.load(open(os.path.join(self_dir, extract_dir, 
'./slot_descriptions.json'))) + for domain_slot in slot_desc: + _domain, _slot = domain_slot.split('-') + _desc = slot_desc[domain_slot][0] + if _slot == 'arriveby': + _slot = 'arriveBy' + elif _slot == 'leaveat': + _slot = 'leaveAt' + if 'book' in _slot: + _slot = _slot.replace('book ', '') + if not _domain in new_ont['state']: + # logging.info('domain {} not in state domains'.format(_domain)) + continue + if _domain in new_ont['domains'] and _slot in new_ont['domains'][_domain]['slots']: + new_ont['domains'][_domain]['slots'][_slot]['description'] = _desc + if not _slot in new_ont['state'][_domain]: + logging.info('domain {} slot {} not in state'.format(_domain, _slot)) + continue + # new_ont['state'][_domain][_slot] = "" + assert _domain in new_ont['domains'], print(_domain) + assert _slot in new_ont['domains'][_domain]['slots'] + + logging.info('num_match_error_da_span {}'.format(num_match_error_da_span)) + json.dump(new_ont, open(os.path.join(self_dir, './ontology.json'), 'w'), indent=4) + + else: + all_data = read_zipped_json(os.path.join(self_dir, './data.zip'), 'data.json') + new_ont = json.load(open(os.path.join(self_dir, './ontology.json'), 'r')) + logging.info('# dialogue: {}, # turn: {}'.format(num_train_dialogue, num_train_utt)) + return all_data, new_ont + + +def postprocess_update_spans(dialog): + changed_utt_idx_and_position = {} + for turn in dialog['turns']: + if turn['speaker'] != 'user': + continue + changed = False + for _update in turn['state_update']['non-categorical']: + if 'utt_idx' in _update: + utt_idx = _update['utt_idx'] + start = _update['start'] + end = _update['end'] + + # assume at most one word changes for every utterance + if turn['utt_idx'] not in changed_utt_idx_and_position: + if utt_idx == turn['utt_idx'] and start-1 > -1 and turn['utterance'][start-1] not in [' ']: + changed_utt_idx_and_position[turn['utt_idx']] = start + print('=======================') + print(dialog['original_id']) + print(turn['utterance']) + 
print(json.dumps(_update, indent=2)) + print(turn['utterance'][start: end]) + turn['utterance'] = turn['utterance'][:start] + ' ' + turn['utterance'][start:] + print(turn['utterance']) + _update['start'] += 1 + _update['end'] += 1 + changed = True + if utt_idx not in changed_utt_idx_and_position: + continue + else: + value = _update['fixed_value'] if 'fixed_value' in _update and _update['fixed_value'] != 'not found' else _update['value'] + if start >= changed_utt_idx_and_position[utt_idx]: + if dialog['turns'][utt_idx]['utterance'][_update['start']: _update['end']] != value: + assert dialog['turns'][utt_idx]['utterance'][_update['start']+1: _update['end']+1] == value, print(dialog['turns'][utt_idx]['utterance'], dialog['turns'][utt_idx]['utterance'][_update['start']+1: _update['end']+1]) + _update['start'] += 1 + _update['end'] += 1 + elif start < changed_utt_idx_and_position[utt_idx] < end: + if dialog['turns'][utt_idx]['utterance'][_update['start']: _update['end']] != value: + assert (dialog['turns'][utt_idx]['utterance'][_update['start']: _update['end']+1]).replace(' ', '') == value.replace(' ', ''), print(dialog['turns'][utt_idx]['utterance'], dialog['turns'][utt_idx]['utterance'][_update['start']: _update['end']+1], value) + print('fix') + print(_update) + _update['end'] += 1 + _update['fixed_value'] = turn['utterance'][_update['start']: _update['end'] + 1].strip() + print(_update) + if changed: + for _update in turn['state_update']['non-categorical']: + if 'utt_idx' in _update: + utt_idx = _update['utt_idx'] + start = _update['start'] + end = _update['end'] + + if utt_idx not in changed_utt_idx_and_position: + continue + else: + value = _update['fixed_value'] if 'fixed_value' in _update and _update[ + 'fixed_value'] != 'not found' else _update['value'] + if start >= changed_utt_idx_and_position[utt_idx]: + if dialog['turns'][utt_idx]['utterance'][_update['start']: _update['end']] != value: + assert dialog['turns'][utt_idx]['utterance'][_update['start'] + 1: 
# Matches a clock time such as "4:00", "4:00pm" or "04:00 AM".
# Groups: (hh:mm, optional single whitespace, optional am/pm marker).
_TIME_PATTERN = re.compile(r'(\d{1,2}:\d{2})(\s)?(am|pm|AM|PM)?')

# Matches a word-final "s" that is followed by whitespace, e.g. the "s" in
# "rosies kitchen".  The look-behind makes sure the "s" really ends a word
# instead of being a stand-alone token.
_GENITIVE_PATTERN = re.compile(r"(?<=\w)s(?=\s)")


def get_time_variants(time_text):
    """Return alternative spellings of the first clock time in *time_text*.

    The result always starts with *time_text* itself, followed by variants
    that may appear in an utterance (12h/24h conversion, with or without
    an am/pm suffix, zero padded, without the colon, ...).  Duplicates are
    possible and the order is significant for callers that try the
    variants one after another.

    :param time_text: text containing a time such as ``"16:00"`` or
        ``"4:00pm"``.
    :return: list of variant strings, or an empty list when *time_text*
        contains nothing that looks like ``h:mm``.
    """
    match_times = _TIME_PATTERN.findall(time_text)
    if not match_times:
        return []

    value_list = [time_text]
    clock, _, suffix = match_times[0]
    suffix = suffix.lower()

    if suffix == 'am':
        # 4:00am -> 4:00
        value_list.append(clock)
        if len(clock) == 4:
            # 4:00 -> 04:00
            value_list.append('0' + clock)
    elif suffix == 'pm':
        # 4:00pm -> 16:00, but 12:30pm -> 12:30 (noon is not 24:30).
        hour, minute = clock.split(':')
        value_list.append(str(int(hour) % 12 + 12) + ':' + minute)
    else:
        # No am/pm marker: treat the text as 24h time and derive 12h forms.
        hour, minute = clock.split(':')
        hour = int(hour)
        # 12:xx is noon and therefore "pm"; 0:xx is spoken as "12:xx am".
        meridiem = 'pm' if hour >= 12 else 'am'
        hour_12 = str(hour % 12 or 12)
        value_list.append(hour_12 + ':' + minute + meridiem)
        value_list.append(hour_12 + ':' + minute + ' ' + meridiem)
        value_list.append(hour_12 + ':' + minute)
        if minute == '00':
            # Full hours are often written without minutes: "4pm", "4 pm", "4".
            value_list.append(hour_12 + meridiem)
            value_list.append(hour_12 + ' ' + meridiem)
            value_list.append(hour_12)
        if len(clock) == 5 and clock[0] == '0':
            # 09:15 -> 9:15
            value_list.append(clock[1:])
        # 16:00 -> 1600
        value_list.append(''.join(clock.split(':')))

    return value_list


def get_genitive_variants(value):
    """Return genitive spellings of *value* (``"rosies x"`` -> ``"rosie's x"``).

    Every word-final ``s`` that is followed by whitespace is rewritten once
    as ``" 's"`` and once as ``"'s"``.

    :param value: slot value to rewrite.
    :return: list with the rewritten strings that differ from *value*;
        empty when *value* has no such ``s``.
    """
    ret_list = []
    for replacement in (" 's", "'s"):
        span_genitive_value = _GENITIVE_PATTERN.sub(replacement, value)
        if span_genitive_value != value:
            ret_list.append(span_genitive_value)
    return ret_list


def check_spans(dialog):
    """Verify that every annotated span of *dialog* matches its value.

    For each non-categorical state update of a user turn that carries an
    ``utt_idx``, the slice ``utterance[start:end]`` of the referenced turn
    must equal ``fixed_value`` (when present and not ``'not found'``) or
    otherwise ``value``.

    :param dialog: dialogue dict with a ``'turns'`` list.
    :raises AssertionError: if a span does not reproduce its value.
    """
    for turn in dialog['turns']:
        if turn['speaker'] != 'user':
            continue
        for _update in turn['state_update']['non-categorical']:
            if 'utt_idx' not in _update:
                continue
            fixed_value = _update.get('fixed_value', 'not found')
            value = fixed_value if fixed_value != 'not found' else _update['value']
            utterance = dialog['turns'][_update['utt_idx']]['utterance']
            span_text = utterance[_update['start']:_update['end']]
            if span_text != value:
                # Raised explicitly so the check survives ``python -O`` and
                # the error carries a usable message.
                raise AssertionError(
                    'span {!r} does not match value {!r} in utterance {!r}'.format(
                        span_text, value, utterance))


def get_bb_variants(value):
    """Return *value* with ``'bed and breakfast'`` abbreviated to ``'b & b'``.

    :param value: slot value to rewrite.
    :return: one-element list with the abbreviated string, or an empty
        list when *value* does not contain ``'bed and breakfast'``.
    """
    if 'bed and breakfast' in value:
        return [value.replace('bed and breakfast', 'b & b')]
    return []


if __name__ == '__main__':
    preprocess(da_cat_slot_values, state_cat_slot_values)
+- ~~numerical slot => non-categorical, use string match to get the span.~~ +- add binary_dialogue_act for those binary intents such as 'goodbye', 'request'. +- add a non-categorical, numerical **count** slot for each domain; it does not appear in the belief state. +- the system state is updated from the previous user frame['state']. +- calculate the state update according to prev state and slot spans in current turn slot_vals and all previous dialogue acts. 99.6% of non-categorical state updates have spans, while the rest are cases like "Could you help me search for songs from **two years back** too?" +- values in possible values, dialogue act, state, and state_update are in **lowercase**. + +Notice: + +- for a categorical slot, the value may be **dontcare**, which is not present in **possible_values**. + +## Original data + +The Schema-Guided Dialogue (SGD) dataset consists of over 20k annotated +multi-domain, task-oriented conversations between a human and a virtual +assistant. These conversations involve interactions with services and APIs +spanning 20 domains, ranging from banks and events to media, calendar, travel, +and weather. For most of these domains, the dataset contains multiple different +APIs, many of which have overlapping functionalities but different interfaces, +which reflects common real-world scenarios. The wide range of available +annotations can be used for intent prediction, slot filling, dialogue state +tracking, policy imitation learning, language generation, user simulation +learning, among other tasks in large-scale virtual assistants. Besides these, +the dataset has unseen domains and services in the evaluation set to quantify +the performance in zero-shot or few shot settings. + +[[paper]](https://arxiv.org/abs/1909.05855) [[download link]](https://github.com/google-research-datasets/dstc8-schema-guided-dialogue) + +### Schema Representation + +A service or API is essentially a set of functions (called intents), each taking +a set of parameters (called slots). 
A schema is a normalized representation of +the interface exposed by a service/API. In addition, the schema also includes +natural language description of the included functions and their parameters to +outline the semantics of each element. The schemas have been manually generated +by the dataset creators. The schema for a service contains the following fields: + +* **service_name** - A unique name for the service. +* **description** - A natural language description of the tasks supported by + the service. +* **slots** - A list of slots/attributes corresponding to the entities present + in the service. Each slot contains the following fields: + * **name** - The name of the slot. + * **description** - A natural language description of the slot. + * **is_categorical** - A boolean value. If it is true, the slot has a + fixed set of possible values. + * **possible_values** - List of possible values the slot can take. If the + slot is a categorical slot, it is a complete list of all the possible + values. If the slot is a non categorical slot, it is either an empty + list or a small sample of all the values taken by the slot. +* **intents** - The list of intents/tasks supported by the service. Each + method contains the following fields: + * **name** - The name of the intent. + * **description** - A natural language description of the intent. + * **is_transactional** - A boolean value. If true, indicates that the + underlying API call is transactional (e.g, a booking or a purchase), as + opposed to a search call. + * **required_slots** - A list of slot names whose values must be provided + before making a call to the service. + * **optional_slots** - A dictionary mapping slot names to the default + value taken by the slot. These slots may be optionally specified by the + user and the user may override the default value. An empty default value + allows that slot to take any value by default, but the user may override + it. 
+ * **result_slots** - A list of slot names which are present in the results + returned by a call to the service or API. + +### Dialogue Representation + +The dialogue is represented as a list of turns, where each turn contains either +a user or a system utterance. The annotations for a turn are grouped into +frames, where each frame corresponds to a single service. Each turn in the +single domain dataset contains exactly one frame. In multi-domain datasets, some +turns may have multiple frames. + +Each dialogue is represented as a json object with the following fields: + +* **dialogue_id** - A unique identifier for a dialogue. +* **services** - A list of services present in the dialogue. +* **turns** - A list of annotated system or user utterances. + +Each turn consists of the following fields: + +* **speaker** - The speaker for the turn. Possible values are "USER" or + "SYSTEM". +* **utterance** - A string containing the natural language utterance. +* **frames** - A list of frames, each frame containing annotations for a + single service. + +Each frame consists of the fields listed below. The fields marked with * will +be excluded from all user turns in the test data released to the participants. + +* **service** - The name of the service corresponding to the frame. The slots + and intents used in the following fields are taken from the schema of this + service. +* **slots** - A list of slot spans in the utterance, only provided for + non-categorical slots. Each slot span contains the following fields: + * **slot** - The name of the slot. + * **start** - The index of the starting character in the utterance + corresponding to the slot value. + * **exclusive_end** - The index of the character just after the last + character corresponding to the slot value in the utterance. In python, + `utterance[start:exclusive_end]` gives the slot value. +* **actions** - A list of actions corresponding to the system. 
Each action has + the following fields: + * **act** - The type of action. The list of all possible system acts is + given below. + * **slot** (optional) - A slot argument for some of the actions. + * **values** (optional) - A list of values assigned to the slot. If the + values list is non-empty, then the slot must be present. + * **canonical_values** (optional) - The values in their canonicalized form + as used by the service. It is a list of strings of the same length as + values. +* **service_call** (system turns only, optional) - The request sent to the + service. It consists of the following fields: + * **method** - The name of the intent or function of the service or API + being executed. + * **parameters** - A dictionary mapping slot name (all required slots and + possibly some optional slots) to a value in its canonicalized form. +* **service_results** (system turns only, optional) - A list of entities + containing the results obtained from the service. It is only available for + turns in which a service call is made. Each entity is represented as a + dictionary mapping a slot name to a string containing its canonical value. +* **state** (user turns only) - The dialogue state corresponding to the + service. It consists of the following fields: + * **active_intent** - The intent corresponding to the service of the frame + which is currently being fulfilled by the system. It takes the value + "NONE" if none of the intents are active. + * **requested_slots** - A list of slots requested by the user in the + current turn. + * **slot_values** - A dictionary mapping slot name to a list of strings. + For categorical slots, this list contains a single value assigned to the + slot. For non-categorical slots, all the values in this list are spoken + variations of each other and are equivalent (e.g, "6 pm", "six in the + evening", "evening at 6" etc.). + +List of possible system acts: + +* **INFORM** - Inform the value for a slot to the user. 
The slot and values + fields in the corresponding action are always non-empty. +* **REQUEST** - Request the value of a slot from the user. The corresponding + action always contains a slot, but values are optional. When values are + present, they are used as examples for the user e.g, "Would you like to eat + indian or chinese food or something else?" +* **CONFIRM** - Confirm the value of a slot before making a transactional + service call. +* **OFFER** - Offer a certain value for a slot to the user. The corresponding + action always contains a slot and a list of values for that slot offered to + the user. +* **NOTIFY_SUCCESS** - Inform the user that their request was successful. Slot + and values are always empty in the corresponding action. +* **NOTIFY_FAILURE** - Inform the user that their request failed. Slot and + values are always empty in the corresponding action. +* **INFORM_COUNT** - Inform the number of items found that satisfy the user's + request. The corresponding action always has "count" as the slot, and a + single element in values for the number of results obtained by the system. +* **OFFER_INTENT** - Offer a new intent to the user. Eg, "Would you like to + reserve a table?". The corresponding action always has "intent" as the slot, + and a single value containing the intent being offered. The offered intent + belongs to the service corresponding to the frame. +* **REQ_MORE** - Asking the user if they need anything else. Slot and values + are always empty in the corresponding action. +* **GOODBYE** - End the dialogue. Slot and values are always empty in the + corresponding action. + +List of possible user acts: + +* **INFORM_INTENT** - Express the desire to perform a certain task to the + system. The action always has "intent" as the slot and a single value + containing the intent being informed. +* **NEGATE_INTENT** - Negate the intent which has been offered by the system. 
+* **AFFIRM_INTENT** - Agree to the intent which has been offered by the + system. +* **INFORM** - Inform the value of a slot to the system. The slot and values + fields in the corresponding action are always non-empty. +* **REQUEST** - Request the value of a slot from the system. The corresponding + action always contains a slot parameter. It may optionally contain a value, + in which case, the user asks the system if the slot has the specified value. +* **AFFIRM** - Agree to the system's proposition. Slot and values are always + empty. +* **NEGATE** - Deny the system's proposal. Slot and values are always empty. +* **SELECT** - Select a result being offered by the system. The corresponding + action may either contain no parameters, in which case all the values + proposed by the system are being accepted, or it may contain a slot and + value parameters, in which case the specified slot and value are being + accepted. +* **REQUEST_ALTS** - Ask for more results besides the ones offered by the + system. Slot and values are always empty. +* **THANK_YOU** - Thank the system. Slot and values are always empty. +* **GOODBYE** - End the dialogue. Slot and values are always empty. + +### Dataset Statistics + +The dataset consists of two kinds of dialogues. + +| Type of Dialogue | Train files | Dev files | Test Files | +| ---------------- | :------------------------------------------: | :------------------------------------------: | :------------------------------------------: | +| Single Domain | `dialogues_001.json` to `dialogues_043.json` | `dialogues_001.json` to `dialogues_007.json` | `dialogues_001.json` to `dialogues_011.json` | +| Multi Domain | `dialogues_044.json` to `dialogues_127.json` | `dialogues_008.json` to `dialogues_020.json` | `dialogues_012.json` to `dialogues_034.json` | + +The single domain dialogues involve interactions with a single service, possibly +over multiple intents. 
The multi-domain dialogues have interactions involving +intents belonging to two or more different services. The multi-domain dialogues +also involve transfer of dialogue state values from one service to the other +wherever such a transfer is deemed natural. Eg, if a user finds a restaurant and +searches for a movie next, the dialogue state for movie service is already +initialized with the location from the dialogue state for restaurant service. + +The overall statistics of the train and dev sets are given below. The term +*informable slots* refers to the slots over which the user can specify a +constraint. For example, slots like *phone_number* are not informable. + +<table> + <tr> + <th rowspan="2"></th> + <th colspan="3">Train</th><th colspan="3">Dev</th><th colspan="3">Test</th> + </tr> + <tr> + <td>Single-domain</td> + <td>Multi-domain</td> + <td>Combined</td> + <td>Single-domain</td> + <td>Multi-domain</td> + <td>Combined</td> + <td>Single-domain</td> + <td>Multi-domain</td> + <td>Combined</td> + </tr> + <tr> + <td>No. of dialogues</td> + <td align="center">5,403</td> + <td align="center">10,739</td> + <td align="center">16,142</td> + <td align="center">836</td> + <td align="center">1,646</td> + <td align="center">2,482</td> + <td align="center">1,331</td> + <td align="center">2,870</td> + <td align="center">4,201</td> + </tr> + <tr> + <td>No. of turns</td> + <td align="center">82,588</td> + <td align="center">247,376</td> + <td align="center">329,964</td> + <td align="center">11,928</td> + <td align="center">36,798</td> + <td align="center">48,726</td> + <td align="center">16,850</td> + <td align="center">67,744</td> + <td align="center">84,594</td> + </tr> + <tr> + <td>No. 
of tokens (lower-cased)</td> + <td align="center">807,562</td> + <td align="center">2,409,857</td> + <td align="center">3,217,419</td> + <td align="center">117,492</td> + <td align="center">353,381</td> + <td align="center">470,873</td> + <td align="center">166,329</td> + <td align="center">713,731</td> + <td align="center">880,060</td> + </tr> + <tr> + <td>Average turns per dialogue</td> + <td align="center">15.286</td> + <td align="center">23.035</td> + <td align="center">20.441</td> + <td align="center">14.268</td> + <td align="center">22.356</td> + <td align="center">19.632</td> + <td align="center">12.660</td> + <td align="center">23.604</td> + <td align="center">20.137</td> + </tr> + <tr> + <td>Average tokens per turn</td> + <td align="center">9.778</td> + <td align="center">9.742</td> + <td align="center">9.751</td> + <td align="center">9.850</td> + <td align="center">9.603</td> + <td align="center">9.664</td> + <td align="center">9.871</td> + <td align="center">10.536</td> + <td align="center">10.403</td> + </tr> + <tr> + <td>Total unique tokens (lower-cased)</td> + <td align="center">16,350</td> + <td align="center">25,459</td> + <td align="center">30,349</td> + <td align="center">6,803</td> + <td align="center">10,533</td> + <td align="center">12,719</td> + <td align="center">7,213</td> + <td align="center">14,888</td> + <td align="center">16,382</td> + </tr> + <tr> + <td>Total no. of slots</td> + <td align="center">201</td> + <td align="center">214</td> + <td align="center">214</td> + <td align="center">134</td> + <td align="center">132</td> + <td align="center">136</td> + <td align="center">157</td> + <td align="center">158</td> + <td align="center">159</td> + </tr> + <tr> + <td>Total no. 
of informable slots</td> + <td align="center">138</td> + <td align="center">144</td> + <td align="center">144</td> + <td align="center">89</td> + <td align="center">87</td> + <td align="center">89</td> + <td align="center">109</td> + <td align="center">110</td> + <td align="center">111</td> + </tr> + <tr> + <td>Total unique slot values (lower-cased)</td> + <td align="center">7,070</td> + <td align="center">11,635</td> + <td align="center">14,139</td> + <td align="center">2,418</td> + <td align="center">4,182</td> + <td align="center">5,101</td> + <td align="center">2,492</td> + <td align="center">5,847</td> + <td align="center">6,533</td> + </tr> + <tr> + <td>Total unique informable slot values (lower-cased)</td> + <td align="center">3,742</td> + <td align="center">6,348</td> + <td align="center">7,661</td> + <td align="center">1,137</td> + <td align="center">2,118</td> + <td align="center">2,524</td> + <td align="center">1,387</td> + <td align="center">3,323</td> + <td align="center">3,727</td> + </tr> + <tr> + <td>Total domains</td> + <td align="center">14</td> + <td align="center">16</td> + <td align="center">16</td> + <td align="center">16</td> + <td align="center">15</td> + <td align="center">16</td> + <td align="center">17</td> + <td align="center">18</td> + <td align="center">18</td> + </tr> + <tr> + <td>Total services</td> + <td align="center">24</td> + <td align="center">26</td> + <td align="center">26</td> + <td align="center">17</td> + <td align="center">16</td> + <td align="center">17</td> + <td align="center">20</td> + <td align="center">21</td> + <td align="center">21</td> + </tr> + <tr> + <td>Total intents</td> + <td align="center">35</td> + <td align="center">37</td> + <td align="center">37</td> + <td align="center">28</td> + <td align="center">26</td> + <td align="center">28</td> + <td align="center">33</td> + <td align="center">34</td> + <td align="center">35</td> + </tr> +</table> + + +The following table shows how the dialogues and services are 
distributed among +different domains for the train and dev sets. In this table, each multi-domain +dialogue contributes to the count of every service present in the dialogue. +Please note that a few domains like *Travel* and *Weather* are only present in +the dev set. This is to test the generalization of models on unseen domains. The +test set will similarly have some unseen domains which are neither present in +the training nor in the dev set. Also, the number in parentheses represents the +number of unique services belonging to the corresponding domain. + +* In the first column, it indicates the number of unique services for the + domain in Train, Dev and Test datasets combined. +* In the fourth column, it indicates the number of such unique services in the + Train dataset only. +* In the seventh column, it indicates the number of such unique services in + the Dev dataset only. +* In the last column, it indicates the number of such unique services in the + Test dataset only. + +<table> + <tr> + <th rowspan="2"></th> + <th colspan="3"># Dialogues <br> Train</th> + <th colspan="3"># Dialogues <br> Dev</th> + <th colspan="3"># Dialogues <br> Test</th> + </tr> + <tr> + <td>Single-domain</td> + <td>Multi-domain</td> + <td>Combined</td> + <td>Single-domain</td> + <td>Multi-domain</td> + <td>Combined</td> + <td>Single-domain</td> + <td>Multi-domain</td> + <td>Combined</td> + </tr> + <tr> + <td>Alarm (1)</td> + <td align="center">NA</td> + <td align="center">NA</td> + <td align="center">NA</td> + <td align="center">37</td> + <td align="center">NA</td> + <td align="center">37 (1)</td> + <td align="center">47</td> + <td align="center">240</td> + <td align="center">287 (1)</td> + </tr> + <tr> + <td>Banks (2)</td> + <td align="center">207</td> + <td align="center">520</td> + <td align="center">727 (1)</td> + <td align="center">42</td> + <td align="center">252</td> + <td align="center">294 (1)</td> + <td align="center">NA</td> + <td align="center">NA</td> + <td 
align="center">NA</td> + </tr> + <tr> + <td>Buses (3)</td> + <td align="center">310</td> + <td align="center">1,970</td> + <td align="center">2,280 (2)</td> + <td align="center">44</td> + <td align="center">285</td> + <td align="center">329 (1)</td> + <td align="center">88</td> + <td align="center">438</td> + <td align="center">526 (1)</td> + </tr> + <tr> + <td>Calendar (1)</td> + <td align="center">169</td> + <td align="center">1,433</td> + <td align="center">1,602 (1)</td> + <td align="center">NA</td> + <td align="center">NA</td> + <td align="center">NA</td> + <td align="center">NA</td> + <td align="center">NA</td> + <td align="center">NA</td> + </tr> + <tr> + <td>Events (3)</td> + <td align="center">788</td> + <td align="center">2,721</td> + <td align="center">3,509 (1)</td> + <td align="center">73</td> + <td align="center">345</td> + <td align="center">418 (1)</td> + <td align="center">76</td> + <td align="center">516</td> + <td align="center">592 (1)</td> + </tr> + <tr> + <td>Flights (4)</td> + <td align="center">985</td> + <td align="center">1,762</td> + <td align="center">2,747 (2)</td> + <td align="center">94</td> + <td align="center">297</td> + <td align="center">391 (1)</td> + <td align="center">87</td> + <td align="center">419</td> + <td align="center">506 (1)</td> + </tr> + <tr> + <td>Homes (2)</td> + <td align="center">268</td> + <td align="center">579</td> + <td align="center">847 (1)</td> + <td align="center">81</td> + <td align="center">99</td> + <td align="center">180 (1)</td> + <td align="center">89</td> + <td align="center">157</td> + <td align="center">246 (1)</td> + </tr> + <tr> + <td>Hotels (4)</td> + <td align="center">457</td> + <td align="center">2,896</td> + <td align="center">3,353 (3)</td> + <td align="center">56</td> + <td align="center">521</td> + <td align="center">577 (2)</td> + <td align="center">177</td> + <td align="center">885</td> + <td align="center">1062 (2)</td> + </tr> + <tr> + <td>Media (3)</td> + <td 
align="center">281</td> + <td align="center">832</td> + <td align="center">1,113 (1)</td> + <td align="center">46</td> + <td align="center">133</td> + <td align="center">179 (1)</td> + <td align="center">80</td> + <td align="center">284</td> + <td align="center">364 (1)</td> + </tr> + <tr> + <td>Messaging (1)</td> + <td align="center">NA</td> + <td align="center">NA</td> + <td align="center">NA</td> + <td align="center">NA</td> + <td align="center">NA</td> + <td align="center">NA</td> + <td align="center">NA</td> + <td align="center">298</td> + <td align="center">298 (1)</td> + </tr> + <tr> + <td>Movies (2)</td> + <td align="center">292</td> + <td align="center">1,325</td> + <td align="center">1,617 (1)</td> + <td align="center">47</td> + <td align="center">94</td> + <td align="center">141 (1)</td> + <td align="center">132</td> + <td align="center">449</td> + <td align="center">581</td> + </tr> + <tr> + <td>Music (3)</td> + <td align="center">394</td> + <td align="center">896</td> + <td align="center">1,290 (2)</td> + <td align="center">35</td> + <td align="center">161</td> + <td align="center">196 (1)</td> + <td align="center">25</td> + <td align="center">322</td> + <td align="center">347 (2)</td> + </tr> + <tr> + <td>Payment (1)</td> + <td align="center">NA</td> + <td align="center">NA</td> + <td align="center">NA</td> + <td align="center">NA</td> + <td align="center">NA</td> + <td align="center">NA</td> + <td align="center">36</td> + <td align="center">186</td> + <td align="center">222 (1)</td> + </tr> + <tr> + <td>RentalCars (3)</td> + <td align="center">215</td> + <td align="center">1,370</td> + <td align="center">1,585 (2)</td> + <td align="center">39</td> + <td align="center">342</td> + <td align="center">381 (1)</td> + <td align="center">64</td> + <td align="center">480</td> + <td align="center">544 (1)</td> + </tr> + <tr> + <td>Restaurants (2)</td> + <td align="center">367</td> + <td align="center">2052</td> + <td align="center">2,419 (1)</td> + <td 
align="center">73</td> + <td align="center">263</td> + <td align="center">336 (1)</td> + <td align="center">73</td> + <td align="center">390</td> + <td align="center">463 (1)</td> + </tr> + <tr> + <td>RideSharing (2)</td> + <td align="center">119</td> + <td align="center">1,584</td> + <td align="center">1,703 (2)</td> + <td align="center">45</td> + <td align="center">225</td> + <td align="center">270 (1)</td> + <td align="center">34</td> + <td align="center">216</td> + <td align="center">250 (1)</td> + </tr> + <tr> + <td>Services (4)</td> + <td align="center">551</td> + <td align="center">1,338</td> + <td align="center">1,889 (3)</td> + <td align="center">44</td> + <td align="center">157</td> + <td align="center">201 (1)</td> + <td align="center">167</td> + <td align="center">489</td> + <td align="center">656 (2)</td> + </tr> + <tr> + <td>Trains (1)</td> + <td align="center">NA</td> + <td align="center">NA</td> + <td align="center">NA</td> + <td align="center">NA</td> + <td align="center">NA</td> + <td align="center">NA</td> + <td align="center">84</td> + <td align="center">266</td> + <td align="center">350 (1)</td> + </tr> + <tr> + <td>Travel (1)</td> + <td align="center">NA</td> + <td align="center">1,871</td> + <td align="center">1,871 (1)</td> + <td align="center">45</td> + <td align="center">238</td> + <td align="center">283 (1)</td> + <td align="center">24</td> + <td align="center">630</td> + <td align="center">654 (1)</td> + </tr> + <tr> + <td>Weather (1)</td> + <td align="center">NA</td> + <td align="center">951</td> + <td align="center">951 (1)</td> + <td align="center">35</td> + <td align="center">322</td> + <td align="center">357 (1)</td> + <td align="center">48</td> + <td align="center">427</td> + <td align="center">475 (1)</td> + </tr> +</table> + diff --git a/data/unified_datasets/schema/data.zip b/data/unified_datasets/schema/data.zip new file mode 100644 index 0000000000000000000000000000000000000000..d9cecdf51bc1793bed81920c5953bbb7bc001699 Binary 
files /dev/null and b/data/unified_datasets/schema/data.zip differ diff --git a/data/unified_datasets/schema/ontology.json b/data/unified_datasets/schema/ontology.json new file mode 100644 index 0000000000000000000000000000000000000000..56b66390614089b661d772d6fb70f75f9ce10ad1 --- /dev/null +++ b/data/unified_datasets/schema/ontology.json @@ -0,0 +1,7163 @@ +{ + "domains": { + "bank_1": { + "description": "Manage bank accounts and transfer money", + "slots": { + "account_type": { + "description": "The account type of the user", + "is_categorical": true, + "possible_values": [ + "checking", + "savings" + ] + }, + "recipient_account_type": { + "description": "The account type of the recipient whom the user is transfering money to", + "is_categorical": true, + "possible_values": [ + "checking", + "savings" + ] + }, + "balance": { + "description": "The amount of money held in a bank account at a given time", + "is_categorical": false, + "possible_values": [] + }, + "amount": { + "description": "The amount of money to transfer", + "is_categorical": false, + "possible_values": [] + }, + "recipient_account_name": { + "description": "The account name of the recipient who is to receive the transfered money", + "is_categorical": false, + "possible_values": [] + }, + "count": { + "description": "the number of items found that satisfy the user's request.", + "is_categorical": false, + "possible_values": [] + } + } + }, + "bus_1": { + "description": "Book bus journeys from the biggest bus network in the country", + "slots": { + "from_location": { + "description": "City where bus is leaving from", + "is_categorical": false, + "possible_values": [] + }, + "to_location": { + "description": "City where bus is going to", + "is_categorical": false, + "possible_values": [] + }, + "from_station": { + "description": "Station where bus is leaving from", + "is_categorical": false, + "possible_values": [] + }, + "to_station": { + "description": "Station where bus is going to", + 
"is_categorical": false, + "possible_values": [] + }, + "leaving_date": { + "description": "Date of bus leaving for journey", + "is_categorical": false, + "possible_values": [] + }, + "leaving_time": { + "description": "Time of bus leaving for journey", + "is_categorical": false, + "possible_values": [] + }, + "fare": { + "description": "Fare per ticket for journey", + "is_categorical": false, + "possible_values": [] + }, + "travelers": { + "description": "Number of travelers for journey", + "is_categorical": true, + "possible_values": [ + "1", + "2", + "3", + "4", + "5" + ] + }, + "transfers": { + "description": "Number of transfers in journey", + "is_categorical": true, + "possible_values": [ + "0", + "1" + ] + }, + "count": { + "description": "the number of items found that satisfy the user's request.", + "is_categorical": false, + "possible_values": [] + } + } + }, + "bus_2": { + "description": "Find a bus to take you to the city you want", + "slots": { + "origin": { + "description": "Origin city for journey", + "is_categorical": false, + "possible_values": [] + }, + "destination": { + "description": "Destination city for journey", + "is_categorical": false, + "possible_values": [] + }, + "origin_station_name": { + "description": "Name of the bus terminus at the origin", + "is_categorical": false, + "possible_values": [] + }, + "destination_station_name": { + "description": "Name of the bus terminus at the destination", + "is_categorical": false, + "possible_values": [] + }, + "departure_date": { + "description": "Date of bus departure", + "is_categorical": false, + "possible_values": [] + }, + "price": { + "description": "Price per ticket of the itinerary", + "is_categorical": false, + "possible_values": [] + }, + "departure_time": { + "description": "Time of bus departure", + "is_categorical": false, + "possible_values": [] + }, + "group_size": { + "description": "Size of group for the booking", + "is_categorical": true, + "possible_values": [ + "1", + "2", + 
"3", + "4", + "5" + ] + }, + "fare_type": { + "description": "Type of fare for the booking", + "is_categorical": true, + "possible_values": [ + "economy", + "economy extra", + "flexible" + ] + }, + "count": { + "description": "the number of items found that satisfy the user's request.", + "is_categorical": false, + "possible_values": [] + } + } + }, + "calendar_1": { + "description": "Calendar service to manage personal events and reservations", + "slots": { + "event_date": { + "description": "Date of event or for checking availability", + "is_categorical": false, + "possible_values": [] + }, + "event_time": { + "description": "Start time of event", + "is_categorical": false, + "possible_values": [] + }, + "event_location": { + "description": "Location of event", + "is_categorical": false, + "possible_values": [] + }, + "event_name": { + "description": "Title of event", + "is_categorical": false, + "possible_values": [] + }, + "available_start_time": { + "description": "Starting time of available time slot", + "is_categorical": false, + "possible_values": [] + }, + "available_end_time": { + "description": "Ending time of available time slot", + "is_categorical": false, + "possible_values": [] + }, + "count": { + "description": "the number of items found that satisfy the user's request.", + "is_categorical": false, + "possible_values": [] + } + } + }, + "event_1": { + "description": "The comprehensive portal to find and reserve seats at events near you", + "slots": { + "category": { + "description": "Type of event", + "is_categorical": true, + "possible_values": [ + "music", + "sports" + ] + }, + "subcategory": { + "description": "Subcategory of event, either a music genre or sport name", + "is_categorical": false, + "possible_values": [] + }, + "event_name": { + "description": "Name of event", + "is_categorical": false, + "possible_values": [] + }, + "date": { + "description": "Date of occurrence of event", + "is_categorical": false, + "possible_values": [] + }, + 
"time": { + "description": "Time when the event is scheduled to start", + "is_categorical": false, + "possible_values": [] + }, + "number_of_seats": { + "description": "Number of seats to find event tickets for", + "is_categorical": true, + "possible_values": [ + "1", + "2", + "3", + "4", + "5", + "6", + "7", + "8", + "9" + ] + }, + "city_of_event": { + "description": "City where event is happening", + "is_categorical": false, + "possible_values": [] + }, + "event_location": { + "description": "Name of event location", + "is_categorical": false, + "possible_values": [] + }, + "address_of_location": { + "description": "Street address of event location", + "is_categorical": false, + "possible_values": [] + }, + "count": { + "description": "the number of items found that satisfy the user's request.", + "is_categorical": false, + "possible_values": [] + } + } + }, + "event_2": { + "description": "Get tickets for the coolest concerts and sports in your area", + "slots": { + "event_type": { + "description": "Type of event", + "is_categorical": true, + "possible_values": [ + "music", + "sports" + ] + }, + "category": { + "description": "The sport or music subcategory", + "is_categorical": false, + "possible_values": [] + }, + "event_name": { + "description": "Name of match or artist for event", + "is_categorical": false, + "possible_values": [] + }, + "date": { + "description": "Date of event", + "is_categorical": false, + "possible_values": [] + }, + "time": { + "description": "Starting time for event", + "is_categorical": false, + "possible_values": [] + }, + "number_of_tickets": { + "description": "Number of tickets to buy for event", + "is_categorical": true, + "possible_values": [ + "1", + "2", + "3", + "4", + "5", + "6", + "7", + "8", + "9" + ] + }, + "city": { + "description": "City where the event is taking place", + "is_categorical": false, + "possible_values": [] + }, + "venue": { + "description": "Exact venue of event", + "is_categorical": false, + 
"possible_values": [] + }, + "venue_address": { + "description": "Address of event venue", + "is_categorical": false, + "possible_values": [] + }, + "count": { + "description": "the number of items found that satisfy the user's request.", + "is_categorical": false, + "possible_values": [] + } + } + }, + "flight_1": { + "description": "Find your next flight", + "slots": { + "passengers": { + "description": "Number of passengers in the booking", + "is_categorical": true, + "possible_values": [ + "1", + "2", + "3", + "4" + ] + }, + "seating_class": { + "description": "Seating class for the booking", + "is_categorical": true, + "possible_values": [ + "economy", + "premium economy", + "business", + "first class" + ] + }, + "origin_city": { + "description": "Starting city for the trip", + "is_categorical": false, + "possible_values": [] + }, + "destination_city": { + "description": "Ending city for the trip", + "is_categorical": false, + "possible_values": [] + }, + "origin_airport": { + "description": "Airport at the starting city for the trip", + "is_categorical": false, + "possible_values": [] + }, + "destination_airport": { + "description": "Airport at the ending city for the trip", + "is_categorical": false, + "possible_values": [] + }, + "departure_date": { + "description": "Start date for the trip", + "is_categorical": false, + "possible_values": [] + }, + "return_date": { + "description": "Date of the return flight", + "is_categorical": false, + "possible_values": [] + }, + "number_stops": { + "description": "Number of layovers in the flight", + "is_categorical": true, + "possible_values": [ + "0", + "1" + ] + }, + "outbound_departure_time": { + "description": "Departure time for the outbound leg flight", + "is_categorical": false, + "possible_values": [] + }, + "outbound_arrival_time": { + "description": "Arrival time for the outbound leg flight", + "is_categorical": false, + "possible_values": [] + }, + "inbound_arrival_time": { + "description": "Arrival time 
for the return leg flight", + "is_categorical": false, + "possible_values": [] + }, + "inbound_departure_time": { + "description": "Departure time for the return leg flight", + "is_categorical": false, + "possible_values": [] + }, + "price": { + "description": "Price of the booking per passenger", + "is_categorical": false, + "possible_values": [] + }, + "refundable": { + "description": "Whether the booking is refundable or not", + "is_categorical": true, + "possible_values": [ + "true", + "false" + ] + }, + "airlines": { + "description": "Name of airline", + "is_categorical": true, + "possible_values": [ + "united airlines", + "american airlines", + "delta airlines", + "southwest airlines", + "alaska airlines", + "british airways", + "air canada", + "air france" + ] + }, + "count": { + "description": "the number of items found that satisfy the user's request.", + "is_categorical": false, + "possible_values": [] + } + } + }, + "flight_2": { + "description": "Search for cheap flights across multiple providers", + "slots": { + "passengers": { + "description": "Number of passengers to book tickets for", + "is_categorical": true, + "possible_values": [ + "1", + "2", + "3", + "4", + "5" + ] + }, + "seating_class": { + "description": "Seating class for the flight tickets", + "is_categorical": true, + "possible_values": [ + "economy", + "premium economy", + "business", + "first class" + ] + }, + "origin": { + "description": "City of origin for the flight", + "is_categorical": false, + "possible_values": [] + }, + "destination": { + "description": "City of destination for the flight", + "is_categorical": false, + "possible_values": [] + }, + "origin_airport": { + "description": "Airport where the flight is departing from", + "is_categorical": false, + "possible_values": [] + }, + "destination_airport": { + "description": "Airport where the flight is arriving to", + "is_categorical": false, + "possible_values": [] + }, + "departure_date": { + "description": "Date of 
departure flight on the ticket", + "is_categorical": false, + "possible_values": [] + }, + "return_date": { + "description": "Date of return flight on the ticket", + "is_categorical": false, + "possible_values": [] + }, + "number_stops": { + "description": "Number of stopovers made by the flight", + "is_categorical": true, + "possible_values": [ + "0", + "1" + ] + }, + "outbound_departure_time": { + "description": "Departure time of the origin-destination flight", + "is_categorical": false, + "possible_values": [] + }, + "outbound_arrival_time": { + "description": "Arrival time of the origin-destination flight", + "is_categorical": false, + "possible_values": [] + }, + "inbound_arrival_time": { + "description": "Arrival time of the destination-origin flight", + "is_categorical": false, + "possible_values": [] + }, + "inbound_departure_time": { + "description": "Departure time of the destination-origin flight", + "is_categorical": false, + "possible_values": [] + }, + "fare": { + "description": "Ticket fare for each passenger", + "is_categorical": false, + "possible_values": [] + }, + "is_redeye": { + "description": "Boolean flag indicating whether the flight is a red-eye flight", + "is_categorical": true, + "possible_values": [ + "true", + "false" + ] + }, + "airlines": { + "description": "Name of the airline to book with", + "is_categorical": true, + "possible_values": [ + "united airlines", + "american airlines", + "delta airlines", + "southwest airlines", + "alaska airlines", + "british airways", + "air canada", + "air france" + ] + }, + "count": { + "description": "the number of items found that satisfy the user's request.", + "is_categorical": false, + "possible_values": [] + } + } + }, + "home_1": { + "description": "A widely used service for finding apartments and scheduling visits", + "slots": { + "area": { + "description": "City where the apartment is located", + "is_categorical": false, + "possible_values": [] + }, + "address": { + "description": "Address 
of the apartment", + "is_categorical": false, + "possible_values": [] + }, + "property_name": { + "description": "Name of the apartment", + "is_categorical": false, + "possible_values": [] + }, + "phone_number": { + "description": "Phone number of the apartment", + "is_categorical": false, + "possible_values": [] + }, + "furnished": { + "description": "Boolean flag indicating if the property is furnished", + "is_categorical": true, + "possible_values": [ + "true", + "false" + ] + }, + "pets_allowed": { + "description": "Boolean flag indicating if pets are allowed", + "is_categorical": true, + "possible_values": [ + "true", + "false" + ] + }, + "rent": { + "description": "Rent per month of the apartment", + "is_categorical": false, + "possible_values": [] + }, + "visit_date": { + "description": "Date for the visit to the apartment", + "is_categorical": false, + "possible_values": [] + }, + "number_of_beds": { + "description": "Number of bed rooms", + "is_categorical": true, + "possible_values": [ + "1", + "2", + "3", + "4" + ] + }, + "number_of_baths": { + "description": "Number of baths in the apartment", + "is_categorical": true, + "possible_values": [ + "1", + "2", + "3", + "4" + ] + }, + "count": { + "description": "the number of items found that satisfy the user's request.", + "is_categorical": false, + "possible_values": [] + } + } + }, + "hotel_1": { + "description": "A popular service for searching and reserving rooms in hotels", + "slots": { + "destination": { + "description": "Location of the hotel", + "is_categorical": false, + "possible_values": [] + }, + "number_of_rooms": { + "description": "Number of rooms in the reservation", + "is_categorical": true, + "possible_values": [ + "1", + "2", + "3" + ] + }, + "check_in_date": { + "description": "Start date for the reservation", + "is_categorical": false, + "possible_values": [] + }, + "number_of_days": { + "description": "Number of days in the reservation", + "is_categorical": false, + "possible_values": 
[] + }, + "star_rating": { + "description": "Star rating of the hotel", + "is_categorical": true, + "possible_values": [ + "1", + "2", + "3", + "4", + "5" + ] + }, + "hotel_name": { + "description": "Name of the hotel", + "is_categorical": false, + "possible_values": [] + }, + "street_address": { + "description": "Address of the hotel", + "is_categorical": false, + "possible_values": [] + }, + "phone_number": { + "description": "Phone number of the hotel", + "is_categorical": false, + "possible_values": [] + }, + "price_per_night": { + "description": "Price per night for the reservation", + "is_categorical": false, + "possible_values": [] + }, + "has_wifi": { + "description": "Boolean flag indicating if the hotel has wifi", + "is_categorical": true, + "possible_values": [ + "true", + "false" + ] + }, + "count": { + "description": "the number of items found that satisfy the user's request.", + "is_categorical": false, + "possible_values": [] + } + } + }, + "hotel_2": { + "description": "A popular service for searching and booking houses for short term stay", + "slots": { + "where_to": { + "description": "Location of the house", + "is_categorical": false, + "possible_values": [] + }, + "number_of_adults": { + "description": "Number of people for the reservation", + "is_categorical": true, + "possible_values": [ + "1", + "2", + "3", + "4", + "5" + ] + }, + "check_in_date": { + "description": "Start date for the reservation or to find the house", + "is_categorical": false, + "possible_values": [] + }, + "check_out_date": { + "description": "End date for the reservation or to find the house", + "is_categorical": false, + "possible_values": [] + }, + "rating": { + "description": "Review rating of the house", + "is_categorical": false, + "possible_values": [] + }, + "address": { + "description": "Address of the house", + "is_categorical": false, + "possible_values": [] + }, + "phone_number": { + "description": "Phone number of the house", + "is_categorical": false, + 
"possible_values": [] + }, + "total_price": { + "description": "Price per night of the house", + "is_categorical": false, + "possible_values": [] + }, + "has_laundry_service": { + "description": "Boolean flag indicating if the house has laundry service", + "is_categorical": true, + "possible_values": [ + "true", + "false" + ] + }, + "count": { + "description": "the number of items found that satisfy the user's request.", + "is_categorical": false, + "possible_values": [] + } + } + }, + "hotel_3": { + "description": "A leading provider for searching and booking hotel rooms", + "slots": { + "location": { + "description": "Location of the hotel", + "is_categorical": false, + "possible_values": [] + }, + "number_of_rooms": { + "description": "Number of rooms to book", + "is_categorical": true, + "possible_values": [ + "1", + "2", + "3" + ] + }, + "check_in_date": { + "description": "Start date for the hotel reservation", + "is_categorical": false, + "possible_values": [] + }, + "check_out_date": { + "description": "End date for the hotel reservation", + "is_categorical": false, + "possible_values": [] + }, + "average_rating": { + "description": "Average review rating for the hotel", + "is_categorical": false, + "possible_values": [] + }, + "hotel_name": { + "description": "Name of the hotel", + "is_categorical": false, + "possible_values": [] + }, + "street_address": { + "description": "Address of the hotel", + "is_categorical": false, + "possible_values": [] + }, + "phone_number": { + "description": "Phone number of the hotel", + "is_categorical": false, + "possible_values": [] + }, + "price": { + "description": "Total price for the stay", + "is_categorical": false, + "possible_values": [] + }, + "pets_welcome": { + "description": "Boolean flag indicating if pets are allowed in the hotel", + "is_categorical": true, + "possible_values": [ + "true", + "false" + ] + }, + "count": { + "description": "the number of items found that satisfy the user's request.", + 
"is_categorical": false, + "possible_values": [] + } + } + }, + "media_1": { + "description": "A leading provider of movies for searching and watching on-demand", + "slots": { + "title": { + "description": "Title of the movie", + "is_categorical": false, + "possible_values": [] + }, + "genre": { + "description": "Genre of the movie", + "is_categorical": false, + "possible_values": [] + }, + "subtitles": { + "description": "Boolean flag indicating if subtitles are desired for this movie", + "is_categorical": true, + "possible_values": [ + "true", + "false" + ] + }, + "directed_by": { + "description": "Name of the director of the movie", + "is_categorical": false, + "possible_values": [] + }, + "count": { + "description": "the number of items found that satisfy the user's request.", + "is_categorical": false, + "possible_values": [] + } + } + }, + "movie_1": { + "description": "A go-to provider for finding movies, searching for show times and booking tickets", + "slots": { + "price": { + "description": "Price per ticket", + "is_categorical": false, + "possible_values": [] + }, + "number_of_tickets": { + "description": "Number of the movie tickets to be purchased", + "is_categorical": true, + "possible_values": [ + "1", + "2", + "3", + "4", + "5", + "6", + "7", + "8", + "9" + ] + }, + "show_type": { + "description": "Type of show", + "is_categorical": true, + "possible_values": [ + "regular", + "3d", + "imax" + ] + }, + "theater_name": { + "description": "Name of the theatre", + "is_categorical": false, + "possible_values": [] + }, + "show_time": { + "description": "Time of the show", + "is_categorical": false, + "possible_values": [] + }, + "show_date": { + "description": "Date of the show", + "is_categorical": false, + "possible_values": [] + }, + "genre": { + "description": "Genre of the movie", + "is_categorical": false, + "possible_values": [] + }, + "street_address": { + "description": "Address of the theatre", + "is_categorical": false, + "possible_values": [] 
+ }, + "location": { + "description": "City where the theatre is located", + "is_categorical": false, + "possible_values": [] + }, + "movie_name": { + "description": "Name of the movie", + "is_categorical": false, + "possible_values": [] + }, + "count": { + "description": "the number of items found that satisfy the user's request.", + "is_categorical": false, + "possible_values": [] + } + } + }, + "music_1": { + "description": "A popular provider of a wide range of music content for searching and listening", + "slots": { + "song_name": { + "description": "Name of the song", + "is_categorical": false, + "possible_values": [] + }, + "artist": { + "description": "Artist who performed the song", + "is_categorical": false, + "possible_values": [] + }, + "album": { + "description": "Album the song belongs to", + "is_categorical": false, + "possible_values": [] + }, + "genre": { + "description": "Genre of the song", + "is_categorical": false, + "possible_values": [] + }, + "year": { + "description": "Year in which the song was released", + "is_categorical": true, + "possible_values": [ + "2010", + "2011", + "2012", + "2013", + "2014", + "2015", + "2016", + "2017", + "2018", + "2019" + ] + }, + "playback_device": { + "description": "Playback device on which the song is to be played", + "is_categorical": true, + "possible_values": [ + "tv", + "kitchen speaker", + "bedroom speaker" + ] + }, + "count": { + "description": "the number of items found that satisfy the user's request.", + "is_categorical": false, + "possible_values": [] + } + } + }, + "music_2": { + "description": "A widely used service for finding and playing music from a variety of genres and artists", + "slots": { + "song_name": { + "description": "Name of the song", + "is_categorical": false, + "possible_values": [] + }, + "artist": { + "description": "Name of the artist the song is performed by", + "is_categorical": false, + "possible_values": [] + }, + "album": { + "description": "Album the song belongs to", 
+ "is_categorical": false, + "possible_values": [] + }, + "genre": { + "description": "Genre of the song", + "is_categorical": false, + "possible_values": [] + }, + "playback_device": { + "description": "Playback device on which the song is to be played", + "is_categorical": true, + "possible_values": [ + "tv", + "kitchen speaker", + "bedroom speaker" + ] + }, + "count": { + "description": "the number of items found that satisfy the user's request.", + "is_categorical": false, + "possible_values": [] + } + } + }, + "rentalcar_1": { + "description": "Car rental service with extensive coverage of locations and cars", + "slots": { + "type": { + "description": "Category to which rental car belongs", + "is_categorical": true, + "possible_values": [ + "compact", + "standard", + "full-size" + ] + }, + "car_name": { + "description": "Model name of rental car", + "is_categorical": false, + "possible_values": [] + }, + "pickup_location": { + "description": "Location of rental car pickup", + "is_categorical": false, + "possible_values": [] + }, + "pickup_date": { + "description": "Date of rental car pickup", + "is_categorical": false, + "possible_values": [] + }, + "pickup_time": { + "description": "Time of rental car pickup", + "is_categorical": false, + "possible_values": [] + }, + "pickup_city": { + "description": "City to pick up the rental car", + "is_categorical": false, + "possible_values": [] + }, + "dropoff_date": { + "description": "Date of rental car drop-off", + "is_categorical": false, + "possible_values": [] + }, + "total_price": { + "description": "Total price of car rental", + "is_categorical": false, + "possible_values": [] + }, + "count": { + "description": "the number of items found that satisfy the user's request.", + "is_categorical": false, + "possible_values": [] + } + } + }, + "rentalcar_2": { + "description": "Car rental service, available worldwide", + "slots": { + "car_type": { + "description": "Type of car for rental", + "is_categorical": true, + 
"possible_values": [ + "compact", + "standard", + "full-size" + ] + }, + "car_name": { + "description": "Name of car model", + "is_categorical": false, + "possible_values": [] + }, + "pickup_location": { + "description": "Pickup location for car rental", + "is_categorical": false, + "possible_values": [] + }, + "pickup_date": { + "description": "Date of pickup for car rental", + "is_categorical": false, + "possible_values": [] + }, + "pickup_time": { + "description": "Time of pickup for car rental", + "is_categorical": false, + "possible_values": [] + }, + "pickup_city": { + "description": "City in which to pick up rental car", + "is_categorical": false, + "possible_values": [] + }, + "dropoff_date": { + "description": "End date of car rental reservation", + "is_categorical": false, + "possible_values": [] + }, + "total_price": { + "description": "Total price of rental reservation", + "is_categorical": false, + "possible_values": [] + }, + "count": { + "description": "the number of items found that satisfy the user's request.", + "is_categorical": false, + "possible_values": [] + } + } + }, + "restaurant_1": { + "description": "A leading provider for restaurant search and reservations", + "slots": { + "restaurant_name": { + "description": "Name of the restaurant", + "is_categorical": false, + "possible_values": [] + }, + "date": { + "description": "Date for the reservation or to find availability", + "is_categorical": false, + "possible_values": [] + }, + "time": { + "description": "Time for the reservation or to find availability", + "is_categorical": false, + "possible_values": [] + }, + "serves_alcohol": { + "description": "Boolean flag indicating if the restaurant serves alcohol", + "is_categorical": true, + "possible_values": [ + "true", + "false" + ] + }, + "has_live_music": { + "description": "Boolean flag indicating if the restaurant has live music", + "is_categorical": true, + "possible_values": [ + "true", + "false" + ] + }, + "phone_number": { + 
"description": "Phone number of the restaurant", + "is_categorical": false, + "possible_values": [] + }, + "street_address": { + "description": "Address of the restaurant", + "is_categorical": false, + "possible_values": [] + }, + "party_size": { + "description": "Party size for a reservation", + "is_categorical": true, + "possible_values": [ + "1", + "2", + "3", + "4", + "5", + "6" + ] + }, + "price_range": { + "description": "Price range for the restaurant", + "is_categorical": true, + "possible_values": [ + "inexpensive", + "moderate", + "expensive", + "very expensive" + ] + }, + "city": { + "description": "City in which the restaurant is located", + "is_categorical": false, + "possible_values": [] + }, + "cuisine": { + "description": "Cuisine of food served in the restaurant", + "is_categorical": false, + "possible_values": [ + "mexican", + "chinese", + "indian", + "american", + "italian" + ] + }, + "count": { + "description": "the number of items found that satisfy the user's request.", + "is_categorical": false, + "possible_values": [] + } + } + }, + "ridesharing_1": { + "description": "On-demand taxi calling service", + "slots": { + "destination": { + "description": "Destination for taxi ride", + "is_categorical": false, + "possible_values": [] + }, + "shared_ride": { + "description": "Boolean flag whether ride is shared with other passengers", + "is_categorical": true, + "possible_values": [ + "true", + "false" + ] + }, + "ride_fare": { + "description": "Total fare for taxi ride", + "is_categorical": false, + "possible_values": [] + }, + "approximate_ride_duration": { + "description": "Approximate duration of ride to the destination", + "is_categorical": false, + "possible_values": [] + }, + "number_of_riders": { + "description": "Number of riders to call taxi for", + "is_categorical": true, + "possible_values": [ + "1", + "2", + "3", + "4" + ] + }, + "count": { + "description": "the number of items found that satisfy the user's request.", + 
"is_categorical": false, + "possible_values": [] + } + } + }, + "ridesharing_2": { + "description": "App to book a cab to any destination", + "slots": { + "destination": { + "description": "Destination address or location for cab", + "is_categorical": false, + "possible_values": [] + }, + "ride_type": { + "description": "Type of cab ride", + "is_categorical": true, + "possible_values": [ + "pool", + "regular", + "luxury" + ] + }, + "ride_fare": { + "description": "Total fare for cab ride", + "is_categorical": false, + "possible_values": [] + }, + "wait_time": { + "description": "Expected waiting time for pick-up by cab", + "is_categorical": false, + "possible_values": [] + }, + "number_of_seats": { + "description": "Number of seats to reserve in the cab", + "is_categorical": true, + "possible_values": [ + "1", + "2", + "3", + "4" + ] + }, + "count": { + "description": "the number of items found that satisfy the user's request.", + "is_categorical": false, + "possible_values": [] + } + } + }, + "services_1": { + "description": "A widely used service for finding and reserving the hair stylist of your choice", + "slots": { + "stylist_name": { + "description": "Name of the hair stylist/salon", + "is_categorical": false, + "possible_values": [] + }, + "phone_number": { + "description": "Phone number of the stylist/salon", + "is_categorical": false, + "possible_values": [] + }, + "average_rating": { + "description": "Average review rating for the stylist/salon", + "is_categorical": false, + "possible_values": [] + }, + "is_unisex": { + "description": "Boolean flag indicating if the salon is unisex", + "is_categorical": true, + "possible_values": [ + "true", + "false" + ] + }, + "street_address": { + "description": "Address of the stylist/salon", + "is_categorical": false, + "possible_values": [] + }, + "city": { + "description": "City where the salon is located", + "is_categorical": false, + "possible_values": [] + }, + "appointment_date": { + "description": "Date for 
the appointment", + "is_categorical": false, + "possible_values": [] + }, + "appointment_time": { + "description": "Time of the appointment", + "is_categorical": false, + "possible_values": [] + }, + "count": { + "description": "the number of items found that satisfy the user's request.", + "is_categorical": false, + "possible_values": [] + } + } + }, + "services_2": { + "description": "The go-to service for finding and booking appointments with top rated dentists", + "slots": { + "dentist_name": { + "description": "Name of the dentist", + "is_categorical": false, + "possible_values": [] + }, + "phone_number": { + "description": "Phone number of the dentist", + "is_categorical": false, + "possible_values": [] + }, + "address": { + "description": "Address of the dentist", + "is_categorical": false, + "possible_values": [] + }, + "city": { + "description": "City where the dentist is located", + "is_categorical": false, + "possible_values": [] + }, + "appointment_date": { + "description": "Date for the appointment", + "is_categorical": false, + "possible_values": [] + }, + "appointment_time": { + "description": "Time for the appointment", + "is_categorical": false, + "possible_values": [] + }, + "offers_cosmetic_services": { + "description": "Boolean flag indicating if the dentist offers cosmetic services", + "is_categorical": true, + "possible_values": [ + "true", + "false" + ] + }, + "count": { + "description": "the number of items found that satisfy the user's request.", + "is_categorical": false, + "possible_values": [] + } + } + }, + "services_3": { + "description": "A popular provider for finding the right doctor for your needs. 
Also allows you to schedule your visit to the doctor", + "slots": { + "doctor_name": { + "description": "Name of the doctor or the medical practice", + "is_categorical": false, + "possible_values": [] + }, + "phone_number": { + "description": "Contact number for the doctor or the medical practice", + "is_categorical": false, + "possible_values": [] + }, + "average_rating": { + "description": "Average review rating of the doctor", + "is_categorical": false, + "possible_values": [] + }, + "street_address": { + "description": "Address of the doctor", + "is_categorical": false, + "possible_values": [] + }, + "city": { + "description": "City where the doctor is located", + "is_categorical": false, + "possible_values": [] + }, + "appointment_date": { + "description": "Date for scheduling the appointment with the doctor", + "is_categorical": false, + "possible_values": [] + }, + "appointment_time": { + "description": "Time for the appointment with the doctor", + "is_categorical": false, + "possible_values": [] + }, + "type": { + "description": "Speciality of the doctor", + "is_categorical": true, + "possible_values": [ + "gynecologist", + "ent specialist", + "ophthalmologist", + "general practitioner", + "dermatologist" + ] + }, + "count": { + "description": "the number of items found that satisfy the user's request.", + "is_categorical": false, + "possible_values": [] + } + } + }, + "travel_1": { + "description": "The biggest database of tourist attractions and points of interest", + "slots": { + "location": { + "description": "City or town where the attraction is located", + "is_categorical": false, + "possible_values": [] + }, + "attraction_name": { + "description": "Common name of the attraction", + "is_categorical": false, + "possible_values": [] + }, + "category": { + "description": "Category to which the attraction belongs", + "is_categorical": true, + "possible_values": [ + "place of worship", + "theme park", + "museum", + "historical landmark", + "park", + 
"tourist attraction", + "sports venue", + "shopping area", + "performing arts venue", + "nature preserve" + ] + }, + "phone_number": { + "description": "Phone number to contact the attraction", + "is_categorical": false, + "possible_values": [] + }, + "free_entry": { + "description": "Boolean flag indicating whether entrance to attraction is free", + "is_categorical": true, + "possible_values": [ + "true", + "false" + ] + }, + "good_for_kids": { + "description": "Boolean flag indicating whether attraction is good for to take kids to", + "is_categorical": true, + "possible_values": [ + "true", + "false" + ] + }, + "count": { + "description": "the number of items found that satisfy the user's request.", + "is_categorical": false, + "possible_values": [] + } + } + }, + "weather_1": { + "description": "Check the weather for any place and any date", + "slots": { + "precipitation": { + "description": "The possibility of rain or snow in percentage", + "is_categorical": false, + "possible_values": [] + }, + "humidity": { + "description": "Percentage humidity", + "is_categorical": false, + "possible_values": [] + }, + "wind": { + "description": "Wind speed in miles per hour", + "is_categorical": false, + "possible_values": [] + }, + "temperature": { + "description": "Temperature in Fahrenheit", + "is_categorical": false, + "possible_values": [] + }, + "city": { + "description": "Name of the city", + "is_categorical": false, + "possible_values": [] + }, + "date": { + "description": "Date for the weather", + "is_categorical": false, + "possible_values": [] + }, + "count": { + "description": "the number of items found that satisfy the user's request.", + "is_categorical": false, + "possible_values": [] + } + } + }, + "alarm_1": { + "description": "Manage alarms by getting and setting them easily", + "slots": { + "alarm_time": { + "description": "Time of the alarm", + "is_categorical": false, + "possible_values": [] + }, + "alarm_name": { + "description": "Name of the alarm", + 
"is_categorical": false, + "possible_values": [] + }, + "new_alarm_time": { + "description": "Time to set for the new alarm", + "is_categorical": false, + "possible_values": [] + }, + "new_alarm_name": { + "description": "Name to use for the new alarm", + "is_categorical": false, + "possible_values": [] + }, + "count": { + "description": "the number of items found that satisfy the user's request.", + "is_categorical": false, + "possible_values": [] + } + } + }, + "bank_2": { + "description": "Service to manage your bank accounts and finances", + "slots": { + "account_type": { + "description": "The user's account type", + "is_categorical": true, + "possible_values": [ + "checking", + "savings" + ] + }, + "recipient_account_type": { + "description": "The account type of the recipient to transfer the money to", + "is_categorical": true, + "possible_values": [ + "checking", + "savings" + ] + }, + "account_balance": { + "description": "The balance in the specified account", + "is_categorical": false, + "possible_values": [] + }, + "transfer_amount": { + "description": "The amount of money to transfer", + "is_categorical": false, + "possible_values": [] + }, + "recipient_name": { + "description": "The name of the recipient to transfer the money to", + "is_categorical": false, + "possible_values": [] + }, + "transfer_time": { + "description": "Number of days for the transfer to go through", + "is_categorical": false, + "possible_values": [] + }, + "count": { + "description": "the number of items found that satisfy the user's request.", + "is_categorical": false, + "possible_values": [] + } + } + }, + "flight_3": { + "description": "Find one way and round trip flights to your favorite city", + "slots": { + "passengers": { + "description": "Number of passengers to find flight seats for", + "is_categorical": true, + "possible_values": [ + "1", + "2", + "3", + "4" + ] + }, + "flight_class": { + "description": "Fare class of flight booking", + "is_categorical": true, + 
"possible_values": [ + "economy", + "premium economy", + "business", + "first class" + ] + }, + "origin_city": { + "description": "City in which the journey originates", + "is_categorical": false, + "possible_values": [] + }, + "destination_city": { + "description": "City in which the journey ends", + "is_categorical": false, + "possible_values": [] + }, + "origin_airport_name": { + "description": "Number of the airport flying out from", + "is_categorical": false, + "possible_values": [] + }, + "destination_airport_name": { + "description": "Number of the airport flying to", + "is_categorical": false, + "possible_values": [] + }, + "departure_date": { + "description": "Date of departure flight", + "is_categorical": false, + "possible_values": [] + }, + "return_date": { + "description": "Date of return flight", + "is_categorical": false, + "possible_values": [] + }, + "number_stops": { + "description": "Number of stops in the itinerary", + "is_categorical": true, + "possible_values": [ + "0", + "1" + ] + }, + "outbound_departure_time": { + "description": "Local time of departure of flight from origin to destination", + "is_categorical": false, + "possible_values": [] + }, + "outbound_arrival_time": { + "description": "Local time of arrival of flight from origin to destination", + "is_categorical": false, + "possible_values": [] + }, + "inbound_arrival_time": { + "description": "Local time of arrival of flight from destination to origin", + "is_categorical": false, + "possible_values": [] + }, + "inbound_departure_time": { + "description": "Local time of departure of flight from destination to origin", + "is_categorical": false, + "possible_values": [] + }, + "price": { + "description": "Price per passenger of the itinerary", + "is_categorical": false, + "possible_values": [] + }, + "number_checked_bags": { + "description": "Number of bags to check in", + "is_categorical": true, + "possible_values": [ + "0", + "1", + "2" + ] + }, + "airlines": { + "description": 
"Name of airline operating the flight", + "is_categorical": true, + "possible_values": [ + "united airlines", + "american airlines", + "delta airlines", + "southwest airlines", + "alaska airlines", + "british airways", + "air canada", + "air france" + ] + }, + "arrives_next_day": { + "description": "Whether the flight arrives the next day", + "is_categorical": true, + "possible_values": [ + "true", + "false" + ] + }, + "count": { + "description": "the number of items found that satisfy the user's request.", + "is_categorical": false, + "possible_values": [] + } + } + }, + "hotel_4": { + "description": "Accommodation searching and booking portal", + "slots": { + "location": { + "description": "City or town where the accommodation is located", + "is_categorical": false, + "possible_values": [] + }, + "number_of_rooms": { + "description": "Number of rooms to reserve", + "is_categorical": true, + "possible_values": [ + "1", + "2", + "3" + ] + }, + "check_in_date": { + "description": "Check in date for reservation", + "is_categorical": false, + "possible_values": [] + }, + "stay_length": { + "description": "Length of stay in days", + "is_categorical": false, + "possible_values": [] + }, + "star_rating": { + "description": "Star rating of the accommodation", + "is_categorical": true, + "possible_values": [ + "1", + "2", + "3", + "4", + "5" + ] + }, + "place_name": { + "description": "Name of the accommodation", + "is_categorical": false, + "possible_values": [] + }, + "street_address": { + "description": "Street address of the accommodation", + "is_categorical": false, + "possible_values": [] + }, + "phone_number": { + "description": "Contact phone number of the accommodation", + "is_categorical": false, + "possible_values": [] + }, + "price_per_night": { + "description": "Price per night for the stay", + "is_categorical": false, + "possible_values": [] + }, + "smoking_allowed": { + "description": "Whether or not smoking is allowed inside the place", + "is_categorical": 
true, + "possible_values": [ + "true", + "false" + ] + }, + "count": { + "description": "the number of items found that satisfy the user's request.", + "is_categorical": false, + "possible_values": [] + } + } + }, + "media_2": { + "description": "The widest selection and lowest prices for movie rentals", + "slots": { + "movie_name": { + "description": "Name of the movie", + "is_categorical": false, + "possible_values": [] + }, + "genre": { + "description": "Main genre of the movie", + "is_categorical": false, + "possible_values": [] + }, + "subtitle_language": { + "description": "Language to use for subtitles (or None for no subtitles)", + "is_categorical": true, + "possible_values": [ + "none", + "english", + "mandarin", + "spanish" + ] + }, + "director": { + "description": "Name of the director of the movie", + "is_categorical": false, + "possible_values": [] + }, + "actors": { + "description": "Name of an actor starring in the movie", + "is_categorical": false, + "possible_values": [] + }, + "price": { + "description": "Cost of renting movie", + "is_categorical": false, + "possible_values": [] + }, + "count": { + "description": "the number of items found that satisfy the user's request.", + "is_categorical": false, + "possible_values": [] + } + } + }, + "movie_2": { + "description": "The definitive database to discover new movies to watch", + "slots": { + "title": { + "description": "Title for movie", + "is_categorical": false, + "possible_values": [] + }, + "genre": { + "description": "Genre of the movie", + "is_categorical": false, + "possible_values": [] + }, + "aggregate_rating": { + "description": "Aggregate user rating for movie, scale of 10", + "is_categorical": false, + "possible_values": [] + }, + "starring": { + "description": "Name of actor starring in movie", + "is_categorical": false, + "possible_values": [] + }, + "director": { + "description": "Name of director of the movie", + "is_categorical": false, + "possible_values": [] + }, + "count": { + 
"description": "the number of items found that satisfy the user's request.", + "is_categorical": false, + "possible_values": [] + } + } + }, + "restaurant_2": { + "description": "A popular restaurant search and reservation service", + "slots": { + "restaurant_name": { + "description": "Name of the restaurant", + "is_categorical": false, + "possible_values": [] + }, + "date": { + "description": "Tentative date of restaurant reservation", + "is_categorical": false, + "possible_values": [] + }, + "time": { + "description": "Tentative time of restaurant reservation", + "is_categorical": false, + "possible_values": [] + }, + "has_seating_outdoors": { + "description": "Whether the restaurant has outdoor seating available", + "is_categorical": true, + "possible_values": [ + "true", + "false" + ] + }, + "has_vegetarian_options": { + "description": "Whether the restaurant has adequate vegetarian options", + "is_categorical": true, + "possible_values": [ + "true", + "false" + ] + }, + "phone_number": { + "description": "Phone number to contact restaurant", + "is_categorical": false, + "possible_values": [] + }, + "rating": { + "description": "Average user rating for restaurant on a scale of 5", + "is_categorical": false, + "possible_values": [] + }, + "address": { + "description": "Address of restaurant", + "is_categorical": false, + "possible_values": [] + }, + "number_of_seats": { + "description": "Number of seats to reserve at the restaurant", + "is_categorical": true, + "possible_values": [ + "1", + "2", + "3", + "4", + "5", + "6" + ] + }, + "price_range": { + "description": "Price range for the restaurant", + "is_categorical": true, + "possible_values": [ + "cheap", + "moderate", + "pricey", + "ultra high-end" + ] + }, + "location": { + "description": "City where the restaurant is located", + "is_categorical": false, + "possible_values": [] + }, + "category": { + "description": "The category of food offered by the restaurant", + "is_categorical": false, + 
"possible_values": [] + }, + "count": { + "description": "the number of items found that satisfy the user's request.", + "is_categorical": false, + "possible_values": [] + } + } + }, + "services_4": { + "description": "Discover the right therapist for you and make reservations easily", + "slots": { + "therapist_name": { + "description": "Name of the therapist", + "is_categorical": false, + "possible_values": [] + }, + "phone_number": { + "description": "Contact number of the therapist", + "is_categorical": false, + "possible_values": [] + }, + "address": { + "description": "Address of the therapist", + "is_categorical": false, + "possible_values": [] + }, + "city": { + "description": "Area where user wants to search for a therapist", + "is_categorical": false, + "possible_values": [] + }, + "appointment_date": { + "description": "Date of the appointment", + "is_categorical": false, + "possible_values": [] + }, + "appointment_time": { + "description": "Time of the appointment", + "is_categorical": false, + "possible_values": [] + }, + "type": { + "description": "Type of the therapist", + "is_categorical": true, + "possible_values": [ + "psychologist", + "family counselor", + "psychiatrist" + ] + }, + "count": { + "description": "the number of items found that satisfy the user's request.", + "is_categorical": false, + "possible_values": [] + } + } + }, + "bus_3": { + "description": "Affordable and comfortable bus travel across the country", + "slots": { + "from_city": { + "description": "The city to depart from", + "is_categorical": false, + "possible_values": [] + }, + "to_city": { + "description": "The destination city of the trip", + "is_categorical": false, + "possible_values": [] + }, + "from_station": { + "description": "Name of station of departure", + "is_categorical": false, + "possible_values": [] + }, + "to_station": { + "description": "Name of station of arrival", + "is_categorical": false, + "possible_values": [] + }, + "departure_date": { + 
"description": "The date of departure", + "is_categorical": false, + "possible_values": [] + }, + "departure_time": { + "description": "The time of departure", + "is_categorical": false, + "possible_values": [] + }, + "price": { + "description": "Ticket price per passenger", + "is_categorical": false, + "possible_values": [] + }, + "additional_luggage": { + "description": "Whether to carry excess baggage in the bus", + "is_categorical": true, + "possible_values": [ + "true", + "false" + ] + }, + "num_passengers": { + "description": "The number of tickets for the trip", + "is_categorical": true, + "possible_values": [ + "1", + "2", + "3", + "4", + "5" + ] + }, + "category": { + "description": "How many stops the route has", + "is_categorical": true, + "possible_values": [ + "direct", + "one-stop" + ] + }, + "count": { + "description": "the number of items found that satisfy the user's request.", + "is_categorical": false, + "possible_values": [] + } + } + }, + "event_3": { + "description": "Find and book tickets to any cultural events in your area", + "slots": { + "event_type": { + "description": "Type of cultural event", + "is_categorical": true, + "possible_values": [ + "music", + "theater" + ] + }, + "event_name": { + "description": "Name of artist or play", + "is_categorical": false, + "possible_values": [] + }, + "date": { + "description": "Date of event", + "is_categorical": false, + "possible_values": [] + }, + "time": { + "description": "Start time of event", + "is_categorical": false, + "possible_values": [] + }, + "number_of_tickets": { + "description": "Number of tickets to reserve for the event", + "is_categorical": true, + "possible_values": [ + "1", + "2", + "3", + "4", + "5", + "6", + "7", + "8", + "9" + ] + }, + "price_per_ticket": { + "description": "Price of each ticket", + "is_categorical": false, + "possible_values": [] + }, + "city": { + "description": "City where the event is taking place", + "is_categorical": false, + "possible_values": [] + 
}, + "venue": { + "description": "Exact venue of event", + "is_categorical": false, + "possible_values": [] + }, + "venue_address": { + "description": "Street address of event venue", + "is_categorical": false, + "possible_values": [] + }, + "count": { + "description": "the number of items found that satisfy the user's request.", + "is_categorical": false, + "possible_values": [] + } + } + }, + "flight_4": { + "description": "Find cheap flights in seconds and book flights", + "slots": { + "number_of_tickets": { + "description": "the number of flight tickets for the trip", + "is_categorical": true, + "possible_values": [ + "1", + "2", + "3", + "4" + ] + }, + "seating_class": { + "description": "The cabin seat option", + "is_categorical": true, + "possible_values": [ + "economy", + "premium economy", + "business" + ] + }, + "origin_airport": { + "description": "The name of the airport or city to depart from", + "is_categorical": false, + "possible_values": [] + }, + "destination_airport": { + "description": "The name of the airport or city to arrive at", + "is_categorical": false, + "possible_values": [] + }, + "departure_date": { + "description": "Start date of the trip", + "is_categorical": false, + "possible_values": [] + }, + "return_date": { + "description": "End date of the trip", + "is_categorical": false, + "possible_values": [] + }, + "is_nonstop": { + "description": "Whether the flight is a direct one", + "is_categorical": true, + "possible_values": [ + "true", + "false" + ] + }, + "outbound_departure_time": { + "description": "Departure time of the flight flying to the destination", + "is_categorical": false, + "possible_values": [] + }, + "outbound_arrival_time": { + "description": "Arrival time of the flight flying to the destination", + "is_categorical": false, + "possible_values": [] + }, + "inbound_arrival_time": { + "description": "Arrival time of the flight coming back from the trip", + "is_categorical": false, + "possible_values": [] + }, + 
"inbound_departure_time": { + "description": "Departure time of the flight coming back from the trip", + "is_categorical": false, + "possible_values": [] + }, + "price": { + "description": "The total cost of the flight tickets", + "is_categorical": false, + "possible_values": [] + }, + "airlines": { + "description": "The company that provides air transport services", + "is_categorical": true, + "possible_values": [ + "united airlines", + "american airlines", + "delta airlines", + "southwest airlines", + "alaska airlines", + "british airways", + "air canada", + "air france", + "south african airways", + "lot polish airlines", + "latam brasil" + ] + }, + "count": { + "description": "the number of items found that satisfy the user's request.", + "is_categorical": false, + "possible_values": [] + } + } + }, + "home_2": { + "description": "Service for finding properties to buy and rent", + "slots": { + "intent": { + "description": "Whether to buy or rent a property", + "is_categorical": true, + "possible_values": [ + "rent", + "buy" + ] + }, + "area": { + "description": "City where the property is located", + "is_categorical": false, + "possible_values": [] + }, + "address": { + "description": "Street address of property", + "is_categorical": false, + "possible_values": [] + }, + "property_name": { + "description": "Name of property or apartment complex", + "is_categorical": false, + "possible_values": [] + }, + "phone_number": { + "description": "Contact number of property or apartment complex", + "is_categorical": false, + "possible_values": [] + }, + "has_garage": { + "description": "Whether the property has a garage", + "is_categorical": true, + "possible_values": [ + "true", + "false" + ] + }, + "in_unit_laundry": { + "description": "Whether the property has in-unit laundry facilities", + "is_categorical": true, + "possible_values": [ + "true", + "false" + ] + }, + "price": { + "description": "Sale price or per-month rent of property", + "is_categorical": false, + 
"possible_values": [] + }, + "visit_date": { + "description": "Date for visit to the property", + "is_categorical": false, + "possible_values": [] + }, + "number_of_beds": { + "description": "Number of bedrooms in the property", + "is_categorical": true, + "possible_values": [ + "1", + "2", + "3", + "4" + ] + }, + "number_of_baths": { + "description": "Number of bathroom in the property", + "is_categorical": true, + "possible_values": [ + "1", + "2", + "3", + "4" + ] + }, + "count": { + "description": "the number of items found that satisfy the user's request.", + "is_categorical": false, + "possible_values": [] + } + } + }, + "media_3": { + "description": "Enjoy instant and unlimited access to best shows, movies, comedy, sports, documentaries and more.", + "slots": { + "title": { + "description": "Title of the movie", + "is_categorical": false, + "possible_values": [] + }, + "genre": { + "description": "Category of the content", + "is_categorical": false, + "possible_values": [] + }, + "subtitle_language": { + "description": "Language of the subtitles", + "is_categorical": true, + "possible_values": [ + "english", + "spanish", + "hindi", + "french" + ] + }, + "starring": { + "description": "Celebs acting in the movie", + "is_categorical": false, + "possible_values": [] + }, + "count": { + "description": "the number of items found that satisfy the user's request.", + "is_categorical": false, + "possible_values": [] + } + } + }, + "messaging_1": { + "description": "Connect and share locations with your contacts", + "slots": { + "location": { + "description": "Location to share with contact", + "is_categorical": false, + "possible_values": [] + }, + "contact_name": { + "description": "Name of contact to send to", + "is_categorical": false, + "possible_values": [] + }, + "count": { + "description": "the number of items found that satisfy the user's request.", + "is_categorical": false, + "possible_values": [] + } + } + }, + "movie_3": { + "description": "A 
review-aggregation website for movies and television", + "slots": { + "movie_title": { + "description": "Name of the movie", + "is_categorical": false, + "possible_values": [] + }, + "genre": { + "description": "Type of the movie", + "is_categorical": false, + "possible_values": [] + }, + "percent_rating": { + "description": "Average critic percentage rating", + "is_categorical": false, + "possible_values": [] + }, + "cast": { + "description": "Actors in the movie", + "is_categorical": false, + "possible_values": [] + }, + "directed_by": { + "description": "Director of the movie", + "is_categorical": false, + "possible_values": [] + }, + "count": { + "description": "the number of items found that satisfy the user's request.", + "is_categorical": false, + "possible_values": [] + } + } + }, + "music_3": { + "description": "A free, personalized platform that plays music you'll love. Discover new music and enjoy old favorites.", + "slots": { + "track": { + "description": "Name of the song", + "is_categorical": false, + "possible_values": [] + }, + "artist": { + "description": "Performer's name", + "is_categorical": false, + "possible_values": [] + }, + "album": { + "description": "Collection of the song", + "is_categorical": false, + "possible_values": [] + }, + "genre": { + "description": "Type of the music", + "is_categorical": false, + "possible_values": [] + }, + "year": { + "description": "Year when the song was first released", + "is_categorical": true, + "possible_values": [ + "2010", + "2011", + "2012", + "2013", + "2014", + "2015", + "2016", + "2017", + "2018", + "2019" + ] + }, + "device": { + "description": "Place or name of the media player to play the song selected", + "is_categorical": true, + "possible_values": [ + "living room", + "kitchen", + "patio" + ] + }, + "count": { + "description": "the number of items found that satisfy the user's request.", + "is_categorical": false, + "possible_values": [] + } + } + }, + "payment_1": { + "description": "The 
fast, simple way to pay in apps, on the web, and in millions of stores", + "slots": { + "payment_method": { + "description": "The source of money used for making the payment", + "is_categorical": true, + "possible_values": [ + "app balance", + "debit card", + "credit card" + ] + }, + "amount": { + "description": "The amount of money to send or request", + "is_categorical": false, + "possible_values": [] + }, + "receiver": { + "description": "Name of the contact or account to make the transaction with", + "is_categorical": false, + "possible_values": [] + }, + "private_visibility": { + "description": "Whether the transaction is private or not", + "is_categorical": true, + "possible_values": [ + "true", + "false" + ] + }, + "count": { + "description": "the number of items found that satisfy the user's request.", + "is_categorical": false, + "possible_values": [] + } + } + }, + "rentalcar_3": { + "description": "A leading global provider of car rental solutions", + "slots": { + "car_type": { + "description": "Type of the car", + "is_categorical": true, + "possible_values": [ + "hatchback", + "sedan", + "suv" + ] + }, + "car_name": { + "description": "Car model", + "is_categorical": false, + "possible_values": [] + }, + "pickup_location": { + "description": "Place to pick up the car", + "is_categorical": false, + "possible_values": [] + }, + "start_date": { + "description": "The first date to start using the rental car", + "is_categorical": false, + "possible_values": [] + }, + "pickup_time": { + "description": "Time for the pick-up", + "is_categorical": false, + "possible_values": [] + }, + "city": { + "description": "City where you want to rent the car", + "is_categorical": false, + "possible_values": [] + }, + "end_date": { + "description": "The date to return the car", + "is_categorical": false, + "possible_values": [] + }, + "price_per_day": { + "description": "The cost for renting the car per day", + "is_categorical": false, + "possible_values": [] + }, + 
"add_insurance": { + "description": "Whether to purchase insurance", + "is_categorical": true, + "possible_values": [ + "true", + "false" + ] + }, + "count": { + "description": "the number of items found that satisfy the user's request.", + "is_categorical": false, + "possible_values": [] + } + } + }, + "train_1": { + "description": "Service to find and reserve train journeys between cities", + "slots": { + "from": { + "description": "Starting city for train journey", + "is_categorical": false, + "possible_values": [] + }, + "to": { + "description": "Ending city for train journey", + "is_categorical": false, + "possible_values": [] + }, + "from_station": { + "description": "Name of station at starting city", + "is_categorical": false, + "possible_values": [] + }, + "to_station": { + "description": "Name of station at ending city", + "is_categorical": false, + "possible_values": [] + }, + "date_of_journey": { + "description": "Date of train journey", + "is_categorical": false, + "possible_values": [] + }, + "journey_start_time": { + "description": "Time of start of train journey", + "is_categorical": false, + "possible_values": [] + }, + "total": { + "description": "Total price of train reservation", + "is_categorical": false, + "possible_values": [] + }, + "number_of_adults": { + "description": "Number of adults to reserve train tickets for", + "is_categorical": true, + "possible_values": [ + "1", + "2", + "3", + "4", + "5" + ] + }, + "class": { + "description": "Fare class for train reservation", + "is_categorical": true, + "possible_values": [ + "value", + "flexible", + "business" + ] + }, + "trip_protection": { + "description": "Whether to add trip protection to reservation, for a fee", + "is_categorical": true, + "possible_values": [ + "true", + "false" + ] + }, + "count": { + "description": "the number of items found that satisfy the user's request.", + "is_categorical": false, + "possible_values": [] + } + } + } + }, + "intents": { + "inform": { + 
"description": "Inform the value for a slot." + }, + "request": { + "description": "Request the value of a slot." + }, + "confirm": { + "description": "Confirm the value of a slot before making a transactional service call." + }, + "offer": { + "description": "Offer a certain value for a slot to the user." + }, + "notify_success": { + "description": "Inform the user that their request was successful." + }, + "notify_failure": { + "description": "Inform the user that their request failed." + }, + "inform_count": { + "description": "Inform the number of items found that satisfy the user's request." + }, + "offer_intent": { + "description": "Offer a new intent to the user." + }, + "req_more": { + "description": "Asking the user if they need anything else." + }, + "goodbye": { + "description": "End the dialogue." + }, + "inform_intent": { + "description": "Express the desire to perform a certain task to the system." + }, + "negate_intent": { + "description": "Negate the intent which has been offered by the system." + }, + "affirm_intent": { + "description": "Agree to the intent which has been offered by the system." + }, + "affirm": { + "description": "Agree to the system's proposition. " + }, + "negate": { + "description": "Deny the system's proposal." + }, + "select": { + "description": "Select a result being offered by the system." + }, + "request_alts": { + "description": "Ask for more results besides the ones offered by the system." + }, + "thank_you": { + "description": "Thank the system." 
+ } + }, + "binary_dialogue_act": [ + { + "intent": "affirm", + "domain": "", + "slot": "", + "value": "" + }, + { + "intent": "affirm_intent", + "domain": "restaurant_1", + "slot": "", + "value": "" + }, + { + "intent": "affirm_intent", + "domain": "event_2", + "slot": "", + "value": "" + }, + { + "intent": "affirm_intent", + "domain": "music_2", + "slot": "", + "value": "" + }, + { + "intent": "affirm_intent", + "domain": "music_1", + "slot": "", + "value": "" + }, + { + "intent": "affirm_intent", + "domain": "event_1", + "slot": "", + "value": "" + }, + { + "intent": "affirm_intent", + "domain": "flight_1", + "slot": "", + "value": "" + }, + { + "intent": "affirm_intent", + "domain": "media_1", + "slot": "", + "value": "" + }, + { + "intent": "affirm_intent", + "domain": "rentalcar_1", + "slot": "", + "value": "" + }, + { + "intent": "affirm_intent", + "domain": "rentalcar_2", + "slot": "", + "value": "" + }, + { + "intent": "affirm_intent", + "domain": "bus_2", + "slot": "", + "value": "" + }, + { + "intent": "affirm_intent", + "domain": "bus_1", + "slot": "", + "value": "" + }, + { + "intent": "affirm_intent", + "domain": "services_2", + "slot": "", + "value": "" + }, + { + "intent": "affirm_intent", + "domain": "services_1", + "slot": "", + "value": "" + }, + { + "intent": "affirm_intent", + "domain": "services_3", + "slot": "", + "value": "" + }, + { + "intent": "affirm_intent", + "domain": "home_1", + "slot": "", + "value": "" + }, + { + "intent": "affirm_intent", + "domain": "bank_1", + "slot": "", + "value": "" + }, + { + "intent": "affirm_intent", + "domain": "hotel_2", + "slot": "", + "value": "" + }, + { + "intent": "affirm_intent", + "domain": "calendar_1", + "slot": "", + "value": "" + }, + { + "intent": "affirm_intent", + "domain": "hotel_3", + "slot": "", + "value": "" + }, + { + "intent": "affirm_intent", + "domain": "hotel_1", + "slot": "", + "value": "" + }, + { + "intent": "affirm_intent", + "domain": "alarm_1", + "slot": "", + "value": "" + }, 
+ { + "intent": "affirm_intent", + "domain": "services_4", + "slot": "", + "value": "" + }, + { + "intent": "affirm_intent", + "domain": "restaurant_2", + "slot": "", + "value": "" + }, + { + "intent": "affirm_intent", + "domain": "bank_2", + "slot": "", + "value": "" + }, + { + "intent": "affirm_intent", + "domain": "media_2", + "slot": "", + "value": "" + }, + { + "intent": "affirm_intent", + "domain": "hotel_4", + "slot": "", + "value": "" + }, + { + "intent": "affirm_intent", + "domain": "music_3", + "slot": "", + "value": "" + }, + { + "intent": "affirm_intent", + "domain": "event_3", + "slot": "", + "value": "" + }, + { + "intent": "affirm_intent", + "domain": "rentalcar_3", + "slot": "", + "value": "" + }, + { + "intent": "affirm_intent", + "domain": "bus_3", + "slot": "", + "value": "" + }, + { + "intent": "affirm_intent", + "domain": "home_2", + "slot": "", + "value": "" + }, + { + "intent": "affirm_intent", + "domain": "train_1", + "slot": "", + "value": "" + }, + { + "intent": "affirm_intent", + "domain": "movie_1", + "slot": "", + "value": "" + }, + { + "intent": "affirm_intent", + "domain": "media_3", + "slot": "", + "value": "" + }, + { + "intent": "goodbye", + "domain": "", + "slot": "", + "value": "" + }, + { + "intent": "inform_intent", + "domain": "restaurant_1", + "slot": "intent", + "value": "findrestaurants" + }, + { + "intent": "inform_intent", + "domain": "restaurant_1", + "slot": "intent", + "value": "reserverestaurant" + }, + { + "intent": "inform_intent", + "domain": "media_1", + "slot": "intent", + "value": "playmovie" + }, + { + "intent": "inform_intent", + "domain": "event_2", + "slot": "intent", + "value": "geteventdates" + }, + { + "intent": "inform_intent", + "domain": "event_2", + "slot": "intent", + "value": "buyeventtickets" + }, + { + "intent": "inform_intent", + "domain": "music_2", + "slot": "intent", + "value": "lookupmusic" + }, + { + "intent": "inform_intent", + "domain": "music_2", + "slot": "intent", + "value": "playmedia" 
+ }, + { + "intent": "inform_intent", + "domain": "music_1", + "slot": "intent", + "value": "lookupsong" + }, + { + "intent": "inform_intent", + "domain": "music_1", + "slot": "intent", + "value": "playsong" + }, + { + "intent": "inform_intent", + "domain": "event_1", + "slot": "intent", + "value": "findevents" + }, + { + "intent": "inform_intent", + "domain": "event_1", + "slot": "intent", + "value": "buyeventtickets" + }, + { + "intent": "inform_intent", + "domain": "event_2", + "slot": "intent", + "value": "findevents" + }, + { + "intent": "inform_intent", + "domain": "movie_1", + "slot": "intent", + "value": "findmovies" + }, + { + "intent": "inform_intent", + "domain": "movie_1", + "slot": "intent", + "value": "gettimesformovie" + }, + { + "intent": "inform_intent", + "domain": "flight_1", + "slot": "intent", + "value": "searchonewayflight" + }, + { + "intent": "inform_intent", + "domain": "flight_2", + "slot": "intent", + "value": "searchonewayflight" + }, + { + "intent": "inform_intent", + "domain": "flight_1", + "slot": "intent", + "value": "reserveonewayflight" + }, + { + "intent": "inform_intent", + "domain": "flight_1", + "slot": "intent", + "value": "searchroundtripflights" + }, + { + "intent": "inform_intent", + "domain": "flight_1", + "slot": "intent", + "value": "reserveroundtripflights" + }, + { + "intent": "inform_intent", + "domain": "flight_2", + "slot": "intent", + "value": "searchroundtripflights" + }, + { + "intent": "inform_intent", + "domain": "media_1", + "slot": "intent", + "value": "findmovies" + }, + { + "intent": "inform_intent", + "domain": "ridesharing_2", + "slot": "intent", + "value": "getride" + }, + { + "intent": "inform_intent", + "domain": "ridesharing_1", + "slot": "intent", + "value": "getride" + }, + { + "intent": "inform_intent", + "domain": "rentalcar_1", + "slot": "intent", + "value": "getcarsavailable" + }, + { + "intent": "inform_intent", + "domain": "rentalcar_1", + "slot": "intent", + "value": "reservecar" + }, + { + 
"intent": "inform_intent", + "domain": "rentalcar_2", + "slot": "intent", + "value": "getcarsavailable" + }, + { + "intent": "inform_intent", + "domain": "rentalcar_2", + "slot": "intent", + "value": "reservecar" + }, + { + "intent": "inform_intent", + "domain": "bus_2", + "slot": "intent", + "value": "findbus" + }, + { + "intent": "inform_intent", + "domain": "bus_1", + "slot": "intent", + "value": "findbus" + }, + { + "intent": "inform_intent", + "domain": "hotel_2", + "slot": "intent", + "value": "bookhouse" + }, + { + "intent": "inform_intent", + "domain": "bus_2", + "slot": "intent", + "value": "buybusticket" + }, + { + "intent": "inform_intent", + "domain": "bus_1", + "slot": "intent", + "value": "buybusticket" + }, + { + "intent": "inform_intent", + "domain": "services_2", + "slot": "intent", + "value": "findprovider" + }, + { + "intent": "inform_intent", + "domain": "services_2", + "slot": "intent", + "value": "bookappointment" + }, + { + "intent": "inform_intent", + "domain": "services_1", + "slot": "intent", + "value": "findprovider" + }, + { + "intent": "inform_intent", + "domain": "services_1", + "slot": "intent", + "value": "bookappointment" + }, + { + "intent": "inform_intent", + "domain": "services_3", + "slot": "intent", + "value": "findprovider" + }, + { + "intent": "inform_intent", + "domain": "services_3", + "slot": "intent", + "value": "bookappointment" + }, + { + "intent": "inform_intent", + "domain": "home_1", + "slot": "intent", + "value": "findapartment" + }, + { + "intent": "inform_intent", + "domain": "home_1", + "slot": "intent", + "value": "schedulevisit" + }, + { + "intent": "inform_intent", + "domain": "bank_1", + "slot": "intent", + "value": "checkbalance" + }, + { + "intent": "inform_intent", + "domain": "bank_1", + "slot": "intent", + "value": "transfermoney" + }, + { + "intent": "inform_intent", + "domain": "hotel_2", + "slot": "intent", + "value": "searchhouse" + }, + { + "intent": "inform_intent", + "domain": "calendar_1", + 
"slot": "intent", + "value": "getevents" + }, + { + "intent": "inform_intent", + "domain": "calendar_1", + "slot": "intent", + "value": "getavailabletime" + }, + { + "intent": "inform_intent", + "domain": "calendar_1", + "slot": "intent", + "value": "addevent" + }, + { + "intent": "inform_intent", + "domain": "hotel_3", + "slot": "intent", + "value": "reservehotel" + }, + { + "intent": "inform_intent", + "domain": "hotel_1", + "slot": "intent", + "value": "reservehotel" + }, + { + "intent": "inform_intent", + "domain": "hotel_3", + "slot": "intent", + "value": "searchhotel" + }, + { + "intent": "inform_intent", + "domain": "hotel_1", + "slot": "intent", + "value": "searchhotel" + }, + { + "intent": "inform_intent", + "domain": "weather_1", + "slot": "intent", + "value": "getweather" + }, + { + "intent": "inform_intent", + "domain": "travel_1", + "slot": "intent", + "value": "findattractions" + }, + { + "intent": "inform_intent", + "domain": "restaurant_2", + "slot": "intent", + "value": "reserverestaurant" + }, + { + "intent": "inform_intent", + "domain": "flight_3", + "slot": "intent", + "value": "searchonewayflight" + }, + { + "intent": "inform_intent", + "domain": "flight_3", + "slot": "intent", + "value": "searchroundtripflights" + }, + { + "intent": "inform_intent", + "domain": "alarm_1", + "slot": "intent", + "value": "getalarms" + }, + { + "intent": "inform_intent", + "domain": "alarm_1", + "slot": "intent", + "value": "addalarm" + }, + { + "intent": "inform_intent", + "domain": "services_4", + "slot": "intent", + "value": "findprovider" + }, + { + "intent": "inform_intent", + "domain": "services_4", + "slot": "intent", + "value": "bookappointment" + }, + { + "intent": "inform_intent", + "domain": "restaurant_2", + "slot": "intent", + "value": "findrestaurants" + }, + { + "intent": "inform_intent", + "domain": "bank_2", + "slot": "intent", + "value": "checkbalance" + }, + { + "intent": "inform_intent", + "domain": "bank_2", + "slot": "intent", + "value": 
"transfermoney" + }, + { + "intent": "inform_intent", + "domain": "movie_2", + "slot": "intent", + "value": "findmovies" + }, + { + "intent": "inform_intent", + "domain": "media_2", + "slot": "intent", + "value": "findmovies" + }, + { + "intent": "inform_intent", + "domain": "media_2", + "slot": "intent", + "value": "rentmovie" + }, + { + "intent": "inform_intent", + "domain": "hotel_4", + "slot": "intent", + "value": "searchhotel" + }, + { + "intent": "inform_intent", + "domain": "hotel_4", + "slot": "intent", + "value": "reservehotel" + }, + { + "intent": "inform_intent", + "domain": "music_3", + "slot": "intent", + "value": "lookupmusic" + }, + { + "intent": "inform_intent", + "domain": "music_3", + "slot": "intent", + "value": "playmedia" + }, + { + "intent": "inform_intent", + "domain": "event_3", + "slot": "intent", + "value": "findevents" + }, + { + "intent": "inform_intent", + "domain": "event_3", + "slot": "intent", + "value": "buyeventtickets" + }, + { + "intent": "inform_intent", + "domain": "flight_4", + "slot": "intent", + "value": "searchonewayflight" + }, + { + "intent": "inform_intent", + "domain": "flight_4", + "slot": "intent", + "value": "searchroundtripflights" + }, + { + "intent": "inform_intent", + "domain": "rentalcar_3", + "slot": "intent", + "value": "getcarsavailable" + }, + { + "intent": "inform_intent", + "domain": "rentalcar_3", + "slot": "intent", + "value": "reservecar" + }, + { + "intent": "inform_intent", + "domain": "bus_3", + "slot": "intent", + "value": "findbus" + }, + { + "intent": "inform_intent", + "domain": "bus_3", + "slot": "intent", + "value": "buybusticket" + }, + { + "intent": "inform_intent", + "domain": "home_2", + "slot": "intent", + "value": "findhomebyarea" + }, + { + "intent": "inform_intent", + "domain": "home_2", + "slot": "intent", + "value": "schedulevisit" + }, + { + "intent": "inform_intent", + "domain": "movie_1", + "slot": "intent", + "value": "buymovietickets" + }, + { + "intent": "inform_intent", + 
"domain": "payment_1", + "slot": "intent", + "value": "makepayment" + }, + { + "intent": "inform_intent", + "domain": "payment_1", + "slot": "intent", + "value": "requestpayment" + }, + { + "intent": "inform_intent", + "domain": "train_1", + "slot": "intent", + "value": "findtrains" + }, + { + "intent": "inform_intent", + "domain": "train_1", + "slot": "intent", + "value": "gettraintickets" + }, + { + "intent": "inform_intent", + "domain": "movie_3", + "slot": "intent", + "value": "findmovies" + }, + { + "intent": "inform_intent", + "domain": "media_3", + "slot": "intent", + "value": "playmovie" + }, + { + "intent": "inform_intent", + "domain": "media_3", + "slot": "intent", + "value": "findmovies" + }, + { + "intent": "inform_intent", + "domain": "messaging_1", + "slot": "intent", + "value": "sharelocation" + }, + { + "intent": "negate", + "domain": "", + "slot": "", + "value": "" + }, + { + "intent": "negate_intent", + "domain": "event_2", + "slot": "", + "value": "" + }, + { + "intent": "negate_intent", + "domain": "event_1", + "slot": "", + "value": "" + }, + { + "intent": "negate_intent", + "domain": "movie_1", + "slot": "", + "value": "" + }, + { + "intent": "negate_intent", + "domain": "flight_1", + "slot": "", + "value": "" + }, + { + "intent": "negate_intent", + "domain": "media_1", + "slot": "", + "value": "" + }, + { + "intent": "negate_intent", + "domain": "rentalcar_1", + "slot": "", + "value": "" + }, + { + "intent": "negate_intent", + "domain": "rentalcar_2", + "slot": "", + "value": "" + }, + { + "intent": "negate_intent", + "domain": "bus_2", + "slot": "", + "value": "" + }, + { + "intent": "negate_intent", + "domain": "bus_1", + "slot": "", + "value": "" + }, + { + "intent": "negate_intent", + "domain": "bank_1", + "slot": "", + "value": "" + }, + { + "intent": "negate_intent", + "domain": "services_2", + "slot": "", + "value": "" + }, + { + "intent": "negate_intent", + "domain": "services_1", + "slot": "", + "value": "" + }, + { + "intent": 
"negate_intent", + "domain": "services_3", + "slot": "", + "value": "" + }, + { + "intent": "negate_intent", + "domain": "home_1", + "slot": "", + "value": "" + }, + { + "intent": "negate_intent", + "domain": "hotel_3", + "slot": "", + "value": "" + }, + { + "intent": "negate_intent", + "domain": "hotel_2", + "slot": "", + "value": "" + }, + { + "intent": "negate_intent", + "domain": "hotel_1", + "slot": "", + "value": "" + }, + { + "intent": "negate_intent", + "domain": "restaurant_1", + "slot": "", + "value": "" + }, + { + "intent": "negate_intent", + "domain": "calendar_1", + "slot": "", + "value": "" + }, + { + "intent": "negate_intent", + "domain": "hotel_4", + "slot": "", + "value": "" + }, + { + "intent": "negate_intent", + "domain": "bank_2", + "slot": "", + "value": "" + }, + { + "intent": "negate_intent", + "domain": "media_2", + "slot": "", + "value": "" + }, + { + "intent": "negate_intent", + "domain": "services_4", + "slot": "", + "value": "" + }, + { + "intent": "negate_intent", + "domain": "restaurant_2", + "slot": "", + "value": "" + }, + { + "intent": "negate_intent", + "domain": "event_3", + "slot": "", + "value": "" + }, + { + "intent": "negate_intent", + "domain": "rentalcar_3", + "slot": "", + "value": "" + }, + { + "intent": "negate_intent", + "domain": "bus_3", + "slot": "", + "value": "" + }, + { + "intent": "negate_intent", + "domain": "train_1", + "slot": "", + "value": "" + }, + { + "intent": "negate_intent", + "domain": "music_3", + "slot": "", + "value": "" + }, + { + "intent": "negate_intent", + "domain": "media_3", + "slot": "", + "value": "" + }, + { + "intent": "negate_intent", + "domain": "alarm_1", + "slot": "", + "value": "" + }, + { + "intent": "notify_failure", + "domain": "restaurant_1", + "slot": "", + "value": "" + }, + { + "intent": "notify_failure", + "domain": "media_1", + "slot": "", + "value": "" + }, + { + "intent": "notify_failure", + "domain": "music_2", + "slot": "", + "value": "" + }, + { + "intent": 
"notify_failure", + "domain": "event_2", + "slot": "", + "value": "" + }, + { + "intent": "notify_failure", + "domain": "movie_1", + "slot": "", + "value": "" + }, + { + "intent": "notify_failure", + "domain": "event_1", + "slot": "", + "value": "" + }, + { + "intent": "notify_failure", + "domain": "flight_1", + "slot": "", + "value": "" + }, + { + "intent": "notify_failure", + "domain": "flight_2", + "slot": "", + "value": "" + }, + { + "intent": "notify_failure", + "domain": "rentalcar_1", + "slot": "", + "value": "" + }, + { + "intent": "notify_failure", + "domain": "services_2", + "slot": "", + "value": "" + }, + { + "intent": "notify_failure", + "domain": "services_1", + "slot": "", + "value": "" + }, + { + "intent": "notify_failure", + "domain": "services_3", + "slot": "", + "value": "" + }, + { + "intent": "notify_failure", + "domain": "home_1", + "slot": "", + "value": "" + }, + { + "intent": "notify_failure", + "domain": "hotel_2", + "slot": "", + "value": "" + }, + { + "intent": "notify_failure", + "domain": "calendar_1", + "slot": "", + "value": "" + }, + { + "intent": "notify_failure", + "domain": "hotel_3", + "slot": "", + "value": "" + }, + { + "intent": "notify_failure", + "domain": "hotel_1", + "slot": "", + "value": "" + }, + { + "intent": "notify_failure", + "domain": "bus_2", + "slot": "", + "value": "" + }, + { + "intent": "notify_failure", + "domain": "weather_1", + "slot": "", + "value": "" + }, + { + "intent": "notify_failure", + "domain": "music_1", + "slot": "", + "value": "" + }, + { + "intent": "notify_failure", + "domain": "rentalcar_2", + "slot": "", + "value": "" + }, + { + "intent": "notify_failure", + "domain": "travel_1", + "slot": "", + "value": "" + }, + { + "intent": "notify_failure", + "domain": "bus_1", + "slot": "", + "value": "" + }, + { + "intent": "notify_failure", + "domain": "bank_1", + "slot": "", + "value": "" + }, + { + "intent": "notify_failure", + "domain": "ridesharing_2", + "slot": "", + "value": "" + }, + { + 
"intent": "notify_failure", + "domain": "ridesharing_1", + "slot": "", + "value": "" + }, + { + "intent": "notify_failure", + "domain": "restaurant_2", + "slot": "", + "value": "" + }, + { + "intent": "notify_failure", + "domain": "alarm_1", + "slot": "", + "value": "" + }, + { + "intent": "notify_failure", + "domain": "services_4", + "slot": "", + "value": "" + }, + { + "intent": "notify_failure", + "domain": "media_2", + "slot": "", + "value": "" + }, + { + "intent": "notify_failure", + "domain": "hotel_4", + "slot": "", + "value": "" + }, + { + "intent": "notify_failure", + "domain": "movie_2", + "slot": "", + "value": "" + }, + { + "intent": "notify_failure", + "domain": "flight_3", + "slot": "", + "value": "" + }, + { + "intent": "notify_failure", + "domain": "bank_2", + "slot": "", + "value": "" + }, + { + "intent": "notify_failure", + "domain": "event_3", + "slot": "", + "value": "" + }, + { + "intent": "notify_failure", + "domain": "movie_3", + "slot": "", + "value": "" + }, + { + "intent": "notify_failure", + "domain": "media_3", + "slot": "", + "value": "" + }, + { + "intent": "notify_failure", + "domain": "bus_3", + "slot": "", + "value": "" + }, + { + "intent": "notify_failure", + "domain": "music_3", + "slot": "", + "value": "" + }, + { + "intent": "notify_failure", + "domain": "flight_4", + "slot": "", + "value": "" + }, + { + "intent": "notify_failure", + "domain": "rentalcar_3", + "slot": "", + "value": "" + }, + { + "intent": "notify_success", + "domain": "restaurant_1", + "slot": "", + "value": "" + }, + { + "intent": "notify_success", + "domain": "media_1", + "slot": "", + "value": "" + }, + { + "intent": "notify_success", + "domain": "event_2", + "slot": "", + "value": "" + }, + { + "intent": "notify_success", + "domain": "music_2", + "slot": "", + "value": "" + }, + { + "intent": "notify_success", + "domain": "music_1", + "slot": "", + "value": "" + }, + { + "intent": "notify_success", + "domain": "event_1", + "slot": "", + "value": "" + }, + { 
+ "intent": "notify_success", + "domain": "flight_1", + "slot": "", + "value": "" + }, + { + "intent": "notify_success", + "domain": "ridesharing_2", + "slot": "", + "value": "" + }, + { + "intent": "notify_success", + "domain": "ridesharing_1", + "slot": "", + "value": "" + }, + { + "intent": "notify_success", + "domain": "rentalcar_1", + "slot": "", + "value": "" + }, + { + "intent": "notify_success", + "domain": "rentalcar_2", + "slot": "", + "value": "" + }, + { + "intent": "notify_success", + "domain": "hotel_2", + "slot": "", + "value": "" + }, + { + "intent": "notify_success", + "domain": "bus_2", + "slot": "", + "value": "" + }, + { + "intent": "notify_success", + "domain": "bus_1", + "slot": "", + "value": "" + }, + { + "intent": "notify_success", + "domain": "services_2", + "slot": "", + "value": "" + }, + { + "intent": "notify_success", + "domain": "services_1", + "slot": "", + "value": "" + }, + { + "intent": "notify_success", + "domain": "services_3", + "slot": "", + "value": "" + }, + { + "intent": "notify_success", + "domain": "home_1", + "slot": "", + "value": "" + }, + { + "intent": "notify_success", + "domain": "bank_1", + "slot": "", + "value": "" + }, + { + "intent": "notify_success", + "domain": "calendar_1", + "slot": "", + "value": "" + }, + { + "intent": "notify_success", + "domain": "hotel_3", + "slot": "", + "value": "" + }, + { + "intent": "notify_success", + "domain": "hotel_1", + "slot": "", + "value": "" + }, + { + "intent": "notify_success", + "domain": "restaurant_2", + "slot": "", + "value": "" + }, + { + "intent": "notify_success", + "domain": "alarm_1", + "slot": "", + "value": "" + }, + { + "intent": "notify_success", + "domain": "services_4", + "slot": "", + "value": "" + }, + { + "intent": "notify_success", + "domain": "bank_2", + "slot": "", + "value": "" + }, + { + "intent": "notify_success", + "domain": "media_2", + "slot": "", + "value": "" + }, + { + "intent": "notify_success", + "domain": "hotel_4", + "slot": "", + 
"value": "" + }, + { + "intent": "notify_success", + "domain": "music_3", + "slot": "", + "value": "" + }, + { + "intent": "notify_success", + "domain": "event_3", + "slot": "", + "value": "" + }, + { + "intent": "notify_success", + "domain": "rentalcar_3", + "slot": "", + "value": "" + }, + { + "intent": "notify_success", + "domain": "bus_3", + "slot": "", + "value": "" + }, + { + "intent": "notify_success", + "domain": "home_2", + "slot": "", + "value": "" + }, + { + "intent": "notify_success", + "domain": "movie_1", + "slot": "", + "value": "" + }, + { + "intent": "notify_success", + "domain": "payment_1", + "slot": "", + "value": "" + }, + { + "intent": "notify_success", + "domain": "train_1", + "slot": "", + "value": "" + }, + { + "intent": "notify_success", + "domain": "media_3", + "slot": "", + "value": "" + }, + { + "intent": "notify_success", + "domain": "messaging_1", + "slot": "", + "value": "" + }, + { + "intent": "offer_intent", + "domain": "restaurant_1", + "slot": "intent", + "value": "reserverestaurant" + }, + { + "intent": "offer_intent", + "domain": "event_2", + "slot": "intent", + "value": "buyeventtickets" + }, + { + "intent": "offer_intent", + "domain": "music_2", + "slot": "intent", + "value": "playmedia" + }, + { + "intent": "offer_intent", + "domain": "music_1", + "slot": "intent", + "value": "playsong" + }, + { + "intent": "offer_intent", + "domain": "event_1", + "slot": "intent", + "value": "buyeventtickets" + }, + { + "intent": "offer_intent", + "domain": "movie_1", + "slot": "intent", + "value": "buymovietickets" + }, + { + "intent": "offer_intent", + "domain": "flight_1", + "slot": "intent", + "value": "reserveonewayflight" + }, + { + "intent": "offer_intent", + "domain": "flight_1", + "slot": "intent", + "value": "reserveroundtripflights" + }, + { + "intent": "offer_intent", + "domain": "media_1", + "slot": "intent", + "value": "playmovie" + }, + { + "intent": "offer_intent", + "domain": "rentalcar_1", + "slot": "intent", + "value": 
"reservecar" + }, + { + "intent": "offer_intent", + "domain": "rentalcar_2", + "slot": "intent", + "value": "reservecar" + }, + { + "intent": "offer_intent", + "domain": "bus_2", + "slot": "intent", + "value": "buybusticket" + }, + { + "intent": "offer_intent", + "domain": "bus_1", + "slot": "intent", + "value": "buybusticket" + }, + { + "intent": "offer_intent", + "domain": "services_2", + "slot": "intent", + "value": "bookappointment" + }, + { + "intent": "offer_intent", + "domain": "services_1", + "slot": "intent", + "value": "bookappointment" + }, + { + "intent": "offer_intent", + "domain": "services_3", + "slot": "intent", + "value": "bookappointment" + }, + { + "intent": "offer_intent", + "domain": "home_1", + "slot": "intent", + "value": "schedulevisit" + }, + { + "intent": "offer_intent", + "domain": "bank_1", + "slot": "intent", + "value": "transfermoney" + }, + { + "intent": "offer_intent", + "domain": "hotel_2", + "slot": "intent", + "value": "bookhouse" + }, + { + "intent": "offer_intent", + "domain": "calendar_1", + "slot": "intent", + "value": "addevent" + }, + { + "intent": "offer_intent", + "domain": "hotel_3", + "slot": "intent", + "value": "reservehotel" + }, + { + "intent": "offer_intent", + "domain": "hotel_1", + "slot": "intent", + "value": "reservehotel" + }, + { + "intent": "offer_intent", + "domain": "alarm_1", + "slot": "intent", + "value": "addalarm" + }, + { + "intent": "offer_intent", + "domain": "services_4", + "slot": "intent", + "value": "bookappointment" + }, + { + "intent": "offer_intent", + "domain": "restaurant_2", + "slot": "intent", + "value": "reserverestaurant" + }, + { + "intent": "offer_intent", + "domain": "bank_2", + "slot": "intent", + "value": "transfermoney" + }, + { + "intent": "offer_intent", + "domain": "media_2", + "slot": "intent", + "value": "rentmovie" + }, + { + "intent": "offer_intent", + "domain": "hotel_4", + "slot": "intent", + "value": "reservehotel" + }, + { + "intent": "offer_intent", + "domain": 
"music_3", + "slot": "intent", + "value": "playmedia" + }, + { + "intent": "offer_intent", + "domain": "event_3", + "slot": "intent", + "value": "buyeventtickets" + }, + { + "intent": "offer_intent", + "domain": "rentalcar_3", + "slot": "intent", + "value": "reservecar" + }, + { + "intent": "offer_intent", + "domain": "bus_3", + "slot": "intent", + "value": "buybusticket" + }, + { + "intent": "offer_intent", + "domain": "home_2", + "slot": "intent", + "value": "schedulevisit" + }, + { + "intent": "offer_intent", + "domain": "train_1", + "slot": "intent", + "value": "gettraintickets" + }, + { + "intent": "offer_intent", + "domain": "media_3", + "slot": "intent", + "value": "playmovie" + }, + { + "intent": "req_more", + "domain": "", + "slot": "", + "value": "" + }, + { + "intent": "request", + "domain": "restaurant_1", + "slot": "city", + "value": "" + }, + { + "intent": "request", + "domain": "restaurant_1", + "slot": "street_address", + "value": "" + }, + { + "intent": "request", + "domain": "restaurant_1", + "slot": "phone_number", + "value": "" + }, + { + "intent": "request", + "domain": "restaurant_1", + "slot": "time", + "value": "" + }, + { + "intent": "request", + "domain": "restaurant_1", + "slot": "has_live_music", + "value": "" + }, + { + "intent": "request", + "domain": "restaurant_1", + "slot": "serves_alcohol", + "value": "" + }, + { + "intent": "request", + "domain": "restaurant_1", + "slot": "cuisine", + "value": "" + }, + { + "intent": "request", + "domain": "restaurant_1", + "slot": "price_range", + "value": "" + }, + { + "intent": "request", + "domain": "media_1", + "slot": "title", + "value": "" + }, + { + "intent": "request", + "domain": "media_1", + "slot": "directed_by", + "value": "" + }, + { + "intent": "request", + "domain": "media_1", + "slot": "genre", + "value": "" + }, + { + "intent": "request", + "domain": "restaurant_1", + "slot": "restaurant_name", + "value": "" + }, + { + "intent": "request", + "domain": "event_2", + "slot": 
"venue_address", + "value": "" + }, + { + "intent": "request", + "domain": "event_2", + "slot": "city", + "value": "" + }, + { + "intent": "request", + "domain": "event_2", + "slot": "time", + "value": "" + }, + { + "intent": "request", + "domain": "event_2", + "slot": "event_name", + "value": "" + }, + { + "intent": "request", + "domain": "event_2", + "slot": "number_of_tickets", + "value": "" + }, + { + "intent": "request", + "domain": "music_2", + "slot": "genre", + "value": "" + }, + { + "intent": "request", + "domain": "music_1", + "slot": "genre", + "value": "" + }, + { + "intent": "request", + "domain": "music_1", + "slot": "year", + "value": "" + }, + { + "intent": "request", + "domain": "event_1", + "slot": "city_of_event", + "value": "" + }, + { + "intent": "request", + "domain": "event_1", + "slot": "subcategory", + "value": "" + }, + { + "intent": "request", + "domain": "event_1", + "slot": "address_of_location", + "value": "" + }, + { + "intent": "request", + "domain": "event_1", + "slot": "number_of_seats", + "value": "" + }, + { + "intent": "request", + "domain": "event_1", + "slot": "category", + "value": "" + }, + { + "intent": "request", + "domain": "event_2", + "slot": "event_type", + "value": "" + }, + { + "intent": "request", + "domain": "event_2", + "slot": "date", + "value": "" + }, + { + "intent": "request", + "domain": "event_2", + "slot": "category", + "value": "" + }, + { + "intent": "request", + "domain": "movie_1", + "slot": "location", + "value": "" + }, + { + "intent": "request", + "domain": "movie_1", + "slot": "show_date", + "value": "" + }, + { + "intent": "request", + "domain": "movie_1", + "slot": "street_address", + "value": "" + }, + { + "intent": "request", + "domain": "movie_1", + "slot": "price", + "value": "" + }, + { + "intent": "request", + "domain": "movie_1", + "slot": "genre", + "value": "" + }, + { + "intent": "request", + "domain": "event_1", + "slot": "date", + "value": "" + }, + { + "intent": "request", + "domain": 
"event_1", + "slot": "event_name", + "value": "" + }, + { + "intent": "request", + "domain": "event_1", + "slot": "event_location", + "value": "" + }, + { + "intent": "request", + "domain": "event_1", + "slot": "time", + "value": "" + }, + { + "intent": "request", + "domain": "event_2", + "slot": "venue", + "value": "" + }, + { + "intent": "request", + "domain": "flight_1", + "slot": "origin_city", + "value": "" + }, + { + "intent": "request", + "domain": "flight_1", + "slot": "destination_city", + "value": "" + }, + { + "intent": "request", + "domain": "flight_1", + "slot": "departure_date", + "value": "" + }, + { + "intent": "request", + "domain": "flight_1", + "slot": "origin_airport", + "value": "" + }, + { + "intent": "request", + "domain": "flight_1", + "slot": "refundable", + "value": "" + }, + { + "intent": "request", + "domain": "flight_1", + "slot": "destination_airport", + "value": "" + }, + { + "intent": "request", + "domain": "flight_1", + "slot": "outbound_arrival_time", + "value": "" + }, + { + "intent": "request", + "domain": "flight_2", + "slot": "origin", + "value": "" + }, + { + "intent": "request", + "domain": "flight_2", + "slot": "destination", + "value": "" + }, + { + "intent": "request", + "domain": "flight_2", + "slot": "departure_date", + "value": "" + }, + { + "intent": "request", + "domain": "flight_2", + "slot": "destination_airport", + "value": "" + }, + { + "intent": "request", + "domain": "flight_2", + "slot": "outbound_arrival_time", + "value": "" + }, + { + "intent": "request", + "domain": "flight_2", + "slot": "seating_class", + "value": "" + }, + { + "intent": "request", + "domain": "flight_2", + "slot": "is_redeye", + "value": "" + }, + { + "intent": "request", + "domain": "flight_2", + "slot": "origin_airport", + "value": "" + }, + { + "intent": "request", + "domain": "flight_2", + "slot": "passengers", + "value": "" + }, + { + "intent": "request", + "domain": "flight_1", + "slot": "price", + "value": "" + }, + { + "intent": 
"request", + "domain": "flight_1", + "slot": "airlines", + "value": "" + }, + { + "intent": "request", + "domain": "flight_1", + "slot": "outbound_departure_time", + "value": "" + }, + { + "intent": "request", + "domain": "flight_1", + "slot": "number_stops", + "value": "" + }, + { + "intent": "request", + "domain": "flight_1", + "slot": "return_date", + "value": "" + }, + { + "intent": "request", + "domain": "flight_1", + "slot": "inbound_arrival_time", + "value": "" + }, + { + "intent": "request", + "domain": "flight_2", + "slot": "return_date", + "value": "" + }, + { + "intent": "request", + "domain": "flight_2", + "slot": "number_stops", + "value": "" + }, + { + "intent": "request", + "domain": "flight_1", + "slot": "inbound_departure_time", + "value": "" + }, + { + "intent": "request", + "domain": "flight_2", + "slot": "inbound_arrival_time", + "value": "" + }, + { + "intent": "request", + "domain": "ridesharing_2", + "slot": "number_of_seats", + "value": "" + }, + { + "intent": "request", + "domain": "ridesharing_2", + "slot": "wait_time", + "value": "" + }, + { + "intent": "request", + "domain": "ridesharing_2", + "slot": "ride_fare", + "value": "" + }, + { + "intent": "request", + "domain": "ridesharing_2", + "slot": "destination", + "value": "" + }, + { + "intent": "request", + "domain": "ridesharing_2", + "slot": "ride_type", + "value": "" + }, + { + "intent": "request", + "domain": "ridesharing_1", + "slot": "number_of_riders", + "value": "" + }, + { + "intent": "request", + "domain": "ridesharing_1", + "slot": "destination", + "value": "" + }, + { + "intent": "request", + "domain": "ridesharing_1", + "slot": "ride_fare", + "value": "" + }, + { + "intent": "request", + "domain": "ridesharing_1", + "slot": "approximate_ride_duration", + "value": "" + }, + { + "intent": "request", + "domain": "ridesharing_1", + "slot": "shared_ride", + "value": "" + }, + { + "intent": "request", + "domain": "rentalcar_1", + "slot": "pickup_time", + "value": "" + }, + { + 
"intent": "request", + "domain": "rentalcar_1", + "slot": "dropoff_date", + "value": "" + }, + { + "intent": "request", + "domain": "rentalcar_1", + "slot": "pickup_city", + "value": "" + }, + { + "intent": "request", + "domain": "rentalcar_1", + "slot": "pickup_date", + "value": "" + }, + { + "intent": "request", + "domain": "rentalcar_1", + "slot": "total_price", + "value": "" + }, + { + "intent": "request", + "domain": "rentalcar_2", + "slot": "dropoff_date", + "value": "" + }, + { + "intent": "request", + "domain": "rentalcar_2", + "slot": "pickup_city", + "value": "" + }, + { + "intent": "request", + "domain": "rentalcar_2", + "slot": "pickup_date", + "value": "" + }, + { + "intent": "request", + "domain": "rentalcar_2", + "slot": "total_price", + "value": "" + }, + { + "intent": "request", + "domain": "rentalcar_2", + "slot": "pickup_time", + "value": "" + }, + { + "intent": "request", + "domain": "bus_2", + "slot": "origin", + "value": "" + }, + { + "intent": "request", + "domain": "bus_2", + "slot": "departure_date", + "value": "" + }, + { + "intent": "request", + "domain": "bus_2", + "slot": "destination", + "value": "" + }, + { + "intent": "request", + "domain": "bus_2", + "slot": "origin_station_name", + "value": "" + }, + { + "intent": "request", + "domain": "bus_2", + "slot": "destination_station_name", + "value": "" + }, + { + "intent": "request", + "domain": "bus_1", + "slot": "from_location", + "value": "" + }, + { + "intent": "request", + "domain": "bus_1", + "slot": "leaving_date", + "value": "" + }, + { + "intent": "request", + "domain": "bus_1", + "slot": "to_station", + "value": "" + }, + { + "intent": "request", + "domain": "bus_1", + "slot": "from_station", + "value": "" + }, + { + "intent": "request", + "domain": "bus_1", + "slot": "to_location", + "value": "" + }, + { + "intent": "request", + "domain": "hotel_2", + "slot": "where_to", + "value": "" + }, + { + "intent": "request", + "domain": "hotel_2", + "slot": "number_of_adults", + 
"value": "" + }, + { + "intent": "request", + "domain": "hotel_2", + "slot": "check_in_date", + "value": "" + }, + { + "intent": "request", + "domain": "hotel_2", + "slot": "check_out_date", + "value": "" + }, + { + "intent": "request", + "domain": "hotel_2", + "slot": "rating", + "value": "" + }, + { + "intent": "request", + "domain": "hotel_2", + "slot": "has_laundry_service", + "value": "" + }, + { + "intent": "request", + "domain": "hotel_2", + "slot": "phone_number", + "value": "" + }, + { + "intent": "request", + "domain": "hotel_2", + "slot": "address", + "value": "" + }, + { + "intent": "request", + "domain": "hotel_2", + "slot": "total_price", + "value": "" + }, + { + "intent": "request", + "domain": "bus_2", + "slot": "group_size", + "value": "" + }, + { + "intent": "request", + "domain": "bus_1", + "slot": "travelers", + "value": "" + }, + { + "intent": "request", + "domain": "services_2", + "slot": "city", + "value": "" + }, + { + "intent": "request", + "domain": "services_2", + "slot": "address", + "value": "" + }, + { + "intent": "request", + "domain": "services_2", + "slot": "offers_cosmetic_services", + "value": "" + }, + { + "intent": "request", + "domain": "services_2", + "slot": "phone_number", + "value": "" + }, + { + "intent": "request", + "domain": "services_2", + "slot": "appointment_date", + "value": "" + }, + { + "intent": "request", + "domain": "services_2", + "slot": "appointment_time", + "value": "" + }, + { + "intent": "request", + "domain": "services_1", + "slot": "city", + "value": "" + }, + { + "intent": "request", + "domain": "services_1", + "slot": "average_rating", + "value": "" + }, + { + "intent": "request", + "domain": "services_1", + "slot": "phone_number", + "value": "" + }, + { + "intent": "request", + "domain": "services_1", + "slot": "street_address", + "value": "" + }, + { + "intent": "request", + "domain": "services_1", + "slot": "appointment_date", + "value": "" + }, + { + "intent": "request", + "domain": "services_1", 
+ "slot": "is_unisex", + "value": "" + }, + { + "intent": "request", + "domain": "services_1", + "slot": "appointment_time", + "value": "" + }, + { + "intent": "request", + "domain": "services_3", + "slot": "city", + "value": "" + }, + { + "intent": "request", + "domain": "services_3", + "slot": "type", + "value": "" + }, + { + "intent": "request", + "domain": "services_3", + "slot": "phone_number", + "value": "" + }, + { + "intent": "request", + "domain": "services_3", + "slot": "street_address", + "value": "" + }, + { + "intent": "request", + "domain": "services_3", + "slot": "appointment_time", + "value": "" + }, + { + "intent": "request", + "domain": "services_3", + "slot": "average_rating", + "value": "" + }, + { + "intent": "request", + "domain": "services_3", + "slot": "appointment_date", + "value": "" + }, + { + "intent": "request", + "domain": "home_1", + "slot": "number_of_beds", + "value": "" + }, + { + "intent": "request", + "domain": "home_1", + "slot": "phone_number", + "value": "" + }, + { + "intent": "request", + "domain": "home_1", + "slot": "pets_allowed", + "value": "" + }, + { + "intent": "request", + "domain": "home_1", + "slot": "area", + "value": "" + }, + { + "intent": "request", + "domain": "home_1", + "slot": "furnished", + "value": "" + }, + { + "intent": "request", + "domain": "home_1", + "slot": "visit_date", + "value": "" + }, + { + "intent": "request", + "domain": "bank_1", + "slot": "recipient_account_name", + "value": "" + }, + { + "intent": "request", + "domain": "bank_1", + "slot": "amount", + "value": "" + }, + { + "intent": "request", + "domain": "bank_1", + "slot": "account_type", + "value": "" + }, + { + "intent": "request", + "domain": "services_2", + "slot": "dentist_name", + "value": "" + }, + { + "intent": "request", + "domain": "services_1", + "slot": "stylist_name", + "value": "" + }, + { + "intent": "request", + "domain": "services_3", + "slot": "doctor_name", + "value": "" + }, + { + "intent": "request", + "domain": 
"calendar_1", + "slot": "event_date", + "value": "" + }, + { + "intent": "request", + "domain": "calendar_1", + "slot": "event_time", + "value": "" + }, + { + "intent": "request", + "domain": "calendar_1", + "slot": "event_location", + "value": "" + }, + { + "intent": "request", + "domain": "calendar_1", + "slot": "event_name", + "value": "" + }, + { + "intent": "request", + "domain": "hotel_3", + "slot": "check_in_date", + "value": "" + }, + { + "intent": "request", + "domain": "hotel_3", + "slot": "location", + "value": "" + }, + { + "intent": "request", + "domain": "hotel_3", + "slot": "check_out_date", + "value": "" + }, + { + "intent": "request", + "domain": "hotel_3", + "slot": "average_rating", + "value": "" + }, + { + "intent": "request", + "domain": "hotel_3", + "slot": "price", + "value": "" + }, + { + "intent": "request", + "domain": "hotel_3", + "slot": "hotel_name", + "value": "" + }, + { + "intent": "request", + "domain": "hotel_3", + "slot": "pets_welcome", + "value": "" + }, + { + "intent": "request", + "domain": "hotel_3", + "slot": "phone_number", + "value": "" + }, + { + "intent": "request", + "domain": "hotel_3", + "slot": "street_address", + "value": "" + }, + { + "intent": "request", + "domain": "hotel_1", + "slot": "destination", + "value": "" + }, + { + "intent": "request", + "domain": "hotel_1", + "slot": "check_in_date", + "value": "" + }, + { + "intent": "request", + "domain": "hotel_1", + "slot": "hotel_name", + "value": "" + }, + { + "intent": "request", + "domain": "hotel_1", + "slot": "number_of_days", + "value": "" + }, + { + "intent": "request", + "domain": "hotel_1", + "slot": "has_wifi", + "value": "" + }, + { + "intent": "request", + "domain": "hotel_1", + "slot": "phone_number", + "value": "" + }, + { + "intent": "request", + "domain": "hotel_1", + "slot": "price_per_night", + "value": "" + }, + { + "intent": "request", + "domain": "hotel_1", + "slot": "star_rating", + "value": "" + }, + { + "intent": "request", + "domain": 
"hotel_1", + "slot": "street_address", + "value": "" + }, + { + "intent": "request", + "domain": "bus_2", + "slot": "departure_time", + "value": "" + }, + { + "intent": "request", + "domain": "bus_2", + "slot": "price", + "value": "" + }, + { + "intent": "request", + "domain": "movie_1", + "slot": "movie_name", + "value": "" + }, + { + "intent": "request", + "domain": "weather_1", + "slot": "city", + "value": "" + }, + { + "intent": "request", + "domain": "weather_1", + "slot": "humidity", + "value": "" + }, + { + "intent": "request", + "domain": "weather_1", + "slot": "wind", + "value": "" + }, + { + "intent": "request", + "domain": "weather_1", + "slot": "date", + "value": "" + }, + { + "intent": "request", + "domain": "music_2", + "slot": "song_name", + "value": "" + }, + { + "intent": "request", + "domain": "music_2", + "slot": "artist", + "value": "" + }, + { + "intent": "request", + "domain": "music_2", + "slot": "album", + "value": "" + }, + { + "intent": "request", + "domain": "music_1", + "slot": "album", + "value": "" + }, + { + "intent": "request", + "domain": "music_1", + "slot": "song_name", + "value": "" + }, + { + "intent": "request", + "domain": "music_1", + "slot": "artist", + "value": "" + }, + { + "intent": "request", + "domain": "rentalcar_1", + "slot": "pickup_location", + "value": "" + }, + { + "intent": "request", + "domain": "rentalcar_1", + "slot": "car_name", + "value": "" + }, + { + "intent": "request", + "domain": "rentalcar_1", + "slot": "type", + "value": "" + }, + { + "intent": "request", + "domain": "rentalcar_2", + "slot": "pickup_location", + "value": "" + }, + { + "intent": "request", + "domain": "rentalcar_2", + "slot": "car_name", + "value": "" + }, + { + "intent": "request", + "domain": "rentalcar_2", + "slot": "car_type", + "value": "" + }, + { + "intent": "request", + "domain": "travel_1", + "slot": "phone_number", + "value": "" + }, + { + "intent": "request", + "domain": "travel_1", + "slot": "good_for_kids", + "value": "" + 
}, + { + "intent": "request", + "domain": "travel_1", + "slot": "free_entry", + "value": "" + }, + { + "intent": "request", + "domain": "travel_1", + "slot": "location", + "value": "" + }, + { + "intent": "request", + "domain": "bus_1", + "slot": "fare", + "value": "" + }, + { + "intent": "request", + "domain": "bus_1", + "slot": "leaving_time", + "value": "" + }, + { + "intent": "request", + "domain": "bus_1", + "slot": "transfers", + "value": "" + }, + { + "intent": "request", + "domain": "restaurant_2", + "slot": "restaurant_name", + "value": "" + }, + { + "intent": "request", + "domain": "restaurant_2", + "slot": "location", + "value": "" + }, + { + "intent": "request", + "domain": "restaurant_2", + "slot": "phone_number", + "value": "" + }, + { + "intent": "request", + "domain": "restaurant_2", + "slot": "has_vegetarian_options", + "value": "" + }, + { + "intent": "request", + "domain": "restaurant_2", + "slot": "address", + "value": "" + }, + { + "intent": "request", + "domain": "restaurant_2", + "slot": "time", + "value": "" + }, + { + "intent": "request", + "domain": "restaurant_2", + "slot": "price_range", + "value": "" + }, + { + "intent": "request", + "domain": "restaurant_2", + "slot": "category", + "value": "" + }, + { + "intent": "request", + "domain": "restaurant_2", + "slot": "has_seating_outdoors", + "value": "" + }, + { + "intent": "request", + "domain": "restaurant_2", + "slot": "rating", + "value": "" + }, + { + "intent": "request", + "domain": "flight_3", + "slot": "origin_city", + "value": "" + }, + { + "intent": "request", + "domain": "flight_3", + "slot": "departure_date", + "value": "" + }, + { + "intent": "request", + "domain": "flight_3", + "slot": "destination_city", + "value": "" + }, + { + "intent": "request", + "domain": "flight_3", + "slot": "outbound_arrival_time", + "value": "" + }, + { + "intent": "request", + "domain": "flight_3", + "slot": "arrives_next_day", + "value": "" + }, + { + "intent": "request", + "domain": "flight_3", 
+ "slot": "destination_airport_name", + "value": "" + }, + { + "intent": "request", + "domain": "flight_3", + "slot": "origin_airport_name", + "value": "" + }, + { + "intent": "request", + "domain": "flight_3", + "slot": "number_checked_bags", + "value": "" + }, + { + "intent": "request", + "domain": "flight_3", + "slot": "passengers", + "value": "" + }, + { + "intent": "request", + "domain": "flight_3", + "slot": "return_date", + "value": "" + }, + { + "intent": "request", + "domain": "flight_3", + "slot": "flight_class", + "value": "" + }, + { + "intent": "request", + "domain": "flight_3", + "slot": "inbound_arrival_time", + "value": "" + }, + { + "intent": "request", + "domain": "alarm_1", + "slot": "new_alarm_time", + "value": "" + }, + { + "intent": "request", + "domain": "services_4", + "slot": "phone_number", + "value": "" + }, + { + "intent": "request", + "domain": "services_4", + "slot": "address", + "value": "" + }, + { + "intent": "request", + "domain": "services_4", + "slot": "appointment_date", + "value": "" + }, + { + "intent": "request", + "domain": "services_4", + "slot": "appointment_time", + "value": "" + }, + { + "intent": "request", + "domain": "services_4", + "slot": "type", + "value": "" + }, + { + "intent": "request", + "domain": "services_4", + "slot": "city", + "value": "" + }, + { + "intent": "request", + "domain": "bank_2", + "slot": "recipient_name", + "value": "" + }, + { + "intent": "request", + "domain": "bank_2", + "slot": "transfer_amount", + "value": "" + }, + { + "intent": "request", + "domain": "bank_2", + "slot": "transfer_time", + "value": "" + }, + { + "intent": "request", + "domain": "bank_2", + "slot": "account_type", + "value": "" + }, + { + "intent": "request", + "domain": "movie_2", + "slot": "director", + "value": "" + }, + { + "intent": "request", + "domain": "movie_2", + "slot": "genre", + "value": "" + }, + { + "intent": "request", + "domain": "movie_2", + "slot": "starring", + "value": "" + }, + { + "intent": 
"request", + "domain": "media_2", + "slot": "genre", + "value": "" + }, + { + "intent": "request", + "domain": "media_2", + "slot": "price", + "value": "" + }, + { + "intent": "request", + "domain": "media_2", + "slot": "actors", + "value": "" + }, + { + "intent": "request", + "domain": "media_2", + "slot": "director", + "value": "" + }, + { + "intent": "request", + "domain": "hotel_4", + "slot": "location", + "value": "" + }, + { + "intent": "request", + "domain": "hotel_4", + "slot": "smoking_allowed", + "value": "" + }, + { + "intent": "request", + "domain": "hotel_4", + "slot": "phone_number", + "value": "" + }, + { + "intent": "request", + "domain": "hotel_4", + "slot": "price_per_night", + "value": "" + }, + { + "intent": "request", + "domain": "hotel_4", + "slot": "stay_length", + "value": "" + }, + { + "intent": "request", + "domain": "hotel_4", + "slot": "check_in_date", + "value": "" + }, + { + "intent": "request", + "domain": "hotel_4", + "slot": "street_address", + "value": "" + }, + { + "intent": "request", + "domain": "flight_3", + "slot": "number_stops", + "value": "" + }, + { + "intent": "request", + "domain": "music_3", + "slot": "genre", + "value": "" + }, + { + "intent": "request", + "domain": "music_3", + "slot": "year", + "value": "" + }, + { + "intent": "request", + "domain": "event_3", + "slot": "price_per_ticket", + "value": "" + }, + { + "intent": "request", + "domain": "event_3", + "slot": "city", + "value": "" + }, + { + "intent": "request", + "domain": "event_3", + "slot": "venue_address", + "value": "" + }, + { + "intent": "request", + "domain": "event_3", + "slot": "event_type", + "value": "" + }, + { + "intent": "request", + "domain": "event_3", + "slot": "number_of_tickets", + "value": "" + }, + { + "intent": "request", + "domain": "flight_4", + "slot": "origin_airport", + "value": "" + }, + { + "intent": "request", + "domain": "flight_4", + "slot": "departure_date", + "value": "" + }, + { + "intent": "request", + "domain": 
"flight_4", + "slot": "destination_airport", + "value": "" + }, + { + "intent": "request", + "domain": "flight_4", + "slot": "outbound_arrival_time", + "value": "" + }, + { + "intent": "request", + "domain": "flight_4", + "slot": "number_of_tickets", + "value": "" + }, + { + "intent": "request", + "domain": "flight_4", + "slot": "seating_class", + "value": "" + }, + { + "intent": "request", + "domain": "flight_4", + "slot": "return_date", + "value": "" + }, + { + "intent": "request", + "domain": "flight_4", + "slot": "inbound_arrival_time", + "value": "" + }, + { + "intent": "request", + "domain": "rentalcar_3", + "slot": "pickup_time", + "value": "" + }, + { + "intent": "request", + "domain": "rentalcar_3", + "slot": "end_date", + "value": "" + }, + { + "intent": "request", + "domain": "rentalcar_3", + "slot": "start_date", + "value": "" + }, + { + "intent": "request", + "domain": "rentalcar_3", + "slot": "city", + "value": "" + }, + { + "intent": "request", + "domain": "rentalcar_3", + "slot": "price_per_day", + "value": "" + }, + { + "intent": "request", + "domain": "rentalcar_3", + "slot": "add_insurance", + "value": "" + }, + { + "intent": "request", + "domain": "bus_3", + "slot": "to_city", + "value": "" + }, + { + "intent": "request", + "domain": "bus_3", + "slot": "departure_date", + "value": "" + }, + { + "intent": "request", + "domain": "bus_3", + "slot": "from_city", + "value": "" + }, + { + "intent": "request", + "domain": "bus_3", + "slot": "category", + "value": "" + }, + { + "intent": "request", + "domain": "bus_3", + "slot": "to_station", + "value": "" + }, + { + "intent": "request", + "domain": "bus_3", + "slot": "from_station", + "value": "" + }, + { + "intent": "request", + "domain": "bus_3", + "slot": "num_passengers", + "value": "" + }, + { + "intent": "request", + "domain": "home_2", + "slot": "number_of_beds", + "value": "" + }, + { + "intent": "request", + "domain": "home_2", + "slot": "area", + "value": "" + }, + { + "intent": "request", + 
"domain": "home_2", + "slot": "in_unit_laundry", + "value": "" + }, + { + "intent": "request", + "domain": "home_2", + "slot": "visit_date", + "value": "" + }, + { + "intent": "request", + "domain": "home_2", + "slot": "intent", + "value": "" + }, + { + "intent": "request", + "domain": "home_2", + "slot": "has_garage", + "value": "" + }, + { + "intent": "request", + "domain": "home_2", + "slot": "phone_number", + "value": "" + }, + { + "intent": "request", + "domain": "home_2", + "slot": "number_of_baths", + "value": "" + }, + { + "intent": "request", + "domain": "movie_1", + "slot": "number_of_tickets", + "value": "" + }, + { + "intent": "request", + "domain": "movie_1", + "slot": "show_type", + "value": "" + }, + { + "intent": "request", + "domain": "movie_1", + "slot": "show_time", + "value": "" + }, + { + "intent": "request", + "domain": "movie_1", + "slot": "theater_name", + "value": "" + }, + { + "intent": "request", + "domain": "home_2", + "slot": "property_name", + "value": "" + }, + { + "intent": "request", + "domain": "home_2", + "slot": "price", + "value": "" + }, + { + "intent": "request", + "domain": "home_2", + "slot": "address", + "value": "" + }, + { + "intent": "request", + "domain": "payment_1", + "slot": "amount", + "value": "" + }, + { + "intent": "request", + "domain": "payment_1", + "slot": "receiver", + "value": "" + }, + { + "intent": "request", + "domain": "payment_1", + "slot": "payment_method", + "value": "" + }, + { + "intent": "request", + "domain": "train_1", + "slot": "date_of_journey", + "value": "" + }, + { + "intent": "request", + "domain": "train_1", + "slot": "from", + "value": "" + }, + { + "intent": "request", + "domain": "train_1", + "slot": "to", + "value": "" + }, + { + "intent": "request", + "domain": "train_1", + "slot": "from_station", + "value": "" + }, + { + "intent": "request", + "domain": "train_1", + "slot": "to_station", + "value": "" + }, + { + "intent": "request", + "domain": "train_1", + "slot": 
"number_of_adults", + "value": "" + }, + { + "intent": "request", + "domain": "train_1", + "slot": "trip_protection", + "value": "" + }, + { + "intent": "request", + "domain": "movie_3", + "slot": "cast", + "value": "" + }, + { + "intent": "request", + "domain": "movie_3", + "slot": "directed_by", + "value": "" + }, + { + "intent": "request", + "domain": "movie_3", + "slot": "genre", + "value": "" + }, + { + "intent": "request", + "domain": "media_3", + "slot": "genre", + "value": "" + }, + { + "intent": "request", + "domain": "media_3", + "slot": "starring", + "value": "" + }, + { + "intent": "request", + "domain": "media_3", + "slot": "title", + "value": "" + }, + { + "intent": "request", + "domain": "bus_3", + "slot": "departure_time", + "value": "" + }, + { + "intent": "request", + "domain": "bus_3", + "slot": "price", + "value": "" + }, + { + "intent": "request", + "domain": "messaging_1", + "slot": "contact_name", + "value": "" + }, + { + "intent": "request", + "domain": "event_3", + "slot": "date", + "value": "" + }, + { + "intent": "request", + "domain": "event_3", + "slot": "event_name", + "value": "" + }, + { + "intent": "request", + "domain": "event_3", + "slot": "venue", + "value": "" + }, + { + "intent": "request", + "domain": "event_3", + "slot": "time", + "value": "" + }, + { + "intent": "request", + "domain": "rentalcar_3", + "slot": "car_name", + "value": "" + }, + { + "intent": "request", + "domain": "rentalcar_3", + "slot": "pickup_location", + "value": "" + }, + { + "intent": "request", + "domain": "rentalcar_3", + "slot": "car_type", + "value": "" + }, + { + "intent": "request_alts", + "domain": "restaurant_1", + "slot": "", + "value": "" + }, + { + "intent": "request_alts", + "domain": "event_2", + "slot": "", + "value": "" + }, + { + "intent": "request_alts", + "domain": "music_2", + "slot": "", + "value": "" + }, + { + "intent": "request_alts", + "domain": "music_1", + "slot": "", + "value": "" + }, + { + "intent": "request_alts", + 
"domain": "event_1", + "slot": "", + "value": "" + }, + { + "intent": "request_alts", + "domain": "movie_1", + "slot": "", + "value": "" + }, + { + "intent": "request_alts", + "domain": "flight_1", + "slot": "", + "value": "" + }, + { + "intent": "request_alts", + "domain": "flight_2", + "slot": "", + "value": "" + }, + { + "intent": "request_alts", + "domain": "media_1", + "slot": "", + "value": "" + }, + { + "intent": "request_alts", + "domain": "rentalcar_1", + "slot": "", + "value": "" + }, + { + "intent": "request_alts", + "domain": "rentalcar_2", + "slot": "", + "value": "" + }, + { + "intent": "request_alts", + "domain": "bus_2", + "slot": "", + "value": "" + }, + { + "intent": "request_alts", + "domain": "bus_1", + "slot": "", + "value": "" + }, + { + "intent": "request_alts", + "domain": "services_2", + "slot": "", + "value": "" + }, + { + "intent": "request_alts", + "domain": "services_1", + "slot": "", + "value": "" + }, + { + "intent": "request_alts", + "domain": "services_3", + "slot": "", + "value": "" + }, + { + "intent": "request_alts", + "domain": "home_1", + "slot": "", + "value": "" + }, + { + "intent": "request_alts", + "domain": "bank_1", + "slot": "", + "value": "" + }, + { + "intent": "request_alts", + "domain": "hotel_2", + "slot": "", + "value": "" + }, + { + "intent": "request_alts", + "domain": "calendar_1", + "slot": "", + "value": "" + }, + { + "intent": "request_alts", + "domain": "hotel_3", + "slot": "", + "value": "" + }, + { + "intent": "request_alts", + "domain": "hotel_1", + "slot": "", + "value": "" + }, + { + "intent": "request_alts", + "domain": "weather_1", + "slot": "", + "value": "" + }, + { + "intent": "request_alts", + "domain": "travel_1", + "slot": "", + "value": "" + }, + { + "intent": "request_alts", + "domain": "flight_3", + "slot": "", + "value": "" + }, + { + "intent": "request_alts", + "domain": "alarm_1", + "slot": "", + "value": "" + }, + { + "intent": "request_alts", + "domain": "services_4", + "slot": "", + 
"value": "" + }, + { + "intent": "request_alts", + "domain": "restaurant_2", + "slot": "", + "value": "" + }, + { + "intent": "request_alts", + "domain": "bank_2", + "slot": "", + "value": "" + }, + { + "intent": "request_alts", + "domain": "movie_2", + "slot": "", + "value": "" + }, + { + "intent": "request_alts", + "domain": "media_2", + "slot": "", + "value": "" + }, + { + "intent": "request_alts", + "domain": "hotel_4", + "slot": "", + "value": "" + }, + { + "intent": "request_alts", + "domain": "music_3", + "slot": "", + "value": "" + }, + { + "intent": "request_alts", + "domain": "event_3", + "slot": "", + "value": "" + }, + { + "intent": "request_alts", + "domain": "flight_4", + "slot": "", + "value": "" + }, + { + "intent": "request_alts", + "domain": "rentalcar_3", + "slot": "", + "value": "" + }, + { + "intent": "request_alts", + "domain": "bus_3", + "slot": "", + "value": "" + }, + { + "intent": "request_alts", + "domain": "home_2", + "slot": "", + "value": "" + }, + { + "intent": "request_alts", + "domain": "train_1", + "slot": "", + "value": "" + }, + { + "intent": "request_alts", + "domain": "movie_3", + "slot": "", + "value": "" + }, + { + "intent": "request_alts", + "domain": "media_3", + "slot": "", + "value": "" + }, + { + "intent": "select", + "domain": "restaurant_1", + "slot": "", + "value": "" + }, + { + "intent": "select", + "domain": "event_2", + "slot": "", + "value": "" + }, + { + "intent": "select", + "domain": "music_2", + "slot": "", + "value": "" + }, + { + "intent": "select", + "domain": "music_1", + "slot": "", + "value": "" + }, + { + "intent": "select", + "domain": "event_1", + "slot": "", + "value": "" + }, + { + "intent": "select", + "domain": "movie_1", + "slot": "", + "value": "" + }, + { + "intent": "select", + "domain": "flight_1", + "slot": "", + "value": "" + }, + { + "intent": "select", + "domain": "flight_2", + "slot": "", + "value": "" + }, + { + "intent": "select", + "domain": "rentalcar_1", + "slot": "", + "value": "" 
+ }, + { + "intent": "select", + "domain": "rentalcar_2", + "slot": "", + "value": "" + }, + { + "intent": "select", + "domain": "bus_2", + "slot": "", + "value": "" + }, + { + "intent": "select", + "domain": "bus_1", + "slot": "", + "value": "" + }, + { + "intent": "select", + "domain": "services_2", + "slot": "", + "value": "" + }, + { + "intent": "select", + "domain": "services_1", + "slot": "", + "value": "" + }, + { + "intent": "select", + "domain": "services_3", + "slot": "", + "value": "" + }, + { + "intent": "select", + "domain": "home_1", + "slot": "", + "value": "" + }, + { + "intent": "select", + "domain": "bank_1", + "slot": "", + "value": "" + }, + { + "intent": "select", + "domain": "hotel_2", + "slot": "", + "value": "" + }, + { + "intent": "select", + "domain": "calendar_1", + "slot": "", + "value": "" + }, + { + "intent": "select", + "domain": "hotel_3", + "slot": "", + "value": "" + }, + { + "intent": "select", + "domain": "hotel_1", + "slot": "", + "value": "" + }, + { + "intent": "select", + "domain": "weather_1", + "slot": "", + "value": "" + }, + { + "intent": "select", + "domain": "travel_1", + "slot": "", + "value": "" + }, + { + "intent": "select", + "domain": "flight_3", + "slot": "", + "value": "" + }, + { + "intent": "select", + "domain": "alarm_1", + "slot": "", + "value": "" + }, + { + "intent": "select", + "domain": "services_4", + "slot": "", + "value": "" + }, + { + "intent": "select", + "domain": "restaurant_2", + "slot": "", + "value": "" + }, + { + "intent": "select", + "domain": "bank_2", + "slot": "", + "value": "" + }, + { + "intent": "select", + "domain": "movie_2", + "slot": "", + "value": "" + }, + { + "intent": "select", + "domain": "hotel_4", + "slot": "", + "value": "" + }, + { + "intent": "select", + "domain": "music_3", + "slot": "", + "value": "" + }, + { + "intent": "select", + "domain": "event_3", + "slot": "", + "value": "" + }, + { + "intent": "select", + "domain": "flight_4", + "slot": "", + "value": "" + }, + { 
+ "intent": "select", + "domain": "rentalcar_3", + "slot": "", + "value": "" + }, + { + "intent": "select", + "domain": "bus_3", + "slot": "", + "value": "" + }, + { + "intent": "select", + "domain": "home_2", + "slot": "", + "value": "" + }, + { + "intent": "select", + "domain": "train_1", + "slot": "", + "value": "" + }, + { + "intent": "select", + "domain": "movie_3", + "slot": "", + "value": "" + }, + { + "intent": "thank_you", + "domain": "", + "slot": "", + "value": "" + } + ], + "state": { + "bank_1": { + "account_type": "", + "recipient_account_type": "", + "balance": "", + "amount": "", + "recipient_account_name": "" + }, + "bus_1": { + "from_location": "", + "to_location": "", + "from_station": "", + "to_station": "", + "leaving_date": "", + "leaving_time": "", + "fare": "", + "travelers": "", + "transfers": "" + }, + "bus_2": { + "origin": "", + "destination": "", + "origin_station_name": "", + "destination_station_name": "", + "departure_date": "", + "price": "", + "departure_time": "", + "group_size": "", + "fare_type": "" + }, + "calendar_1": { + "event_date": "", + "event_time": "", + "event_location": "", + "event_name": "", + "available_start_time": "", + "available_end_time": "" + }, + "event_1": { + "category": "", + "subcategory": "", + "event_name": "", + "date": "", + "time": "", + "number_of_seats": "", + "city_of_event": "", + "event_location": "", + "address_of_location": "" + }, + "event_2": { + "event_type": "", + "category": "", + "event_name": "", + "date": "", + "time": "", + "number_of_tickets": "", + "city": "", + "venue": "", + "venue_address": "" + }, + "flight_1": { + "passengers": "", + "seating_class": "", + "origin_city": "", + "destination_city": "", + "origin_airport": "", + "destination_airport": "", + "departure_date": "", + "return_date": "", + "number_stops": "", + "outbound_departure_time": "", + "outbound_arrival_time": "", + "inbound_arrival_time": "", + "inbound_departure_time": "", + "price": "", + "refundable": "", 
+ "airlines": "" + }, + "flight_2": { + "passengers": "", + "seating_class": "", + "origin": "", + "destination": "", + "origin_airport": "", + "destination_airport": "", + "departure_date": "", + "return_date": "", + "number_stops": "", + "outbound_departure_time": "", + "outbound_arrival_time": "", + "inbound_arrival_time": "", + "inbound_departure_time": "", + "fare": "", + "is_redeye": "", + "airlines": "" + }, + "home_1": { + "area": "", + "address": "", + "property_name": "", + "phone_number": "", + "furnished": "", + "pets_allowed": "", + "rent": "", + "visit_date": "", + "number_of_beds": "", + "number_of_baths": "" + }, + "hotel_1": { + "destination": "", + "number_of_rooms": "", + "check_in_date": "", + "number_of_days": "", + "star_rating": "", + "hotel_name": "", + "street_address": "", + "phone_number": "", + "price_per_night": "", + "has_wifi": "" + }, + "hotel_2": { + "where_to": "", + "number_of_adults": "", + "check_in_date": "", + "check_out_date": "", + "rating": "", + "address": "", + "phone_number": "", + "total_price": "", + "has_laundry_service": "" + }, + "hotel_3": { + "location": "", + "number_of_rooms": "", + "check_in_date": "", + "check_out_date": "", + "average_rating": "", + "hotel_name": "", + "street_address": "", + "phone_number": "", + "price": "", + "pets_welcome": "" + }, + "media_1": { + "title": "", + "genre": "", + "subtitles": "", + "directed_by": "" + }, + "movie_1": { + "price": "", + "number_of_tickets": "", + "show_type": "", + "theater_name": "", + "show_time": "", + "show_date": "", + "genre": "", + "street_address": "", + "location": "", + "movie_name": "" + }, + "music_1": { + "song_name": "", + "artist": "", + "album": "", + "genre": "", + "year": "", + "playback_device": "" + }, + "music_2": { + "song_name": "", + "artist": "", + "album": "", + "genre": "", + "playback_device": "" + }, + "rentalcar_1": { + "type": "", + "car_name": "", + "pickup_location": "", + "pickup_date": "", + "pickup_time": "", + 
"pickup_city": "", + "dropoff_date": "", + "total_price": "" + }, + "rentalcar_2": { + "car_type": "", + "car_name": "", + "pickup_location": "", + "pickup_date": "", + "pickup_time": "", + "pickup_city": "", + "dropoff_date": "", + "total_price": "" + }, + "restaurant_1": { + "restaurant_name": "", + "date": "", + "time": "", + "serves_alcohol": "", + "has_live_music": "", + "phone_number": "", + "street_address": "", + "party_size": "", + "price_range": "", + "city": "", + "cuisine": "" + }, + "ridesharing_1": { + "destination": "", + "shared_ride": "", + "ride_fare": "", + "approximate_ride_duration": "", + "number_of_riders": "" + }, + "ridesharing_2": { + "destination": "", + "ride_type": "", + "ride_fare": "", + "wait_time": "", + "number_of_seats": "" + }, + "services_1": { + "stylist_name": "", + "phone_number": "", + "average_rating": "", + "is_unisex": "", + "street_address": "", + "city": "", + "appointment_date": "", + "appointment_time": "" + }, + "services_2": { + "dentist_name": "", + "phone_number": "", + "address": "", + "city": "", + "appointment_date": "", + "appointment_time": "", + "offers_cosmetic_services": "" + }, + "services_3": { + "doctor_name": "", + "phone_number": "", + "average_rating": "", + "street_address": "", + "city": "", + "appointment_date": "", + "appointment_time": "", + "type": "" + }, + "travel_1": { + "location": "", + "attraction_name": "", + "category": "", + "phone_number": "", + "free_entry": "", + "good_for_kids": "" + }, + "weather_1": { + "precipitation": "", + "humidity": "", + "wind": "", + "temperature": "", + "city": "", + "date": "" + }, + "alarm_1": { + "alarm_time": "", + "alarm_name": "", + "new_alarm_time": "", + "new_alarm_name": "" + }, + "bank_2": { + "account_type": "", + "recipient_account_type": "", + "account_balance": "", + "transfer_amount": "", + "recipient_name": "", + "transfer_time": "" + }, + "flight_3": { + "passengers": "", + "flight_class": "", + "origin_city": "", + "destination_city": 
"", + "origin_airport_name": "", + "destination_airport_name": "", + "departure_date": "", + "return_date": "", + "number_stops": "", + "outbound_departure_time": "", + "outbound_arrival_time": "", + "inbound_arrival_time": "", + "inbound_departure_time": "", + "price": "", + "number_checked_bags": "", + "airlines": "", + "arrives_next_day": "" + }, + "hotel_4": { + "location": "", + "number_of_rooms": "", + "check_in_date": "", + "stay_length": "", + "star_rating": "", + "place_name": "", + "street_address": "", + "phone_number": "", + "price_per_night": "", + "smoking_allowed": "" + }, + "media_2": { + "movie_name": "", + "genre": "", + "subtitle_language": "", + "director": "", + "actors": "", + "price": "" + }, + "movie_2": { + "title": "", + "genre": "", + "aggregate_rating": "", + "starring": "", + "director": "" + }, + "restaurant_2": { + "restaurant_name": "", + "date": "", + "time": "", + "has_seating_outdoors": "", + "has_vegetarian_options": "", + "phone_number": "", + "rating": "", + "address": "", + "number_of_seats": "", + "price_range": "", + "location": "", + "category": "" + }, + "services_4": { + "therapist_name": "", + "phone_number": "", + "address": "", + "city": "", + "appointment_date": "", + "appointment_time": "", + "type": "" + }, + "bus_3": { + "from_city": "", + "to_city": "", + "from_station": "", + "to_station": "", + "departure_date": "", + "departure_time": "", + "price": "", + "additional_luggage": "", + "num_passengers": "", + "category": "" + }, + "event_3": { + "event_type": "", + "event_name": "", + "date": "", + "time": "", + "number_of_tickets": "", + "price_per_ticket": "", + "city": "", + "venue": "", + "venue_address": "" + }, + "flight_4": { + "number_of_tickets": "", + "seating_class": "", + "origin_airport": "", + "destination_airport": "", + "departure_date": "", + "return_date": "", + "is_nonstop": "", + "outbound_departure_time": "", + "outbound_arrival_time": "", + "inbound_arrival_time": "", + 
"inbound_departure_time": "", + "price": "", + "airlines": "" + }, + "home_2": { + "intent": "", + "area": "", + "address": "", + "property_name": "", + "phone_number": "", + "has_garage": "", + "in_unit_laundry": "", + "price": "", + "visit_date": "", + "number_of_beds": "", + "number_of_baths": "" + }, + "media_3": { + "title": "", + "genre": "", + "subtitle_language": "", + "starring": "" + }, + "messaging_1": { + "location": "", + "contact_name": "" + }, + "movie_3": { + "movie_title": "", + "genre": "", + "percent_rating": "", + "cast": "", + "directed_by": "" + }, + "music_3": { + "track": "", + "artist": "", + "album": "", + "genre": "", + "year": "", + "device": "" + }, + "payment_1": { + "payment_method": "", + "amount": "", + "receiver": "", + "private_visibility": "" + }, + "rentalcar_3": { + "car_type": "", + "car_name": "", + "pickup_location": "", + "start_date": "", + "pickup_time": "", + "city": "", + "end_date": "", + "price_per_day": "", + "add_insurance": "" + }, + "train_1": { + "from": "", + "to": "", + "from_station": "", + "to_station": "", + "date_of_journey": "", + "journey_start_time": "", + "total": "", + "number_of_adults": "", + "class": "", + "trip_protection": "" + } + } +} \ No newline at end of file diff --git a/data/unified_datasets/schema/original_data.zip b/data/unified_datasets/schema/original_data.zip new file mode 100644 index 0000000000000000000000000000000000000000..110e958b8b1f73f0c57f8660f9201a2dfb340f71 Binary files /dev/null and b/data/unified_datasets/schema/original_data.zip differ diff --git a/data/unified_datasets/schema/preprocess.py b/data/unified_datasets/schema/preprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..46913390c7c89978965b0c1ae65067b4c14fe9a9 --- /dev/null +++ b/data/unified_datasets/schema/preprocess.py @@ -0,0 +1,549 @@ +import zipfile +import json +import os +from pprint import pprint +from copy import deepcopy +from collections import Counter +from tqdm import tqdm 
def pharse_in_sen(phrase, sen):
    '''
    Locate a slot value inside an utterance, case-insensitively.

    The value must be delimited by the start/end of the sentence, whitespace
    or punctuation. A purely numeric value uses a stricter delimiter pattern,
    and if it is not found as digits it is retried in its spelled-out form
    taken from the module-level ``digit2word`` table.

    Side effect: increments one bucket of the module-level ``match`` counter
    ('0' = not found, '1' = found exactly once, '>1' = found several times).

    :param phrase: str, the value to look for
    :param sen: str, the utterance to search in
    :return: ``((start, end), num)`` where ``start``/``end`` are the character
        span of the first occurrence and ``num`` is the number of
        non-overlapping occurrences; ``((None, None), 0)`` if nothing matched
    '''
    assert isinstance(phrase, str)
    # Raw strings: the templates contain regex escapes (\s, \., \?, \D) that
    # are invalid *string* escapes and trigger warnings in non-raw literals.
    pw = r'(^|[\s,\.:\?!-])(?P<v>{})([\s,\.:\?!-]|$)'
    pn = r'(^|[\s\?!-]|\D[,\.:])(?P<v>{})($|[\s\?!-]|[,\.:]\D|[,\.:]$)'

    def _find(template, value):
        # Scan once; the first hit gives the span, the hit count gives `num`.
        regex = re.compile(template.format(re.escape(value)), re.I)
        hits = list(regex.finditer(sen))
        if not hits:
            return None
        match['>1' if len(hits) > 1 else '1'] += 1
        return hits[0].span('v'), len(hits)

    is_number = phrase.isdigit()
    found = _find(pn if is_number else pw, phrase)
    if found is None and is_number and phrase in digit2word:
        # e.g. the annotation says "2" but the utterance says "two".
        found = _find(pw, digit2word[phrase])
    if found is not None:
        return found
    match['0'] += 1
    return (None, None), 0
return sen.index(' ' + word + ' ') + 1, sen.index(' ' + word + ' ') + 1 + len(word) + elif ' '+word+'.' in sen: + return sen.index(' ' + word + '.') + 1, sen.index(' ' + word + '.') + 1 + len(word) + elif ' '+word+',' in sen: + return sen.index(' ' + word + ',') + 1, sen.index(' ' + word + ',') + 1 + len(word) + elif sen.startswith(word+ ' ') or sen.startswith(word+'.') or sen.startswith(word+','): + return 0, len(word) + elif word.isdigit() and word in digit2word: + ori_word = word + ori_sen = sen + word = digit2word[word] + sen = sen.lower() + if ' ' + word + ' ' in sen: + return sen.index(' ' + word + ' ') + 1, sen.index(' ' + word + ' ') + 1 + len(word) + elif ' ' + word + '.' in sen: + return sen.index(' ' + word + '.') + 1, sen.index(' ' + word + '.') + 1 + len(word) + elif ' ' + word + ',' in sen: + return sen.index(' ' + word + ',') + 1, sen.index(' ' + word + ',') + 1 + len(word) + elif sen.startswith(word + ' ') or sen.startswith(word + '.') or sen.startswith(word + ','): + return 0, len(word) + word = ori_word + sen = ori_sen + return sen.index(word) + + +def sys_intent(): + return { + "inform": {"description": "Inform the value for a slot to the user."}, + "request": {"description": "Request the value of a slot from the user."}, + "confirm": {"description": "Confirm the value of a slot before making a transactional service call."}, + "offer": {"description": "Offer a certain value for a slot to the user."}, + "notify_success": {"description": "Inform the user that their request was successful."}, + "notify_failure": {"description": "Inform the user that their request failed."}, + "inform_count": {"description": "Inform the number of items found that satisfy the user's request."}, + "offer_intent": {"description": "Offer a new intent to the user."}, + "req_more": {"description": "Asking the user if they need anything else."}, + "goodbye": {"description": "End the dialogue."}, + } + + +def usr_intent(): + return { + "inform_intent": {"description": 
"Express the desire to perform a certain task to the system."}, + "negate_intent": {"description": "Negate the intent which has been offered by the system."}, + "affirm_intent": {"description": "Agree to the intent which has been offered by the system."}, + "inform": {"description": "Inform the value of a slot to the system."}, + "request": {"description": "Request the value of a slot from the system."}, + "affirm": {"description": "Agree to the system's proposition. "}, + "negate": {"description": "Deny the system's proposal."}, + "select": {"description": "Select a result being offered by the system."}, + "request_alts": {"description": "Ask for more results besides the ones offered by the system."}, + "thank_you": {"description": "Thank the system."}, + "goodbye": {"description": "End the dialogue."}, + } + + +def get_intent(): + """merge sys & usr intent""" + return { + "inform": {"description": "Inform the value for a slot."}, + "request": {"description": "Request the value of a slot."}, + "confirm": {"description": "Confirm the value of a slot before making a transactional service call."}, + "offer": {"description": "Offer a certain value for a slot to the user."}, + "notify_success": {"description": "Inform the user that their request was successful."}, + "notify_failure": {"description": "Inform the user that their request failed."}, + "inform_count": {"description": "Inform the number of items found that satisfy the user's request."}, + "offer_intent": {"description": "Offer a new intent to the user."}, + "req_more": {"description": "Asking the user if they need anything else."}, + "goodbye": {"description": "End the dialogue."}, + "inform_intent": {"description": "Express the desire to perform a certain task to the system."}, + "negate_intent": {"description": "Negate the intent which has been offered by the system."}, + "affirm_intent": {"description": "Agree to the intent which has been offered by the system."}, + "affirm": {"description": "Agree to the 
system's proposition. "}, + "negate": {"description": "Deny the system's proposal."}, + "select": {"description": "Select a result being offered by the system."}, + "request_alts": {"description": "Ask for more results besides the ones offered by the system."}, + "thank_you": {"description": "Thank the system."}, + } + + +def preprocess(): + processed_dialogue = [] + ontology = {'domains': {}, + 'intents': {}, + 'binary_dialogue_act': [], + 'state': {}} + ontology['intents'].update(get_intent()) + numerical_slots = {} + original_zipped_path = os.path.join(self_dir, 'original_data.zip') + new_dir = os.path.join(self_dir, 'original_data') + if not os.path.exists(original_zipped_path): + raise FileNotFoundError(original_zipped_path) + if not os.path.exists(os.path.join(self_dir, 'data.zip')) or not os.path.exists(os.path.join(self_dir, 'ontology.json')): + print('unzip to', new_dir) + print('This may take several minutes') + archive = zipfile.ZipFile(original_zipped_path, 'r') + archive.extractall(self_dir) + cnt = 1 + non_cate_slot_update_cnt = 0 + non_cate_slot_update_fail_cnt = 0 + state_cnt = {} + num_train_dialog = 0 + num_train_utt = 0 + for data_split in ['train', 'dev', 'test']: + dataset_name = 'schema' + data_dir = os.path.join(new_dir, data_split) + # schema => ontology + f = open(os.path.join(data_dir, 'schema.json')) + data = json.load(f) + for schema in data: + domain = service2domain(schema['service_name']) + ontology['domains'].setdefault(domain, {}) + ontology['domains'][domain]['description'] = schema['description'] + ontology['domains'][domain].setdefault('slots', {}) + ontology['state'].setdefault(domain, {}) + for slot in schema['slots']: + # numerical => non-categorical: not use + # is_numerical = slot['is_categorical'] + # for value in slot['possible_values']: + # if not value.isdigit(): + # is_numerical = False + # break + # if is_numerical: + # numerical_slots.setdefault(slot['name'].lower(), 1) + lower_values = [x.lower() for x in 
slot['possible_values']] + ontology['domains'][domain]['slots'][slot['name'].lower()] = { + "description": slot['description'], + "is_categorical": slot['is_categorical'], + "possible_values": lower_values + } + ontology['state'][domain][slot['name'].lower()] = '' + # add 'count' slot + ontology['domains'][domain]['slots']['count'] = { + "description": "the number of items found that satisfy the user's request.", + "is_categorical": False, + "possible_values": [] + } + # ontology['state'][domain]['count'] = '' + # pprint(numerical_slots) + # dialog + for root, dirs, files in os.walk(data_dir): + fs = sorted([x for x in files if 'dialogues' in x]) + for f in tqdm(fs, desc='processing schema-guided-{}'.format(data_split)): + data = json.load(open(os.path.join(data_dir, f))) + if data_split == 'train': + num_train_dialog += len(data) + for d in data: + dialogue = { + "dataset": dataset_name, + "data_split": data_split if data_split!='dev' else 'val', + "dialogue_id": dataset_name+'_'+str(cnt), + "original_id": d['dialogue_id'], + "domains": [service2domain(s) for s in d['services']], + "turns": [] + } + # if d['dialogue_id'] != '84_00008': + # continue + cnt += 1 + prev_sys_frames = [] + prev_user_frames = [] + all_slot_spans_from_da = [] + state = {} + for domain in dialogue['domains']: + state.setdefault(domain, deepcopy(ontology['state'][domain])) + if data_split == 'train': + num_train_utt += len(d['turns']) + for utt_idx, t in enumerate(d['turns']): + speaker = t['speaker'].lower() + turn = { + 'speaker': speaker, + 'utterance': t['utterance'], + 'utt_idx': utt_idx, + 'dialogue_act': { + 'binary': [], + 'categorical': [], + 'non-categorical': [], + }, + } + for i, frame in enumerate(t['frames']): + domain = service2domain(frame['service']) + for action in frame['actions']: + intent = action['act'].lower() + assert intent in ontology['intents'], [intent] + slot = action['slot'].lower() + value_list = action['values'] + if action['act'] in ['REQ_MORE', 'AFFIRM', 
'NEGATE', 'THANK_YOU', 'GOODBYE']: + turn['dialogue_act']['binary'].append({ + "intent": intent, + "domain": '', + "slot": '', + "value": '', + }) + elif action['act'] in ['NOTIFY_SUCCESS', 'NOTIFY_FAILURE', 'REQUEST_ALTS', 'AFFIRM_INTENT', 'NEGATE_INTENT']: + # Slot and values are always empty + turn['dialogue_act']['binary'].append({ + "intent": intent, + "domain": domain, + "slot": '', + "value": '', + }) + elif action['act'] in ['OFFER_INTENT', 'INFORM_INTENT']: + # always has "intent" as the slot, and a single value containing the intent being offered. + assert slot == 'intent' + turn['dialogue_act']['binary'].append({ + "intent": intent, + "domain": domain, + "slot": slot, + "value": value_list[0].lower(), + }) + elif action['act'] in ['REQUEST', 'SELECT'] and not value_list: + # always contains a slot, but values are optional. + # assert slot in ontology['domains'][domain]['slots'] + turn['dialogue_act']['binary'].append({ + "intent": intent, + "domain": domain, + "slot": slot, + "value": '', + }) + elif action['act'] in ['INFORM_COUNT']: + # always has "count" as the slot, and a single element in values for the number of results obtained by the system. 
+ value = value_list[0] + assert slot in ontology['domains'][domain]['slots'] + (start, end), num = pharse_in_sen(value, t['utterance']) + if num: + assert value.lower() == t['utterance'][start:end].lower() \ + or digit2word[value].lower() == t['utterance'][start:end].lower() + turn['dialogue_act']['non-categorical'].append({ + "intent": intent, + "domain": domain, + "slot": slot.lower(), + "value": t['utterance'][start:end].lower(), + "start": start, + "end": end + }) + else: + # have slot & value + if ontology['domains'][domain]['slots'][slot]['is_categorical']: + for value in value_list: + value = value.lower() + if value not in ontology['domains'][domain]['slots'][slot]['possible_values'] and value != 'dontcare': + ontology['domains'][domain]['slots'][slot]['possible_values'].append(value) + print('add value to ontology', domain, slot, value) + assert value in ontology['domains'][domain]['slots'][slot][ + 'possible_values'] or value == 'dontcare' + turn['dialogue_act']['categorical'].append({ + "intent": intent, + "domain": domain, + "slot": slot, + "value": value, + }) + elif slot in numerical_slots: + value = value_list[-1] + (start, end), num = pharse_in_sen(value, t['utterance']) + if num: + assert value.lower() == t['utterance'][start:end].lower() \ + or digit2word[value].lower() == t['utterance'][start:end].lower() + turn['dialogue_act']['non-categorical'].append({ + "intent": intent, + "domain": domain, + "slot": slot.lower(), + "value": t['utterance'][start:end].lower(), + "start": start, + "end": end + }) + else: + # span info in frame['slots'] + for value in value_list: + for slot_info in frame['slots']: + start = slot_info['start'] + end = slot_info['exclusive_end'] + if slot_info['slot'] == slot and t['utterance'][start:end] == value: + turn['dialogue_act']['non-categorical'].append({ + "intent": intent, + "domain": domain, + "slot": slot, + "value": value.lower(), + "start": start, + "end": end + }) + break + # add span da to all_slot_spans_from_da 
+ for ele in turn['dialogue_act']['non-categorical']: + all_slot_spans_from_da.insert(0, { + "domain": ele["domain"], + "slot": ele["slot"], + "value": ele["value"].lower(), + "utt_idx": utt_idx, + "start": ele["start"], + "end": ele["end"] + }) + if speaker == 'user': + # DONE: record state update, may come from sys acts + # prev_state: state. update the state using current frames. + # candidate span info from prev frames and current frames + slot_spans = [] + for frame in t['frames']: + for ele in frame['slots']: + slot, start, end = ele['slot'].lower(), ele['start'], ele['exclusive_end'] + slot_spans.append({ + "domain": service2domain(frame['service']), + "slot": slot, + "value": t['utterance'][start:end].lower(), + "utt_idx": utt_idx, + "start": start, + "end": end + }) + for frame in prev_sys_frames: + for ele in frame['slots']: + slot, start, end = ele['slot'].lower(), ele['start'], ele['exclusive_end'] + slot_spans.append({ + "domain": service2domain(frame['service']), + "slot": slot, + "value": d['turns'][utt_idx-1]['utterance'][start:end].lower(), + "utt_idx": utt_idx-1, + "start": start, + "end": end + }) + # turn['slot_spans'] = slot_spans + # turn['all_slot_span'] = deepcopy(all_slot_spans_from_da) + state_update = {"categorical": [], "non-categorical": []} + # print(utt_idx) + for frame in t['frames']: + domain = service2domain(frame['service']) + # print(domain) + for slot, value_list in frame['state']['slot_values'].items(): + # For categorical slots, this list contains a single value assigned to the slot. + # For non-categorical slots, all the values in this list are spoken variations + # of each other and are equivalent (e.g, "6 pm", "six in the evening", + # "evening at 6" etc.). 
+ numerical_equal_values = [] + if slot in numerical_slots: + for value in value_list: + if value in digit2word: + numerical_equal_values.append(digit2word[value]) + value_list += numerical_equal_values + assert len(value_list) > 0, print(slot, value_list) + assert slot in state[domain] + value_list = list(set([x.lower() for x in value_list])) + if state[domain][slot] in value_list: + continue + # new value + candidate_values = value_list + for prev_user_frame in prev_user_frames: + prev_domain = service2domain(prev_user_frame['service']) + if prev_domain == domain and slot in prev_user_frame['state']['slot_values']: + prev_value_list = [x.lower() for x in prev_user_frame['state']['slot_values'][slot]] + candidate_values = list(set(value_list) - set(prev_value_list)) + assert state[domain][slot] not in candidate_values + assert candidate_values + + if ontology['domains'][domain]['slots'][slot]['is_categorical']: + state_cnt.setdefault('cate_slot_update', 0) + state_cnt['cate_slot_update'] += 1 + value = candidate_values[0] + state_update['categorical'].append( + {"domain": domain, "slot": slot, "value": value} + ) + state[domain][slot] = value + else: + state_cnt.setdefault('non_cate_slot_update', 0) + state_cnt['non_cate_slot_update'] += 1 + span_priority = [] + slot_spans_len = len(slot_spans) + all_slot_spans = slot_spans+all_slot_spans_from_da + for span_idx, slot_span in enumerate(all_slot_spans): + priority = 0 + span_domain = slot_span['domain'] + span_slot = slot_span['slot'] + span_value = slot_span['value'] + if domain == span_domain: + priority += 1 + if slot == span_slot: + priority += 10 + if span_value in candidate_values: + priority += 100 + if span_idx + 1 <= slot_spans_len: + priority += 0.5 + span_priority.append(priority) + if span_idx + 1 <= slot_spans_len: + # slot_spans not run out + if max(span_priority) >= 111.5: + break + else: + # search in previous da + if max(span_priority) >= 111: + break + if span_priority and max(span_priority) >= 
100: + # {111.5: 114255, + # 111: 29591, + # 100: 15208, + # 110: 2159, + # 100.5: 642, + # 110.5: 125, + # 101: 24} + max_priority = max(span_priority) + state_cnt.setdefault('max_priority', Counter()) + state_cnt['max_priority'][max_priority] += 1 + span_idx = np.argmax(span_priority) + ele = all_slot_spans[span_idx] + state_update['non-categorical'].append({ + "domain": domain, + "slot": slot, + "value": ele['value'], + "utt_idx": ele["utt_idx"], + "start": ele["start"], + "end": ele["end"] + }) + state[domain][slot] = ele['value'] + else: + # not found + value = candidate_values[0] + state_update['non-categorical'].append( + {"domain": domain, "slot": slot, "value": value} + ) + state[domain][slot] = value + # print(t['utterance']) + non_cate_slot_update_fail_cnt += 1 + non_cate_slot_update_cnt += 1 + turn['state'] = deepcopy(state) + turn['state_update'] = state_update + prev_user_frames = deepcopy(t['frames']) + else: + prev_sys_frames = deepcopy(t['frames']) + + for da in turn['dialogue_act']['binary']: + if da not in ontology['binary_dialogue_act']: + ontology['binary_dialogue_act'].append(deepcopy(da)) + dialogue['turns'].append(deepcopy(turn)) + assert len(dialogue['turns']) % 2 == 0 + dialogue['turns'].pop() + processed_dialogue.append(dialogue) + # break + # sort ontology binary + pprint(state_cnt) + ontology['binary_dialogue_act'] = sorted(ontology['binary_dialogue_act'], key=lambda x:x['intent']) + json.dump(ontology, open(os.path.join(self_dir, 'ontology.json'), 'w'), indent=2) + json.dump(processed_dialogue, open('data.json', 'w'), indent=2) + write_zipped_json(os.path.join(self_dir, 'data.zip'), 'data.json') + os.remove('data.json') + print('# train dialog: {}, # train utterance: {}'.format(num_train_dialog, num_train_utt)) + print(non_cate_slot_update_fail_cnt, non_cate_slot_update_cnt) # 395 162399 + + else: + # read from file + processed_dialogue = read_zipped_json(os.path.join(self_dir, 'data.zip'), 'data.json') + ontology = 
json.load(open(os.path.join(self_dir, 'ontology.json'))) + return processed_dialogue, ontology + + +if __name__ == '__main__': + preprocess() + print(match) # {'0': 4146, '1': 53626, '>1': 2904} =>(after user act released) {'0': 487, '1': 63886, '>1': 3097} diff --git a/data/unified_datasets/taskmaster/README.md b/data/unified_datasets/taskmaster/README.md new file mode 100644 index 0000000000000000000000000000000000000000..303a82317a50d1aa4f40bd625ad17ccf97da392a --- /dev/null +++ b/data/unified_datasets/taskmaster/README.md @@ -0,0 +1,26 @@ +# README + +## Features + +- Annotations: character-level span for non-categorical slots. No slot descriptions. + +Statistics: + +| | \# dialogues | \# utterances | avg. turns | avg. tokens | \# domains | +| ----- | ------------ | ------------- | ---------- | ----------- | ---------- | +| train | 30483 | 540311 | 17.72 | 9.18 | 13 | + +## Main changes + +- each speaker for one turn +- intent is set to **inform** +- state and state update are not annotated +- span info is provided by original data + +## Original data + +https://github.com/google-research-datasets/Taskmaster + +TM-1: https://github.com/google-research-datasets/Taskmaster/tree/master/TM-1-2019 + +TM-2: https://github.com/google-research-datasets/Taskmaster/tree/master/TM-2-2020 \ No newline at end of file diff --git a/data/unified_datasets/taskmaster/data.zip b/data/unified_datasets/taskmaster/data.zip new file mode 100644 index 0000000000000000000000000000000000000000..f52a3808df8a413962c71f305c64d437eb196e00 Binary files /dev/null and b/data/unified_datasets/taskmaster/data.zip differ diff --git a/data/unified_datasets/taskmaster/ontology.json b/data/unified_datasets/taskmaster/ontology.json new file mode 100644 index 0000000000000000000000000000000000000000..9b5532e2bbcc36039486954e5b7bee1205a33cb5 --- /dev/null +++ b/data/unified_datasets/taskmaster/ontology.json @@ -0,0 +1,1168 @@ +{ + "domains": { + "uber_lyft": { + "description": "order a car for a ride inside
a city", + "slots": { + "location.from": { + "description": "pickup location", + "is_categorical": false, + "possible_values": [], + "count": 5764, + "in original ontology": true + }, + "location.to": { + "description": "destination of the ride", + "is_categorical": false, + "possible_values": [], + "count": 6026, + "in original ontology": true + }, + "type.ride": { + "description": "type of ride", + "is_categorical": false, + "possible_values": [], + "count": 13317, + "in original ontology": true + }, + "num.people": { + "description": "number of people", + "is_categorical": false, + "possible_values": [], + "count": 2636, + "in original ontology": true + }, + "price.estimate": { + "description": "estimated cost of the ride", + "is_categorical": false, + "possible_values": [], + "count": 4996, + "in original ontology": true + }, + "duration.estimate": { + "description": "estimated duration of the ride", + "is_categorical": false, + "possible_values": [], + "count": 1152, + "in original ontology": true + }, + "time.pickup": { + "description": "time of pickup", + "is_categorical": false, + "possible_values": [], + "count": 4303, + "in original ontology": true + }, + "time.dropoff": { + "description": "time of dropoff", + "is_categorical": false, + "possible_values": [], + "count": 111, + "in original ontology": true + } + } + }, + "movie_ticket": { + "description": "book movie tickets for a film", + "slots": { + "name.movie": { + "description": "name of the movie", + "is_categorical": false, + "possible_values": [], + "count": 8959, + "in original ontology": true + }, + "name.theater": { + "description": "name of the theater", + "is_categorical": false, + "possible_values": [], + "count": 6842, + "in original ontology": true + }, + "num.tickets": { + "description": "number of tickets", + "is_categorical": false, + "possible_values": [], + "count": 7368, + "in original ontology": true + }, + "time.start": { + "description": "start time of the movie", + 
"is_categorical": false, + "possible_values": [], + "count": 14820, + "in original ontology": true + }, + "location.theater": { + "description": "location of the theater", + "is_categorical": false, + "possible_values": [], + "count": 5295, + "in original ontology": true + }, + "price.ticket": { + "description": "price of the ticket", + "is_categorical": false, + "possible_values": [], + "count": 2418, + "in original ontology": true + }, + "type.screening": { + "description": "type of the screening", + "is_categorical": false, + "possible_values": [], + "count": 4579, + "in original ontology": true + }, + "time.end": { + "description": "end time of the movie", + "is_categorical": false, + "possible_values": [], + "count": 10, + "in original ontology": true + }, + "time.duration": { + "description": "duration of the movie", + "is_categorical": false, + "possible_values": [], + "count": 250, + "in original ontology": true + } + } + }, + "restaurant_reservation": { + "description": "search for a restaurant and make a reservation", + "slots": { + "name.restaurant": { + "description": "name of the restaurant", + "is_categorical": false, + "possible_values": [], + "count": 14544, + "in original ontology": true + }, + "name.reservation": { + "description": "name of the person who makes the reservation", + "is_categorical": false, + "possible_values": [], + "count": 609, + "in original ontology": true + }, + "num.guests": { + "description": "number of guests", + "is_categorical": false, + "possible_values": [], + "count": 6962, + "in original ontology": true + }, + "time.reservation": { + "description": "time of the reservation", + "is_categorical": false, + "possible_values": [], + "count": 12073, + "in original ontology": true + }, + "type.seating": { + "description": "type of the seating", + "is_categorical": false, + "possible_values": [], + "count": 8983, + "in original ontology": true + }, + "location.restaurant": { + "description": "location of the restaurant", +
"is_categorical": false, + "possible_values": [], + "count": 4311, + "in original ontology": true + } + } + }, + "coffee_ordering": { + "description": "order a coffee drink from either Starbucks or Peets for pick up", + "slots": { + "location.store": { + "description": "location of the coffee store", + "is_categorical": false, + "possible_values": [], + "count": 5510, + "in original ontology": true + }, + "name.drink": { + "description": "name of the drink", + "is_categorical": false, + "possible_values": [], + "count": 9182, + "in original ontology": true + }, + "size.drink": { + "description": "size of the drink", + "is_categorical": false, + "possible_values": [], + "count": 7804, + "in original ontology": true + }, + "num.drink": { + "description": "number of drinks", + "is_categorical": false, + "possible_values": [], + "count": 848, + "in original ontology": true + }, + "type.milk": { + "description": "type of the milk", + "is_categorical": false, + "possible_values": [], + "count": 8433, + "in original ontology": true + }, + "preference": { + "description": "user preference of the drink", + "is_categorical": false, + "possible_values": [], + "count": 11266, + "in original ontology": true + } + } + }, + "pizza_ordering": { + "description": "order a pizza", + "slots": { + "name.store": { + "description": "name of the pizza store", + "is_categorical": false, + "possible_values": [], + "count": 5127, + "in original ontology": true + }, + "name.pizza": { + "description": "name of the pizza", + "is_categorical": false, + "possible_values": [], + "count": 9208, + "in original ontology": true + }, + "size.pizza": { + "description": "size of the pizza", + "is_categorical": false, + "possible_values": [], + "count": 9661, + "in original ontology": true + }, + "type.topping": { + "description": "type of the topping", + "is_categorical": false, + "possible_values": [], + "count": 20639, + "in original ontology": true + }, + "type.crust": { + "description": "type of the 
crust", + "is_categorical": false, + "possible_values": [], + "count": 5099, + "in original ontology": true + }, + "preference": { + "description": "user preference of the pizza", + "is_categorical": false, + "possible_values": [], + "count": 4998, + "in original ontology": true + }, + "location.store": { + "description": "location of the pizza store", + "is_categorical": false, + "possible_values": [], + "count": 1487, + "in original ontology": true + } + } + }, + "auto_repair": { + "description": "set up an auto repair appointment with a repair shop", + "slots": { + "name.store": { + "description": "name of the repair store", + "is_categorical": false, + "possible_values": [], + "count": 4005, + "in original ontology": true + }, + "name.customer": { + "description": "name of the customer", + "is_categorical": false, + "possible_values": [], + "count": 4547, + "in original ontology": true + }, + "date.appt": { + "description": "date of the appointment", + "is_categorical": false, + "possible_values": [], + "count": 7650, + "in original ontology": true + }, + "time.appt": { + "description": "time of the appointment", + "is_categorical": false, + "possible_values": [], + "count": 9827, + "in original ontology": true + }, + "reason.appt": { + "description": "reason of the appointment", + "is_categorical": false, + "possible_values": [], + "count": 6509, + "in original ontology": true + }, + "name.vehicle": { + "description": "name of the vehicle", + "is_categorical": false, + "possible_values": [], + "count": 5262, + "in original ontology": true + }, + "year.vehicle": { + "description": "year of the vehicle", + "is_categorical": false, + "possible_values": [], + "count": 4561, + "in original ontology": true + }, + "location.store": { + "description": "location of the repair store", + "is_categorical": false, + "possible_values": [], + "count": 709, + "in original ontology": true + } + } + }, + "flights": { + "description": "find a round trip or multi-city flights", + 
"slots": { + "type": { + "description": "type of the flight", + "is_categorical": false, + "possible_values": [], + "count": 1999, + "in original ontology": true + }, + "destination1": { + "description": "the first destination city of the trip", + "is_categorical": false, + "possible_values": [], + "count": 3993, + "in original ontology": true + }, + "destination2": { + "description": "the second destination city of the trip", + "is_categorical": false, + "possible_values": [], + "count": 128, + "in original ontology": true + }, + "origin": { + "description": "the origin city of the trip", + "is_categorical": false, + "possible_values": [], + "count": 2595, + "in original ontology": true + }, + "date.depart_origin": { + "description": "date of departure from origin", + "is_categorical": false, + "possible_values": [], + "count": 3177, + "in original ontology": true + }, + "date.depart_intermediate": { + "description": "date of departure from intermediate", + "is_categorical": false, + "possible_values": [], + "count": 48, + "in original ontology": true + }, + "date.return": { + "description": "date of return", + "is_categorical": false, + "possible_values": [], + "count": 2822, + "in original ontology": true + }, + "time_of_day": { + "description": "time of the flight", + "is_categorical": false, + "possible_values": [], + "count": 3840, + "in original ontology": true + }, + "seating_class": { + "description": "seat type (first class, business class, economy class, etc.)", + "is_categorical": false, + "possible_values": [], + "count": 3626, + "in original ontology": true + }, + "seat_location": { + "description": "location of the seat", + "is_categorical": false, + "possible_values": [], + "count": 313, + "in original ontology": true + }, + "stops": { + "description": "non-stop, layovers, etc.", + "is_categorical": false, + "possible_values": [], + "count": 6174, + "in original ontology": true + }, + "price_range": { + "description": "price range of the flight", +
"is_categorical": false, + "possible_values": [], + "count": 2646, + "in original ontology": true + }, + "num.pax": { + "description": "number of people", + "is_categorical": false, + "possible_values": [], + "count": 437, + "in original ontology": true + }, + "luggage": { + "description": "luggage information", + "is_categorical": false, + "possible_values": [], + "count": 17, + "in original ontology": true + }, + "total_fare": { + "description": "total cost of the trip", + "is_categorical": false, + "possible_values": [], + "count": 1238, + "in original ontology": true + }, + "other_description": { + "description": "other description of the flight", + "is_categorical": false, + "possible_values": [], + "count": 2620, + "in original ontology": true + }, + "from": { + "description": "departure of the flight", + "is_categorical": false, + "possible_values": [], + "count": 1293, + "in original ontology": true + }, + "to": { + "description": "destination of the flight", + "is_categorical": false, + "possible_values": [], + "count": 1979, + "in original ontology": true + }, + "airline": { + "description": "airline of the flight", + "is_categorical": false, + "possible_values": [], + "count": 3981, + "in original ontology": true + }, + "flight_number": { + "description": "the number of the flight", + "is_categorical": false, + "possible_values": [], + "count": 42, + "in original ontology": true + }, + "date": { + "description": "date of the flight", + "is_categorical": false, + "possible_values": [], + "count": 756, + "in original ontology": true + }, + "from.time": { + "description": "departure time of the flight", + "is_categorical": false, + "possible_values": [], + "count": 6440, + "in original ontology": true + }, + "to.time": { + "description": "arrival time of the flight", + "is_categorical": false, + "possible_values": [], + "count": 2571, + "in original ontology": true + }, + "stops.location": { + "description": "location of the stop", + "is_categorical": 
false, + "possible_values": [], + "count": 1097, + "in original ontology": true + }, + "fare": { + "description": "cost of the flight", + "is_categorical": false, + "possible_values": [], + "count": 1475, + "in original ontology": true + } + } + }, + "food_order": { + "description": "order take-out for a particular cuisine choice", + "slots": { + "name.item": { + "description": "name of the item", + "is_categorical": false, + "possible_values": [], + "count": 6080, + "in original ontology": true + }, + "other_description.item": { + "description": "other description of the item", + "is_categorical": false, + "possible_values": [], + "count": 1474, + "in original ontology": true + }, + "type.retrieval": { + "description": "type of the retrieval method", + "is_categorical": false, + "possible_values": [], + "count": 1868, + "in original ontology": true + }, + "total_price": { + "description": "total price", + "is_categorical": false, + "possible_values": [], + "count": 5, + "in original ontology": true + }, + "time.pickup": { + "description": "pick up time", + "is_categorical": false, + "possible_values": [], + "count": 981, + "in original ontology": true + }, + "num.people": { + "description": "number of people", + "is_categorical": false, + "possible_values": [], + "count": 880, + "in original ontology": true + }, + "name.restaurant": { + "description": "name of the restaurant", + "is_categorical": false, + "possible_values": [], + "count": 106, + "in original ontology": true + }, + "type.food": { + "description": "type of food", + "is_categorical": false, + "possible_values": [], + "count": 1247, + "in original ontology": true + }, + "type.meal": { + "description": "type of meal", + "is_categorical": false, + "possible_values": [], + "count": 64, + "in original ontology": true + }, + "location.restaurant": { + "description": "location of the restaurant", + "is_categorical": false, + "possible_values": [], + "count": 8, + "in original ontology": true + }, + 
"rating.restaurant": { + "description": "rating of the restaurant", + "is_categorical": false, + "possible_values": [], + "count": 5, + "in original ontology": true + }, + "price_range": { + "description": "price range of the food", + "is_categorical": false, + "possible_values": [], + "count": 5, + "in original ontology": true + } + } + }, + "hotel": { + "description": "find a hotel using typical preferences", + "slots": { + "name.hotel": { + "description": "name of the hotel", + "is_categorical": false, + "possible_values": [], + "count": 5241, + "in original ontology": true + }, + "location.hotel": { + "description": "location of the hotel", + "is_categorical": false, + "possible_values": [], + "count": 2940, + "in original ontology": true + }, + "sub_location.hotel": { + "description": "rough location of the hotel", + "is_categorical": false, + "possible_values": [], + "count": 1869, + "in original ontology": true + }, + "star_rating": { + "description": "star rating of the hotel", + "is_categorical": false, + "possible_values": [], + "count": 2049, + "in original ontology": true + }, + "customer_rating": { + "description": "customer rating of the hotel", + "is_categorical": false, + "possible_values": [], + "count": 1239, + "in original ontology": true + }, + "price_range": { + "description": "price range of the hotel", + "is_categorical": false, + "possible_values": [], + "count": 2357, + "in original ontology": true + }, + "amenity": { + "description": "amenity of the hotel", + "is_categorical": false, + "possible_values": [], + "count": 19030, + "in original ontology": true + }, + "num.beds": { + "description": "number of beds to book", + "is_categorical": false, + "possible_values": [], + "count": 733, + "in original ontology": true + }, + "type.bed": { + "description": "type of the bed", + "is_categorical": false, + "possible_values": [], + "count": 1423, + "in original ontology": true + }, + "num.rooms": { + "description": "number of rooms to book", + 
"is_categorical": false, + "possible_values": [], + "count": 256, + "in original ontology": true + }, + "check-in_date": { + "description": "check-in date", + "is_categorical": false, + "possible_values": [], + "count": 608, + "in original ontology": true + }, + "check-out_date": { + "description": "check-out date", + "is_categorical": false, + "possible_values": [], + "count": 428, + "in original ontology": true + }, + "date_range": { + "description": "date range of the reservation", + "is_categorical": false, + "possible_values": [], + "count": 2458, + "in original ontology": true + }, + "num.guests": { + "description": "number of guests", + "is_categorical": false, + "possible_values": [], + "count": 1323, + "in original ontology": true + }, + "type.room": { + "description": "type of the room", + "is_categorical": false, + "possible_values": [], + "count": 1840, + "in original ontology": true + }, + "price_per_night": { + "description": "price per night", + "is_categorical": false, + "possible_values": [], + "count": 2357, + "in original ontology": true + }, + "total_fare": { + "description": "total fare", + "is_categorical": false, + "possible_values": [], + "count": 28, + "in original ontology": true + }, + "location": { + "description": "location of the hotel", + "is_categorical": false, + "possible_values": [], + "count": 352, + "in original ontology": true + } + } + }, + "movie": { + "description": "find a movie to watch in theaters or using a streaming service at home", + "slots": { + "name.movie": { + "description": "name of the movie", + "is_categorical": false, + "possible_values": [], + "count": 13413, + "in original ontology": true + }, + "genre": { + "description": "genre of the movie", + "is_categorical": false, + "possible_values": [], + "count": 4982, + "in original ontology": true + }, + "name.theater": { + "description": "name of the theater", + "is_categorical": false, + "possible_values": [], + "count": 2371, + "in original ontology": true + 
}, + "location.theater": { + "description": "location of the theater", + "is_categorical": false, + "possible_values": [], + "count": 2894, + "in original ontology": true + }, + "time.start": { + "description": "start time of the movie", + "is_categorical": false, + "possible_values": [], + "count": 6455, + "in original ontology": true + }, + "time.end": { + "description": "end time of the movie", + "is_categorical": false, + "possible_values": [], + "count": 3, + "in original ontology": true + }, + "price.ticket": { + "description": "price of the ticket", + "is_categorical": false, + "possible_values": [], + "count": 989, + "in original ontology": true + }, + "price.streaming": { + "description": "price of the streaming", + "is_categorical": false, + "possible_values": [], + "count": 397, + "in original ontology": true + }, + "type.screening": { + "description": "type of the screening", + "is_categorical": false, + "possible_values": [], + "count": 1419, + "in original ontology": true + }, + "audience_rating": { + "description": "audience rating", + "is_categorical": false, + "possible_values": [], + "count": 1506, + "in original ontology": true + }, + "movie_rating": { + "description": "film rating", + "is_categorical": false, + "possible_values": [], + "count": 273, + "in original ontology": true + }, + "release_date": { + "description": "release date of the movie", + "is_categorical": false, + "possible_values": [], + "count": 386, + "in original ontology": true + }, + "runtime": { + "description": "running time of the movie", + "is_categorical": false, + "possible_values": [], + "count": 262, + "in original ontology": true + }, + "real_person": { + "description": "name of actors, directors, etc.", + "is_categorical": false, + "possible_values": [], + "count": 3406, + "in original ontology": true + }, + "character": { + "description": "name of character in the movie", + "is_categorical": false, + "possible_values": [], + "count": 1676, + "in original ontology": 
true + }, + "streaming_service": { + "description": "streaming service that provide the movie", + "is_categorical": false, + "possible_values": [], + "count": 2729, + "in original ontology": true + }, + "num.tickets": { + "description": "number of tickets", + "is_categorical": false, + "possible_values": [], + "count": 1045, + "in original ontology": true + }, + "seating": { + "description": "type of seating", + "is_categorical": false, + "possible_values": [], + "count": 13, + "in original ontology": true + } + } + }, + "music": { + "description": "find several tracks to play and then comment on each one", + "slots": { + "name.track": { + "description": "name of the track", + "is_categorical": false, + "possible_values": [], + "count": 4916, + "in original ontology": true + }, + "name.artist": { + "description": "name of the artist", + "is_categorical": false, + "possible_values": [], + "count": 9287, + "in original ontology": true + }, + "name.album": { + "description": "name of the album", + "is_categorical": false, + "possible_values": [], + "count": 1106, + "in original ontology": true + }, + "name.genre": { + "description": "music genre", + "is_categorical": false, + "possible_values": [], + "count": 452, + "in original ontology": true + }, + "type.music": { + "description": "rough type of the music", + "is_categorical": false, + "possible_values": [], + "count": 603, + "in original ontology": true + }, + "describes_track": { + "description": "description of a track to find", + "is_categorical": false, + "possible_values": [], + "count": 2969, + "in original ontology": true + }, + "describes_artist": { + "description": "description of a artist to find", + "is_categorical": false, + "possible_values": [], + "count": 612, + "in original ontology": true + }, + "describes_album": { + "description": "description of an album to find", + "is_categorical": false, + "possible_values": [], + "count": 189, + "in original ontology": true + }, + "describes_genre": { + 
"description": "description of a genre to find", + "is_categorical": false, + "possible_values": [], + "count": 26, + "in original ontology": true + }, + "describes_type.music": { + "description": "description of the music type", + "is_categorical": false, + "possible_values": [], + "count": 375, + "in original ontology": true + } + } + }, + "restaurant": { + "description": "ask for recommendations for a particular type of cuisine", + "slots": { + "name.restaurant": { + "description": "name of the restaurant", + "is_categorical": false, + "possible_values": [], + "count": 8676, + "in original ontology": true + }, + "location": { + "description": "location of the restaurant", + "is_categorical": false, + "possible_values": [], + "count": 5165, + "in original ontology": true + }, + "sub-location": { + "description": "rough location of the restaurant", + "is_categorical": false, + "possible_values": [], + "count": 1210, + "in original ontology": true + }, + "type.food": { + "description": "the cuisine of the restaurant", + "is_categorical": false, + "possible_values": [], + "count": 12412, + "in original ontology": true + }, + "menu_item": { + "description": "item in the menu", + "is_categorical": false, + "possible_values": [], + "count": 1499, + "in original ontology": true + }, + "type.meal": { + "description": "type of meal", + "is_categorical": false, + "possible_values": [], + "count": 2677, + "in original ontology": true + }, + "rating": { + "description": "rating of the restaurant", + "is_categorical": false, + "possible_values": [], + "count": 2951, + "in original ontology": true + }, + "price_range": { + "description": "price range of the restaurant", + "is_categorical": false, + "possible_values": [], + "count": 1930, + "in original ontology": true + }, + "business_hours": { + "description": "business hours of the restaurant", + "is_categorical": false, + "possible_values": [], + "count": 2024, + "in original ontology": true + }, + "name.reservation": { + 
"description": "name of the person who make the reservation", + "is_categorical": false, + "possible_values": [], + "count": 16, + "in original ontology": true + }, + "num.guests": { + "description": "number of guests", + "is_categorical": false, + "possible_values": [], + "count": 179, + "in original ontology": true + }, + "time.reservation": { + "description": "time of the reservation", + "is_categorical": false, + "possible_values": [], + "count": 216, + "in original ontology": true + }, + "date.reservation": { + "description": "date of the reservation", + "is_categorical": false, + "possible_values": [], + "count": 130, + "in original ontology": true + }, + "type.seating": { + "description": "type of the seating", + "is_categorical": false, + "possible_values": [], + "count": 11, + "in original ontology": true + } + } + }, + "sport": { + "description": "discuss facts and stats about players, teams, games, etc. in EPL, MLB, MLS, NBA, NFL", + "slots": { + "name.team": { + "description": "name of the team", + "is_categorical": false, + "possible_values": [], + "count": 19651, + "in original ontology": true + }, + "record.team": { + "description": "record of the team (number of wins and losses)", + "is_categorical": false, + "possible_values": [], + "count": 3338, + "in original ontology": true + }, + "record.games_ahead": { + "description": "number of games ahead", + "is_categorical": false, + "possible_values": [], + "count": 33, + "in original ontology": true + }, + "record.games_back": { + "description": "number of games behind", + "is_categorical": false, + "possible_values": [], + "count": 361, + "in original ontology": true + }, + "place.team": { + "description": "ranking of the team", + "is_categorical": false, + "possible_values": [], + "count": 4075, + "in original ontology": true + }, + "result.match": { + "description": "result of the match", + "is_categorical": false, + "possible_values": [], + "count": 3245, + "in original ontology": true + }, + 
"score.match": { + "description": "score of the match", + "is_categorical": false, + "possible_values": [], + "count": 3241, + "in original ontology": true + }, + "date.match": { + "description": "date of the match", + "is_categorical": false, + "possible_values": [], + "count": 2660, + "in original ontology": true + }, + "day.match": { + "description": "day of the match", + "is_categorical": false, + "possible_values": [], + "count": 4743, + "in original ontology": true + }, + "time.match": { + "description": "time of the match", + "is_categorical": false, + "possible_values": [], + "count": 1283, + "in original ontology": true + }, + "name.player": { + "description": "name of the player", + "is_categorical": false, + "possible_values": [], + "count": 2365, + "in original ontology": true + }, + "position.player": { + "description": "position of the player", + "is_categorical": false, + "possible_values": [], + "count": 2746, + "in original ontology": true + }, + "record.player": { + "description": "record of the player", + "is_categorical": false, + "possible_values": [], + "count": 80, + "in original ontology": true + }, + "name.non_player": { + "description": "name of non-palyer such as the manager, coach", + "is_categorical": false, + "possible_values": [], + "count": 843, + "in original ontology": true + }, + "venue": { + "description": "venue of the match take place", + "is_categorical": false, + "possible_values": [], + "count": 328, + "in original ontology": true + } + } + } + }, + "intents": { + "inform": { + "description": "" + } + }, + "binary_dialogue_act": [], + "state": {} +} \ No newline at end of file diff --git a/data/unified_datasets/taskmaster/original_data.zip b/data/unified_datasets/taskmaster/original_data.zip new file mode 100644 index 0000000000000000000000000000000000000000..8a36e95bf829063c7bff98404eb795107baf7b87 Binary files /dev/null and b/data/unified_datasets/taskmaster/original_data.zip differ diff --git 
a/data/unified_datasets/taskmaster/preprocess.py b/data/unified_datasets/taskmaster/preprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..298ac2562432d5cf93dad8f6dec672e626eca249 --- /dev/null +++ b/data/unified_datasets/taskmaster/preprocess.py @@ -0,0 +1,445 @@ +import json +import os +import copy +import zipfile +from tqdm import tqdm +import re +from convlab2.util.file_util import read_zipped_json, write_zipped_json +from pprint import pprint + +descriptions = { + "uber_lyft": { + "uber_lyft": "order a car for a ride inside a city", + "location.from": "pickup location", + "location.to": "destination of the ride", + "type.ride": "type of ride", + "num.people": "number of people", + "price.estimate": "estimated cost of the ride", + "duration.estimate": "estimated duration of the ride", + "time.pickup": "time of pickup", + "time.dropoff": "time of dropoff", + }, + "movie_ticket": { + "movie_ticket": "book movie tickets for a film", + "name.movie": "name of the movie", + "name.theater": "name of the theater", + "num.tickets": "number of tickets", + "time.start": "start time of the movie", + "location.theater": "location of the theater", + "price.ticket": "price of the ticket", + "type.screening": "type of the screening", + "time.end": "end time of the movie", + "time.duration": "duration of the movie", + }, + "restaurant_reservation": { + "restaurant_reservation": "searching for a restaurant and make reservation", + "name.restaurant": "name of the restaurant", + "name.reservation": "name of the person who make the reservation", + "num.guests": "number of guests", + "time.reservation": "time of the reservation", + "type.seating": "type of the seating", + "location.restaurant": "location of the restaurant", + }, + "coffee_ordering": { + "coffee_ordering": "order a coffee drink from either Starbucks or Peets for pick up", + "location.store": "location of the coffee store", + "name.drink": "name of the drink", + "size.drink": "size of the 
drink", + "num.drink": "number of drinks", + "type.milk": "type of the milk", + "preference": "user preference of the drink", + }, + "pizza_ordering": { + "pizza_ordering": "order a pizza", + "name.store": "name of the pizza store", + "name.pizza": "name of the pizza", + "size.pizza": "size of the pizza", + "type.topping": "type of the topping", + "type.crust": "type of the crust", + "preference": "user preference of the pizza", + "location.store": "location of the pizza store", + }, + "auto_repair": { + "auto_repair": "set up an auto repair appointment with a repair shop", + "name.store": "name of the repair store", + "name.customer": "name of the customer", + "date.appt": "date of the appointment", + "time.appt": "time of the appointment", + "reason.appt": "reason of the appointment", + "name.vehicle": "name of the vehicle", + "year.vehicle": "year of the vehicle", + "location.store": "location of the repair store", + }, + "flights": { + "flights": "find a round trip or multi-city flights", + "type": "type of the flight", + "destination1": "the first destination city of the trip", + "destination2": "the second destination city of the trip", + "origin": "the origin city of the trip", + "date.depart_origin": "date of departure from origin", + "date.depart_intermediate": "date of departure from intermediate", + "date.return": "date of return", + "time_of_day": "time of the flight", + "seating_class": "seat type (first class, business class, economy class, etc.", + "seat_location": "location of the seat", + "stops": "non-stop, layovers, etc.", + "price_range": "price range of the flight", + "num.pax": "number of people", + "luggage": "luggage information", + "total_fare": "total cost of the trip", + "other_description": "other description of the flight", + "from": "departure of the flight", + "to": "destination of the flight", + "airline": "airline of the flight", + "flight_number": "the number of the flight", + "date": "date of the flight", + "from.time": "departure 
time of the flight", + "to.time": "arrival time of the flight", + "stops.location": "location of the stop", + "fare": "cost of the flight", + }, + "food_order": { + "food_order": "order take-out for a particular cuisine choice", + "name.item": "name of the item", + "other_description.item": "other description of the item", + "type.retrieval": "type of the retrieval method", + "total_price": "total price", + "time.pickup": "pick up time", + "num.people": "number of people", + "name.restaurant": "name of the restaurant", + "type.food": "type of food", + "type.meal": "type of meal", + "location.restaurant": "location of the restaurant", + "rating.restaurant": "rating of the restaurant", + "price_range": "price range of the food", + }, + "hotel": { + "hotel": "find a hotel using typical preferences", + "name.hotel": "name of the hotel", + "location.hotel": "location of the hotel", + "sub_location.hotel": "rough location of the hotel", + "star_rating": "star rating of the hotel", + "customer_rating": "customer rating of the hotel", + "price_range": "price range of the hotel", + "amenity": "amenity of the hotel", + "num.beds": "number of beds to book", + "type.bed": "type of the bed", + "num.rooms": "number of rooms to book", + "check-in_date": "check-in date", + "check-out_date": "check-out date", + "date_range": "date range of the reservation", + "num.guests": "number of guests", + "type.room": "type of the room", + "price_per_night": "price per night", + "total_fare": "total fare", + "location": "location of the hotel", + }, + "movie": { + "movie": "find a movie to watch in theaters or using a streaming service at home", + "name.movie": "name of the movie", + "genre": "genre of the movie", + "name.theater": "name of the theater", + "location.theater": "location of the theater", + "time.start": "start time of the movie", + "time.end": "end time of the movie", + "price.ticket": "price of the ticket", + "price.streaming": "price of the streaming", + "type.screening": 
"type of the screening", + "audience_rating": "audience rating", + "movie_rating": "film rating", + "release_date": "release date of the movie", + "runtime": "running time of the movie", + "real_person": "name of actors, directors, etc.", + "character": "name of character in the movie", + "streaming_service": "streaming service that provide the movie", + "num.tickets": "number of tickets", + "seating": "type of seating", + }, + "music": { + "music": "find several tracks to play and then comment on each one", + "name.track": "name of the track", + "name.artist": "name of the artist", + "name.album": "name of the album", + "name.genre": "music genre", + "type.music": "rough type of the music", + "describes_track": "description of a track to find", + "describes_artist": "description of a artist to find", + "describes_album": "description of an album to find", + "describes_genre": "description of a genre to find", + "describes_type.music": "description of the music type", + }, + "restaurant": { + "restaurant": "ask for recommendations for a particular type of cuisine", + "name.restaurant": "name of the restaurant", + "location": "location of the restaurant", + "sub-location": "rough location of the restaurant", + "type.food": "the cuisine of the restaurant", + "menu_item": "item in the menu", + "type.meal": "type of meal", + "rating": "rating of the restaurant", + "price_range": "price range of the restaurant", + "business_hours": "business hours of the restaurant", + "name.reservation": "name of the person who make the reservation", + "num.guests": "number of guests", + "time.reservation": "time of the reservation", + "date.reservation": "date of the reservation", + "type.seating": "type of the seating", + }, + "sport": { + "sport": "discuss facts and stats about players, teams, games, etc. 
# Raw Taskmaster domain tokens -> normalized domain names.  The raw tokens are
# what preprocess() extracts from TM-1 instruction ids and TM-2 data file names.
_DOMAIN_NAME_MAP = {
    'auto': 'auto_repair',
    'pizza': 'pizza_ordering',
    'coffee': 'coffee_ordering',
    'uber': 'uber_lyft',
    'restaurant': 'restaurant_reservation',
    'movie': 'movie_ticket',
    'flights': 'flights',
    'food-ordering': 'food_order',
    'hotels': 'hotel',
    'movies': 'movie',
    'music': 'music',
    'restaurant-search': 'restaurant',
    'sports': 'sport',
}


def normalize_domain_name(domain):
    """Return the normalized domain name for a raw Taskmaster domain token.

    Args:
        domain: raw domain token such as ``'auto'`` or ``'restaurant-search'``.

    Returns:
        The normalized domain name (e.g. ``'auto_repair'``).

    Raises:
        AssertionError: if ``domain`` is not a known token.  The error is
            raised explicitly (instead of via ``assert 0``) so that it is not
            stripped when Python runs with ``-O``, which would otherwise make
            the function silently return ``None``.
    """
    normalized = _DOMAIN_NAME_MAP.get(domain) if isinstance(domain, str) else None
    if normalized is None:
        raise AssertionError('unknown domain: {!r}'.format(domain))
    return normalized


def format_turns(ori_turns):
    """Convert raw utterances into alternating user/system turns.

    Rules applied to every raw utterance, in order:

    * the speaker becomes ``'system'`` when the raw value is ``'ASSISTANT'``
      and ``'user'`` otherwise;
    * system utterances seen before any turn has been kept are dropped, so
      the result starts with a user turn;
    * utterances whose text is exactly ``'(deleted)'`` are dropped;
    * an utterance by the same speaker as the last kept turn is merged into
      that turn (texts joined by one space), unless its text is a substring
      of the text of the raw utterance right before it, in which case it is
      skipped as a repeat.  Segments of a merged utterance are appended with
      ``start_index``/``end_index`` shifted to stay valid in the joined text.

    A trailing system turn is removed so the result also ends with the user.

    Args:
        ori_turns: list of dicts with ``'speaker'`` and ``'text'`` and an
            optional ``'segments'`` list of dicts carrying ``'start_index'``
            and ``'end_index'``.

    Returns:
        A new list of turn dicts.  ``ori_turns`` is not modified, so calling
        the function again on the same data gives the same result.
    """
    new_turns = []
    previous_speaker = None
    for i, ori_turn in enumerate(ori_turns):
        speaker = 'system' if ori_turn['speaker'] == 'ASSISTANT' else 'user'
        if not new_turns and speaker == 'system':
            # the dialogue has to start with a user turn
            continue
        if ori_turn['text'] == '(deleted)':
            continue
        if speaker != previous_speaker:
            previous_speaker = speaker
            # copy first, then normalize the speaker on the copy only
            turn = copy.deepcopy(ori_turn)
            turn['speaker'] = speaker
            new_turns.append(turn)
            continue

        # continuous speaking: fold this utterance into the previous turn
        last_turn = new_turns[-1]
        if ori_turn['text'] in ori_turns[i - 1]['text']:
            # repeated (or partially repeated) utterance
            continue
        # +1 accounts for the space inserted between the two texts
        index_shift = len(last_turn['text']) + 1
        last_turn['text'] += ' ' + ori_turn['text']
        if 'segments' in ori_turn:
            merged_segments = last_turn.setdefault('segments', [])
            # shift copies so the caller's segment dicts keep their indexes
            for segment in copy.deepcopy(ori_turn['segments']):
                segment['start_index'] += index_shift
                segment['end_index'] += index_shift
                merged_segments.append(segment)
    if new_turns and new_turns[-1]['speaker'] == 'system':
        new_turns = new_turns[:-1]
    return new_turns


def log_ontology(acts, ontology, ori_ontology):
    """Record the slots and intents used by ``acts`` in ``ontology``.

    For every act the slot's ``count`` is increased (missing domains and
    slots are created with empty descriptions and a count of 1), the slot's
    ``'in original ontology'`` flag is refreshed, and a non-``None`` intent
    is registered under ``ontology['intents']`` if it is new.

    Args:
        acts: list of dicts with ``'intent'``, ``'domain'`` and ``'slot'``.
        ontology: dict with ``'domains'`` and ``'intents'``; updated in place.
        ori_ontology: dict mapping a domain name to a container of the slot
            names of the original ontology.

    Raises:
        KeyError: if an act's domain is missing from ``ori_ontology``.
    """
    for item in acts:
        intent, domain, slot = item['intent'], item['domain'], item['slot']
        domain_entry = ontology['domains'].setdefault(
            domain, {'description': "", 'slots': {}})
        slots = domain_entry['slots']
        if slot not in slots:
            slots[slot] = {
                'description': '',
                'is_categorical': False,
                'possible_values': [],
                'count': 1
            }
        else:
            slots[slot]['count'] += 1
        slots[slot]['in original ontology'] = slot in ori_ontology[domain]
        if intent is not None and intent not in ontology['intents']:
            ontology['intents'][intent] = {
                "description": ''
            }
'TM-1-2019/ontology.json'), + ('TM-2-2020/data/flights.json', 'TM-2-2020/ontology/flights.json'), + ('TM-2-2020/data/food-ordering.json', 'TM-2-2020/ontology/food-ordering.json'), + ('TM-2-2020/data/hotels.json', 'TM-2-2020/ontology/hotels.json'), + ('TM-2-2020/data/movies.json', 'TM-2-2020/ontology/movies.json'), + ('TM-2-2020/data/music.json', 'TM-2-2020/ontology/music.json'), + ('TM-2-2020/data/restaurant-search.json', 'TM-2-2020/ontology/restaurant-search.json'), + ('TM-2-2020/data/sports.json', 'TM-2-2020/ontology/sports.json') + ] + idx_count = 1 + total = 0 + + for filename, ontology_filename in files: + data = json.load(open(os.path.join(new_dir, filename))) + ori_ontology = {} + if 'TM-1' in filename: + for domain, item in json.load(open(os.path.join(new_dir, ontology_filename))).items(): + ori_ontology[item["id"]] = {} + for slot in item["required"] + item["optional"]: + ori_ontology[item["id"]][slot] = 0 + else: + domain = normalize_domain_name(filename.split('/')[-1].split('.')[0]) + ori_ontology[domain] = {} + for _, item in json.load(open(os.path.join(new_dir, ontology_filename))).items(): + for group in item: + for anno in group["annotations"]: + ori_ontology[domain][anno] = 0 + for d in ori_ontology: + if d not in ontology['domains']: + ontology['domains'][d] = {'description': descriptions[d][d], 'slots': {}} + for s in ori_ontology[d]: + if s not in ontology['domains'][d]['slots']: + ontology['domains'][d]['slots'][s] = { + 'description': descriptions[d][s], + 'is_categorical': False, + 'possible_values': [], + 'count': 0, + 'in original ontology': True + } + # pprint(ori_ontology) + for ori_sess in tqdm(data, desc='processing taskmaster-{}'.format(filename)): + total += 1 + turns = format_turns(ori_sess['utterances']) + if not turns: + continue + if 'TM-2' in filename: + dial_domain = normalize_domain_name(filename.split('/')[-1].split('.')[0]) + else: + dial_domain = normalize_domain_name(ori_sess['instruction_id'].split('-', 1)[0]) + dialogue = 
{ + "dataset": "taskmaster", + "data_split": "train", + "dialogue_id": 'taskmaster_' + str(idx_count), + "original_id": ori_sess['conversation_id'], + "instruction_id": ori_sess['instruction_id'], + "domains": [ + dial_domain + ], + "turns": [] + } + idx_count += 1 + assert turns[0]['speaker'] == 'user' and turns[-1]['speaker'] == 'user', print(turns) + for utt_idx, uttr in enumerate(turns): + speaker = uttr['speaker'] + turn = { + 'speaker': speaker, + 'utterance': uttr['text'], + 'utt_idx': utt_idx, + 'dialogue_act': { + 'binary': [], + 'categorical': [], + 'non-categorical': [], + }, + } + if speaker == 'user': + turn['state'] = {} + turn['state_update'] = {'categorical': [], 'non-categorical': []} + + if 'segments' in uttr: + for segment in uttr['segments']: + for item in segment['annotations']: + # domain = item['name'].split('.', 1)[0] + domain = dial_domain + + # if domain != item['name'].split('.', 1)[0]: + # print(domain, item['name'].split('.', 1), dialogue["original_id"]) + # assert domain in item['name'].split('.', 1)[0] + + # if item['name'].split('.', 1)[0] != domain: + # print(domain, item['name'].split('.', 1), dialogue["original_id"]) + slot = item['name'].split('.', 1)[-1] + if slot.endswith('.accept') or slot.endswith('.reject'): + slot = slot[:-7] + if slot not in ori_ontology[domain]: + # print(domain, item['name'].split('.', 1), dialogue["original_id"]) + continue + # if domain in ori_ontology: + # ori_ontology[domain][slot] += 1 + # else: + # print(domain, item['name'].split('.', 1), dialogue["original_id"]) + # assert domain in ori_ontology, print(domain, item['name'].split('.', 1), dialogue["original_id"]) + + if not segment['text']: + print(slot) + print(segment) + print() + assert turn['utterance'][segment['start_index']:segment['end_index']] == segment['text'] + turn['dialogue_act']['non-categorical'].append({ + 'intent': 'inform', + 'domain': domain, + 'slot': slot, + 'value': segment['text'].lower(), + 'start': segment['start_index'], 
+ 'end': segment['end_index'] + }) + log_ontology(turn['dialogue_act']['non-categorical'], ontology, ori_ontology) + dialogue['turns'].append(turn) + processed_dialogue.append(dialogue) + # pprint(ori_ontology) + # save ontology json + json.dump(ontology, open(os.path.join(self_dir, 'ontology.json'), 'w'), indent=2) + json.dump(processed_dialogue, open('data.json', 'w'), indent=2) + write_zipped_json(os.path.join(self_dir, 'data.zip'), 'data.json') + os.remove('data.json') + else: + # read from file + processed_dialogue = read_zipped_json(os.path.join(self_dir, 'data.zip'), 'data.json') + ontology = json.load(open(os.path.join(self_dir, 'ontology.json'))) + return processed_dialogue, ontology + +if __name__ == '__main__': + preprocess() diff --git a/data/unified_datasets/woz/README.md b/data/unified_datasets/woz/README.md new file mode 100644 index 0000000000000000000000000000000000000000..9b883e678a20fba4da78f37545acedb50945c767 --- /dev/null +++ b/data/unified_datasets/woz/README.md @@ -0,0 +1,31 @@ +# README + +## Features + +- Annotations: dialogue act, character-level span for non-categorical slots. + +Statistics: + +| | \# dialogues | \# utterances | avg. turns | avg. tokens | \# domains | +| ----- | ------------ | ------------- | ---------- | ----------- | ---------- | +| train | 406 | 2936 | 7.23 | 11.36 | 1 | +| dev | 135 | 941 | 6.97 | 11.99 | 1 | +| test | 135 | 935 | 6.93 | 11.87 | 1 | + + +## Main changes + +- domain is set to **restaurant** +- make some rule-based fixes on categorical values to make them in `possible value` lists +- `belief_states` in WOZ dataset contains `request` intents, which are ignored in processing +- some state annotations are not consistent with dialogue_act annotations. for example in `woz_train_en.json`, first dialog, 2nd turn: + + `user: "How about Chinese food?"` + + `chinese food` is included in `dialogue_act` annotation as an `inform` intent, but not updated in `belief_state` annotation. 
+ + + +## Original data + +https://github.com/nmrksic/neural-belief-tracker/tree/master/data/woz \ No newline at end of file diff --git a/data/unified_datasets/woz/data.zip b/data/unified_datasets/woz/data.zip new file mode 100644 index 0000000000000000000000000000000000000000..14b3cc709f5d6c3c8361542a1ccdfef7696a436c Binary files /dev/null and b/data/unified_datasets/woz/data.zip differ diff --git a/data/unified_datasets/woz/ontology.json b/data/unified_datasets/woz/ontology.json new file mode 100644 index 0000000000000000000000000000000000000000..8f863f679941e2bdf4347a4d6c992a4881b8ef60 --- /dev/null +++ b/data/unified_datasets/woz/ontology.json @@ -0,0 +1,117 @@ +{ + "domains": { + "restaurant": { + "description": "search for a restaurant to dine", + "slots": { + "food": { + "is_categorical": false, + "possible_values": [], + "description": "food type of the restaurant" + }, + "area": { + "is_categorical": true, + "possible_values": [ + "east", + "west", + "center", + "north", + "south" + ], + "description": "area of the restaurant" + }, + "postcode": { + "is_categorical": false, + "possible_values": [], + "description": "postal code of the restaurant" + }, + "phone": { + "is_categorical": false, + "possible_values": [], + "description": "phone number of the restaurant" + }, + "address": { + "is_categorical": false, + "possible_values": [], + "description": "address of the restaurant" + }, + "price range": { + "is_categorical": true, + "possible_values": [ + "expensive", + "moderate", + "cheap" + ], + "description": "price range of the restaurant" + }, + "name": { + "is_categorical": false, + "possible_values": [], + "description": "name of the restaurant" + } + } + } + }, + "intents": { + "inform": { + "description": "system informs user the value of a slot" + }, + "request": { + "description": "system asks the user to provide value of a slot" + } + }, + "binary_dialogue_act": [ + { + "intent": "request", + "domain": "restaurant", + "slot": "postcode", + 
# Directory containing this script; all inputs and outputs are resolved against it.
self_dir = os.path.dirname(os.path.abspath(__file__))

# Slots whose values are drawn from a closed set. Every other slot is treated
# as non-categorical (its value is located as a character span in the utterance).
cat_slots = ['price range', 'area']
# NOTE(review): not referenced by any function in this module; kept as-is.
cat_slot_values = {
    'area': [
        "east",
        "west",
        "center",
        "north",
        "south"
    ],
    'price range': [
        "expensive",
        "moderate",
        "dontcare",
        "cheap"
    ]
}

# Human-readable descriptions copied into the generated ontology.
# Under 'restaurant', the 'domain' key describes the domain itself; every
# other key is a slot name.
woz_desc = {
    'restaurant': {
        'domain': 'search for a restaurant to dine',
        'food': 'food type of the restaurant',
        'area': 'area of the restaurant',
        'postcode': 'postal code of the restaurant',
        'phone': 'phone number of the restaurant',
        'address': 'address of the restaurant',
        'price range': 'price range of the restaurant',
        'name': 'name of the restaurant'
    },
    'intents': {
        'inform': 'system informs user the value of a slot',
        'request': 'system asks the user to provide value of a slot',
    }
}


def _normalize_value(slot, value):
    """Map noisy annotations of a categorical slot onto its canonical value.

    Values of non-categorical slots are returned untouched: they must stay
    identical to the text in the utterance so their span can be located, and
    substring rules such as ``'east' in value`` would otherwise rewrite e.g.
    the food type "eastern european" to "east".
    """
    if slot not in cat_slots:
        return value
    if 'expensive' in value:
        return 'expensive'
    if value == 'centre':
        return 'center'
    if 'east' in value:
        return 'east'
    return value


def convert_da(da, utt, all_binary):
    """Convert the user acts of one turn into the unified dialogue-act format.

    Args:
        da: iterable of ``(slot, value)`` pairs. A pair whose first element
            is ``'request'`` is a request act and its second element is the
            requested slot name; any other pair is an inform act.
        utt: the user utterance, used to locate spans of non-categorical values.
        all_binary: list accumulating every distinct binary (request) act seen
            so far. It is extended in place.

    Returns:
        dict with the keys ``'binary'``, ``'categorical'`` and
        ``'non-categorical'``, each a list of act dicts. Non-categorical acts
        carry ``'start'``/``'end'`` character offsets only when the value
        occurs verbatim in ``utt``.
    """
    converted = {
        'binary': [],
        'categorical': [],
        'non-categorical': []
    }

    for slot, value in da:
        if slot == 'request':
            act = {
                'intent': 'request',
                'domain': 'restaurant',
                'slot': value,
                'value': '',
            }
            converted['binary'].append(act)
            if act not in all_binary:
                all_binary.append(act)
            continue

        value = _normalize_value(slot, value)
        act = {
            'intent': 'inform',
            'domain': 'restaurant',
            'slot': slot,
            'value': value
        }
        if slot in cat_slots:
            converted['categorical'].append(act)
            continue

        start = utt.find(value)
        if start != -1:
            act['start'] = start
            act['end'] = start + len(value)
        converted['non-categorical'].append(act)

    return converted


def convert_state(state):
    """Convert an original belief-state annotation into the unified state dict.

    Args:
        state: list of dicts with an ``'act'`` key (``'inform'`` or
            ``'request'``) and a ``'slots'`` list of ``(slot, value)`` pairs.

    Returns:
        ``{'restaurant': {slot: value}}`` containing every slot described in
        ``woz_desc`` (empty string when unset). Only ``inform`` entries
        contribute values; ``request`` entries are ignored.
    """
    slots = {s: '' for s in woz_desc['restaurant'] if s != 'domain'}
    for entry in state:
        assert entry['act'] in ['request', 'inform']
        if entry['act'] != 'inform':
            continue
        for slot, value in entry['slots']:
            slots[slot] = _normalize_value(slot, value)
    return {'restaurant': slots}


def get_state_update(prev_state, cur_state, usr_da, turn_idx, dialog_idx):
    """Compute which slots changed between two consecutive states.

    Args:
        prev_state: state after the previous user turn, ``{'restaurant': {...}}``.
        cur_state: state after the current user turn, same shape.
        usr_da: converted user acts of the current turn (output of
            ``convert_da``); used to attach span information.
        turn_idx: index of the turn in the original dialogue. The user
            utterance index is reported as ``turn_idx * 2``.
        dialog_idx: unused; kept for interface compatibility.

    Returns:
        dict with ``'categorical'`` and ``'non-categorical'`` lists holding one
        entry per slot of ``prev_state`` whose value differs in ``cur_state``.
        A non-categorical entry gets ``'utt_idx'``/``'start'``/``'end'`` when a
        matching user act with a located span exists.
    """
    update = {
        'categorical': [],
        'non-categorical': []
    }
    cur_slots = cur_state['restaurant']

    for slot, prev_value in prev_state['restaurant'].items():
        # A slot missing from the current state is reported as cleared
        # instead of raising KeyError.
        cur_value = cur_slots.get(slot, '')
        if slot in cur_slots and cur_value == prev_value:
            continue

        if slot in cat_slots:
            update['categorical'].append({
                'domain': 'restaurant',
                'slot': slot,
                'value': cur_value
            })
            continue

        found = False
        for act in usr_da['non-categorical']:
            if act['slot'] != slot or act['value'] != cur_value:
                continue
            found = True
            entry = {
                'domain': 'restaurant',
                'slot': slot,
                'value': cur_value
            }
            if 'start' in act:
                entry.update({
                    'utt_idx': turn_idx * 2,
                    'start': act['start'],
                    'end': act['end']
                })
            update['non-categorical'].append(entry)

        if not found:
            # The state changed without a matching user act in this turn;
            # record the new value without span information.
            update['non-categorical'].append({
                'domain': 'restaurant',
                'slot': slot,
                'value': cur_value
            })

    return update


def _record_slot_values(usr_da, all_slot_value):
    """Register the slots (and informed values) of one turn in ``all_slot_value``."""
    for slot, value in usr_da:
        if slot == 'request':
            # For request acts the "value" is the requested slot name.
            if value != 'dontcare':
                all_slot_value.setdefault(value, [])
        elif value != 'dontcare':
            values = all_slot_value.setdefault(slot, [])
            if value not in values:
                values.append(value)


def _convert_dialogue(dialogue, split, dialog_id, all_binary_intents, all_slot_value):
    """Convert one original dialogue into the unified format."""
    split_name = 'val' if split == 'validate' else split
    ret = {
        'dataset': "woz",
        'data_split': split_name,
        'dialogue_id': 'woz_' + str(dialog_id),
        'original_id': split_name + str(dialogue['dialogue_idx']),
        'domains': ['restaurant'],
        'turns': [],
    }

    prev_state = {'restaurant': {k: '' for k in woz_desc['restaurant'] if k != 'domain'}}
    for i, turn in enumerate(dialogue['dialogue']):
        sys_utt = turn['system_transcript'].lower()
        usr_utt = turn['transcript'].lower()
        usr_da = turn['turn_label']

        _record_slot_values(usr_da, all_slot_value)

        # The system utterance of the first original turn is dropped so that
        # every dialogue starts with the user.
        if i != 0:
            ret['turns'].append({
                'utt_idx': len(ret['turns']),
                'speaker': 'system',
                'utterance': sys_utt,
                'dialogue_act': {'binary': [], 'categorical': [], 'non-categorical': []},
            })

        cur_state = convert_state(turn['belief_state'])
        cur_usr_da = convert_da(usr_da, usr_utt, all_binary_intents)

        ret['turns'].append({
            'utt_idx': len(ret['turns']),
            'speaker': 'user',
            'utterance': usr_utt,
            'state': cur_state,
            'dialogue_act': cur_usr_da,
            'state_update': get_state_update(prev_state, cur_state, cur_usr_da, i, ret['dialogue_id'])
        })

        prev_state = copy.deepcopy(cur_state)

    return ret


def _build_ontology(all_binary_intents, all_slot_value):
    """Assemble the ontology dict from the acts and slot values seen in the data."""
    slots = {}
    for slot, values in all_slot_value.items():
        is_categorical = slot in cat_slots
        possible_values = []
        if is_categorical:
            for value in values:
                value = _normalize_value(slot, value)
                if value not in possible_values:
                    possible_values.append(value)
        slots[slot] = {
            'is_categorical': is_categorical,
            'possible_values': possible_values,
            'description': woz_desc['restaurant'][slot]
        }

    return {
        'domains': {
            'restaurant': {
                'description': woz_desc['restaurant']['domain'],
                'slots': slots
            }
        },
        'intents': {
            'inform': {
                'description': woz_desc['intents']['inform'],
            },
            'request': {
                'description': woz_desc['intents']['request'],
            },
        },
        'binary_dialogue_act': list(all_binary_intents),
        'state': {
            'restaurant': {k: '' for k in all_slot_value}
        }
    }


def preprocess():
    """Build (or load) the unified WOZ dialogues and ontology.

    If both ``data.zip`` and ``ontology.json`` already exist next to this
    script they are loaded and returned. Otherwise ``original_data.zip`` is
    extracted, every split is converted, and both files are (re)written.

    Returns:
        tuple ``(all_dialogues, ontology)``.

    Raises:
        FileNotFoundError: if processing is needed but ``original_data.zip``
            is missing.
    """
    data_zip_path = os.path.join(self_dir, 'data.zip')
    ontology_path = os.path.join(self_dir, 'ontology.json')

    # Check for cached outputs where they are actually written (self_dir),
    # not relative to the current working directory.
    if os.path.exists(data_zip_path) and os.path.exists(ontology_path):
        all_dialogues = read_zipped_json(data_zip_path, 'data.json')
        with open(ontology_path, encoding='utf-8') as f:
            new_ont = json.load(f)
        return all_dialogues, new_ont

    original_zip = os.path.join(self_dir, 'original_data.zip')
    if not os.path.exists(original_zip):
        raise FileNotFoundError(original_zip)

    extract_dir = os.path.join(self_dir, 'original_data')
    logging.info('unzip woz data to {}'.format(extract_dir))
    with zipfile.ZipFile(original_zip, 'r') as archive:
        archive.extractall(extract_dir)

    all_dialogues = []
    all_binary_intents = []
    all_slot_value = {}
    dialog_id = 1
    for split in ['train', 'validate', 'test']:
        # The split files are read from an 'original_data' folder inside
        # the extraction directory.
        split_file = os.path.join(extract_dir, 'original_data', 'woz_{}_en.json'.format(split))
        with open(split_file, encoding='utf-8') as f:
            data = json.load(f)

        for dialogue in data:
            all_dialogues.append(
                _convert_dialogue(dialogue, split, dialog_id, all_binary_intents, all_slot_value))
            dialog_id += 1

    # The JSON is written under the relative name 'data.json' because the load
    # path above looks the archive member up by that name.
    # NOTE(review): presumably write_zipped_json stores the member under the
    # path it is given - confirm against convlab2.util.file_util.
    save_file = 'data.json'
    with open(save_file, 'w', encoding='utf-8') as f:
        json.dump(all_dialogues, f, indent=4)
    try:
        write_zipped_json(data_zip_path, save_file)
    finally:
        os.remove(save_file)

    new_ont = _build_ontology(all_binary_intents, all_slot_value)
    with open(ontology_path, 'w', encoding='utf-8') as f:
        json.dump(new_ont, f, indent=4)

    return all_dialogues, new_ont


if __name__ == '__main__':
    preprocess()