diff --git a/data/unified_datasets/multiwoz21/original_data.zip b/data/unified_datasets/multiwoz21/MultiWOZ_2.1.zip similarity index 66% rename from data/unified_datasets/multiwoz21/original_data.zip rename to data/unified_datasets/multiwoz21/MultiWOZ_2.1.zip index b27361772be980013a94c91898fb61e3e122ba8e..c30d869f77e86fe2bf19245120e771f28cb32bfa 100644 Binary files a/data/unified_datasets/multiwoz21/original_data.zip and b/data/unified_datasets/multiwoz21/MultiWOZ_2.1.zip differ diff --git a/data/unified_datasets/multiwoz21/data.zip b/data/unified_datasets/multiwoz21/data.zip index 7f60019dfc4a68728474baf9e8d3d73305b46082..51ac6b5f326b2c9c6d078c433c89b9e0010301cd 100644 Binary files a/data/unified_datasets/multiwoz21/data.zip and b/data/unified_datasets/multiwoz21/data.zip differ diff --git a/data/unified_datasets/multiwoz21/dummy_data.json b/data/unified_datasets/multiwoz21/dummy_data.json new file mode 100644 index 0000000000000000000000000000000000000000..9945b92915b0f9739d8b31b3fd112f8ab86f2cb3 --- /dev/null +++ b/data/unified_datasets/multiwoz21/dummy_data.json @@ -0,0 +1,5896 @@ +[ + { + "dataset": "multiwoz21", + "data_split": "train", + "dialogue_id": "multiwoz21-train-0", + "original_id": "SNG01856.json", + "domains": [ + "hotel", + "booking", + "general" + ], + "goal": { + "description": "You are looking for a <span class='emphasis'>place to stay</span>. The hotel should be in the <span class='emphasis'>cheap</span> price range and should be in the type of <span class='emphasis'>hotel</span>. The hotel should <span class='emphasis'>include free parking</span> and should <span class='emphasis'>include free wifi</span>. Once you find the <span class='emphasis'>hotel</span> you want to book it for <span class='emphasis'>6 people</span> and <span class='emphasis'>3 nights</span> starting from <span class='emphasis'>tuesday</span>. If the booking fails how about <span class='emphasis'>2 nights</span>. Make sure you get the <span class='emphasis'>reference number</span>", + "inform": { + "hotel": { + "type": "hotel", + "parking": "yes", + "price range": "cheap", + "internet": "yes", + "book stay": "3|2", + "book day": "tuesday", + "book people": "6" + } + }, + "request": { + "hotel": {} + } + }, + "turns": [ + { + "speaker": "user", + "utterance": "am looking for a place to to stay that has cheap price range it should be in a type of hotel", + "utt_idx": 0, + "dialogue_acts": { + "categorical": [ + { + "intent": "inform", + "domain": "hotel", + "slot": "price range", + "value": "cheap" + } + ], + "non-categorical": [ + { + "intent": "inform", + "domain": "hotel", + "slot": "type", + "value": "hotel", + "start": 87, + "end": 92 + } + ], + "binary": [] + }, + "state": { + "attraction": { + "type": "", + "name": "", + "area": "" + }, + "hotel": { + "name": "", + "area": "", + "parking": "", + "price range": "cheap", + "stars": "", + "internet": "", + "type": "hotel", + "book stay": "", + "book day": "", + "book people": "" + }, + "restaurant": { + "food": "", + "price range": "", + "name": "", + "area": "", + "book time": "", + "book day": "", + "book people": "" + }, + "taxi": { + "leave at": "", + "destination": "", + "departure": "", + "arrive by": "" + }, + "train": { + "leave at": "", + "destination": "", + "day": "", + "arrive by": "", + "departure": "", + "book people": "" + } + } + }, + { + "speaker": "system", + "utterance": "Okay, do you have a specific area you want to stay in?", + "utt_idx": 1, + "dialogue_acts": { + "categorical": [], + "non-categorical": [], + "binary": [ + { + "intent": "request", + "domain": "hotel", + "slot": "area", + "value": "" + } + ] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "no, i just need to make sure it's cheap. oh, and i need parking", + "utt_idx": 2, + "dialogue_acts": { + "categorical": [ + { + "intent": "inform", + "domain": "hotel", + "slot": "parking", + "value": "yes" + } + ], + "non-categorical": [], + "binary": [] + }, + "state": { + "attraction": { + "type": "", + "name": "", + "area": "" + }, + "hotel": { + "name": "", + "area": "", + "parking": "yes", + "price range": "cheap", + "stars": "", + "internet": "", + "type": "hotel", + "book stay": "", + "book day": "", + "book people": "" + }, + "restaurant": { + "food": "", + "price range": "", + "name": "", + "area": "", + "book time": "", + "book day": "", + "book people": "" + }, + "taxi": { + "leave at": "", + "destination": "", + "departure": "", + "arrive by": "" + }, + "train": { + "leave at": "", + "destination": "", + "day": "", + "arrive by": "", + "departure": "", + "book people": "" + } + } + }, + { + "speaker": "system", + "utterance": "I found 1 cheap hotel for you that includes parking. Do you like me to book it?", + "utt_idx": 3, + "dialogue_acts": { + "categorical": [ + { + "intent": "inform", + "domain": "hotel", + "slot": "price range", + "value": "cheap" + } + ], + "non-categorical": [ + { + "intent": "inform", + "domain": "hotel", + "slot": "choice", + "value": "1", + "start": 8, + "end": 9 + } + ], + "binary": [ + { + "intent": "inform", + "domain": "booking", + "slot": "", + "value": "" + }, + { + "intent": "inform", + "domain": "hotel", + "slot": "parking", + "value": "" + } + ] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "Yes, please. 6 people 3 nights starting on tuesday.", + "utt_idx": 4, + "dialogue_acts": { + "categorical": [ + { + "intent": "inform", + "domain": "hotel", + "slot": "book day", + "value": "tuesday" + } + ], + "non-categorical": [ + { + "intent": "inform", + "domain": "hotel", + "slot": "book stay", + "value": "3", + "start": 22, + "end": 23 + }, + { + "intent": "inform", + "domain": "hotel", + "slot": "book people", + "value": "6", + "start": 13, + "end": 14 + } + ], + "binary": [] + }, + "state": { + "attraction": { + "type": "", + "name": "", + "area": "" + }, + "hotel": { + "name": "", + "area": "", + "parking": "yes", + "price range": "cheap", + "stars": "", + "internet": "", + "type": "hotel", + "book stay": "3", + "book day": "tuesday", + "book people": "6" + }, + "restaurant": { + "food": "", + "price range": "", + "name": "", + "area": "", + "book time": "", + "book day": "", + "book people": "" + }, + "taxi": { + "leave at": "", + "destination": "", + "departure": "", + "arrive by": "" + }, + "train": { + "leave at": "", + "destination": "", + "day": "", + "arrive by": "", + "departure": "", + "book people": "" + } + } + }, + { + "speaker": "system", + "utterance": "I am sorry but I wasn't able to book that for you for Tuesday. Is there another day you would like to stay or perhaps a shorter stay?", + "utt_idx": 5, + "dialogue_acts": { + "categorical": [ + { + "intent": "nobook", + "domain": "booking", + "slot": "day", + "value": "Tuesday" + } + ], + "non-categorical": [], + "binary": [ + { + "intent": "request", + "domain": "booking", + "slot": "book stay", + "value": "" + }, + { + "intent": "request", + "domain": "booking", + "slot": "day", + "value": "" + } + ] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "how about only 2 nights.", + "utt_idx": 6, + "dialogue_acts": { + "categorical": [], + "non-categorical": [ + { + "intent": "inform", + "domain": "hotel", + "slot": "book stay", + "value": "2", + "start": 15, + "end": 16 + } + ], + "binary": [] + }, + "state": { + "attraction": { + "type": "", + "name": "", + "area": "" + }, + "hotel": { + "name": "", + "area": "", + "parking": "yes", + "price range": "cheap", + "stars": "", + "internet": "", + "type": "hotel", + "book stay": "2", + "book day": "tuesday", + "book people": "6" + }, + "restaurant": { + "food": "", + "price range": "", + "name": "", + "area": "", + "book time": "", + "book day": "", + "book people": "" + }, + "taxi": { + "leave at": "", + "destination": "", + "departure": "", + "arrive by": "" + }, + "train": { + "leave at": "", + "destination": "", + "day": "", + "arrive by": "", + "departure": "", + "book people": "" + } + } + }, + { + "speaker": "system", + "utterance": "Booking was successful.\nReference number is : 7GAWK763. Anything else I can do for you?", + "utt_idx": 7, + "dialogue_acts": { + "categorical": [], + "non-categorical": [ + { + "intent": "book", + "domain": "booking", + "slot": "ref", + "value": "7GAWK763", + "start": 46, + "end": 54 + } + ], + "binary": [ + { + "intent": "reqmore", + "domain": "general", + "slot": "", + "value": "" + } + ] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "No, that will be all. Good bye.", + "utt_idx": 8, + "dialogue_acts": { + "categorical": [], + "non-categorical": [], + "binary": [ + { + "intent": "bye", + "domain": "general", + "slot": "", + "value": "" + } + ] + }, + "state": { + "attraction": { + "type": "", + "name": "", + "area": "" + }, + "hotel": { + "name": "", + "area": "", + "parking": "yes", + "price range": "cheap", + "stars": "", + "internet": "", + "type": "hotel", + "book stay": "2", + "book day": "tuesday", + "book people": "6" + }, + "restaurant": { + "food": "", + "price range": "", + "name": "", + "area": "", + "book time": "", + "book day": "", + "book people": "" + }, + "taxi": { + "leave at": "", + "destination": "", + "departure": "", + "arrive by": "" + }, + "train": { + "leave at": "", + "destination": "", + "day": "", + "arrive by": "", + "departure": "", + "book people": "" + } + } + }, + { + "speaker": "system", + "utterance": "Thank you for using our services.", + "utt_idx": 9, + "dialogue_acts": { + "categorical": [], + "non-categorical": [], + "binary": [ + { + "intent": "bye", + "domain": "general", + "slot": "", + "value": "" + } + ] + }, + "db_results": {} + } + ] + }, + { + "dataset": "multiwoz21", + "data_split": "train", + "dialogue_id": "multiwoz21-train-1", + "original_id": "SNG0129.json", + "domains": [ + "police", + "general" + ], + "goal": { + "description": "You were <span class='emphasis'>robbed</span> and are looking for help. Make sure you get <span class='emphasis'>postcode</span>, <span class='emphasis'>address</span>, and <span class='emphasis'>phone number</span>", + "inform": { + "police": {} + }, + "request": { + "police": { + "postcode": "", + "address": "", + "phone": "" + } + } + }, + "turns": [ + { + "speaker": "user", + "utterance": "Hello, I have been robbed. Can you please help me get in touch with the police?", + "utt_idx": 0, + "dialogue_acts": { + "categorical": [], + "non-categorical": [], + "binary": [ + { + "intent": "inform", + "domain": "police", + "slot": "", + "value": "" + } + ] + }, + "state": { + "attraction": { + "type": "", + "name": "", + "area": "" + }, + "hotel": { + "name": "", + "area": "", + "parking": "", + "price range": "", + "stars": "", + "internet": "", + "type": "", + "book stay": "", + "book day": "", + "book people": "" + }, + "restaurant": { + "food": "", + "price range": "", + "name": "", + "area": "", + "book time": "", + "book day": "", + "book people": "" + }, + "taxi": { + "leave at": "", + "destination": "", + "departure": "", + "arrive by": "" + }, + "train": { + "leave at": "", + "destination": "", + "day": "", + "arrive by": "", + "departure": "", + "book people": "" + } + } + }, + { + "speaker": "system", + "utterance": "Parkside Police Station is in Parkside, Cambridge. Their number is 01223358966. Anything else I can do for you?", + "utt_idx": 1, + "dialogue_acts": { + "categorical": [], + "non-categorical": [ + { + "intent": "inform", + "domain": "police", + "slot": "name", + "value": "Parkside Police Station", + "start": 0, + "end": 23 + }, + { + "intent": "inform", + "domain": "police", + "slot": "phone", + "value": "01223358966", + "start": 67, + "end": 78 + }, + { + "intent": "inform", + "domain": "police", + "slot": "address", + "value": "Parkside, Cambridge", + "start": 30, + "end": 49 + } + ], + "binary": [] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "Can I please have the postcode as well?", + "utt_idx": 2, + "dialogue_acts": { + "categorical": [], + "non-categorical": [], + "binary": [ + { + "intent": "request", + "domain": "police", + "slot": "postcode", + "value": "" + } + ] + }, + "state": { + "attraction": { + "type": "", + "name": "", + "area": "" + }, + "hotel": { + "name": "", + "area": "", + "parking": "", + "price range": "", + "stars": "", + "internet": "", + "type": "", + "book stay": "", + "book day": "", + "book people": "" + }, + "restaurant": { + "food": "", + "price range": "", + "name": "", + "area": "", + "book time": "", + "book day": "", + "book people": "" + }, + "taxi": { + "leave at": "", + "destination": "", + "departure": "", + "arrive by": "" + }, + "train": { + "leave at": "", + "destination": "", + "day": "", + "arrive by": "", + "departure": "", + "book people": "" + } + } + }, + { + "speaker": "system", + "utterance": "The postcode for the Parkside Police Station is CB11JG. Can I help you with anything else?", + "utt_idx": 3, + "dialogue_acts": { + "categorical": [], + "non-categorical": [ + { + "intent": "inform", + "domain": "police", + "slot": "name", + "value": "Parkside Police Station", + "start": 21, + "end": 44 + }, + { + "intent": "inform", + "domain": "police", + "slot": "postcode", + "value": "CB11JG", + "start": 48, + "end": 54 + } + ], + "binary": [] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "Was Parkside the address of the police station? If not, can I have the address please?", + "utt_idx": 4, + "dialogue_acts": { + "categorical": [], + "non-categorical": [], + "binary": [ + { + "intent": "request", + "domain": "police", + "slot": "address", + "value": "" + } + ] + }, + "state": { + "attraction": { + "type": "", + "name": "", + "area": "" + }, + "hotel": { + "name": "", + "area": "", + "parking": "", + "price range": "", + "stars": "", + "internet": "", + "type": "", + "book stay": "", + "book day": "", + "book people": "" + }, + "restaurant": { + "food": "", + "price range": "", + "name": "", + "area": "", + "book time": "", + "book day": "", + "book people": "" + }, + "taxi": { + "leave at": "", + "destination": "", + "departure": "", + "arrive by": "" + }, + "train": { + "leave at": "", + "destination": "", + "day": "", + "arrive by": "", + "departure": "", + "book people": "" + } + } + }, + { + "speaker": "system", + "utterance": "Yes, Parkside is the address.", + "utt_idx": 5, + "dialogue_acts": { + "categorical": [], + "non-categorical": [], + "binary": [] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "Thank you that will be all for now.", + "utt_idx": 6, + "dialogue_acts": { + "categorical": [], + "non-categorical": [], + "binary": [ + { + "intent": "thank", + "domain": "general", + "slot": "", + "value": "" + } + ] + }, + "state": { + "attraction": { + "type": "", + "name": "", + "area": "" + }, + "hotel": { + "name": "", + "area": "", + "parking": "", + "price range": "", + "stars": "", + "internet": "", + "type": "", + "book stay": "", + "book day": "", + "book people": "" + }, + "restaurant": { + "food": "", + "price range": "", + "name": "", + "area": "", + "book time": "", + "book day": "", + "book people": "" + }, + "taxi": { + "leave at": "", + "destination": "", + "departure": "", + "arrive by": "" + }, + "train": { + "leave at": "", + "destination": "", + "day": "", + "arrive by": "", + "departure": "", + "book people": "" + } + } + }, + { + "speaker": "system", + "utterance": "Great. Thank you for contacting Cambridge Towninfo Centre.", + "utt_idx": 7, + "dialogue_acts": { + "categorical": [], + "non-categorical": [], + "binary": [ + { + "intent": "welcome", + "domain": "general", + "slot": "", + "value": "" + } + ] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "You were great. Goodbye.", + "utt_idx": 8, + "dialogue_acts": { + "categorical": [], + "non-categorical": [], + "binary": [] + }, + "state": { + "attraction": { + "type": "", + "name": "", + "area": "" + }, + "hotel": { + "name": "", + "area": "", + "parking": "", + "price range": "", + "stars": "", + "internet": "", + "type": "", + "book stay": "", + "book day": "", + "book people": "" + }, + "restaurant": { + "food": "", + "price range": "", + "name": "", + "area": "", + "book time": "", + "book day": "", + "book people": "" + }, + "taxi": { + "leave at": "", + "destination": "", + "departure": "", + "arrive by": "" + }, + "train": { + "leave at": "", + "destination": "", + "day": "", + "arrive by": "", + "departure": "", + "book people": "" + } + } + }, + { + "speaker": "system", + "utterance": "We are happy to help. Have a good day!", + "utt_idx": 9, + "dialogue_acts": { + "categorical": [], + "non-categorical": [], + "binary": [ + { + "intent": "bye", + "domain": "general", + "slot": "", + "value": "" + }, + { + "intent": "welcome", + "domain": "general", + "slot": "", + "value": "" + } + ] + }, + "db_results": {} + } + ] + }, + { + "dataset": "multiwoz21", + "data_split": "train", + "dialogue_id": "multiwoz21-train-2", + "original_id": "MUL2168.json", + "domains": [ + "hotel", + "train", + "booking", + "general" + ], + "goal": { + "description": "You are planning your trip in Cambridge. You are looking for a <span class='emphasis'>train</span>. The train should go to <span class='emphasis'>cambridge</span> and should <span class='emphasis'>arrive by 20:45</span>. The train should depart from <span class='emphasis'>birmingham new street</span> and should leave on <span class='emphasis'>wednesday</span>. Once you find the train you want to make a booking for <span class='emphasis'>5 people</span>. Make sure you get the <span class='emphasis'>reference number</span>. You are also looking for a <span class='emphasis'>place to stay</span>. The hotel should have <span class='emphasis'>a star of 4</span> and should <span class='emphasis'>include free wifi</span>. The hotel <span class='emphasis'>doesn't need to have free parking</span> and should be in the <span class='emphasis'>cheap</span> price range. If there is no such hotel, how about one that has <span class='emphasis'>free parking</span>. Once you find the <span class='emphasis'>hotel</span> you want to book it for <span class='emphasis'>the same group of people</span> and <span class='emphasis'>5 nights</span> starting from <span class='emphasis'>the same day</span>. Make sure you get the <span class='emphasis'>reference number</span>", + "inform": { + "hotel": { + "parking": "no|yes", + "price range": "cheap|cheap", + "stars": "4|4", + "internet": "yes|yes", + "book stay": "5", + "book day": "wednesday", + "book people": "5" + }, + "train": { + "destination": "cambridge", + "day": "wednesday", + "arrive by": "20:45", + "departure": "birmingham new street", + "book people": "5" + } + }, + "request": { + "hotel": {}, + "train": {} + } + }, + "turns": [ + { + "speaker": "user", + "utterance": "Hi, I'm looking for a train that is going to cambridge and arriving there by 20:45, is there anything like that?", + "utt_idx": 0, + "dialogue_acts": { + "categorical": [], + "non-categorical": [ + { + "intent": "inform", + "domain": "train", + "slot": "destination", + "value": "cambridge", + "start": 45, + "end": 54 + }, + { + "intent": "inform", + "domain": "train", + "slot": "arrive by", + "value": "20:45", + "start": 77, + "end": 82 + } + ], + "binary": [] + }, + "state": { + "attraction": { + "type": "", + "name": "", + "area": "" + }, + "hotel": { + "name": "", + "area": "", + "parking": "", + "price range": "", + "stars": "", + "internet": "", + "type": "", + "book stay": "", + "book day": "", + "book people": "" + }, + "restaurant": { + "food": "", + "price range": "", + "name": "", + "area": "", + "book time": "", + "book day": "", + "book people": "" + }, + "taxi": { + "leave at": "", + "destination": "", + "departure": "", + "arrive by": "" + }, + "train": { + "leave at": "", + "destination": "cambridge", + "day": "", + "arrive by": "20:45", + "departure": "", + "book people": "" + } + } + }, + { + "speaker": "system", + "utterance": "There are over 1,000 trains like that. Where will you be departing from?", + "utt_idx": 1, + "dialogue_acts": { + "categorical": [], + "non-categorical": [ + { + "intent": "inform", + "domain": "train", + "slot": "choice", + "value": "over 1", + "start": 10, + "end": 16 + }, + { + "intent": "inform", + "domain": "train", + "slot": "choice", + "value": "000" + } + ], + "binary": [ + { + "intent": "request", + "domain": "train", + "slot": "departure", + "value": "" + } + ] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "I am departing from birmingham new street.", + "utt_idx": 2, + "dialogue_acts": { + "categorical": [], + "non-categorical": [ + { + "intent": "inform", + "domain": "train", + "slot": "departure", + "value": "birmingham new street", + "start": 20, + "end": 41 + } + ], + "binary": [] + }, + "state": { + "attraction": { + "type": "", + "name": "", + "area": "" + }, + "hotel": { + "name": "", + "area": "", + "parking": "", + "price range": "", + "stars": "", + "internet": "", + "type": "", + "book stay": "", + "book day": "", + "book people": "" + }, + "restaurant": { + "food": "", + "price range": "", + "name": "", + "area": "", + "book time": "", + "book day": "", + "book people": "" + }, + "taxi": { + "leave at": "", + "destination": "", + "departure": "", + "arrive by": "" + }, + "train": { + "leave at": "", + "destination": "cambridge", + "day": "", + "arrive by": "20:45", + "departure": "birmingham new street", + "book people": "" + } + } + }, + { + "speaker": "system", + "utterance": "Can you confirm your desired travel day?", + "utt_idx": 3, + "dialogue_acts": { + "categorical": [], + "non-categorical": [], + "binary": [ + { + "intent": "request", + "domain": "train", + "slot": "day", + "value": "" + } + ] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "I would like to leave on wednesday", + "utt_idx": 4, + "dialogue_acts": { + "categorical": [ + { + "intent": "inform", + "domain": "train", + "slot": "day", + "value": "wednesday" + } + ], + "non-categorical": [], + "binary": [] + }, + "state": { + "attraction": { + "type": "", + "name": "", + "area": "" + }, + "hotel": { + "name": "", + "area": "", + "parking": "", + "price range": "", + "stars": "", + "internet": "", + "type": "", + "book stay": "", + "book day": "", + "book people": "" + }, + "restaurant": { + "food": "", + "price range": "", + "name": "", + "area": "", + "book time": "", + "book day": "", + "book people": "" + }, + "taxi": { + "leave at": "", + "destination": "", + "departure": "", + "arrive by": "" + }, + "train": { + "leave at": "", + "destination": "cambridge", + "day": "wednesday", + "arrive by": "20:45", + "departure": "birmingham new street", + "book people": "" + } + } + }, + { + "speaker": "system", + "utterance": "I show a train leaving birmingham new street at 17:40 and arriving at 20:23 on Wednesday. Will this work for you?", + "utt_idx": 5, + "dialogue_acts": { + "categorical": [ + { + "intent": "inform", + "domain": "train", + "slot": "day", + "value": "Wednesday" + } + ], + "non-categorical": [ + { + "intent": "inform", + "domain": "train", + "slot": "arrive by", + "value": "20:23", + "start": 70, + "end": 75 + }, + { + "intent": "inform", + "domain": "train", + "slot": "leave at", + "value": "17:40", + "start": 48, + "end": 53 + }, + { + "intent": "inform", + "domain": "train", + "slot": "departure", + "value": "birmingham new street", + "start": 23, + "end": 44 + } + ], + "binary": [] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "That will, yes. Please make a booking for 5 people please.", + "utt_idx": 6, + "dialogue_acts": { + "categorical": [], + "non-categorical": [ + { + "intent": "inform", + "domain": "train", + "slot": "book people", + "value": "5", + "start": 42, + "end": 43 + } + ], + "binary": [] + }, + "state": { + "attraction": { + "type": "", + "name": "", + "area": "" + }, + "hotel": { + "name": "", + "area": "", + "parking": "", + "price range": "", + "stars": "", + "internet": "", + "type": "", + "book stay": "", + "book day": "", + "book people": "" + }, + "restaurant": { + "food": "", + "price range": "", + "name": "", + "area": "", + "book time": "", + "book day": "", + "book people": "" + }, + "taxi": { + "leave at": "", + "destination": "", + "departure": "", + "arrive by": "" + }, + "train": { + "leave at": "", + "destination": "cambridge", + "day": "wednesday", + "arrive by": "20:45", + "departure": "birmingham new street", + "book people": "5" + } + } + }, + { + "speaker": "system", + "utterance": "I've booked your train tickets, and your reference number is A9NHSO9Y.", + "utt_idx": 7, + "dialogue_acts": { + "categorical": [], + "non-categorical": [ + { + "intent": "offerbooked", + "domain": "train", + "slot": "ref", + "value": "A9NHSO9Y", + "start": 61, + "end": 69 + } + ], + "binary": [] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "Thanks so much. I would also need a place to say. I am looking for something with 4 stars and has free wifi. ", + "utt_idx": 8, + "dialogue_acts": { + "categorical": [ + { + "intent": "inform", + "domain": "hotel", + "slot": "internet", + "value": "yes" + }, + { + "intent": "inform", + "domain": "hotel", + "slot": "stars", + "value": "4" + } + ], + "non-categorical": [], + "binary": [] + }, + "state": { + "attraction": { + "type": "", + "name": "", + "area": "" + }, + "hotel": { + "name": "", + "area": "", + "parking": "", + "price range": "", + "stars": "4", + "internet": "yes", + "type": "", + "book stay": "", + "book day": "", + "book people": "" + }, + "restaurant": { + "food": "", + "price range": "", + "name": "", + "area": "", + "book time": "", + "book day": "", + "book people": "" + }, + "taxi": { + "leave at": "", + "destination": "", + "departure": "", + "arrive by": "" + }, + "train": { + "leave at": "", + "destination": "cambridge", + "day": "wednesday", + "arrive by": "20:45", + "departure": "birmingham new street", + "book people": "5" + } + } + }, + { + "speaker": "system", + "utterance": "How about the cambridge belfry? It has all the attributes you requested and a great name! Maybe even a real belfry?", + "utt_idx": 9, + "dialogue_acts": { + "categorical": [], + "non-categorical": [ + { + "intent": "recommend", + "domain": "hotel", + "slot": "name", + "value": "the cambridge belfry", + "start": 10, + "end": 30 + } + ], + "binary": [] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "That sounds great, could you make a booking for me please?", + "utt_idx": 10, + "dialogue_acts": { + "categorical": [], + "non-categorical": [], + "binary": [] + }, + "state": { + "attraction": { + "type": "", + "name": "", + "area": "" + }, + "hotel": { + "name": "cambridge belfry", + "area": "", + "parking": "", + "price range": "", + "stars": "4", + "internet": "yes", + "type": "", + "book stay": "", + "book day": "", + "book people": "" + }, + "restaurant": { + "food": "", + "price range": "", + "name": "", + "area": "", + "book time": "", + "book day": "", + "book people": "" + }, + "taxi": { + "leave at": "", + "destination": "", + "departure": "", + "arrive by": "" + }, + "train": { + "leave at": "", + "destination": "cambridge", + "day": "wednesday", + "arrive by": "20:45", + "departure": "birmingham new street", + "book people": "5" + } + } + }, + { + "speaker": "system", + "utterance": "What day would you like your booking for?", + "utt_idx": 11, + "dialogue_acts": { + "categorical": [], + "non-categorical": [], + "binary": [ + { + "intent": "request", + "domain": "booking", + "slot": "day", + "value": "" + } + ] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "Please book it for Wednesday for 5 people and 5 nights, please.", + "utt_idx": 12, + "dialogue_acts": { + "categorical": [ + { + "intent": "inform", + "domain": "hotel", + "slot": "book day", + "value": "wednesday" + } + ], + "non-categorical": [ + { + "intent": "inform", + "domain": "hotel", + "slot": "book stay", + "value": "5", + "start": 46, + "end": 47 + }, + { + "intent": "inform", + "domain": "hotel", + "slot": "book people", + "value": "5", + "start": 33, + "end": 34 + } + ], + "binary": [] + }, + "state": { + "attraction": { + "type": "", + "name": "", + "area": "" + }, + "hotel": { + "name": "cambridge belfry", + "area": "", + "parking": "", + "price range": "", + "stars": "4", + "internet": "yes", + "type": "", + "book stay": "5", + "book day": "wednesday", + "book people": "5" + }, + "restaurant": { + "food": "", + "price range": "", + "name": "", + "area": "", + "book time": "", + "book day": "", + "book people": "" + }, + "taxi": { + "leave at": "", + "destination": "", + "departure": "", + "arrive by": "" + }, + "train": { + "leave at": "", + "destination": "cambridge", + "day": "wednesday", + "arrive by": "20:45", + "departure": "birmingham new street", + "book people": "5" + } + } + }, + { + "speaker": "system", + "utterance": "Booking was successful. Reference number is : 5NAWGJDC.", + "utt_idx": 13, + "dialogue_acts": { + "categorical": [], + "non-categorical": [ + { + "intent": "book", + "domain": "booking", + "slot": "ref", + "value": "5NAWGJDC", + "start": 46, + "end": 54 + } + ], + "binary": [] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "Thank you, goodbye", + "utt_idx": 14, + "dialogue_acts": { + "categorical": [], + "non-categorical": [], + "binary": [ + { + "intent": "thank", + "domain": "general", + "slot": "", + "value": "" + } + ] + }, + "state": { + "attraction": { + "type": "", + "name": "", + "area": "" + }, + "hotel": { + "name": "cambridge belfry", + "area": "", + "parking": "", + "price range": "", + "stars": "4", + "internet": "yes", + "type": "", + "book stay": "5", + "book day": "wednesday", + "book people": "5" + }, + "restaurant": { + "food": "", + "price range": "", + "name": "", + "area": "", + "book time": "", + "book day": "", + "book people": "" + }, + "taxi": { + "leave at": "", + "destination": "", + "departure": "", + "arrive by": "" + }, + "train": { + "leave at": "", + "destination": "cambridge", + "day": "wednesday", + "arrive by": "20:45", + "departure": "birmingham new street", + "book people": "5" + } + } + }, + { + "speaker": "system", + "utterance": "Goodbye. If you think of anything else you need don't hesitate to contact us.", + "utt_idx": 15, + "dialogue_acts": { + "categorical": [], + "non-categorical": [], + "binary": [ + { + "intent": "bye", + "domain": "general", + "slot": "", + "value": "" + } + ] + }, + "db_results": {} + } + ] + }, + { + "dataset": "multiwoz21", + "data_split": "train", + "dialogue_id": "multiwoz21-train-3", + "original_id": "SNG01445.json", + "domains": [ + "police", + "general" + ], + "goal": { + "description": "You were <span class='emphasis'>robbed</span> and are looking for help. Make sure you get <span class='emphasis'>address</span>, <span class='emphasis'>phone number</span>, and <span class='emphasis'>postcode</span>", + "inform": { + "police": {} + }, + "request": { + "police": { + "address": "", + "phone": "", + "postcode": "" + } + } + }, + "turns": [ + { + "speaker": "user", + "utterance": "I was robbed a few minutes ago and need assistance.", + "utt_idx": 0, + "dialogue_acts": { + "categorical": [], + "non-categorical": [], + "binary": [] + }, + "state": { + "attraction": { + "type": "", + "name": "", + "area": "" + }, + "hotel": { + "name": "", + "area": "", + "parking": "", + "price range": "", + "stars": "", + "internet": "", + "type": "", + "book stay": "", + "book day": "", + "book people": "" + }, + "restaurant": { + "food": "", + "price range": "", + "name": "", + "area": "", + "book time": "", + "book day": "", + "book people": "" + }, + "taxi": { + "leave at": "", + "destination": "", + "departure": "", + "arrive by": "" + }, + "train": { + "leave at": "", + "destination": "", + "day": "", + "arrive by": "", + "departure": "", + "book people": "" + } + } + }, + { + "speaker": "system", + "utterance": "Please contact the Parkside, Cambridge police station.", + "utt_idx": 1, + "dialogue_acts": { + "categorical": [], + "non-categorical": [ + { + "intent": "inform", + "domain": "police", + "slot": "address", + "value": "Parkside, Cambridge", + "start": 19, + "end": 38 + } + ], + "binary": [] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "I need the address including postcode of the police station and also the phone number.", + "utt_idx": 2, + "dialogue_acts": { + "categorical": [], + "non-categorical": [], + "binary": [ + { + "intent": "request", + "domain": "police", + "slot": "address", + "value": "" + }, + { + "intent": "request", + "domain": "police", + "slot": "postcode", + "value": "" + }, + { + "intent": "request", + "domain": "police", + "slot": "phone", + "value": "" + } + ] + }, + "state": { + "attraction": { + "type": "", + "name": "", + "area": "" + }, + "hotel": { + "name": "", + "area": "", + "parking": "", + "price range": "", + "stars": "", + "internet": "", + "type": "", + "book stay": "", + "book day": "", + "book people": "" + }, + "restaurant": { + "food": "", + "price range": "", + "name": "", + "area": "", + "book time": "", + "book day": "", + "book people": "" + }, + "taxi": { + "leave at": "", + "destination": "", + "departure": "", + "arrive by": "" + }, + "train": { + "leave at": "", + "destination": "", + "day": "", + "arrive by": "", + "departure": "", + "book people": "" + } + } + }, + { + "speaker": "system", + "utterance": "Of course. Their address is Parkside, Cambridge postcode CB11JG and phone number is 01223358966.", + "utt_idx": 3, + "dialogue_acts": { + "categorical": [], + "non-categorical": [ + { + "intent": "inform", + "domain": "police", + "slot": "phone", + "value": "01223358966", + "start": 84, + "end": 95 + }, + { + "intent": "inform", + "domain": "police", + "slot": "address", + "value": "Parkside, Cambridge", + "start": 28, + "end": 47 + }, + { + "intent": "inform", + "domain": "police", + "slot": "postcode", + "value": "CB11JG", + "start": 57, + "end": 63 + } + ], + "binary": [] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "Thank you. That's all I needed.", + "utt_idx": 4, + "dialogue_acts": { + "categorical": [], + "non-categorical": [], + "binary": [ + { + "intent": "thank", + "domain": "general", + "slot": "", + "value": "" + } + ] + }, + "state": { + "attraction": { + "type": "", + "name": "", + "area": "" + }, + "hotel": { + "name": "", + "area": "", + "parking": "", + "price range": "", + "stars": "", + "internet": "", + "type": "", + "book stay": "", + "book day": "", + "book people": "" + }, + "restaurant": { + "food": "", + "price range": "", + "name": "", + "area": "", + "book time": "", + "book day": "", + "book people": "" + }, + "taxi": { + "leave at": "", + "destination": "", + "departure": "", + "arrive by": "" + }, + "train": { + "leave at": "", + "destination": "", + "day": "", + "arrive by": "", + "departure": "", + "book people": "" + } + } + }, + { + "speaker": "system", + "utterance": "You are more than welcome!\n", + "utt_idx": 5, + "dialogue_acts": { + "categorical": [], + "non-categorical": [], + "binary": [ + { + "intent": "greet", + "domain": "general", + "slot": "", + "value": "" + } + ] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "Ok, have a good day. Goodbye.", + "utt_idx": 6, + "dialogue_acts": { + "categorical": [], + "non-categorical": [], + "binary": [] + }, + "state": { + "attraction": { + "type": "", + "name": "", + "area": "" + }, + "hotel": { + "name": "", + "area": "", + "parking": "", + "price range": "", + "stars": "", + "internet": "", + "type": "", + "book stay": "", + "book day": "", + "book people": "" + }, + "restaurant": { + "food": "", + "price range": "", + "name": "", + "area": "", + "book time": "", + "book day": "", + "book people": "" + }, + "taxi": { + "leave at": "", + "destination": "", + "departure": "", + "arrive by": "" + }, + "train": { + "leave at": "", + "destination": "", + "day": "", + "arrive by": "", + "departure": "", + "book people": "" + } + } + }, + { + "speaker": "system", + "utterance": "You are very welcome. Goodbye.", + "utt_idx": 7, + "dialogue_acts": { + "categorical": [], + "non-categorical": [], + "binary": [ + { + "intent": "bye", + "domain": "general", + "slot": "", + "value": "" + } + ] + }, + "db_results": {} + } + ] + }, + { + "dataset": "multiwoz21", + "data_split": "train", + "dialogue_id": "multiwoz21-train-4", + "original_id": "MUL2105.json", + "domains": [ + "hotel", + "train", + "general", + "booking" + ], + "goal": { + "description": "You are looking for information in Cambridge. You are looking for a <span class='emphasis'>place to stay</span>. The hotel should be in the <span class='emphasis'>east</span> and <span class='emphasis'>doesn't need to include internet</span>. The hotel should <span class='emphasis'>include free parking</span> and should be in the type of <span class='emphasis'>guesthouse</span>. If there is no such hotel, how about one that has <span class='emphasis'>free wifi</span>. Make sure you get <span class='emphasis'>address</span>. You are also looking for a <span class='emphasis'>train</span>. The train should leave on <span class='emphasis'>wednesday</span> and should <span class='emphasis'>arrive by 11:30</span>. The train should go to <span class='emphasis'>cambridge</span> and should depart from <span class='emphasis'>ely</span>. Once you find the train you want to make a booking for <span class='emphasis'>8 people</span>. Make sure you get the <span class='emphasis'>reference number</span>", + "inform": { + "hotel": { + "parking": "yes|yes", + "type": "guesthouse|guesthouse", + "internet": "no|yes", + "area": "east|centre" + }, + "train": { + "destination": "cambridge", + "day": "wednesday", + "arrive by": "11:30", + "departure": "ely", + "book people": "8" + } + }, + "request": { + "hotel": { + "address": "" + }, + "train": {} + } + }, + "turns": [ + { + "speaker": "user", + "utterance": "Hi. I'm looking for a hotel in the east. The internet is not needed.", + "utt_idx": 0, + "dialogue_acts": { + "categorical": [ + { + "intent": "inform", + "domain": "hotel", + "slot": "area", + "value": "east" + } + ], + "non-categorical": [], + "binary": [] + }, + "state": { + "attraction": { + "type": "", + "name": "", + "area": "" + }, + "hotel": { + "name": "", + "area": "east", + "parking": "", + "price range": "", + "stars": "", + "internet": "no", + "type": "", + "book stay": "", + "book day": "", + "book people": "" + }, + "restaurant": { + "food": "", + "price range": "", + "name": "", + "area": "", + "book time": "", + "book day": "", + "book people": "" + }, + "taxi": { + "leave at": "", + "destination": "", + "departure": "", + "arrive by": "" + }, + "train": { + "leave at": "", + "destination": "", + "day": "", + "arrive by": "", + "departure": "", + "book people": "" + } + } + }, + { + "speaker": "system", + "utterance": "There are no hotels that do not have internet, but 7 which do have it. Do you have a specific price range you'd prefer?", + "utt_idx": 1, + "dialogue_acts": { + "categorical": [], + "non-categorical": [ + { + "intent": "inform", + "domain": "hotel", + "slot": "choice", + "value": "7", + "start": 51, + "end": 52 + } + ], + "binary": [ + { + "intent": "request", + "domain": "hotel", + "slot": "price range", + "value": "" + }, + { + "intent": "inform", + "domain": "hotel", + "slot": "internet", + "value": "" + } + ] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "no, but it should have free parking, please.", + "utt_idx": 2, + "dialogue_acts": { + "categorical": [ + { + "intent": "inform", + "domain": "hotel", + "slot": "parking", + "value": "yes" + }, + { + "intent": "inform", + "domain": "hotel", + "slot": "price range", + "value": "dontcare" + } + ], + "non-categorical": [], + "binary": [] + }, + "state": { + "attraction": { + "type": "", + "name": "", + "area": "" + }, + "hotel": { + "name": "", + "area": "east", + "parking": "yes", + "price range": "dontcare", + "stars": "", + "internet": "no", + "type": "", + "book stay": "", + "book day": "", + "book people": "" + }, + "restaurant": { + "food": "", + "price range": "", + "name": "", + "area": "", + "book time": "", + "book day": "", + "book people": "" + }, + "taxi": { + "leave at": "", + "destination": "", + "departure": "", + "arrive by": "" + }, + "train": { + "leave at": "", + "destination": "", + "day": "", + "arrive by": "", + "departure": "", + "book people": "" + } + } + }, + { + "speaker": "system", + "utterance": "The Express by Holiday Inn Cambridge is in the area and has free parking, would you like more information?", + "utt_idx": 3, + "dialogue_acts": { + "categorical": [], + "non-categorical": [ + { + "intent": "inform", + "domain": "hotel", + "slot": "name", + "value": "Express by Holiday Inn Cambridge", + "start": 4, + "end": 36 + } + ], + "binary": [ + { + "intent": "inform", + "domain": "hotel", + "slot": "parking", + "value": "" + }, + { + "intent": "reqmore", + "domain": "general", + "slot": "", + "value": "" + } + ] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "Is that a guesthouse or a hotel? I'd really prefer a guesthouse.", + "utt_idx": 4, + "dialogue_acts": { + "categorical": [], + "non-categorical": [ + { + "intent": "inform", + "domain": "hotel", + "slot": "type", + "value": "guesthouse", + "start": 10, + "end": 20 + } + ], + "binary": [] + }, + "state": { + "attraction": { + "type": "", + "name": "", + "area": "" + }, + "hotel": { + "name": "express by holiday inn cambridge", + "area": "east", + "parking": "yes", + "price range": "dontcare", + "stars": "", + "internet": "no", + "type": "guesthouse", + "book stay": "", + "book day": "", + "book people": "" + }, + "restaurant": { + "food": "", + "price range": "", + "name": "", + "area": "", + "book time": "", + "book day": "", + "book people": "" + }, + "taxi": { + "leave at": "", + "destination": "", + "departure": "", + "arrive by": "" + }, + "train": { + "leave at": "", + "destination": "", + "day": "", + "arrive by": "", + "departure": "", + "book people": "" + } + } + }, + { + "speaker": "system", + "utterance": "It's a hotel. There are 5 guesthouses in the area. Do you prefer cheap or moderate for the price range?", + "utt_idx": 5, + "dialogue_acts": { + "categorical": [], + "non-categorical": [ + { + "intent": "inform", + "domain": "hotel", + "slot": "choice", + "value": "5", + "start": 24, + "end": 25 + }, + { + "intent": "inform", + "domain": "hotel", + "slot": "type", + "value": "hotel", + "start": 7, + "end": 12 + }, + { + "intent": "inform", + "domain": "hotel", + "slot": "type", + "value": "guesthouse", + "start": 26, + "end": 36 + } + ], + "binary": [ + { + "intent": "request", + "domain": "hotel", + "slot": "price range", + "value": "" + } + ] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "The price doesn't really matter. I just need free parking. It doesn't really need to have internet though. ", + "utt_idx": 6, + "dialogue_acts": { + "categorical": [], + "non-categorical": [], + "binary": [] + }, + "state": { + "attraction": { + "type": "", + "name": "", + "area": "" + }, + "hotel": { + "name": "", + "area": "east", + "parking": "yes", + "price range": "dontcare", + "stars": "", + "internet": "no", + "type": "guesthouse", + "book stay": "", + "book day": "", + "book people": "" + }, + "restaurant": { + "food": "", + "price range": "", + "name": "", + "area": "", + "book time": "", + "book day": "", + "book people": "" + }, + "taxi": { + "leave at": "", + "destination": "", + "departure": "", + "arrive by": "" + }, + "train": { + "leave at": "", + "destination": "", + "day": "", + "arrive by": "", + "departure": "", + "book people": "" + } + } + }, + { + "speaker": "system", + "utterance": "There are 5 guesthouses that have free parking. Should I book one of them for you?", + "utt_idx": 7, + "dialogue_acts": { + "categorical": [], + "non-categorical": [ + { + "intent": "inform", + "domain": "hotel", + "slot": "type", + "value": "guesthouse", + "start": 12, + "end": 22 + }, + { + "intent": "inform", + "domain": "hotel", + "slot": "choice", + "value": "5", + "start": 10, + "end": 11 + } + ], + "binary": [ + { + "intent": "inform", + "domain": "booking", + "slot": "", + "value": "" + }, + { + "intent": "inform", + "domain": "hotel", + "slot": "parking", + "value": "" + } + ] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "Okay, none of them DON'T offer free wifi? If not, I'll need the address for one that does have wifi, please. Tell me about your favorite.", + "utt_idx": 8, + "dialogue_acts": { + "categorical": [ + { + "intent": "inform", + "domain": "hotel", + "slot": "internet", + "value": "yes" + } + ], + "non-categorical": [], + "binary": [ + { + "intent": "request", + "domain": "hotel", + "slot": "address", + "value": "" + } + ] + }, + "state": { + "attraction": { + "type": "", + "name": "", + "area": "" + }, + "hotel": { + "name": "", + "area": "east", + "parking": "yes", + "price range": "dontcare", + "stars": "", + "internet": "no", + "type": "guesthouse", + "book stay": "", + "book day": "", + "book people": "" + }, + "restaurant": { + "food": "", + "price range": "", + "name": "", + "area": "", + "book time": "", + "book day": "", + "book people": "" + }, + "taxi": { + "leave at": "", + "destination": "", + "departure": "", + "arrive by": "" + }, + "train": { + "leave at": "", + "destination": "", + "day": "", + "arrive by": "", + "departure": "", + "book people": "" + } + } + }, + { + "speaker": "system", + "utterance": "The allenbell is a guesthouse on the east. The addres sis 517a coldham lane post code cb13js.", + "utt_idx": 9, + "dialogue_acts": { + "categorical": [ + { + "intent": "recommend", + "domain": "hotel", + "slot": "area", + "value": "east" + } + ], + "non-categorical": [ + { + "intent": "recommend", + "domain": "hotel", + "slot": "address", + "value": "517a coldham lane", + "start": 59, + "end": 76 + }, + { + "intent": "recommend", + "domain": "hotel", + "slot": "postcode", + "value": "cb13js", + "start": 87, + "end": 93 + }, + { + "intent": "recommend", + "domain": "hotel", + "slot": "name", + "value": "allenbell", + "start": 4, + "end": 13 + }, + { + "intent": "recommend", + "domain": "hotel", + "slot": "type", + "value": "guesthouse", + "start": 19, + "end": 29 + } + ], + "binary": [] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "Great. Can you help me with a train as well? I'm looking to come in from Ely.", + "utt_idx": 10, + "dialogue_acts": { + "categorical": [], + "non-categorical": [ + { + "intent": "inform", + "domain": "train", + "slot": "departure", + "value": "Ely", + "start": 73, + "end": 76 + } + ], + "binary": [] + }, + "state": { + "attraction": { + "type": "", + "name": "", + "area": "" + }, + "hotel": { + "name": "", + "area": "east", + "parking": "yes", + "price range": "dontcare", + "stars": "", + "internet": "no", + "type": "guesthouse", + "book stay": "", + "book day": "", + "book people": "" + }, + "restaurant": { + "food": "", + "price range": "", + "name": "", + "area": "", + "book time": "", + "book day": "", + "book people": "" + }, + "taxi": { + "leave at": "", + "destination": "", + "departure": "", + "arrive by": "" + }, + "train": { + "leave at": "", + "destination": "", + "day": "", + "arrive by": "", + "departure": "ely", + "book people": "" + } + } + }, + { + "speaker": "system", + "utterance": "What is your destination? And do you have any preference for day, departure or arrival time?", + "utt_idx": 11, + "dialogue_acts": { + "categorical": [], + "non-categorical": [], + "binary": [ + { + "intent": "request", + "domain": "train", + "slot": "day", + "value": "" + }, + { + "intent": "request", + "domain": "train", + "slot": "destination", + "value": "" + }, + { + "intent": "request", + "domain": "train", + "slot": "leave at", + "value": "" + }, + { + "intent": "request", + "domain": "train", + "slot": "arrive by", + "value": "" + } + ] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "I want to go from cambridge to ely, no preference for time but I need to book for 8 people.", + "utt_idx": 12, + "dialogue_acts": { + "categorical": [], + "non-categorical": [ + { + "intent": "inform", + "domain": "train", + "slot": "destination", + "value": "ely", + "start": 31, + "end": 34 + }, + { + "intent": "inform", + "domain": "train", + "slot": "departure", + "value": "cambridge", + "start": 18, + "end": 27 + } + ], + "binary": [] + }, + "state": { + "attraction": { + "type": "", + "name": "", + "area": "" + }, + "hotel": { + "name": "", + "area": "east", + "parking": "yes", + "price range": "dontcare", + "stars": "", + "internet": "no", + "type": "guesthouse", + "book stay": "", + "book day": "", + "book people": "" + }, + "restaurant": { + "food": "", + "price range": "", + "name": "", + "area": "", + "book time": "", + "book day": "", + "book people": "" + }, + "taxi": { + "leave at": "", + "destination": "", + "departure": "", + "arrive by": "" + }, + "train": { + "leave at": "dontcare", + "destination": "ely", + "day": "", + "arrive by": "dontcare", + "departure": "cambridge", + "book people": "8" + } + } + }, + { + "speaker": "system", + "utterance": "There are trains that run every two hours every day starting at 5:50. Do you know what day you'd like to travel?", + "utt_idx": 13, + "dialogue_acts": { + "categorical": [], + "non-categorical": [ + { + "intent": "inform", + "domain": "train", + "slot": "leave at", + "value": "starting at 5:50", + "start": 52, + "end": 68 + } + ], + "binary": [ + { + "intent": "request", + "domain": "train", + "slot": "day", + "value": "" + } + ] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "I'm looking for Wednesday , arrive by 11:30 for 8 people Please.", + "utt_idx": 14, + "dialogue_acts": { + "categorical": [ + { + "intent": "inform", + "domain": "train", + "slot": "day", + "value": "wednesday" + } + ], + "non-categorical": [ + { + "intent": "inform", + "domain": "train", + "slot": "arrive by", + "value": "11:30", + "start": 38, + "end": 43 + }, + { + "intent": "inform", + "domain": "train", + "slot": "book people", + "value": "8", + "start": 48, + "end": 49 + } + ], + "binary": [] + }, + "state": { + "attraction": { + "type": "", + "name": "", + "area": "" + }, + "hotel": { + "name": "", + "area": "east", + "parking": "yes", + "price range": "dontcare", + "stars": "", + "internet": "no", + "type": "guesthouse", + "book stay": "", + "book day": "", + "book people": "" + }, + "restaurant": { + "food": "", + "price range": "", + "name": "", + "area": "", + "book time": "", + "book day": "", + "book people": "" + }, + "taxi": { + "leave at": "", + "destination": "", + "departure": "", + "arrive by": "" + }, + "train": { + "leave at": "dontcare", + "destination": "ely", + "day": "wednesday", + "arrive by": "11:30", + "departure": "cambridge", + "book people": "8" + } + } + }, + { + "speaker": "system", + "utterance": "Booking was successful, the total fee is 35.2 GBP payable at the station. You reference number is 7ZCVR4Q3. Can I help you with anything else?", + "utt_idx": 15, + "dialogue_acts": { + "categorical": [], + "non-categorical": [ + { + "intent": "offerbooked", + "domain": "train", + "slot": "ref", + "value": "7ZCVR4Q3", + "start": 98, + "end": 106 + }, + { + "intent": "offerbooked", + "domain": "train", + "slot": "price", + "value": "35.2 GBP", + "start": 41, + "end": 49 + } + ], + "binary": [] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "No, I don't need anything else right now. Thank you for your assistance. Good Bye.", + "utt_idx": 16, + "dialogue_acts": { + "categorical": [], + "non-categorical": [], + "binary": [ + { + "intent": "bye", + "domain": "general", + "slot": "", + "value": "" + } + ] + }, + "state": { + "attraction": { + "type": "", + "name": "", + "area": "" + }, + "hotel": { + "name": "", + "area": "east", + "parking": "yes", + "price range": "dontcare", + "stars": "", + "internet": "no", + "type": "guesthouse", + "book stay": "", + "book day": "", + "book people": "" + }, + "restaurant": { + "food": "", + "price range": "", + "name": "", + "area": "", + "book time": "", + "book day": "", + "book people": "" + }, + "taxi": { + "leave at": "", + "destination": "", + "departure": "", + "arrive by": "" + }, + "train": { + "leave at": "dontcare", + "destination": "ely", + "day": "wednesday", + "arrive by": "11:30", + "departure": "cambridge", + "book people": "8" + } + } + }, + { + "speaker": "system", + "utterance": "Thank you and enjoy your visit. Have a great day.", + "utt_idx": 17, + "dialogue_acts": { + "categorical": [], + "non-categorical": [], + "binary": [ + { + "intent": "bye", + "domain": "general", + "slot": "", + "value": "" + } + ] + }, + "db_results": {} + } + ] + }, + { + "dataset": "multiwoz21", + "data_split": "train", + "dialogue_id": "multiwoz21-train-5", + "original_id": "PMUL1690.json", + "domains": [ + "hotel", + "train", + "booking", + "general" + ], + "goal": { + "description": "You are planning your trip in Cambridge. You are looking for a <span class='emphasis'>train</span>. The train should depart from <span class='emphasis'>norwich</span> and should <span class='emphasis'>leave after 21:15</span>. The train should leave on <span class='emphasis'>monday</span> and should go to <span class='emphasis'>cambridge</span>. Make sure you get <span class='emphasis'>arrival time</span>, <span class='emphasis'>travel time</span>, and <span class='emphasis'>price</span>. You are also looking for a <span class='emphasis'>place to stay</span>. The hotel should have <span class='emphasis'>a star of 4</span> and should be in the <span class='emphasis'>north</span>. The hotel should be in the <span class='emphasis'>moderate</span> price range and should be in the type of <span class='emphasis'>hotel</span>. If there is no such hotel, how about one that is in <span class='emphasis'>the type of guesthouse</span>. Once you find the <span class='emphasis'>hotel</span> you want to book it for <span class='emphasis'>4 people</span> and <span class='emphasis'>2 nights</span> starting from <span class='emphasis'>the same day</span>. Make sure you get the <span class='emphasis'>reference number</span>", + "inform": { + "hotel": { + "type": "hotel|guesthouse", + "price range": "moderate|moderate", + "stars": "4|4", + "area": "north|north", + "book stay": "2", + "book day": "monday", + "book people": "4" + }, + "train": { + "leave at": "21:15", + "destination": "cambridge", + "day": "monday", + "departure": "norwich" + } + }, + "request": { + "hotel": {}, + "train": { + "arrive by": "", + "duration": "", + "price": "" + } + } + }, + "turns": [ + { + "speaker": "user", + "utterance": "I need to take a train from Norwich, leaving after 21:15.", + "utt_idx": 0, + "dialogue_acts": { + "categorical": [], + "non-categorical": [ + { + "intent": "inform", + "domain": "train", + "slot": "departure", + "value": "Norwich", + "start": 28, + "end": 35 + }, + { + "intent": "inform", + "domain": "train", + "slot": "leave at", + "value": "21:15", + "start": 51, + "end": 56 + } + ], + "binary": [] + }, + "state": { + "attraction": { + "type": "", + "name": "", + "area": "" + }, + "hotel": { + "name": "", + "area": "", + "parking": "", + "price range": "", + "stars": "", + "internet": "", + "type": "", + "book stay": "", + "book day": "", + "book people": "" + }, + "restaurant": { + "food": "", + "price range": "", + "name": "", + "area": "", + "book time": "", + "book day": "", + "book people": "" + }, + "taxi": { + "leave at": "", + "destination": "", + "departure": "", + "arrive by": "" + }, + "train": { + "leave at": "21:15", + "destination": "", + "day": "", + "arrive by": "", + "departure": "norwich", + "book people": "" + } + } + }, + { + "speaker": "system", + "utterance": "Where would you like to go to?", + "utt_idx": 1, + "dialogue_acts": { + "categorical": [], + "non-categorical": [], + "binary": [ + { + "intent": "request", + "domain": "train", + "slot": "destination", + "value": "" + } + ] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "I would like to go to Ely please.", + "utt_idx": 2, + "dialogue_acts": { + "categorical": [], + "non-categorical": [ + { + "intent": "inform", + "domain": "train", + "slot": "destination", + "value": "Ely", + "start": 22, + "end": 25 + } + ], + "binary": [] + }, + "state": { + "attraction": { + "type": "", + "name": "", + "area": "" + }, + "hotel": { + "name": "", + "area": "", + "parking": "", + "price range": "", + "stars": "", + "internet": "", + "type": "", + "book stay": "", + "book day": "", + "book people": "" + }, + "restaurant": { + "food": "", + "price range": "", + "name": "", + "area": "", + "book time": "", + "book day": "", + "book people": "" + }, + "taxi": { + "leave at": "", + "destination": "", + "departure": "", + "arrive by": "" + }, + "train": { + "leave at": "", + "destination": "ely", + "day": "", + "arrive by": "", + "departure": "norwich", + "book people": "" + } + } + }, + { + "speaker": "system", + "utterance": "I don't have any listings for trains leaving Norwich and arriving in Ely. Would you like to try a different departure station?", + "utt_idx": 3, + "dialogue_acts": { + "categorical": [], + "non-categorical": [ + { + "intent": "nooffer", + "domain": "train", + "slot": "destination", + "value": "Ely", + "start": 69, + "end": 72 + }, + { + "intent": "nooffer", + "domain": "train", + "slot": "departure", + "value": "Norwich", + "start": 45, + "end": 52 + } + ], + "binary": [ + { + "intent": "request", + "domain": "train", + "slot": "departure", + "value": "" + } + ] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "I meant Cambridge please. I would like to arrive in Cambridge on Monday after 21:15.", + "utt_idx": 4, + "dialogue_acts": { + "categorical": [ + { + "intent": "inform", + "domain": "train", + "slot": "day", + "value": "monday" + } + ], + "non-categorical": [ + { + "intent": "inform", + "domain": "train", + "slot": "destination", + "value": "Cambridge", + "start": 8, + "end": 17 + }, + { + "intent": "inform", + "domain": "train", + "slot": "leave at", + "value": "21:15", + "start": 78, + "end": 83 + } + ], + "binary": [] + }, + "state": { + "attraction": { + "type": "", + "name": "", + "area": "" + }, + "hotel": { + "name": "", + "area": "", + "parking": "", + "price range": "", + "stars": "", + "internet": "", + "type": "", + "book stay": "", + "book day": "", + "book people": "" + }, + "restaurant": { + "food": "", + "price range": "", + "name": "", + "area": "", + "book time": "", + "book day": "", + "book people": "" + }, + "taxi": { + "leave at": "", + "destination": "", + "departure": "", + "arrive by": "" + }, + "train": { + "leave at": "", + "destination": "ely", + "day": "", + "arrive by": "", + "departure": "norwich", + "book people": "" + } + } + }, + { + "speaker": "system", + "utterance": "Ok I will book that for you and get you a confirmation number", + "utt_idx": 5, + "dialogue_acts": { + "categorical": [], + "non-categorical": [], + "binary": [ + { + "intent": "offerbook", + "domain": "train", + "slot": "", + "value": "" + } + ] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "I don't need anything booked. I just need to get the arrival time, travel time and price of a train from norwich to cambridge leaving after 21:15.", + "utt_idx": 6, + "dialogue_acts": { + "categorical": [], + "non-categorical": [ + { + "intent": "inform", + "domain": "train", + "slot": "destination", + "value": "cambridge", + "start": 117, + "end": 126 + }, + { + "intent": "inform", + "domain": "train", + "slot": "departure", + "value": "norwich", + "start": 106, + "end": 113 + }, + { + "intent": "inform", + "domain": "train", + "slot": "leave at", + "value": "21:15", + "start": 141, + "end": 146 + } + ], + "binary": [ + { + "intent": "request", + "domain": "train", + "slot": "arrive by", + "value": "" + }, + { + "intent": "request", + "domain": "train", + "slot": "duration", + "value": "" + }, + { + "intent": "request", + "domain": "train", + "slot": "price", + "value": "" + } + ] + }, + "state": { + "attraction": { + "type": "", + "name": "", + "area": "" + }, + "hotel": { + "name": "", + "area": "", + "parking": "", + "price range": "", + "stars": "", + "internet": "", + "type": "", + "book stay": "", + "book day": "", + "book people": "" + }, + "restaurant": { + "food": "", + "price range": "", + "name": "", + "area": "", + "book time": "", + "book day": "", + "book people": "" + }, + "taxi": { + "leave at": "", + "destination": "", + "departure": "", + "arrive by": "" + }, + "train": { + "leave at": "21:15", + "destination": "cambridge", + "day": "monday", + "arrive by": "", + "departure": "norwich", + "book people": "" + } + } + }, + { + "speaker": "system", + "utterance": "There are trains arriving at 22:35, 23:35 and 24:35. They are 17.60 pounds and 79 minutes each.", + "utt_idx": 7, + "dialogue_acts": { + "categorical": [], + "non-categorical": [ + { + "intent": "inform", + "domain": "train", + "slot": "price", + "value": "17.60 pounds", + "start": 62, + "end": 74 + }, + { + "intent": "inform", + "domain": "train", + "slot": "duration", + "value": "79 minutes", + "start": 79, + "end": 89 + }, + { + "intent": "inform", + "domain": "train", + "slot": "arrive by", + "value": "22:35", + "start": 29, + "end": 34 + }, + { + "intent": "inform", + "domain": "train", + "slot": "arrive by", + "value": "23:35", + "start": 36, + "end": 41 + }, + { + "intent": "inform", + "domain": "train", + "slot": "arrive by", + "value": "24:35", + "start": 46, + "end": 51 + } + ], + "binary": [] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "Great, thanks! I'm also looking for a 4 star hotel in the north. Can you please help with that? ", + "utt_idx": 8, + "dialogue_acts": { + "categorical": [ + { + "intent": "inform", + "domain": "hotel", + "slot": "area", + "value": "north" + }, + { + "intent": "inform", + "domain": "hotel", + "slot": "stars", + "value": "4" + } + ], + "non-categorical": [], + "binary": [] + }, + "state": { + "attraction": { + "type": "", + "name": "", + "area": "" + }, + "hotel": { + "name": "", + "area": "north", + "parking": "", + "price range": "", + "stars": "4", + "internet": "", + "type": "", + "book stay": "", + "book day": "", + "book people": "" + }, + "restaurant": { + "food": "", + "price range": "", + "name": "", + "area": "", + "book time": "", + "book day": "", + "book people": "" + }, + "taxi": { + "leave at": "", + "destination": "", + "departure": "", + "arrive by": "" + }, + "train": { + "leave at": "21:15", + "destination": "cambridge", + "day": "monday", + "arrive by": "", + "departure": "norwich", + "book people": "" + } + } + }, + { + "speaker": "system", + "utterance": "There are several that meet your criteria. Is there a price range you are interested in?", + "utt_idx": 9, + "dialogue_acts": { + "categorical": [], + "non-categorical": [ + { + "intent": "inform", + "domain": "hotel", + "slot": "choice", + "value": "several", + "start": 10, + "end": 17 + } + ], + "binary": [ + { + "intent": "request", + "domain": "hotel", + "slot": "price range", + "value": "" + } + ] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "I would like to keep it in the moderate range, please.", + "utt_idx": 10, + "dialogue_acts": { + "categorical": [ + { + "intent": "inform", + "domain": "hotel", + "slot": "price range", + "value": "moderate" + } + ], + "non-categorical": [], + "binary": [] + }, + "state": { + "attraction": { + "type": "", + "name": "", + "area": "" + }, + "hotel": { + "name": "", + "area": "north", + "parking": "", + "price range": "moderate", + "stars": "4", + "internet": "", + "type": "", + "book stay": "", + "book day": "", + "book people": "" + }, + "restaurant": { + "food": "", + "price range": "", + "name": "", + "area": "", + "book time": "", + "book day": "", + "book people": "" + }, + "taxi": { + "leave at": "", + "destination": "", + "departure": "", + "arrive by": "" + }, + "train": { + "leave at": "21:15", + "destination": "cambridge", + "day": "monday", + "arrive by": "", + "departure": "norwich", + "book people": "" + } + } + }, + { + "speaker": "system", + "utterance": "I see 7 options. They are all guesthouses. Do you have preferences for internet and parking?", + "utt_idx": 11, + "dialogue_acts": { + "categorical": [], + "non-categorical": [ + { + "intent": "inform", + "domain": "hotel", + "slot": "type", + "value": "guesthouses", + "start": 30, + "end": 41 + }, + { + "intent": "inform", + "domain": "hotel", + "slot": "choice", + "value": "7", + "start": 6, + "end": 7 + }, + { + "intent": "inform", + "domain": "hotel", + "slot": "choice", + "value": "all", + "start": 26, + "end": 29 + } + ], + "binary": [ + { + "intent": "request", + "domain": "hotel", + "slot": "internet", + "value": "" + }, + { + "intent": "request", + "domain": "hotel", + "slot": "parking", + "value": "" + } + ] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "I do not have a preference on those things.", + "utt_idx": 12, + "dialogue_acts": { + "categorical": [], + "non-categorical": [], + "binary": [] + }, + "state": { + "attraction": { + "type": "", + "name": "", + "area": "" + }, + "hotel": { + "name": "", + "area": "north", + "parking": "", + "price range": "moderate", + "stars": "4", + "internet": "", + "type": "", + "book stay": "", + "book day": "", + "book people": "" + }, + "restaurant": { + "food": "", + "price range": "", + "name": "", + "area": "", + "book time": "", + "book day": "", + "book people": "" + }, + "taxi": { + "leave at": "", + "destination": "", + "departure": "", + "arrive by": "" + }, + "train": { + "leave at": "21:15", + "destination": "cambridge", + "day": "monday", + "arrive by": "", + "departure": "norwich", + "book people": "" + } + } + }, + { + "speaker": "system", + "utterance": "Shall I try to start and book you into one?", + "utt_idx": 13, + "dialogue_acts": { + "categorical": [], + "non-categorical": [], + "binary": [ + { + "intent": "inform", + "domain": "booking", + "slot": "", + "value": "" + } + ] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "Please book it and provide the reference number.", + "utt_idx": 14, + "dialogue_acts": { + "categorical": [], + "non-categorical": [], + "binary": [ + { + "intent": "request", + "domain": "hotel", + "slot": "ref", + "value": "" + } + ] + }, + "state": { + "attraction": { + "type": "", + "name": "", + "area": "" + }, + "hotel": { + "name": "", + "area": "north", + "parking": "", + "price range": "moderate", + "stars": "4", + "internet": "", + "type": "", + "book stay": "", + "book day": "", + "book people": "" + }, + "restaurant": { + "food": "", + "price range": "", + "name": "", + "area": "", + "book time": "", + "book day": "", + "book people": "" + }, + "taxi": { + "leave at": "", + "destination": "", + "departure": "", + "arrive by": "" + }, + "train": { + "leave at": "21:15", + "destination": "cambridge", + "day": "monday", + "arrive by": "", + "departure": "norwich", + "book people": "" + } + } + }, + { + "speaker": "system", + "utterance": "How many nights will you be staying?", + "utt_idx": 15, + "dialogue_acts": { + "categorical": [], + "non-categorical": [], + "binary": [ + { + "intent": "request", + "domain": "booking", + "slot": "book stay", + "value": "" + } + ] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "Please book the hotel for 4 people and 2 nights. Thank you.", + "utt_idx": 16, + "dialogue_acts": { + "categorical": [], + "non-categorical": [ + { + "intent": "inform", + "domain": "hotel", + "slot": "book stay", + "value": "2", + "start": 39, + "end": 40 + }, + { + "intent": "inform", + "domain": "hotel", + "slot": "book people", + "value": "4", + "start": 26, + "end": 27 + } + ], + "binary": [] + }, + "state": { + "attraction": { + "type": "", + "name": "", + "area": "" + }, + "hotel": { + "name": "", + "area": "north", + "parking": "", + "price range": "moderate", + "stars": "4", + "internet": "", + "type": "", + "book stay": "", + "book day": "", + "book people": "" + }, + "restaurant": { + "food": "", + "price range": "", + "name": "", + "area": "", + "book time": "", + "book day": "", + "book people": "" + }, + "taxi": { + "leave at": "", + "destination": "", + "departure": "", + "arrive by": "" + }, + "train": { + "leave at": "21:15", + "destination": "cambridge", + "day": "monday", + "arrive by": "", + "departure": "norwich", + "book people": "" + } + } + }, + { + "speaker": "system", + "utterance": "What day will you be arriving?", + "utt_idx": 17, + "dialogue_acts": { + "categorical": [], + "non-categorical": [], + "binary": [ + { + "intent": "request", + "domain": "booking", + "slot": "day", + "value": "" + } + ] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "the same day... monday please.", + "utt_idx": 18, + "dialogue_acts": { + "categorical": [ + { + "intent": "inform", + "domain": "hotel", + "slot": "book day", + "value": "monday" + } + ], + "non-categorical": [], + "binary": [] + }, + "state": { + "attraction": { + "type": "", + "name": "", + "area": "" + }, + "hotel": { + "name": "", + "area": "north", + "parking": "", + "price range": "moderate", + "stars": "4", + "internet": "", + "type": "", + "book stay": "2", + "book day": "monday", + "book people": "4" + }, + "restaurant": { + "food": "", + "price range": "", + "name": "", + "area": "", + "book time": "", + "book day": "", + "book people": "" + }, + "taxi": { + "leave at": "", + "destination": "", + "departure": "", + "arrive by": "" + }, + "train": { + "leave at": "21:15", + "destination": "cambridge", + "day": "monday", + "arrive by": "", + "departure": "norwich", + "book people": "" + } + } + }, + { + "speaker": "system", + "utterance": "I apologize, some of our new workers have phobias of the booking system, I booked you at Acorn guest house, the reference number is 04CSEO7Q any other questions?", + "utt_idx": 19, + "dialogue_acts": { + "categorical": [], + "non-categorical": [ + { + "intent": "book", + "domain": "booking", + "slot": "ref", + "value": "04CSEO7Q", + "start": 132, + "end": 140 + }, + { + "intent": "book", + "domain": "booking", + "slot": "name", + "value": "Acorn guest house", + "start": 89, + "end": 106 + } + ], + "binary": [ + { + "intent": "reqmore", + "domain": "general", + "slot": "", + "value": "" + } + ] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "That's all I need for today. Thanks for your help!", + "utt_idx": 20, + "dialogue_acts": { + "categorical": [], + "non-categorical": [], + "binary": [ + { + "intent": "thank", + "domain": "general", + "slot": "", + "value": "" + } + ] + }, + "state": { + "attraction": { + "type": "", + "name": "", + "area": "" + }, + "hotel": { + "name": "", + "area": "north", + "parking": "", + "price range": "moderate", + "stars": "4", + "internet": "", + "type": "", + "book stay": "2", + "book day": "monday", + "book people": "4" + }, + "restaurant": { + "food": "", + "price range": "", + "name": "", + "area": "", + "book time": "", + "book day": "", + "book people": "" + }, + "taxi": { + "leave at": "", + "destination": "", + "departure": "", + "arrive by": "" + }, + "train": { + "leave at": "21:15", + "destination": "cambridge", + "day": "monday", + "arrive by": "", + "departure": "norwich", + "book people": "" + } + } + }, + { + "speaker": "system", + "utterance": "I'm happy to help, and I hope you enjoy your stay!", + "utt_idx": 21, + "dialogue_acts": { + "categorical": [], + "non-categorical": [], + "binary": [ + { + "intent": "bye", + "domain": "general", + "slot": "", + "value": "" + } + ] + }, + "db_results": {} + } + ] + }, + { + "dataset": "multiwoz21", + "data_split": "train", + "dialogue_id": "multiwoz21-train-6", + "original_id": "MUL2395.json", + "domains": [ + "attraction", + "restaurant", + "general" + ], + "goal": { + "description": "You are planning your trip in Cambridge. You are looking for <span class='emphasis'>places to go</span> in town. The attraction should be in the <span class='emphasis'>centre</span>. Make sure you get <span class='emphasis'>entrance fee</span>, <span class='emphasis'>attraction type</span>, and <span class='emphasis'>address</span>. You are also looking for a <span class='emphasis'>place to dine</span>. The restaurant should be in the <span class='emphasis'>expensive</span> price range and should be <span class='emphasis'>in the same area as the attraction</span>. The restaurant should serve <span class='emphasis'>caribbean</span> food. If there is no such restaurant, how about one that serves <span class='emphasis'>turkish</span> food. Make sure you get <span class='emphasis'>phone number</span>, <span class='emphasis'>postcode</span>, and <span class='emphasis'>address</span>", + "inform": { + "attraction": { + "area": "east|centre" + }, + "restaurant": { + "food": "caribbean|turkish", + "price range": "expensive|expensive", + "area": "centre|centre" + } + }, + "request": { + "attraction": { + "entrance fee": "", + "type": "", + "address": "" + }, + "restaurant": { + "phone": "", + "postcode": "", + "address": "" + } + } + }, + "turns": [ + { + "speaker": "user", + "utterance": "I am planning a trip to town and want to sight see a bit. Can you let me know some attractions I may be interested in?", + "utt_idx": 0, + "dialogue_acts": { + "categorical": [], + "non-categorical": [], + "binary": [ + { + "intent": "inform", + "domain": "attraction", + "slot": "", + "value": "" + } + ] + }, + "state": { + "attraction": { + "type": "", + "name": "", + "area": "" + }, + "hotel": { + "name": "", + "area": "", + "parking": "", + "price range": "", + "stars": "", + "internet": "", + "type": "", + "book stay": "", + "book day": "", + "book people": "" + }, + "restaurant": { + "food": "", + "price range": "", + "name": "", + "area": "", + "book time": "", + "book day": "", + "book people": "" + }, + "taxi": { + "leave at": "", + "destination": "", + "departure": "", + "arrive by": "" + }, + "train": { + "leave at": "", + "destination": "", + "day": "", + "arrive by": "", + "departure": "", + "book people": "" + } + } + }, + { + "speaker": "system", + "utterance": "There are 79 attractions in the city. Do you have a specific type of attraction or specific area of the attraction that you are interested in?", + "utt_idx": 1, + "dialogue_acts": { + "categorical": [], + "non-categorical": [ + { + "intent": "inform", + "domain": "attraction", + "slot": "choice", + "value": "79", + "start": 10, + "end": 12 + } + ], + "binary": [ + { + "intent": "request", + "domain": "attraction", + "slot": "area", + "value": "" + }, + { + "intent": "request", + "domain": "attraction", + "slot": "type", + "value": "" + } + ] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "I'd like something in the centre.", + "utt_idx": 2, + "dialogue_acts": { + "categorical": [ + { + "intent": "inform", + "domain": "attraction", + "slot": "area", + "value": "centre" + } + ], + "non-categorical": [], + "binary": [] + }, + "state": { + "attraction": { + "type": "", + "name": "", + "area": "centre" + }, + "hotel": { + "name": "", + "area": "", + "parking": "", + "price range": "", + "stars": "", + "internet": "", + "type": "", + "book stay": "", + "book day": "", + "book people": "" + }, + "restaurant": { + "food": "", + "price range": "", + "name": "", + "area": "", + "book time": "", + "book day": "", + "book people": "" + }, + "taxi": { + "leave at": "", + "destination": "", + "departure": "", + "arrive by": "" + }, + "train": { + "leave at": "", + "destination": "", + "day": "", + "arrive by": "", + "departure": "", + "book people": "" + } + } + }, + { + "speaker": "system", + "utterance": "Great! How about All Saints Church? It has fantastic architecture and free entrance. Would you like more information?", + "utt_idx": 3, + "dialogue_acts": { + "categorical": [ + { + "intent": "recommend", + "domain": "attraction", + "slot": "type", + "value": "architecture" + } + ], + "non-categorical": [ + { + "intent": "recommend", + "domain": "attraction", + "slot": "entrance fee", + "value": "free", + "start": 70, + "end": 74 + }, + { + "intent": "recommend", + "domain": "attraction", + "slot": "name", + "value": "All Saints Church", + "start": 17, + "end": 34 + } + ], + "binary": [] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "Could I get the address for it? I would also like an expensive place to eat around it. ", + "utt_idx": 4, + "dialogue_acts": { + "categorical": [], + "non-categorical": [], + "binary": [ + { + "intent": "request", + "domain": "attraction", + "slot": "address", + "value": "" + } + ] + }, + "state": { + "attraction": { + "type": "", + "name": "", + "area": "centre" + }, + "hotel": { + "name": "", + "area": "", + "parking": "", + "price range": "", + "stars": "", + "internet": "", + "type": "", + "book stay": "", + "book day": "", + "book people": "" + }, + "restaurant": { + "food": "", + "price range": "expensive", + "name": "", + "area": "centre", + "book time": "", + "book day": "", + "book people": "" + }, + "taxi": { + "leave at": "", + "destination": "", + "departure": "", + "arrive by": "" + }, + "train": { + "leave at": "", + "destination": "", + "day": "", + "arrive by": "", + "departure": "", + "book people": "" + } + } + }, + { + "speaker": "system", + "utterance": "The address is Jesus Lane. What type of food would you like to eat?", + "utt_idx": 5, + "dialogue_acts": { + "categorical": [], + "non-categorical": [ + { + "intent": "inform", + "domain": "attraction", + "slot": "address", + "value": "Jesus Lane", + "start": 15, + "end": 25 + } + ], + "binary": [ + { + "intent": "request", + "domain": "restaurant", + "slot": "food", + "value": "" + } + ] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "I'm in the mood for Caribbean food.", + "utt_idx": 6, + "dialogue_acts": { + "categorical": [], + "non-categorical": [ + { + "intent": "inform", + "domain": "restaurant", + "slot": "food", + "value": "Caribbean", + "start": 20, + "end": 29 + } + ], + "binary": [] + }, + "state": { + "attraction": { + "type": "", + "name": "", + "area": "centre" + }, + "hotel": { + "name": "", + "area": "", + "parking": "", + "price range": "", + "stars": "", + "internet": "", + "type": "", + "book stay": "", + "book day": "", + "book people": "" + }, + "restaurant": { + "food": "caribbean", + "price range": "expensive", + "name": "", + "area": "centre", + "book time": "", + "book day": "", + "book people": "" + }, + "taxi": { + "leave at": "", + "destination": "", + "departure": "", + "arrive by": "" + }, + "train": { + "leave at": "", + "destination": "", + "day": "", + "arrive by": "", + "departure": "", + "book people": "" + } + } + }, + { + "speaker": "system", + "utterance": "Unfortunately, there are no expensive caribbean restaurants in that area. Would you like me to check a different type of cuisine, or another area?", + "utt_idx": 7, + "dialogue_acts": { + "categorical": [ + { + "intent": "nooffer", + "domain": "restaurant", + "slot": "price range", + "value": "expensive" + } + ], + "non-categorical": [ + { + "intent": "nooffer", + "domain": "restaurant", + "slot": "food", + "value": "caribbean", + "start": 38, + "end": 47 + } + ], + "binary": [ + { + "intent": "request", + "domain": "restaurant", + "slot": "food", + "value": "" + }, + { + "intent": "request", + "domain": "restaurant", + "slot": "area", + "value": "" + } + ] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "Yes, I also love turkish food. Is there something in the center that's expensive? Also, what type of attraction is All Saints Church?", + "utt_idx": 8, + "dialogue_acts": { + "categorical": [ + { + "intent": "inform", + "domain": "restaurant", + "slot": "price range", + "value": "expensive" + } + ], + "non-categorical": [ + { + "intent": "inform", + "domain": "attraction", + "slot": "name", + "value": "All Saints Church", + "start": 115, + "end": 132 + }, + { + "intent": "inform", + "domain": "restaurant", + "slot": "food", + "value": "turkish", + "start": 17, + "end": 24 + } + ], + "binary": [ + { + "intent": "request", + "domain": "attraction", + "slot": "type", + "value": "" + } + ] + }, + "state": { + "attraction": { + "type": "", + "name": "all saints church", + "area": "centre" + }, + "hotel": { + "name": "", + "area": "", + "parking": "", + "price range": "", + "stars": "", + "internet": "", + "type": "", + "book stay": "", + "book day": "", + "book people": "" + }, + "restaurant": { + "food": "turkish", + "price range": "expensive", + "name": "", + "area": "centre", + "book time": "", + "book day": "", + "book people": "" + }, + "taxi": { + "leave at": "", + "destination": "", + "departure": "", + "arrive by": "" + }, + "train": { + "leave at": "", + "destination": "", + "day": "", + "arrive by": "", + "departure": "", + "book people": "" + } + } + }, + { + "speaker": "system", + "utterance": "The Meze Bar Restaurant is an expensive Turkish restaurant in the city centre. The All Saints Church is an architecture attraction.", + "utt_idx": 9, + "dialogue_acts": { + "categorical": [ + { + "intent": "inform", + "domain": "restaurant", + "slot": "area", + "value": "the city centre" + }, + { + "intent": "inform", + "domain": "restaurant", + "slot": "price range", + "value": "expensive" + }, + { + "intent": "inform", + "domain": "attraction", + "slot": "type", + "value": "architecture" + } + ], + "non-categorical": [ + { + "intent": "inform", + "domain": "restaurant", + "slot": "food", + "value": "Turkish", + "start": 40, + "end": 47 + }, + { + "intent": "inform", + "domain": "restaurant", + "slot": "name", + "value": "The Meze Bar Restaurant", + "start": 0, + "end": 23 + }, + { + "intent": "inform", + "domain": "attraction", + "slot": "name", + "value": "The All Saints Church", + "start": 80, + "end": 101 + } + ], + "binary": [] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "Can I have the phone number, address and postcode for the restaurant please?", + "utt_idx": 10, + "dialogue_acts": { + "categorical": [], + "non-categorical": [], + "binary": [ + { + "intent": "request", + "domain": "restaurant", + "slot": "phone", + "value": "" + }, + { + "intent": "request", + "domain": "restaurant", + "slot": "postcode", + "value": "" + }, + { + "intent": "request", + "domain": "restaurant", + "slot": "address", + "value": "" + } + ] + }, + "state": { + "attraction": { + "type": "", + "name": "all saints church", + "area": "centre" + }, + "hotel": { + "name": "", + "area": "", + "parking": "", + "price range": "", + "stars": "", + "internet": "", + "type": "", + "book stay": "", + "book day": "", + "book people": "" + }, + "restaurant": { + "food": "turkish", + "price range": "expensive", + "name": "meze bar", + "area": "centre", + "book time": "", + "book day": "", + "book people": "" + }, + "taxi": { + "leave at": "", + "destination": "", + "departure": "", + "arrive by": "" + }, + "train": { + "leave at": "", + "destination": "", + "day": "", + "arrive by": "", + "departure": "", + "book people": "" + } + } + }, + { + "speaker": "system", + "utterance": "I don't have a phone number available, but the address is 196 Mill Road City Centre, postal code cb13nf. Anything else I can do for you? ", + "utt_idx": 11, + "dialogue_acts": { + "categorical": [], + "non-categorical": [ + { + "intent": "inform", + "domain": "restaurant", + "slot": "postcode", + "value": "cb13nf", + "start": 97, + "end": 103 + }, + { + "intent": "inform", + "domain": "restaurant", + "slot": "address", + "value": "196 Mill Road City Centre", + "start": 58, + "end": 83 + } + ], + "binary": [ + { + "intent": "reqmore", + "domain": "general", + "slot": "", + "value": "" + } + ] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "Great that's all that I needed to know, thank you!", + "utt_idx": 12, + "dialogue_acts": { + "categorical": [], + "non-categorical": [], + "binary": [ + { + "intent": "thank", + "domain": "general", + "slot": "", + "value": "" + } + ] + }, + "state": { + "attraction": { + "type": "", + "name": "all saints church", + "area": "centre" + }, + "hotel": { + "name": "", + "area": "", + "parking": "", + "price range": "", + "stars": "", + "internet": "", + "type": "", + "book stay": "", + "book day": "", + "book people": "" + }, + "restaurant": { + "food": "turkish", + "price range": "expensive", + "name": "meze bar", + "area": "centre", + "book time": "", + "book day": "", + "book people": "" + }, + "taxi": { + "leave at": "", + "destination": "", + "departure": "", + "arrive by": "" + }, + "train": { + "leave at": "", + "destination": "", + "day": "", + "arrive by": "", + "departure": "", + "book people": "" + } + } + }, + { + "speaker": "system", + "utterance": "You're welcome, feel free to contact us about anything else you need.", + "utt_idx": 13, + "dialogue_acts": { + "categorical": [], + "non-categorical": [], + "binary": [ + { + "intent": "bye", + "domain": "general", + "slot": "", + "value": "" + }, + { + "intent": "welcome", + "domain": "general", + "slot": "", + "value": "" + } + ] + }, + "db_results": {} + } + ] + }, + { + "dataset": "multiwoz21", + "data_split": "train", + "dialogue_id": "multiwoz21-train-7", + "original_id": "SNG0190.json", + "domains": [ + "hospital", + "general" + ], + "goal": { + "description": "You got injured and are looking for a <span class='emphasis'>hospital</span> nearby. Make sure you get <span class='emphasis'>address</span> and <span class='emphasis'>phone number</span>", + "inform": { + "hospital": {} + }, + "request": { + "hospital": { + "address": "", + "phone": "" + } + } + }, + "turns": [ + { + "speaker": "user", + "utterance": "I've been injured and need to find a hospital nearby.", + "utt_idx": 0, + "dialogue_acts": { + "categorical": [], + "non-categorical": [], + "binary": [ + { + "intent": "inform", + "domain": "hospital", + "slot": "", + "value": "" + } + ] + }, + "state": { + "attraction": { + "type": "", + "name": "", + "area": "" + }, + "hotel": { + "name": "", + "area": "", + "parking": "", + "price range": "", + "stars": "", + "internet": "", + "type": "", + "book stay": "", + "book day": "", + "book people": "" + }, + "restaurant": { + "food": "", + "price range": "", + "name": "", + "area": "", + "book time": "", + "book day": "", + "book people": "" + }, + "taxi": { + "leave at": "", + "destination": "", + "departure": "", + "arrive by": "" + }, + "train": { + "leave at": "", + "destination": "", + "day": "", + "arrive by": "", + "departure": "", + "book people": "" + } + } + }, + { + "speaker": "system", + "utterance": "The Addenbrookes Hospital is located at Hills Rd., Cambridge. Are you looking for a specific department?", + "utt_idx": 1, + "dialogue_acts": { + "categorical": [], + "non-categorical": [], + "binary": [ + { + "intent": "request", + "domain": "hospital", + "slot": "department", + "value": "" + } + ] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "No, I just need the general phone number, please.", + "utt_idx": 2, + "dialogue_acts": { + "categorical": [], + "non-categorical": [], + "binary": [ + { + "intent": "request", + "domain": "hospital", + "slot": "phone", + "value": "" + } + ] + }, + "state": { + "attraction": { + "type": "", + "name": "", + "area": "" + }, + "hotel": { + "name": "", + "area": "", + "parking": "", + "price range": "", + "stars": "", + "internet": "", + "type": "", + "book stay": "", + "book day": "", + "book people": "" + }, + "restaurant": { + "food": "", + "price range": "", + "name": "", + "area": "", + "book time": "", + "book day": "", + "book people": "" + }, + "taxi": { + "leave at": "", + "destination": "", + "departure": "", + "arrive by": "" + }, + "train": { + "leave at": "", + "destination": "", + "day": "", + "arrive by": "", + "departure": "", + "book people": "" + } + } + }, + { + "speaker": "system", + "utterance": "01223245151, is the number. Do you need anything else?", + "utt_idx": 3, + "dialogue_acts": { + "categorical": [], + "non-categorical": [ + { + "intent": "inform", + "domain": "hospital", + "slot": "phone", + "value": "01223245151", + "start": 0, + "end": 11 + } + ], + "binary": [] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "Thank you that will do.", + "utt_idx": 4, + "dialogue_acts": { + "categorical": [], + "non-categorical": [], + "binary": [ + { + "intent": "thank", + "domain": "general", + "slot": "", + "value": "" + } + ] + }, + "state": { + "attraction": { + "type": "", + "name": "", + "area": "" + }, + "hotel": { + "name": "", + "area": "", + "parking": "", + "price range": "", + "stars": "", + "internet": "", + "type": "", + "book stay": "", + "book day": "", + "book people": "" + }, + "restaurant": { + "food": "", + "price range": "", + "name": "", + "area": "", + "book time": "", + "book day": "", + "book people": "" + }, + "taxi": { + "leave at": "", + "destination": "", + "departure": "", + "arrive by": "" + }, + "train": { + "leave at": "", + "destination": "", + "day": "", + "arrive by": "", + "departure": "", + "book people": "" + } + } + }, + { + "speaker": "system", + "utterance": "Feel better soon! Have a better night! ", + "utt_idx": 5, + "dialogue_acts": { + "categorical": [], + "non-categorical": [], + "binary": [ + { + "intent": "bye", + "domain": "general", + "slot": "", + "value": "" + } + ] + }, + "db_results": {} + } + ] + }, + { + "dataset": "multiwoz21", + "data_split": "train", + "dialogue_id": "multiwoz21-train-8", + "original_id": "PMUL1170.json", + "domains": [ + "attraction", + "train", + "general" + ], + "goal": { + "description": "You are planning your trip in Cambridge. You are looking for a <span class='emphasis'>train</span>. The train should depart from <span class='emphasis'>cambridge</span> and should leave on <span class='emphasis'>wednesday</span>. The train should <span class='emphasis'>leave after 12:45</span> and should go to <span class='emphasis'>peterborough</span>. Make sure you get <span class='emphasis'>train ID</span>, <span class='emphasis'>arrival time</span>, and <span class='emphasis'>price</span>. You are also looking for <span class='emphasis'>places to go</span> in town. The attraction should be in the <span class='emphasis'>centre</span> and should be in the type of <span class='emphasis'>museum</span>. Make sure you get <span class='emphasis'>address</span>, <span class='emphasis'>postcode</span>, and <span class='emphasis'>phone number</span>", + "inform": { + "attraction": { + "type": "museum", + "area": "centre" + }, + "train": { + "leave at": "12:45", + "destination": "peterborough", + "day": "wednesday", + "departure": "cambridge" + } + }, + "request": { + "attraction": { + "address": "", + "postcode": "", + "phone": "" + }, + "train": { + "train id": "", + "arrive by": "", + "price": "" + } + } + }, + "turns": [ + { + "speaker": "user", + "utterance": "I need to take a train out of Cambridge, I will be leaving town on Wednesday.", + "utt_idx": 0, + "dialogue_acts": { + "categorical": [ + { + "intent": "inform", + "domain": "train", + "slot": "day", + "value": "wednesday" + } + ], + "non-categorical": [ + { + "intent": "inform", + "domain": "train", + "slot": "departure", + "value": "Cambridge", + "start": 30, + "end": 39 + } + ], + "binary": [] + }, + "state": { + "attraction": { + "type": "", + "name": "", + "area": "" + }, + "hotel": { + "name": "", + "area": "", + "parking": "", + "price range": "", + "stars": "", + "internet": "", + "type": "", + "book stay": "", + "book day": "", + "book people": "" + }, + "restaurant": { + "food": "", + "price range": "", + "name": "", + "area": "", + "book time": "", + "book day": "", + "book people": "" + }, + "taxi": { + "leave at": "", + "destination": "", + "departure": "", + "arrive by": "" + }, + "train": { + "leave at": "", + "destination": "", + "day": "wednesday", + "arrive by": "", + "departure": "cambridge", + "book people": "" + } + } + }, + { + "speaker": "system", + "utterance": "There are 5 trains out of Cambridge on Wednesday. Do you have a departure time in mind? ", + "utt_idx": 1, + "dialogue_acts": { + "categorical": [ + { + "intent": "inform", + "domain": "train", + "slot": "day", + "value": "Wednesday" + } + ], + "non-categorical": [ + { + "intent": "inform", + "domain": "train", + "slot": "choice", + "value": "5", + "start": 10, + "end": 11 + }, + { + "intent": "inform", + "domain": "train", + "slot": "departure", + "value": "Cambridge", + "start": 26, + "end": 35 + } + ], + "binary": [ + { + "intent": "request", + "domain": "train", + "slot": "leave at", + "value": "" + } + ] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "I'd like to go to peterborough and leave after 12:45, I have to attend a meeting beforehand.", + "utt_idx": 2, + "dialogue_acts": { + "categorical": [], + "non-categorical": [ + { + "intent": "inform", + "domain": "train", + "slot": "destination", + "value": "peterborough", + "start": 18, + "end": 30 + }, + { + "intent": "inform", + "domain": "train", + "slot": "leave at", + "value": "12:45", + "start": 47, + "end": 52 + } + ], + "binary": [] + }, + "state": { + "attraction": { + "type": "", + "name": "", + "area": "" + }, + "hotel": { + "name": "", + "area": "", + "parking": "", + "price range": "", + "stars": "", + "internet": "", + "type": "", + "book stay": "", + "book day": "", + "book people": "" + }, + "restaurant": { + "food": "", + "price range": "", + "name": "", + "area": "", + "book time": "", + "book day": "", + "book people": "" + }, + "taxi": { + "leave at": "", + "destination": "", + "departure": "", + "arrive by": "" + }, + "train": { + "leave at": "12:45", + "destination": "peterborough", + "day": "wednesday", + "arrive by": "", + "departure": "cambridge", + "book people": "" + } + } + }, + { + "speaker": "system", + "utterance": "TR1879 leaves at 13:06 on Wednesday. Will that work for you?", + "utt_idx": 3, + "dialogue_acts": { + "categorical": [ + { + "intent": "inform", + "domain": "train", + "slot": "day", + "value": "Wednesday" + } + ], + "non-categorical": [ + { + "intent": "inform", + "domain": "train", + "slot": "leave at", + "value": "13:06", + "start": 17, + "end": 22 + }, + { + "intent": "inform", + "domain": "train", + "slot": "train id", + "value": "TR1879", + "start": 0, + "end": 6 + } + ], + "binary": [] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "What is the price of the fair and could you tell me what is the arrival time into Peterborough?", + "utt_idx": 4, + "dialogue_acts": { + "categorical": [], + "non-categorical": [], + "binary": [ + { + "intent": "request", + "domain": "train", + "slot": "arrive by", + "value": "" + }, + { + "intent": "request", + "domain": "train", + "slot": "price", + "value": "" + } + ] + }, + "state": { + "attraction": { + "type": "", + "name": "", + "area": "" + }, + "hotel": { + "name": "", + "area": "", + "parking": "", + "price range": "", + "stars": "", + "internet": "", + "type": "", + "book stay": "", + "book day": "", + "book people": "" + }, + "restaurant": { + "food": "", + "price range": "", + "name": "", + "area": "", + "book time": "", + "book day": "", + "book people": "" + }, + "taxi": { + "leave at": "", + "destination": "", + "departure": "", + "arrive by": "" + }, + "train": { + "leave at": "12:45", + "destination": "peterborough", + "day": "wednesday", + "arrive by": "", + "departure": "cambridge", + "book people": "" + } + } + }, + { + "speaker": "system", + "utterance": "The price is 16.50 pounds and the train will arrive in Peterborough by 13:56 Wednesday. Would you like me to book a ticket for you?", + "utt_idx": 5, + "dialogue_acts": { + "categorical": [], + "non-categorical": [ + { + "intent": "inform", + "domain": "train", + "slot": "price", + "value": "16.50 pounds", + "start": 13, + "end": 25 + }, + { + "intent": "inform", + "domain": "train", + "slot": "destination", + "value": "Peterborough", + "start": 55, + "end": 67 + }, + { + "intent": "inform", + "domain": "train", + "slot": "arrive by", + "value": "13:56", + "start": 71, + "end": 76 + } + ], + "binary": [ + { + "intent": "offerbook", + "domain": "train", + "slot": "", + "value": "" + } + ] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "Yes, one ticket please, can I also get the reference number?", + "utt_idx": 6, + "dialogue_acts": { + "categorical": [], + "non-categorical": [ + { + "intent": "inform", + "domain": "train", + "slot": "book people", + "value": "one", + "start": 5, + "end": 8 + } + ], + "binary": [] + }, + "state": { + "attraction": { + "type": "", + "name": "", + "area": "" + }, + "hotel": { + "name": "", + "area": "", + "parking": "", + "price range": "", + "stars": "", + "internet": "", + "type": "", + "book stay": "", + "book day": "", + "book people": "" + }, + "restaurant": { + "food": "", + "price range": "", + "name": "", + "area": "", + "book time": "", + "book day": "", + "book people": "" + }, + "taxi": { + "leave at": "", + "destination": "", + "departure": "", + "arrive by": "" + }, + "train": { + "leave at": "12:45", + "destination": "peterborough", + "day": "wednesday", + "arrive by": "", + "departure": "cambridge", + "book people": "1" + } + } + }, + { + "speaker": "system", + "utterance": "I booked you one seat on TR1879 departing Cambridge for Peterborough on Wednesday at 13:06, Reference: xu1qlhvw. Can I help further today?", + "utt_idx": 7, + "dialogue_acts": { + "categorical": [ + { + "intent": "offerbooked", + "domain": "train", + "slot": "day", + "value": "Wednesday" + } + ], + "non-categorical": [ + { + "intent": "offerbooked", + "domain": "train", + "slot": "ref", + "value": "xu1qlhvw", + "start": 103, + "end": 111 + }, + { + "intent": "offerbooked", + "domain": "train", + "slot": "departure", + "value": "Cambridge", + "start": 42, + "end": 51 + }, + { + "intent": "offerbooked", + "domain": "train", + "slot": "book people", + "value": "one", + "start": 13, + "end": 16 + }, + { + "intent": "offerbooked", + "domain": "train", + "slot": "train id", + "value": "TR1879", + "start": 25, + "end": 31 + }, + { + "intent": "offerbooked", + "domain": "train", + "slot": "destination", + "value": "Peterborough", + "start": 56, + "end": 68 + }, + { + "intent": "offerbooked", + "domain": "train", + "slot": "leave at", + "value": "13:06", + "start": 85, + "end": 90 + } + ], + "binary": [ + { + "intent": "reqmore", + "domain": "general", + "slot": "", + "value": "" + } + ] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "I am also looking for a museum in centre. Could you give me the address, postcode and phone number of one? ", + "utt_idx": 8, + "dialogue_acts": { + "categorical": [ + { + "intent": "inform", + "domain": "attraction", + "slot": "area", + "value": "centre" + }, + { + "intent": "inform", + "domain": "attraction", + "slot": "type", + "value": "museum" + } + ], + "non-categorical": [], + "binary": [ + { + "intent": "request", + "domain": "attraction", + "slot": "address", + "value": "" + }, + { + "intent": "request", + "domain": "attraction", + "slot": "postcode", + "value": "" + }, + { + "intent": "request", + "domain": "attraction", + "slot": "phone", + "value": "" + } + ] + }, + "state": { + "attraction": { + "type": "museum", + "name": "", + "area": "centre" + }, + "hotel": { + "name": "", + "area": "", + "parking": "", + "price range": "", + "stars": "", + "internet": "", + "type": "", + "book stay": "", + "book day": "", + "book people": "" + }, + "restaurant": { + "food": "", + "price range": "", + "name": "", + "area": "", + "book time": "", + "book day": "", + "book people": "" + }, + "taxi": { + "leave at": "", + "destination": "", + "departure": "", + "arrive by": "" + }, + "train": { + "leave at": "12:45", + "destination": "peterborough", + "day": "wednesday", + "arrive by": "", + "departure": "cambridge", + "book people": "1" + } + } + }, + { + "speaker": "system", + "utterance": "The Broughton House Gallery is located at 98 King Street, postcode cb11ln. Their phone number is 01223314960. Is there anything else you need?", + "utt_idx": 9, + "dialogue_acts": { + "categorical": [], + "non-categorical": [ + { + "intent": "inform", + "domain": "attraction", + "slot": "phone", + "value": "01223314960", + "start": 97, + "end": 108 + }, + { + "intent": "inform", + "domain": "attraction", + "slot": "address", + "value": "98 King Street", + "start": 42, + "end": 56 + }, + { + "intent": "inform", + "domain": "attraction", + "slot": "name", + "value": "Broughton House Gallery", + "start": 4, + "end": 27 + }, + { + "intent": "inform", + "domain": "attraction", + "slot": "postcode", + "value": "cb11ln", + "start": 67, + "end": 73 + } + ], + "binary": [ + { + "intent": "reqmore", + "domain": "general", + "slot": "", + "value": "" + } + ] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "That's all I need right now. Thanks, you have been very helpful!", + "utt_idx": 10, + "dialogue_acts": { + "categorical": [], + "non-categorical": [], + "binary": [ + { + "intent": "thank", + "domain": "general", + "slot": "", + "value": "" + } + ] + }, + "state": { + "attraction": { + "type": "museum", + "name": "", + "area": "centre" + }, + "hotel": { + "name": "", + "area": "", + "parking": "", + "price range": "", + "stars": "", + "internet": "", + "type": "", + "book stay": "", + "book day": "", + "book people": "" + }, + "restaurant": { + "food": "", + "price range": "", + "name": "", + "area": "", + "book time": "", + "book day": "", + "book people": "" + }, + "taxi": { + "leave at": "", + "destination": "", + "departure": "", + "arrive by": "" + }, + "train": { + "leave at": "12:45", + "destination": "peterborough", + "day": "wednesday", + "arrive by": "", + "departure": "cambridge", + "book people": "1" + } + } + }, + { + "speaker": "system", + "utterance": "You're welcome, I hope you enjoy your time in Cambridge!", + "utt_idx": 11, + "dialogue_acts": { + "categorical": [], + "non-categorical": [], + "binary": [ + { + "intent": "bye", + "domain": "general", + "slot": "", + "value": "" + }, + { + "intent": "welcome", + "domain": "general", + "slot": "", + "value": "" + } + ] + }, + "db_results": {} + } + ] + }, + { + "dataset": "multiwoz21", + "data_split": "train", + "dialogue_id": "multiwoz21-train-9", + "original_id": "SNG01741.json", + "domains": [ + "hospital", + "general" + ], + "goal": { + "description": "You want to find a <span class='emphasis'>hospital</span> in town. The hospital should have the <span class='emphasis'>paediatric clinic</span> department. Make sure you get <span class='emphasis'>address</span>, <span class='emphasis'>postcode</span>, and <span class='emphasis'>phone number</span>", + "inform": { + "hospital": { + "department": "paediatric clinic" + } + }, + "request": { + "hospital": { + "address": "", + "postcode": "", + "phone": "" + } + } + }, + "turns": [ + { + "speaker": "user", + "utterance": "I am looking in town for a hospital with a paediatric clinic", + "utt_idx": 0, + "dialogue_acts": { + "categorical": [], + "non-categorical": [ + { + "intent": "inform", + "domain": "hospital", + "slot": "department", + "value": "paediatric clinic", + "start": 44, + "end": 61 + } + ], + "binary": [] + }, + "state": { + "attraction": { + "type": "", + "name": "", + "area": "" + }, + "hotel": { + "name": "", + "area": "", + "parking": "", + "price range": "", + "stars": "", + "internet": "", + "type": "", + "book stay": "", + "book day": "", + "book people": "" + }, + "restaurant": { + "food": "", + "price range": "", + "name": "", + "area": "", + "book time": "", + "book day": "", + "book people": "" + }, + "taxi": { + "leave at": "", + "destination": "", + "departure": "", + "arrive by": "" + }, + "train": { + "leave at": "", + "destination": "", + "day": "", + "arrive by": "", + "departure": "", + "book people": "" + } + } + }, + { + "speaker": "system", + "utterance": "Addenbrookes Hosptial on Hills Rd. in Cambridge has a paediatric clinic.", + "utt_idx": 1, + "dialogue_acts": { + "categorical": [], + "non-categorical": [ + { + "intent": "inform", + "domain": "hospital", + "slot": "department", + "value": "paediatric clinic", + "start": 54, + "end": 71 + } + ], + "binary": [] + }, + "db_results": {} + }, + { + "speaker": "user", + "utterance": "I need the address and phone number and postal code", + "utt_idx": 2, + "dialogue_acts": { + "categorical": [], + "non-categorical": [], + "binary": [ + { + "intent": "request", + "domain": "hospital", + "slot": "address", + "value": "" + }, + { + "intent": "request", + "domain": "hospital", + "slot": "postcode", + "value": "" + }, + { + "intent": "request", + "domain": "hospital", + "slot": "phone", + "value": "" + } + ] + }, + "state": { + "attraction": { + "type": "", + "name": "", + "area": "" + }, + "hotel": { + "name": "", + "area": "", + "parking": "", + "price range": "", + "stars": "", + "internet": "", + "type": "", + "book stay": "", + "book day": "", + "book people": "" + }, + "restaurant": { + "food": "", + "price range": "", + "name": "", + "area": "", + "book time": "", + "book day": "", + "book people": "" + }, + "taxi": { + "leave at": "", + "destination": "", + "departure": "", + "arrive by": "" + }, + "train": { + "leave at": "", + "destination": "", + "day": "", + "arrive by": "", + "departure": "", + "book people": "" + } + } + }, + { + "speaker": "system", + "utterance": "Great. Thank you and goodbye.", + "utt_idx": 3, + "dialogue_acts": { + "categorical": [], + "non-categorical": [], + "binary": [ + { + "intent": "bye", + "domain": "general", + "slot": "", + "value": "" + } + ] + }, + "db_results": {} + } + ] + } +] \ No newline at end of file diff --git a/data/unified_datasets/multiwoz21/ontology.json b/data/unified_datasets/multiwoz21/ontology.json deleted file mode 100644 index b976054f3b6dc47c327c75f545549dfe3563ad01..0000000000000000000000000000000000000000 --- a/data/unified_datasets/multiwoz21/ontology.json +++ /dev/null @@ -1,1895 +0,0 @@ -{ - "domains": { - "taxi": { - "description": "taxi information query system", - "slots": { - "destination": { - "description": "destination of taxi", - "is_categorical": false, - "possible_values": [] - }, - "departure": { - "description": "departure location of taxi", - "is_categorical": false, - "possible_values": [] - }, - "leaveAt": { - "description": "leaving time of taxi", - "is_categorical": false, - "possible_values": [] - }, - "arriveBy": { - "description": "arrival time of taxi", - "is_categorical": false, - "possible_values": [] - }, - "taxi_phone": { - "description": "taxi phone number", - "is_categorical": false, - "possible_values": [] - }, - "taxi_types": { - "description": "taxi type", - "is_categorical": false, - "possible_values": [] - } - } - }, - "restaurant": { - "description": "restaurant information query system", - "slots": { - "food": { - "description": "food type for the restaurant", - "is_categorical": false, - "possible_values": [] - }, - "name": { - "description": "name of the restaurant", - "is_categorical": false, - "possible_values": [] - }, - "pricerange": { - "description": "price budget for the restaurant", - "is_categorical": true, - "possible_values": [ - "not moderate", - "ch", - "high - end", - "luxury options", - "priced cheaply", - "moderate", - "various", - "all price ranges", - "cheap price range", - "low cost", - "assorted", - "cheap priced", - "moderate or expensive priced", - "expensive price", - "your", - "most expensive", - "moderate and expensive", - "reasonable", - "moderately expensive", - "moderate|cheap", - "fairly expensive", - "upscale", - "quite inexpensive", - "boderate", - "affordable", - "expinsive", - "they vary", - "more moderate", - "expensive / upscale", - "high class", - "moderately price", - "reasonably priced", - "a little expensive but worth it", - "middle", - "any price point", - "priced moderately", - "cherap", - "moderately prices", - "cheap to expensive", - "or otherwise", - "inexpensive", - "cheaply priced", - "adforable", - "that range", - "great prices", - "rather expensive", - "expensive side", - "moderate range", - "cheaply - priced", - "more expensive", - "varying price", - "similar price range", - "this price range", - "expensive", - "expensive or moderate", - "not cheap", - "the least expensive", - "pretty expensive", - "high end", - "not expensive", - "less expensive", - "moderatley priced", - "most affordable price range", - "budget - friendly", - "expensive and moderately priced", - "moderatre", - "cheap range", - "or the moderately priced", - "fairly cheap", - "not too expensive", - "east", - "fairly inexpensive", - "regardless of price", - "north", - "mostly expensive and moderately priced", - "cheap", - "budget conscious", - "same price range", - "very expensive", - "not too pricey", - "any price range", - "quite expensive", - "that price range", - "your price range", - "every price point", - "expensively priced", - "varying price range", - "not - so - expensive", - "all", - "vietnamese", - "any", - "moderately - priced", - "that price", - "moderate pricing", - "do nt care", - "modest", - "pricey", - "expensive but worth every penny", - "cheaper", - "pretty cheap", - "moderate price", - "ranging from cheap to expensive", - "quite low", - "moderate priced", - "centre", - "the same", - "expensive range", - "relatively cheap", - "cheap or expensive", - "epensive", - "moderate and one in the cheap range", - "budget friendly", - "fine", - "on the cheap side", - "expensive price range", - "mostly expensive", - "moderately priced", - "relatively expensive", - "moderately", - "moderatly", - "on the pricey side", - "low priced", - "expensively", - "moderate price range", - "moderatly priced", - "do n't care" - ] - }, - "address": { - "description": "exact location of the restaurant", - "is_categorical": false, - "possible_values": [] - }, - "area": { - "description": "area or place of the restaurant", - "is_categorical": true, - "possible_values": [ - "near the centre", - "east area", - "northern parts of town", - "any part of town", - "west cambridge", - "town of centre", - "centre of cambridge", - "north or centre", - "westside", - "city centre", - "the west end", - "the south of town", - "the city center", - "elsewhere in the city", - "town center", - "the west side", - "south of cambridge", - "south area of town", - "here", - "north side of town", - "on the east", - "throughout the city", - "south area", - "centre city", - "city", - "in the north", - "not", - "east side", - "not in the west", - "here in the city", - "the south area of town", - "that part of town", - "the east", - "west of town", - "centere", - "in the city .", - "south part of time", - "mexican", - "the east side", - "south part of town", - "west part of town .", - "southern", - "any where in the city", - "west area", - "north cambridge", - "the south area of the city", - "almost every area of town", - "ely", - "the north end", - "south cambridge", - "the are", - "east side of town", - "southside", - "east side of the city", - "by the airport", - "around there", - "all of cambridge", - "the town centre", - "nearby", - "town centre", - "the center part of town", - "other parts of town", - "not in the south of town", - "chesterton", - "the centre of town", - "east area of town", - "the east area", - "southend", - "ctre", - "same area", - "anywhere in cambridge", - "north side of cambridge", - "same area as the park", - "south side of town", - "the centre area", - "the same area as the botanic gardens", - "south part of the city", - "the area you have chosen", - "in the centre", - "northern part of cambridge", - "east section", - "east|south", - "the east part of town", - "this town", - "the same area", - "west area of town", - "town centre area", - "the southern area", - "northern part", - "southern area", - "the east side of town", - "expensive", - "east section of town", - "same", - "in cambridge", - "north side of chersteron", - "towns centre", - "in town", - "west side", - "in the east", - "centrally", - "west side area", - "all over town", - "centre region", - "anywhere in town", - "closely located", - "west part of town", - "cambridge", - "downtown", - "south end", - "close it city centre", - "in the city centre", - "close to the hotel", - "east", - "north and west", - "west end of town", - "the south part of town", - "the north side", - "any area", - "the area of west", - "center of the town", - "the area", - "the centre part of town", - "north", - "wet part of town", - "other part of the town", - "cheap", - "north part of town", - "centre area of the town", - "cetre", - "in town at all", - "central", - "east part of town", - "near the center of town", - "the city centre", - "north end", - "east of town", - "centreof", - "west", - "the north side of town", - "in the west", - "same side of town as your hotel", - "the west of town", - "centre area of town", - "centrem", - "east of cambridge", - "center of town", - "anywhere", - "throughout the area", - "central area", - "south of town", - "the west area", - "the center of town", - "centre part of town", - "the center", - "north side", - "the south", - "the west side of town", - "west part of the city", - "center", - "any", - "the north", - "east end", - "the west part of town", - "west end", - "center cambridge", - "the centre area of town", - "centre area", - "northside", - "centre area of the city", - "center area of town", - "centrally located", - "the south side", - "around the college", - "do nt care", - "thai", - "the west", - "south side", - "cambridge centre", - "town", - "west of cambridge", - "near clare hall", - "south of town .", - "n the centre", - "centre of the city", - "the north area", - "that", - "this area", - "central region of town", - "centre", - "north of town", - "these areas", - "the same", - "south part", - "that area of town", - "in the city", - "that area", - "city center", - "south", - "close to your location", - "north area of town", - "all over cambridge", - "the south side of town", - "within this area", - "near centre", - "the centre", - "centre of town .", - "the north part of town", - "the north side .", - "same area as the hotel", - "and centre", - "west side of town", - "westies", - "around town", - "east part of tow", - "eastside", - "close to that area", - "all over the city", - "south side of the town", - "centre cambridge", - "do n't care", - "north area", - "centre of town" - ] - }, - "postcode": { - "description": "postcode of the restaurant", - "is_categorical": false, - "possible_values": [] - }, - "phone": { - "description": "restaurant phone number", - "is_categorical": false, - "possible_values": [] - }, - "people": { - "description": "number of people booking the restaurant", - "is_categorical": false, - "possible_values": [] - }, - "time": { - "description": "time of the restaurant booking", - "is_categorical": false, - "possible_values": [] - }, - "day": { - "description": "day of the restaurant booking", - "is_categorical": true, - "possible_values": [ - "saturday", - "wednesday", - "tuesday", - "w", - "saturday|thursday", - "monday", - "friday", - "thursday", - "sunday", - "sunday|thursday" - ] - }, - "choice": { - "description": "number of restaurants meeting requests of user", - "is_categorical": false, - "possible_values": [] - } - } - }, - "attraction": { - "description": "an entertainment that is offered to the public", - "slots": { - "address": { - "description": "details of where the attraction is", - "is_categorical": false, - "possible_values": [] - }, - "postcode": { - "description": "postcode of the attraction", - "is_categorical": false, - "possible_values": [] - }, - "phone": { - "description": "phone number of the attraction", - "is_categorical": false, - "possible_values": [] - }, - "entrance fee": { - "description": "the fee charged for admission to the attraction", - "is_categorical": false, - "possible_values": [] - }, - "type": { - "description": "type of the attraction", - "is_categorical": false, - "possible_values": [] - }, - "area": { - "description": "area or place of the attraction", - "is_categorical": true, - "possible_values": [ - "east area", - "near the centre", - "west cambridge", - "centre of cambridge", - "other areas", - "westside", - "the west - side", - "city centre", - "central district", - "near the restaurant", - "whole of cambridge", - "the west end", - "entre", - "various", - "northern area", - "town center", - "west of your city", - "the west side", - "south of cambridge", - "west part of cambridge", - "norwich", - "throughout the city", - "centre city", - "south area", - "city", - "very close in the same area", - "centre by galleria", - "east side", - "northern", - "that part of town", - "the east", - "same general area", - "went side of town", - "southern cambridge", - "the east side", - "south part of town", - "in that area", - "to the south", - "clifton way", - "there", - "west area", - "in the south", - "center are", - "south cambridge", - "ely", - "east side of town", - "central cambridge", - "south park of town", - "the south area", - "centre area of cambridge", - "west area of the city", - "several different parts of town", - "cambridge 's centre", - "nearby", - "town centre", - "the center part of town", - "near the hotel", - "museum", - "the area you 're looking for", - "other parts of town", - "most of them are in the center", - "the centre of town", - "northern area of town", - "east area of town", - "north of the city", - "different area", - "same area", - "east section", - "south side of town", - "the centre area", - "in the centre", - "in the center", - "western part of town", - "centre area .", - "south part of cambridge", - "the same road", - "cetnre of town", - "the same area", - "town centre area", - "the east part of town", - "center area", - "west area of town", - "cenre", - "center of town .", - "cambridge leisure park", - "church area", - "that area .", - "near ely", - "in that side", - "east cambridge", - "same", - "in cambridge", - "towns centre", - "that side of town", - "another area", - "in town", - "west side", - "west end of the city", - "the centry area", - "in the east", - "we", - "all over town", - "all around the city", - "city centre .", - "center of cambridge", - "centre region", - "west part of town", - "cambridge", - "cent", - "western part of the town", - "downtown", - "south end", - "east", - "this side of town", - "same area as hotel", - "center of the town", - "west end of town", - "the area", - "eat", - "northern cambridge", - "north", - "same part of town as your restaurant", - "the centre part of town", - "north part of town", - "central", - "close to the center of town", - "east part of town", - "same part", - "western cambridge", - "the city centre", - "east of town", - "west", - "the eastside", - "north in milton", - "the west of town", - "centre area of town", - "ce", - "east of cambridge", - "the east of the town", - "close to the restaurant", - "the west area of town", - "center of town", - "the south near your hotel", - "south of town", - "the west area", - "centre part of town", - "the center", - "north side", - "north section of cambridge", - "the north of the city", - "all of the other areas", - "the south", - "all", - "the west side of town", - "west part of the city", - "center", - "the north", - "your desired location", - "wet end", - "center part of town", - "cb30aq", - "west end", - "in the area", - "centre area", - "centrally located", - "the south side", - "city 's centre", - "do nt care", - "the west", - "east end of town", - "every area except the north", - "south side", - "centre of the city", - "that", - "this area", - "centre", - "north of town", - "right in the center of town", - "city cenre", - "centre of town ?", - "that area", - "city center", - "south", - "same area as tandoori palace", - "centre|west", - "central zone", - "the centre", - "the center area", - "west side of town", - "western area of town", - "the center of the park", - "west side of the city", - "eastside", - "northend", - "the area you are looking for", - "all over the city", - "western", - "on the centre", - "museums", - "north area", - "centre of town", - "center of the city" - ] - }, - "name": { - "description": "name of the attraction", - "is_categorical": false, - "possible_values": [] - }, - "pricerange": { - "description": "the price range for the attraction, from cheap to expensive", - "is_categorical": true, - "possible_values": [ - "free", - "no entrance fee", - "entrance fee", - "unaware of what their entrance fee is", - "not listed", - "cheap", - "we do n't have any information", - "2 pounds", - "do n't see a price listed", - "from free to 2 pounds 50", - "4 pounds", - "moderately priced", - "neither prices are listed", - "expensive", - "5 pounds", - "they do n't have the entrance fee posted", - "free admission", - "not sure of the fee", - "5 pound entrance fee", - "do n't have information", - "3.50 pounds" - ] - }, - "choice": { - "description": "number of attractions matching requests of user", - "is_categorical": false, - "possible_values": [] - } - } - }, - "booking": { - "description": "to arrange with a taxi, restaurant, train, etc.", - "slots": { - "time": { - "description": "time for an order", - "is_categorical": false, - "possible_values": [] - }, - "day": { - "description": "day for an order, from monday to sunday", - "is_categorical": true, - "possible_values": [ - "saturday", - "wednesday", - "saturday night thru monday", - "vmhj6y3i", - "6", - "weds", - "tuesday through friday", - "tuesday instead of thursday", - "2", - "that", - "tues", - "sunday 18/06/2017", - "tuesday 's", - "today", - "tonight", - "that time frame", - "tuesday", - "at that time", - "1", - "this evening", - "sunday and monday", - "the same day", - "sundar", - "monday", - "friday", - "thursday", - "sunday", - "we d" - ] - }, - "stay": { - "description": "for how long the user wish to be at a place", - "is_categorical": false, - "possible_values": [] - }, - "people": { - "description": "how many person the order is for", - "is_categorical": false, - "possible_values": [] - }, - "name": { - "description": "name of the ordered place", - "is_categorical": false, - "possible_values": [] - }, - "Ref": { - "description": "reference number of the order", - "is_categorical": false, - "possible_values": [] - } - } - }, - "train": { - "description": "query and order a train", - "slots": { - "destination": { - "description": "destination of the train", - "is_categorical": false, - "possible_values": [] - }, - "arriveBy": { - "description": "arrival time of the train", - "is_categorical": false, - "possible_values": [] - }, - "departure": { - "description": "departure location of the train", - "is_categorical": false, - "possible_values": [] - }, - "duration": { - "description": "the length of time the train trip lasts", - "is_categorical": false, - "possible_values": [] - }, - "people": { - "description": "number of people booking for train", - "is_categorical": false, - "possible_values": [] - }, - "day": { - "description": "day of the train", - "is_categorical": true, - "possible_values": [ - "cambridge", - "fr", - "saturday", - "wednesday", - "13:07", - "everday", - "weds", - "sat", - "sundays", - "train", - "that", - "between friday and wednesday", - "all week", - "this day", - "every day", - "tuesday", - "saturdays", - "tr2519", - "every", - "other days are available", - "mondays", - "we", - "monday", - "friday", - "thursday", - "frday", - "sunday", - "daily", - "that day", - "fiday" - ] - }, - "Ref": { - "description": "reference number of the order", - "is_categorical": false, - "possible_values": [] - }, - "leaveAt": { - "description": "leaving time for the train", - "is_categorical": false, - "possible_values": [] - }, - "price": { - "description": "price for the train ticket", - "is_categorical": false, - "possible_values": [] - }, - "choice": { - "description": "number of trains that meets requests of the user", - "is_categorical": false, - "possible_values": [] - } - } - }, - "hotel": { - "description": "to query hotel information and place an order", - "slots": { - "internet": { - "description": "internet option at the hotel", - "is_categorical": true, - "possible_values": [ - "free internet", - "free", - "y", - "do nt care", - "yes", - "none", - "no", - "4" - ] - }, - "area": { - "description": "area or place of the hotel", - "is_categorical": true, - "possible_values": [ - "near the centre", - "any part of town", - "east area", - "different parts of the city", - "north end of the city", - "centre of cambridge", - "near the restaurant", - "city centre", - "throughout cambridge", - "different part of town", - "eastern part of the city", - "the west end", - "various", - "northern area", - "the northern part of town", - "northern part of town", - "the west side", - "on the west", - "south area of town", - "centrally - located", - "north side of town", - "central location", - "centra", - "throughout the city", - "not in the south", - "south area", - "centre city", - "in the north", - "east side", - "northern side of town", - "different parts of town", - "you are interested in", - "northern", - "that part of town", - "west of town", - "the east", - "all areas", - "all parts of the city", - "the east side", - "south part of town", - "south end of the city", - "there", - "west area", - "north cambridge", - "the north end", - "south cambridge", - "east side of town", - "southside", - "the south area", - "or west ?", - "another area of town", - "all of cambridge", - "the town centre", - "east near other shops and boutiques", - "town centre", - "on the west side", - "all over", - "the centre of the city", - "other parts of town", - "the centre of town", - "east area of town", - "the east area", - "the westside", - "southend", - "west|centre", - "northe part of town", - "anywhere in cambridge", - "different area", - "same area", - "south side of town", - "the centre area", - "on the south", - "in the centre", - "in the center", - "wast", - "all across town", - "south end of town", - "except in the north", - "and east", - "place to be a guesthouse", - "the same area", - "other parts of the city", - "the east side of town", - "somewhere else", - "and in the centre", - "east section of town", - "east cambridge", - "across cambridge", - "north of cambridge", - "next door", - "except in the east", - "in cambridge", - "north part of the city", - "that side of town", - "south cambridge area", - "in town", - "west side", - "west end of the city", - "we", - "centrally", - "west side near the restaurant", - "all over town", - "southern part of town", - "all around the city", - "west part of town", - "cambridge", - "downtown", - "south end", - "east", - "east and the north .", - "norht", - "the north side", - "any area", - "center of the town", - "the area", - "eat", - "northern cambridge", - "north", - "cheap", - "north part of town", - "centre part", - "east part of town", - "western cambridge", - "north end of town", - "north end", - "east of town", - "several areas of town", - "west", - "eastern", - "in the west", - "everywhere but the city centre", - "centre area of town", - "center of town", - "west part", - "the town center", - "south of town", - "the center of town", - "the west area", - "centre part of town", - "nborth", - "north side", - "the north of the city", - "the south", - "west part of the city", - "center", - "the north", - "west end", - "same side of town", - "in the area", - "any part of the city", - "centre area", - "not to far from the restaurant", - "northside", - "on the eastside", - "close to where you 'll be dining", - "the east end of town", - "various parts of the city", - "west areas of town", - "centrally located", - "the south side", - "north part of town .", - "do nt care", - "the west", - "same area as the restaurant", - "that vicinity", - "nearby the restaurant", - "south side", - "east part of time", - "centre of the city", - "south closer to the museum", - "north park of town", - "centre", - "north of town", - "north and centre", - "the same", - "on the north", - "that area of town", - "that area", - "in the city", - "el shaddai", - "city center", - "south", - "the city 's south side", - "north area of town", - "near the museum", - "eastern cambridge", - "near centre", - "the easy", - "a different area", - "the centre", - "the north part of town", - "west side of town", - "across town", - "eastside", - "northend", - "north par of town", - "the centre of cambridge", - "all over the city", - "north location", - "centre cambridge", - "that region", - "north area", - "centre of town" - ] - }, - "stars": { - "description": "star rating of the hotel", - "is_categorical": true, - "possible_values": [ - "4|5", - "several", - "0 to 4", - "no star rating", - "do nt care", - "one", - "does not show", - "foru", - "does not have", - "different star ratings", - "unrated", - "2", - "four starts", - "4-star", - "four - star", - "3|4", - "yes", - "three", - "four", - "lower", - "four stars", - "zero", - "ranging from 2 - 4 stars", - "two", - "1", - "3", - "four star", - "0-star", - "five", - "drop the star rating", - "2-star", - "no", - "3-star", - "5-star", - "0", - "1-star", - "not rated", - "5", - "not as fancy", - "4" - ] - }, - "parking": { - "description": "parking facility at the hotel", - "is_categorical": true, - "possible_values": [ - "n", - "free", - "do nt care", - "yes", - "none", - "no", - "free parking" - ] - }, - "phone": { - "description": "hotel phone number", - "is_categorical": false, - "possible_values": [] - }, - "name": { - "description": "name of the hotel", - "is_categorical": false, - "possible_values": [] - }, - "pricerange": { - "description": "price budget of the hotel", - "is_categorical": true, - "possible_values": [ - "cheapest", - "economically priced", - "free", - "all different price ranges", - "more moderately priced", - "expensively - priced", - "ca n't view that information", - "moderate", - "super cheap", - "low cost", - "cheap price range", - "lower", - "affordable pricing", - "moderate to cheap", - "cheap priced", - "inexpensively - priced", - "mostly in the cheap to moderate price range", - "moderately pricing", - "guesthouses", - "moderate|cheap", - "moderately priceed", - "very cheap", - "even cheaper", - "upscale", - "other ranges", - "moderate or cheap", - "cheaply", - "affordable", - "different price range", - "does not say", - "moderately price", - "expensive or cheap", - "priced moderately", - "moderatly price", - "moderate price point", - "moderately prices", - "cheap to expensive", - "on the more expensive side", - "inexpensive", - "the cheapest", - "cheaply priced", - "that range", - "rather expensive", - "your price", - "moderate range", - "cheaply - priced", - "more expensive", - "expensive", - "much cheaper", - "cheap side", - "slightly more expensive", - "$100", - "cheaper than the others", - "pretty expensive", - "not expensive", - "cheaply prices", - "moderatley priced", - "modrate", - "cheap range", - "espensive", - "chear", - "fairly cheap", - "oderately priced", - "moderate in price", - "different", - "very affordable", - "your chosen", - "unfortunately do not have the price", - "north", - "less costly", - "different price ranges", - "cheap", - "very inexpensive", - "2", - "moderately priced .", - "moderately to expensively priced", - "economical", - "same price range", - "moderate to cheap range", - "cheap to moderate", - "quite expensive", - "that price range", - "cheap|moderate", - "your price range", - "varying price ranges", - "expensive to moderate", - "expensively priced", - "epxensive", - "or expensive ?", - "all", - "higher price range", - "any", - "moderately - priced", - "hotel", - "moderate pricing", - "do nt care", - "lower end", - "quite cheap", - "more budget - friendly", - "cheaper", - "fairly cheap compared to other hotels", - "pretty cheap", - "moderate price", - "that", - "moderate priced", - "centre", - "the same", - "expensive range", - "relatively cheap", - "moderate prices", - "hotels", - "cheap>moderate", - "a little pricey", - "on the cheap side", - "moderately - priced or cheap", - "expensive price range", - "moderately priced", - "same price", - "moderately", - "budget - priced", - "moderately pried", - "moderate price range", - "great", - "moderatly priced", - "extremely reasonable", - "cheap or moderate", - "cheaper side" - ] - }, - "people": { - "description": "number of people for the hotel booking", - "is_categorical": false, - "possible_values": [] - }, - "postcode": { - "description": "postcode of the hotel", - "is_categorical": false, - "possible_values": [] - }, - "address": { - "description": "exact location of the hotel", - "is_categorical": false, - "possible_values": [] - }, - "type": { - "description": "what is the type of the hotel", - "is_categorical": false, - "possible_values": [] - }, - "stay": { - "description": "length of stay at the hotel", - "is_categorical": false, - "possible_values": [] - }, - "day": { - "description": "day of the hotel booking", - "is_categorical": true, - "possible_values": [ - "next friday", - "saturday", - "wednesday", - "tuesday", - "saturday|tuesday", - "t", - "sunday>monday", - "friday>tuesday", - "monday", - "friday", - "thursday", - "monday<thursday", - "sunday", - "wednesday|friday", - "monda" - ] - }, - "choice": { - "description": "number of hotels that meets requests of the user", - "is_categorical": false, - "possible_values": [] - } - } - }, - "police": { - "description": "find police stations", - "slots": { - "address": { - "description": "exact location of the police station", - "is_categorical": false, - "possible_values": [] - }, - "postcode": { - "description": "postcode of the police station", - "is_categorical": false, - "possible_values": [] - }, - "phone": { - "description": "police station phone number", - "is_categorical": false, - "possible_values": [] - } - } - } - }, - "intents": { - "inform": { - "description": "inform user of value for a certain slot" - }, - "request": { - "description": "ask for value of a slot" - }, - "nobook": { - "description": "inform user of booking failure" - }, - "reqmore": { - "description": "ask user for more instructions" - }, - "book": { - "description": "place an order for user" - }, - "bye": { - "description": "end a conversation and say goodbye to user" - }, - "thank": { - "description": "express gratitude" - }, - "welcome": { - "description": "welcome" - }, - "recommend": { - "description": "recommend a choice for user request" - }, - "offerbook": { - "description": "offer to place an order for user" - }, - "offerbooked": { - "description": "inform user that an order is succussful" - }, - "greet": { - "description": "express greeting" - }, - "nooffer": { - "description": "inform user that no options matches user request" - }, - "select": { - "description": "provide several choices for user to choose from" - } - }, - "binary_dialogue_act": [ - { - "intent": "request", - "domain": "hotel", - "slot": "area", - "value": "" - }, - { - "intent": "inform", - "domain": "booking", - "slot": "", - "value": "" - }, - { - "intent": "request", - "domain": "booking", - "slot": "stay", - "value": "" - }, - { - "intent": "request", - "domain": "booking", - "slot": "day", - "value": "" - }, - { - "intent": "reqmore", - "domain": "", - "slot": "", - "value": "" - }, - { - "intent": "bye", - "domain": "", - "slot": "", - "value": "" - }, - { - "intent": "thank", - "domain": "", - "slot": "", - "value": "" - }, - { - "intent": "welcome", - "domain": "", - "slot": "", - "value": "" - }, - { - "intent": "request", - "domain": "hotel", - "slot": "pricerange", - "value": "" - }, - { - "intent": "request", - "domain": "train", - "slot": "leaveAt", - "value": "" - }, - { - "intent": "offerbook", - "domain": "train", - "slot": "", - "value": "" - }, - { - "intent": "request", - "domain": "train", - "slot": "duration", - "value": "" - }, - { - "intent": "request", - "domain": "train", - "slot": "price", - "value": "" - }, - { - "intent": "request", - "domain": "train", - "slot": "departure", - "value": "" - }, - { - "intent": "request", - "domain": "train", - "slot": "day", - "value": "" - }, - { - "intent": "request", - "domain": "taxi", - "slot": "leaveAt", - "value": "" - }, - { - "intent": "request", - "domain": "taxi", - "slot": "arriveBy", - "value": "" - }, - { - "intent": "greet", - "domain": "", - "slot": "", - "value": "" - }, - { - "intent": "request", - "domain": "hotel", - "slot": "address", - "value": "" - }, - { - "intent": "request", - "domain": "train", - "slot": "destination", - "value": "" - }, - { - "intent": "request", - "domain": "train", - "slot": "arriveBy", - "value": "" - }, - { - "intent": "request", - "domain": "hotel", - "slot": "internet", - "value": "" - }, - { - "intent": "request", - "domain": "hotel", - "slot": "parking", - "value": "" - }, - { - "intent": "inform", - "domain": "attraction", - "slot": "", - "value": "" - }, - { - "intent": "request", - "domain": "attraction", - "slot": "area", - "value": "" - }, - { - "intent": "request", - "domain": "attraction", - "slot": "type", - "value": "" - }, - { - "intent": "request", - "domain": "attraction", - "slot": "address", - "value": "" - }, - { - "intent": "request", - "domain": "restaurant", - "slot": "food", - "value": "" - }, - { - "intent": "request", - "domain": "restaurant", - "slot": "area", - "value": "" - }, - { - "intent": "request", - "domain": "restaurant", - "slot": "phone", - "value": "" - }, - { - "intent": "request", - "domain": "restaurant", - "slot": "postcode", - "value": "" - }, - { - "intent": "request", - "domain": "restaurant", - "slot": "address", - "value": "" - }, - { - "intent": "request", - "domain": "attraction", - "slot": "postcode", - "value": "" - }, - { - "intent": "request", - "domain": "attraction", - "slot": "phone", - "value": "" - }, - { - "intent": "nobook", - "domain": "booking", - "slot": "", - "value": "" - }, - { - "intent": "inform", - "domain": "train", - "slot": "", - "value": "" - }, - { - "intent": "request", - "domain": "train", - "slot": "people", - "value": "" - }, - { - "intent": "request", - "domain": "booking", - "slot": "people", - "value": "" - }, - { - "intent": "request", - "domain": "hotel", - "slot": "stars", - "value": "" - }, - { - "intent": "request", - "domain": "hotel", - "slot": "type", - "value": "" - }, - { - "intent": "select", - "domain": "hotel", - "slot": "", - "value": "" - }, - { - "intent": "request", - "domain": "hotel", - "slot": "postcode", - "value": "" - }, - { - "intent": "request", - "domain": "attraction", - "slot": "entrance fee", - "value": "" - }, - { - "intent": "request", - "domain": "restaurant", - "slot": "pricerange", - "value": "" - }, - { - "intent": "request", - "domain": "train", - "slot": "Ref", - "value": "" - }, - { - "intent": "request", - "domain": "booking", - "slot": "time", - "value": "" - }, - { - "intent": "book", - "domain": "booking", - "slot": "", - "value": "" - }, - { - "intent": "request", - "domain": "hotel", - "slot": "phone", - "value": "" - }, - { - "intent": "request", - "domain": "attraction", - "slot": "pricerange", - "value": "" - }, - { - "intent": "inform", - "domain": "hotel", - "slot": "", - "value": "" - }, - { - "intent": "select", - "domain": "train", - "slot": "", - "value": "" - }, - { - "intent": "select", - "domain": "restaurant", - "slot": "", - "value": "" - }, - { - "intent": "inform", - "domain": "restaurant", - "slot": "", - "value": "" - }, - { - "intent": "inform", - "domain": "taxi", - "slot": "", - "value": "" - }, - { - "intent": "request", - "domain": "taxi", - "slot": "departure", - "value": "" - }, - { - "intent": "request", - "domain": "restaurant", - "slot": "name", - "value": "" - }, - { - "intent": "nooffer", - "domain": "attraction", - "slot": "", - "value": "" - }, - { - "intent": "request", - "domain": "taxi", - "slot": "destination", - "value": "" - }, - { - "intent": "nooffer", - "domain": "hotel", - "slot": "", - "value": "" - }, - { - "intent": "select", - "domain": "attraction", - "slot": "", - "value": "" - }, - { - "intent": "request", - "domain": "taxi", - "slot": "taxi_phone", - "value": "" - }, - { - "intent": "request", - "domain": "taxi", - "slot": "taxi_types", - "value": "" - }, - { - "intent": "request", - "domain": "attraction", - "slot": "name", - "value": "" - }, - { - "intent": "nooffer", - "domain": "restaurant", - "slot": "", - "value": "" - }, - { - "intent": "nooffer", - "domain": "train", - "slot": "", - "value": "" - }, - { - "intent": "request", - "domain": "hotel", - "slot": "name", - "value": "" - }, - { - "intent": "recommend", - "domain": "hotel", - "slot": "", - "value": "" - }, - { - "intent": "recommend", - "domain": "attraction", - "slot": "", - "value": "" - }, - { - "intent": "offerbooked", - "domain": "train", - "slot": "", - "value": "" - }, - { - "intent": "recommend", - "domain": "restaurant", - "slot": "", - "value": "" - } - ], - "state": { - "hotel": { - "name": "", - "area": "", - "parking": "", - "pricerange": "", - "stars": "", - "internet": "", - "type": "", - "stay": "", - "day": "", - "people": "" - }, - "train": { - "leaveAt": "", - "destination": "", - "day": "", - "arriveBy": "", - "departure": "", - "people": "" - }, - "taxi": { - "leaveAt": "", - "destination": "", - "departure": "", - "arriveBy": "" - }, - "restaurant": { - "food": "", - "pricerange": "", - "name": "", - "area": "", - "time": "", - "day": "", - "people": "" - }, - "attraction": { - "type": "", - "name": "", - "area": "" - } - } -} \ No newline at end of file diff --git a/data/unified_datasets/multiwoz21/preprocess.py b/data/unified_datasets/multiwoz21/preprocess.py index 5ca81e2e399d8a816e3be6d4ae33babfc2f1597e..9f1c0cbc31eb761d02f72833782c8e863ede87d5 100644 --- a/data/unified_datasets/multiwoz21/preprocess.py +++ b/data/unified_datasets/multiwoz21/preprocess.py @@ -1,299 +1,609 @@ import copy import re -import zipfile +from zipfile import ZipFile, ZIP_DEFLATED +from shutil import copy2 import json import os from tqdm import tqdm -import sys -import difflib -from fuzzywuzzy import fuzz -sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) - -from convlab2.util.file_util import read_zipped_json, write_zipped_json -import logging - - -logging.basicConfig(level=logging.INFO) -self_dir = (os.path.abspath(os.getcwd())) - -REF_SYS_DA = { - 'Attraction': { - 'Addr': "address", 'Area': "area", 'Choice': "choice", - 'Fee': "entrance fee", 'Name': "name", 'Phone': "phone", - 'Post': "postcode", 'Price': "pricerange", 'Type': "type", - 'none': None, 'Open': None - }, - 'Hospital': { - 'Department': 'department', 'Addr': 'address', 'Post': 'postcode', - 'Phone': 'phone', 'none': None - }, - 'Booking': { - 'Day': 'day', 'Name': 'name', 'People': 'people', - 'Ref': 'Ref', 'Stay': 'stay', 'Time': 'time', - 'none': None - }, - 'Hotel': { - 'Addr': "address", 'Area': "area", 'Choice': "choice", - 'Internet': "internet", 'Name': "name", 'Parking': "parking", - 'Phone': "phone", 'Post': "postcode", 'Price': "pricerange", - 'Stars': "stars", 'Type': "type", 'Stay': 'stay', 'Day': 'day', 'People': 'people', - 'none': None - }, - 'Restaurant': { - 'Addr': "address", 'Area': "area", 'Choice': "choice", - 'Name': "name", 'Food': "food", 'Phone': "phone", - 'Post': "postcode", 'Price': "pricerange", - 'Time': 'time', 'Day': 'day', 'People': 'people', - 'none': None - }, - 'Taxi': { - 'Arrive': "arriveBy", 'Car': "taxi_types", 'Depart': "departure", - 'Dest': "destination", 'Leave': "leaveAt", 'Phone': "taxi_phone", - 'none': None - }, - 'Train': { - 'Arrive': "arriveBy", 'Choice': "choice", 'Day': "day", - 'Depart': "departure", 'Dest': "destination", - 'Leave': "leaveAt", 'People': "people", 'Ref': "Ref", - 'Time': "duration", 'none': None, 'Ticket': 'price', - }, - 'Police': { - 'Addr': "address", 'Post': "postcode", 'Phone': "phone", 'none': None - }, -} - -# taxi restaurant attraction train -slot_to_type = { - 'taxi-destination': 'non', - 'taxi-departure': 'non', - 'taxi-leaveAt': 'non', - 'taxi-arriveBy': 'non', - 'restaurant-food': 'non', - 'restaurant-name': 'non', - 'attraction-address': 'non', - 'attraction-postcode': 'non', - 'restaurant-pricerange': 'cat', - 'restaurant-address': 'non', - 'restaurant-area': 'cat', - 'restaurant-postcode': 'non', - 'attraction-phone': 'non', - 'attraction-entrance fee': 'non', - 'booking-time': 'non', - 'booking-day': 'cat', - 'attraction-type': 'non', - 'attraction-area': 'cat', - 'train-destination': 'non', - 'train-arriveBy': 'non', - 'train-departure': 'non', - 'hotel-internet': 'cat', - 'hotel-area': 'cat', - 'booking-stay': 'non', - 'booking-people': 'non', - 'train-duration': 'non', - 'train-people': 'non', - 'train-day': 'cat', - 'train-Ref': 'non', - 'hotel-stars': 'cat', - 'train-leaveAt': 'non', - 'train-price': 'non', - 'hotel-parking': 'cat', - 'hotel-phone': 'non', - 'hotel-name': 'non', - 'hotel-pricerange': 'cat', - 'hotel-people': 'non', - 'restaurant-phone': 'non', - 'hotel-postcode': 'non', - 'hotel-address': 'non', - 'attraction-name': 'non', - 'hotel-type': 'non', - 'restaurant-people': 'non', - 'train-choice': 'non', - 'attraction-pricerange': 'cat', - 'hotel-stay': 'non', - 'booking-name': 'non', - 'booking-Ref': 'non', - 'restaurant-time': 'non', - 'restaurant-day': 'cat', - 'hotel-day': 'cat', - 'hotel-choice': 'non', - 'restaurant-choice': 'non', - 'attraction-choice': 'non', - 'taxi-taxi_phone': 'non', - 'taxi-taxi_types': 'non', - 'police-address': 'non', - 'police-postcode': 'non', - 'police-phone': 'non' -} - -state_cat_slot_value_dict = { - "hotel-pricerange": { - "cheap": 735, - "moderate": 1063, - "expensive": 594, - }, - "hotel-parking": { - "yes": 1809, - "no": 126, - "free": 4, - }, - "hotel-day": { - "tuesday": 385, - "wednesday": 410, - "monday": 365, - "saturday": 407, - "friday": 393, - "thursday": 384, - "sunday": 369, - }, - "train-day": { - "wednesday": 533, - "monday": 533, - "saturday": 543, - "thursday": 547, - "friday": 563, - "tuesday": 553, - "sunday": 613, - }, - "hotel-stars": { - "4": 1263, - "2": 193, - "0": 201, - "3": 401, - "5": 45, - "1": 45, - }, - "hotel-internet": { - "yes": 1841, - "no": 79, - "free": 2 - }, - "hotel-area": { - "east": 416, - "north": 717, - "centre": 538, - "south": 289, - "west": 316, - }, - "attraction-area": { - "centre": 1290, - "west": 332, - "north": 155, - "south": 240, - "east": 272, - }, - "restaurant-pricerange": { - "expensive": 1477, - "cheap": 758, - "moderate": 1028, +from collections import Counter +from pprint import pprint +from nltk.tokenize import TreebankWordTokenizer, PunktSentenceTokenizer + +init_ontology = { + "domains": { # descriptions are adapted from multiwoz22, but is_categorical may be different + "attraction": { + "description": "find an attraction", + "slots": { + "area": { + "description": "area to search for attractions", + "is_categorical": True, + "possible_values": [ + "centre", + "east", + "north", + "south", + "west" + ] + }, + "name": { + "description": "name of the attraction", + "is_categorical": False, + "possible_values": [] + }, + "type": { + "description": "type of the attraction", + "is_categorical": True, + "possible_values": [ + "architecture", + "boat", + "cinema", + "college", + "concerthall", + "entertainment", + "museum", + "multiple sports", + "nightclub", + "park", + "swimmingpool", + "theatre" + ] + }, + "entrance fee": { + "description": "how much is the entrance fee", + "is_categorical": False, + "possible_values": [] + }, + "open hours": { + "description": "open hours of the attraction", + "is_categorical": False, + "possible_values": [] + }, + "address": { + "description": "address of the attraction", + "is_categorical": False, + "possible_values": [] + }, + "phone": { + "description": "phone number of the attraction", + "is_categorical": False, + "possible_values": [] + }, + "postcode": { + "description": "postcode of the attraction", + "is_categorical": False, + "possible_values": [] + }, + "choice": { + "description": "number of attractions that meet the requirement", + "is_categorical": False, + "possible_values": [] + } + } + }, + "hotel": { + "description": "find and book a hotel", + "slots": { + "internet": { + "description": "whether the hotel has internet", + "is_categorical": True, + "possible_values": [ + "free", + "no", + "yes" + ] + }, + "parking": { + "description": "whether the hotel has parking", + "is_categorical": True, + "possible_values": [ + "free", + "no", + "yes" + ] + }, + "area": { + "description": "area or place of the hotel", + "is_categorical": True, + "possible_values": [ + "centre", + "east", + "north", + "south", + "west" + ] + }, + "stars": { + "description": "star rating of the hotel", + "is_categorical": True, + "possible_values": [ + "0", + "1", + "2", + "3", + "4", + "5" + ] + }, + "price range": { + "description": "price budget of the hotel", + "is_categorical": True, + "possible_values": [ + "expensive", + "cheap", + "moderate" + ] + }, + "type": { + "description": "what is the type of the hotel", + "is_categorical": False, + "possible_values": [ + "guesthouse", + "hotel" + ] + }, + "name": { + "description": "name of the hotel", + "is_categorical": False, + "possible_values": [] + }, + "book people": { + "description": "number of people for the hotel booking", + "is_categorical": False, + "possible_values": [] + }, + "book stay": { + "description": "length of stay at the hotel", + "is_categorical": False, + "possible_values": [] + }, + "book day": { + "description": "day of the hotel booking", + "is_categorical": True, + "possible_values": [ + "monday", + "tuesday", + "wednesday", + "thursday", + "friday", + "saturday", + "sunday" + ] + }, + "phone": { + "description": "phone number of the hotel", + "is_categorical": False, + "possible_values": [] + }, + "postcode": { + "description": "postcode of the hotel", + "is_categorical": False, + "possible_values": [] + }, + "address": { + "description": "address of the hotel", + "is_categorical": False, + "possible_values": [] + }, + "ref": { + "description": "reference number of the hotel booking", + "is_categorical": False, + "possible_values": [] + }, + "choice": { + "description": "number of hotels that meet the requirement", + "is_categorical": False, + "possible_values": [] + } + } + }, + "taxi": { + "description": "rent taxi to travel", + "slots": { + "destination": { + "description": "destination of taxi", + "is_categorical": False, + "possible_values": [] + }, + "departure": { + "description": "departure location of taxi", + "is_categorical": False, + "possible_values": [] + }, + "leave at": { + "description": "leaving time of taxi", + "is_categorical": False, + "possible_values": [] + }, + "arrive by": { + "description": "arrival time of taxi", + "is_categorical": False, + "possible_values": [] + }, + "phone": { + "description": "phone number of the taxi", + "is_categorical": False, + "possible_values": [] + }, + "type": { + "description": "car type of the taxi", + "is_categorical": False, + "possible_values": [] + } + } + }, + "restaurant": { + "description": "find and book a restaurant", + "slots": { + "price range": { + "description": "price budget for the restaurant", + "is_categorical": True, + "possible_values": [ + "cheap", + "expensive", + "moderate" + ] + }, + "area": { + "description": "area or place of the restaurant", + "is_categorical": True, + "possible_values": [ + "centre", + "east", + "north", + "south", + "west" + ] + }, + "food": { + "description": "the cuisine of the restaurant", + "is_categorical": False, + "possible_values": [] + }, + "name": { + "description": "name of the restaurant", + "is_categorical": False, + "possible_values": [] + }, + "address": { + "description": "address of the restaurant", + "is_categorical": False, + "possible_values": [] + }, + "postcode": { + "description": "postcode of the restaurant", + "is_categorical": False, + "possible_values": [] + }, + "phone": { + "description": "phone number of the restaurant", + "is_categorical": False, + "possible_values": [] + }, + "book people": { + "description": "number of people for the restaurant booking", + "is_categorical": False, + "possible_values": [] + }, + "book time": { + "description": "time of the restaurant booking", + "is_categorical": False, + "possible_values": [] + }, + "book day": { + "description": "day of the restaurant booking", + "is_categorical": True, + "possible_values": [ + "monday", + "tuesday", + "wednesday", + "thursday", + "friday", + "saturday", + "sunday" + ] + }, + "ref": { + "description": "reference number of the restaurant booking", + "is_categorical": False, + "possible_values": [] + }, + "choice": { + "description": "number of restaurants that meet the requirement", + "is_categorical": False, + "possible_values": [] + } + } + }, + "train": { + "description": "find a train to travel", + "slots": { + "destination": { + "description": "destination of the train", + "is_categorical": False, + "possible_values": [] + }, + "arrive by": { + "description": "arrival time of the train", + "is_categorical": False, + "possible_values": [] + }, + "departure": { + "description": "departure location of the train", + "is_categorical": False, + "possible_values": [] + }, + "leave at": { + "description": "leaving time for the train", + "is_categorical": False, + "possible_values": [] + }, + "duration": { + "description": "duration of the travel", + "is_categorical": False, + "possible_values": [] + }, + "book people": { + "description": "number of people booking for train", + "is_categorical": False, + "possible_values": [] + }, + "day": { + "description": "day of the train", + "is_categorical": True, + "possible_values": [ + "monday", + "tuesday", + "wednesday", + "thursday", + "friday", + "saturday", + "sunday" + ] + }, + "ref": { + "description": "reference number of the train booking", + "is_categorical": False, + "possible_values": [] + }, + "price": { + "description": "price of the train ticket", + "is_categorical": False, + "possible_values": [] + }, + "train id": { + "description": "id of the train", + "is_categorical": False + }, + "choice": { + "description": "number of trains that meet the requirement", + "is_categorical": False, + "possible_values": [] + } + } + }, + "police": { + "description": "find a police station for help", + "slots": { + "name": { + "description": "name of the police station", + "is_categorical": False, + "possible_values": [] + }, + "address": { + "description": "address of the police station", + "is_categorical": False, + "possible_values": [] + }, + "postcode": { + "description": "postcode of the police station", + "is_categorical": False, + "possible_values": [] + }, + "phone": { + "description": "phone number of the police station", + "is_categorical": False, + "possible_values": [] + } + } + }, + "hospital": { + "description": "find a hospital for help", + "slots": { + "department": { + "description": "specific department of the hospital", + "is_categorical": False, + "possible_values": [] + }, + "address": { + "description": "address of the hospital", + "is_categorical": False, + "possible_values": [] + }, + "phone": { + "description": "phone number of the hospital", + "is_categorical": False, + "possible_values": [] + }, + "postcode": { + "description": "postcode of the hospital", + "is_categorical": False, + "possible_values": [] + } + } + }, + "booking": { + "description": "booking for taxi, restaurant, hotel, train, etc.", + "slots":{ + "day": { + "description": "day of the booking", + "is_categorical": True, + "possible_values": [ + "monday", + "tuesday", + "wednesday", + "thursday", + "friday", + "saturday", + "sunday" + ] + }, + "time": { + "description": "time of the booking", + "is_categorical": False, + "possible_values": [] + }, + "book people": { + "description": "number of people for the booking", + "is_categorical": False, + "possible_values": [] + }, + "book stay": { + "description": "length of stay at the hotel", + "is_categorical": False, + "possible_values": [] + }, + "name": { + "description": "name of the booked entity", + "is_categorical": False, + "possible_values": [] + }, + "ref": { + "description": "reference number of the booking", + "is_categorical": False, + "possible_values": [] + } + } + }, + "general":{ + "description": "general domain without slots", + "slots": {} + } }, - "restaurant-area": { - "centre": 1745, - "south": 398, - "north": 390, - "east": 360, - "west": 423, + "intents": { + "inform": { + "description": "inform the value of a slot" + }, + "request": { + "description": "ask for the value of a slot" + }, + "nobook": { + "description": "inform the user that the booking is failed" + }, + "reqmore": { + "description": "ask the user for more instructions" + }, + "book": { + "description": "book something for the user" + }, + "bye": { + "description": "say goodbye to the user and end the conversation" + }, + "thank": { + "description": "thanks for the help" + }, + "welcome": { + "description": "you're welcome" + }, + "greet": { + "description": "express greeting" + }, + "recommend": { + "description": "recommend a choice to the user" + }, + "select": { + "description": "provide several choices for the user" + }, + "offerbook": { + "description": "ask the user if he or she needs booking" + }, + "offerbooked": { + "description": "provide information about the booking" + }, + "nooffer": { + "description": "inform the user that there is no result satisfies user requirements" + } }, - "restaurant-day": { - "thursday": 362, - "wednesday": 412, - "friday": 395, - "monday": 383, - "sunday": 399, - "saturday": 421, - "tuesday": 350, + "binary_dialogue_acts": set(), # from data + "state": { + "attraction": { + "type": "", + "name": "", + "area": "" + }, + "hotel": { + "name": "", + "area": "", + "parking": "", + "price range": "", + "stars": "", + "internet": "", + "type": "", + "book stay": "", + "book day": "", + "book people": "" + }, + "restaurant": { + "food": "", + "price range": "", + "name": "", + "area": "", + "book time": "", + "book day": "", + "book people": "" + }, + "taxi": { + "leave at": "", + "destination": "", + "departure": "", + "arrive by": "" + }, + "train": { + "leave at": "", + "destination": "", + "day": "", + "arrive by": "", + "departure": "", + "book people": "" + } } } - -synonyms = [ - ["el shaddia guesthouse", "el shaddai"], - [ "peterborough", "peterbourgh"], - ["night club", "nightclub", 'nightclubs'], - ["boat", "boating"], - ["portugese", "portuguese"], - ["guesthouse", "guest house"], - ["seafood", "sea food"], - ["christ 's college", "christ college"], - ["huntingdon marriott hotel"] -] - -state_cat_slot_ds = [k for k, v in slot_to_type.items() if v == 'cat'] - -da_cat_slot_values = { - # 'hotel-stay': ['1', '2', '3', '4', '5'], - 'hotel-internet': ['free', 'no', 'none', 'yes'], - 'hotel-parking': ['free', 'no', 'none', 'yes'] -} - -state_cat_slot_values = {} - -multiwoz_desc = { - 'taxi': { - 'domain': 'taxi information query system', - 'taxi_phone': 'taxi phone number', - 'taxi_types': 'taxi type', - }, - 'restaurant': { - 'domain': 'restaurant information query system', - 'address': 'exact location of the restaurant', - 'postcode': 'postcode of the restaurant', - 'phone': 'restaurant phone number', - 'choice': 'number of restaurants meeting requests of user', - }, +slot_name_map = { + 'addr': "address", + 'post': "postcode", + 'pricerange': "price range", + 'arrive': "arrive by", + 'arriveby': "arrive by", + 'leave': "leave at", + 'leaveat': "leave at", + 'depart': "departure", + 'dest': "destination", + 'fee': "entrance fee", + 'open': 'open hours', + 'car': "type", + 'car type': "type", + 'ticket': 'price', + 'trainid': 'train id', + 'id': 'train id', + 'people': 'book people', + 'stay': 'book stay', + 'none': '', 'attraction': { - 'domain': 'an entertainment that is offered to the public', - 'address': 'details of where the attraction is', - 'postcode': 'postcode of the attraction', - 'phone': 'phone number of the attraction', - 'entrance fee': 'the fee charged for admission to the attraction', - 'pricerange': 'the price range for the attraction, from cheap to expensive', - 'choice': 'number of attractions matching requests of user' - }, - 'booking': { - 'domain': 'to arrange with a taxi, restaurant, train, etc.', - 'time': 'time for an order', - 'day': 'day for an order, from monday to sunday', - 'stay': 'for how long the user wish to be at a place', - 'people': 'how many person the order is for', - 'name': 'name of the ordered place', - 'Ref': 'reference number of the order' - }, - 'train': { - 'domain': 'query and order a train', - 'duration': 'the length of time the train trip lasts', - 'Ref': 'reference number of the order', - 'price': 'price for the train ticket', - 'choice': 'number of trains that meets requests of the user', + 'price': 'entrance fee' }, + 'hospital': {}, 'hotel': { - 'domain': 'to query hotel information and place an order', - 'address': 'exact location of the hotel', - 'postcode': 'postcode of the hotel', - 'phone': 'hotel phone number', - 'choice': 'number of hotels that meets requests of the user', + 'day': 'book day', 'price': "price range" }, - 'police': { - 'domain': 'find police stations', - 'address': 'exact location of the police station', - 'postcode': 'postcode of the police station', - 'phone': 'police station phone number', + 'restaurant': { + 'day': 'book day', 'time': 'book time', 'price': "price range" }, - 'intents': { - 'inform': 'inform user of value for a certain slot', - 'request': 'ask for value of a slot', - 'nobook': 'inform user of booking failure', - 'reqmore': 'ask user for more instructions', - 'book': 'place an order for user', - 'bye': 'end a conversation and say goodbye to user', - 'thank': 'express gratitude', - 'welcome': 'welcome', - 'offerbooked': 'inform user that an order is succussful', - 'recommend': 'recommend a choice for user request', - 'greet': 'express greeting', - 'nooffer': 'inform user that no options matches user request', - 'offerbook': 'offer to place an order for user', - 'select': 'provide several choices for user to choose from', - } + 'taxi': {}, + 'train': { + 'day': 'day', 'time': "duration" + }, + 'police': {}, + 'booking': {} } digit2word = { @@ -301,1131 +611,282 @@ digit2word = { '6': 'six', '7': 'seven', '8': 'eight', '9': 'nine', '10': 'ten' } - -def pharse_in_sen(phrase, sen): - ''' - match value in the sentence - :param phrase: str - :param sen: str - :return: start, end if matched, else None, None - ''' - assert isinstance(phrase, str) - pw = '(^|[\s,\.:\?!-])(?P<v>{})([\s,\.:\?!-]|$)' - pn = '(^|[\s\?!-]|\D[,\.:])(?P<v>{})($|[\s\?!-]|[,\.:]\D|[,\.:]$)' - - if phrase.isdigit() and phrase in digit2word: - phrase = digit2word[phrase] - p = re.compile(pw.format(re.escape(phrase)), re.I) - m = re.search(p, sen) - if m: - num = len(re.findall(p, sen)) - # if num > 1: - # match['>1'] += 1 - # else: - # match['1'] += 1 - return m.span('v'), num - # match['0'] += 1 - if phrase.isdigit(): - pattern = pn - else: - pattern = pw - p = re.compile(pattern.format(re.escape(phrase)), re.I) - m = re.search(p, sen) - if m: - num = len(re.findall(p, sen)) - # if num > 1: - # match['>1'] += 1 - # else: - # match['1'] += 1 - return m.span('v'), num - return (None, None), 0 - - - - - -def update_state(state, update): - # print('======================') - # print(state) - # print(update) - # print('======================') - - for service, service_update in update.items(): - if service not in state: - state[service] = copy.deepcopy(service_update) +cnt_domain_slot = Counter() + +def normalize_domain_slot_value(domain, slot, value): + global init_ontology, slot_name_map + domain = domain.lower() + slot = slot.lower() + value = value.strip() + if value in ['do nt care', "do n't care"]: + value = 'dontcare' + if value in ['?', 'none', 'not mentioned']: + value = "" + if domain not in init_ontology['domains']: + raise Exception(f'{domain} not in ontology') + if slot not in init_ontology['domains'][domain]['slots']: + if slot in slot_name_map: + slot = slot_name_map[slot] + elif slot in slot_name_map[domain]: + slot = slot_name_map[domain][slot] else: - state[service].update(update[service]) + raise Exception(f'{domain}-{slot} not in ontology') + assert slot=='' or slot in init_ontology['domains'][domain]['slots'], f'{(domain, slot, value)} not in ontology' + return domain, slot, value - -def convert_da(utt, da_dict, binary_ont, intent_ont, did, tid, da_cat_slot_values): +def convert_da(da_dict, utt, sent_tokenizer, word_tokenizer): ''' - convert multiwoz dialogue acts to required format + convert multiwoz dialogue acts to required format and update ontology + :param da_dict: dict[(intent, domain, slot, value)] = [word_start, word_end] :param utt: user or system utt - :param da_dict: multiwoz da - :param binary_ont: binary ontology - :param intent_ont: intent ontology - :return: ''' + global init_ontology, digit2word, cnt_domain_slot + converted_da = { 'categorical': [], 'non-categorical': [], 'binary': [] } - - for Domain_Act, S, v in da_dict: - Domain, Act = Domain_Act.split('-') - if Domain.lower() in ['police', 'hospital', 'bus']: - continue - - if Act.lower() not in intent_ont: - intent_ont[Act.lower()] = {} - - # general domain is converted to empty domain. e.g. thank, bye - if Domain == 'general': - assert S == 'none' - assert v == 'none' - converted_dict = { - 'intent': Act.lower(), - 'domain': '', - 'slot': '', - 'value': '' - } - converted_da['binary'].append(converted_dict) - - if converted_dict not in binary_ont: - binary_ont.append(converted_dict) - continue - - - - try: - reformated_slot = REF_SYS_DA[Domain][S] - except: - # print('44444444444444444444444444444444') - # print(Domain, S) - # logging.info('slot not in REF_SYS_DA, drop') - continue - - # if slot is None, da should be converted into binary - if reformated_slot is None: - if not (S == 'none' and v == 'none'): - # mainly for `Open` slot - # print('11111111111111111111') - # print(Domain_Act, S, v) - continue - # Booking-Inform none none - # Police-Inform none none - # Train-OfferBook none none - converted_dict = { - 'intent': Act.lower(), - 'domain': Domain.lower(), - 'slot': '', - 'value': '' - } - converted_da['binary'].append(converted_dict) - if converted_dict not in binary_ont: - binary_ont.append(converted_dict) - continue - - reformated_domain_slot = Domain.lower() + '-' + reformated_slot - - if Act.lower() == 'request': - converted_dict = { - 'intent': 'request', - 'domain': Domain.lower(), - 'slot': reformated_slot, - 'value': '' - } - converted_da['binary'].append(converted_dict) - - if converted_dict not in binary_ont: - binary_ont.append(converted_dict) - continue - - # vs = da_dict[(Domain_Act, S)]['values'] - - if reformated_domain_slot in slot_to_type and slot_to_type[reformated_domain_slot] == 'cat': - origin_v = v - v = v.lower() - # if reformated_domain_slot in cat_slot_proj: - # v = cat_slot_proj[reformated_domain_slot][v] - if reformated_domain_slot not in da_cat_slot_values: - da_cat_slot_values[reformated_domain_slot] = [] - # if v not in cat_slot_values[reformated_domain_slot]: - da_cat_slot_values[reformated_domain_slot].append(v) + sentences = sent_tokenizer.tokenize(utt) + sent_spans = sent_tokenizer.span_tokenize(utt) + tokens = [token for sent in sentences for token in word_tokenizer.tokenize(sent)] + token_spans = [(sent_span[0]+token_span[0], sent_span[0]+token_span[1]) for sent, sent_span in zip(sentences, sent_spans) for token_span in word_tokenizer.span_tokenize(sent)] + # assert len(tokens) == len(token_spans) + # for token, span in zip(tokens, token_spans): + # if utt[span[0]:span[1]] != '"': + # assert utt[span[0]:span[1]] == token + + for (intent, domain, slot, value), span in da_dict.items(): + if intent == 'request' or slot == '' or value == '': + # binary dialog acts + init_ontology['binary_dialogue_acts'].add((intent, domain, slot, value,)) + converted_da['binary'].append({ + 'intent': intent, + 'domain': domain, + 'slot': slot, + 'value': value + }) + elif init_ontology['domains'][domain]['slots'][slot]['is_categorical']: + # categorical dialog acts converted_da['categorical'].append({ - 'intent': Act.lower(), - 'domain': Domain.lower(), - 'slot': reformated_slot, - 'value': v + 'intent': intent, + 'domain': domain, + 'slot': slot, + 'value': value }) - if 'start_word' in da_dict[(Domain_Act, S, origin_v)]: - start_ws = da_dict[(Domain_Act, S, origin_v)]['start_word'] - end_ws = da_dict[(Domain_Act, S, origin_v)]['end_word'] - utt_list = utt.split() - for start_w, end_w in zip(start_ws, end_ws): - if start_w > len(utt_list) or end_w > len(utt_list): - continue - start_ch = 0 - for i in range(start_w): - start_ch += len(utt_list[i]) + 1 - end_ch = start_ch - for i in range(start_w, end_w): - end_ch += len(utt_list[i]) + 1 - try: - end_ch += len(utt_list[end_w]) - except: - print(utt_list, start_w, end_w) - if not utt[start_ch: end_ch] == origin_v: - # print('2222222222222222222222222') - # print('\n'.join([v, utt[start_ch: end_ch - 1]])) - continue - - else: - converted_da['categorical'][-1].update({ - 'start': start_ch, - 'end': end_ch - }) - break - else: - if 'start_word' not in da_dict[(Domain_Act, S, v)]: - # todo no span annotation - converted_da['non-categorical'].append({ - 'intent': Act.lower(), - 'domain': Domain.lower(), - 'slot': reformated_slot, - 'value': v - }) - continue - - start_ws = da_dict[(Domain_Act, S, v)]['start_word'] - end_ws = da_dict[(Domain_Act, S, v)]['end_word'] - utt_list = utt.split() - found = True - for start_w, end_w in zip(start_ws, end_ws): - if start_w > len(utt_list) or end_w > len(utt_list): - continue - start_ch = 0 - for i in range(start_w): - start_ch += len(utt_list[i]) + 1 - end_ch = start_ch - for i in range(start_w, end_w): - end_ch += len(utt_list[i]) + 1 - try: - end_ch += len(utt_list[end_w]) - except: - print(utt_list, start_w, end_w, v) - if not utt[start_ch: end_ch] == v: - # print('2222222222222222222222222') - # print('\n'.join([v, utt[start_ch: end_ch - 1]])) - continue - + # non-categorical dialog acts + converted_da['non-categorical'].append({ + 'intent': intent, + 'domain': domain, + 'slot': slot, + 'value': value + }) + # correct some value and try to give char level span + match = False + ori_value = value + value = value.lower() + if span and span[0] <= span[1]: + # use original span annotation, but tokenizations are different + start_word, end_word = span + if end_word >= len(tokens): + # due to different tokenization, sometimes will out of index + delta = end_word - len(tokens) + 1 + start_word -= delta + end_word -= delta + start_char, end_char = token_spans[start_word][0], token_spans[end_word][1] + value_span = utt[start_char:end_char].lower() + match = True + if value_span == value: + cnt_domain_slot['span match'] += 1 + elif value.isdigit() and value in digit2word and digit2word[value] == value_span: + # !!!CHANGE VALUE: value is digit but value span is word + cnt_domain_slot['digit value match'] += 1 + elif ''.join(value.split()) == ''.join(value_span.split()): + # !!!CHANGE VALUE: equal when remove blank + cnt_domain_slot['remove blank'] += 1 + elif value in value_span: + # value in value_span + start_char += value_span.index(value) + end_char = start_char + len(value) + assert utt[start_char:end_char].lower() == value, f'{[value, utt[start_char:end_char], utt]}' + cnt_domain_slot['value in span'] += 1 + elif ':' in value and value == '0'+value_span: + # !!!CHANGE VALUE: time x:xx == 0x:xx + cnt_domain_slot['x:xx == 0x:xx'] += 1 else: - found = True - converted_da['non-categorical'].append({ - 'intent': Act.lower(), - 'domain': Domain.lower(), - 'slot': reformated_slot, - 'value': v, - 'start': start_ch, - 'end': end_ch - }) - break - - if not found: - converted_da['non-categorical'].append({ - 'intent': Act.lower(), - 'domain': Domain.lower(), - 'slot': reformated_slot, - 'value': v - }) - return converted_da - - -def get_state_update(prev_state, cur_state, dialog, did, tid, utt, coref_dict, slot_notfound_dict, da_cat_slot_values): - prev_turns = dialog['turns'] - state_update = {'categorical': [], 'non-categorical': []} - notfoundnum = 0 - total_value = 0 - - diff_state = {} - if prev_state is None: - diff_state = {domain: {slot: value for slot, value in cur_state[domain].items() if value != ''} for domain in - cur_state} - else: - assert len(prev_state) == len(cur_state), print(prev_state, cur_state) - for domain, domain_state in prev_state.items(): - if domain not in diff_state: - diff_state[domain] = {} - for slot, value in domain_state.items(): - if value != cur_state[domain][slot]: - # assert len(cur_state[domain][slot]) > 0, print(did, tid, domain, slot, utt) - diff_state[domain][slot] = cur_state[domain][slot] - - ret_diff_state = copy.deepcopy(diff_state) - - - - for domain in diff_state: - for slot in diff_state[domain]: - - total_value += 1 - fix_or = False - if '|' in diff_state[domain][slot]: - value = diff_state[domain][slot].split('|')[0] - else: - value = diff_state[domain][slot] - - # if dialog['original_id'] == 'PMUL2512' and tid == 17 and value == '02:45': - # value = '2:45' - - value_list = [value] - for _synonyms in synonyms: - if value in _synonyms: - value_list = _synonyms - - value_list.extend(get_time_variants(value)) - value_list.extend(get_genitive_variants(value)) - value_list.extend(get_bb_variants(value)) - - if value.endswith(' restaurant'): - value_list.append(value.split(' restaurant')[0]) - if value.endswith(' hotel'): - value_list.append(value.split(' hotel')[0]) - found = False - for value in value_list: - # categorical slots - if slot in ['internet', 'parking', 'pricerange', 'day', 'area', 'stars']: - reformated_domain_slot = '-'.join([domain, slot]) - if reformated_domain_slot in state_cat_slot_value_dict and (value in state_cat_slot_value_dict[reformated_domain_slot] or value in ['dontcare', '', 'none', 'not mentioned']): - state_update['categorical'].append({ - 'domain': domain, - 'slot': slot, - 'value': diff_state[domain][slot] - }) - if domain + '-' + slot not in da_cat_slot_values: - da_cat_slot_values[domain + '-' + slot] = [diff_state[domain][slot]] - da_cat_slot_values[domain + '-' + slot].append(diff_state[domain][slot]) - if value != diff_state[domain][slot]: - state_update['categorical'][-1].update({'fixed_value': value}) - ret_diff_state[domain][slot] = value - else : - for _turn in prev_turns[::-1]: - found = False - for da in _turn['dialogue_act']['categorical']: - if da['value'] == value: - if 'start' in da: - state_update['categorical'][-1].update({ - 'utt_idx': _turn['utt_idx'], - 'start': da['start'], - 'end': da['end'], - 'from': 'prev_da_span' - }) - found = True - break - if found: - break - else: - state_update['categorical'].append({ - 'domain': domain, - 'slot': slot, - 'value': diff_state[domain][slot], - 'fixed_value': 'not found' - }) - if domain + '-' + slot not in da_cat_slot_values: - da_cat_slot_values[domain + '-' + slot] = [diff_state[domain][slot]] - da_cat_slot_values[domain + '-' + slot].append(diff_state[domain][slot]) - ret_diff_state[domain][slot] = 'not found' - notfoundnum += 1 - # reformated_domain_slot = '-'.join([domain, slot] - found = True - break - - # process value ---> none - assert value not in ['none', 'not mentioned'] - if value in ['', 'dontcare']: - # if reformated_domain_slot not in state_cat_slot_values: - # state_cat_slot_values[reformated_domain_slot] = [] - # # if v not in cat_slot_values[reformated_domain_slot]: - # state_cat_slot_values[reformated_domain_slot].append(value) - state_update['non-categorical'].append({ - 'domain': domain, - 'slot': slot, - 'value': diff_state[domain][slot] - }) - found = True - break - - # first look for values in coref_dict - for _Domain_Act, _Slot, _value in coref_dict: - _domain, _act = _Domain_Act.lower().split('-') - _slot = _Slot.lower() - _coref_value = coref_dict[(_Domain_Act, _Slot, _value)]['coref_value'] - if _coref_value == '': - continue - _coref_turn = coref_dict[(_Domain_Act, _Slot, _value)]['turn'] - if _coref_turn == -1: - continue - _coref_pos = coref_dict[(_Domain_Act, _Slot, _value)]['pos'] - if _coref_pos == '': - continue - _utt = coref_dict[(_Domain_Act, _Slot, _value)]['utt'] - if _domain == domain and _slot == slot and value == _coref_value: - - start_w, end_w = [int(p) for p in _coref_pos.split('-')] - utt_list = _utt.split() - start_ch = 0 - for i in range(start_w): - start_ch += len(utt_list[i]) + 1 - end_ch = start_ch - for i in range(start_w, end_w + 1): - end_ch += len(utt_list[i]) + 1 - end_ch -= 1 - - if not _utt[start_ch: end_ch] == _coref_value: - # print(111111111111111111111111111111111) - # print(_utt[start_ch: end_ch], _coref_value) - continue - - state_update['non-categorical'].append({ - 'domain': domain, - 'slot': slot, - 'value': diff_state[domain][slot], - 'from': 'coref', - 'utt_idx': _coref_turn, - 'start': start_ch, - 'end': end_ch - }) - if value != diff_state[domain][slot]: - state_update['categorical'][-1].update({'fixed_value': value}) - ret_diff_state[domain][slot] = value - found = True - - if found: - break - - # from da annotation - for _turn in prev_turns[::-1]: - for da in _turn['dialogue_act']['non-categorical']: - # if da['domain'] == domain and da['slot'] == slot and fuzz.ratio(da['value'], value) > 85: - # if not da['value'] == value: - # print(1111111111111111) - # print(value, da['value']) - - if fuzz.ratio(da['value'], value) > 85: - - if 'start' in da: - found = True - state_update['non-categorical'].append({ - 'domain': domain, - 'slot': slot, - # 'value': da['value'], - 'value': diff_state[domain][slot], - 'utt_idx': _turn['utt_idx'], - 'start': da['start'], - 'end': da['end'], - 'from': 'prev_da_span' - }) - if value != diff_state[domain][slot]: - state_update['non-categorical'][-1].update({'fixed_value': value}) - ret_diff_state[domain][slot] = value - if da['value'] != value: - state_update['non-categorical'][-1].update({'fixed_value':da['value']}) - ret_diff_state[domain][slot] = da['value'] - - break - if found: - break - - if found: - break - - # from utterance - for _turn in prev_turns[::-1]: - _utt = _turn['utterance'] - (start, end), num = pharse_in_sen(str(value), _utt) - if num: - assert value.lower() == _utt[start:end].lower() \ - or digit2word[value].lower() == _utt[start:end].lower() - found = True - state_update['non-categorical'].append({ - 'domain': domain, - 'slot': slot, - 'value': diff_state[domain][slot], - # 'value': _utt[start:end].lower(), - # 'fixed_value': _utt[start:end].lower(), - 'from': 'prev_utt', - 'utt_idx': _turn['utt_idx'], - 'start': start, - 'end': end - }) - if value != diff_state[domain][slot]: - state_update['non-categorical'][-1].update({'fixed_value': value}) - ret_diff_state[domain][slot] = value - if value != _utt[start:end].lower(): - state_update['non-categorical'][-1].update({'fixed_value': _utt[start:end].lower()}) - ret_diff_state[domain][slot] = _utt[start:end].lower() - found = True - break - if found: - break - - # from utterance - if not value.isdigit(): - for _turn in prev_turns[::-1]: - _utt = _turn['utterance'] - - s = difflib.SequenceMatcher(None, _utt, value) - matches = s.get_matching_blocks() - - for i, j, n in matches: - possible_value = _utt[i: i+len(value)] - - if i+ len(value) < len(_utt) and _utt[i+len(value)] not in [ ' ', ',', '.', '?', '!', '/'] : - possible_value += _utt[i+len(value):].split()[0] - - if possible_value.startswith('th '): - possible_value = possible_value[3:] - i += 3 - if i > 0 and _utt[i-1] not in [ ' ', ',', '.', '?', '!', '/']: - # cut first incomplete word - if len(possible_value.split()) > 1: - i += len(possible_value.split()[0]) + 1 - possible_value = ' '.join(possible_value.split()[1:]) - - - # prepend first incomplete word - # possible_value = _utt[:i].split()[-1] + possible_value - # i -= len(_utt[:i].split()[-1]) - - - if fuzz.token_sort_ratio(value, possible_value) > 92 or possible_value.startswith('ashley hotel and lovell lodge') : - found = True - - state_update['non-categorical'].append({ - 'domain': domain, - 'slot': slot, - 'value': diff_state[domain][slot], - # 'value': possible_value, - # 'fixed_value': possible_value, - 'from':'prev_utt', - 'utt_idx': _turn['utt_idx'], - 'start': i, - 'end': i+len(possible_value) - }) - if value != diff_state[domain][slot]: - state_update['non-categorical'][-1].update({'fixed_value': value}) - ret_diff_state[domain][slot] = value - if possible_value != value: - state_update['non-categorical'][-1].update({'fixed_value': possible_value}) - ret_diff_state[domain][slot] = possible_value - break - # assert _utt[i:i+len(possible_value)] == possible_value, print(_utt, _utt[i:i+len(possible_value)], possible_value) - # break - # if not possible_value == value: - # print(3333333333333333) - # print(value) - # print(possible_value) - if found: - break - if found: + # span mismatch, search near 1-2 words + for window in range(1,3): + start = max(0,start_word-window) + end = min(len(token_spans)-1,end_word+window) + large_span = utt[token_spans[start][0]:token_spans[end][1]].lower() + if value in large_span: + start_char = token_spans[start][0] + large_span.index(value) + end_char = start_char + len(value) + assert utt[start_char:end_char].lower() == value, f'{[value, utt[start_char:end_char], utt]}' + cnt_domain_slot[f'window={window}'] += 1 break - - if found: - break - if not found: - # print('3333333333333333333') - # print(did, tid) - # print(domain, slot, value) - # print([_t['utterance'] for _t in prev_turns]) - # assert slot not in ['internet', 'parking', 'pricerange', 'day', 'area', 'stars'] - - if (domain, slot) not in slot_notfound_dict: - slot_notfound_dict[(domain, slot)] = 1 - else: - slot_notfound_dict[(domain, slot)] += 1 - state_update['non-categorical'].append({ - 'domain': domain, - 'slot': slot, - 'value': diff_state[domain][slot], - 'fixed_value': 'not found' - }) - ret_diff_state[domain][slot] = 'not found' - notfoundnum += 1 - return state_update, notfoundnum, total_value, ret_diff_state - - -def merge_data_annotation(): - extract_dir = os.path.join(self_dir, 'original_data') - data21 = json.load(open(os.path.join(self_dir, extract_dir, 'data.json'))) - # data21_train = json.load(open(os.path.join(self_dir, extract_dir, 'train.json'))) - # data21_val = json.load(open(os.path.join(self_dir, extract_dir, 'val.json'))) - # data21_test = json.load(open(os.path.join(self_dir, extract_dir, 'test.json'))) - # data21 = {} - # data21.update(data21_train) - # data21.update(data21_val) - # data21.update(data21_test) - - # update_from_25_cnt = 0 - # total_turn = 0 - # for dial_id, dialog in data21.items(): - # dial_id = dial_id + '.json' - # assert dial_id in data25 - # for i, _turn in enumerate(dialog['log']): - # total_turn += 1 - # if _turn['text'] == data25[dial_id]['log'][i]['text']: - # _turn['span_info'].extend(copy.deepcopy(data25[dial_id]['log'][i]['span_info'])) - # # _turn['span_info'] = list(set(_turn['span_info'])) - # # _turn['dialog_act'].update(copy.deepcopy(data25[dial_id]['log'][i]['dialog_act'])) - # for Domain_Intent in data25[dial_id]['log'][i]['dialog_act']: - # if Domain_Intent in _turn['dialog_act']: - # _turn['dialog_act'][Domain_Intent].extend(data25[dial_id]['log'][i]['dialog_act'][Domain_Intent]) - # else: - # _turn['dialog_act'][Domain_Intent] = copy.deepcopy(data25[dial_id]['log'][i]['dialog_act'][Domain_Intent]) - # # _turn['dialog_act'][Domain_Intent] = list(set(_turn['dialog_act'][Domain_Intent])) - # if 'coreference' in data25[dial_id]['log'][i]: - # _turn['coreference'] = copy.deepcopy(data25[dial_id]['log'][i]['coreference']) - # update_from_25_cnt += 1 - # else: - # # print('==============multiwoz21=================') - # # print(_turn['text']) - # # print('==============multiwoz25=================') - # # print(data25[dial_id]['log'][i]['text']) - # continue - # - # print('{}/{} turns update from multiwoz25 data'.format(update_from_25_cnt, total_turn)) - return data21 - - -def preprocess(da_cat_slot_values, state_cat_slot_values): - all_data = [] - binary_ont = [] - intent_ont = {} - state_ont = {} - - data_splits = ['train', 'val', 'test'] - # data_splits = ['test'] - extract_dir = os.path.join(self_dir, 'original_data') - num_train_dialogue = 0 - num_train_utt = 0 - - num_match_error_da_span = 0 - - if not os.path.exists('data.zip') or not os.path.exists('ontology.json'): - # for data_split in data_splits: - # data_zip_file = os.path.join(self_dir, 'original_data.zip') - # if not os.path.exists(data_zip_file): - # raise FileNotFoundError(data_zip_file) - - # logging.info('unzip multiwoz data to {}'.format(extract_dir)) - # archive = zipfile.ZipFile(data_zip_file, 'r') - # archive.extractall(extract_dir) - - data = merge_data_annotation() - # exit() - # data = json.load(open(os.path.join(self_dir, extract_dir, 'data_meta_fixed.json'))) - train_list = open(os.path.join(self_dir, extract_dir, 'trainListFile')).read().split() - val_list = open(os.path.join(self_dir, extract_dir, 'valListFile')).read().split() - test_list = open(os.path.join(self_dir, extract_dir, 'testListFile')).read().split() - - total_not_found_slot = 0 - total_slot = 0 - total_turn = 0 - total_not_found_turn = 0 - total_not_found_state = 0 - - slot_notfound_dict = {} - - dialog_idx = 0 - for dialog_id, dialog in tqdm(data.items()): - - acc_not_found_flag = False - - coref_dict = {} - - data_split = None - for _split in data_splits: - if dialog_id.strip('.json') in eval(_split + '_list'): - data_split = _split - break - # assert data_split is not None - # if data_split != 'test': - # continue - if data_split is None: - continue - - if data_split == 'train': - num_train_dialogue += len(data) - - dialog_idx += 1 - # if dialog_idx > 10: - # break - converted_dialogue = { - 'dataset': 'multiwoz21', - 'data_split': data_split, - 'dialogue_id': 'multiwoz21_' + str(dialog_idx), - 'original_id': dialog_id, - 'domains': [d for d in dialog['goal'] if - len(dialog['goal'][d]) != 0 and d in multiwoz_desc and d not in ['police', 'hospital', 'bus']], - 'turns': [], - } - - if data_split == 'train': - num_train_utt += len(dialog['log']) - - prev_state = None - accum_fixed_state = {} - for turn_id, turn in enumerate(dialog['log']): - - utt = turn['text'].lower() - # for several wrong words - utt = utt.replace('seeuni', 'see uni') - - utt = ' '.join(utt.split()) - utt = utt.replace(' im ', ' i\'m ') - utt = utt.replace(' dont ', ' don\'t ') - utt = utt.replace(' thats ', ' that\'s ') - utt = utt.replace('idon\'t', ' i don\'t ') - utt = utt.replace('wedon\'t ', 'we don\'t ') - utt = utt.replace('id be ', 'i\'d be ') - # utt = utt.replace('cambridgethat\'svery ', 'cambridge that\'s very') - utt = re.sub(r'^im ', 'i\'m ', utt) - utt = re.sub(r'^whats ', 'what\'s ', utt) - utt = re.sub(r'^id ', 'i\'d ', utt) - utt = re.sub(r'^thats ', 'that\'s ', utt) - - utt = re.sub( r'([a-zA-Z0-9])([,.!\'-\?"~])', r'\1 \2', utt) - utt = re.sub(r'([,.!\'-\?"~])([a-zA-Z0-9])', r'\1 \2', utt) - - das = turn.get('dialog_act', []) - role = 'user' if turn_id % 2 == 0 else 'system' - spans = turn.get('span_info', []) - - da_dict = {} - for Domain_Act in das: - Domain = Domain_Act.split('-')[0] - if Domain.lower() not in converted_dialogue['domains'] and Domain.lower() not in ['general', 'booking']: - continue - - Svs = das[Domain_Act] - for S, v in Svs: - v = v.lower() - if v.startswith('th '): - # print(v) - v = v[3:] - if v.startswith('he '): - # print(v) - v = v[3:] - - if (Domain_Act, S, v) not in da_dict: - da_dict[(Domain_Act, S, v)] = {} - - for span in spans: - Domain_Act, S, v, start_word, end_word = span - v = v.lower() - if not (Domain_Act, S, v) in da_dict: - # logging.info('span da annotation not found in multiwoz da label') - # logging.info(dialog_id, turn_id) - # logging.info((Domain_Act, S, v)) - # logging.info(da_dict) - num_match_error_da_span += 1 else: - if v.startswith('th '): - # print(v) - v = v[3:] - start_word += 3 - if v.startswith('he '): - # print(v) - v = v[3:] - start_word += 3 - - if 'start_word' not in da_dict[(Domain_Act, S, v)]: - da_dict[(Domain_Act, S, v)]['start_word'] = [] - da_dict[(Domain_Act, S, v)]['end_word'] = [] - - da_dict[(Domain_Act, S, v)]['start_word'].append(start_word) - da_dict[(Domain_Act, S, v)]['end_word'].append(end_word) - - converted_turn = { - 'utt_idx': turn_id, - 'speaker': role, - 'utterance': utt, - 'dialogue_act': convert_da(utt, da_dict, binary_ont, intent_ont, dialog_id, turn_id, da_cat_slot_values), - } - - # for state annotations - if role == 'system': - turn_state = turn['metadata'] - cur_state = {} - for domain in turn_state: - if domain in ['police', 'hospital', 'bus']: - continue - if domain not in converted_dialogue['domains']: - continue - cur_state[domain] = {} - for subdomain in ['semi', 'book']: - for slot in turn_state[domain][subdomain]: - if slot == 'booked': - continue - if slot == 'ticket': # or (domain == 'train' and slot == 'people'): - # for cases where domain slot exists in REF but not in state - # because of check in evaluate.py - continue - - else: - fixed_slot = slot - state_ds = domain + '-' + fixed_slot - if state_ds not in slot_to_type: - logging.info('state slot not defined in da list') - logging.info(state_ds) - if turn_state[domain][subdomain][slot] in ['', [], 'not mentioned', 'none']: - cur_state[domain][fixed_slot] = "" - else: - if turn_state[domain][subdomain][slot].startswith('th '): - # print('state') - # print(turn_state[domain][subdomain][slot]) - turn_state[domain][subdomain][slot] = turn_state[domain][subdomain][slot][3:] - if turn_state[domain][subdomain][slot].startswith('he '): - # print('state') - # print(turn_state[domain][subdomain][slot]) - turn_state[domain][subdomain][slot] = turn_state[domain][subdomain][slot][3:] - - cur_state[domain][fixed_slot] = turn_state[domain][subdomain][slot] - - if domain not in state_ont: - state_ont[domain] = [] - if fixed_slot not in state_ont[domain]: - state_ont[domain].append(fixed_slot) - - if domain == 'train' and 'people' not in cur_state[domain]: - cur_state[domain]['people'] = '' - # if len(converted_turn['state'][domain]) == 0: - # converted_turn['state'].pop(domain) - if len(converted_dialogue['turns']) > 0: - # move state from system side to user side - converted_dialogue['turns'][-1]['state'] = copy.deepcopy(cur_state) - - # for state update annotations - state_update, _notfoundslot, _totalslot, ret_diff_state = get_state_update(prev_state, cur_state, converted_dialogue, - dialog_id, turn_id, turn['text'], coref_dict, - slot_notfound_dict, da_cat_slot_values) - - update_state(accum_fixed_state, ret_diff_state) - for domain in accum_fixed_state: - for slot in accum_fixed_state[domain]: - assert isinstance(accum_fixed_state[domain][slot], str), print(accum_fixed_state[domain][slot]) - - if _notfoundslot == 0: - # for slot in state_update['categorical']: - # assert 'fixed_value' not in slot - for slot in state_update['non-categorical']: - if slot['value'] not in ['', 'dontcare']: - assert 'utt_idx' in slot - - else: - flag = False - for slot in state_update['categorical']: - if 'fixed_value' in slot: - flag = True - break - for slot in state_update['non-categorical']: - if 'utt_idx' not in slot: - flag = True - break - assert flag, print(flag, state_update['non-categorical']) - - total_turn += 1 - total_slot += _totalslot - total_not_found_slot += _notfoundslot - total_not_found_turn += 1 if _notfoundslot > 0 else 0 - if _notfoundslot > 0: - acc_not_found_flag = True - if acc_not_found_flag: - total_not_found_state += 1 - - coref_dict = {} - converted_dialogue['turns'][-1]['state_update'] = copy.deepcopy(state_update) - converted_dialogue['turns'][-1]['fixed_state'] = copy.deepcopy(accum_fixed_state) - if 'state' not in converted_dialogue['turns'][-1]: - converted_dialogue['turns'][-1]['state'] = {} - prev_state = copy.deepcopy(cur_state) - - converted_dialogue['turns'].append(converted_turn) - - if 'coreference' in turn: - for Domain_Act in turn['coreference']: - for Slot, value, coref, coref_turn, coref_pos in turn['coreference'][Domain_Act]: - value = value.lower() - coref_dict[(Domain_Act, Slot, value)] = {'turn': coref_turn, 'pos': coref_pos, - 'coref_value': coref, - 'utt': converted_dialogue['turns'][coref_turn][ - 'utterance']} - - check_spans(converted_dialogue) - # postprocess_update_spans(converted_dialogue) - if converted_dialogue['turns'][-1]['speaker'] == 'system': - converted_dialogue['turns'].pop(-1) - all_data.append(converted_dialogue) - - print('total_turn', total_turn) - print('total_not_found_turn', total_not_found_turn) - print('total_slot', total_slot) - print('total_not_found_slot', total_not_found_slot) - print('total_not_found_state', total_not_found_state) - print(slot_notfound_dict) - from collections import Counter - # print({k : dict(Counter(v)) for k, v in cat_slot_values.items()}) - json.dump({k : dict(Counter(v)) for k, v in state_cat_slot_values.items()}, open(os.path.join(self_dir, 'cat_slot_values.json'), 'w'), indent=4) - cat_slot_values = {k: list(set(v)) for k, v in state_cat_slot_values.items()} - da_cat_slot_values = {k: list(set(v)) for k, v in da_cat_slot_values.items()} - - json.dump(all_data, open('data.json', 'w'), indent=4) - write_zipped_json(os.path.join(self_dir, './data.zip'), 'data.json') - os.remove('data.json') - - new_ont = { - 'domains': {}, - 'intents': {}, - 'binary_dialogue_act': {} - } - - for d_s in slot_to_type: - d, s = d_s.split('-') - if d not in new_ont['domains']: - new_ont['domains'][d] = { - 'description': multiwoz_desc[d]['domain'], - 'slots': {} - } - domain_ont = new_ont['domains'][d] - assert s not in domain_ont - domain_ont['slots'][s] = { - 'description': multiwoz_desc[d][s] if s in multiwoz_desc[d] else '', - 'is_categorical': d_s in state_cat_slot_ds, - 'possible_values': da_cat_slot_values[d_s] if d_s in state_cat_slot_ds else [] - } - domain_ont['slots'][s]['possible_values'] = [_ for _ in domain_ont['slots'][s]['possible_values'] if _ not in ['dontcare', '']] - - new_ont['state'] = {} - # print(state_cat_slot_value_dict) - print(state_ont) - for d in state_ont: - new_ont['state'][d] = {} - for s in state_ont[d]: - d_s = '-'.join([d, s]) - new_ont['state'][d][s] = '' - - new_ont['intents'] = {i: {'description': multiwoz_desc['intents'][i]} for i in intent_ont} - new_ont['binary_dialogue_act'] = binary_ont - - slot_desc = json.load(open(os.path.join(self_dir, extract_dir, './slot_descriptions.json'))) - for domain_slot in slot_desc: - _domain, _slot = domain_slot.split('-') - _desc = slot_desc[domain_slot][0] - if _slot == 'arriveby': - _slot = 'arriveBy' - elif _slot == 'leaveat': - _slot = 'leaveAt' - if 'book' in _slot: - _slot = _slot.replace('book ', '') - if not _domain in new_ont['state']: - # logging.info('domain {} not in state domains'.format(_domain)) - continue - if _domain in new_ont['domains'] and _slot in new_ont['domains'][_domain]['slots']: - new_ont['domains'][_domain]['slots'][_slot]['description'] = _desc - if not _slot in new_ont['state'][_domain]: - logging.info('domain {} slot {} not in state'.format(_domain, _slot)) - continue - # new_ont['state'][_domain][_slot] = "" - assert _domain in new_ont['domains'], print(_domain) - assert _slot in new_ont['domains'][_domain]['slots'] - - logging.info('num_match_error_da_span {}'.format(num_match_error_da_span)) - json.dump(new_ont, open(os.path.join(self_dir, './ontology.json'), 'w'), indent=4) - - else: - all_data = read_zipped_json(os.path.join(self_dir, './data.zip'), 'data.json') - new_ont = json.load(open(os.path.join(self_dir, './ontology.json'), 'r')) - logging.info('# dialogue: {}, # turn: {}'.format(num_train_dialogue, num_train_utt)) - return all_data, new_ont - - -# def postprocess_update_spans(dialog): -# changed_utt_idx_and_position = {} -# for turn in dialog['turns']: -# if turn['speaker'] != 'user': -# continue -# changed = False -# for _update in turn['state_update']['non-categorical']: -# if 'utt_idx' in _update: -# utt_idx = _update['utt_idx'] -# start = _update['start'] -# end = _update['end'] -# -# # assume at most one word changes for every utterance -# if turn['utt_idx'] not in changed_utt_idx_and_position: -# if utt_idx == turn['utt_idx'] and start-1 > -1 and turn['utterance'][start-1] not in [' ']: -# changed_utt_idx_and_position[turn['utt_idx']] = start -# print('=======================') -# print(dialog['original_id']) -# print(turn['utterance']) -# print(json.dumps(_update, indent=2)) -# print(turn['utterance'][start: end]) -# turn['utterance'] = turn['utterance'][:start] + ' ' + turn['utterance'][start:] -# print(turn['utterance']) -# _update['start'] += 1 -# _update['end'] += 1 -# changed = True -# if utt_idx not in changed_utt_idx_and_position: -# continue -# else: -# value = _update['fixed_value'] if 'fixed_value' in _update and _update['fixed_value'] != 'not found' else _update['value'] -# if start >= changed_utt_idx_and_position[utt_idx]: -# if dialog['turns'][utt_idx]['utterance'][_update['start']: _update['end']] != value: -# assert dialog['turns'][utt_idx]['utterance'][_update['start']+1: _update['end']+1] == value, print(dialog['turns'][utt_idx]['utterance'], dialog['turns'][utt_idx]['utterance'][_update['start']+1: _update['end']+1]) -# _update['start'] += 1 -# _update['end'] += 1 -# elif start < changed_utt_idx_and_position[utt_idx] < end: -# if dialog['turns'][utt_idx]['utterance'][_update['start']: _update['end']] != value: -# assert (dialog['turns'][utt_idx]['utterance'][_update['start']: _update['end']+1]).replace(' ', '') == value.replace(' ', ''), print(dialog['turns'][utt_idx]['utterance'], dialog['turns'][utt_idx]['utterance'][_update['start']: _update['end']+1], value) -# print('fix') -# print(_update) -# _update['end'] += 1 -# _update['fixed_value'] = turn['utterance'][_update['start']: _update['end'] + 1].strip() -# print(_update) -# if changed: -# for _update in turn['state_update']['non-categorical']: -# if 'utt_idx' in _update: -# utt_idx = _update['utt_idx'] -# start = _update['start'] -# end = _update['end'] -# -# if utt_idx not in changed_utt_idx_and_position: -# continue -# else: -# value = _update['fixed_value'] if 'fixed_value' in _update and _update[ -# 'fixed_value'] != 'not found' else _update['value'] -# if start >= changed_utt_idx_and_position[utt_idx]: -# if dialog['turns'][utt_idx]['utterance'][_update['start']: _update['end']] != value: -# assert dialog['turns'][utt_idx]['utterance'][_update['start'] + 1: _update['end'] + 1] == value -# _update['start'] += 1 -# _update['end'] += 1 -# elif start < changed_utt_idx_and_position[utt_idx] < end: -# if dialog['turns'][utt_idx]['utterance'][_update['start']: _update['end']] != value: -# print('====================fix===================') -# print(_update) -# assert (dialog['turns'][utt_idx]['utterance'][_update['start']: _update['end']+1]).replace(' ', '') == value.replace(' ', ''), print(dialog['turns'][utt_idx]['utterance'], dialog['turns'][utt_idx]['utterance'][_update['start']+1: _update['end']+1]) -# _update['end'] += 1 -# _update['fixed_value'] = dialog['turns'][utt_idx]['utterance'][_update['start']: _update['end'] + 1] -# print(_update) -# for turn in dialog['turns']: -# if turn['speaker'] != 'user': -# continue -# for _update in turn['state_update']['non-categorical']: -# if 'utt_idx' in _update: -# value = _update['fixed_value'] if 'fixed_value' in _update and _update[ -# 'fixed_value'] != 'not found' else _update['value'] -# utt_idx = _update['utt_idx'] -# start = _update['start'] -# end = _update['end'] -# if dialog['turns'][utt_idx]['utterance'][start] == ' ': -# _update['start'] += 1 -# _update['fixed_value'] = value[1:] -# value = value[1:] -# start += 1 -# assert dialog['turns'][utt_idx]['utterance'][start: end] == value, print(json.dumps(turn, indent=4), [c for c in dialog['turns'][utt_idx]['utterance'][start: end]], [c for c in value]) -# return dialog - + # still not found + match = False + + if match: + converted_da['non-categorical'][-1]['value'] = utt[start_char:end_char] + converted_da['non-categorical'][-1]['start'] = start_char + converted_da['non-categorical'][-1]['end'] = end_char + cnt_domain_slot['have span'] += 1 + else: + cnt_domain_slot['no span'] += 1 + return converted_da -def get_time_variants(time_text): - value_list = [time_text] - pattern_time = r'(\d{1,2}:\d{2})(\s)?(am|pm|AM|PM)?' - match_times = re.findall(pattern_time, time_text) - if len(match_times) < 1: - return [] - match_time = match_times[0] +def preprocess(): + original_data_dir = 'MultiWOZ_2.1' + new_data_dir = 'data' - am_flag = match_time[2] in ['am', 'AM'] - pm_flag = match_time[2] in ['pm', 'PM'] - no_am_pm_flag = match_time[2] == '' - if am_flag: - # 4:00am -> 4:00 - value_list.append(match_time[0]) - if len(match_time[0]) == 4: - # 4:00 -> 04:00 - value_list.append('0' + match_time[0]) - if pm_flag: - # 4:00pm -> 16:00 - hour, min = match_time[0].split(':') - hour = int(hour) - new_hour = 12 + hour - value_list.append(str(new_hour)+':'+min) - if no_am_pm_flag: - hour, min = match_time[0].split(':') - hour = int(hour) - if hour > 12: - new_hour = hour - 12 - value_list.append(str(new_hour) + ':' + min + 'pm') - value_list.append(str(new_hour) + ':' + min + ' pm') - value_list.append(str(new_hour) + ':' + min) - if min == '00': - value_list.append(str(new_hour) + 'pm') - value_list.append(str(new_hour) + ' pm') - value_list.append(str(new_hour)) + if not os.path.exists(original_data_dir): + # should manually download MultiWOZ_2.1.zip from + # https://github.com/budzianowski/multiwoz/blob/master/data/MultiWOZ_2.1.zip + original_data_zip = 'MultiWOZ_2.1.zip' + if not os.path.exists(original_data_zip): + raise FileNotFoundError(f'cannot find original data: {original_data_zip}') else: - value_list.append(str(hour) + ':' + min + 'am') - value_list.append(str(hour) + ':' + min + ' am') - value_list.append(str(hour) + ':' + min) - if min == '00': - value_list.append(str(hour) + 'am') - value_list.append(str(hour) + ' am') - value_list.append(str(hour)) - if len(match_time[0]) == 5 and match_time[0][0] == '0': - value_list.append(match_time[0][1:]) - value_list.append(''.join(match_time[0].split(':'))) - - return value_list - - -def get_genitive_variants(value): - ret_list = [] - value_genitive_format = r"(?=\w)s(?=\s)" - value_pattern = re.compile(value_genitive_format) - - span_genitive_value = re.sub(value_pattern, " 's", value) - if span_genitive_value != value: - ret_list.append(span_genitive_value) - span_genitive_value = re.sub(value_pattern, "'s", value) - if span_genitive_value != value: - ret_list.append(span_genitive_value) - # if len(ret_list) > 0: - # print('=============================') - # print(value) - # print(re.findall(value_pattern, value)) - # print(ret_list) - return ret_list - - -def check_spans(dialog): - for turn in dialog['turns']: - if turn['speaker'] != 'user': - continue - for _update in turn['state_update']['non-categorical']: - if 'utt_idx' in _update: - value = _update['fixed_value'] if 'fixed_value' in _update and _update[ - 'fixed_value'] != 'not found' else _update['value'] - utt_idx = _update['utt_idx'] - start = _update['start'] - end = _update['end'] - assert dialog['turns'][utt_idx]['utterance'][start:end] == value, print(dialog['turns'][utt_idx]['utterance'], dialog['turns'][utt_idx]['utterance'][start:end]) - + archive = ZipFile(original_data_zip) + archive.extractall() + + os.makedirs(new_data_dir, exist_ok=True) + for filename in os.listdir(original_data_dir): + if 'db' in filename: + copy2(f'{original_data_dir}/{filename}', new_data_dir) + + original_data = json.load(open(f'{original_data_dir}/data.json')) + global init_ontology, cnt_domain_slot + + val_list = set(open(f'{original_data_dir}/valListFile.txt').read().split()) + test_list = set(open(f'{original_data_dir}/testListFile.txt').read().split()) + dataset = 'multiwoz21' + splits = ['train', 'validation', 'test'] + dialogues_by_split = {split:[] for split in splits} + sent_tokenizer = PunktSentenceTokenizer() + word_tokenizer = TreebankWordTokenizer() + for ori_dialog_id, ori_dialog in tqdm(original_data.items()): + if ori_dialog_id in val_list: + split = 'validation' + elif ori_dialog_id in test_list: + split = 'test' + else: + split = 'train' + dialogue_id = f'{dataset}-{split}-{len(dialogues_by_split[split])}' + + # get user goal and involved domains + cur_domains = [] + goal = { + 'description': '. '.join(ori_dialog['goal']['message']), + 'inform': {}, + 'request': {} + } + for k, v in ori_dialog['goal'].items(): + if len(v) != 0 and k in init_ontology['domains']: + cur_domains.append(k) + goal['inform'][k] = {} + goal['request'][k] = {} + for attr in ['fail_info', 'info', 'fail_book', 'book']: + if attr in v: + for slot, value in v[attr].items(): + if 'invalid' in slot: + continue + domain, slot, value = normalize_domain_slot_value(k, slot, value) + if slot in goal['inform'][domain]: + goal['inform'][domain][slot] += '|'+value + else: + goal['inform'][domain][slot] = value + if 'reqt' in v: + for slot in v['reqt']: + domain, slot, _ = normalize_domain_slot_value(k, slot, '') + goal['request'][domain][slot] = '' + + dialogue = { + 'dataset': dataset, + 'data_split': split, + 'dialogue_id': dialogue_id, + 'original_id': ori_dialog_id, + 'domains': cur_domains, # will be updated by dialog_acts and state + 'goal': goal, + 'turns': [] + } + for turn_id, turn in enumerate(ori_dialog['log']): + # correct some grammar error in text, mainly follow tokenization.md in MultiWOZ_2.1 + text = turn['text'] + text = re.sub(" Im ", " I'm ", text) + text = re.sub(" im ", " i'm ", text) + text = re.sub(r"^Im ", "I'm ", text) + text = re.sub(r"^im ", "i'm ", text) + text = re.sub("theres", "there's", text) + text = re.sub("dont", "don't", text) + text = re.sub("whats", "what's", text) + text = re.sub('thats', "that's", text) + utt = text + speaker = 'user' if turn_id % 2 == 0 else 'system' + + das = turn.get('dialog_act', []) + spans = turn.get('span_info', []) + da_dict = {} + # transform DA + for Domain_Intent in das: + domain, intent = Domain_Intent.lower().split('-') + assert intent in init_ontology['intents'], f'{ori_dialog_id}:{turn_id}:da\t{intent} not in ontology' + for Slot, value in das[Domain_Intent]: + domain, slot, value = normalize_domain_slot_value(domain, Slot, value) + if domain not in cur_domains: + # update original cur_domains + cur_domains.append(domain) + da_dict[(intent, domain, slot, value,)] = [] + + for span in spans: + Domain_Intent, Slot, value, start_word, end_word = span + domain, intent = Domain_Intent.lower().split('-') + domain, slot, value = normalize_domain_slot_value(domain, Slot, value) + assert (intent, domain, slot, value,) in da_dict + da_dict[(intent, domain, slot, value,)] = [start_word, end_word] + + dialogue_acts = convert_da(da_dict, utt, sent_tokenizer, word_tokenizer) # will also update ontology + + dialogue['turns'].append({ + 'speaker': speaker, + 'utterance': utt, + 'utt_idx': turn_id, + 'dialogue_acts': dialogue_acts, + }) -def get_bb_variants(value): - ret_list = [] - if 'bed and breakfast' in value: - ret_list.append(value.replace('bed and breakfast', 'b & b')) - return ret_list + if speaker == 'system': + # add state to last user turn + # add empty db_results + turn_state = turn['metadata'] + cur_state = copy.deepcopy(init_ontology['state']) + for domain in turn_state: + if domain not in cur_state: + continue + for subdomain in ['semi', 'book']: + for slot, value in turn_state[domain][subdomain].items(): + if slot in ['booked', 'ticket']: + continue + _, slot, value = normalize_domain_slot_value(domain, slot, value) + cur_state[domain][slot] = value + dialogue['turns'][-2]['state'] = cur_state + dialogue['turns'][-1]['db_results'] = {} + dialogues_by_split[split].append(dialogue) + # pprint(cnt_domain_slot.most_common()) + dialogues = [] + for split in splits: + dialogues += dialogues_by_split[split] + init_ontology['binary_dialogue_acts'] = [{'intent':bda[0],'domain':bda[1],'slot':bda[2],'value':bda[3]} for bda in init_ontology['binary_dialogue_acts']] + json.dump(dialogues[:10], open(f'dummy_data.json', 'w'), indent=2) + json.dump(dialogues, open(f'{new_data_dir}/dialogues.json', 'w'), indent=2) + json.dump(init_ontology, open(f'{new_data_dir}/ontology.json', 'w'), indent=2) + with ZipFile('data.zip', 'w', ZIP_DEFLATED) as zf: + for filename in os.listdir(new_data_dir): + zf.write(f'{new_data_dir}/{filename}') + return dialogues, init_ontology if __name__ == '__main__': - preprocess(da_cat_slot_values, state_cat_slot_values) \ No newline at end of file + preprocess() \ No newline at end of file