diff --git a/data/unified_datasets/multiwoz21/README.md b/data/unified_datasets/multiwoz21/README.md index d8fbcf9b5b5fed37728cbe63f97c0c28b222fc1d..1dadc8c32d7a198ace917598d71f82cf232d6138 100644 --- a/data/unified_datasets/multiwoz21/README.md +++ b/data/unified_datasets/multiwoz21/README.md @@ -19,6 +19,7 @@ MultiWOZ 2.1 fixed the noise in state annotations and dialogue utterances. It al - Correct some non-categorical slots' values and provide character level span annotation. - Concatenate multiple values in user goal & state using `|`. - Add `booked` information in system turns from original belief states. + - Remove `Booking` domain and remap all booking relevant dialog acts to unify the annotation of booking action in different domains, see `booking_remapper.py`. - **Annotations:** - user goal, dialogue acts, state. @@ -34,14 +35,14 @@ English | split | dialogues | utterances | avg_utt | avg_tokens | avg_domains | cat slot match(state) | cat slot match(goal) | cat slot match(dialogue act) | non-cat slot span(dialogue act) | |------------|-------------|--------------|-----------|--------------|---------------|-------------------------|------------------------|--------------------------------|-----------------------------------| -| train | 8438 | 113556 | 13.46 | 13.23 | 3.39 | 98.84 | 99.48 | 86.39 | 98.22 | -| validation | 1000 | 14748 | 14.75 | 13.5 | 3.64 | 98.84 | 99.46 | 86.59 | 98.17 | -| test | 1000 | 14744 | 14.74 | 13.5 | 3.59 | 99.21 | 99.32 | 85.83 | 98.58 | -| all | 10438 | 143048 | 13.7 | 13.28 | 3.44 | 98.88 | 99.47 | 86.36 | 98.25 | +| train | 8438 | 113556 | 13.46 | 13.23 | 2.8 | 98.84 | 99.48 | 86.39 | 98.22 | +| validation | 1000 | 14748 | 14.75 | 13.5 | 2.98 | 98.84 | 99.46 | 86.59 | 98.17 | +| test | 1000 | 14744 | 14.74 | 13.5 | 2.93 | 99.21 | 99.32 | 85.83 | 98.58 | +| all | 10438 | 143048 | 13.7 | 13.28 | 2.83 | 98.88 | 99.47 | 86.35 | 98.25 | -9 domains: ['attraction', 'hotel', 'taxi', 'restaurant', 'train', 'police', 'hospital', 'booking', 'general'] -- **cat slot match**: how many values of categorical slots are in the possible values of ontology. -- **non-cat slot span**: how many values of non-categorical slots have span annotation. +8 domains: ['attraction', 'hotel', 'taxi', 'restaurant', 'train', 'police', 'hospital', 'general'] +- **cat slot match**: how many values of categorical slots are in the possible values of ontology in percentage. +- **non-cat slot span**: how many values of non-categorical slots have span annotation in percentage. ### Citation diff --git a/data/unified_datasets/multiwoz21/data.zip b/data/unified_datasets/multiwoz21/data.zip index 95c1ebbf0794eaad22b945c5ad4d55e7916f41f7..d62ee13bcf3f266f18761e9bf2110783167f5236 100644 Binary files a/data/unified_datasets/multiwoz21/data.zip and b/data/unified_datasets/multiwoz21/data.zip differ diff --git a/data/unified_datasets/multiwoz21/dummy_data.json b/data/unified_datasets/multiwoz21/dummy_data.json index a012b907e3634e25eefbdbf8a2eb44ee0236b5ba..ec7aed39735bed82218038a31722b6fdf8e3aac8 100644 --- a/data/unified_datasets/multiwoz21/dummy_data.json +++ b/data/unified_datasets/multiwoz21/dummy_data.json @@ -6,7 +6,6 @@ "original_id": "SNG01856.json", "domains": [ "hotel", - "booking", "general" ], "goal": { @@ -202,11 +201,6 @@ } ], "binary": [ - { - "intent": "inform", - "domain": "booking", - "slot": "" - }, { "intent": "inform", "domain": "hotel", @@ -307,8 +301,8 @@ "categorical": [ { "intent": "nobook", - "domain": "booking", - "slot": "day", + "domain": "hotel", + "slot": "book day", "value": "Tuesday" } ], @@ -316,13 +310,13 @@ "binary": [ { "intent": "request", - "domain": "booking", + "domain": "hotel", "slot": "book stay" }, { "intent": "request", - "domain": "booking", - "slot": "day" + "domain": "hotel", + "slot": "book day" } ] }, @@ -404,8 +398,8 @@ "categorical": [], "non-categorical": [ { - "intent": "book", - "domain": "booking", + "intent": "inform", + "domain": "hotel", "slot": "ref", "value": "7GAWK763", "start": 46, @@ -417,6 +411,11 @@ "intent": "reqmore", "domain": "general", "slot": "" + }, + { + "intent": "book", + "domain": "hotel", + "slot": "" } ] }, @@ -989,7 +988,6 @@ "domains": [ "hotel", "train", - "booking", "general" ], "goal": { @@ -1389,7 +1387,7 @@ "categorical": [], "non-categorical": [ { - "intent": "offerbooked", + "intent": "inform", "domain": "train", "slot": "ref", "value": "A9NHSO9Y", @@ -1397,7 +1395,13 @@ "end": 69 } ], - "binary": [] + "binary": [ + { + "intent": "book", + "domain": "train", + "slot": "" + } + ] }, "db_results": {}, "booked": { @@ -1572,8 +1576,8 @@ "binary": [ { "intent": "request", - "domain": "booking", - "slot": "day" + "domain": "hotel", + "slot": "book day" } ] }, @@ -1675,15 +1679,21 @@ "categorical": [], "non-categorical": [ { - "intent": "book", - "domain": "booking", + "intent": "inform", + "domain": "hotel", "slot": "ref", "value": "5NAWGJDC", "start": 46, "end": 54 } ], - "binary": [] + "binary": [ + { + "intent": "book", + "domain": "hotel", + "slot": "" + } + ] }, "db_results": {}, "booked": { @@ -2179,8 +2189,7 @@ "domains": [ "hotel", "train", - "general", - "booking" + "general" ], "goal": { "description": "You are looking for information in Cambridge. You are looking for a <span class='emphasis'>place to stay</span>. The hotel should be in the <span class='emphasis'>east</span> and <span class='emphasis'>doesn't need to include internet</span>. The hotel should <span class='emphasis'>include free parking</span> and should be in the type of <span class='emphasis'>guesthouse</span>. If there is no such hotel, how about one that has <span class='emphasis'>free wifi</span>. Make sure you get <span class='emphasis'>address</span>. You are also looking for a <span class='emphasis'>train</span>. The train should leave on <span class='emphasis'>wednesday</span> and should <span class='emphasis'>arrive by 11:30</span>. The train should go to <span class='emphasis'>cambridge</span> and should depart from <span class='emphasis'>ely</span>. Once you find the train you want to make a booking for <span class='emphasis'>8 people</span>. Make sure you get the <span class='emphasis'>reference number</span>", @@ -2594,11 +2603,6 @@ } ], "binary": [ - { - "intent": "inform", - "domain": "booking", - "slot": "" - }, { "intent": "inform", "domain": "hotel", @@ -3024,7 +3028,7 @@ "categorical": [], "non-categorical": [ { - "intent": "offerbooked", + "intent": "inform", "domain": "train", "slot": "ref", "value": "7ZCVR4Q3", @@ -3032,7 +3036,7 @@ "end": 106 }, { - "intent": "offerbooked", + "intent": "inform", "domain": "train", "slot": "price", "value": "35.2 GBP", @@ -3040,7 +3044,13 @@ "end": 49 } ], - "binary": [] + "binary": [ + { + "intent": "book", + "domain": "train", + "slot": "" + } + ] }, "db_results": {}, "booked": { @@ -3153,7 +3163,6 @@ "domains": [ "hotel", "train", - "booking", "general" ], "goal": { @@ -3902,13 +3911,7 @@ "dialogue_acts": { "categorical": [], "non-categorical": [], - "binary": [ - { - "intent": "inform", - "domain": "booking", - "slot": "" - } - ] + "binary": [] }, "db_results": {}, "booked": { @@ -3987,7 +3990,7 @@ "binary": [ { "intent": "request", - "domain": "booking", + "domain": "hotel", "slot": "book stay" } ] @@ -4080,8 +4083,8 @@ "binary": [ { "intent": "request", - "domain": "booking", - "slot": "day" + "domain": "hotel", + "slot": "book day" } ] }, @@ -4161,16 +4164,16 @@ "categorical": [], "non-categorical": [ { - "intent": "book", - "domain": "booking", + "intent": "inform", + "domain": "hotel", "slot": "ref", "value": "04CSEO7Q", "start": 132, "end": 140 }, { - "intent": "book", - "domain": "booking", + "intent": "inform", + "domain": "hotel", "slot": "name", "value": "Acorn guest house", "start": 89, @@ -4182,6 +4185,11 @@ "intent": "reqmore", "domain": "general", "slot": "" + }, + { + "intent": "book", + "domain": "hotel", + "slot": "" } ] }, @@ -5778,7 +5786,7 @@ "dialogue_acts": { "categorical": [ { - "intent": "offerbooked", + "intent": "inform", "domain": "train", "slot": "day", "value": "Wednesday" @@ -5786,7 +5794,7 @@ ], "non-categorical": [ { - "intent": "offerbooked", + "intent": "inform", "domain": "train", "slot": "ref", "value": "xu1qlhvw", @@ -5794,7 +5802,7 @@ "end": 111 }, { - "intent": "offerbooked", + "intent": "inform", "domain": "train", "slot": "departure", "value": "Cambridge", @@ -5802,7 +5810,7 @@ "end": 51 }, { - "intent": "offerbooked", + "intent": "inform", "domain": "train", "slot": "book people", "value": "one", @@ -5810,7 +5818,7 @@ "end": 16 }, { - "intent": "offerbooked", + "intent": "inform", "domain": "train", "slot": "train id", "value": "TR1879", @@ -5818,7 +5826,7 @@ "end": 31 }, { - "intent": "offerbooked", + "intent": "inform", "domain": "train", "slot": "destination", "value": "Peterborough", @@ -5826,7 +5834,7 @@ "end": 68 }, { - "intent": "offerbooked", + "intent": "inform", "domain": "train", "slot": "leave at", "value": "13:06", @@ -5839,6 +5847,11 @@ "intent": "reqmore", "domain": "general", "slot": "" + }, + { + "intent": "book", + "domain": "train", + "slot": "" } ] }, diff --git a/data/unified_datasets/multiwoz21/preprocess.py b/data/unified_datasets/multiwoz21/preprocess.py index f48f06e70a8c53f4cac4b5ae8ed2e0077f24fbda..305e16d54d67ec0e345ffd002126fc76d342f8bd 100644 --- a/data/unified_datasets/multiwoz21/preprocess.py +++ b/data/unified_datasets/multiwoz21/preprocess.py @@ -432,49 +432,6 @@ ontology = { } } }, - "booking": { - "description": "booking for taxi, restaurant, hotel, train, etc.", - "slots":{ - "day": { - "description": "day of the booking", - "is_categorical": True, - "possible_values": [ - "monday", - "tuesday", - "wednesday", - "thursday", - "friday", - "saturday", - "sunday" - ] - }, - "time": { - "description": "time of the booking", - "is_categorical": False, - "possible_values": [] - }, - "book people": { - "description": "number of people for the booking", - "is_categorical": False, - "possible_values": [] - }, - "book stay": { - "description": "length of stay at the hotel", - "is_categorical": False, - "possible_values": [] - }, - "name": { - "description": "name of the booked entity", - "is_categorical": False, - "possible_values": [] - }, - "ref": { - "description": "reference number of the booking", - "is_categorical": False, - "possible_values": [] - } - } - }, "general":{ "description": "general domain without slots", "slots": {} @@ -769,7 +726,7 @@ def preprocess(): dialogues_by_split = {split:[] for split in splits} sent_tokenizer = PunktSentenceTokenizer() word_tokenizer = TreebankWordTokenizer() - booking_remapper = BookingActRemapper(init_ontology) + booking_remapper = BookingActRemapper(ontology) for ori_dialog_id, ori_dialog in tqdm(original_data.items()): if ori_dialog_id in val_list: split = 'validation'