diff --git a/data/unified_datasets/tm1/README.md b/data/unified_datasets/tm1/README.md index 6b1366f99c6064591d19f063aea386d9401b9ccc..ddc51e640a3cf29ee9db45ad01c628259108cb8d 100644 --- a/data/unified_datasets/tm1/README.md +++ b/data/unified_datasets/tm1/README.md @@ -16,9 +16,9 @@ The original dataset consists of 13,215 task-based dialogs, including 5,507 spok - Remove dialogs that are empty or only contain one speaker. - Split woz-dialogs into train/validation/test randomly (8:1:1). The split of self-dialogs is followed the original dataset. - Merge continuous turns by the same speaker (ignore repeated turns). - - Annotate `dialogue acts` according to the original segment annotations. Add `intent` annotation (inform/accept/reject). The type of `dialogue act` is set to `non-categorical` if the original segment annotation includes a specified `slot`. Otherwise, the type is set to `binary` (and the `slot` and `value` are empty) since it means general reference to a transaction, e.g. "OK your pizza has been ordered". + - Annotate `dialogue acts` according to the original segment annotations. Add `intent` annotation (inform/accept/reject). The type of `dialogue act` is set to `non-categorical` if the original segment annotation includes a specified `slot`. Otherwise, the type is set to `binary` (and the `slot` and `value` are empty) since it means a general reference to a transaction, e.g. "OK your pizza has been ordered". If multiple spans overlap, we keep only the shortest one, since we found that this simple strategy reduces the noise in the annotation. - Add `intent` and `slot` descriptions. - - Add `state` by accumulate dialog acts except those whose intents are **reject**. + - Add `state` by accumulating `non-categorical dialogue acts` in the order that they appear, except those whose intents are **reject**. - Keep the first annotation since each conversation was annotated by two workers. - **Annotations:** - dialogue acts, state. 
diff --git a/data/unified_datasets/tm1/data.zip b/data/unified_datasets/tm1/data.zip index ef44348839377a2609eb06197418518bc4df0e89..0826bdd7897abda7db1910016a0032db8b24885a 100644 Binary files a/data/unified_datasets/tm1/data.zip and b/data/unified_datasets/tm1/data.zip differ diff --git a/data/unified_datasets/tm1/dummy_data.json b/data/unified_datasets/tm1/dummy_data.json index 9cffe8bbb254b2437c70e4f9837823cd93e139cf..982375d753b698cd6b427cf6c9e3e8bbfb2ab926 100644 --- a/data/unified_datasets/tm1/dummy_data.json +++ b/data/unified_datasets/tm1/dummy_data.json @@ -1244,12 +1244,20 @@ "categorical": [], "non-categorical": [ { - "intent": "accept", + "intent": "inform", "domain": "auto_repair", "slot": "reason.appt", - "value": "tune up and have them check my brakes. I believe I hear some grinding noise when I press the brakes.", + "value": "tune up and have them check my brakes", "start": 21, - "end": 121 + "end": 58 + }, + { + "intent": "inform", + "domain": "auto_repair", + "slot": "reason.appt", + "value": "grinding noise when I press the brakes", + "start": 82, + "end": 120 } ] }, @@ -1259,7 +1267,7 @@ "name.customer": "", "date.appt": "", "time.appt": "", - "reason.appt": "tune up and have them check my brakes. I believe I hear some grinding noise when I press the brakes.", + "reason.appt": "grinding noise when I press the brakes", "name.vehicle": "Toyota Corolla", "year.vehicle": "2015", "location.store": "O'Fallon, IL. on Hwy 50" @@ -1292,7 +1300,7 @@ "name.customer": "", "date.appt": "", "time.appt": "", - "reason.appt": "tune up and have them check my brakes. I believe I hear some grinding noise when I press the brakes.", + "reason.appt": "grinding noise when I press the brakes", "name.vehicle": "Toyota Corolla", "year.vehicle": "2015", "location.store": "O'Fallon, IL. on Hwy 50" @@ -1325,7 +1333,7 @@ "name.customer": "", "date.appt": "", "time.appt": "", - "reason.appt": "tune up and have them check my brakes. 
I believe I hear some grinding noise when I press the brakes.", + "reason.appt": "grinding noise when I press the brakes", "name.vehicle": "Toyota Corolla", "year.vehicle": "2015", "location.store": "O'Fallon, IL. on Hwy 50" @@ -1358,7 +1366,7 @@ "name.customer": "", "date.appt": "", "time.appt": "", - "reason.appt": "tune up and have them check my brakes. I believe I hear some grinding noise when I press the brakes.", + "reason.appt": "grinding noise when I press the brakes", "name.vehicle": "Toyota Corolla", "year.vehicle": "2015", "location.store": "O'Fallon, IL. on Hwy 50" @@ -1400,7 +1408,7 @@ "name.customer": "", "date.appt": "", "time.appt": "", - "reason.appt": "tune up and have them check my brakes. I believe I hear some grinding noise when I press the brakes.", + "reason.appt": "grinding noise when I press the brakes", "name.vehicle": "Toyota Corolla", "year.vehicle": "2015", "location.store": "O'Fallon, IL. on Hwy 50" @@ -1433,7 +1441,7 @@ "name.customer": "", "date.appt": "", "time.appt": "", - "reason.appt": "tune up and have them check my brakes. I believe I hear some grinding noise when I press the brakes.", + "reason.appt": "grinding noise when I press the brakes", "name.vehicle": "Toyota Corolla", "year.vehicle": "2015", "location.store": "O'Fallon, IL. on Hwy 50" @@ -1466,7 +1474,7 @@ "name.customer": "", "date.appt": "", "time.appt": "", - "reason.appt": "tune up and have them check my brakes. I believe I hear some grinding noise when I press the brakes.", + "reason.appt": "grinding noise when I press the brakes", "name.vehicle": "Toyota Corolla", "year.vehicle": "2015", "location.store": "O'Fallon, IL. 
on Hwy 50" @@ -1508,8 +1516,8 @@ "intent": "accept", "domain": "auto_repair", "slot": "reason.appt", - "value": "your brakes checked and to obtain a tune up", - "start": 98, + "value": "brakes checked and to obtain a tune up", + "start": 103, "end": 141 } ] @@ -1531,7 +1539,7 @@ "name.customer": "", "date.appt": "", "time.appt": "", - "reason.appt": "your brakes checked and to obtain a tune up", + "reason.appt": "brakes checked and to obtain a tune up", "name.vehicle": "Toyota Corolla", "year.vehicle": "2015", "location.store": "O'Fallon, IL. on Hwy 50" @@ -2188,10 +2196,10 @@ { "intent": "inform", "domain": "pizza_ordering", - "slot": "type.topping", - "value": "Buffalo Chicken Ranch,", + "slot": "name.pizza", + "value": "Buffalo Chicken Ranch", "start": 72, - "end": 94 + "end": 93 } ] }, @@ -2220,7 +2228,7 @@ "name.store": "Domino's", "name.pizza": "Philly Cheesesteak", "size.pizza": "large", - "type.topping": "Buffalo Chicken Ranch,", + "type.topping": "", "type.crust": "", "preference": "", "location.store": "" @@ -3775,8 +3783,8 @@ "intent": "inform", "domain": "uber_lyft", "slot": "location.to", - "value": "the north end", - "start": 34, + "value": "north end", + "start": 38, "end": 47 } ] @@ -3784,7 +3792,7 @@ "state": { "uber_lyft": { "location.from": "south boston", - "location.to": "the north end", + "location.to": "north end", "type.ride": "", "num.people": "", "price.estimate": "", @@ -3803,11 +3811,11 @@ "categorical": [], "non-categorical": [ { - "intent": "accept", + "intent": "inform", "domain": "uber_lyft", "slot": "location.from", - "value": "in the south end", - "start": 14, + "value": "south end", + "start": 21, "end": 30 } ] @@ -3835,7 +3843,7 @@ "state": { "uber_lyft": { "location.from": "boston chops.", - "location.to": "the north end", + "location.to": "north end", "type.ride": "", "num.people": "", "price.estimate": "", @@ -3908,9 +3916,9 @@ "intent": "inform", "domain": "uber_lyft", "slot": "type.ride", - "value": "personal ride", + 
"value": "personal", "start": 14, - "end": 27 + "end": 22 }, { "intent": "inform", @@ -5013,11 +5021,19 @@ "end": 30 }, { - "intent": "inform", + "intent": "accept", "domain": "movie_ticket", - "slot": "name.theater", - "value": "Alita: Battle Angel at Regal Cinemas Fox Tower 10", + "slot": "name.movie", + "value": "Alita: Battle Angel", "start": 42, + "end": 61 + }, + { + "intent": "accept", + "domain": "movie_ticket", + "slot": "name.theater", + "value": "Regal Cinemas Fox Tower 10", + "start": 65, "end": 91 } ] @@ -5045,7 +5061,7 @@ "state": { "movie_ticket": { "name.movie": "Alita: Battle Angel", - "name.theater": "Alita: Battle Angel at Regal Cinemas Fox Tower 10", + "name.theater": "Regal Cinemas Fox Tower 10", "num.tickets": "Two", "time.start": "8:00 pm", "location.theater": "Let's see it there", @@ -5086,7 +5102,7 @@ "state": { "movie_ticket": { "name.movie": "Alita: Battle Angel", - "name.theater": "Alita: Battle Angel at Regal Cinemas Fox Tower 10", + "name.theater": "Regal Cinemas Fox Tower 10", "num.tickets": "Two", "time.start": "8:00 pm", "location.theater": "Let's see it there", @@ -5120,7 +5136,7 @@ "state": { "movie_ticket": { "name.movie": "Alita: Battle Angel", - "name.theater": "Alita: Battle Angel at Regal Cinemas Fox Tower 10", + "name.theater": "Regal Cinemas Fox Tower 10", "num.tickets": "Two", "time.start": "8:00 pm", "location.theater": "Let's see it there", @@ -5161,7 +5177,7 @@ "state": { "movie_ticket": { "name.movie": "Alita: Battle Angel", - "name.theater": "Alita: Battle Angel at Regal Cinemas Fox Tower 10", + "name.theater": "Regal Cinemas Fox Tower 10", "num.tickets": "Two", "time.start": "8:00 pm", "location.theater": "Let's see it there", diff --git a/data/unified_datasets/tm1/preprocess.py b/data/unified_datasets/tm1/preprocess.py index f73f80b88503aefcbdf5b01d90d7ea73c8bd9443..344f0cf0ca77e399a64fb07a3f6a44f5d05a838a 100644 --- a/data/unified_datasets/tm1/preprocess.py +++ b/data/unified_datasets/tm1/preprocess.py @@ -238,8 
+238,10 @@ def preprocess(): in_span = [0] * len(turn['utterance']) if 'segments' in uttr: - for segment in uttr['segments']: - # skip overlapped span + # sort the span according to the length + segments = sorted(uttr['segments'], key=lambda x: len(x['text'])) + for segment in segments: + # skip overlapped spans, keep the shortest one if sum(in_span[segment['start_index']: segment['end_index']]) > 0: continue else: @@ -272,6 +274,8 @@ def preprocess(): 'end': segment['end_index'] }) + turn['dialogue_acts']['non-categorical'] = sorted(turn['dialogue_acts']['non-categorical'], key=lambda x: x['start']) + for da in turn['dialogue_acts']['binary']: da_tuple = (da['intent'], da['domain'], da['slot'], da['value'],) if da_tuple not in ontology['binary_dialogue_acts']: @@ -300,8 +304,8 @@ def preprocess(): with ZipFile('data.zip', 'w', ZIP_DEFLATED) as zf: for filename in os.listdir(new_data_dir): zf.write(f'{new_data_dir}/{filename}') - # rmtree(original_data_dir) - # rmtree(new_data_dir) + rmtree(original_data_dir) + rmtree(new_data_dir) return dialogues, ontology if __name__ == '__main__':