diff --git a/data/unified_datasets/dailydialog/README.md b/data/unified_datasets/dailydialog/README.md index 69e627cf2f5fbd7c4f5b10b3450a659afd6822d5..43cbe9b0a64869bc0abed6527d7a2b35a76827b2 100644 --- a/data/unified_datasets/dailydialog/README.md +++ b/data/unified_datasets/dailydialog/README.md @@ -18,6 +18,7 @@ DailyDialog is a high-quality multi-turn dialog dataset. It is intriguing in sev - Retain emotion annotation in the `emotion` field of each turn. - Use nltk to remove space before punctuation: `utt = ' '.join([detokenizer.detokenize(word_tokenize(s)) for s in sent_tokenize(utt)])`. - Replace `" ’ "` with `"'"`: `utt = utt.replace(' ’ ', "'")`. + - Add a space after a full stop that is directly followed by a word character (unless it is followed by `com`): `utt = re.sub('\.(?!com)(\w)', lambda x: '. '+x.group(1), utt)`. - **Annotations:** - intent, emotion @@ -33,10 +34,10 @@ English | split | dialogues | utterances | avg_utt | avg_tokens | avg_domains | cat slot match(state) | cat slot match(goal) | cat slot match(dialogue act) | non-cat slot span(dialogue act) | |------------|-------------|--------------|-----------|--------------|---------------|-------------------------|------------------------|--------------------------------|-----------------------------------| -| train | 11118 | 87170 | 7.84 | 11.18 | 1 | - | - | - | - | -| validation | 1000 | 8069 | 8.07 | 11.14 | 1 | - | - | - | - | -| test | 1000 | 7740 | 7.74 | 11.33 | 1 | - | - | - | - | -| all | 13118 | 102979 | 7.85 | 11.19 | 1 | - | - | - | - | +| train | 11118 | 87170 | 7.84 | 11.22 | 1 | - | - | - | - | +| validation | 1000 | 8069 | 8.07 | 11.16 | 1 | - | - | - | - | +| test | 1000 | 7740 | 7.74 | 11.36 | 1 | - | - | - | - | +| all | 13118 | 102979 | 7.85 | 11.22 | 1 | - | - | - | - | 10 domains: ['Ordinary Life', 'School Life', 'Culture & Education', 'Attitude & Emotion', 'Relationship', 'Tourism', 'Health', 'Work', 'Politics', 'Finance'] - **cat slot match**: how many values of categorical slots are in the possible values of ontology in percentage. 
diff --git a/data/unified_datasets/dailydialog/data.zip b/data/unified_datasets/dailydialog/data.zip index e8f1805a465c0609980e109bf1c2a6b0491ba81f..cb2af33908afc7f9954beb32cb2114605970027c 100644 Binary files a/data/unified_datasets/dailydialog/data.zip and b/data/unified_datasets/dailydialog/data.zip differ diff --git a/data/unified_datasets/dailydialog/dummy_data.json b/data/unified_datasets/dailydialog/dummy_data.json index 5da0cbbac9fed4f0efbcb48c88b1647241efa506..404646bbfa4fa01c85832b988a0ed04e39f9a144 100644 --- a/data/unified_datasets/dailydialog/dummy_data.json +++ b/data/unified_datasets/dailydialog/dummy_data.json @@ -78,7 +78,7 @@ }, { "speaker": "user", - "utterance": "I guess you are right.But what shall we do? I don't feel like sitting at home.", + "utterance": "I guess you are right. But what shall we do? I don't feel like sitting at home.", "utt_idx": 4, "dialogue_acts": { "binary": [ @@ -112,7 +112,7 @@ }, { "speaker": "user", - "utterance": "That's a good idea. I hear Mary and Sally often go there to play pingpong.Perhaps we can make a foursome with them.", + "utterance": "That's a good idea. I hear Mary and Sally often go there to play pingpong. Perhaps we can make a foursome with them.", "utt_idx": 6, "dialogue_acts": { "binary": [ @@ -129,7 +129,7 @@ }, { "speaker": "system", - "utterance": "Sounds great to me! If they are willing, we could ask them to go dancing with us.That is excellent exercise and fun, too.", + "utterance": "Sounds great to me! If they are willing, we could ask them to go dancing with us. That is excellent exercise and fun, too.", "utt_idx": 7, "dialogue_acts": { "binary": [ @@ -146,7 +146,7 @@ }, { "speaker": "user", - "utterance": "Good.Let' s go now.", + "utterance": "Good. Let' s go now.", "utt_idx": 8, "dialogue_acts": { "binary": [ @@ -434,7 +434,7 @@ }, { "speaker": "user", - "utterance": "Don't worry.He is an acrobat 。", + "utterance": "Don't worry. 
He is an acrobat 。", "utt_idx": 2, "dialogue_acts": { "binary": [ @@ -677,7 +677,7 @@ }, { "speaker": "user", - "utterance": "What are you talking about? Let me see that...What are horoscopes?", + "utterance": "What are you talking about? Let me see that... What are horoscopes?", "utt_idx": 2, "dialogue_acts": { "binary": [ diff --git a/data/unified_datasets/dailydialog/preprocess.py b/data/unified_datasets/dailydialog/preprocess.py index de108a747bf7c4acc06a93c75f2bb8c8df5ba924..caea8a543743212913fb86ee83cee69820cd88cb 100644 --- a/data/unified_datasets/dailydialog/preprocess.py +++ b/data/unified_datasets/dailydialog/preprocess.py @@ -7,6 +7,7 @@ from collections import Counter from pprint import pprint from nltk.tokenize import sent_tokenize, word_tokenize from nltk.tokenize.treebank import TreebankWordDetokenizer +import re topic_map = { 1: "Ordinary Life", @@ -110,8 +111,12 @@ def preprocess(): speaker = 'user' if len(dialogue['turns']) % 2 == 0 else 'system' intent = act_map[int(act)] emotion = emotion_map[int(emotion)] + # re-tokenize utt = ' '.join([detokenizer.detokenize(word_tokenize(s)) for s in sent_tokenize(utt)]) + # replace with common apostrophe utt = utt.replace(' ’ ', "'") + # add space after full-stop + utt = re.sub('\.(?!com)(\w)', lambda x: '. '+x.group(1), utt) dialogue['turns'].append({ 'speaker': speaker,