Update dailydialog: remove spaces before punctuation and replace `" ’ "` with `"'"`

59fe66c5 · zqwerty · eee9a0a3 · 59fe66c5 · 59fe66c5 · 59fe66c5
Commit 59fe66c5 authored 3 years ago by zqwerty
--- a/data/unified_datasets/dailydialog/README.md
+++ b/data/unified_datasets/dailydialog/README.md
@@ -14,7 +14,10 @@ DailyDialog is a high-quality multi-turn dialog dataset. It is intriguing in sev
  - Run `python preprocess.py` in the current directory.
 - **Main changes of the transformation:**
  - Use `topic` annotation as `domain`. If duplicated dialogs are annotated with different topics, use the most frequent one.
-  - Combine `intent` and `domain` annotation as `binary` dialogue acts.
+  - Use the `intent` annotation as the `binary` dialogue act.
+  - Retain the emotion annotation in the `emotion` field of each turn.
+  - Use NLTK to remove spaces before punctuation: `utt = ' '.join([detokenizer.detokenize(word_tokenize(s)) for s in sent_tokenize(utt)])`.
+  - Replace `" ’ "` with `"'"`: `utt = utt.replace(' ’ ', "'")`.
 - **Annotations:**
  - intent, emotion

@@ -30,10 +33,10 @@ English

 | split      |   dialogues |   utterances |   avg_utt |   avg_tokens |   avg_domains | cat slot match(state)   | cat slot match(goal)   | cat slot match(dialogue act)   | non-cat slot span(dialogue act)   |
 |------------|-------------|--------------|-----------|--------------|---------------|-------------------------|------------------------|--------------------------------|-----------------------------------|
-| train      |       11118 |        87170 |      7.84 |        13.61 |             1 | -                       | -                      | -                              | -                                 |
-| validation |        1000 |         8069 |      8.07 |        13.5  |             1 | -                       | -                      | -                              | -                                 |
-| test       |        1000 |         7740 |      7.74 |        13.78 |             1 | -                       | -                      | -                              | -                                 |
-| all        |       13118 |       102979 |      7.85 |        13.61 |             1 | -                       | -                      | -                              | -                                 |
+| train      |       11118 |        87170 |      7.84 |        11.18 |             1 | -                       | -                      | -                              | -                                 |
+| validation |        1000 |         8069 |      8.07 |        11.14 |             1 | -                       | -                      | -                              | -                                 |
+| test       |        1000 |         7740 |      7.74 |        11.33 |             1 | -                       | -                      | -                              | -                                 |
+| all        |       13118 |       102979 |      7.85 |        11.19 |             1 | -                       | -                      | -                              | -                                 |

 10 domains: ['Ordinary Life', 'School Life', 'Culture & Education', 'Attitude & Emotion', 'Relationship', 'Tourism', 'Health', 'Work', 'Politics', 'Finance']
 - **cat slot match**: how many values of categorical slots are in the possible values of ontology in percentage.

--- a/data/unified_datasets/dailydialog/data.zip
+++ b/data/unified_datasets/dailydialog/data.zip
--- a/data/unified_datasets/dailydialog/dummy_data.json
+++ b/data/unified_datasets/dailydialog/dummy_data.json
--- a/data/unified_datasets/dailydialog/preprocess.py
+++ b/data/unified_datasets/dailydialog/preprocess.py
-import copy
-import re
 from zipfile import ZipFile, ZIP_DEFLATED
-from shutil import copy2, rmtree
+from shutil import rmtree
 import json
 import os
 from tqdm import tqdm
 from collections import Counter
 from pprint import pprint
-from datasets import load_dataset
+from nltk.tokenize import sent_tokenize, word_tokenize
+from nltk.tokenize.treebank import TreebankWordDetokenizer

 topic_map = {
    1: "Ordinary Life", 
@@ -78,12 +77,14 @@ def preprocess():
                    "binary": {}
                }}

+    detokenizer = TreebankWordDetokenizer()
+
    for data_split in splits:
        archive = ZipFile(os.path.join(original_data_dir, f'{data_split}.zip'))
        with archive.open(f'{data_split}/dialogues_{data_split}.txt') as dialog_file, \
            archive.open(f'{data_split}/dialogues_act_{data_split}.txt') as act_file, \
            archive.open(f'{data_split}/dialogues_emotion_{data_split}.txt') as emotion_file:
-            for dialog_line, act_line, emotion_line in zip(dialog_file, act_file, emotion_file):
+            for dialog_line, act_line, emotion_line in tqdm(zip(dialog_file, act_file, emotion_file)):
                if not dialog_line.strip():
                    break
                utts = dialog_line.decode().split("__eou__")[:-1]
@@ -114,6 +115,9 @@ def preprocess():
                    speaker = 'user' if len(dialogue['turns']) % 2 == 0 else 'system'
                    intent = act_map[int(act)]
                    emotion = emotion_map[int(emotion)]
+                    utt = ' '.join([detokenizer.detokenize(word_tokenize(s)) for s in sent_tokenize(utt)])
+                    utt = utt.replace(' ’ ', "'")
+
                    dialogue['turns'].append({
                        'speaker': speaker,
                        'utterance': utt.strip(),
@@ -121,7 +125,7 @@ def preprocess():
                        'dialogue_acts': {
                            'binary': [{
                                'intent': intent, 
-                                'domain': domain, 
+                                'domain': '', 
                                'slot': ''
                            }],
                            'categorical': [],
@@ -134,8 +138,8 @@ def preprocess():
                    else:
                        dialogue['turns'][-1]['state'] = {}

-                    ontology["dialogue_acts"]['binary'].setdefault((intent, domain, ''), {})
-                    ontology["dialogue_acts"]['binary'][(intent, domain, '')][speaker] = True
+                    ontology["dialogue_acts"]['binary'].setdefault((intent, '', ''), {})
+                    ontology["dialogue_acts"]['binary'][(intent, '', '')][speaker] = True

                dialogues_by_split[data_split].append(dialogue)