Skip to content
Snippets Groups Projects
Commit 59fe66c5 authored by zqwerty
Browse files

update dailydialog: remove space before punctuation and replace " ’ " with "'"

parent eee9a0a3
Branches
No related tags found
No related merge requests found
......@@ -14,7 +14,10 @@ DailyDialog is a high-quality multi-turn dialog dataset. It is intriguing in sev
- Run `python preprocess.py` in the current directory.
- **Main changes of the transformation:**
- Use `topic` annotation as `domain`. If duplicated dialogs are annotated with different topics, use the most frequent one.
- Combine `intent` and `domain` annotation as `binary` dialogue acts.
- Use `intent` annotation as `binary` dialogue act.
- Retain emotion annotation in the `emotion` field of each turn.
- Use NLTK to remove the space before punctuation: `utt = ' '.join([detokenizer.detokenize(word_tokenize(s)) for s in sent_tokenize(utt)])`.
- Replace `" ’ "` with `"'"`: `utt = utt.replace(' ’ ', "'")`.
- **Annotations:**
- intent, emotion
......@@ -30,10 +33,10 @@ English
| split | dialogues | utterances | avg_utt | avg_tokens | avg_domains | cat slot match(state) | cat slot match(goal) | cat slot match(dialogue act) | non-cat slot span(dialogue act) |
|------------|-------------|--------------|-----------|--------------|---------------|-------------------------|------------------------|--------------------------------|-----------------------------------|
| train | 11118 | 87170 | 7.84 | 13.61 | 1 | - | - | - | - |
| validation | 1000 | 8069 | 8.07 | 13.5 | 1 | - | - | - | - |
| test | 1000 | 7740 | 7.74 | 13.78 | 1 | - | - | - | - |
| all | 13118 | 102979 | 7.85 | 13.61 | 1 | - | - | - | - |
| train | 11118 | 87170 | 7.84 | 11.18 | 1 | - | - | - | - |
| validation | 1000 | 8069 | 8.07 | 11.14 | 1 | - | - | - | - |
| test | 1000 | 7740 | 7.74 | 11.33 | 1 | - | - | - | - |
| all | 13118 | 102979 | 7.85 | 11.19 | 1 | - | - | - | - |
10 domains: ['Ordinary Life', 'School Life', 'Culture & Education', 'Attitude & Emotion', 'Relationship', 'Tourism', 'Health', 'Work', 'Politics', 'Finance']
- **cat slot match**: how many values of categorical slots are in the possible values of ontology in percentage.
......
No preview for this file type
This diff is collapsed.
import copy
import re
from zipfile import ZipFile, ZIP_DEFLATED
from shutil import copy2, rmtree
from shutil import rmtree
import json
import os
from tqdm import tqdm
from collections import Counter
from pprint import pprint
from datasets import load_dataset
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
topic_map = {
1: "Ordinary Life",
......@@ -78,12 +77,14 @@ def preprocess():
"binary": {}
}}
detokenizer = TreebankWordDetokenizer()
for data_split in splits:
archive = ZipFile(os.path.join(original_data_dir, f'{data_split}.zip'))
with archive.open(f'{data_split}/dialogues_{data_split}.txt') as dialog_file, \
archive.open(f'{data_split}/dialogues_act_{data_split}.txt') as act_file, \
archive.open(f'{data_split}/dialogues_emotion_{data_split}.txt') as emotion_file:
for dialog_line, act_line, emotion_line in zip(dialog_file, act_file, emotion_file):
for dialog_line, act_line, emotion_line in tqdm(zip(dialog_file, act_file, emotion_file)):
if not dialog_line.strip():
break
utts = dialog_line.decode().split("__eou__")[:-1]
......@@ -114,6 +115,9 @@ def preprocess():
speaker = 'user' if len(dialogue['turns']) % 2 == 0 else 'system'
intent = act_map[int(act)]
emotion = emotion_map[int(emotion)]
utt = ' '.join([detokenizer.detokenize(word_tokenize(s)) for s in sent_tokenize(utt)])
utt = utt.replace(' ’ ', "'")
dialogue['turns'].append({
'speaker': speaker,
'utterance': utt.strip(),
......@@ -121,7 +125,7 @@ def preprocess():
'dialogue_acts': {
'binary': [{
'intent': intent,
'domain': domain,
'domain': '',
'slot': ''
}],
'categorical': [],
......@@ -134,8 +138,8 @@ def preprocess():
else:
dialogue['turns'][-1]['state'] = {}
ontology["dialogue_acts"]['binary'].setdefault((intent, domain, ''), {})
ontology["dialogue_acts"]['binary'][(intent, domain, '')][speaker] = True
ontology["dialogue_acts"]['binary'].setdefault((intent, '', ''), {})
ontology["dialogue_acts"]['binary'][(intent, '', '')][speaker] = True
dialogues_by_split[data_split].append(dialogue)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment