Commit 052b8c5f authored by zqwerty

Merge branch 'unified_dataset'

parents f7fe7b49 697ab75f
@@ -14,6 +14,9 @@ __pycache__
.vscode
# data
data/unified_datasets/multiwoz21/MultiWOZ_2.1.zip
data/unified_datasets/tm1/master.zip
data/unified_datasets/dailydialog/ijcnlp_dailydialog.zip
data/**/train.json
data/**/val.json
data/**/test.json
# Dataset Card for DailyDialog
- **Repository:** http://yanran.li/dailydialog
- **Paper:** https://arxiv.org/pdf/1710.03957.pdf
- **Leaderboard:** None
- **Who transforms the dataset:** Qi Zhu (zhuq96 at gmail dot com)
### Dataset Summary
DailyDialog is a high-quality multi-turn dialog dataset. The language is human-written and less noisy than scraped conversations, the dialogues reflect the way we communicate in daily life and cover a variety of everyday topics, and the dataset is manually labelled with communication intention (dialog act) and emotion information.
- **How to get the transformed data from original data:**
  - Download [ijcnlp_dailydialog.zip](http://yanran.li/files/ijcnlp_dailydialog.zip) (see the download sketch after this list).
  - Run `python preprocess.py` in the current directory.
- **Main changes of the transformation:**
  - Use the `topic` annotation as the `domain`. If duplicated dialogs are annotated with different topics, use the most frequent one.
  - Combine the `intent` and `domain` annotations into `binary` dialogue acts (see the turn sketch after this list).
- **Annotations:**
  - intent, emotion
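
For the download step, a minimal sketch (an assumption on my part that the URL is still reachable; the zip can also be fetched manually, and `preprocess.py` unpacks it if it is present):

```python
import os
import urllib.request

# Hypothetical download helper, not part of preprocess.py; assumes
# http://yanran.li/files/ijcnlp_dailydialog.zip is still reachable.
zip_path = 'ijcnlp_dailydialog.zip'
if not os.path.exists(zip_path):
    urllib.request.urlretrieve('http://yanran.li/files/ijcnlp_dailydialog.zip', zip_path)
```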
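
For concreteness, a single transformed turn has the following shape (field names match `preprocess.py` later in this commit; the utterance and label values are illustrative):

```python
# One turn of the transformed data (illustrative values; the structure is
# exactly what preprocess.py builds for each turn).
turn = {
    'speaker': 'user',  # alternates user/system by turn index
    'utterance': 'How are you doing today?',
    'utt_idx': 0,
    'dialogue_acts': {
        'binary': [{'intent': 'question', 'domain': 'Ordinary Life', 'slot': ''}],
        'categorical': [],
        'non-categorical': [],
    },
    'emotion': 'no emotion',
    'state': {},  # user turns carry a state; system turns carry db_results
}
```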
### Supported Tasks and Leaderboards
NLU, NLG
### Languages
English
### Data Splits
| split | dialogues | utterances | avg_utt | avg_tokens | avg_domains | cat slot match(state) | cat slot match(goal) | cat slot match(dialogue act) | non-cat slot span(dialogue act) |
|------------|-------------|--------------|-----------|--------------|---------------|-------------------------|------------------------|--------------------------------|-----------------------------------|
| train | 11118 | 87170 | 7.84 | 13.61 | 1 | - | - | - | - |
| validation | 1000 | 8069 | 8.07 | 13.5 | 1 | - | - | - | - |
| test | 1000 | 7740 | 7.74 | 13.78 | 1 | - | - | - | - |
| all | 13118 | 102979 | 7.85 | 13.61 | 1 | - | - | - | - |
10 domains: ['Ordinary Life', 'School Life', 'Culture & Education', 'Attitude & Emotion', 'Relationship', 'Tourism', 'Health', 'Work', 'Politics', 'Finance']
- **cat slot match**: the percentage of categorical slot values that appear among the possible values listed in the ontology.
- **non-cat slot span**: the percentage of non-categorical slot values that have span annotations.
### Citation
```
@InProceedings{li2017dailydialog,
  author    = {Li, Yanran and Su, Hui and Shen, Xiaoyu and Li, Wenjie and Cao, Ziqiang and Niu, Shuzi},
  title     = {DailyDialog: A Manually Labelled Multi-turn Dialogue Dataset},
  booktitle = {Proceedings of The 8th International Joint Conference on Natural Language Processing (IJCNLP 2017)},
  year      = {2017}
}
```
### Licensing Information
[**CC BY-NC-SA 4.0**](https://creativecommons.org/licenses/by-nc-sa/4.0/)
import json
import os
from collections import Counter
from shutil import rmtree
from zipfile import ZipFile, ZIP_DEFLATED
# Topic ids in dialogues_topic.txt -> domain names.
topic_map = {
    1: "Ordinary Life",
    2: "School Life",
    3: "Culture & Education",
    4: "Attitude & Emotion",
    5: "Relationship",
    6: "Tourism",
    7: "Health",
    8: "Work",
    9: "Politics",
    10: "Finance"
}

# Act ids in dialogues_act_*.txt -> intent names.
act_map = {
    1: "inform",
    2: "question",
    3: "directive",
    4: "commissive"
}

# Emotion ids in dialogues_emotion_*.txt -> emotion names.
emotion_map = {
    0: "no emotion",
    1: "anger",
    2: "disgust",
    3: "fear",
    4: "happiness",
    5: "sadness",
    6: "surprise"
}
def preprocess():
    original_data_dir = 'ijcnlp_dailydialog'
    new_data_dir = 'data'

    if not os.path.exists(original_data_dir):
        original_data_zip = 'ijcnlp_dailydialog.zip'
        if not os.path.exists(original_data_zip):
            raise FileNotFoundError(
                f'cannot find original data {original_data_zip} in dailydialog/, '
                f'should manually download ijcnlp_dailydialog.zip from '
                f'http://yanran.li/files/ijcnlp_dailydialog.zip')
        else:
            archive = ZipFile(original_data_zip)
            archive.extractall()

    os.makedirs(new_data_dir, exist_ok=True)
    dataset = 'dailydialog'
    splits = ['train', 'validation', 'test']
    dialogues_by_split = {split: [] for split in splits}

    # Map each dialog text to all topic annotations it receives; duplicated
    # dialogs may be annotated with different topics.
    dial2topics = {}
    with open(os.path.join(original_data_dir, 'dialogues_text.txt')) as dialog_file, \
            open(os.path.join(original_data_dir, 'dialogues_topic.txt')) as topic_file:
        for dialog, topic in zip(dialog_file, topic_file):
            topic = int(topic.strip())
            dialog = dialog.replace(' __eou__ ', ' ')
            if dialog in dial2topics:
                dial2topics[dialog].append(topic)
            else:
                dial2topics[dialog] = [topic]

    ontology = {
        'domains': {x: {'description': '', 'slots': {}} for x in topic_map.values()},
        'intents': {x: {'description': ''} for x in act_map.values()},
        'state': {},
        'dialogue_acts': {
            "categorical": [],
            "non-categorical": [],
            "binary": {}
        }
    }

    for data_split in splits:
        archive = ZipFile(os.path.join(original_data_dir, f'{data_split}.zip'))
        with archive.open(f'{data_split}/dialogues_{data_split}.txt') as dialog_file, \
                archive.open(f'{data_split}/dialogues_act_{data_split}.txt') as act_file, \
                archive.open(f'{data_split}/dialogues_emotion_{data_split}.txt') as emotion_file:
            for dialog_line, act_line, emotion_line in zip(dialog_file, act_file, emotion_file):
                # Stop at the trailing empty line.
                if not dialog_line.strip():
                    break
                # Each line holds one dialog; utterances end with __eou__.
                utts = dialog_line.decode().split("__eou__")[:-1]
                acts = act_line.decode().split(" ")[:-1]
                emotions = emotion_line.decode().split(" ")[:-1]
                assert len(utts) == len(acts) == len(emotions), \
                    "Mismatched number of utterances, acts, and emotions"

                # Use the most frequent topic annotation as the domain.
                topics = dial2topics[dialog_line.decode().replace(' __eou__ ', ' ')]
                topic = Counter(topics).most_common(1)[0][0]
                domain = topic_map[topic]

                dialogue_id = f'{dataset}-{data_split}-{len(dialogues_by_split[data_split])}'
                dialogue = {
                    'dataset': dataset,
                    'data_split': data_split,
                    'dialogue_id': dialogue_id,
                    'original_id': f'{data_split}-{len(dialogues_by_split[data_split])}',
                    'domains': [domain],
                    'goal': {
                        'description': '',
                        'inform': {},
                        'request': {}
                    },
                    'turns': []
                }

                for utt, act, emotion in zip(utts, acts, emotions):
                    speaker = 'user' if len(dialogue['turns']) % 2 == 0 else 'system'
                    intent = act_map[int(act)]
                    emotion = emotion_map[int(emotion)]
                    dialogue['turns'].append({
                        'speaker': speaker,
                        'utterance': utt.strip(),
                        'utt_idx': len(dialogue['turns']),
                        'dialogue_acts': {
                            'binary': [{
                                'intent': intent,
                                'domain': domain,
                                'slot': ''
                            }],
                            'categorical': [],
                            'non-categorical': [],
                        },
                        'emotion': emotion,
                    })
                    if speaker == 'system':
                        dialogue['turns'][-1]['db_results'] = {}
                    else:
                        dialogue['turns'][-1]['state'] = {}
                    # Record which speakers use each (intent, domain, slot) act.
                    ontology["dialogue_acts"]['binary'].setdefault((intent, domain, ''), {})
                    ontology["dialogue_acts"]['binary'][(intent, domain, '')][speaker] = True
                dialogues_by_split[data_split].append(dialogue)

    ontology["dialogue_acts"]['binary'] = sorted([
        str({'user': speakers.get('user', False), 'system': speakers.get('system', False),
             'intent': da[0], 'domain': da[1], 'slot': da[2]})
        for da, speakers in ontology["dialogue_acts"]['binary'].items()])

    dialogues = dialogues_by_split['train'] + dialogues_by_split['validation'] + dialogues_by_split['test']
    json.dump(dialogues[:10], open('dummy_data.json', 'w', encoding='utf-8'), indent=2, ensure_ascii=False)
    json.dump(ontology, open(f'{new_data_dir}/ontology.json', 'w', encoding='utf-8'), indent=2, ensure_ascii=False)
    json.dump(dialogues, open(f'{new_data_dir}/dialogues.json', 'w', encoding='utf-8'), indent=2, ensure_ascii=False)
    with ZipFile('data.zip', 'w', ZIP_DEFLATED) as zf:
        for filename in os.listdir(new_data_dir):
            zf.write(f'{new_data_dir}/{filename}')
    # rmtree(original_data_dir)
    # rmtree(new_data_dir)
    return dialogues, ontology


if __name__ == '__main__':
    preprocess()
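
After running the script, the transformed data can be read back from `data.zip`, e.g. (a minimal sketch; the archive member names follow the `zf.write` calls above, and it assumes `preprocess.py` has already been run in this directory):

```python
import json
from zipfile import ZipFile

# Load the transformed data produced by preprocess().
with ZipFile('data.zip') as zf:
    dialogues = json.loads(zf.read('data/dialogues.json'))
    ontology = json.loads(zf.read('data/ontology.json'))

print(len(dialogues))            # 13118 dialogues across all splits
print(dialogues[0]['turns'][0])  # first turn of the first training dialogue
```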