Skip to content
Snippets Groups Projects
Commit a403317b authored by zqwerty's avatar zqwerty
Browse files

add dart dataset

parent e265e86b
No related branches found
No related tags found
No related merge requests found
......@@ -17,6 +17,7 @@ __pycache__
data/unified_datasets/multiwoz21/MultiWOZ_2.1.zip
data/unified_datasets/tm1/master.zip
data/unified_datasets/dailydialog/ijcnlp_dailydialog.zip
data/unified_datasets/dart/dart-v1.1.1-*.json
data/**/train.json
data/**/val.json
data/**/test.json
......
......@@ -329,9 +329,11 @@ if __name__ == '__main__':
if args.preprocess:
print('pre-processing')
cur_dir = os.getcwd()
os.chdir(name)
preprocess = importlib.import_module(f'{name}.preprocess')
preprocess.preprocess()
os.chdir(cur_dir)
data_file = f'{name}/data.zip'
if not os.path.exists(data_file):
......
# Dataset Card for DART
- **Repository:** https://github.com/Yale-LILY/dart
- **Paper:** https://arxiv.org/pdf/2007.02871.pdf
- **Leaderboard:** https://github.com/Yale-LILY/dart
- **Who transforms the dataset:** Qi Zhu(zhuq96 at gmail dot com)
### Dataset Summary
DART is a large and open-domain structured DAta Record to Text generation corpus with high-quality sentence annotations with each input being a set of entity-relation triples following a tree-structured ontology. It consists of 82191 examples across different domains with each input being a semantic triple set derived from data records in tables and the tree ontology of table schema, annotated with sentence description that covers all facts in the triple set.
- **How to get the transformed data from original data:**
- Run `python preprocess.py` in the current directory.
- **Main changes of the transformation:**
- Use `source` annotation as `domain`.
- Retain entity-relation triples in the `tripleset` field of each turn.
- If there are multiple source & text annotations in an original sample, split them into multiple samples.
- **Annotations:**
- entity-relation triples
### Supported Tasks and Leaderboards
NLG
### Languages
English
### Data Splits
| split | dialogues | utterances | avg_utt | avg_tokens | avg_domains | cat slot match(state) | cat slot match(goal) | cat slot match(dialogue act) | non-cat slot span(dialogue act) |
|------------|-------------|--------------|-----------|--------------|---------------|-------------------------|------------------------|--------------------------------|-----------------------------------|
| train | 62659 | 62659 | 1 | 18.85 | 1 | - | - | - | - |
| validation | 6980 | 6980 | 1 | 21.22 | 1 | - | - | - | - |
| test | 12552 | 12552 | 1 | 20.95 | 1 | - | - | - | - |
| all | 82191 | 82191 | 1 | 19.37 | 1 | - | - | - | - |
6 domains: ['WikiTableQuestions_mturk', 'WikiSQL_decl_sents', 'WikiSQL_lily', 'WikiTableQuestions_lily', 'webnlg', 'e2e']
- **cat slot match**: how many values of categorical slots are in the possible values of ontology in percentage.
- **non-cat slot span**: how many values of non-categorical slots have span annotation in percentage.
### Citation
```
@inproceedings{nan-etal-2021-dart,
title = "{DART}: Open-Domain Structured Data Record to Text Generation",
author = "Linyong Nan and Dragomir Radev and Rui Zhang and Amrit Rau and Abhinand Sivaprasad and Chiachun Hsieh and Xiangru Tang and Aadit Vyas and Neha Verma and Pranav Krishna and Yangxiaokang Liu and Nadia Irwanto and Jessica Pan and Faiaz Rahman and Ahmad Zaidi and Murori Mutuma and Yasin Tarabar and Ankit Gupta and Tao Yu and Yi Chern Tan and Xi Victoria Lin and Caiming Xiong and Richard Socher and Nazneen Fatema Rajani",
booktitle = "Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies",
year = "2021",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.naacl-main.37",
}
```
### Licensing Information
MIT License
\ No newline at end of file
File added
[
{
"dataset": "dart",
"data_split": "train",
"dialogue_id": "dart-train-0",
"original_id": "train-0",
"domains": [
"WikiTableQuestions_mturk"
],
"goal": {
"description": "",
"inform": {},
"request": {}
},
"turns": [
{
"speaker": "system",
"utterance": "First Clearing\tbased on Callicoon, New York and location at On NYS 52 1 Mi. Youngsville",
"utt_idx": 0,
"dialogue_acts": {
"binary": [],
"categorical": [],
"non-categorical": []
},
"tripleset": [
[
"First Clearing",
"LOCATION",
"On NYS 52 1 Mi. Youngsville"
],
[
"On NYS 52 1 Mi. Youngsville",
"CITY_OR_TOWN",
"Callicoon, New York"
]
],
"subtree_was_extended": false,
"db_results": {}
}
]
},
{
"dataset": "dart",
"data_split": "train",
"dialogue_id": "dart-train-1",
"original_id": "train-1",
"domains": [
"WikiTableQuestions_mturk"
],
"goal": {
"description": "",
"inform": {},
"request": {}
},
"turns": [
{
"speaker": "system",
"utterance": "Old Turnpike is a Historic Marker in Sullivan County, New York.",
"utt_idx": 0,
"dialogue_acts": {
"binary": [],
"categorical": [],
"non-categorical": []
},
"tripleset": [
[
"[TABLECONTEXT]",
"MARKER_NAME",
"Old Turnpike"
],
[
"[TABLECONTEXT]",
"[TITLE]",
"List of New York State Historic Markers in Sullivan County, New York"
]
],
"subtree_was_extended": true,
"db_results": {}
}
]
},
{
"dataset": "dart",
"data_split": "train",
"dialogue_id": "dart-train-2",
"original_id": "train-2",
"domains": [
"WikiTableQuestions_mturk"
],
"goal": {
"description": "",
"inform": {},
"request": {}
},
"turns": [
{
"speaker": "system",
"utterance": "Tereza Martincová",
"utt_idx": 0,
"dialogue_acts": {
"binary": [],
"categorical": [],
"non-categorical": []
},
"tripleset": [
[
"Antalya 15, Turkey",
"SURFACE",
"Hard"
],
[
"Antalya 15, Turkey",
"OPPONENT",
"Tereza Martincová"
],
[
"Antalya 15, Turkey",
"SCORE",
"6-4, 6-3"
],
[
"15 April 2013",
"TOURNAMENT",
"Antalya 15, Turkey"
]
],
"subtree_was_extended": false,
"db_results": {}
}
]
},
{
"dataset": "dart",
"data_split": "train",
"dialogue_id": "dart-train-3",
"original_id": "train-3",
"domains": [
"WikiTableQuestions_mturk"
],
"goal": {
"description": "",
"inform": {},
"request": {}
},
"turns": [
{
"speaker": "system",
"utterance": "Beatriz Haddad Maia played on 2 April 2012\tin Ribeirão Preto, Brazil\ton a hard surface.",
"utt_idx": 0,
"dialogue_acts": {
"binary": [],
"categorical": [],
"non-categorical": []
},
"tripleset": [
[
"2 April 2012",
"TOURNAMENT",
"Ribeirão Preto, Brazil"
],
[
"Ribeirão Preto, Brazil",
"SURFACE",
"Hard"
]
],
"subtree_was_extended": false,
"db_results": {}
}
]
},
{
"dataset": "dart",
"data_split": "train",
"dialogue_id": "dart-train-4",
"original_id": "train-4",
"domains": [
"WikiSQL_decl_sents"
],
"goal": {
"description": "",
"inform": {},
"request": {}
},
"turns": [
{
"speaker": "system",
"utterance": "The week 5 game is played in Shea Stadium.",
"utt_idx": 0,
"dialogue_acts": {
"binary": [],
"categorical": [],
"non-categorical": []
},
"tripleset": [
[
"5",
"STADIUM",
"shea stadium"
]
],
"subtree_was_extended": false,
"db_results": {}
}
]
},
{
"dataset": "dart",
"data_split": "train",
"dialogue_id": "dart-train-5",
"original_id": "train-5",
"domains": [
"WikiSQL_decl_sents"
],
"goal": {
"description": "",
"inform": {},
"request": {}
},
"turns": [
{
"speaker": "system",
"utterance": "The team whose nickname is red raiders is located in the orange city, iowa",
"utt_idx": 0,
"dialogue_acts": {
"binary": [],
"categorical": [],
"non-categorical": []
},
"tripleset": [
[
"Northwestern College",
"NICKNAME",
"Red Raiders"
],
[
"Northwestern College",
"LOCATION",
"Orange City, Iowa"
]
],
"subtree_was_extended": true,
"db_results": {}
}
]
},
{
"dataset": "dart",
"data_split": "train",
"dialogue_id": "dart-train-6",
"original_id": "train-6",
"domains": [
"WikiSQL_decl_sents"
],
"goal": {
"description": "",
"inform": {},
"request": {}
},
"turns": [
{
"speaker": "system",
"utterance": "University of mississippi was in the selc new conference.",
"utt_idx": 0,
"dialogue_acts": {
"binary": [],
"categorical": [],
"non-categorical": []
},
"tripleset": [
[
"University of Mississippi",
"NEW_CONFERENCE",
"SELC"
]
],
"subtree_was_extended": false,
"db_results": {}
}
]
},
{
"dataset": "dart",
"data_split": "train",
"dialogue_id": "dart-train-7",
"original_id": "train-7",
"domains": [
"WikiSQL_decl_sents"
],
"goal": {
"description": "",
"inform": {},
"request": {}
},
"turns": [
{
"speaker": "system",
"utterance": "The years that the new classification was MCLA division i are 2008-2009.",
"utt_idx": 0,
"dialogue_acts": {
"binary": [],
"categorical": [],
"non-categorical": []
},
"tripleset": [
[
"University of Mississippi",
"NEW_CLASSIFICATION",
"MCLA Division I"
],
[
"University of Mississippi",
"YEARS",
"2008-2009"
]
],
"subtree_was_extended": true,
"db_results": {}
}
]
},
{
"dataset": "dart",
"data_split": "train",
"dialogue_id": "dart-train-8",
"original_id": "train-8",
"domains": [
"WikiSQL_decl_sents"
],
"goal": {
"description": "",
"inform": {},
"request": {}
},
"turns": [
{
"speaker": "system",
"utterance": "The nickname of the team of University of Nebraska at Omaha is mavericks.",
"utt_idx": 0,
"dialogue_acts": {
"binary": [],
"categorical": [],
"non-categorical": []
},
"tripleset": [
[
"University of Nebraska at Omaha",
"NICKNAME",
"Mavericks"
]
],
"subtree_was_extended": false,
"db_results": {}
}
]
},
{
"dataset": "dart",
"data_split": "train",
"dialogue_id": "dart-train-9",
"original_id": "train-9",
"domains": [
"WikiSQL_decl_sents"
],
"goal": {
"description": "",
"inform": {},
"request": {}
},
"turns": [
{
"speaker": "system",
"utterance": "William Wasmund scored 5.0 points",
"utt_idx": 0,
"dialogue_acts": {
"binary": [],
"categorical": [],
"non-categorical": []
},
"tripleset": [
[
"William Wasmund",
"FIELD_GOALS",
"0"
],
[
"William Wasmund",
"EXTRA_POINTS",
"0"
],
[
"William Wasmund",
"POINTS",
"5"
],
[
"William Wasmund",
"TOUCHDOWNS",
"1"
]
],
"subtree_was_extended": false,
"db_results": {}
}
]
}
]
\ No newline at end of file
from zipfile import ZipFile, ZIP_DEFLATED
from shutil import rmtree
import json
import os
from tqdm import tqdm
from collections import Counter
from pprint import pprint
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
import requests
def _dump_json(obj, path):
    """Write *obj* to *path* as indented UTF-8 JSON and close the file."""
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(obj, f, indent=2, ensure_ascii=False)


def preprocess():
    """Convert the DART v1.1.1 corpus into the unified dataset format.

    For each of the train/validation/test splits the raw JSON file is read
    from the current directory if present, otherwise downloaded from the
    DART GitHub repository and cached there. Every ``annotations`` entry of
    a raw item becomes its own single-turn "dialogue" whose system utterance
    is the annotation text and whose domain is the annotation ``source``.

    Side effects (all relative to the current working directory):
      * caches ``dart-v1.1.1-full-{train,dev,test}.json`` when missing,
      * writes ``dummy_data.json`` containing the first 10 dialogues,
      * writes ``data.zip`` containing ``data/ontology.json`` and
        ``data/dialogues.json``,
      * creates and finally removes the temporary ``data`` directory.

    Returns:
        A ``(dialogues, ontology)`` tuple: the list of all converted
        dialogues (train, then validation, then test) and the ontology dict.

    Raises:
        requests.HTTPError: if a raw split has to be downloaded and the
            server answers with an error status.
    """
    new_data_dir = 'data'
    os.makedirs(new_data_dir, exist_ok=True)

    dataset = 'dart'
    speaker = 'system'
    splits = ['train', 'validation', 'test']
    dialogues_by_split = {split: [] for split in splits}

    ontology = {
        'domains': {},
        'intents': {},
        'state': {},
        'dialogue_acts': {
            "categorical": [],
            "non-categorical": [],
            "binary": [],
        },
    }

    url_prefix = "https://github.com/Yale-LILY/dart/raw/master/data/v1.1.1/"
    for data_split in splits:
        # The raw files call the validation split "dev".
        data_file = f"dart-v1.1.1-full-{data_split}.json" if data_split != 'validation' else "dart-v1.1.1-full-dev.json"
        if not os.path.exists(data_file):
            response = requests.get(f"{url_prefix}{data_file}")
            # Fail loudly on 404/5xx instead of trying to parse an error page.
            response.raise_for_status()
            data = json.loads(response.content)
            with open(data_file, 'w', encoding='utf-8') as f:
                json.dump(data, f)
        else:
            # Explicit UTF-8 so a manually downloaded raw file loads
            # regardless of the platform's default encoding.
            with open(data_file, encoding='utf-8') as f:
                data = json.load(f)

        split_dialogues = dialogues_by_split[data_split]
        for item in tqdm(data, desc='processing dart-{}'.format(data_split)):
            tripleset = item["tripleset"]
            subtree_was_extended = item.get("subtree_was_extended", None)
            # One raw item may carry several (source, text) annotations;
            # each one becomes a separate sample.
            for annotation in item["annotations"]:
                source = annotation["source"]
                text = annotation["text"]
                ontology['domains'][source] = {'description': '', 'slots': {}}
                # Ids are the running index of the sample within its split.
                index = len(split_dialogues)
                dialogue = {
                    'dataset': dataset,
                    'data_split': data_split,
                    'dialogue_id': f'{dataset}-{data_split}-{index}',
                    'original_id': f'{data_split}-{index}',
                    'domains': [source],
                    'goal': {
                        'description': '',
                        'inform': {},
                        'request': {}
                    },
                    'turns': [{
                        'speaker': speaker,
                        'utterance': text.strip(),
                        'utt_idx': 0,
                        'dialogue_acts': {
                            'binary': [],
                            'categorical': [],
                            'non-categorical': [],
                        },
                        'tripleset': tripleset,
                        'subtree_was_extended': subtree_was_extended,
                        'db_results': {}
                    }]
                }
                split_dialogues.append(dialogue)

    dialogues = dialogues_by_split['train'] + dialogues_by_split['validation'] + dialogues_by_split['test']

    # Files are closed (and therefore flushed) before being zipped.
    _dump_json(dialogues[:10], 'dummy_data.json')
    _dump_json(ontology, f'{new_data_dir}/ontology.json')
    _dump_json(dialogues, f'{new_data_dir}/dialogues.json')

    with ZipFile('data.zip', 'w', ZIP_DEFLATED) as zf:
        for filename in os.listdir(new_data_dir):
            zf.write(f'{new_data_dir}/{filename}')
    # The unpacked directory is only a staging area for the archive.
    rmtree(new_data_dir)
    return dialogues, ontology


if __name__ == '__main__':
    preprocess()
This diff is collapsed.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment