add dart dataset

a403317b · zqwerty · e265e86b · a403317b · a403317b · a403317b
Commit a403317b authored 3 years ago by zqwerty
--- a/.gitignore
+++ b/.gitignore
@@ -17,6 +17,7 @@ __pycache__
 data/unified_datasets/multiwoz21/MultiWOZ_2.1.zip
 data/unified_datasets/tm1/master.zip
 data/unified_datasets/dailydialog/ijcnlp_dailydialog.zip
+data/unified_datasets/dart/dart-v1.1.1-*.json
 data/**/train.json
 data/**/val.json
 data/**/test.json

--- a/data/unified_datasets/check.py
+++ b/data/unified_datasets/check.py
@@ -329,9 +329,11 @@ if __name__ == '__main__':

            if args.preprocess:
                print('pre-processing')
-
+                cur_dir = os.getcwd()
+                os.chdir(name)
                preprocess = importlib.import_module(f'{name}.preprocess')
                preprocess.preprocess()
+                os.chdir(cur_dir)

            data_file = f'{name}/data.zip'
            if not os.path.exists(data_file):

--- a/data/unified_datasets/dart/README.md
+++ b/data/unified_datasets/dart/README.md
+# Dataset Card for DART
+
+- **Repository:** https://github.com/Yale-LILY/dart
+- **Paper:** https://arxiv.org/pdf/2007.02871.pdf
+- **Leaderboard:** https://github.com/Yale-LILY/dart
+- **Who transforms the dataset:** Qi Zhu(zhuq96 at gmail dot com)
+
+### Dataset Summary
+
+DART is a large and open-domain structured DAta Record to Text generation corpus with high-quality sentence annotations with each input being a set of entity-relation triples following a tree-structured ontology. It consists of 82191 examples across different domains with each input being a semantic triple set derived from data records in tables and the tree ontology of table schema, annotated with sentence description that covers all facts in the triple set.
+
+- **How to get the transformed data from original data:** 
+  - Run `python preprocess.py` in the current directory.
+- **Main changes of the transformation:**
+  - Use `source` annotation as `domain`.
+  - Retain entity-relation triples in the `tripleset` field of each turn.
+  - If there are multiple source & text annotations in an original sample, split them into multiple samples.
+- **Annotations:**
+  - entity-relation triples
+
+### Supported Tasks and Leaderboards
+
+NLG
+
+### Languages
+
+English
+
+### Data Splits
+
+| split      |   dialogues |   utterances |   avg_utt |   avg_tokens |   avg_domains | cat slot match(state)   | cat slot match(goal)   | cat slot match(dialogue act)   | non-cat slot span(dialogue act)   |
+|------------|-------------|--------------|-----------|--------------|---------------|-------------------------|------------------------|--------------------------------|-----------------------------------|
+| train      |       62659 |        62659 |         1 |        18.85 |             1 | -                       | -                      | -                              | -                                 |
+| validation |        6980 |         6980 |         1 |        21.22 |             1 | -                       | -                      | -                              | -                                 |
+| test       |       12552 |        12552 |         1 |        20.95 |             1 | -                       | -                      | -                              | -                                 |
+| all        |       82191 |        82191 |         1 |        19.37 |             1 | -                       | -                      | -                              | -                                 |
+
+6 domains: ['WikiTableQuestions_mturk', 'WikiSQL_decl_sents', 'WikiSQL_lily', 'WikiTableQuestions_lily', 'webnlg', 'e2e']
+- **cat slot match**: how many values of categorical slots are in the possible values of ontology in percentage.
+- **non-cat slot span**: how many values of non-categorical slots have span annotation in percentage.
+
+
+### Citation
+
+```
+@inproceedings{nan-etal-2021-dart,
+    title = "{DART}: Open-Domain Structured Data Record to Text Generation",
+    author = "Linyong Nan and Dragomir Radev and Rui Zhang and Amrit Rau and Abhinand Sivaprasad and Chiachun Hsieh and Xiangru Tang and Aadit Vyas and Neha Verma and Pranav Krishna and Yangxiaokang Liu and Nadia Irwanto and Jessica Pan and Faiaz Rahman and Ahmad Zaidi and Murori Mutuma and Yasin Tarabar and Ankit Gupta and Tao Yu and Yi Chern Tan and Xi Victoria Lin and Caiming Xiong and Richard Socher and Nazneen Fatema Rajani",
+    booktitle = "Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies",
+    year = "2021",
+    publisher = "Association for Computational Linguistics",
+    url = "https://aclanthology.org/2021.naacl-main.37",
+}
+
+```
+
+### Licensing Information
+
+MIT License
\ No newline at end of file
--- a/data/unified_datasets/dart/data.zip
+++ b/data/unified_datasets/dart/data.zip
--- a/data/unified_datasets/dart/dummy_data.json
+++ b/data/unified_datasets/dart/dummy_data.json
+[
+  {
+    "dataset": "dart",
+    "data_split": "train",
+    "dialogue_id": "dart-train-0",
+    "original_id": "train-0",
+    "domains": [
+      "WikiTableQuestions_mturk"
+    ],
+    "goal": {
+      "description": "",
+      "inform": {},
+      "request": {}
+    },
+    "turns": [
+      {
+        "speaker": "system",
+        "utterance": "First Clearing\tbased on Callicoon, New York and location at On NYS 52 1 Mi. Youngsville",
+        "utt_idx": 0,
+        "dialogue_acts": {
+          "binary": [],
+          "categorical": [],
+          "non-categorical": []
+        },
+        "tripleset": [
+          [
+            "First Clearing",
+            "LOCATION",
+            "On NYS 52 1 Mi. Youngsville"
+          ],
+          [
+            "On NYS 52 1 Mi. Youngsville",
+            "CITY_OR_TOWN",
+            "Callicoon, New York"
+          ]
+        ],
+        "subtree_was_extended": false,
+        "db_results": {}
+      }
+    ]
+  },
+  {
+    "dataset": "dart",
+    "data_split": "train",
+    "dialogue_id": "dart-train-1",
+    "original_id": "train-1",
+    "domains": [
+      "WikiTableQuestions_mturk"
+    ],
+    "goal": {
+      "description": "",
+      "inform": {},
+      "request": {}
+    },
+    "turns": [
+      {
+        "speaker": "system",
+        "utterance": "Old Turnpike is a Historic Marker in Sullivan County, New York.",
+        "utt_idx": 0,
+        "dialogue_acts": {
+          "binary": [],
+          "categorical": [],
+          "non-categorical": []
+        },
+        "tripleset": [
+          [
+            "[TABLECONTEXT]",
+            "MARKER_NAME",
+            "Old Turnpike"
+          ],
+          [
+            "[TABLECONTEXT]",
+            "[TITLE]",
+            "List of New York State Historic Markers in Sullivan County, New York"
+          ]
+        ],
+        "subtree_was_extended": true,
+        "db_results": {}
+      }
+    ]
+  },
+  {
+    "dataset": "dart",
+    "data_split": "train",
+    "dialogue_id": "dart-train-2",
+    "original_id": "train-2",
+    "domains": [
+      "WikiTableQuestions_mturk"
+    ],
+    "goal": {
+      "description": "",
+      "inform": {},
+      "request": {}
+    },
+    "turns": [
+      {
+        "speaker": "system",
+        "utterance": "Tereza Martincová",
+        "utt_idx": 0,
+        "dialogue_acts": {
+          "binary": [],
+          "categorical": [],
+          "non-categorical": []
+        },
+        "tripleset": [
+          [
+            "Antalya 15, Turkey",
+            "SURFACE",
+            "Hard"
+          ],
+          [
+            "Antalya 15, Turkey",
+            "OPPONENT",
+            "Tereza Martincová"
+          ],
+          [
+            "Antalya 15, Turkey",
+            "SCORE",
+            "6-4, 6-3"
+          ],
+          [
+            "15 April 2013",
+            "TOURNAMENT",
+            "Antalya 15, Turkey"
+          ]
+        ],
+        "subtree_was_extended": false,
+        "db_results": {}
+      }
+    ]
+  },
+  {
+    "dataset": "dart",
+    "data_split": "train",
+    "dialogue_id": "dart-train-3",
+    "original_id": "train-3",
+    "domains": [
+      "WikiTableQuestions_mturk"
+    ],
+    "goal": {
+      "description": "",
+      "inform": {},
+      "request": {}
+    },
+    "turns": [
+      {
+        "speaker": "system",
+        "utterance": "Beatriz Haddad Maia played on 2 April 2012\tin Ribeirão Preto, Brazil\ton a hard surface.",
+        "utt_idx": 0,
+        "dialogue_acts": {
+          "binary": [],
+          "categorical": [],
+          "non-categorical": []
+        },
+        "tripleset": [
+          [
+            "2 April 2012",
+            "TOURNAMENT",
+            "Ribeirão Preto, Brazil"
+          ],
+          [
+            "Ribeirão Preto, Brazil",
+            "SURFACE",
+            "Hard"
+          ]
+        ],
+        "subtree_was_extended": false,
+        "db_results": {}
+      }
+    ]
+  },
+  {
+    "dataset": "dart",
+    "data_split": "train",
+    "dialogue_id": "dart-train-4",
+    "original_id": "train-4",
+    "domains": [
+      "WikiSQL_decl_sents"
+    ],
+    "goal": {
+      "description": "",
+      "inform": {},
+      "request": {}
+    },
+    "turns": [
+      {
+        "speaker": "system",
+        "utterance": "The week 5 game is played in Shea Stadium.",
+        "utt_idx": 0,
+        "dialogue_acts": {
+          "binary": [],
+          "categorical": [],
+          "non-categorical": []
+        },
+        "tripleset": [
+          [
+            "5",
+            "STADIUM",
+            "shea stadium"
+          ]
+        ],
+        "subtree_was_extended": false,
+        "db_results": {}
+      }
+    ]
+  },
+  {
+    "dataset": "dart",
+    "data_split": "train",
+    "dialogue_id": "dart-train-5",
+    "original_id": "train-5",
+    "domains": [
+      "WikiSQL_decl_sents"
+    ],
+    "goal": {
+      "description": "",
+      "inform": {},
+      "request": {}
+    },
+    "turns": [
+      {
+        "speaker": "system",
+        "utterance": "The team whose nickname is red raiders is located in the orange city, iowa",
+        "utt_idx": 0,
+        "dialogue_acts": {
+          "binary": [],
+          "categorical": [],
+          "non-categorical": []
+        },
+        "tripleset": [
+          [
+            "Northwestern College",
+            "NICKNAME",
+            "Red Raiders"
+          ],
+          [
+            "Northwestern College",
+            "LOCATION",
+            "Orange City, Iowa"
+          ]
+        ],
+        "subtree_was_extended": true,
+        "db_results": {}
+      }
+    ]
+  },
+  {
+    "dataset": "dart",
+    "data_split": "train",
+    "dialogue_id": "dart-train-6",
+    "original_id": "train-6",
+    "domains": [
+      "WikiSQL_decl_sents"
+    ],
+    "goal": {
+      "description": "",
+      "inform": {},
+      "request": {}
+    },
+    "turns": [
+      {
+        "speaker": "system",
+        "utterance": "University of mississippi was in the selc new conference.",
+        "utt_idx": 0,
+        "dialogue_acts": {
+          "binary": [],
+          "categorical": [],
+          "non-categorical": []
+        },
+        "tripleset": [
+          [
+            "University of Mississippi",
+            "NEW_CONFERENCE",
+            "SELC"
+          ]
+        ],
+        "subtree_was_extended": false,
+        "db_results": {}
+      }
+    ]
+  },
+  {
+    "dataset": "dart",
+    "data_split": "train",
+    "dialogue_id": "dart-train-7",
+    "original_id": "train-7",
+    "domains": [
+      "WikiSQL_decl_sents"
+    ],
+    "goal": {
+      "description": "",
+      "inform": {},
+      "request": {}
+    },
+    "turns": [
+      {
+        "speaker": "system",
+        "utterance": "The years that the new classification was MCLA division i are 2008-2009.",
+        "utt_idx": 0,
+        "dialogue_acts": {
+          "binary": [],
+          "categorical": [],
+          "non-categorical": []
+        },
+        "tripleset": [
+          [
+            "University of Mississippi",
+            "NEW_CLASSIFICATION",
+            "MCLA Division I"
+          ],
+          [
+            "University of Mississippi",
+            "YEARS",
+            "2008-2009"
+          ]
+        ],
+        "subtree_was_extended": true,
+        "db_results": {}
+      }
+    ]
+  },
+  {
+    "dataset": "dart",
+    "data_split": "train",
+    "dialogue_id": "dart-train-8",
+    "original_id": "train-8",
+    "domains": [
+      "WikiSQL_decl_sents"
+    ],
+    "goal": {
+      "description": "",
+      "inform": {},
+      "request": {}
+    },
+    "turns": [
+      {
+        "speaker": "system",
+        "utterance": "The nickname of the  team of University of Nebraska at Omaha is mavericks.",
+        "utt_idx": 0,
+        "dialogue_acts": {
+          "binary": [],
+          "categorical": [],
+          "non-categorical": []
+        },
+        "tripleset": [
+          [
+            "University of Nebraska at Omaha",
+            "NICKNAME",
+            "Mavericks"
+          ]
+        ],
+        "subtree_was_extended": false,
+        "db_results": {}
+      }
+    ]
+  },
+  {
+    "dataset": "dart",
+    "data_split": "train",
+    "dialogue_id": "dart-train-9",
+    "original_id": "train-9",
+    "domains": [
+      "WikiSQL_decl_sents"
+    ],
+    "goal": {
+      "description": "",
+      "inform": {},
+      "request": {}
+    },
+    "turns": [
+      {
+        "speaker": "system",
+        "utterance": "William Wasmund scored 5.0 points",
+        "utt_idx": 0,
+        "dialogue_acts": {
+          "binary": [],
+          "categorical": [],
+          "non-categorical": []
+        },
+        "tripleset": [
+          [
+            "William Wasmund",
+            "FIELD_GOALS",
+            "0"
+          ],
+          [
+            "William Wasmund",
+            "EXTRA_POINTS",
+            "0"
+          ],
+          [
+            "William Wasmund",
+            "POINTS",
+            "5"
+          ],
+          [
+            "William Wasmund",
+            "TOUCHDOWNS",
+            "1"
+          ]
+        ],
+        "subtree_was_extended": false,
+        "db_results": {}
+      }
+    ]
+  }
+]
\ No newline at end of file
--- a/data/unified_datasets/dart/preprocess.py
+++ b/data/unified_datasets/dart/preprocess.py
+from zipfile import ZipFile, ZIP_DEFLATED
+from shutil import rmtree
+import json
+import os
+from tqdm import tqdm
+from collections import Counter
+from pprint import pprint
+from nltk.tokenize import sent_tokenize, word_tokenize
+from nltk.tokenize.treebank import TreebankWordDetokenizer
+import requests
+
+
+def preprocess():
+    new_data_dir = 'data'
+
+    os.makedirs(new_data_dir, exist_ok=True)
+
+    dataset = 'dart'
+    speaker = 'system'
+    splits = ['train', 'validation', 'test']
+    dialogues_by_split = {split:[] for split in splits}
+
+    ontology = {'domains': {},
+                'intents': {},
+                'state': {},
+                'dialogue_acts': {
+                    "categorical": [],
+                    "non-categorical": [],
+                    "binary": []
+                }}
+
+    url_prefix = "https://github.com/Yale-LILY/dart/raw/master/data/v1.1.1/"
+    for data_split in splits:
+        data_file = f"dart-v1.1.1-full-{data_split}.json" if data_split != 'validation' else "dart-v1.1.1-full-dev.json"
+        if not os.path.exists(data_file):
+            data = json.loads(requests.get(f"{url_prefix}{data_file}").content)
+            json.dump(data, open(data_file, 'w'))
+        else:
+            # open(data_file, "wb").write(requests.get(f"{url_prefix}{data_file}").content)
+            data = json.load(open(data_file))
+        for item in tqdm(data, desc='processing dart-{}'.format(data_split)):
+            tripleset = item["tripleset"]
+            subtree_was_extended = item.get("subtree_was_extended", None)
+            for annotation in item["annotations"]:
+                source = annotation["source"]
+                text = annotation["text"]
+                ontology['domains'][source] = {'description': '', 'slots': {}}
+
+                dialogue_id = f'{dataset}-{data_split}-{len(dialogues_by_split[data_split])}'
+                dialogue = {
+                    'dataset': dataset,
+                    'data_split': data_split,
+                    'dialogue_id': dialogue_id,
+                    'original_id': f'{data_split}-{len(dialogues_by_split[data_split])}',
+                    'domains': [source],
+                    'goal': {
+                        'description': '',
+                        'inform': {},
+                        'request': {}
+                    },
+                    'turns': [{
+                        'speaker': speaker,
+                        'utterance': text.strip(),
+                        'utt_idx': 0,
+                        'dialogue_acts': {
+                            'binary': [],
+                            'categorical': [],
+                            'non-categorical': [],
+                        },
+                        'tripleset': tripleset,
+                        'subtree_was_extended': subtree_was_extended,
+                        'db_results': {}
+                    }]
+                }
+
+                dialogues_by_split[data_split].append(dialogue)
+
+    dialogues = dialogues_by_split['train']+dialogues_by_split['validation']+dialogues_by_split['test']
+    json.dump(dialogues[:10], open(f'dummy_data.json', 'w', encoding='utf-8'), indent=2, ensure_ascii=False)
+    json.dump(ontology, open(f'{new_data_dir}/ontology.json', 'w', encoding='utf-8'), indent=2, ensure_ascii=False)
+    json.dump(dialogues, open(f'{new_data_dir}/dialogues.json', 'w', encoding='utf-8'), indent=2, ensure_ascii=False)
+    with ZipFile('data.zip', 'w', ZIP_DEFLATED) as zf:
+        for filename in os.listdir(new_data_dir):
+            zf.write(f'{new_data_dir}/{filename}')
+    rmtree(new_data_dir)
+    return dialogues, ontology
+
+
+# Run the conversion when this file is executed as a script; paths inside
+# preprocess() are relative, so it operates on the current working directory.
+if __name__ == '__main__':
+    preprocess()
--- a/data/unified_datasets/dart/shuffled_dial_ids.json
+++ b/data/unified_datasets/dart/shuffled_dial_ids.json