Commit 7b73fe6d authored by zqwerty

add commongen dataset

parent cfe30c75
@@ -18,6 +18,7 @@ data/unified_datasets/multiwoz21/MultiWOZ_2.1.zip
data/unified_datasets/tm1/master.zip
data/unified_datasets/dailydialog/ijcnlp_dailydialog.zip
data/unified_datasets/dart/dart-v1.1.1-*.json
data/unified_datasets/commongen/commongen_data.zip
data/**/train.json
data/**/val.json
data/**/test.json
# Dataset Card for CommonGen
- **Repository:** https://github.com/INK-USC/CommonGen
- **Paper:** https://aclanthology.org/2020.findings-emnlp.165.pdf
- **Leaderboard:** https://inklab.usc.edu/CommonGen/leaderboard.html
- **Who transforms the dataset:** Qi Zhu (zhuq96 at gmail dot com)
### Dataset Summary
Building machines with commonsense to compose realistically plausible sentences is challenging. CommonGen is a constrained text generation task, associated with a benchmark dataset, that explicitly tests machines for generative commonsense reasoning. Given a set of common concepts, the task is to generate a coherent sentence describing an everyday scenario using these concepts; for example, the concept set {dog, frisbee, catch, throw} could yield "A dog leaps to catch a thrown frisbee."
CommonGen is challenging because it inherently requires 1) relational reasoning using background commonsense knowledge, and 2) compositional generalization ability to work on unseen concept combinations. The dataset, constructed through a combination of crowdsourcing from AMT and existing caption corpora, consists of 30k concept-sets and 50k sentences in total.
- **How to get the transformed data from original data:**
- Run `python preprocess.py` in the current directory.
- **Main changes of the transformation:**
- Set `speaker` to `system`.
- Retain the common-concept annotation in the `concepts` field of each turn.
- If there are multiple scene sentences in an original sample, split it into multiple samples (one per sentence).
- **Annotations:**
- concept words (see the loading sketch below)
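The transformed data can be loaded directly from the packed archive. This is a minimal loading sketch, assuming `preprocess.py` has been run in this directory so that `data.zip` contains `data/dialogues.json`:
```
import json
from zipfile import ZipFile

# data.zip is produced by preprocess.py and stores the transformed dataset
# under the data/ prefix.
with ZipFile('data.zip') as zf:
    with zf.open('data/dialogues.json') as f:
        dialogues = json.load(f)

turn = dialogues[0]['turns'][0]
print(turn['concepts'])   # e.g. ['ski', 'mountain', 'skier'] (order is shuffled during preprocessing)
print(turn['utterance'])  # e.g. 'Skier skis down the mountain'
```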
### Supported Tasks and Leaderboards
NLG
### Languages
English
### Data Splits
| split | dialogues | utterances | avg_utt | avg_tokens | avg_domains | cat slot match(state) | cat slot match(goal) | cat slot match(dialogue act) | non-cat slot span(dialogue act) |
|------------|-------------|--------------|-----------|--------------|---------------|-------------------------|------------------------|--------------------------------|-----------------------------------|
| train | 67389 | 67389 | 1 | 10.54 | 0 | - | - | - | - |
| validation | 4018 | 4018 | 1 | 11.57 | 0 | - | - | - | - |
| test | 1497 | 1497 | 1 | 1 | 0 | - | - | - | - |
| all | 72904 | 72904 | 1 | 10.41 | 0 | - | - | - | - |
All samples have 0 domains (`domains: []`).
- **cat slot match**: how many values of categorical slots are in the possible values of ontology in percentage.
- **non-cat slot span**: how many values of non-categorical slots have span annotation in percentage.
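The statistics above can be approximately reproduced with the sketch below. Whitespace tokenization is an assumption here, so `avg_tokens` may deviate slightly from the official statistics script, and the test-split averages are degenerate because `test_noref` ships without reference sentences:
```
import json
from zipfile import ZipFile

with ZipFile('data.zip') as zf:
    with zf.open('data/dialogues.json') as f:
        dialogues = json.load(f)

for split in ['train', 'validation', 'test', 'all']:
    dials = [d for d in dialogues if split == 'all' or d['data_split'] == split]
    utts = [turn['utterance'] for d in dials for turn in d['turns']]
    # Every sample is a single-turn "dialogue", so avg_utt is always 1.
    avg_utt = len(utts) / len(dials)
    avg_tokens = sum(len(u.split()) for u in utts) / len(utts)  # assumed whitespace tokenization
    print(f'{split}: {len(dials)} dialogues, {len(utts)} utterances, '
          f'avg_utt={avg_utt:.2f}, avg_tokens={avg_tokens:.2f}')
```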
### Citation
```
@inproceedings{lin-etal-2020-commongen,
title = "{C}ommon{G}en: A Constrained Text Generation Challenge for Generative Commonsense Reasoning",
author = "Bill Yuchen Lin and Wangchunshu Zhou and Ming Shen and Pei Zhou and Chandra Bhagavatula and Yejin Choi and Xiang Ren",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2020",
year = "2020",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.findings-emnlp.165",
}
```
### Licensing Information
MIT License
\ No newline at end of file
[
{
"dataset": "commongen",
"data_split": "train",
"dialogue_id": "commongen-train-0",
"original_id": "train-0",
"domains": [],
"goal": {
"description": "",
"inform": {},
"request": {}
},
"turns": [
{
"speaker": "system",
"utterance": "Skier skis down the mountain",
"utt_idx": 0,
"dialogue_acts": {
"binary": [],
"categorical": [],
"non-categorical": []
},
"concepts": [
"ski",
"mountain",
"skier"
],
"db_results": {}
}
]
},
{
"dataset": "commongen",
"data_split": "train",
"dialogue_id": "commongen-train-1",
"original_id": "train-1",
"domains": [],
"goal": {
"description": "",
"inform": {},
"request": {}
},
"turns": [
{
"speaker": "system",
"utterance": "A skier is skiing down a mountain.",
"utt_idx": 0,
"dialogue_acts": {
"binary": [],
"categorical": [],
"non-categorical": []
},
"concepts": [
"ski",
"mountain",
"skier"
],
"db_results": {}
}
]
},
{
"dataset": "commongen",
"data_split": "train",
"dialogue_id": "commongen-train-2",
"original_id": "train-2",
"domains": [],
"goal": {
"description": "",
"inform": {},
"request": {}
},
"turns": [
{
"speaker": "system",
"utterance": "Three skiers are skiing on a snowy mountain.",
"utt_idx": 0,
"dialogue_acts": {
"binary": [],
"categorical": [],
"non-categorical": []
},
"concepts": [
"ski",
"mountain",
"skier"
],
"db_results": {}
}
]
},
{
"dataset": "commongen",
"data_split": "train",
"dialogue_id": "commongen-train-3",
"original_id": "train-3",
"domains": [],
"goal": {
"description": "",
"inform": {},
"request": {}
},
"turns": [
{
"speaker": "system",
"utterance": "The dog is wagging his tail.",
"utt_idx": 0,
"dialogue_acts": {
"binary": [],
"categorical": [],
"non-categorical": []
},
"concepts": [
"wag",
"tail",
"dog"
],
"db_results": {}
}
]
},
{
"dataset": "commongen",
"data_split": "train",
"dialogue_id": "commongen-train-4",
"original_id": "train-4",
"domains": [],
"goal": {
"description": "",
"inform": {},
"request": {}
},
"turns": [
{
"speaker": "system",
"utterance": "A dog wags his tail at the boy.",
"utt_idx": 0,
"dialogue_acts": {
"binary": [],
"categorical": [],
"non-categorical": []
},
"concepts": [
"wag",
"tail",
"dog"
],
"db_results": {}
}
]
},
{
"dataset": "commongen",
"data_split": "train",
"dialogue_id": "commongen-train-5",
"original_id": "train-5",
"domains": [],
"goal": {
"description": "",
"inform": {},
"request": {}
},
"turns": [
{
"speaker": "system",
"utterance": "a dog wags its tail with its heart",
"utt_idx": 0,
"dialogue_acts": {
"binary": [],
"categorical": [],
"non-categorical": []
},
"concepts": [
"wag",
"tail",
"dog"
],
"db_results": {}
}
]
},
{
"dataset": "commongen",
"data_split": "train",
"dialogue_id": "commongen-train-6",
"original_id": "train-6",
"domains": [],
"goal": {
"description": "",
"inform": {},
"request": {}
},
"turns": [
{
"speaker": "system",
"utterance": "woman paddling canoe on a lake",
"utt_idx": 0,
"dialogue_acts": {
"binary": [],
"categorical": [],
"non-categorical": []
},
"concepts": [
"lake",
"paddle",
"canoe"
],
"db_results": {}
}
]
},
{
"dataset": "commongen",
"data_split": "train",
"dialogue_id": "commongen-train-7",
"original_id": "train-7",
"domains": [],
"goal": {
"description": "",
"inform": {},
"request": {}
},
"turns": [
{
"speaker": "system",
"utterance": "paddle an open canoe along lake .",
"utt_idx": 0,
"dialogue_acts": {
"binary": [],
"categorical": [],
"non-categorical": []
},
"concepts": [
"lake",
"paddle",
"canoe"
],
"db_results": {}
}
]
},
{
"dataset": "commongen",
"data_split": "train",
"dialogue_id": "commongen-train-8",
"original_id": "train-8",
"domains": [],
"goal": {
"description": "",
"inform": {},
"request": {}
},
"turns": [
{
"speaker": "system",
"utterance": "a man paddles his canoe on the lake.",
"utt_idx": 0,
"dialogue_acts": {
"binary": [],
"categorical": [],
"non-categorical": []
},
"concepts": [
"lake",
"paddle",
"canoe"
],
"db_results": {}
}
]
},
{
"dataset": "commongen",
"data_split": "train",
"dialogue_id": "commongen-train-9",
"original_id": "train-9",
"domains": [],
"goal": {
"description": "",
"inform": {},
"request": {}
},
"turns": [
{
"speaker": "system",
"utterance": "a train pulls into station",
"utt_idx": 0,
"dialogue_acts": {
"binary": [],
"categorical": [],
"non-categorical": []
},
"concepts": [
"station",
"train",
"pull"
],
"db_results": {}
}
]
}
]
\ No newline at end of file
from zipfile import ZipFile, ZIP_DEFLATED
from shutil import rmtree
import json
import os
import random
import requests
def preprocess():
    # Download the original CommonGen release if it is not already cached.
    data_file = "commongen_data.zip"
    if not os.path.exists(data_file):
        response = requests.get("https://storage.googleapis.com/huggingface-nlp/datasets/common_gen/commongen_data.zip")
        with open(data_file, "wb") as f:
            f.write(response.content)
    archive = ZipFile(data_file)
    new_data_dir = 'data'
    os.makedirs(new_data_dir, exist_ok=True)
    dataset = 'commongen'
    speaker = 'system'
    splits = ['train', 'validation', 'test']
    dialogues_by_split = {split: [] for split in splits}
    # CommonGen has no domains, intents, state, or dialogue acts, so the
    # ontology contains only the required empty keys.
    ontology = {
        'domains': {},
        'intents': {},
        'state': {},
        'dialogue_acts': {
            "categorical": [],
            "non-categorical": [],
            "binary": []
        }
    }
    # Map unified split names to the file suffixes of the original release;
    # the test set ships without reference sentences ("test_noref").
    data_split2suffix = {'train': 'train', 'validation': 'dev', 'test': 'test_noref'}
    random.seed(42)
    for data_split in splits:
        with archive.open(f'commongen.{data_split2suffix[data_split]}.jsonl') as f:
            for line in f:
                line = line.replace(b", }", b"}")  # fix trailing commas that break json.loads
                item = json.loads(line)
                concepts = item["concept_set"].split("#")
                random.shuffle(concepts)
                # The test split has no "scene" key; fall back to a single
                # empty scene so every concept set still yields one sample.
                scenes = item.get("scene", [''])
                # Each scene sentence becomes its own single-turn sample.
                for scene in scenes:
                    dialogue_id = f'{dataset}-{data_split}-{len(dialogues_by_split[data_split])}'
                    dialogue = {
                        'dataset': dataset,
                        'data_split': data_split,
                        'dialogue_id': dialogue_id,
                        'original_id': f'{data_split}-{len(dialogues_by_split[data_split])}',
                        'domains': [],
                        'goal': {
                            'description': '',
                            'inform': {},
                            'request': {}
                        },
                        'turns': [{
                            'speaker': speaker,
                            'utterance': scene.strip(),
                            'utt_idx': 0,
                            'dialogue_acts': {
                                'binary': [],
                                'categorical': [],
                                'non-categorical': [],
                            },
                            'concepts': concepts,
                            'db_results': {}
                        }]
                    }
                    dialogues_by_split[data_split].append(dialogue)
    dialogues = dialogues_by_split['train'] + dialogues_by_split['validation'] + dialogues_by_split['test']
    # Dump the first 10 samples for inspection, write the full dataset and
    # ontology, then pack everything into data.zip and clean up.
    with open('dummy_data.json', 'w', encoding='utf-8') as f:
        json.dump(dialogues[:10], f, indent=2, ensure_ascii=False)
    with open(f'{new_data_dir}/ontology.json', 'w', encoding='utf-8') as f:
        json.dump(ontology, f, indent=2, ensure_ascii=False)
    with open(f'{new_data_dir}/dialogues.json', 'w', encoding='utf-8') as f:
        json.dump(dialogues, f, indent=2, ensure_ascii=False)
    with ZipFile('data.zip', 'w', ZIP_DEFLATED) as zf:
        for filename in os.listdir(new_data_dir):
            zf.write(f'{new_data_dir}/{filename}')
    rmtree(new_data_dir)
    return dialogues, ontology


if __name__ == '__main__':
    preprocess()
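A hypothetical usage sketch for the script above; the total sample count comes from the data-splits table in the dataset card:
```
# Run from data/unified_datasets/commongen/. preprocess() writes data.zip
# (containing data/dialogues.json and data/ontology.json) plus dummy_data.json,
# and also returns the transformed data.
from preprocess import preprocess

dialogues, ontology = preprocess()
print(len(dialogues))               # 72904 samples across all splits
print(dialogues[0]['dialogue_id'])  # 'commongen-train-0'
```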
# Dataset Card for DailyDialog
# Dataset Card for DART
- **Repository:** https://github.com/Yale-LILY/dart
- **Paper:** https://arxiv.org/pdf/2007.02871.pdf
@@ -13,6 +13,7 @@ DART is a large and open-domain structured DAta Record to Text generation corpus
- Run `python preprocess.py` in the current directory.
- **Main changes of the transformation:**
- Use `source` annotation as `domain`.
- Set `speaker` to `system`.
- Retain entity-relation triples in the `tripleset` field of each turn.
- If there are multiple source & text annotations in an original sample, split it into multiple samples.
- **Annotations:**
@@ -51,7 +52,6 @@ English
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.naacl-main.37",
}
```
### Licensing Information
@@ -3,10 +3,6 @@ from shutil import rmtree
import json
import os
from tqdm import tqdm
from collections import Counter
from pprint import pprint
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
import requests