Commit 052b8c5f authored by zqwerty

Merge branch 'unified_dataset'

parents f7fe7b49 697ab75f
@@ -14,6 +14,9 @@ __pycache__
.vscode
# data
data/unified_datasets/multiwoz21/MultiWOZ_2.1.zip
data/unified_datasets/tm1/master.zip
data/unified_datasets/dailydialog/ijcnlp_dailydialog.zip
data/**/train.json
data/**/val.json
data/**/test.json
# Dataset Card for DailyDialog
- **Repository:** http://yanran.li/dailydialog
- **Paper:** https://arxiv.org/pdf/1710.03957.pdf
- **Leaderboard:** None
- **Who transforms the dataset:** Qi Zhu (zhuq96 at gmail dot com)
### Dataset Summary
DailyDialog is a high-quality multi-turn dialog dataset. The language is human-written and less noisy than scraped conversations, the dialogues reflect the way we communicate in daily life and cover a variety of everyday topics, and the dataset is manually labelled with communication intention (dialog act) and emotion information.
- **How to get the transformed data from original data:**
  - Download [ijcnlp_dailydialog.zip](http://yanran.li/files/ijcnlp_dailydialog.zip) (see the download sketch after this list).
  - Run `python preprocess.py` in the current directory.
- **Main changes of the transformation:**
  - Use the `topic` annotation as the `domain`. If duplicated dialogs are annotated with different topics, use the most frequent one.
  - Combine the `intent` and `domain` annotations into `binary` dialogue acts (see the turn sketch after this list).
- **Annotations:**
  - intent, emotion
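
For the download step, a minimal sketch (an assumption on my part that the URL is still reachable; the zip can also be fetched manually, and `preprocess.py` unpacks it if it is present):

```python
import os
import urllib.request

# Hypothetical download helper, not part of preprocess.py; assumes
# http://yanran.li/files/ijcnlp_dailydialog.zip is still reachable.
zip_path = 'ijcnlp_dailydialog.zip'
if not os.path.exists(zip_path):
    urllib.request.urlretrieve('http://yanran.li/files/ijcnlp_dailydialog.zip', zip_path)
```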
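
For concreteness, a single transformed turn has the following shape (field names match `preprocess.py` later in this commit; the utterance and label values are illustrative):

```python
# One turn of the transformed data (illustrative values; the structure is
# exactly what preprocess.py builds for each turn).
turn = {
    'speaker': 'user',  # alternates user/system by turn index
    'utterance': 'How are you doing today?',
    'utt_idx': 0,
    'dialogue_acts': {
        'binary': [{'intent': 'question', 'domain': 'Ordinary Life', 'slot': ''}],
        'categorical': [],
        'non-categorical': [],
    },
    'emotion': 'no emotion',
    'state': {},  # user turns carry a state; system turns carry db_results
}
```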
### Supported Tasks and Leaderboards
NLU, NLG
### Languages
English
### Data Splits
| split | dialogues | utterances | avg_utt | avg_tokens | avg_domains | cat slot match(state) | cat slot match(goal) | cat slot match(dialogue act) | non-cat slot span(dialogue act) |
|------------|-------------|--------------|-----------|--------------|---------------|-------------------------|------------------------|--------------------------------|-----------------------------------|
| train | 11118 | 87170 | 7.84 | 13.61 | 1 | - | - | - | - |
| validation | 1000 | 8069 | 8.07 | 13.5 | 1 | - | - | - | - |
| test | 1000 | 7740 | 7.74 | 13.78 | 1 | - | - | - | - |
| all | 13118 | 102979 | 7.85 | 13.61 | 1 | - | - | - | - |
10 domains: ['Ordinary Life', 'School Life', 'Culture & Education', 'Attitude & Emotion', 'Relationship', 'Tourism', 'Health', 'Work', 'Politics', 'Finance']
- **cat slot match**: the percentage of categorical slot values that appear among the possible values listed in the ontology.
- **non-cat slot span**: the percentage of non-categorical slot values that have span annotations.
### Citation
```
@InProceedings{li2017dailydialog,
  author    = {Li, Yanran and Su, Hui and Shen, Xiaoyu and Li, Wenjie and Cao, Ziqiang and Niu, Shuzi},
  title     = {DailyDialog: A Manually Labelled Multi-turn Dialogue Dataset},
  booktitle = {Proceedings of The 8th International Joint Conference on Natural Language Processing (IJCNLP 2017)},
  year      = {2017}
}
```
### Licensing Information
[**CC BY-NC-SA 4.0**](https://creativecommons.org/licenses/by-nc-sa/4.0/)
import json
import os
from collections import Counter
from shutil import rmtree
from zipfile import ZipFile, ZIP_DEFLATED
# Topic ids in dialogues_topic.txt -> domain names.
topic_map = {
    1: "Ordinary Life",
    2: "School Life",
    3: "Culture & Education",
    4: "Attitude & Emotion",
    5: "Relationship",
    6: "Tourism",
    7: "Health",
    8: "Work",
    9: "Politics",
    10: "Finance"
}

# Act ids in dialogues_act_*.txt -> intent names.
act_map = {
    1: "inform",
    2: "question",
    3: "directive",
    4: "commissive"
}

# Emotion ids in dialogues_emotion_*.txt -> emotion names.
emotion_map = {
    0: "no emotion",
    1: "anger",
    2: "disgust",
    3: "fear",
    4: "happiness",
    5: "sadness",
    6: "surprise"
}
def preprocess():
    original_data_dir = 'ijcnlp_dailydialog'
    new_data_dir = 'data'

    if not os.path.exists(original_data_dir):
        original_data_zip = 'ijcnlp_dailydialog.zip'
        if not os.path.exists(original_data_zip):
            raise FileNotFoundError(
                f'cannot find original data {original_data_zip} in dailydialog/, '
                f'should manually download ijcnlp_dailydialog.zip from '
                f'http://yanran.li/files/ijcnlp_dailydialog.zip')
        else:
            archive = ZipFile(original_data_zip)
            archive.extractall()

    os.makedirs(new_data_dir, exist_ok=True)
    dataset = 'dailydialog'
    splits = ['train', 'validation', 'test']
    dialogues_by_split = {split: [] for split in splits}

    # Map each dialog text to all topic annotations it receives; duplicated
    # dialogs may be annotated with different topics.
    dial2topics = {}
    with open(os.path.join(original_data_dir, 'dialogues_text.txt')) as dialog_file, \
            open(os.path.join(original_data_dir, 'dialogues_topic.txt')) as topic_file:
        for dialog, topic in zip(dialog_file, topic_file):
            topic = int(topic.strip())
            dialog = dialog.replace(' __eou__ ', ' ')
            if dialog in dial2topics:
                dial2topics[dialog].append(topic)
            else:
                dial2topics[dialog] = [topic]

    ontology = {
        'domains': {x: {'description': '', 'slots': {}} for x in topic_map.values()},
        'intents': {x: {'description': ''} for x in act_map.values()},
        'state': {},
        'dialogue_acts': {
            "categorical": [],
            "non-categorical": [],
            "binary": {}
        }
    }

    for data_split in splits:
        archive = ZipFile(os.path.join(original_data_dir, f'{data_split}.zip'))
        with archive.open(f'{data_split}/dialogues_{data_split}.txt') as dialog_file, \
                archive.open(f'{data_split}/dialogues_act_{data_split}.txt') as act_file, \
                archive.open(f'{data_split}/dialogues_emotion_{data_split}.txt') as emotion_file:
            for dialog_line, act_line, emotion_line in zip(dialog_file, act_file, emotion_file):
                # Stop at the trailing empty line.
                if not dialog_line.strip():
                    break
                # Each line holds one dialog; utterances end with __eou__.
                utts = dialog_line.decode().split("__eou__")[:-1]
                acts = act_line.decode().split(" ")[:-1]
                emotions = emotion_line.decode().split(" ")[:-1]
                assert len(utts) == len(acts) == len(emotions), \
                    "Mismatched number of utterances, acts, and emotions"

                # Use the most frequent topic annotation as the domain.
                topics = dial2topics[dialog_line.decode().replace(' __eou__ ', ' ')]
                topic = Counter(topics).most_common(1)[0][0]
                domain = topic_map[topic]

                dialogue_id = f'{dataset}-{data_split}-{len(dialogues_by_split[data_split])}'
                dialogue = {
                    'dataset': dataset,
                    'data_split': data_split,
                    'dialogue_id': dialogue_id,
                    'original_id': f'{data_split}-{len(dialogues_by_split[data_split])}',
                    'domains': [domain],
                    'goal': {
                        'description': '',
                        'inform': {},
                        'request': {}
                    },
                    'turns': []
                }

                for utt, act, emotion in zip(utts, acts, emotions):
                    speaker = 'user' if len(dialogue['turns']) % 2 == 0 else 'system'
                    intent = act_map[int(act)]
                    emotion = emotion_map[int(emotion)]
                    dialogue['turns'].append({
                        'speaker': speaker,
                        'utterance': utt.strip(),
                        'utt_idx': len(dialogue['turns']),
                        'dialogue_acts': {
                            'binary': [{
                                'intent': intent,
                                'domain': domain,
                                'slot': ''
                            }],
                            'categorical': [],
                            'non-categorical': [],
                        },
                        'emotion': emotion,
                    })
                    if speaker == 'system':
                        dialogue['turns'][-1]['db_results'] = {}
                    else:
                        dialogue['turns'][-1]['state'] = {}
                    # Record which speakers use each (intent, domain, slot) act.
                    ontology["dialogue_acts"]['binary'].setdefault((intent, domain, ''), {})
                    ontology["dialogue_acts"]['binary'][(intent, domain, '')][speaker] = True
                dialogues_by_split[data_split].append(dialogue)

    ontology["dialogue_acts"]['binary'] = sorted([
        str({'user': speakers.get('user', False), 'system': speakers.get('system', False),
             'intent': da[0], 'domain': da[1], 'slot': da[2]})
        for da, speakers in ontology["dialogue_acts"]['binary'].items()])

    dialogues = dialogues_by_split['train'] + dialogues_by_split['validation'] + dialogues_by_split['test']
    json.dump(dialogues[:10], open('dummy_data.json', 'w', encoding='utf-8'), indent=2, ensure_ascii=False)
    json.dump(ontology, open(f'{new_data_dir}/ontology.json', 'w', encoding='utf-8'), indent=2, ensure_ascii=False)
    json.dump(dialogues, open(f'{new_data_dir}/dialogues.json', 'w', encoding='utf-8'), indent=2, ensure_ascii=False)
    with ZipFile('data.zip', 'w', ZIP_DEFLATED) as zf:
        for filename in os.listdir(new_data_dir):
            zf.write(f'{new_data_dir}/{filename}')
    # rmtree(original_data_dir)
    # rmtree(new_data_dir)
    return dialogues, ontology


if __name__ == '__main__':
    preprocess()
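
After running the script, the transformed data can be read back from `data.zip`, e.g. (a minimal sketch; the archive member names follow the `zf.write` calls above, and it assumes `preprocess.py` has already been run in this directory):

```python
import json
from zipfile import ZipFile

# Load the transformed data produced by preprocess().
with ZipFile('data.zip') as zf:
    dialogues = json.loads(zf.read('data/dialogues.json'))
    ontology = json.loads(zf.read('data/ontology.json'))

print(len(dialogues))            # 13118 dialogues across all splits
print(dialogues[0]['turns'][0])  # first turn of the first training dialogue
```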