Commit c3167c9d authored by zqwerty's avatar zqwerty

add woz dataset

parent aede1333
...@@ -23,6 +23,7 @@ data/unified_datasets/dart/dart-v1.1.1-*.json
data/unified_datasets/commongen/commongen_data.zip
data/unified_datasets/kvret/kvret_*
data/unified_datasets/metalwoz/metalwoz-*.zip
data/unified_datasets/woz/woz
data/unified_datasets/personachat/original_data
data/unified_datasets/wow/wizard_of_wikipedia
data/unified_datasets/**/stat.txt
......
# Dataset Card for woz

- **Repository:** https://github.com/nmrksic/neural-belief-tracker/tree/master/data/woz
- **Paper:** https://aclanthology.org/P17-1163.pdf
- **Leaderboard:** None
- **Who transforms the dataset:** Qi Zhu (zhuq96 at gmail dot com)

### Dataset Summary

WOZ 2.0 is a single-domain Wizard-of-Oz dialogue dataset for restaurant search, introduced with the Neural Belief Tracker (Mrkšić et al., 2017). Annotations include user dialogue acts, dialogue state, and character-level spans for non-categorical slots.
- **How to get the transformed data from original data:**
  - Download `woz_[train|validate|test]_en.json` from https://github.com/nmrksic/neural-belief-tracker/tree/master/data/woz and save them to the `woz` dir in the current directory.
  - Run `python preprocess.py` in the current directory.
- **Main changes of the transformation:**
  - The domain is set to **restaurant**.
  - Normalize the values of categorical slots in the state and dialogue acts.
  - `belief_state` in the WOZ dataset contains `request` intents, which are ignored during processing.
  - Use simple string matching to find the value spans of non-categorical slots.
- **Annotations:**
  - User dialogue acts, state.
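The string-match step above can be sketched as a case-insensitive substring search (`find_span` is a hypothetical helper for illustration, not a function from `preprocess.py`):

```python
def find_span(utterance, value):
    """Locate a slot value in an utterance by case-insensitive string match.

    Returns (start, end, surface_form), or None when the value does not
    occur verbatim in the utterance.
    """
    start = utterance.lower().find(value.lower())
    if start == -1:
        return None
    end = start + len(value)
    # Keep the surface form exactly as it appears in the utterance.
    return start, end, utterance[start:end]

span = find_span("How about Chinese food?", "chinese food")
# span == (10, 22, "Chinese food")
```

Values that are paraphrased rather than repeated verbatim yield no span, which is why the span-coverage percentages in the table below stay short of 100.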
  - Note that some state annotations are inconsistent with the dialogue act annotations. For example, in `woz_train_en.json`, first dialogue, 2nd turn (`user: "How about Chinese food?"`), `chinese food` is included in the `dialogue_act` annotation as an `inform` intent but is not updated in the `belief_state` annotation.

### Supported Tasks and Leaderboards

NLU, DST, E2E

### Languages

English

### Data Splits
| split | dialogues | utterances | avg_utt | avg_tokens | avg_domains | cat slot match(state) | cat slot match(goal) | cat slot match(dialogue act) | non-cat slot span(dialogue act) |
|------------|-------------|--------------|-----------|--------------|---------------|-------------------------|------------------------|--------------------------------|-----------------------------------|
| train | 600 | 4472 | 7.45 | 11.37 | 1 | 100 | - | 100 | 96.56 |
| validation | 200 | 1460 | 7.3 | 11.28 | 1 | 100 | - | 100 | 95.52 |
| test | 400 | 2892 | 7.23 | 11.49 | 1 | 100 | - | 100 | 94.83 |
| all | 1200 | 8824 | 7.35 | 11.39 | 1 | 100 | - | 100 | 95.83 |
1 domain: ['restaurant']

- **cat slot match**: percentage of categorical slot values that appear in the ontology's possible-value lists.
- **non-cat slot span**: percentage of non-categorical slot values that have span annotations.
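The first metric could be computed along these lines (a sketch that assumes empty and `dontcare` values are skipped; the actual statistics script is not shown on this page):

```python
def cat_slot_match(values, possible_values):
    """Percentage of categorical slot values found in the ontology's
    possible-value list. Empty and 'dontcare' values are ignored here."""
    checked = [v for v in values if v not in ('', 'dontcare')]
    if not checked:
        return 100.0
    hits = sum(v in possible_values for v in checked)
    return round(100 * hits / len(checked), 2)

score = cat_slot_match(['north', 'east side', 'centre'],
                       ['east', 'west', 'centre', 'north', 'south'])
# score == 66.67 -- 'east side' is not a canonical value until normalized
```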
### Citation
```
@inproceedings{mrksic-etal-2017-neural,
title = "Neural Belief Tracker: Data-Driven Dialogue State Tracking",
author = "Mrk{\v{s}}i{\'c}, Nikola and
{\'O} S{\'e}aghdha, Diarmuid and
Wen, Tsung-Hsien and
Thomson, Blaise and
Young, Steve",
booktitle = "Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2017",
address = "Vancouver, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/P17-1163",
doi = "10.18653/v1/P17-1163",
pages = "1777--1788",
}
```
### Licensing Information
Apache License, Version 2.0
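`preprocess.py` below packs the transformed `dialogues.json` and `ontology.json` into `data.zip`. That layout can be exercised with a minimal round trip (the dialogue below is dummy data, not from the corpus):

```python
import json
import os
from zipfile import ZipFile, ZIP_DEFLATED

# Build a stand-in `data/` directory the way preprocess() does.
os.makedirs('data', exist_ok=True)
json.dump([{'dataset': 'woz', 'data_split': 'train', 'turns': []}],
          open('data/dialogues.json', 'w', encoding='utf-8'))
json.dump({'domains': {}}, open('data/ontology.json', 'w', encoding='utf-8'))
with ZipFile('data.zip', 'w', ZIP_DEFLATED) as zf:
    for filename in os.listdir('data'):
        zf.write(f'data/{filename}')

# Read the transformed corpus back without unpacking to disk.
with ZipFile('data.zip') as zf:
    dialogues = json.loads(zf.read('data/dialogues.json'))
print(dialogues[0]['data_split'])  # prints "train"
```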
import copy
import json
import os
from zipfile import ZipFile, ZIP_DEFLATED
from shutil import rmtree

ontology = {
    'domains': {
        'restaurant': {
            'description': 'search for a restaurant to dine',
            'slots': {
                'food': {
                    'description': 'food type of the restaurant',
                    'is_categorical': False,
                    'possible_values': []
                },
                'area': {
                    'description': 'area of the restaurant',
                    'is_categorical': True,
                    'possible_values': ["east", "west", "centre", "north", "south"]
                },
                'postcode': {
                    'description': 'postal code of the restaurant',
                    'is_categorical': False,
                    'possible_values': []
                },
                'phone': {
                    'description': 'phone number of the restaurant',
                    'is_categorical': False,
                    'possible_values': []
                },
                'address': {
                    'description': 'address of the restaurant',
                    'is_categorical': False,
                    'possible_values': []
                },
                'price range': {
                    'description': 'price range of the restaurant',
                    'is_categorical': True,
                    'possible_values': ["expensive", "moderate", "cheap"]
                },
                'name': {
                    'description': 'name of the restaurant',
                    'is_categorical': False,
                    'possible_values': []
                }
            }
        }
    },
    'intents': {
        'inform': {
            'description': 'system informs user the value of a slot'
        },
        'request': {
            'description': 'system asks the user to provide value of a slot'
        }
    },
    'state': {
        'restaurant': {
            'food': '',
            'area': '',
            'postcode': '',
            'phone': '',
            'address': '',
            'price range': '',
            'name': ''
        }
    },
    'dialogue_acts': {
        "categorical": {},
        "non-categorical": {},
        "binary": {}
    }
}
def convert_da(da, utt):
    global ontology
    converted = {
        'binary': [],
        'categorical': [],
        'non-categorical': []
    }
    for s, v in da:
        if s == 'request':
            converted['binary'].append({
                'intent': 'request',
                'domain': 'restaurant',
                'slot': v,
            })
        else:
            slot_type = 'categorical' if ontology['domains']['restaurant']['slots'][s]['is_categorical'] else 'non-categorical'
            v = v.strip()
            if v != 'dontcare' and ontology['domains']['restaurant']['slots'][s]['is_categorical']:
                if v == 'center':
                    v = 'centre'
                elif v == 'east side':
                    v = 'east'
                assert v in ontology['domains']['restaurant']['slots'][s]['possible_values'], print([s, v, utt])
            converted[slot_type].append({
                'intent': 'inform',
                'domain': 'restaurant',
                'slot': s,
                'value': v
            })
            if slot_type == 'non-categorical' and v != 'dontcare':
                start = utt.lower().find(v)
                if start != -1:
                    end = start + len(v)
                    converted[slot_type][-1]['start'] = start
                    converted[slot_type][-1]['end'] = end
                    converted[slot_type][-1]['value'] = utt[start:end]
    return converted
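The rule-based value fixes inside `convert_da` can be isolated into a small table-driven helper (a sketch; `normalize_value` and `VALUE_FIXES` are illustrative names, not part of this file):

```python
# Hypothetical stand-alone version of the normalization that convert_da
# applies before checking a categorical value against the ontology.
VALUE_FIXES = {'center': 'centre', 'east side': 'east'}

def normalize_value(value, possible_values):
    """Map known spelling variants onto canonical ontology values."""
    value = value.strip()
    if value == 'dontcare' or value in possible_values:
        return value
    return VALUE_FIXES.get(value, value)

fixed = normalize_value(' center ', ['east', 'west', 'centre', 'north', 'south'])
# fixed == 'centre'
```

A lookup table keeps the fix list in one place, so adding another variant does not require touching the conversion loop.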
def convert_state(state):
ret = {
'restaurant': {}
}
for s in woz_desc['restaurant']:
if s == 'domain':
continue
ret['restaurant'][s] = ''
for s in state:
assert s['act'] in ['request', 'inform']
if s['act'] == 'inform':
for _s, _v in s['slots']:
_v = 'expensive' if 'expensive' in _v else _v
_v = 'center' if _v == 'centre' else _v
_v = 'east' if 'east' in _v else _v
# try:
# assert _s not in ret['restaurant']
# except:
# continue
ret['restaurant'][_s] = _v
return ret
def get_state_update(prev_state, cur_state, usr_da, turn_idx, dialog_idx):
ret = {
'categorical': [],
'non-categorical': []
}
for k, v in prev_state['restaurant'].items():
if k in cur_state['restaurant'] and cur_state['restaurant'][k] == v:
continue
if k in cat_slots:
ret['categorical'].append({
'domain': 'restaurant',
'slot': k,
'value': cur_state['restaurant'][k]
})
else:
found = False
for _da in usr_da['non-categorical']:
if _da['slot'] == k and _da['value'] == cur_state['restaurant'][k]:
found = True
if v == 'dontcare':
ret['non-categorical'].append({
'domain': 'restaurant',
'slot': k,
'value': cur_state['restaurant'][k],
})
else:
ret['non-categorical'].append({
'domain': 'restaurant',
'slot': k,
'value': cur_state['restaurant'][k]
})
if 'start' in _da:
ret['non-categorical'][-1].update({
'utt_idx': turn_idx * 2,
'start': _da['start'],
'end': _da['end']
})
if not found:
# print(dialog_idx, turn_idx*2)
# print(k, v)
# print('===================')
ret['non-categorical'].append({
'domain': 'restaurant',
'slot': k,
'value': cur_state['restaurant'][k]
})
return ret
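The removed `get_state_update` above computed the delta between consecutive belief states; its core idea can be sketched as a dict comprehension over one domain's slots (a simplified illustration, ignoring the span bookkeeping):

```python
def state_update(prev_state, cur_state):
    """Slots whose value changed between two consecutive user turns."""
    return {
        slot: value
        for slot, value in cur_state.items()
        if prev_state.get(slot, '') != value
    }

delta = state_update({'food': '', 'area': 'north'},
                     {'food': 'chinese', 'area': 'north'})
# delta == {'food': 'chinese'}
```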
def preprocess():
    original_data_dir = 'woz'
    new_data_dir = 'data'
    os.makedirs(new_data_dir, exist_ok=True)

    dataset = 'woz'
    splits = ['train', 'validation', 'test']
    domain = 'restaurant'
    dialogues_by_split = {split: [] for split in splits}
    global ontology

    for split in splits:
        if split != 'validation':
            filename = os.path.join(original_data_dir, f'woz_{split}_en.json')
        else:
            filename = os.path.join(original_data_dir, 'woz_validate_en.json')
        if not os.path.exists(filename):
            raise FileNotFoundError(
                f'cannot find {filename}, should manually download from https://github.com/nmrksic/neural-belief-tracker/tree/master/data/woz')

        data = json.load(open(filename))
        for item in data:
            dialogue = {
                'dataset': dataset,
                'data_split': split,
                'dialogue_id': f'{dataset}-{split}-{len(dialogues_by_split[split])}',
                'original_id': item['dialogue_idx'],
                'domains': [domain],
                'turns': []
            }
            turns = item['dialogue']
            n_turn = len(turns)
            for i in range(n_turn):
                sys_utt = turns[i]['system_transcript'].strip()
                usr_utt = turns[i]['transcript'].strip()
                usr_da = turns[i]['turn_label']

                for s, v in usr_da:
                    if s == 'request':
                        assert v in ontology['domains']['restaurant']['slots']
                    else:
                        assert s in ontology['domains']['restaurant']['slots']

                if i != 0:
                    dialogue['turns'].append({
                        'utt_idx': len(dialogue['turns']),
                        'speaker': 'system',
                        'utterance': sys_utt,
                        'dialogue_acts': {'binary': [], 'categorical': [], 'non-categorical': []},
                    })

                cur_state = copy.deepcopy(ontology['state'])
                for act_slots in turns[i]['belief_state']:
                    act, slots = act_slots['act'], act_slots['slots']
                    if act == 'inform':
                        for s, v in slots:
                            v = v.strip()
                            if v != 'dontcare' and ontology['domains']['restaurant']['slots'][s]['is_categorical']:
                                if v not in ontology['domains']['restaurant']['slots'][s]['possible_values']:
                                    if v == 'center':
                                        v = 'centre'
                                    elif v == 'east side':
                                        v = 'east'
                                    assert v in ontology['domains']['restaurant']['slots'][s]['possible_values']
                            cur_state[domain][s] = v

                cur_usr_da = convert_da(usr_da, usr_utt)

                # add to dialogue_acts dictionary in the ontology
                for da_type in cur_usr_da:
                    das = cur_usr_da[da_type]
                    for da in das:
                        ontology["dialogue_acts"][da_type].setdefault((da['intent'], da['domain'], da['slot']), {})
                        ontology["dialogue_acts"][da_type][(da['intent'], da['domain'], da['slot'])]['user'] = True

                dialogue['turns'].append({
                    'utt_idx': len(dialogue['turns']),
                    'speaker': 'user',
                    'utterance': usr_utt,
                    'state': cur_state,
                    'dialogue_acts': cur_usr_da,
                })
            dialogues_by_split[split].append(dialogue)

    dialogues = []
    for split in splits:
        dialogues += dialogues_by_split[split]
    for da_type in ontology['dialogue_acts']:
        ontology["dialogue_acts"][da_type] = sorted([str(
            {'user': speakers.get('user', False), 'system': speakers.get('system', False), 'intent': da[0],
             'domain': da[1], 'slot': da[2]}) for da, speakers in ontology["dialogue_acts"][da_type].items()])
    json.dump(dialogues[:10], open(f'dummy_data.json', 'w', encoding='utf-8'), indent=2, ensure_ascii=False)
    json.dump(ontology, open(f'{new_data_dir}/ontology.json', 'w', encoding='utf-8'), indent=2, ensure_ascii=False)
    json.dump(dialogues, open(f'{new_data_dir}/dialogues.json', 'w', encoding='utf-8'), indent=2, ensure_ascii=False)
    with ZipFile('data.zip', 'w', ZIP_DEFLATED) as zf:
        for filename in os.listdir(new_data_dir):
            zf.write(f'{new_data_dir}/{filename}')
    rmtree(original_data_dir)
    rmtree(new_data_dir)
    return dialogues, ontology
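The dialogue-act bookkeeping at the end of `preprocess()` collects speaker flags under tuple keys `(intent, domain, slot)` and then flattens them into a sorted list of stringified dicts for `ontology.json`; in isolation, that step looks like this (dummy acts for illustration):

```python
# Tuple keys collect which speakers used each act type...
acts = {}
for intent, domain, slot in [('request', 'restaurant', 'area'),
                             ('request', 'restaurant', 'food')]:
    acts.setdefault((intent, domain, slot), {})['user'] = True

# ...then the dict is flattened into a sorted, JSON-friendly list.
flat = sorted(str({'user': spk.get('user', False),
                   'system': spk.get('system', False),
                   'intent': da[0], 'domain': da[1], 'slot': da[2]})
              for da, spk in acts.items())
```

Stringifying makes the entries hashable and sortable, at the cost of needing `eval`-style parsing when the ontology is read back.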
if __name__ == '__main__':
......