diff --git a/data/unified_datasets/README.md b/data/unified_datasets/README.md index 8e502f43eff1115139ade8144a5e3ce1d60aa94f..82b43645a0f9f5bb55cbb58c9b49baa3ca1ebb24 100644 --- a/data/unified_datasets/README.md +++ b/data/unified_datasets/README.md @@ -19,7 +19,7 @@ if __name__ == '__main__': preprocess() ``` -- `data.zip`: the zipped directory contains: +- `data.zip`: the zipped directory `data` contains: - `ontology.json`: dataset ontology, contains descriptions, state definition, etc. - `dialogues.json`: a list of all dialogues in the dataset. - other necessary files such as databases. @@ -42,7 +42,7 @@ class Database: We first introduce the unified format of `ontology` and `dialogues`. To transform a new dataset into the unified format: 1. Create `data/unified_datasets/$dataset` folder, where `$dataset` is the name of the dataset. 2. Write `preprocess.py` to transform the original dataset into the unified format, producing `data.zip` and `dummy_data.json`. -3. Run `python test.py $dataset` in the `data/unified_datasets` directory to check the validation of processed dataset and get data statistics. +3. Run `python check.py $dataset` in the `data/unified_datasets` directory to check the validation of processed dataset and get data statistics. 4. Write `README.md` to describe the data following [How to create dataset README](#how-to-create-dataset-readme). 5. Add `$dataset.py` and `dataset_info.json` following this [instruction](https://huggingface.co/docs/datasets/dataset_script.html) (Here no need to generate dummy data). Upload the dataset directory to Hugging Face's `Datasets` following this [instruction](https://huggingface.co/docs/datasets/share.html#add-a-community-dataset) (set `--organization` to `ConvLab`). @@ -73,37 +73,37 @@ We first introduce the unified format of `ontology` and `dialogues`. To transfor `dialogues.json`: a *list* of dialogues (*dict*) containing: - `dataset`: (*str*) dataset name, must be the same as the data directory. 
-- `data_split`: (*str*) in `["train", "validation", "test"]`. +- `data_split`: (*str*) in `["train", "validation", "test", ...]`. - `dialogue_id`: (*str*) `"$dataset-$split-$id"`, `id` increases from 0. - `domains`: (*list*) involved domains in this dialogue. -- `goal`: (*dict*, optional) - - `description`: (*str*, optional) a string describes the user goal. - - `constraints`: (*dict*, optional) same format as dialogue state of involved domains but with only filled slots as constraints. - - `requirements`: (*dict*, optional) same format as dialogue state of involved domains but with only empty required slots. +- `goal`: (*dict*) + - `description`: (*str*, could be empty) a string describes the user goal. + - `constraints`: (*dict*, could be empty) same format as dialogue state of involved domains but with only filled slots as constraints. + - `requirements`: (*dict*, could be empty) same format as dialogue state of involved domains but with only empty required slots. - `turns`: (*list* of *dict*) - `speaker`: (*str*) "user" or "system". - `utterance`: (*str*) - `utt_idx`: (*int*) `turns['utt_idx']` gives current turn. - - `dialogue_acts`: (*dict*, optional) - - `categorical`: (*list* of *dict*) for categorical slots. + - `dialogue_acts`: (*dict*) + - `categorical`: (*list* of *dict*, could be empty) for categorical slots. - `{"intent": (str), "domain": (str), "slot": (str), "value": (str)}`. Value sets are defined in the ontology. - - `non-categorical` (*list* of *dict*) for non-categorical slots. + - `non-categorical` (*list* of *dict*, could be empty) for non-categorical slots. - `{"intent": (str), "domain": (str), "slot": (str), "value": (str), "start": (int), "end": (int)}`. `start` and `end` are character indexes for the value span in the utterance and can be absent. - - `binary` (*list* of *dict*) for binary dialogue acts in ontology. + - `binary` (*list* of *dict*, could be empty) for binary dialogue acts in ontology. 
- `{"intent": (str), "domain": (str), "slot": (str), "value": (str)}`. Possible dialogue acts are listed in the `ontology['binary_dialogue_acts']`. - - `state`: (*dict*, user side, optional) dialogue state of involved domains. full state is shown in `ontology['state']`. + - `state`: (*dict*, user side, could be empty) dialogue state of involved domains. full state is shown in `ontology['state']`. - `$domain_name`: (*dict*) contains all slots in this domain. - `$slot_name`: (*str*) value for this slot. - - `db_results`: (*dict*, optional) + - `db_results`: (*dict*, system side, could be empty) - `$domain_name`: (*list* of *dict*) topk entities (each entity contains slot-value pairs) Other attributes are optional. -Run `python test.py $dataset` in the `data/unified_datasets` directory to check the validation of processed dataset and get data statistics. +Run `python check.py $dataset` in the `data/unified_datasets` directory to check the validation of processed dataset and get data statistics. ### How to create dataset README Each dataset has a README.md to describe the original and transformed data. Follow the Hugging Face's [dataset card creation](https://huggingface.co/docs/datasets/dataset_card.html) to export `README.md`. Make sure that the following additional information is included in the **Dataset Summary** section: -- Main changes from original data to processed data. +- How to get the transformed data from original data and what are the main changes. - Annotations: whether have user goal, dialogue acts, state, db results, etc. -And the data statistics given by `test.py` should be included in the **Data Splits** section. +And the data statistics given by `check.py` should be included in the **Data Splits** section. 
diff --git a/data/unified_datasets/evaluate.py b/data/unified_datasets/check.py similarity index 51% rename from data/unified_datasets/evaluate.py rename to data/unified_datasets/check.py index 1c68f384db76c34a6e05a4e3993af227e2e3be3c..1f33a22c88f55bba73ee88f455353208549d0880 100644 --- a/data/unified_datasets/evaluate.py +++ b/data/unified_datasets/check.py @@ -1,11 +1,14 @@ import json import os from copy import deepcopy +from zipfile import ZipFile +import importlib +from tabulate import tabulate -special_values = ['dontcare', ''] +special_values = ['', 'dontcare', None] -def check_ontology(name): +def check_ontology(ontology): """ ontology: { "domains": { @@ -13,10 +16,9 @@ def check_ontology(name): "description": domain description, "slots": { slot name: { - "description": slot description - // possible_values is empty iff is_categorical is False + "description": slot description, "is_categorical": is_categorical, - "possible_values": [possible_values...] + "possible_values": [possible_values...], not empty if is_categorical } } } @@ -26,11 +28,11 @@ def check_ontology(name): "description": intent description } }, - "binary_dialogue_act": { + "binary_dialogue_acts": { [ { "intent": intent name, - "domain": domain name + "domain": domain name, "slot": slot name, "value": some value } @@ -44,10 +46,6 @@ def check_ontology(name): } """ global special_values - - ontology_file = os.path.join(f'{name}', 'ontology.json') - assert os.path.exists(ontology_file), f'ontology file should named {ontology_file}' - ontology = json.load(open(ontology_file)) # record issues in ontology descriptions = { @@ -59,8 +57,6 @@ def check_ontology(name): for domain_name, domain in ontology['domains'].items(): if not domain['description']: descriptions["domains"] = False - # if not domain_name in ontology['state']: - # print(f"domain '{domain_name}' not found in state") for slot_name, slot in domain["slots"].items(): if not slot["description"]: descriptions["slots"] = False @@ -75,22 
+71,17 @@ def check_ontology(name): descriptions["intents"] = False binary_dialogue_acts = set() - for bda in ontology['binary_dialogue_act']: + for bda in ontology['binary_dialogue_acts']: assert bda['intent'] is None or bda["intent"] in ontology['intents'], f'ONTOLOGY\tintent undefined intent in binary dialog act: {bda}' binary_dialogue_acts.add(tuple(bda.values())) ontology['bda_set'] = binary_dialogue_acts assert 'state' in ontology, 'ONTOLOGY\tno state' - redundant_value = False for domain_name, domain in ontology['state'].items(): assert domain_name in ontology['domains'] for slot_name, value in domain.items(): assert slot_name in ontology['domains'][domain_name]['slots'] - if value: - redundant_value = True - - if redundant_value: - print('ONTOLOGY: redundant value description in state') + assert value == "", "should set value in state to \"\"" # print('description existence:', descriptions, '\n') for description, value in descriptions.items(): @@ -99,27 +90,13 @@ def check_ontology(name): return ontology -def check_data(name, ontology): +def check_dialogues(name, dialogues, ontology): global special_values - from zipfile import ZipFile - data_file = os.path.join(f'{name}', 'data.zip') - if not os.path.exists(data_file): - print('cannot find data.zip') - return - - print('loading data') - with ZipFile(data_file) as zipfile: - with zipfile.open('data.json', 'r') as f: - data = json.load(f) - all_id = set() - splits = ['train', 'val', 'test'] + splits = ['train', 'validation', 'test'] da_values = 0 da_matches = 0 - state_values = 0 - state_matches = 0 - distances = [] stat_keys = ['dialogues', 'utterances', 'tokens', 'domains'] stat = { split: { @@ -129,24 +106,26 @@ def check_data(name, ontology): # present for both non-categorical or categorical - for dialogue in data: + for dialogue in dialogues: dialogue_id = dialogue['dialogue_id'] - assert isinstance(dialogue_id, str), '`dialogue_id` is expected to be str type' - dialogue_id = str(dialogue_id) + assert 
isinstance(dialogue_id, str), f'{dialogue_id}\t`dialogue_id` is expected to be str type' assert dialogue['dataset'] == name, f'{dialogue_id}\tinconsistent dataset name: {dialogue["dataset"]}' split = dialogue['data_split'] - assert split in splits, f'unknown split: `{split}`' + assert isinstance(split, str), f'{dialogue_id}\t`split` is expected to be str type but got {type(split)}' + if split not in splits: + splits.append(split) + stat[split] = {key: 0 for key in stat_keys} + cur_stat = stat[split] cur_stat['dialogues'] += 1 try: - prefix, num = dialogue_id.split('_') - assert prefix == name + prefix, id_split, num = dialogue_id.split('-') + assert prefix == name and id_split == split int(num) # try converting to int except: - print(f'{dialogue_id}\twrong dialogue id format: {dialogue_id}') - raise Exception + raise Exception(f'{dialogue_id}\twrong dialogue id format: {dialogue_id}') assert dialogue_id not in all_id, f'multiple dialogue id: {dialogue_id}' all_id.add(dialogue_id) @@ -158,17 +137,45 @@ def check_data(name, ontology): for domain_name in cur_domains: assert domain_name in ontology['domains'], f'{dialogue_id}\tundefined current domain: {domain_name}' + # check domain-slot-value + # prefix: error prefix + def check_dsv(domain_name, slot_name, value, categorical=None, prefix=f'{dialogue_id}'): + assert domain_name in cur_domains, f'{prefix}\t{domain_name} not presented in current domains' + domain = ontology['domains'][domain_name] + assert slot_name in domain['slots'], f'{prefix}\t{slot_name} not presented in domain {domain_name} in ontology' + slot = domain['slots'][slot_name] + if categorical is None: + categorical = slot['is_categorical'] + else: + assert categorical == slot['is_categorical'], \ + f'{prefix}\t{domain_name}-{slot_name} is_categorical should be {slot["is_categorical"]} as in ontology' + if categorical: + value = value.lower() + assert value in special_values or value in slot['possible_values'], \ + f'{prefix}\t`{value}` not presented 
in possible values of {domain_name}-{slot_name}: {slot["possible_values"]}' + + def check_da(da, categorical): + assert da['intent'] in ontology['intents'], f'{dialogue_id}:{turn_id}:da\tundefined intent {da["intent"]}' + check_dsv(da['domain'], da['slot'], da['value'], categorical, f'{dialogue_id}:{turn_id}:da') + + goal = dialogue['goal'] + assert isinstance(goal['description'], str), f'{dialogue_id}\tgoal description {goal["description"]} should be string' + assert isinstance(goal['constraints'], dict), f'{dialogue_id}\tgoal constraints {goal["constraints"]} should be dict' + assert isinstance(goal['requirements'], dict), f'{dialogue_id}\tgoal requirements {goal["requirements"]} should be dict' + for domain_name, domain in goal['constraints'].items(): + for slot_name, value in domain.items(): + check_dsv(domain_name, slot_name, value, prefix=f'{dialogue_id}:goal:constraints') + assert value != "", f'{dialogue_id}\tshould set non-empty value in goal constraints {goal["constraints"]}' + for domain_name, domain in goal['requirements'].items(): + for slot_name, value in domain.items(): + check_dsv(domain_name, slot_name, value, prefix=f'{dialogue_id}:goal:requirements') + assert value == "", f'{dialogue_id}\tshould set empty value in goal requirements {goal["requirements"]}' + turns = dialogue['turns'] cur_stat['utterances'] += len(turns) assert turns, f'{dialogue_id}\tempty turn' - assert turns[0]['speaker'] == 'user', f'{dialogue_id}\tnot start with user role' - if ontology['state']: - # update cur_state with state_update every turn, and compare it with state annotation - cur_state = { - domain_name: deepcopy(ontology['state'][domain_name]) for domain_name in cur_domains - } - # check dialog act + # assert turns[0]['speaker'] == 'user', f'{dialogue_id}\tnot start with user role' for turn_id, turn in enumerate(turns): assert turn['speaker'] in ['user', 'system'], f'{dialogue_id}:{turn_id}\tunknown speaker value: {turn["speaker"]}' assert turn_id == turn['utt_idx'], 
f'{dialogue_id}:{turn_id}\twrong utt_idx' @@ -177,27 +184,11 @@ def check_data(name, ontology): utterance = turn['utterance'] cur_stat['tokens'] += len(utterance.strip().split(' ')) - dialogue_acts = turn['dialogue_act'] - - # check domain-slot-value - # prefix: error prefix - def check_dsv(domain_name, slot_name, value, categorical, prefix): - assert domain_name in cur_domains or domain_name == 'booking', f'{prefix}\t{domain_name} not presented in current domains' - domain = ontology['domains'][domain_name] - assert slot_name in domain['slots'], f'{prefix}\t{slot_name} not presented in domain {domain_name}' - slot = domain['slots'][slot_name] - if categorical: - assert slot['is_categorical'], f'{prefix}\t{domain_name}-{slot_name} is not categorical' - value = value.lower() - assert value in special_values or value in slot['possible_values'], f'{prefix}\t`{value}` not presented in possible values of' \ - f' {domain_name}-{slot_name}: {slot["possible_values"]}' - else: - assert not slot['is_categorical'], f'{prefix}\t{domain_name}-{slot_name} is not non-categorical' - - def check_da(da, categorical): - assert da['intent'] in ontology['intents'], f'{dialogue_id}:{turn_id}\tundefined intent {da["intent"]}' - check_dsv(da['domain'], da['slot'], da['value'], categorical, f'{dialogue_id}:{turn_id}:da') + dialogue_acts = turn['dialogue_acts'] + assert isinstance(dialogue_acts['categorical'], list), f'{dialogue_id}:{turn_id}\tcategorical dialogue_acts should be a list' + assert isinstance(dialogue_acts['non-categorical'], list), f'{dialogue_id}:{turn_id}\tnon-categorical dialogue_acts should be a list' + assert isinstance(dialogue_acts['binary'], list), f'{dialogue_id}:{turn_id}\tbinary dialogue_acts should be a list' for da in dialogue_acts['categorical']: check_da(da, True) for da in dialogue_acts['non-categorical']: @@ -205,7 +196,7 @@ def check_data(name, ontology): # values only match after .strip() in some case, it's the issue of pre-processing if da['value'] not in 
special_values: da_values += 1 - assert 'start' in da and 'end' in da or 'start' not in da and 'end' not in da, \ + assert ('start' in da) == ('end' in da), \ f'{dialogue_id}:{turn_id}\tstart and end field in da should both present or neither not present' if 'start' in da: value = utterance[da['start']:da['end']] @@ -214,51 +205,37 @@ def check_data(name, ontology): for da in dialogue_acts['binary']: assert tuple(da.values()) in ontology['bda_set'], f'{dialogue_id}:{turn_id}\tbinary dialog act {da} not present in ontology' - # do not check domain-slot-value in binary dialogue acts + # do not check_dsv for binary dialogue acts if turn['speaker'] == 'user': - assert 'state' in turn and 'state_update' in turn, f"{dialogue_id}:{turn_id}\tstate and state_update must present in user's role" - state_update = turn['state_update'] - - def apply_update(update, categorical): - domain_name = update['domain'] - slot_name = update['slot'] - value = update['value'] - check_dsv(domain_name, slot_name, value, categorical, f'{dialogue_id}:{turn_id}:state_update') - cur_state[domain_name][slot_name] = value - if ontology['state']: - for update in state_update['categorical']: - apply_update(update, True) - for update in state_update['non-categorical']: - apply_update(update, False) - value = update['value'] - if value not in special_values: - state_values += 1 - if 'utt_idx' in update: - if turns[update['utt_idx']]['utterance'][update['start']:update['end']].lower() == update['value']: - state_matches += 1 - else: - print('value in utt:\t', turns[update['utt_idx']]['utterance'][update['start']:update['end']].strip()) - print('value in state:\t', update['value']) - pass - - assert cur_state == turn['state'], f'{dialogue_id}:{turn_id}:state_update incorrect state or state update calculation' + assert 'db_results' not in turn + assert 'state' in turn, f"{dialogue_id}:{turn_id}\tstate must present in user's role, but could be empty" + state = turn['state'] + assert isinstance(state, 
dict), f'{dialogue_id}:{turn_id}\tstate should be a dict' + for domain_name, domain in state.items(): + for slot_name, value in domain.items(): + check_dsv(domain_name, slot_name, value, prefix=f'{dialogue_id}:{turn_id}:state') else: - assert 'state' not in turn or 'state_update' in turn, f"{dialogue_id}:{turn_id}\tstate or state_update cannot present in system's role" + assert 'state' not in turn, f"{dialogue_id}:{turn_id}\tstate cannot present in system's role" + assert 'db_results' in turn + db_results = turn['db_results'] + assert isinstance(db_results, dict), f'{dialogue_id}:{turn_id}\tdb_results should be a dict' + for domain_name, results in db_results.items(): + assert domain_name in cur_domains, f'{dialogue_id}:{turn_id}:db_results\t{domain_name} not presented in current domains' + assert isinstance(results, list) - assert turns[-1]['speaker'] == 'user', f'{dialogue_id} dialog must end with user role' + # assert turns[-1]['speaker'] == 'user', f'{dialogue_id} dialog must end with user role' if da_values: - print('da values match rate: {:.3f}'.format(da_matches * 100 / da_values)) - if state_values: - print('state values match rate: {:.3f}'.format(state_matches * 100 / state_values)) + print('da values span match rate: {:.3f}'.format(da_matches * 100 / da_values)) all_stat = {key: 0 for key in stat_keys} for key in stat_keys: all_stat[key] = sum(stat[split][key] for split in splits) stat['all'] = all_stat + table = [] for split in splits + ['all']: cur_stat = stat[split] if cur_stat['dialogues']: @@ -267,19 +244,24 @@ def check_data(name, ontology): cur_stat['avg_utt'] = round(cur_stat.pop('utterances') / cur_stat['dialogues'], 2) cur_stat['avg_tokens'] = round(cur_stat.pop('tokens') / cur_stat['utterances'], 2) cur_stat['avg_domains'] = round(cur_stat.pop('domains') / cur_stat['dialogues'], 2) else: del stat[split] + table.append({ + 'split':split, + '\# dialogues': cur_stat['dialogues'], '\# utterances': cur_stat['utterances'], + 'avg_utt': cur_stat['avg_utt'], 'avg_tokens': cur_stat['avg_tokens'], 'avg_domains': cur_stat['avg_domains'] + }) + print(f'domains: {len(ontology["domains"])}') - print(json.dumps(stat, 
indent=4)) - if state_matches: - for dis, cnt in enumerate(distances): - print(cnt) + print('\n\nCopy-and-paste the following statistics to dataset README.md->Dataset Summary section') + print(tabulate(table, headers='keys', tablefmt='github')) + print() if __name__ == '__main__': from argparse import ArgumentParser - parser = ArgumentParser(description="evaluate pre-processed datasets") - parser.add_argument('datasets', metavar='dataset_name', nargs='*', help='dataset names to be evaluated') - parser.add_argument('--all', action='store_true', help='evaluate all datasets') + parser = ArgumentParser(description="test pre-processed datasets") + parser.add_argument('datasets', metavar='dataset_name', nargs='*', help='dataset names to be tested') + parser.add_argument('--all', action='store_true', help='test all datasets') parser.add_argument('--no-int', action='store_true', help='not interrupted by exception') parser.add_argument('--preprocess', '-p', action='store_true', help='run preprocess automatically') args = parser.parse_args() @@ -293,7 +275,7 @@ if __name__ == '__main__': parser.print_help() exit(1) - print('datasets to be evaluated:', datasets) + print('datasets to be tested:', datasets) fail = [] @@ -301,29 +283,38 @@ if __name__ == '__main__': try: print('') if not os.path.isdir(name): - print(f'dataset {name} not found') - continue + raise FileNotFoundError(f'dataset {name} not found') print(f'checking {name}') preprocess_file = os.path.join(f'{name}', 'preprocess.py') if not os.path.exists(preprocess_file): - print('no preprocess.py') - if args.preprocess: - print(f'skip evaluation of {name}') - continue + raise FileNotFoundError(f'no {preprocess_file}') + if args.preprocess: print('pre-processing') os.chdir(name) - import importlib preprocess = importlib.import_module(f'{name}.preprocess') preprocess.preprocess() os.chdir('..') - ontology = check_ontology(name) - check_data(name, ontology) + data_file = os.path.join(f'{name}', 'data.zip') + if not 
os.path.exists(data_file): + raise FileNotFoundError(f'cannot find {data_file}') + + with ZipFile(data_file) as zipfile: + print('check ontology') + with zipfile.open('data/ontology.json', 'r') as f: + ontology = json.load(f) + check_ontology(ontology) + + print('check dialogues') + with zipfile.open('data/dialogues.json', 'r') as f: + dialogues = json.load(f) + check_dialogues(name, dialogues, ontology) except Exception as e: if args.no_int: + print(e) fail.append(name) else: raise e