Skip to content
Snippets Groups Projects
Commit aede1333 authored by zqwerty's avatar zqwerty
Browse files

add wow data

parent ec2167a3
No related branches found
No related tags found
No related merge requests found
......@@ -24,6 +24,7 @@ data/unified_datasets/commongen/commongen_data.zip
data/unified_datasets/kvret/kvret_*
data/unified_datasets/metalwoz/metalwoz-*.zip
data/unified_datasets/personachat/original_data
data/unified_datasets/wow/wizard_of_wikipedia
data/unified_datasets/**/stat.txt
data/unified_datasets/**/data
data/**/train.json
......
......@@ -83,7 +83,7 @@ def preprocess():
with ZipFile('data.zip', 'w', ZIP_DEFLATED) as zf:
for filename in os.listdir(new_data_dir):
zf.write(f'{new_data_dir}/{filename}')
# rmtree(new_data_dir)
rmtree(new_data_dir)
return dialogues, ontology
......
File added
This diff is collapsed.
from zipfile import ZipFile, ZIP_DEFLATED
from shutil import rmtree
import json
import os
from tqdm import tqdm
from pprint import pprint
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
import html
def preprocess():
    """Convert the Wizard of Wikipedia (WoW) dataset to the unified format.

    Reads the original JSON splits from ``wizard_of_wikipedia/``, writes
    ``dummy_data.json`` (first 10 dialogues), ``data/ontology.json`` and
    ``data/dialogues.json``, zips the ``data`` directory into ``data.zip``
    and then removes the directory.

    Returns:
        tuple: ``(dialogues, ontology)`` where ``dialogues`` is the flat list
        of all dialogues across splits and ``ontology`` is the (empty-schema)
        ontology dict.
    """
    original_data_dir = 'wizard_of_wikipedia'
    new_data_dir = 'data'
    os.makedirs(new_data_dir, exist_ok=True)
    dataset = 'wow'
    splits = ['train', 'validation', 'test_seen', 'test_unseen']
    dialogues_by_split = {split: [] for split in splits}
    # WoW is knowledge-grounded chit-chat: no domains, intents, state or
    # dialogue acts, so the ontology is intentionally empty.
    ontology = {'domains': {},
                'intents': {},
                'state': {},
                'dialogue_acts': {
                    "categorical": [],
                    "non-categorical": [],
                    "binary": []
                }}
    for data_split in splits:
        # Map unified split names onto the original files. The two original
        # validation files (random/topic split) are merged into one
        # 'validation' split; the random/topic test files become the
        # 'test_seen'/'test_unseen' splits respectively.
        if data_split == 'train':
            filenames = ['train.json']
        elif data_split == 'validation':
            filenames = ['valid_random_split.json', 'valid_topic_split.json']
        elif data_split == 'test_seen':
            filenames = ['test_random_split.json']
        else:
            filenames = ['test_topic_split.json']
        for filename in filenames:
            with open(f'{original_data_dir}/{filename}') as f:
                data = json.load(f)
            for original_dial in tqdm(data, desc=data_split):
                topic = html.unescape(original_dial['chosen_topic'])
                # new dialogue; original_id encodes split + running index
                dialogue_id = f'{dataset}-{data_split}-{len(dialogues_by_split[data_split])}'
                dialogue = {
                    'dataset': dataset,
                    'data_split': data_split,
                    'dialogue_id': dialogue_id,
                    'original_id': f'{data_split}-{len(dialogues_by_split[data_split])}',
                    'topic': topic,
                    'turns': []
                }
                dialogues_by_split[data_split].append(dialogue)
                # Accumulate topic -> passage over the dialogue, seeded with
                # the chosen topic's passage, so checked sentences can be
                # traced back to their source passage.
                topic2passage = {topic: original_dial['chosen_topic_passage']}
                for original_turn in original_dial['dialog']:
                    # Speaker labels in the raw data are e.g. '0_Wizard'.
                    speaker = 'system' if 'Wizard' in original_turn['speaker'] else 'user'
                    dialogue['turns'].append({
                        'speaker': speaker,
                        'utterance': original_turn['text'].strip(),
                        'utt_idx': len(dialogue['turns']),
                    })
                    for retrieved in original_turn['retrieved_passages']:
                        for passage_topic, passage in retrieved.items():
                            topic2passage[html.unescape(passage_topic)] = passage
                    if speaker == 'system':
                        # Wizard turns carry the knowledge they grounded on.
                        if len(original_turn['checked_sentence']) == 0:
                            check_sentence = None
                        else:
                            check_sentence = list(original_turn['checked_sentence'].values())[0]
                            check_sentence = None if check_sentence == 'no_passages_used' else check_sentence
                        if len(original_turn['checked_passage']) == 0:
                            if check_sentence and check_sentence not in original_dial['chosen_topic_passage']:
                                # checked_passage missing: search the
                                # retrieved passages for the sentence.
                                for passage_topic, passage in topic2passage.items():
                                    if check_sentence in passage:
                                        checked_passage = passage_topic
                                        break
                                else:
                                    # Data invariant violated — dump the turn
                                    # and abort so the problem is visible.
                                    pprint(original_turn)
                                    exit()
                            else:
                                checked_passage = None
                        else:
                            checked_passage = html.unescape(list(original_turn['checked_passage'].values())[0])
                            # Resolve the passage title to its sentences.
                            checked_passage = None if checked_passage == 'no_passages_used' else topic2passage[checked_passage]
                        dialogue['turns'][-1]['checked_sentence'] = check_sentence
                        dialogue['turns'][-1]['checked_passage'] = checked_passage
    dialogues = dialogues_by_split['train']+dialogues_by_split['validation']+dialogues_by_split['test_seen']+dialogues_by_split['test_unseen']
    json.dump(dialogues[:10], open(f'dummy_data.json', 'w', encoding='utf-8'), indent=2, ensure_ascii=False)
    json.dump(ontology, open(f'{new_data_dir}/ontology.json', 'w', encoding='utf-8'), indent=2, ensure_ascii=False)
    json.dump(dialogues, open(f'{new_data_dir}/dialogues.json', 'w', encoding='utf-8'), indent=2, ensure_ascii=False)
    with ZipFile('data.zip', 'w', ZIP_DEFLATED) as zf:
        for filename in os.listdir(new_data_dir):
            zf.write(f'{new_data_dir}/{filename}')
    rmtree(new_data_dir)
    return dialogues, ontology
# Script entry point: run the full WoW preprocessing pipeline.
if __name__ == '__main__':
    preprocess()
This diff is collapsed.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment