diff --git a/data/unified_datasets/sgd/data.zip b/data/unified_datasets/sgd/data.zip index e3b9142c51145230a2a5dd8de2b7d8e89694843f..409cdc9efbbb07a1b2f3dbcfa70e19fdf223693f 100644 Binary files a/data/unified_datasets/sgd/data.zip and b/data/unified_datasets/sgd/data.zip differ diff --git a/data/unified_datasets/sgd/preprocess.py b/data/unified_datasets/sgd/preprocess.py index 4f334e3abbebcbd81fdb206726271669832f04ad..255e8008787f90593f77b333ca391adf379ef445 100644 --- a/data/unified_datasets/sgd/preprocess.py +++ b/data/unified_datasets/sgd/preprocess.py @@ -187,9 +187,9 @@ def preprocess(): "turns": [] } cnt += 1 - init_state = {} + prev_state = {} for domain in dialogue['domains']: - init_state.setdefault(domain, deepcopy(ontology['state'][domain])) + prev_state.setdefault(domain, deepcopy(ontology['state'][domain])) for utt_idx, t in enumerate(d['turns']): speaker = t['speaker'].lower() @@ -306,17 +306,22 @@ def preprocess(): assert value == 'dontcare', f'{action}-{slot_info}' if speaker == 'user': - state = deepcopy(init_state) + state = deepcopy(prev_state) active_intent = {} requested_slots = {} for frame in t['frames']: domain = frame['service'] active_intent[domain] = frame['state']['active_intent'] requested_slots[domain] = frame['state']['requested_slots'] - for slot, value_list in frame['state']['slot_values'].items(): - state[domain][slot] = value_list[0] - for value in value_list[1:]: - state[domain][slot] += '|' + value + for slot in state[domain]: + if slot in frame['state']['slot_values']: + value_list = frame['state']['slot_values'][slot] + state[domain][slot] = value_list[0] + for value in value_list[1:]: + state[domain][slot] += '|' + value + else: + state[domain][slot] = '' + prev_state = state turn['state'] = state turn['active_intent'] = active_intent turn['requested_slots'] = requested_slots @@ -348,8 +353,8 @@ def preprocess(): with ZipFile('data.zip', 'w', ZIP_DEFLATED) as zf: for filename in os.listdir(new_data_dir): zf.write(f'{new_data_dir}/{filename}') - rmtree(original_data_dir) - rmtree(new_data_dir) + # rmtree(original_data_dir) + # rmtree(new_data_dir) return dialogues, ontology if __name__ == '__main__':