utils.py

# -*- coding: utf-8 -*-
# Copyright 2023 DSML Group, Heinrich Heine University, Düsseldorf
# Authors: Carel van Niekerk (niekerk@hhu.de)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convlab3 Unified dataset data processing utilities"""

import numpy

from convlab.util import load_ontology, load_dst_data, load_nlu_data
from convlab.dst.setsumbt.datasets.value_maps import VALUE_MAP, DOMAINS_MAP, QUANTITIES, TIME


def get_ontology_slots(dataset_name: str) -> dict:
    """
    Function to extract slots, slot descriptions and categorical slot values from the dataset ontology.

    Args:
        dataset_name (str): Dataset name

    Returns:
        ontology_slots (dict): Ontology dictionary containing slots, descriptions and categorical slot values
    """
    dataset_names = dataset_name.split('+') if '+' in dataset_name else [dataset_name]
    ontology_slots = dict()
    for dataset_name in dataset_names:
        ontology = load_ontology(dataset_name)
        domains = [domain for domain in ontology['domains'] if domain not in ['booking', 'general']]
        for domain in domains:
            domain_name = DOMAINS_MAP.get(domain, domain.lower())
            if domain_name not in ontology_slots:
                ontology_slots[domain_name] = dict()
            for slot, slot_info in ontology['domains'][domain]['slots'].items():
                slot_name = slot.replace('.', '_')
                if slot_name not in ontology_slots[domain_name]:
                    ontology_slots[domain_name][slot_name] = {'description': slot_info['description'],
                                                              'possible_values': list(),
                                                              'dataset_names': list()}
                if slot_info['is_categorical']:
                    ontology_slots[domain_name][slot_name]['possible_values'] += slot_info['possible_values']

                unique_vals = list(set(ontology_slots[domain_name][slot_name]['possible_values']))
                ontology_slots[domain_name][slot_name]['possible_values'] = unique_vals
                ontology_slots[domain_name][slot_name]['dataset_names'].append(dataset_name)

    return ontology_slots


def get_values_from_data(dataset: dict, data_split: str = "train") -> dict:
    """
    Function to extract slots, slot descriptions and categorical slot values from the dataset ontology.

    Args:
        dataset (dict): Dataset dictionary obtained using the load_dataset function
        data_split (str): Dataset split: train/validation/test

    Returns:
        value_sets (dict): Dictionary containing possible values obtained from dataset
    """
    data = load_dst_data(dataset, data_split='all', speaker='user')

    # Remove test data from the data when building training/validation ontology
    if data_split == 'train':
        data = {key: itm for key, itm in data.items() if key == 'train'}
    elif data_split == 'validation':
        data = {key: itm for key, itm in data.items() if key in ['train', 'validation']}

    value_sets = {}
    for set_type, dataset in data.items():
        for turn in dataset:
            for domain, substate in turn['state'].items():
                domain_name = DOMAINS_MAP.get(domain, domain.lower())
                if domain_name not in value_sets:
                    value_sets[domain_name] = {}
                for slot, value in substate.items():
                    slot_name = slot.replace('.', '_')
                    if slot_name not in value_sets[domain_name]:
                        value_sets[domain_name][slot_name] = []
                    if value and value not in value_sets[domain_name][slot_name]:
                        value_sets[domain_name][slot_name].append(value)

    return clean_values(value_sets)


def combine_value_sets(value_sets: list) -> dict:
    """
    Function to combine value sets extracted from different datasets

    Args:
        value_sets (list): List of value sets extracted using the get_values_from_data function

    Returns:
        value_set (dict): Dictionary containing possible values obtained from datasets
    """
    value_set = value_sets[0]
    for _value_set in value_sets[1:]:
        for domain, domain_info in _value_set.items():
            for slot, possible_values in domain_info.items():
                if domain not in value_set:
                    value_set[domain] = dict()
                if slot not in value_set[domain]:
                    value_set[domain][slot] = list()
                value_set[domain][slot] += _value_set[domain][slot]
                value_set[domain][slot] = list(set(value_set[domain][slot]))

    return value_set


def clean_values(value_sets: dict, value_map: dict = VALUE_MAP) -> dict:
    """
    Function to clean up the possible value sets extracted from the states in the dataset

    Args:
        value_sets (dict): Dictionary containing possible values obtained from dataset
        value_map (dict): Label map to avoid duplication and typos in values

    Returns:
        clean_vals (dict): Cleaned Dictionary containing possible values obtained from dataset
    """
    clean_vals = {}
    for domain, subset in value_sets.items():
        clean_vals[domain] = {}
        for slot, values in subset.items():
            # Remove pipe separated values
            values = list(set([val.split('|', 1)[0] for val in values]))

            # Map values using value_map
            for old, new in value_map.items():
                values = list(set([val.replace(old, new) for val in values]))

            # Remove empty and dontcare from possible value sets
            values = [val for val in values if val not in ['', 'dontcare']]

            # MultiWOZ specific value sets for quantity, time and boolean slots
            if 'people' in slot or 'duration' in slot or 'stay' in slot:
                values = QUANTITIES
            elif 'time' in slot or 'leave' in slot or 'arrive' in slot:
                values = TIME
            elif 'parking' in slot or 'internet' in slot:
                values = ['yes', 'no']

            clean_vals[domain][slot] = values

    return clean_vals


def ontology_add_values(ontology_slots: dict, value_sets: dict, data_split: str = "train") -> dict:
    """
    Add value sets obtained from the dataset to the ontology
    Args:
        ontology_slots (dict): Ontology dictionary containing slots, descriptions and categorical slot values
        value_sets (dict): Cleaned Dictionary containing possible values obtained from dataset
        data_split (str): Dataset split: train/validation/test

    Returns:
        ontology_slots (dict): Ontology dictionary containing slots, slot descriptions and possible value sets
    """
    ontology = {}
    for domain in sorted(ontology_slots):
        if data_split in ['train', 'validation']:
            if domain not in value_sets:
                continue
            possible_values = [v for slot, vals in value_sets[domain].items() for v in vals]
            if len(possible_values) == 0:
                continue
        ontology[domain] = {}
        for slot in sorted(ontology_slots[domain]):
            if not ontology_slots[domain][slot]['possible_values']:
                if domain in value_sets:
                    if slot in value_sets[domain]:
                        ontology_slots[domain][slot]['possible_values'] = value_sets[domain][slot]
            if ontology_slots[domain][slot]['possible_values']:
                values = sorted(ontology_slots[domain][slot]['possible_values'])
                ontology_slots[domain][slot]['possible_values'] = ['none', 'do not care'] + values

            ontology[domain][slot] = ontology_slots[domain][slot]

    return ontology


def get_requestable_slots(datasets: list) -> dict:
    """
    Function to get set of requestable slots from the dataset action labels.
    Args:
        datasets (dict): Dataset dictionary obtained using the load_dataset function

    Returns:
        slots (dict): Dictionary containing requestable domain-slot pairs
    """
    datasets = [load_nlu_data(dataset, data_split='all', speaker='user') for dataset in datasets]

    slots = {}
    for data in datasets:
        for set_type, subset in data.items():
            for turn in subset:
                requests = [act for act in turn['dialogue_acts']['categorical'] if act['intent'] == 'request']
                requests += [act for act in turn['dialogue_acts']['non-categorical'] if act['intent'] == 'request']
                requests += [act for act in turn['dialogue_acts']['binary'] if act['intent'] == 'request']
                requests = [(act['domain'], act['slot']) for act in requests]
                for domain, slot in requests:
                    domain_name = DOMAINS_MAP.get(domain, domain.lower())
                    if domain_name not in slots:
                        slots[domain_name] = []
                    slots[domain_name].append(slot.replace('.', '_'))

    slots = {domain: list(set(slot_list)) for domain, slot_list in slots.items()}

    return slots


def ontology_add_requestable_slots(ontology_slots: dict, requestable_slots: dict) -> dict:
    """
    Add requestable slots obtained from the dataset to the ontology
    Args:
        ontology_slots (dict): Ontology dictionary containing slots, descriptions and categorical slot values
        requestable_slots (dict): Dictionary containing requestable domain-slot pairs

    Returns:
        ontology_slots (dict): Ontology dictionary containing slots, slot descriptions and
        possible value sets including requests
    """
    for domain in ontology_slots:
        for slot in ontology_slots[domain]:
            if domain in requestable_slots:
                if slot in requestable_slots[domain]:
                    ontology_slots[domain][slot]['possible_values'].append('?')

    return ontology_slots


def extract_turns(dialogue: list, dataset_name: str, dialogue_id: str) -> list:
    """
    Extract the required information from the data provided by unified loader
    Args:
        dialogue (list): List of turns within a dialogue
        dataset_name (str): Name of the dataset to which the dialogue belongs
        dialogue_str (str): ID of the dialogue

    Returns:
        turns (list): List of turns within a dialogue
    """
    turns = []
    turn_info = {}
    for turn in dialogue:
        if turn['speaker'] == 'system':
            turn_info['system_utterance'] = turn['utterance']

        # System utterance in the first turn is always empty as conversation is initiated by the user
        if turn['utt_idx'] == 1:
            turn_info['system_utterance'] = ''

        if turn['speaker'] == 'user':
            turn_info['user_utterance'] = turn['utterance']

            # Inform acts not required by model
            turn_info['dialogue_acts'] = [act for act in turn['dialogue_acts']['categorical']
                                          if act['intent'] not in ['inform']]
            turn_info['dialogue_acts'] += [act for act in turn['dialogue_acts']['non-categorical']
                                           if act['intent'] not in ['inform']]
            turn_info['dialogue_acts'] += [act for act in turn['dialogue_acts']['binary']
                                           if act['intent'] not in ['inform']]

            turn_info['state'] = turn['state']
            turn_info['dataset_name'] = dataset_name
            turn_info['dialogue_id'] = dialogue_id

        if 'system_utterance' in turn_info and 'user_utterance' in turn_info:
            turns.append(turn_info)
            turn_info = {}

    return turns


def clean_states(turns: list) -> list:
    """
    Clean the state within each turn of a dialogue (cleaning values and mapping to options used in ontology)
    Args:
        turns (list): List of turns within a dialogue

    Returns:
        clean_turns (list): List of turns within a dialogue
    """
    clean_turns = []
    for turn in turns:
        clean_state = {}
        clean_acts = []
        for act in turn['dialogue_acts']:
            domain = act['domain']
            act['domain'] = DOMAINS_MAP.get(domain, domain.lower())
            act['slot'] = act['slot'].replace('.', '_')
            clean_acts.append(act)
        for domain, subset in turn['state'].items():
            domain_name = DOMAINS_MAP.get(domain, domain.lower())
            clean_state[domain_name] = {}
            for slot, value in subset.items():
                # Remove pipe separated values
                value = value.split('|', 1)[0]

                # Map values using value_map
                for old, new in VALUE_MAP.items():
                    value = value.replace(old, new)

                # Map dontcare to "do not care" and empty to 'none'
                value = value.replace('dontcare', 'do not care')
                value = value if value else 'none'

                # Map quantity values to the integer quantity value
                if 'people' in slot or 'duration' in slot or 'stay' in slot:
                    try:
                        if value not in ['do not care', 'none']:
                            value = int(value)
                            value = str(value) if value < 10 else QUANTITIES[-1]
                    except:
                        value = value
                # Map time values to the most appropriate value in the standard time set
                elif 'time' in slot or 'leave' in slot or 'arrive' in slot:
                    try:
                        if value not in ['do not care', 'none']:
                            # Strip after/before from time value
                            value = value.replace('after ', '').replace('before ', '')
                            # Extract hours and minutes from different possible formats
                            if ':' not in value and len(value) == 4:
                                h, m = value[:2], value[2:]
                            elif len(value) == 1:
                                h = int(value)
                                m = 0
                            elif 'pm' in value:
                                h = int(value.replace('pm', '')) + 12
                                m = 0
                            elif 'am' in value:
                                h = int(value.replace('pm', ''))
                                m = 0
                            elif ':' in value:
                                h, m = value.split(':')
                            elif ';' in value:
                                h, m = value.split(';')
                            # Map to closest 5 minutes
                            if int(m) % 5 != 0:
                                m = round(int(m) / 5) * 5
                                h = int(h)
                                if m == 60:
                                    m = 0
                                    h += 1
                                if h >= 24:
                                    h -= 24
                            # Set in standard 24 hour format
                            h, m = int(h), int(m)
                            value = '%02i:%02i' % (h, m)
                    except:
                        value = value
                # Map boolean slots to yes/no value
                elif 'parking' in slot or 'internet' in slot:
                    if value not in ['do not care', 'none']:
                        if value == 'free':
                            value = 'yes'
                        elif True in [v in value.lower() for v in ['yes', 'no']]:
                            value = [v for v in ['yes', 'no'] if v in value][0]

                clean_state[domain_name][slot.replace('.', '_')] = value
        turn['state'] = clean_state
        turn['dialogue_acts'] = clean_acts
        clean_turns.append(turn)

    return clean_turns


def get_active_domains(turns: list) -> list:
    """
    Get active domains at each turn in a dialogue
    Args:
        turns (list): List of turns within a dialogue

    Returns:
        turns (list): List of turns within a dialogue
    """
    for turn_id in range(len(turns)):
        # At first turn all domains with not none values in the state are active
        if turn_id == 0:
            domains = [d for d, substate in turns[turn_id]['state'].items() for s, v in substate.items() if v != 'none']
            domains += [act['domain'] for act in turns[turn_id]['dialogue_acts'] if act['domain'] in turns[turn_id]['state']]
            domains = [DOMAINS_MAP.get(domain, domain.lower()) for domain in domains]
            turns[turn_id]['active_domains'] = list(set(domains))
        else:
            # Use changes in domains to identify active domains
            domains = []
            for domain, substate in turns[turn_id]['state'].items():
                domain_name = DOMAINS_MAP.get(domain, domain.lower())
                for slot, value in substate.items():
                    if value != turns[turn_id - 1]['state'][domain][slot]:
                        val = value
                    else:
                        val = 'none'
                    if value == 'none':
                        val = 'none'
                    if val != 'none':
                        domains.append(domain_name)
            # Add all domains activated by a user action
            domains += [act['domain'] for act in turns[turn_id]['dialogue_acts']
                        if act['domain'] in turns[turn_id]['state']]
            turns[turn_id]['active_domains'] = list(set(domains))

    return turns


class IdTensor:
    def __init__(self, values):
        self.values = numpy.array(values)

    def __getitem__(self, index: int):
        return self.values[index].tolist()

    def to(self, device):
        return self


def extract_dialogues(data: list, dataset_name: str) -> list:
    """
    Extract all dialogues from dataset

    Args:
        data (list): List of all dialogues in a subset of the data
        dataset_name (str): Name of the dataset to which the dialogues belongs

    Returns:
        dialogues (list): List of all extracted dialogues
    """
    dialogues = []
    for dial in data:
        dial_id = dial['dialogue_id']
        turns = extract_turns(dial['turns'], dataset_name, dial_id)
        turns = clean_states(turns)
        turns = get_active_domains(turns)
        dialogues.append(turns)

    return dialogues