Commit e30a3991 authored by Carel van Niekerk

Refactor unified dataset code

parent e19624c3
@@ -23,8 +23,13 @@ import torch
 import numpy as np
 
 
-# Set seeds
 def set_seed(args):
+    """
+    Set random seeds
+
+    Args:
+        args (Arguments class): Arguments class containing seed and number of gpus to use
+    """
     random.seed(args.seed)
     np.random.seed(args.seed)
     torch.manual_seed(args.seed)
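
For orientation, a minimal sketch of how set_seed might be called; the flag names --seed and --n_gpu are assumptions inferred from the attributes the function reads, not the repository's actual CLI:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--seed', type=int, default=0)   # assumed flag name
    parser.add_argument('--n_gpu', type=int, default=1)  # assumed flag name
    args = parser.parse_args([])

    set_seed(args)  # seeds python's random, numpy and torch (plus CUDA when n_gpu > 0)
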
@@ -32,31 +37,30 @@ def set_seed(args):
         torch.cuda.manual_seed_all(args.seed)
 
 
-# Embed candidates
 def encode_candidates(candidates: list, args, tokenizer, embedding_model) -> torch.tensor:
-    '''
+    """
     Embed candidates
 
     Args:
         candidates (list): List of candidate descriptions
         args (argument class): Runtime arguments
         tokenizer (transformers Tokenizer): Tokenizer for the embedding_model
-        embedding_model (transformer Model): Transormer model for embedding candidate descriptions
+        embedding_model (transformer Model): Transformer model for embedding candidate descriptions
 
     Returns:
         feats (torch.tensor): Embeddings of the candidate descriptions
-    '''
+    """
     # Tokenize candidate descriptions
-    feats = [tokenizer.encode_plus(val, add_special_tokens = True,
-                                   max_length = args.max_candidate_len, padding='max_length',
-                                   truncation = 'longest_first')
+    feats = [tokenizer.encode_plus(val, add_special_tokens=True, max_length=args.max_candidate_len,
+                                   padding='max_length', truncation='longest_first')
             for val in candidates]
 
-    # Encode tokenized decscriptions
+    # Encode tokenized descriptions
     with torch.no_grad():
         feats = {key: torch.tensor([f[key] for f in feats]).to(embedding_model.device) for key in feats[0]}
         embedded_feats = embedding_model(**feats)  # [num_candidates, max_candidate_len, hidden_dim]
 
-    # Reduce/pool decsriptions embeddings if required
+    # Reduce/pool descriptions embeddings if required
     if args.set_similarity:
         feats = embedded_feats.last_hidden_state.detach().cpu()  # [num_candidates, max_candidate_len, hidden_dim]
     elif args.candidate_pooling == 'cls':
@@ -70,10 +74,10 @@ def encode_candidates(candidates: list, args, tokenizer, embedding_model) -> tor
     return feats
 
 
-# Get embeddings for slots and candidates
 def get_slot_candidate_embeddings(ontology: dict, set_type: str, args, tokenizer, embedding_model, save_to_file=True):
-    '''
+    """
     Get embeddings for slots and candidates
 
     Args:
         ontology (dict): Dictionary of domain-slot pair descriptions and possible value sets
         set_type (str): Subset of the dataset being used (train/validation/test)
@@ -84,7 +88,7 @@ def get_slot_candidate_embeddings(ontology: dict, set_type: str, args, tokenizer
     Returns:
         slots (dict): domain-slot description embeddings, candidate embeddings and requestable flag for each domain-slot
-    '''
+    """
     # Set model to eval mode
     embedding_model.eval()
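
A usage sketch for encode_candidates, assuming a standard Hugging Face encoder; the SimpleNamespace is a stand-in for the real runtime arguments and sets only the attributes visible in this diff, with assumed values:

    from types import SimpleNamespace

    from transformers import AutoTokenizer, AutoModel

    tokenizer = AutoTokenizer.from_pretrained('roberta-base')    # encoder choice assumed
    embedding_model = AutoModel.from_pretrained('roberta-base')

    # Only the attributes encode_candidates reads are set here.
    args = SimpleNamespace(max_candidate_len=32, set_similarity=True, candidate_pooling='cls')

    feats = encode_candidates(['cheap', 'moderate', 'expensive'], args, tokenizer, embedding_model)
    # With set_similarity=True: feats has shape [num_candidates, max_candidate_len, hidden_dim]
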
...
@@ -15,18 +15,19 @@
 # limitations under the License.
 """Convlab3 Unified Format Dialogue Datasets"""
 
+from copy import deepcopy
+
 import torch
 from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
 from transformers.tokenization_utils import PreTrainedTokenizer
-from copy import deepcopy
 
-from convlab.dst.setsumbt.dataset.utils import (load_dataset, get_ontology_slots, ontology_add_values,
+from convlab.util import load_dataset
+from convlab.dst.setsumbt.dataset.utils import (get_ontology_slots, ontology_add_values,
                                                 get_values_from_data, ontology_add_requestable_slots,
                                                 get_requestable_slots, load_dst_data, extract_dialogues,
                                                 combine_value_sets)
 
 
-# Convert dialogue examples to model input features and labels
 def convert_examples_to_features(data: list,
                                  ontology: dict,
                                  tokenizer: PreTrainedTokenizer,
@@ -34,7 +35,8 @@ def convert_examples_to_features(data: list,
                                  max_seq_len: int = 64) -> dict:
     """
     Convert dialogue examples to model input features and labels
-    Parameters:
+
+    Args:
         data (list): List of all extracted dialogues
         ontology (dict): Ontology dictionary containing slots, slot descriptions and
                          possible value sets including requests
@@ -126,7 +128,7 @@ def convert_examples_to_features(data: list,
             labels.append(labs)
 
         labels = torch.tensor(labels)
-        features['labels-' + domslot] = labels
+        features['state_labels-' + domslot] = labels
 
     # Create requestable slot labels
     for domslot in requestable_slots:
@@ -151,9 +153,9 @@ def convert_examples_to_features(data: list,
             labels.append(labs)
 
         labels = torch.tensor(labels)
-        features['request-' + domslot] = labels
+        features['request_labels-' + domslot] = labels
 
-    # Greeting act labels (0-no greeting, 1-goodbye, 2-thank you)
+    # General act labels (1-goodbye, 2-thank you)
     labels = []
     for dial in data:
         labs = []
@@ -172,7 +174,7 @@ def convert_examples_to_features(data: list,
         labels.append(labs)
 
     labels = torch.tensor(labels)
-    features['goodbye'] = labels
+    features['general_act_labels'] = labels
 
     # Create active domain labels
     for domain in domains:
@@ -190,17 +192,17 @@ def convert_examples_to_features(data: list,
             labels.append(labs)
 
         labels = torch.tensor(labels)
-        features['active-' + domain] = labels
+        features['active_domain_labels-' + domain] = labels
 
     del labels
 
     return features
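
Taken together, these renames give every label tensor in the returned features dict a self-describing key. The old-to-new mapping, summarised:

    # 'labels-<domain-slot>'   -> 'state_labels-<domain-slot>'
    # 'request-<domain-slot>'  -> 'request_labels-<domain-slot>'
    # 'goodbye'                -> 'general_act_labels'
    # 'active-<domain>'        -> 'active_domain_labels-<domain>'
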
 
 
-# Unified Dataset object
 class UnifiedFormatDataset(Dataset):
     """
     Class for preprocessing, and storing data easily from the Convlab3 unified format.
 
     Attributes:
         dataset_dict (dict): Dictionary containing all the data in dataset
         ontology (dict): Set of all domain-slot-value triplets in the ontology of the model
@@ -215,8 +217,8 @@ class UnifiedFormatDataset(Dataset):
                  train_ratio: float = 1.0,
                  seed: int = 0):
         """
-        Parameters:
-            dataset_name (str): Name of the dataset to load
+        Args:
+            dataset_name (str): Name of the dataset(s) to load (multiple datasets separated by +)
             set_type (str): Subset of the dataset to load (train, validation or test)
             tokenizer (transformers tokenizer): Tokenizer for the encoder model used
             max_turns (int): Maximum number of turns in a dialogue
@@ -248,48 +250,73 @@ class UnifiedFormatDataset(Dataset):
                 data += extract_dialogues(data_)
         self.features = convert_examples_to_features(data, self.ontology, tokenizer, max_turns, max_seq_len)
 
-    def __getitem__(self, index):
+    def __getitem__(self, index: int) -> dict:
+        """
+        Obtain dialogues with specific ids from dataset
+
+        Args:
+            index (int/list/tensor): Index/indices of dialogues to get
+
+        Returns:
+            features (dict): All inputs and labels required to train the model
+        """
         return {label: self.features[label][index] for label in self.features
                 if self.features[label] is not None}
 
     def __len__(self):
+        """
+        Get number of dialogues in the dataset
+
+        Returns:
+            len (int): Number of dialogues in the dataset object
+        """
         return self.features['input_ids'].size(0)
 
-    # Resample subset of the dataset
-    def resample(self, size=None):
-        '''
+    def resample(self, size: int = None) -> Dataset:
+        """
         Resample subset of the dataset
 
         Args:
             size (int): Number of dialogues to sample
-        '''
+
+        Returns:
+            self (Dataset): Dataset object
+        """
         # If no subset size is specified we resample a set with the same size as the full dataset
        n_dialogues = self.__len__()
        if not size:
            size = n_dialogues
 
        dialogues = torch.randint(low=0, high=n_dialogues, size=(size,))
-       self.features = {label: self.features[label][dialogues] for label in self.features
-                        if self.features[label] is not None}
+       self.features = self.__getitem__(dialogues)
 
        return self
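
Note that torch.randint samples with replacement, so resample produces a bootstrap-style subset that may contain duplicate dialogues, and it overwrites self.features in place. A usage sketch; the dataset name and sizes are assumptions, and tokenizer is assumed to be in scope:

    dataset = UnifiedFormatDataset('multiwoz21', 'train', tokenizer, max_turns=12, max_seq_len=64)
    print(len(dataset))          # full training set size
    dataset.resample(size=1000)  # draw 1000 dialogues, with replacement
    print(len(dataset))          # 1000; reload the dataset to recover the full set
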
 
-    # Map all data to a device
     def to(self, device):
-        '''
+        """
         Map all data to a device
 
         Args:
             device (torch device): Device to map data to
-        '''
+        """
         self.device = device
         self.features = {label: self.features[label].to(device) for label in self.features
                          if self.features[label] is not None}
 
 
-# Module to create torch dataloaders
-def get_dataloader(dataset_name: str, set_type: str, batch_size: int, tokenizer, max_turns: int=12, max_seq_len: int=64,
-                   device='cpu', resampled_size=None, train_ratio=1.0):
+def get_dataloader(dataset_name: str,
+                   set_type: str,
+                   batch_size: int,
+                   tokenizer: PreTrainedTokenizer,
+                   max_turns: int = 12,
+                   max_seq_len: int = 64,
+                   device='cpu',
+                   resampled_size: int = None,
+                   train_ratio: float = 1.0,
+                   seed: int = 0) -> DataLoader:
     '''
     Module to create torch dataloaders
 
     Args:
         dataset_name (str): Name of the dataset to load
         set_type (str): Subset of the dataset to load (train, validation or test)
@@ -299,11 +326,14 @@ def get_dataloader(dataset_name: str, set_type: str, batch_size: int, tokenizer,
         max_seq_len (int): Maximum number of tokens in a dialogue turn
         device (torch device): Device to map data to
         resampled_size (int): Number of dialogues to sample
+        train_ratio (float): Ratio of training data to use for training
+        seed (int): Seed governing random order of ids for subsampling
 
     Returns:
         loader (torch dataloader): Dataloader to train and evaluate the setsumbt model
     '''
-    data = UnifiedFormatDataset(dataset_name, set_type, tokenizer, max_turns, max_seq_len, train_ratio=train_ratio)
+    data = UnifiedFormatDataset(dataset_name, set_type, tokenizer, max_turns, max_seq_len, train_ratio=train_ratio,
+                                seed=seed)
     data.to(device)
 
     if resampled_size:
@@ -316,3 +346,24 @@ def get_dataloader(dataset_name: str, set_type: str, batch_size: int, tokenizer,
 
     loader = DataLoader(data, sampler=sampler, batch_size=batch_size)
     return loader
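
A usage sketch for the refactored get_dataloader; the encoder and dataset names are illustrative assumptions:

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained('roberta-base')  # encoder choice assumed
    loader = get_dataloader('multiwoz21', 'train', batch_size=32, tokenizer=tokenizer,
                            max_turns=12, max_seq_len=64, device='cpu',
                            train_ratio=0.1, seed=42)
    batch = next(iter(loader))
    # batch is a dict of tensors, e.g. batch['input_ids'] plus the renamed label keys above
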
+
+
+def change_batch_size(loader: DataLoader, batch_size: int) -> DataLoader:
+    """
+    Change the batch size of a preloaded loader
+
+    Args:
+        loader (DataLoader): Dataloader to train and evaluate the setsumbt model
+        batch_size (int): Batch size for the dataloader
+
+    Returns:
+        loader (DataLoader): Dataloader to train and evaluate the setsumbt model
+    """
+    if 'SequentialSampler' in str(loader.sampler):
+        sampler = SequentialSampler(loader.dataset)
+    else:
+        sampler = RandomSampler(loader.dataset)
+    loader = DataLoader(loader.dataset, sampler=sampler, batch_size=batch_size)
+    return loader
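
change_batch_size rebuilds the DataLoader around the existing dataset, inferring the sampler type from the old loader. The 'SequentialSampler' in str(loader.sampler) check is a string-based type test; isinstance(loader.sampler, SequentialSampler) would be the stricter equivalent. Usage is a one-liner:

    eval_loader = change_batch_size(loader, batch_size=8)  # same data, new batch size
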
...

@@ -15,13 +15,14 @@
 # limitations under the License.
 """Convlab3 Unified dataset data processing utilities"""
 
-from convlab.util import load_dataset, load_ontology, load_dst_data, load_nlu_data
-from convlab.dst.setsumbt.dataset.value_maps import *
+from convlab.util import load_ontology, load_dst_data, load_nlu_data
+
+from convlab.dst.setsumbt.dataset.value_maps import VALUE_MAP, DOMAINS_MAP, QUANTITIES, TIME
 
 
 def get_ontology_slots(dataset_name: str) -> dict:
     """
     Function to extract slots, slot descriptions and categorical slot values from the dataset ontology.
 
     Args:
         dataset_name (str): Dataset name
@@ -52,6 +53,7 @@ def get_ontology_slots(dataset_name: str) -> dict:
 def get_values_from_data(dataset: dict) -> dict:
     """
     Function to extract slots, slot descriptions and categorical slot values from the dataset ontology.
+    Args:
         dataset (dict): Dataset dictionary obtained using the load_dataset function
def combine_value_sets(value_sets: list) -> dict: def combine_value_sets(value_sets: list) -> dict:
""" """
Function to combine value sets extracted from different datasets Function to combine value sets extracted from different datasets
Args: Args:
value_sets (list): List of value sets extracted using the get_values_from_data function value_sets (list): List of value sets extracted using the get_values_from_data function
...@@ -101,6 +104,7 @@ def combine_value_sets(value_sets: list) -> dict: ...@@ -101,6 +104,7 @@ def combine_value_sets(value_sets: list) -> dict:
def clean_values(value_sets: dict, value_map: dict = VALUE_MAP) -> dict: def clean_values(value_sets: dict, value_map: dict = VALUE_MAP) -> dict:
""" """
Function to clean up the possible value sets extracted from the states in the dataset Function to clean up the possible value sets extracted from the states in the dataset
Args: Args:
value_sets (dict): Dictionary containing possible values obtained from dataset value_sets (dict): Dictionary containing possible values obtained from dataset
value_map (dict): Label map to avoid duplication and typos in values value_map (dict): Label map to avoid duplication and typos in values
......
@@ -42,14 +42,6 @@ DOMAINS_MAP = {'Alarm_1': 'alarm', 'Banks_1': 'banks', 'Banks_2': 'banks', 'Buse
                'auto_repair': 'car_repairs', 'flights': 'flights', 'food-ordering': 'takeout', 'hotels': 'hotel',
                'movies': 'movies', 'music': 'music', 'restaurant-search': 'restaurant', 'sports': 'sports',
                'movie': 'movies'}
 
-INVERSE_DOMAINS_MAP = {item: key for key, item in DOMAINS_MAP.items()}
-
-SLOTS_MAP = {"account_balance": "balance", "transfer_amount": "amount", "from_location": "departure",
-             "from_station": "departure", "origin": "departure", "origin_station_name": "departure",
-             "from_city": "departure", "to_location": "destination", "to_station": "destination",
-             "destination_station_name": "destination", "to_city": "destination", "leaving_date": "departure_date",
-             "leaving_time": "departure_time", "fare": "price", "fare_type": "price"}
 
 # Generic value sets for quantity and time slots
...
# -*- coding: utf-8 -*-
# Copyright 2020 DSML Group, Heinrich Heine University, Düsseldorf
# Authors: Carel van Niekerk (niekerk@hhu.de)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Discriminative models calibration"""
import random
import os
import torch
import numpy as np
from torch.distributions import Categorical
from torch.nn.functional import kl_div
from torch.nn import Module
from tqdm import tqdm
# Register logger and tensorboard summary writer
def set_logger(logger_, tb_writer_):
global logger, tb_writer
logger = logger_
tb_writer = tb_writer_
# Set seeds
def set_seed(args):
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
if args.n_gpu > 0:
torch.cuda.manual_seed_all(args.seed)
logger.info('Seed set to %d.' % args.seed)
def build_train_loaders(args, tokenizer, dataset):
dataloaders = [dataset.get_dataloader('train', args.train_batch_size, tokenizer, args.max_dialogue_len,
args.max_turn_len, resampled_size=args.data_sampling_size)
for _ in range(args.ensemble_size)]
return dataloaders
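
build_train_loaders creates one independently subsampled training loader per ensemble member, i.e. bootstrap-style data sampling for ensemble training. A usage sketch; the argument values are assumptions, and tokenizer and dataset are assumed in scope, with dataset exposing the get_dataloader function called above:

    from types import SimpleNamespace

    # Hypothetical runtime arguments; only the attributes build_train_loaders reads are set.
    args = SimpleNamespace(train_batch_size=32, max_dialogue_len=12, max_turn_len=64,
                           data_sampling_size=7500, ensemble_size=5)
    loaders = build_train_loaders(args, tokenizer, dataset)
    assert len(loaders) == args.ensemble_size  # one loader per ensemble member
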