diff --git a/convlab/dst/setsumbt/dataset/ontology.py b/convlab/dst/setsumbt/dataset/ontology.py
index 81e207805c47dcde8b7194ff5f7fac8ff10b1c2f..ce150a61077ad61ab9d7af2ae3537971ae925f55 100644
--- a/convlab/dst/setsumbt/dataset/ontology.py
+++ b/convlab/dst/setsumbt/dataset/ontology.py
@@ -22,6 +22,7 @@ from copy import deepcopy
 
 import torch
 import numpy as np
+from tqdm import tqdm
 
 
 def set_seed(args):
@@ -94,8 +95,8 @@ def get_slot_candidate_embeddings(ontology: dict, set_type: str, args, tokenizer
     embedding_model.eval()
 
     slots = dict()
-    for domain, subset in ontology.items():
-        for slot, slot_info in subset.items():
+    for domain, subset in tqdm(ontology.items(), desc='Domains'):
+        for slot, slot_info in tqdm(subset.items(), desc='Slots'):
             # Get description or use "domain-slot"
             if args.use_descriptions:
                 desc = slot_info['description']
diff --git a/convlab/dst/setsumbt/dataset/unified_format.py b/convlab/dst/setsumbt/dataset/unified_format.py
index 26b67268db5425037732323dfc131c438b40494a..ca5793f6a42d01a9eef9769f363033aa798e16b4 100644
--- a/convlab/dst/setsumbt/dataset/unified_format.py
+++ b/convlab/dst/setsumbt/dataset/unified_format.py
@@ -258,8 +258,8 @@ class UnifiedFormatDataset(Dataset):
             dataset_args = [{"dataset_name": dataset_name}]
         self.dataset_dicts = [load_dataset(**dataset_args_) for dataset_args_ in dataset_args]
         self.ontology = get_ontology_slots(dataset_name)
-        values = [get_values_from_data(dataset) for dataset in self.dataset_dicts]
-        self.ontology = ontology_add_values(self.ontology, combine_value_sets(values))
+        values = [get_values_from_data(dataset, set_type) for dataset in self.dataset_dicts]
+        self.ontology = ontology_add_values(self.ontology, combine_value_sets(values), set_type)
         self.ontology = ontology_add_requestable_slots(self.ontology, get_requestable_slots(self.dataset_dicts))
 
         if train_ratio != 1.0:
diff --git a/convlab/dst/setsumbt/dataset/utils.py b/convlab/dst/setsumbt/dataset/utils.py
index 088480c4d9fec239caeaf3ceb455f1fe0eb962f6..1b601f027b8d8b02df7423e8b3d5fc351deca724 100644
--- a/convlab/dst/setsumbt/dataset/utils.py
+++ b/convlab/dst/setsumbt/dataset/utils.py
@@ -52,17 +52,23 @@ def get_ontology_slots(dataset_name: str) -> dict:
     return ontology_slots
 
 
-def get_values_from_data(dataset: dict) -> dict:
+def get_values_from_data(dataset: dict, data_split: str = "train") -> dict:
     """
     Function to extract slots, slot descriptions and categorical slot values from the dataset ontology.
 
     Args:
         dataset (dict): Dataset dictionary obtained using the load_dataset function
+        data_split (str): Dataset split: train/validation/test
 
     Returns:
         value_sets (dict): Dictionary containing possible values obtained from dataset
     """
     data = load_dst_data(dataset, data_split='all', speaker='user')
+
+    # Exclude test data when building the training/validation ontology
+    if data_split in ['train', 'validation']:
+        data = {key: itm for key, itm in data.items() if key in ['train', 'validation']}
+
     value_sets = {}
     for set_type, dataset in data.items():
         for turn in dataset:
@@ -141,18 +147,22 @@ def clean_values(value_sets: dict, value_map: dict = VALUE_MAP) -> dict:
     return clean_vals
 
 
-def ontology_add_values(ontology_slots: dict, value_sets: dict) -> dict:
+def ontology_add_values(ontology_slots: dict, value_sets: dict, data_split: str = "train") -> dict:
     """
     Add value sets obtained from the dataset to the ontology
 
     Args:
         ontology_slots (dict): Ontology dictionary containing slots, descriptions and categorical slot values
         value_sets (dict): Cleaned Dictionary containing possible values obtained from dataset
+        data_split (str): Dataset split: train/validation/test
 
     Returns:
         ontology_slots (dict): Ontology dictionary containing slots, slot descriptions and possible value sets
     """
     ontology = {}
     for domain in sorted(ontology_slots):
+        if data_split in ['train', 'validation']:
+            if domain not in value_sets:
+                continue
         ontology[domain] = {}
         for slot in sorted(ontology_slots[domain]):
             if not ontology_slots[domain][slot]['possible_values']:
@@ -172,7 +182,7 @@ def get_requestable_slots(datasets: list) -> dict:
     """
     Function to get set of requestable slots from the dataset action labels.
 
     Args:
-        dataset (dict): Dataset dictionary obtained using the load_dataset function
+        datasets (list): List of dataset dictionaries obtained using the load_dataset function
 
     Returns:
         slots (dict): Dictionary containing requestable domain-slot pairs
diff --git a/convlab/policy/ppo/train.py b/convlab/policy/ppo/train.py
index 45681169ff78554dae5228b5e0fce03b9a154b25..f376bde76b59db47409d6ad7c5a55425a2d52e4e 100755
--- a/convlab/policy/ppo/train.py
+++ b/convlab/policy/ppo/train.py
@@ -253,7 +253,7 @@ if __name__ == '__main__':
 
         if idx % conf['model']['eval_frequency'] == 0 and idx != 0:
             time_now = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())
-            logging.info(f"Evaluating at Epoch: {idx} - {time_now}" + '-'*60)
+            logging.info(f"Evaluating after Dialogues: {idx * conf['model']['batchsz']} - {time_now}" + '-' * 60)
 
             eval_dict = eval_policy(conf, policy_sys, env, sess, save_eval, log_save_path)
 
diff --git a/convlab/util/custom_util.py b/convlab/util/custom_util.py
index c79c6f0d06e26f916f06a926c58783ed367e8828..38d8b92a36efd67bdf9166c1c5f9f20734d1ecb5 100644
--- a/convlab/util/custom_util.py
+++ b/convlab/util/custom_util.py
@@ -21,6 +21,7 @@ from convlab.evaluator.multiwoz_eval import MultiWozEvaluator
 from convlab.util import load_dataset
 
 import shutil
+import signal
 
 slot_mapping = {"pricerange": "price range", "post": "postcode", "arriveBy": "arrive by", "leaveAt": "leave at",
@@ -34,6 +35,22 @@ DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 device = DEVICE
 
 
+class timeout:
+    def __init__(self, seconds=10, error_message='Timeout'):
+        self.seconds = seconds
+        self.error_message = error_message
+
+    def handle_timeout(self, signum, frame):
+        raise TimeoutError(self.error_message)
+
+    def __enter__(self):
+        signal.signal(signal.SIGALRM, self.handle_timeout)
+        signal.alarm(self.seconds)
+
+    def __exit__(self, exc_type, exc_value, exc_traceback):
+        signal.alarm(0)
+
+
 class NumpyEncoder(json.JSONEncoder):
     """ Special json encoder for numpy types """
 
@@ -154,20 +171,20 @@ def eval_policy(conf, policy_sys, env, sess, save_eval, log_save_path, single_do
     if conf['model']['process_num'] == 1:
         complete_rate, success_rate, success_rate_strict, avg_return, turns, \
             avg_actions, task_success, book_acts, inform_acts, request_acts, \
-            select_acts, offer_acts = evaluate(sess,
+            select_acts, offer_acts, recommend_acts = evaluate(sess,
                                                num_dialogues=conf['model']['num_eval_dialogues'],
                                                sys_semantic_to_usr=conf['model'][
                                                    'sys_semantic_to_usr'],
                                                save_flag=save_eval, save_path=log_save_path, goals=goals)
-        total_acts = book_acts + inform_acts + request_acts + select_acts + offer_acts
+        total_acts = book_acts + inform_acts + request_acts + select_acts + offer_acts + recommend_acts
     else:
         complete_rate, success_rate, success_rate_strict, avg_return, turns, \
             avg_actions, task_success, book_acts, inform_acts, request_acts, \
-            select_acts, offer_acts = \
+            select_acts, offer_acts, recommend_acts = \
             evaluate_distributed(sess, list(range(1000, 1000 + conf['model']['num_eval_dialogues'])),
                                  conf['model']['process_num'], goals)
-        total_acts = book_acts + inform_acts + request_acts + select_acts + offer_acts
+        total_acts = book_acts + inform_acts + request_acts + select_acts + offer_acts + recommend_acts
 
     task_success_gathered = {}
     for task_dict in task_success:
@@ -178,22 +195,40 @@ def eval_policy(conf, policy_sys, env, sess, save_eval, log_save_path, single_do
     task_success = task_success_gathered
 
     policy_sys.is_train = True
-    logging.info(f"Complete: {complete_rate}, Success: {success_rate}, Success strict: {success_rate_strict}, "
-                 f"Average Return: {avg_return}, Turns: {turns}, Average Actions: {avg_actions}, "
+
+    mean_complete, err_complete = np.average(complete_rate), np.std(complete_rate) / np.sqrt(len(complete_rate))
+    mean_success, err_success = np.average(success_rate), np.std(success_rate) / np.sqrt(len(success_rate))
+    mean_success_strict, err_success_strict = np.average(success_rate_strict), np.std(success_rate_strict) / np.sqrt(len(success_rate_strict))
+    mean_return, err_return = np.average(avg_return), np.std(avg_return) / np.sqrt(len(avg_return))
+    mean_turns, err_turns = np.average(turns), np.std(turns) / np.sqrt(len(turns))
+    mean_actions, err_actions = np.average(avg_actions), np.std(avg_actions) / np.sqrt(len(avg_actions))
+
+    logging.info(f"Complete: {mean_complete}+-{round(err_complete, 2)}, "
+                 f"Success: {mean_success}+-{round(err_success, 2)}, "
+                 f"Success strict: {mean_success_strict}+-{round(err_success_strict, 2)}, "
+                 f"Average Return: {mean_return}+-{round(err_return, 2)}, "
+                 f"Turns: {mean_turns}+-{round(err_turns, 2)}, "
+                 f"Average Actions: {mean_actions}+-{round(err_actions, 2)}, "
                  f"Book Actions: {book_acts/total_acts}, Inform Actions: {inform_acts/total_acts}, "
                  f"Request Actions: {request_acts/total_acts}, Select Actions: {select_acts/total_acts}, "
-                 f"Offer Actions: {offer_acts/total_acts}")
+                 f"Offer Actions: {offer_acts/total_acts}, Recommend Actions: {recommend_acts/total_acts}")
 
     for key in task_success:
         logging.info(
             f"{key}: Num: {len(task_success[key])} Success: {np.average(task_success[key]) if len(task_success[key]) > 0 else 0}")
 
-    return {"complete_rate": complete_rate,
-            "success_rate": success_rate,
-            "success_rate_strict": success_rate_strict,
-            "avg_return": avg_return,
-            "turns": turns,
-            "avg_actions": avg_actions}
+    return {"complete_rate": mean_complete,
+            "success_rate": mean_success,
+            "success_rate_strict": mean_success_strict,
+            "avg_return": mean_return,
+            "turns": mean_turns,
+            "avg_actions": mean_actions,
+            "book_acts": book_acts/total_acts,
+            "inform_acts": inform_acts/total_acts,
+            "request_acts": request_acts/total_acts,
+            "select_acts": select_acts/total_acts,
+            "offer_acts": offer_acts/total_acts,
+            "recommend_acts": recommend_acts/total_acts}
 
 
 def env_config(conf, policy_sys, check_book_constraints=True):
@@ -294,7 +329,7 @@ def evaluate(sess, num_dialogues=400, sys_semantic_to_usr=False, save_flag=False
     task_success = {'All_user_sim': [], 'All_evaluator': [], "All_evaluator_strict": [],
                     'total_return': [], 'turns': [], 'avg_actions': [],
                     'total_booking_acts': [], 'total_inform_acts': [], 'total_request_acts': [],
-                    'total_select_acts': [], 'total_offer_acts': []}
+                    'total_select_acts': [], 'total_offer_acts': [], 'total_recommend_acts': []}
     dial_count = 0
     for seed in range(1000, 1000 + num_dialogues):
         set_seed(seed)
@@ -310,6 +345,7 @@ def evaluate(sess, num_dialogues=400, sys_semantic_to_usr=False, save_flag=False
             request = 0
             select = 0
             offer = 0
+            recommend = 0
            # this 40 represents the max turn of dialogue
            for i in range(40):
                sys_response, user_response, session_over, reward = sess.next_turn(
@@ -332,6 +368,8 @@
                         select += 1
                     if intent.lower() == 'offerbook':
                         offer += 1
+                    if intent.lower() == 'recommend':
+                        recommend += 1
                 avg_actions += len(acts)
                 turn_counter += 1
                 turns += 1
@@ -368,6 +406,7 @@
             task_success['total_request_acts'].append(request)
             task_success['total_select_acts'].append(select)
             task_success['total_offer_acts'].append(offer)
+            task_success['total_recommend_acts'].append(recommend)
 
             # print(agent_sys.agent_saves)
             eval_save['Conversation {}'.format(str(dial_count))] = [
@@ -388,7 +427,7 @@
         np.average(task_success['turns']), np.average(task_success['avg_actions']), task_success, \
         np.average(task_success['total_booking_acts']), np.average(task_success['total_inform_acts']), \
         np.average(task_success['total_request_acts']), np.average(task_success['total_select_acts']), \
-        np.average(task_success['total_offer_acts'])
+        np.average(task_success['total_offer_acts']), np.average(task_success['total_recommend_acts'])
 
 
 def model_downloader(download_dir, model_path):
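
Reviewer note (not part of the patch): a minimal usage sketch for the new `timeout` context manager added to convlab/util/custom_util.py. Since it is built on signal.SIGALRM, it only works on Unix and in the main thread; `run_one_dialogue` below is a hypothetical stand-in for whatever long-running call should be guarded.

    from convlab.util.custom_util import timeout

    try:
        # Abort the guarded call if it hangs for more than 30 seconds.
        with timeout(seconds=30, error_message='dialogue stalled'):
            run_one_dialogue()  # hypothetical long-running call
    except TimeoutError as err:
        print(f'Skipping dialogue: {err}')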
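Reviewer note: eval_policy now reports each metric as the mean over per-dialogue results plus the standard error of the mean (std / sqrt(n)). A self-contained sketch of that aggregation, with made-up sample values:

    import numpy as np

    def mean_and_sem(samples):
        # Mean and standard error of the mean, matching the
        # aggregation this patch adds to eval_policy.
        samples = np.asarray(samples, dtype=float)
        return np.average(samples), np.std(samples) / np.sqrt(len(samples))

    mean, sem = mean_and_sem([0.8, 0.7, 0.9, 0.85])  # made-up per-dialogue success values
    print(f'Success: {mean}+-{round(sem, 2)}')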