diff --git a/convlab2/policy/evaluate.py b/convlab2/policy/evaluate.py
index ca864f48d423bd673d7a2160c26d61239165801e..00b8347315f81b0d5955b77738bbe9060203cbf7 100755
--- a/convlab2/policy/evaluate.py
+++ b/convlab2/policy/evaluate.py
@@ -11,7 +11,8 @@ from convlab2.dialog_agent.agent import PipelineAgent
 from convlab2.dialog_agent.session import BiSession
 from convlab2.evaluator.multiwoz_eval import MultiWozEvaluator
 from convlab2.policy.rule.multiwoz import RulePolicy
-from convlab2.util.custom_util import set_seed, get_config, env_config
+from convlab2.task.multiwoz.goal_generator import GoalGenerator
+from convlab2.util.custom_util import set_seed, get_config, env_config, create_goals
 
 
 def init_logging(log_dir_path, path_suffix=None):
@@ -66,9 +67,14 @@ def evaluate(config_path, model_name, verbose=False):
     task_success = {'Complete': [], 'Success': [],
                     'Success strict': [], 'total_return': [], 'turns': []}
 
-    for seed in range(1000, 1400):
+
+    dialogues = 500
+    goal_generator = GoalGenerator()
+    goals = create_goals(goal_generator, num_goals=dialogues, single_domains=False, allowed_domains=None)
+
+    for seed in range(1000, 1000 + dialogues):
         set_seed(seed)
-        sess.init_session()
+        sess.init_session(goal=goals[seed-1000])
         sys_response = []
         actions = 0.0
         total_return = 0.0
diff --git a/convlab2/policy/gdpl/semantic_level_config.json b/convlab2/policy/gdpl/semantic_level_config.json
index d46ad81753614f00ab46e78038b225aa3b723332..8e7178f013cc4981f97e14d5dd5b5920456a0777 100644
--- a/convlab2/policy/gdpl/semantic_level_config.json
+++ b/convlab2/policy/gdpl/semantic_level_config.json
@@ -5,14 +5,9 @@
         "pretrained_load_path": "",
         "batchsz": 1000,
         "seed": 0,
-        "epoch": 200,
+        "epoch": 50,
         "eval_frequency": 5,
         "process_num": 4,
-        "use_masking": false,
-        "use_state_entropy": false,
-        "manually_add_entity_names": false,
-        "use_state_mutual_info": false,
-        "use_confidence_scores": false,
         "sys_semantic_to_usr": false,
         "num_eval_dialogues": 500
     },
diff --git a/convlab2/policy/gdpl/train.py b/convlab2/policy/gdpl/train.py
index ca1a57ea05845b666b7311d842493a4630aa9c92..a58a54cd9bf679cfc8bb2567a2bc933a3f070164 100755
--- a/convlab2/policy/gdpl/train.py
+++ b/convlab2/policy/gdpl/train.py
@@ -10,6 +10,7 @@ import logging
 import time
 import numpy as np
 import torch
+import random
 
 from convlab2.policy.gdpl import GDPL
 from convlab2.policy.gdpl import RewardEstimator
@@ -47,7 +48,7 @@ def sampler(pid, queue, evt, env, policy, batchsz, train_seed=0):
     :return:
     """
 
-    buff = Memory(seed=train_seed)
+    buff = Memory()
     # we need to sample batchsz of (state, action, next_state, reward, mask)
     # each trajectory contains `trajectory_len` num of items, so we only need to sample
    # `batchsz//trajectory_len` num of trajectory totally
@@ -58,6 +59,8 @@ def sampler(pid, queue, evt, env, policy, batchsz, train_seed=0):
     traj_len = 50
     real_traj_len = 0
 
+    set_seed(train_seed)
+
     while sampled_num < batchsz:
         # for each trajectory, we reset the env and get initial state
         s = env.reset()
@@ -121,6 +124,7 @@ def sample(env, policy, batchsz, process_num, seed):
     # batchsz will be splitted into each process,
     # final batchsz maybe larger than batchsz parameters
     process_batchsz = np.ceil(batchsz / process_num).astype(np.int32)
+    train_seeds = random.sample(range(0, 1000), process_num)
 
     # buffer to save all data
     queue = mp.Queue()
@@ -134,7 +138,7 @@ def sample(env, policy, batchsz, process_num, seed):
     evt = mp.Event()
     processes = []
     for i in range(process_num):
-        process_args = (i, queue, evt, env, policy, process_batchsz, seed)
+        process_args = (i, queue, evt, env, policy, process_batchsz, train_seeds[i])
         processes.append(mp.Process(target=sampler, args=process_args))
     for p in processes:
         # set the process as daemon, and it will be killed once the main process is stoped.
diff --git a/convlab2/policy/pg/config.json b/convlab2/policy/pg/config.json
index 480325a980a981ad91f749e3223bdf4bf4ca8ae4..8079b6b7c6ceb5fed5012430ccf8cde1c4b48ee4 100755
--- a/convlab2/policy/pg/config.json
+++ b/convlab2/policy/pg/config.json
@@ -1,7 +1,7 @@
 {
     "batchsz": 32,
     "gamma": 0.99,
-    "lr": 0.00001,
+    "lr": 0.0000001,
     "save_dir": "save",
     "log_dir": "log",
     "save_per_epoch": 5,
diff --git a/convlab2/policy/pg/semantic_level_config.json b/convlab2/policy/pg/semantic_level_config.json
index d46ad81753614f00ab46e78038b225aa3b723332..8e7178f013cc4981f97e14d5dd5b5920456a0777 100644
--- a/convlab2/policy/pg/semantic_level_config.json
+++ b/convlab2/policy/pg/semantic_level_config.json
@@ -5,14 +5,9 @@
         "pretrained_load_path": "",
         "batchsz": 1000,
         "seed": 0,
-        "epoch": 200,
+        "epoch": 50,
         "eval_frequency": 5,
         "process_num": 4,
-        "use_masking": false,
-        "use_state_entropy": false,
-        "manually_add_entity_names": false,
-        "use_state_mutual_info": false,
-        "use_confidence_scores": false,
         "sys_semantic_to_usr": false,
         "num_eval_dialogues": 500
     },
diff --git a/convlab2/policy/pg/train.py b/convlab2/policy/pg/train.py
index d33cd2f6be34d30aa12614e0907df24842fd1c01..0b5c385c28649dd7f5702ca7e6cedfe2c2e9ff7a 100755
--- a/convlab2/policy/pg/train.py
+++ b/convlab2/policy/pg/train.py
@@ -10,6 +10,7 @@ import logging
 import time
 import numpy as np
 import torch
+import random
 
 from convlab2.policy.pg import PG
 from convlab2.policy.rlmodule import Memory
@@ -46,7 +47,7 @@ def sampler(pid, queue, evt, env, policy, batchsz, train_seed=0):
     :return:
     """
 
-    buff = Memory(seed=train_seed)
+    buff = Memory()
     # we need to sample batchsz of (state, action, next_state, reward, mask)
     # each trajectory contains `trajectory_len` num of items, so we only need to sample
     # `batchsz//trajectory_len` num of trajectory totally
@@ -57,6 +58,8 @@ def sampler(pid, queue, evt, env, policy, batchsz, train_seed=0):
     traj_len = 50
     real_traj_len = 0
 
+    set_seed(train_seed)
+
     while sampled_num < batchsz:
         # for each trajectory, we reset the env and get initial state
         s = env.reset()
@@ -120,6 +123,7 @@ def sample(env, policy, batchsz, process_num, seed):
     # batchsz will be splitted into each process,
     # final batchsz maybe larger than batchsz parameters
     process_batchsz = np.ceil(batchsz / process_num).astype(np.int32)
+    train_seeds = random.sample(range(0, 1000), process_num)
 
     # buffer to save all data
     queue = mp.Queue()
@@ -133,7 +137,7 @@ def sample(env, policy, batchsz, process_num, seed):
     evt = mp.Event()
     processes = []
     for i in range(process_num):
-        process_args = (i, queue, evt, env, policy, process_batchsz, seed)
+        process_args = (i, queue, evt, env, policy, process_batchsz, train_seeds[i])
         processes.append(mp.Process(target=sampler, args=process_args))
     for p in processes:
         # set the process as daemon, and it will be killed once the main process is stoped.
diff --git a/convlab2/policy/ppo/semantic_level_config.json b/convlab2/policy/ppo/semantic_level_config.json
index 7182422e3c951f2604e193f0ae884eb9b20a5b15..b5fa40bd32d2200db6f4be274d6e314a694cf0cf 100644
--- a/convlab2/policy/ppo/semantic_level_config.json
+++ b/convlab2/policy/ppo/semantic_level_config.json
@@ -5,14 +5,9 @@
         "pretrained_load_path": "",
         "batchsz": 1000,
         "seed": 0,
-        "epoch": 200,
+        "epoch": 50,
         "eval_frequency": 5,
         "process_num": 4,
-        "use_masking": false,
-        "use_state_entropy": false,
-        "manually_add_entity_names": false,
-        "use_state_mutual_info": false,
-        "use_confidence_scores": false,
         "sys_semantic_to_usr": false,
         "num_eval_dialogues": 500
     },
diff --git a/convlab2/policy/ppo/train.py b/convlab2/policy/ppo/train.py
index 101977dbb64ff376f2845ee8ce361124c362c354..aa1e4e77824906676c8660c82a2185b0eaf8ec80 100755
--- a/convlab2/policy/ppo/train.py
+++ b/convlab2/policy/ppo/train.py
@@ -10,6 +10,7 @@ import logging
 import time
 import numpy as np
 import torch
+import random
 
 from convlab2.policy.ppo import PPO
 from convlab2.policy.rlmodule import Memory
@@ -46,7 +47,7 @@ def sampler(pid, queue, evt, env, policy, batchsz, train_seed=0):
     :return:
     """
 
-    buff = Memory(seed=train_seed)
+    buff = Memory()
     # we need to sample batchsz of (state, action, next_state, reward, mask)
     # each trajectory contains `trajectory_len` num of items, so we only need to sample
     # `batchsz//trajectory_len` num of trajectory totally
@@ -57,6 +58,8 @@ def sampler(pid, queue, evt, env, policy, batchsz, train_seed=0):
     traj_len = 50
     real_traj_len = 0
 
+    set_seed(train_seed)
+
     while sampled_num < batchsz:
         # for each trajectory, we reset the env and get initial state
         s = env.reset()
@@ -120,6 +123,7 @@ def sample(env, policy, batchsz, process_num, seed):
     # batchsz will be splitted into each process,
     # final batchsz maybe larger than batchsz parameters
     process_batchsz = np.ceil(batchsz / process_num).astype(np.int32)
+    train_seeds = random.sample(range(0, 1000), process_num)
 
     # buffer to save all data
     queue = mp.Queue()
@@ -133,7 +137,7 @@ def sample(env, policy, batchsz, process_num, seed):
     evt = mp.Event()
     processes = []
     for i in range(process_num):
-        process_args = (i, queue, evt, env, policy, process_batchsz, seed)
+        process_args = (i, queue, evt, env, policy, process_batchsz, train_seeds[i])
         processes.append(mp.Process(target=sampler, args=process_args))
     for p in processes:
         # set the process as daemon, and it will be killed once the main process is stoped.
diff --git a/convlab2/policy/rlmodule.py b/convlab2/policy/rlmodule.py
index b67cca2c03163ad65933df2d5ff2de7d22bca31f..db46026656d908b2453a9143b3f482ce7378e382 100755
--- a/convlab2/policy/rlmodule.py
+++ b/convlab2/policy/rlmodule.py
@@ -319,17 +319,8 @@ Transition = namedtuple('Transition', ('state', 'action',
 
 
 class Memory(object):
-    def __init__(self, seed=0):
+    def __init__(self):
         self.memory = []
-        self.set_seed(seed)
-
-    def set_seed(self, seed):
-        np.random.seed(seed)
-        torch.random.manual_seed(seed)
-        random.seed(seed)
-        torch.manual_seed(seed)
-        if torch.cuda.is_available():
-            torch.cuda.manual_seed_all(seed)
 
     def push(self, *args):
         """Saves a transition."""
diff --git a/convlab2/policy/rule/multiwoz/policy_agenda_multiwoz.py b/convlab2/policy/rule/multiwoz/policy_agenda_multiwoz.py
index 1ce47425a66f19d415ec204dd98c486794ac551f..315227944c9bea49cb45c10d0715488b8469b14d 100755
--- a/convlab2/policy/rule/multiwoz/policy_agenda_multiwoz.py
+++ b/convlab2/policy/rule/multiwoz/policy_agenda_multiwoz.py
@@ -135,8 +135,8 @@ class UserPolicyAgendaMultiWoz(Policy):
         action = {}
         while len(action) == 0:
             # A -> A' + user_action
-            # action = self.agenda.get_action(random.randint(2, self.max_initiative))
-            action = self.agenda.get_action(self.max_initiative)
+            action = self.agenda.get_action(random.randint(1, self.max_initiative))
+            #action = self.agenda.get_action(self.max_initiative)
 
         # transform to DA
         action = self._transform_usract_out(action)
diff --git a/convlab2/util/custom_util.py b/convlab2/util/custom_util.py
index c58534e142db6057eb8c7ad03d507cb7bced6987..845316d3839cdb89b8dff84903d35e0d7e2788a6 100644
--- a/convlab2/util/custom_util.py
+++ b/convlab2/util/custom_util.py
@@ -18,6 +18,8 @@ from convlab2.dst.rule.multiwoz import RuleDST
 from convlab2.policy.rule.multiwoz import RulePolicy
 from convlab2.evaluator.multiwoz_eval import MultiWozEvaluator
 from convlab2.util import load_dataset
+from convlab2.policy.rule.multiwoz.policy_agenda_multiwoz import Goal
+
 
 import shutil
 
@@ -435,6 +437,19 @@ def act_dict_to_flat_tuple(acts):
         tuples.append([intent, domain, slot, value])
 
 
+def create_goals(goal_generator, num_goals, single_domains=False, allowed_domains=None):
+
+    collected_goals = []
+    while len(collected_goals) != num_goals:
+        goal = Goal(goal_generator)
+        if single_domains and len(goal.domain_goals) > 1:
+            continue
+        if allowed_domains is not None and not set(goal.domain_goals).issubset(set(allowed_domains)):
+            continue
+        collected_goals.append(goal)
+    return collected_goals
+
+
 def map_class(cls_path: str):
     """
     Map to class via package text path
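
Note (not part of the patch): a minimal sketch of how the new create_goals helper added to convlab2/util/custom_util.py is meant to be used, mirroring the evaluate.py hunk above. The dialogue count and keyword arguments are copied from that hunk; the domain names in the second call are illustrative only.

    from convlab2.task.multiwoz.goal_generator import GoalGenerator
    from convlab2.util.custom_util import create_goals

    # Pre-generate a fixed pool of user goals so every evaluation run
    # scores the policy on the same 500 dialogues.
    goal_generator = GoalGenerator()
    goals = create_goals(goal_generator, num_goals=500,
                         single_domains=False, allowed_domains=None)

    # Optionally restrict the pool, e.g. to single-domain goals drawn only
    # from the (illustrative) restaurant/hotel domains.
    restricted = create_goals(goal_generator, num_goals=100,
                              single_domains=True,
                              allowed_domains={'restaurant', 'hotel'})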