diff --git a/convlab/policy/genTUS/evaluate.py b/convlab/policy/genTUS/evaluate.py
index 87de854970d2701900ba180d2bf15736071e0c1a..23306764207f3c11ed6493efacacdda0fc530a57 100644
--- a/convlab/policy/genTUS/evaluate.py
+++ b/convlab/policy/genTUS/evaluate.py
@@ -147,6 +147,14 @@ class Evaluator:
                   indent=2)
         return os.path.join(dir_name, "nlg_eval.json")
 
+    @staticmethod
+    def _intent_domain(action):
+        acts = []
+        for intent, domain, slot, value in action:
+            if [intent, domain] not in acts:
+                acts.append([intent, domain])
+        return acts
+
     def evaluation(self, input_file=None, generated_file=None):
         force_prediction = True
         if generated_file:
@@ -187,17 +195,28 @@ class Evaluator:
                 golden_acts.append(dialog["golden_acts"])
             dialog_result = gen_file['dialog']
 
-        scores = {"precision": [], "recall": [], "f1": [], "turn_acc": []}
+        scores = {"complete": {"precision": [], "recall": [], "f1": [], "turn_acc": []},
+                  "intent_domain": {"precision": [], "recall": [], "f1": [], "turn_acc": []}}
         for gen_act, golden_act in zip(gen_acts, golden_acts):
             s = f1_measure(preds=gen_act, labels=golden_act)
-            for metric in scores:
-                scores[metric].append(s[metric])
+            for metric in scores["complete"]:
+                scores["complete"][metric].append(s[metric])
+            s = f1_measure(preds=self._intent_domain(gen_act),
+                           labels=self._intent_domain(golden_act))
+            for metric in scores["intent_domain"]:
+                scores["intent_domain"][metric].append(s[metric])
 
         result = {}
-        for metric in scores:
-            result[metric] = sum(scores[metric])/len(scores[metric])
-            print(f"{metric}: {result[metric]}")
+        # for metric in scores:
+        #     result[metric] = sum(scores[metric])/len(scores[metric])
+        #     print(f"{metric}: {result[metric]}")
+
+        for metric_type, score in scores.items():
+            result[metric_type] = {}
+            for m, s in score.items():
+                result[metric_type][m] = sum(s)/len(s)
+                print(f"{metric_type}-{m}: {result[metric_type][m]}")
 
         result["dialog"] = dialog_result
 
         basename = "semantic_evaluation_result"
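
Note: the hunks above add a second evaluation granularity. Besides matching complete [intent, domain, slot, value] tuples, each turn is also scored on its unique [intent, domain] pairs, and both sets of precision/recall/F1/turn-accuracy scores are averaged and reported. The following is a minimal, standalone sketch of that two-level aggregation (not part of the patch); toy_f1_measure is a hypothetical stand-in for the f1_measure used in evaluate.py, assumed to return the keys "precision", "recall", "f1", and "turn_acc".

    # Sketch only: illustrates the intent-domain collapse and the effect on F1.

    def intent_domain(action):
        """Collapse [intent, domain, slot, value] acts to unique [intent, domain] pairs."""
        acts = []
        for intent, domain, slot, value in action:
            if [intent, domain] not in acts:
                acts.append([intent, domain])
        return acts

    def toy_f1_measure(preds, labels):
        # Hypothetical stand-in for convlab's f1_measure.
        tp = sum(1 for p in preds if p in labels)
        precision = tp / len(preds) if preds else 0.0
        recall = tp / len(labels) if labels else 0.0
        f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
        return {"precision": precision, "recall": recall, "f1": f1,
                "turn_acc": float(sorted(preds) == sorted(labels))}

    gen = [["inform", "hotel", "area", "north"], ["request", "hotel", "price", "?"]]
    gold = [["inform", "hotel", "area", "north"], ["inform", "hotel", "stars", "4"]]

    # Full-act matching penalises the wrong slot/value; intent-domain matching does not.
    print(toy_f1_measure(gen, gold)["f1"])                                # 0.5
    print(toy_f1_measure(intent_domain(gen), intent_domain(gold))["f1"])  # ~0.67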