From 848c229abea9873388b861660985a50458b4c503 Mon Sep 17 00:00:00 2001
From: Hsien-Chin Lin <linh@hhu.de>
Date: Fri, 14 Apr 2023 11:30:04 +0200
Subject: [PATCH] fix evaluation script
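
Restructure Evaluator so that evaluation() drives the whole pipeline: it
either forces generation from an input file or loads an existing
generations file, then scores natural language generation (sacreBLEU,
SER), semantic action prediction (precision/recall/F1/turn accuracy),
and emotion/sentiment prediction (macro and per-label F1), collecting
all scores in self.evaluation_result. nlg_evaluation() becomes a static
helper, and timestamps are dropped from the output file names.

A rough usage sketch of the new interface (constructor arguments and
file names below are illustrative assumptions, not part of this patch):

    import torch
    from convlab.policy.emoUS.evaluate import Evaluator

    evaluator = Evaluator(model_checkpoint="path/to/checkpoint",
                          dataset="emowoz")  # assumed constructor arguments
    with torch.no_grad():
        # either force generation from an input file ...
        evaluator.evaluation(input_file="test_data.json")
        # ... or score an existing generations file
        evaluator.evaluation(generated_file="generations.json")
    print(evaluator.evaluation_result)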

---
 convlab/policy/emoUS/evaluate.py | 266 +++++++++++++------------------
 1 file changed, 115 insertions(+), 151 deletions(-)

diff --git a/convlab/policy/emoUS/evaluate.py b/convlab/policy/emoUS/evaluate.py
index 99da1c24..bd97ab2b 100644
--- a/convlab/policy/emoUS/evaluate.py
+++ b/convlab/policy/emoUS/evaluate.py
@@ -50,6 +50,10 @@ class Evaluator:
         self.emotion_weight = kwargs.get("weight", None)
         self.sample = kwargs.get("sample", False)
         print("self.emotion_weight", self.emotion_weight)
+        self.evaluation_result = {
+            "emotion prediction": {},
+            "semantic action prediction": {},
+            "natural language generation": {}}
 
         self.usr = UserActionPolicy(
             model_checkpoint,
@@ -90,6 +94,7 @@ class Evaluator:
         mode = "max"
         if self.sample:
             mode = "sample"
+
         for dialog in tqdm(in_file['dialog']):
             inputs = dialog["in"]
             labels = self.usr._parse_output(dialog["out"])
@@ -129,6 +134,25 @@ class Evaluator:
 
             self._append_result(temp)
 
+        # save generations
+        generations = {}
+        generations["time"] = self.time
+        generations["golden"] = False
+        if golden_action:
+            # golden_action implies golden_emotion
+            generations["golden"] = "golden_action"
+        elif golden_emotion:
+            generations["golden"] = "golden_emotion"
+        generations["mode"] = mode
+        generations["dialog"] = self._transform_result()
+
+        file_name = "generations.json"
+        if generations["golden"]:
+            file_name = generations["golden"] + "_" + file_name
+
+        with open(os.path.join(self.model_checkpoint, file_name), "w") as f:
+            json.dump(generations, f, indent=2)
+        # return the path so evaluation() can reload the dumped generations
+        return os.path.join(self.model_checkpoint, file_name)
+
     def read_generated_result(self, f_eval):
         in_file = json.load(open(f_eval))
 
@@ -148,61 +172,17 @@ class Evaluator:
             result.append(temp)
         return result
 
-    def nlg_evaluation(self, input_file=None, generated_file=None, golden_emotion=False, golden_action=False):
-        if input_file:
-            print("Force generation")
-            self.generate_results(input_file, golden_emotion, golden_action)
-
-        elif generated_file:
-            self.read_generated_result(generated_file)
-        else:
-            print("You must specify the input_file or the generated_file")
-        mode = "max"
-        if self.sample:
-            mode = "sample"
-
-        nlg_eval = {}
-        if golden_action:
-            nlg_eval["golden"] = "golden_action"
-        elif golden_emotion:
-            nlg_eval["golden"] = "golden_emotion"
-        else:
-            nlg_eval["golden"] = False
-
-        nlg_eval["mode"] = mode
-        nlg_eval["emotion_weight"] = self.emotion_weight
-        nlg_eval["metrics"] = {}
-        nlg_eval["dialog"] = self._transform_result()
-
-        # if golden_action:
-        # print("Calculate BLEU")
+    @staticmethod
+    def nlg_evaluation(golden_utts, gen_utts, gen_acts):
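+        """Compute corpus-level sacreBLEU and SER for the generated utterances."""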
         bleu_metric = load_metric("sacrebleu")
-        labels = [[utt] for utt in self.r["golden_utts"]]
-
-        bleu_score = bleu_metric.compute(predictions=self.r["gen_utts"],
+        labels = [[utt] for utt in golden_utts]
+        bleu_score = bleu_metric.compute(predictions=gen_utts,
                                          references=labels,
                                          force=True)
-
-        nlg_eval["metrics"]["bleu"] = bleu_score
-
-        # else:
         missing, hallucinate, total, hallucination_dialogs, missing_dialogs = fine_SER(
-            self.r["gen_acts"], self.r["gen_utts"])
+            gen_acts, gen_utts)
 
-        # print("{} Missing acts: {}, Total acts: {}, Hallucinations {}, SER {}".format(
-        #     "EmoUSNLG", missing, total, hallucinate, missing/total))
-        nlg_eval["metrics"]["SER"] = missing/total
-
-        print("=== Natural language generation ===")
-        print("Sacre-BLEU", nlg_eval["metrics"]["bleu"]["score"])
-        print("SER", nlg_eval["metrics"]["SER"])
-
-        dir_name = self.model_checkpoint
-        json.dump(nlg_eval,
-                  open(os.path.join(
-                      dir_name, f"{self.time}-nlg_eval.json"), 'w'),
-                  indent=2)
-        return os.path.join(dir_name, f"{self.time}-nlg_eval.json")
+        return {"bleu": bleu_score["score"], "SER": missing/total}
 
     @staticmethod
     def _intent_domain(action):
@@ -212,37 +192,21 @@ class Evaluator:
                 acts.append([intent, domain])
         return acts
 
-    def evaluation(self, generated_file, golden_emotion=False, golden_action=False):
-        gen_file = json.load(open(generated_file))
-        self.read_generated_result(generated_file)
-
-        if golden_action:
-            print("golden_action, skip semantic evaluation")
-            return
-
-        elif golden_emotion:
-            print("golden_emotion, skip emotion evaluation")
-            gen_acts, golden_acts = [], []
-            for dialog in gen_file['dialog']:
-                gen_acts.append(dialog["gen_acts"])
-                golden_acts.append(dialog["golden_acts"])
-            dialog_result = gen_file['dialog']
+    def dialog_result(self, dialog):
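+        """Collect generated and golden acts/emotions from a list of dialog entries."""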
+        x = {"gen_acts": [], "golden_acts": [],
+             "gen_emotions": [], "golden_emotions": []}
 
-        else:
-            gen_acts, golden_acts = [], []
-            gen_emotions, golden_emotions = [], []
-            for dialog in gen_file['dialog']:
-                gen_acts.append(dialog["gen_acts"])
-                golden_acts.append(dialog["golden_acts"])
-                gen_emotions.append(dialog["gen_emotion"])
-                golden_emotions.append(dialog["golden_emotion"])
-            dialog_result = gen_file['dialog']
+        for d in dialog:
+            x["gen_acts"].append(d["gen_acts"])
+            x["golden_acts"].append(d["golden_acts"])
+            x["gen_emotions"].append(d["gen_emotion"])
+            x["golden_emotions"].append(d["golden_emotion"])
+        return x
 
+    def semantic_evaluation(self, x):
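+        """Average precision, recall, F1, and turn accuracy over full actions
+        and intent-domain pairs."""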
         scores = {"full action": {"precision": [], "recall": [], "f1": [], "turn_acc": []},
                   "intent-domain": {"precision": [], "recall": [], "f1": [], "turn_acc": []}}
-
-        # full action
-        for gen_act, golden_act in zip(gen_acts, golden_acts):
+        for gen_act, golden_act in zip(x["gen_acts"], x["golden_acts"]):
             s = f1_measure(preds=gen_act, labels=golden_act)
             for metric in scores["full action"]:
                 scores["full action"][metric].append(s[metric])
@@ -252,59 +216,75 @@ class Evaluator:
                 scores["intent-domain"][metric].append(s[metric])
 
         result = {}
-        result["emotion_weight"] = self.emotion_weight
-        print("=== Semantic evaluation ===")
         for metric_type, score in scores.items():
             result[metric_type] = {}
-            print(f"> {metric_type}")
             for m, s in score.items():
                 result[metric_type][m] = sum(s)/len(s)
-                print(f"{m}: {result[metric_type][m]}")
-            print("")
-
-        if not golden_emotion:
-            emo_score = emotion_score(
-                golden_emotions,
-                gen_emotions,
-                self.model_checkpoint,
-                time=self.time,
-                no_neutral=False)
-            result["emotion"] = {"macro_f1": emo_score["macro_f1"],
-                                 "sep_f1": emo_score["sep_f1"]}
+        return result
+
+    def evaluation(self, input_file="", generated_file="", golden_emotion=False, golden_action=False):
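+        """Run NLG, semantic action, and emotion/sentiment evaluation.
+
+        All scores are collected in self.evaluation_result.
+        """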
+        if input_file:
+            print("Force generation")
+            # generate_results saves the generations and returns the file path
+            generated_file = self.generate_results(
+                input_file, golden_emotion, golden_action)
+        elif generated_file:
+            self.read_generated_result(generated_file)
+        else:
+            print("You must specify either input_file or generated_file")
+            return
+
+        gen_file = json.load(open(generated_file))
+
+        r = self.nlg_evaluation(
+            self.r["golden_utts"], self.r["gen_utts"], self.r["gen_acts"])
+        for metric, score in r.items():
+            self.evaluation_result["natural language generation"][metric] = score
+        x = self.dialog_result(gen_file['dialog'])
+
+        if not golden_action:
+            r = self.semantic_evaluation(x)
+            for metric, score in r.items():
+                self.evaluation_result["semantic action prediction"][metric] = score
+
+        if not golden_emotion and not golden_action:
+            r = emotion_score(x["golden_emotions"],
+                              x["gen_emotions"],
+                              self.model_checkpoint)
+            self.evaluation_result["emotion prediction"]["emotion"] = {}
+            self.evaluation_result["emotion prediction"]["emotion"]["macro_f1"] = r["macro_f1"]
+            self.evaluation_result["emotion prediction"]["emotion"]["sep_f1"] = {
+                emo: f1 for emo, f1 in zip(r["label"], r["sep_f1"])}
+
             if self.use_sentiment:
-                sent_score = sentiment_score(
-                    self.r["golden_sentiment"],
-                    self.r["gen_sentiment"],
-                    self.model_checkpoint,
-                    time=self.time)
+                golden_sentiment = self.r["golden_sentiment"]
+                gen_sentiment = self.r["gen_sentiment"]
             else:
                 # map emotions to sentiment if the model does not generate sentiment
                 golden_sentiment = [self.emo2sent[emo]
-                                    for emo in golden_emotions]
-                gen_sentiment = [self.emo2sent[emo] for emo in gen_emotions]
-                sent_score = sentiment_score(
-                    golden_sentiment,
-                    gen_sentiment,
-                    self.model_checkpoint,
-                    time=self.time)
-            result["sentiment"] = {"macro_f1": sent_score["macro_f1"],
-                                   "sep_f1": sent_score["sep_f1"]}
-
-        # for metric in emo_score:
-        #     result[metric] = emo_score[metric]
-        #     print(f"{metric}: {result[metric]}")
-
-        result["dialog"] = dialog_result
-
-        basename = "semantic_evaluation_result"
-        json.dump(
-            result,
-            open(os.path.join(self.model_checkpoint,
-                              f"{self.time}-{self.dataset}-{basename}.json"), 'w'),
-            indent=2)
-
-
-def emotion_score(golden_emotions, gen_emotions, dirname=".", time="", no_neutral=False):
+                                    for emo in self.r["golden_emotions"]]
+                gen_sentiment = [self.emo2sent[emo]
+                                 for emo in self.r["gen_emotions"]]
+            r = sentiment_score(
+                golden_sentiment,
+                gen_sentiment,
+                self.model_checkpoint)
+
+            self.evaluation_result["emotion prediction"]["sentiment"] = {}
+            self.evaluation_result["emotion prediction"]["sentiment"]["macro_f1"] = r["macro_f1"]
+            self.evaluation_result["emotion prediction"]["sentiment"]["sep_f1"] = {
+                emo: f1 for emo, f1 in zip(r["label"], r["sep_f1"])}
+
+        print(self.evaluation_result)
+
+    # TODO: add save_results() to dump self.evaluation_result to a file
+    # TODO: add print_result() to pretty-print the natural language generation
+    #       scores (sacreBLEU, SER) together with the semantic and emotion metrics
+
+
+def emotion_score(golden_emotions, gen_emotions, dirname=".", no_neutral=False):
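+    """Return macro and per-label F1 and the confusion matrix for emotions;
+    a confusion-matrix plot is saved to dirname."""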
     labels = ["Neutral", "Fearful", "Dissatisfied",
               "Apologetic", "Abusive", "Excited", "Satisfied"]
     if no_neutral:
@@ -318,19 +298,15 @@ def emotion_score(golden_emotions, gen_emotions, dirname=".", time="", no_neutra
     disp = metrics.ConfusionMatrixDisplay(
         confusion_matrix=cm, display_labels=labels)
     disp.plot()
-    plt.savefig(os.path.join(dirname, f"{time}-emotion.png"))
-    r = {"macro_f1": float(macro_f1), "sep_f1": list(
-        sep_f1), "cm": [list(c) for c in list(cm)]}
-    print("=== emotion score ===")
-    print("emotions:", labels)
-    print("macro_f1:", r["macro_f1"])
-    print("sep_f1:")
-    for i, l in enumerate(labels):
-        print(f"{l}: {r['sep_f1'][i]}")
+    plt.savefig(os.path.join(dirname, "emotion.png"))
+    r = {"label": labels,
+         "macro_f1": float(macro_f1),
+         "sep_f1": list(sep_f1),
+         "cm": [list(c) for c in list(cm)]}
     return r
 
 
-def sentiment_score(golden_sentiment, gen_sentiment, dirname=".", time=""):
+def sentiment_score(golden_sentiment, gen_sentiment, dirname="."):
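+    """Return macro and per-label F1 and the confusion matrix for sentiments;
+    a confusion-matrix plot is saved to dirname."""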
     labels = ["Neutral", "Negative", "Positive"]
 
     macro_f1 = metrics.f1_score(
@@ -342,15 +318,11 @@ def sentiment_score(golden_sentiment, gen_sentiment, dirname=".", time=""):
     disp = metrics.ConfusionMatrixDisplay(
         confusion_matrix=cm, display_labels=labels)
     disp.plot()
-    plt.savefig(os.path.join(dirname, f"{time}-sentiment.png"))
-    r = {"macro_f1": float(macro_f1), "sep_f1": list(
-        sep_f1), "cm": [list(c) for c in list(cm)]}
-    print("=== sentiment score ===")
-    print("sentiments:", labels)
-    print("macro_f1:", r["macro_f1"])
-    print("sep_f1:")
-    for i, l in enumerate(labels):
-        print(f"{l}: {r['sep_f1'][i]}")
+    plt.savefig(os.path.join(dirname, "sentiment.png"))
+    r = {"label": labels,
+         "macro_f1": float(macro_f1),
+         "sep_f1": list(sep_f1),
+         "cm": [list(c) for c in list(cm)]}
     return r
 
 
@@ -385,16 +357,8 @@ def main():
     print("generated_file", args.generated_file)
     print("input_file", args.input_file)
     with torch.no_grad():
-        if args.generated_file:
-            generated_file = args.generated_file
-        else:
-            nlg_result = eval.nlg_evaluation(input_file=args.input_file,
-                                             generated_file=args.generated_file,
-                                             golden_emotion=args.golden_emotion,
-                                             golden_action=args.golden_action)
-
-            generated_file = nlg_result
-        eval.evaluation(generated_file,
+        eval.evaluation(input_file=args.input_file,
+                        generated_file=args.generated_file,
                         golden_emotion=args.golden_emotion,
                         golden_action=args.golden_action)
 
-- 
GitLab