Commit 0a33885a authored by Hsien-Chin Lin

clean the print format

parent bb9921c4
@@ -175,27 +175,27 @@ class Evaluator:
         nlg_eval["dialog"] = self._transform_result()
         # if golden_action:
-        print("Calculate BLEU")
+        # print("Calculate BLEU")
         bleu_metric = load_metric("sacrebleu")
         labels = [[utt] for utt in self.r["golden_utts"]]
         bleu_score = bleu_metric.compute(predictions=self.r["gen_utts"],
                                          references=labels,
                                          force=True)
-        print("bleu_metric", bleu_score)
         nlg_eval["metrics"]["bleu"] = bleu_score
         # else:
-        print("Calculate SER")
         missing, hallucinate, total, hallucination_dialogs, missing_dialogs = fine_SER(
             self.r["gen_acts"], self.r["gen_utts"])
-        print("{} Missing acts: {}, Total acts: {}, Hallucinations {}, SER {}".format(
-            "EmoUSNLG", missing, total, hallucinate, missing/total))
-        print(nlg_eval["metrics"])
+        # print("{} Missing acts: {}, Total acts: {}, Hallucinations {}, SER {}".format(
+        #     "EmoUSNLG", missing, total, hallucinate, missing/total))
         nlg_eval["metrics"]["SER"] = missing/total
         # TODO emotion metric
+        print("=== Natural language generation ===")
+        print("Sacre-BLEU", nlg_eval["metrics"]["bleu"]["score"])
+        print("SER", nlg_eval["metrics"]["SER"])
         dir_name = self.model_checkpoint
         json.dump(nlg_eval,
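
For context, the summary printed in this hunk can be reproduced standalone. A minimal sketch, assuming Hugging Face datasets' load_metric (as used in this file) and a fine_SER helper with the return signature seen above; the function name and wiring are hypothetical:

# Hypothetical standalone helper mirroring the metric computation above.
from datasets import load_metric

def nlg_metrics(gen_utts, golden_utts, gen_acts, fine_SER):
    bleu_metric = load_metric("sacrebleu")
    # sacrebleu expects one list of references per prediction
    references = [[utt] for utt in golden_utts]
    bleu = bleu_metric.compute(predictions=gen_utts, references=references)
    # SER is the share of acts the generated utterance failed to realise
    missing, hallucinate, total, _, _ = fine_SER(gen_acts, gen_utts)
    return {"bleu": bleu["score"], "SER": missing / total}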
@@ -213,7 +213,6 @@ class Evaluator:
         return acts
 
     def evaluation(self, generated_file, golden_emotion=False, golden_action=False):
-        # TODO add emotion
         gen_file = json.load(open(generated_file))
         self.read_generated_result(generated_file)
@@ -239,26 +238,29 @@ class Evaluator:
                 golden_emotions.append(dialog["golden_emotion"])
 
         dialog_result = gen_file['dialog']
 
-        scores = {"complete": {"precision": [], "recall": [], "f1": [], "turn_acc": []},
-                  "intent_domain": {"precision": [], "recall": [], "f1": [], "turn_acc": []}}
+        scores = {"full action": {"precision": [], "recall": [], "f1": [], "turn_acc": []},
+                  "intent-domain": {"precision": [], "recall": [], "f1": [], "turn_acc": []}}
 
         # full action
         for gen_act, golden_act in zip(gen_acts, golden_acts):
             s = f1_measure(preds=gen_act, labels=golden_act)
-            for metric in scores["complete"]:
-                scores["complete"][metric].append(s[metric])
+            for metric in scores["full action"]:
+                scores["full action"][metric].append(s[metric])
             s = f1_measure(preds=self._intent_domain(gen_act),
                            labels=self._intent_domain(golden_act))
-            for metric in scores["intent_domain"]:
-                scores["intent_domain"][metric].append(s[metric])
+            for metric in scores["intent-domain"]:
+                scores["intent-domain"][metric].append(s[metric])
 
         result = {}
         result["emotion_weight"] = self.emotion_weight
+        print("=== Semantic evaluation ===")
         for metric_type, score in scores.items():
             result[metric_type] = {}
+            print(f"> {metric_type}")
             for m, s in score.items():
                 result[metric_type][m] = sum(s)/len(s)
-                print(f"{metric_type}-{m}: {result[metric_type][m]}")
+                print(f"{m}: {result[metric_type][m]}")
+            print("")
 
         if not golden_emotion:
             emo_score = emotion_score(
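
For reference, f1_measure above scores one turn by comparing the predicted act list against the gold act list, and the intent-domain variant first projects each act down to its (intent, domain) pair. A minimal sketch of such a scorer; this is a hypothetical reimplementation, not necessarily the repo's exact helper:

def f1_measure(preds, labels):
    # true positives: predicted acts that also appear in the gold acts
    tp = sum(1 for act in preds if act in labels)
    precision = tp / len(preds) if preds else 0.0
    recall = tp / len(labels) if labels else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    # turn_acc: the whole turn counts as correct only on an exact match
    return {"precision": precision, "recall": recall, "f1": f1,
            "turn_acc": float(sorted(preds) == sorted(labels))}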
@@ -307,7 +309,7 @@ def emotion_score(golden_emotions, gen_emotions, dirname=".", time="", no_neutral
               "Apologetic", "Abusive", "Excited", "Satisfied"]
     if no_neutral:
         labels = labels[1:]
-    print(labels)
     macro_f1 = metrics.f1_score(golden_emotions, gen_emotions, average="macro")
     sep_f1 = metrics.f1_score(
         golden_emotions, gen_emotions, average=None, labels=labels)
@@ -319,14 +321,18 @@ def emotion_score(golden_emotions, gen_emotions, dirname=".", time="", no_neutral
     plt.savefig(os.path.join(dirname, f"{time}-emotion.png"))
 
     r = {"macro_f1": float(macro_f1), "sep_f1": list(
         sep_f1), "cm": [list(c) for c in list(cm)]}
-    print(r)
+    print("=== emotion score ===")
+    print("emotions:", labels)
+    print("macro_f1:", r["macro_f1"])
+    print("sep_f1:")
+    for i, l in enumerate(labels):
+        print(f"{l}: {r['sep_f1'][i]}")
 
     return r
 
 def sentiment_score(golden_sentiment, gen_sentiment, dirname=".", time=""):
     labels = ["Neutral", "Negative", "Positive"]
-    print(labels)
     macro_f1 = metrics.f1_score(
         golden_sentiment, gen_sentiment, average="macro")
     sep_f1 = metrics.f1_score(
@@ -339,7 +345,12 @@ def sentiment_score(golden_sentiment, gen_sentiment, dirname=".", time=""):
     plt.savefig(os.path.join(dirname, f"{time}-sentiment.png"))
 
     r = {"macro_f1": float(macro_f1), "sep_f1": list(
         sep_f1), "cm": [list(c) for c in list(cm)]}
-    print(r)
+    print("=== sentiment score ===")
+    print("sentiments:", labels)
+    print("macro_f1:", r["macro_f1"])
+    print("sep_f1:")
+    for i, l in enumerate(labels):
+        print(f"{l}: {r['sep_f1'][i]}")
 
     return r
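
Both scorers lean on scikit-learn's metrics module. A compact usage sketch with hypothetical predictions; the confusion-matrix call is an assumption inferred from the cm variable used above:

from sklearn import metrics

golden = ["Neutral", "Positive", "Negative", "Neutral"]  # hypothetical gold labels
gen = ["Neutral", "Neutral", "Negative", "Neutral"]      # hypothetical predictions
labels = ["Neutral", "Negative", "Positive"]

macro_f1 = metrics.f1_score(golden, gen, average="macro")            # one summary score
sep_f1 = metrics.f1_score(golden, gen, average=None, labels=labels)  # one score per label
cm = metrics.confusion_matrix(golden, gen, labels=labels)            # rows: gold, columns: predicted

for label, score in zip(labels, sep_f1):
    print(f"{label}: {score}")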
@@ -143,14 +143,6 @@ class DataBuilder(GenTUSDataBuilder):
         return json.dumps(out_str)
 
-"""
-TODO
-1. add sentiment in the output str
-2. check exciting/fearful in user goal (domain)
-3. add impolite (user info?)
-"""
 
 if __name__ == "__main__":
     args = arg_parser()