Commit f48c40eb authored by Hsien-Chin Lin

wip

parent 7fc4ea51
+import json
+import os
 from argparse import ArgumentParser
+import matplotlib.pyplot as plt
 import numpy as np
-import json
 import pandas as pd
-import matplotlib.pyplot as plt
+
+result_dir = "convlab/policy/emoTUS/result"

 def arg_parser():
@@ -54,51 +58,69 @@ def get_turn_emotion(conversation):
             insert_turn(turn_info[metric], turn, emotion)
         else:
             insert_turn(turn_info[f"Not {metric}"], turn, emotion)
-    data = {'x': [t for t in range(turn)], 'all_positive': [
-    ], 'all_negative': [], 'all_mean': []}
+    print("MAX_TURN", max_turn)
+    data = {'x': [t for t in range(max_turn)],
+            'all_positive': [],
+            'all_negative': [],
+            'all_mean': [],
+            'all_std': []}
     for metric in ["Complete", "Success", "Success strict"]:
         data[f"{metric}_positive"] = []
         data[f"{metric}_negative"] = []
         data[f"{metric}_mean"] = []
+        data[f"{metric}_std"] = []
         data[f"Not {metric}_positive"] = []
         data[f"Not {metric}_negative"] = []
         data[f"Not {metric}_mean"] = []
+        data[f"Not {metric}_std"] = []
     for t in range(turn):
-        pos, neg, mean = turn_score(turn_info["all"][t])
+        pos, neg, mean, std = turn_score(turn_info["all"][t])
         data[f"all_positive"].append(pos)
         data[f"all_negative"].append(neg)
         data[f"all_mean"].append(mean)
+        data[f"all_std"].append(std)
         for raw_metric in ["Complete", "Success", "Success strict"]:
             for metric in [raw_metric, f"Not {raw_metric}"]:
                 if t not in turn_info[metric]:
                     data[f"{metric}_positive"].append(0)
                     data[f"{metric}_negative"].append(0)
                     data[f"{metric}_mean"].append(0)
+                    data[f"{metric}_std"].append(0)
                 else:
-                    pos, neg, mean = turn_score(turn_info[metric][t])
+                    pos, neg, mean, std = turn_score(turn_info[metric][t])
                     data[f"{metric}_positive"].append(pos)
                     data[f"{metric}_negative"].append(neg)
                     data[f"{metric}_mean"].append(mean)
+                    data[f"{metric}_std"].append(std)
+    for x in data:
+        data[x] = np.array(data[x])
     fig, ax = plt.subplots()
-    ax.plot(data['x'], data["Complete_mean"],
-            'o--', color='C0', label="Complete")
-    ax.fill_between(data['x'], data["Complete_positive"],
-                    data["Complete_negative"], color='C0', alpha=0.2)
-    ax.plot(data['x'], data["Not Complete_mean"],
-            'o--', color='C1', label="Not Complete")
-    ax.fill_between(data['x'], data["Not Complete_positive"],
-                    data["Not Complete_negative"], color='C1', alpha=0.2)
-    ax.plot(data['x'], data["all_mean"], 'o--', color='C2',
-            label="All")
-    ax.fill_between(data['x'], data["all_positive"],
-                    data["all_negative"], color='C2', alpha=0.2)
+    p = {"Complete": {"color": "C0", "label": "Success"},
+         "Not Complete": {"color": "C1", "label": "Fail"},
+         "all": {"color": "C2", "label": "all"}}
+    for name, para in p.items():
+        ax.plot(data['x'],
+                data[f"{name}_mean"],
+                'o--',
+                color=para["color"],
+                label=para["label"])
+        ax.fill_between(data['x'],
+                        data[f"{name}_mean"]+data[f"{name}_std"],
+                        data[f"{name}_mean"]-data[f"{name}_std"],
+                        color=para["color"], alpha=0.2)
     ax.legend()
     ax.set_xlabel("turn")
     ax.set_ylabel("Sentiment")
+    ax.set_xticks([t for t in range(0, max_turn, 2)])
+    plt.grid(axis='x', color='0.95')
+    plt.grid(axis='y', color='0.95')
     # plt.show()
-    plt.savefig("convlab/policy/emoTUS/fig.png")
+    plt.savefig(os.path.join(result_dir, "turn2emotion.png"))

 def turn_score(score_list):
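A minimal, self-contained sketch of the mean-plus-minus-band idiom the refactored plotting loop uses; the data here is made up for illustration and is not repo code.

# Sketch of the shaded-band pattern from the hunk above (dummy data).
import matplotlib.pyplot as plt
import numpy as np

x = np.arange(10)                 # turn index
mean = np.sin(x / 3)              # per-turn mean sentiment (dummy)
band = np.full(x.shape, 0.1)      # per-turn spread (dummy)

fig, ax = plt.subplots()
ax.plot(x, mean, 'o--', color="C0", label="Success")
ax.fill_between(x, mean + band, mean - band, color="C0", alpha=0.2)
ax.legend()
plt.savefig("band.png")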
@@ -110,7 +132,7 @@ def turn_score(score_list):
             positive += 1
         if s < 0:
             negative += -1
-    return positive/count, negative/count, np.mean(score_list)
+    return positive/count, negative/count, np.mean(score_list), np.std(score_list, ddof=1)/np.sqrt(len(score_list))

 def insert_turn(turn_info, turn, emotion):
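Note that the added fourth return value is the standard error of the mean (sample standard deviation over sqrt(n)), not the raw standard deviation, so the shaded regions above are mean +/- SEM bands. A quick check with dummy scores:

import numpy as np

scores = [1, 0, -1, 1, 0]  # dummy per-turn sentiment scores
sem = np.std(scores, ddof=1) / np.sqrt(len(scores))
# Caveat: a single-element list yields nan here, since ddof=1
# leaves zero degrees of freedom.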
@@ -253,13 +275,15 @@ def dict2csv(data):
         r[act] = temp
     dataframe = pd.DataFrame.from_dict(
         r, orient='index', columns=[emo for emo in emotion]+["count"])
-    dataframe.to_csv(open("convlab/policy/emoTUS/act2emotion.csv", 'w'))
+    dataframe.to_csv(open(os.path.join(result_dir, "act2emotion.csv"), 'w'))

 def main():
     args = arg_parser()
     result = {}
-    conversation = json.load(open(args.file))
+    if not os.path.exists(result_dir):
+        os.makedirs(result_dir)
+    conversation = json.load(open(args.file))["conversation"]
     basic_info = basic_analysis(conversation)
     result["basic_info"] = basic_info
     print(basic_info)
@@ -267,7 +291,7 @@ def main():
     print(advance_info)
     result["advance_info"] = advance_info
     json.dump(result, open(
-        "convlab/policy/emoTUS/conversation_result.json", 'w'), indent=2)
+        os.path.join("conversation_result.json"), 'w'), indent=2)
     dict2csv(advance_info)
     get_turn_emotion(conversation)
...
@@ -240,7 +240,6 @@ class UserActionPolicy(GenTUSUserActionPolicy):
         model_input = self.vector.encode(raw_inputs, self.max_in_len)
         responses = {}
         if emotion:
-            print("if emotion")
             emotion_list = [emotion]
         else:
             emotion_list = self.emotion_list
@@ -285,6 +284,8 @@ class UserActionPolicy(GenTUSUserActionPolicy):
         self.seq = torch.zeros(1, self.max_out_len, device=self.device).long()
         pos = self._update_seq([0], 0)
         pos = self._update_seq(self.token_map.get_id('start_json'), pos)
+        pos = self._update_seq(
+            self.token_map.get_id('start_emotion'), pos)
         pos = self._update_seq(self.kg._get_token_id(emotion), pos)
         pos = self._update_seq(self.token_map.get_id('sep_token'), pos)
         pos = self._update_seq(self.token_map.get_id('start_act'), pos)
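For orientation, a rough sketch of the decoder prefix being force-decoded above; the token surface forms are assumptions, only the ordering follows the diff:

# Hypothetical token surface forms; the ordering mirrors the hunk above.
prefix = ["<s>",             # position 0
          "start_json",      # token_map.get_id('start_json')
          "start_emotion",   # inserted by this commit
          "<emotion>",       # kg._get_token_id(emotion)
          "sep_token",
          "start_act"]       # action tokens are generated from here
print(" ".join(prefix))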
...
@@ -41,7 +41,7 @@ class Evaluator:
         self.time = f"{datetime.now().strftime('%y-%m-%d-%H-%M')}"
         self.use_sentiment = kwargs.get("use_sentiment", False)
-        self.add_persona = kwargs.get("add_persona", False)
+        self.add_persona = kwargs.get("add_persona", True)
         self.emotion_mid = kwargs.get("emotion_mid", False)
         weight = kwargs.get("weight", None)
         self.sample = kwargs.get("sample", False)
...
@@ -29,8 +29,10 @@ def arg_parser():
     parser.add_argument("--generated-file", type=str, help="the generated results",
                         default="")
     parser.add_argument("--dataset", default="multiwoz")
-    parser.add_argument("--do-golden-nlg", action="store_true",
-                        help="do golden nlg generation")
+    parser.add_argument("--golden-emotion", action="store_true",
+                        help="golden emotion -> action + utt")
+    parser.add_argument("--golden-action", action="store_true",
+                        help="golden emotion + action -> utt")
     parser.add_argument("--use-sentiment", action="store_true")
     parser.add_argument("--emotion-mid", action="store_true")
     parser.add_argument("--weight", type=float, default=None)
@@ -83,23 +85,29 @@ class Evaluator:
             for x in self.r:
                 self.r[x].append(temp[x])

-    def generate_results(self, f_eval, golden=False):
+    def generate_results(self, f_eval, golden_emotion=False, golden_action=False):
         emotion_mode = "normal"
         in_file = json.load(open(f_eval))
+        mode = "max"
+        if self.sample:
+            mode = "sample"

         for dialog in tqdm(in_file['dialog']):
             inputs = dialog["in"]
             labels = self.usr._parse_output(dialog["out"])
-            if golden:
+            if golden_action:
                 usr_act = labels["action"]
+                usr_emo = labels["emotion"]
                 usr_utt = self.usr.generate_text_from_give_semantic(
                     inputs, labels["action"], labels["emotion"])
+            elif golden_emotion:
+                usr_emo = labels["emotion"]
+                output = self.usr.generate_from_emotion(
+                    inputs, emotion=usr_emo, mode=mode)
+                output = self.usr._parse_output(output[usr_emo])
+                usr_act = self.usr._remove_illegal_action(output["action"])
+                usr_utt = output["text"]
             else:
-                mode = "max"
-                if self.sample:
-                    mode = "sample"
                 output = self.usr._parse_output(
                     self.usr._generate_action(inputs, mode=mode, emotion_mode=emotion_mode))
                 usr_emo = output["emotion"]
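A condensed, illustrative view of the three generation settings this method now supports (not repo code; the branch order mirrors generate_results above):

def pick_mode(golden_action: bool, golden_emotion: bool) -> str:
    # Mirrors the if/elif/else branch order in the hunk above.
    if golden_action:
        return "gold emotion + action given -> generate utterance only"
    if golden_emotion:
        return "gold emotion given -> generate action + utterance"
    return "generate emotion + action + utterance"

print(pick_mode(False, True))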
@@ -139,10 +147,10 @@ class Evaluator:
                 result.append(temp)
         return result

-    def nlg_evaluation(self, input_file=None, generated_file=None, golden=False):
+    def nlg_evaluation(self, input_file=None, generated_file=None, golden_emotion=False, golden_action=False):
         if input_file:
             print("Force generation")
-            self.generate_results(input_file, golden)
+            self.generate_results(input_file, golden_emotion, golden_action)
         elif generated_file:
             self.read_generated_result(generated_file)
@@ -152,14 +160,19 @@ class Evaluator:
         if self.sample:
             mode = "sample"
-        nlg_eval = {
-            "golden": golden,
-            "mode": mode,
-            "metrics": {},
-            "dialog": self._transform_result()
-        }
+        nlg_eval = {}
+        if golden_action:
+            nlg_eval["golden"] = "golden_action"
+        elif golden_emotion:
+            nlg_eval["golden"] = "golden_emotion"
+        else:
+            nlg_eval["golden"] = False
+        nlg_eval["mode"] = mode
+        nlg_eval["metrics"] = {}
+        nlg_eval["dialog"] = self._transform_result()

-        if golden:
+        # if golden_action:
         print("Calculate BLEU")
         bleu_metric = load_metric("sacrebleu")
         labels = [[utt] for utt in self.r["golden_utts"]]
@@ -170,13 +183,14 @@ class Evaluator:
         print("bleu_metric", bleu_score)
         nlg_eval["metrics"]["bleu"] = bleu_score

-        else:
+        # else:
         print("Calculate SER")
         missing, hallucinate, total, hallucination_dialogs, missing_dialogs = fine_SER(
             self.r["gen_acts"], self.r["gen_utts"])
         print("{} Missing acts: {}, Total acts: {}, Hallucinations {}, SER {}".format(
-            "genTUSNLG", missing, total, hallucinate, missing/total))
+            "EmoUSNLG", missing, total, hallucinate, missing/total))
+        print(nlg_eval["metrics"])
         nlg_eval["metrics"]["SER"] = missing/total

         # TODO emotion metric
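The "golden" field in the saved evaluation header is now a string tag rather than a boolean; an illustrative (not generated) example of the header fields:

# Example nlg_eval header; values are illustrative.
nlg_eval = {
    "golden": "golden_emotion",  # or "golden_action", or False
    "mode": "sample",            # "max" unless sampling is enabled
    "metrics": {},               # later filled with "bleu" and "SER"
    "dialog": [],
}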
@@ -188,42 +202,23 @@ class Evaluator:
             indent=2)
         return os.path.join(dir_name, f"{self.time}-nlg_eval.json")

-    def evaluation(self, input_file=None, generated_file=None):
+    def evaluation(self, generated_file, golden_emotion=False, golden_action=False):
         # TODO add emotion
-        force_prediction = True
-        if generated_file:
-            print("---> use generated file")
-            gen_file = json.load(open(generated_file))
-            force_prediction = False
-            if gen_file["golden"]:
-                force_prediction = True
-            self.read_generated_result(generated_file)
+        gen_file = json.load(open(generated_file))
+        self.read_generated_result(generated_file)

-        if force_prediction:
-            in_file = json.load(open(input_file))
-            dialog_result = []
-            gen_acts, golden_acts = [], []
-            # scores = {"precision": [], "recall": [], "f1": [], "turn_acc": []}
-            for dialog in tqdm(in_file['dialog']):
-                inputs = dialog["in"]
-                labels = self.usr._parse_output(dialog["out"])
-                ans_action = self.usr._remove_illegal_action(labels["action"])
-                preds = self.usr._generate_action(inputs)
-                preds = self.usr._parse_output(preds)
-                usr_action = self.usr._remove_illegal_action(preds["action"])
-                gen_acts.append(usr_action)
-                golden_acts.append(ans_action)
-                d = {"input": inputs,
-                     "golden_acts": ans_action,
-                     "gen_acts": usr_action}
-                if "text" in preds:
-                    d["golden_utts"] = labels["text"]
-                    d["gen_utts"] = preds["text"]
-                    # print("pred text", preds["text"])
-                dialog_result.append(d)
+        if golden_action:
+            print("golden_action, skip semantic evaluation")
+            return
+        elif golden_emotion:
+            print("golden_emotion, skip emotion evaluation")
+            gen_acts, golden_acts = [], []
+            for dialog in gen_file['dialog']:
+                gen_acts.append(dialog["gen_acts"])
+                golden_acts.append(dialog["golden_acts"])
+            dialog_result = gen_file['dialog']
         else:
             gen_acts, golden_acts = [], []
             gen_emotions, golden_emotions = [], []
@@ -246,12 +241,15 @@ class Evaluator:
             result[metric] = sum(scores[metric])/len(scores[metric])
             print(f"{metric}: {result[metric]}")

-        emo_score = emotion_score(
-            golden_emotions,
-            gen_emotions,
-            self.model_checkpoint,
-            time=self.time,
-            no_neutral=False)
+        if not golden_emotion:
+            emo_score = emotion_score(
+                golden_emotions,
+                gen_emotions,
+                self.model_checkpoint,
+                time=self.time,
+                no_neutral=False)
+            result["emotion"] = {"macro_f1": emo_score["macro_f1"],
+                                 "sep_f1": emo_score["sep_f1"]}
         if self.use_sentiment:
             sent_score = sentiment_score(
                 self.r["golden_sentiment"],
@@ -260,13 +258,16 @@ class Evaluator:
                 time=self.time)
         else:
             # transfer emotions to sentiment if the model do not generate sentiment
-            golden_sentiment = [self.emo2sent[emo] for emo in golden_emotions]
+            golden_sentiment = [self.emo2sent[emo]
+                                for emo in golden_emotions]
             gen_sentiment = [self.emo2sent[emo] for emo in gen_emotions]
             sent_score = sentiment_score(
                 golden_sentiment,
                 gen_sentiment,
                 self.model_checkpoint,
                 time=self.time)
+        result["sentiment"] = {"macro_f1": sent_score["macro_f1"],
+                               "sep_f1": sent_score["sep_f1"]}

         # for metric in emo_score:
         #     result[metric] = emo_score[metric]
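A sketch of the emotion-to-sentiment fallback used above; the mapping entries are assumed examples, not the repo's emo2sent table:

# Assumed example mapping; only the list-comprehension pattern is from the diff.
emo2sent = {"Neutral": "neutral",
            "Satisfied": "positive",
            "Dissatisfied": "negative"}
golden_emotions = ["Neutral", "Satisfied"]
golden_sentiment = [emo2sent[emo] for emo in golden_emotions]
print(golden_sentiment)  # ['neutral', 'positive']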
@@ -356,11 +357,13 @@ def main():
     else:
         nlg_result = eval.nlg_evaluation(input_file=args.input_file,
                                          generated_file=args.generated_file,
-                                         golden=args.do_golden_nlg)
+                                         golden_emotion=args.golden_emotion,
+                                         golden_action=args.golden_action)

         generated_file = nlg_result
-        eval.evaluation(args.input_file,
-                        generated_file)
+        eval.evaluation(generated_file,
+                        golden_emotion=args.golden_emotion,
+                        golden_action=args.golden_action)

 if __name__ == '__main__':
...