diff --git a/convlab2/base_models/gpt/keyword_extraction/gen_pretraining_data.sh b/convlab2/base_models/gpt/keyword_extraction/gen_pretraining_data.sh
index f24058ecfa63c40c9100f03f061d64b58946796f..c98060c6044359fabbb4ee9295f2e3e70df86eec 100644
--- a/convlab2/base_models/gpt/keyword_extraction/gen_pretraining_data.sh
+++ b/convlab2/base_models/gpt/keyword_extraction/gen_pretraining_data.sh
@@ -1,5 +1,5 @@
 task_name="key2gen_noisy"
-dataset_name="dailydialog+metalwoz+tm1+tm2+tm3"
+dataset_name="dailydialog+metalwoz+tm1+tm2+tm3+wikidialog"
 names=$(echo ${dataset_name} | tr "+" "\n")
 model_type="gpt"
 data_dir=data/${task_name}/${model_type}/${name}/${dataset_name}
diff --git a/convlab2/base_models/gpt/keyword_extraction/train_t5_key2gen.sh b/convlab2/base_models/gpt/keyword_extraction/train_t5_key2gen.sh
index 9a413f2dd91381cd0a05b13fdc79e5be588f8cc2..36e24587d3d7181e4a92b649e5f1b1e47cdcd9f3 100644
--- a/convlab2/base_models/gpt/keyword_extraction/train_t5_key2gen.sh
+++ b/convlab2/base_models/gpt/keyword_extraction/train_t5_key2gen.sh
@@ -2,7 +2,7 @@ set -e
 n_gpus=2
 master_port=23457
 task_name="key2gen_noisy"
-dataset_name="dailydialog+metalwoz+tm1+tm2+tm3"
+dataset_name="dailydialog+metalwoz+tm1+tm2+tm3+wikidialog"
 model_type="gpt"
 data_dir="data/${task_name}/${model_type}/${dataset_name}"
 output_dir="output/${task_name}/${model_type}/${dataset_name}"
diff --git a/convlab2/base_models/t5/key2gen/create_data_key2gen.py b/convlab2/base_models/t5/key2gen/create_data_key2gen.py
index edf4a130997f9d6ed0716f1e99577fb4621e8cb5..d2a688558617c3f6aee306e0d6895021482b888c 100644
--- a/convlab2/base_models/t5/key2gen/create_data_key2gen.py
+++ b/convlab2/base_models/t5/key2gen/create_data_key2gen.py
@@ -1,11 +1,13 @@
 import os
 import json
 from tqdm import tqdm
-import re
 from transformers import AutoTokenizer
-from convlab2.util import load_dataset, load_nlu_data, load_dst_data, load_policy_data, load_nlg_data, load_e2e_data, load_rg_data
-from convlab2.base_models.t5.nlu.serialization import serialize_dialogue_acts, deserialize_dialogue_acts, equal_da_seq
+from convlab2.util import load_dataset, load_unified_data, load_nlu_data
+from convlab2.base_models.t5.nlu.serialization import serialize_dialogue_acts
 from collections import Counter
+from nltk.tokenize import word_tokenize, sent_tokenize
+from nltk.corpus import stopwords
+from string import punctuation
 
 def create_nlg_data(dataset, data_dir, args):
     data_by_split = load_nlu_data(dataset, speaker=args.speaker, use_context=args.context_window_size>0, context_window_size=args.context_window_size)
@@ -17,30 +19,26 @@ def create_nlg_data(dataset, data_dir, args):
         data = []
         for sample in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False):
             if args.key2gen:
-                da_dict = {}
+                keywords = []
                 for da_type in sample['dialogue_acts']:
                     for da in sample['dialogue_acts'][da_type]:
                         intent, domain, slot, value = da['intent'], da['domain'], da['slot'], da.get('value', '')
                         intent_domain = f'{intent}-{domain}'
-                        da_dict.setdefault(intent_domain, [])
-                        da_dict[intent_domain].append((slot, value))
-                keywords = []
-                for intent_domain, slot_values in da_dict.items():
-                    keywords.append(intent_domain)
-                    for slot, value in slot_values:
+                        keywords.append([intent_domain])
                         if len(slot) > 0:
-                            keywords.append(slot)
+                            keywords[-1].append(slot)
                         if len(value) > 0:
-                            keywords.append(value)
-                dialogue_acts_seq = ' | '.join(keywords)
+                            keywords[-1].append(value)
+                dialogue_acts_seq = '| {} |'.format(' | '.join([' : '.join(da_keywords) for da_keywords in keywords]))
             else:
                 dialogue_acts_seq = serialize_dialogue_acts(sample['dialogue_acts'])
 
             if args.context_window_size>0:
                 context = '\n'.join([f"{turn['speaker']}: {turn['utterance']}" for turn in sample['context']]+[f'{sample["speaker"]}: '])
-                context = f'{dialogue_acts_seq}\n\ncontext: {context}'
+                context = f'generate a response: grounded knowledge: {dialogue_acts_seq} context:\n\n{context}'
             else:
-                context = f'{dialogue_acts_seq}\n\ncontext: {sample["speaker"]}: '
+                context = f'generate a response: grounded knowledge: {dialogue_acts_seq} context:\n\n{sample["speaker"]}: '
+
             data.append(json.dumps({'context+da': context, 'response': sample['utterance']}, ensure_ascii=False)+'\n')
 
         file_name = os.path.join(data_dir, f"{data_split}.json")
@@ -61,23 +59,21 @@ def create_dart_data(dataset, data_dir, args):
             triples = sample['tripleset']
             if args.key2gen:
                 keywords = [w for triple in triples for w in triple]
-                # TODO: try adding prompt
-                # entity_cnt = Counter()
-                # for triple in triples:
-                #     e1, r, e2 = triple
-                #     for e in [e1, e2]:
-                #         if e.startswith('[') and e.endswith(']'):
-                #             continue
-                #         entity_cnt[e] += 1
+                # DONE: tried adding a prompt; no improvement
+                entity_cnt = Counter()
+                for triple in triples:
+                    e1, r, e2 = triple
+                    for e in [e1, e2]:
+                        if e.startswith('[') and e.endswith(']'):
+                            continue
+                        entity_cnt[e] += 1
                         
-                # assert len(entity_cnt) > 0
-                # common_entity = entity_cnt.most_common(1)[0][0]
-                # context = f'{" | ".join(keywords)}\n\ncontext: user: tell me something about {common_entity}. system: '
-
-                context = f'{" | ".join(keywords)}\n\ncontext: system: '
+                assert len(entity_cnt) > 0
+                common_entity = entity_cnt.most_common(1)[0][0]
+                context = f'{" | ".join(keywords)}\n\ncontext: user: tell me something about {common_entity}. system: '
             else:
-                triples = [f"[{triple[0]}][{triple[1]}][{triple[2]}]" for triple in triples]
-                context = f'{";".join(triples)}\n\ncontext: system: '
+                triples = [' : '.join(triple) for triple in triples]
+                context = f'{" | ".join(triples)}\n\ncontext: system: '
 
             data.append(json.dumps({'triples': context, 'text': sample['utterance']}, ensure_ascii=False)+'\n')
 
@@ -87,6 +83,148 @@ def create_dart_data(dataset, data_dir, args):
         data_by_split[data_split] = data
     return data_by_split
 
+def create_commongen_data(dataset, data_dir, args):
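+    """Create seq2seq data for CommonGen: join the example's concepts with " | " as the source and use its reference sentence as the target."""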
+    data_by_split = dataset
+    os.makedirs(data_dir, exist_ok=True)
+
+    data_splits = data_by_split.keys()
+    for data_split in data_splits:
+        data = []
+        for sample in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False):
+            sample = sample['turns'][0]
+            concepts = sample['concepts']
+            context = f'{" | ".join(concepts)}\n\ncontext: system: '
+
+            data.append(json.dumps({'concepts': context, 'text': sample['utterance']}, ensure_ascii=False)+'\n')
+
+        file_name = os.path.join(data_dir, f"{data_split}.json")
+        with open(file_name, "w", encoding='utf-8') as f:
+            f.writelines(data)
+        data_by_split[data_split] = data
+    return data_by_split
+
+def create_kvret_data(dataset, data_dir, args):
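+    """Create seq2seq data for KVRET: serialize db results as "entity : key : value" triples and prepend them, with the full dialogue context, to a "generate a response" prompt."""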
+    data_by_split = load_unified_data(dataset, speaker='system', utterance=True, db_results=True, use_context=True, context_window_size=100)
+    os.makedirs(data_dir, exist_ok=True)
+
+    domain2entity_col = {'schedule': 'event', 'navigate': 'poi', 'weather': 'location'}
+    data_splits = data_by_split.keys()
+    for data_split in data_splits:
+        data = []
+        for sample in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False):
+            if len(sample['utterance']) == 0:
+                continue
+            db_results = sample['db_results']
+            db_seqs = []
+            for domain, db_items in db_results.items():
+                entity_col = domain2entity_col[domain]
+                for db_item in db_items:
+                    entity = db_item[entity_col]
+                    for db_key, db_value in db_item.items():
+                        if db_key == entity_col:
+                            continue
+                        db_seqs.append(' : '.join([entity, db_key, db_value]))
+            db_seq = ' |\n'.join(db_seqs)
+
+            context = '\n'.join([f"{turn['speaker']}: {turn['utterance']}" for turn in sample['context']]+[f'{sample["speaker"]}: '])
+            context = f'generate a response: all knowledge:\n\n| {db_seq} | context:\n\n{context}'
+            data.append(json.dumps({'context+db': context, 'response': sample['utterance']}, ensure_ascii=False)+'\n')
+
+        file_name = os.path.join(data_dir, f"{data_split}.json")
+        with open(file_name, "w", encoding='utf-8') as f:
+            f.writelines(data)
+        data_by_split[data_split] = data
+    return data_by_split
+
+def create_personachat_data(dataset, data_dir, args):
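+    """Create seq2seq data for PersonaChat: ground each system turn on the system persona, serialized as keyword phrases when --key2gen is set and as raw sentences otherwise."""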
+    data_by_split = dataset
+    os.makedirs(data_dir, exist_ok=True)
+
+    stop_words = set(stopwords.words('english')) | set(punctuation)
+    def sentence2keywords(sentence):
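+        """Keep the non-stopword, non-punctuation tokens of a sentence and merge adjacent tokens into keyword phrases."""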
+        index2keyword = {}
+        for i, w in enumerate(word_tokenize(sentence)):
+            if not w.lower() in stop_words:
+                index2keyword[i] = w
+        indexes = sorted(index2keyword.keys())
+        keywords = []
+        for i, index in enumerate(indexes):
+            if i > 0 and index == indexes[i-1] + 1:
+                keywords[-1] += ' ' + index2keyword[index]
+            else:
+                keywords.append(index2keyword[index])
+        return keywords
+
+    data_splits = data_by_split.keys()
+    for data_split in data_splits:
+        data = []
+        for dial in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False):
+            persona = dial['persona']['system']
+            if args.key2gen:
+                persona_seq = ' |\n'.join([' : '.join(sentence2keywords(s)) for s in persona])
+            else:
+                persona_seq = ' | '.join(persona)
+            context = []
+            for turn in dial['turns']:
+                if turn['speaker'] == 'system':
+                    context_seq = '\n'.join([f"{t['speaker']}: {t['utterance']}" for t in context]+[f'{turn["speaker"]}: '])
+                    context_seq = f'generate a response: all knowledge:\n\n| {persona_seq} | context:\n\n{context_seq}'
+                    data.append(json.dumps({'context+persona': context_seq, 'response': turn['utterance']}, ensure_ascii=False)+'\n')
+                context.append({'speaker': turn['speaker'], 'utterance': turn['utterance']})
+
+        file_name = os.path.join(data_dir, f"{data_split}.json")
+        with open(file_name, "w", encoding='utf-8') as f:
+            f.writelines(data)
+        data_by_split[data_split] = data
+    return data_by_split
+
+def create_wow_data(dataset, data_dir, args):
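+    """Create seq2seq data for Wizard of Wikipedia: ground each system turn on its checked sentence, serialized as keyword phrases when --key2gen is set."""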
+    data_by_split = dataset
+    os.makedirs(data_dir, exist_ok=True)
+
+    stop_words = set(stopwords.words('english')) | set(punctuation)
+    def sentence2keywords(sentence):
+        index2keyword = {}
+        for i, w in enumerate(word_tokenize(sentence)):
+            if not w.lower() in stop_words:
+                index2keyword[i] = w
+        indexes = sorted(index2keyword.keys())
+        keywords = []
+        for i, index in enumerate(indexes):
+            if i > 0 and index == indexes[i-1] + 1:
+                keywords[-1] += ' ' + index2keyword[index]
+            else:
+                keywords.append(index2keyword[index])
+        return keywords
+
+    def sentences2keywords_seq(sentences):
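+        """Sentence-tokenize a passage and serialize every sentence as " : "-joined keyword phrases, separating sentences with " |"."""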
+        return ' |\n'.join([' : '.join(sentence2keywords(sentence)) for sentence in sent_tokenize(sentences)])
+
+
+    data_splits = data_by_split.keys()
+    for data_split in data_splits:
+        data = []
+        for dial in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False):
+            context = []
+            for turn in dial['turns']:
+                if turn['speaker'] == 'system':
+                    if turn['checked_sentence']:
+                        if args.key2gen:
+                            know_seq = f" | {sentences2keywords_seq(turn['checked_sentence'])} |"
+                        else:
+                            know_seq = turn['checked_sentence']
+                    else:
+                        know_seq = ''
+                    context_seq = '\n'.join([f"{t['speaker']}: {t['utterance']}" for t in context]+[f'{turn["speaker"]}: '])
+                    context_seq = f'generate a response: grounded knowledge:\n\n{know_seq} context:\n\n{context_seq}'
+                    data.append(json.dumps({'context+knowledge': context_seq, 'response': turn['utterance']}, ensure_ascii=False)+'\n')
+                context.append({'speaker': turn['speaker'], 'utterance': turn['utterance']})
+
+        file_name = os.path.join(data_dir, f"{data_split}.json")
+        with open(file_name, "w", encoding='utf-8') as f:
+            f.writelines(data)
+        data_by_split[data_split] = data
+    return data_by_split
 
 def get_max_len(data_by_split, tokenizer):
     for data_split in data_by_split.keys():
@@ -104,7 +242,7 @@ def get_max_len(data_by_split, tokenizer):
 if __name__ == '__main__':
     from argparse import ArgumentParser
     parser = ArgumentParser(description="create data for seq2seq training")
-    parser.add_argument('--tasks', '-t', metavar='task_name', nargs='*', choices=['nlg', 'dart'], help='names of tasks')
+    parser.add_argument('--tasks', '-t', metavar='task_name', nargs='*', choices=['nlg', 'dart', 'commongen', 'kvret', 'personachat', 'wow'], help='names of tasks')
     parser.add_argument('--datasets', '-d', metavar='dataset_name', nargs='*', help='names of unified datasets')
     parser.add_argument('--speaker', '-s', type=str, choices=['user', 'system', 'all'], help='speaker(s)')
     parser.add_argument('--context_window_size', '-c', type=int, default=0, help='how many contextual utterances are considered')
diff --git a/convlab2/base_models/t5/key2gen/eval.ipynb b/convlab2/base_models/t5/key2gen/eval.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..51fcc5e0da1321ef740084d0a8b0241b5721a2fc
--- /dev/null
+++ b/convlab2/base_models/t5/key2gen/eval.ipynb
@@ -0,0 +1 @@
+{"cells":[{"cell_type":"code","execution_count":1,"metadata":{},"outputs":[],"source":["import json\n","import re"]},{"cell_type":"code","execution_count":2,"metadata":{},"outputs":[],"source":["def read_jsonline(path):\n","    return [json.loads(line) for line in open(path)]"]},{"cell_type":"code","execution_count":3,"metadata":{},"outputs":[],"source":["origin = read_jsonline('output/wow/wow/test_unseen.json')"]},{"cell_type":"code","execution_count":22,"metadata":{},"outputs":[],"source":["key2gen = read_jsonline('output/wow/key2gen_wow/test_unseen.json')"]},{"cell_type":"code","execution_count":23,"metadata":{},"outputs":[],"source":["with open('tmp_wow.txt', 'w') as f:\n","    for d1, d2 in zip(origin, key2gen):\n","        print(re.split('context:|grounded knowledge:', d1['context+knowledge'])[1].strip(), file=f)\n","        print(re.split('context:|grounded knowledge:', d2['context+knowledge'])[1].strip(), file=f)\n","        print(d1['context+knowledge'].split('context:')[1].replace('\\n\\n', '\\n'), file=f)\n","        print(file=f)\n","        print('target', d1['response'], file=f)\n","        print('origin', d1['predictions'], file=f)\n","        print('key2gen', d2['predictions'], file=f)\n","        print('='*100, file=f)"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["for ratio in [0.1, 0.01]:\n","    for order in [0, 1, 2]:\n","        origin = read_jsonline(f'output/personachat/key2gen_personachat_{ratio}_order{order}/generated_predictions.json')\n","        score = metric.compute(predictions=[d['predictions'] for d in origin], references=[d['response'] for d in origin])\n","        print(ratio, order)\n","        print(score)\n","        "]},{"cell_type":"code","execution_count":51,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["0.01 1\n","{'bleu-1': 24.322560358946276, 'bleu-2': 13.03630111937752, 'bleu-3': 7.43647978674912, 'bleu-4': 4.450365738541082, 'unigram f1': 0.20101056184593705, 'unigram f1 (non-stop words)': 0.09881569367818614, 'rouge1': 21.359332522961864, 'rouge2': 6.532120354812852, 'rougeL': 19.76437990594138}\n"]}],"source":["for ratio in [0.01]:\n","    for order in [1]:\n","        origin = read_jsonline(f'output/personachat/personachat/generated_predictions.json')\n","        score = metric.compute(predictions=[d['predictions'] for d in origin], references=[d['response'] for d in origin])\n","        print(ratio, order)\n","        print(score)\n","        "]},{"cell_type":"code","execution_count":4,"metadata":{},"outputs":[],"source":["from datasets import load_metric"]},{"cell_type":"code","execution_count":7,"metadata":{},"outputs":[],"source":["metric = load_metric('metric.py')"]},{"cell_type":"code","execution_count":58,"metadata":{},"outputs":[{"data":{"text/plain":["{'bleu-1': 47.9848465486215,\n"," 'bleu-2': 37.18000679532912,\n"," 'bleu-3': 29.346646172092814,\n"," 'bleu-4': 23.410526740211363,\n"," 'unigram f1': 0.4999850046010773,\n"," 'unigram f1 (non-stop words)': 0.5150265227462978,\n"," 'rouge1': 50.536642578692195,\n"," 'rouge2': 33.10681789367832,\n"," 'rougeL': 46.84702913163778,\n"," 'meteor': 0.4641962079490068}"]},"execution_count":58,"metadata":{},"output_type":"execute_result"}],"source":["metric.compute(predictions=[d['predictions'] for d in key2gen], references=[d['response'] for d in key2gen])"]},{"cell_type":"code","execution_count":8,"metadata":{},"outputs":[{"data":{"text/plain":["{'bleu-1': 37.570099942714585,\n"," 'bleu-2': 26.77393964962893,\n"," 'bleu-3': 21.115954644820572,\n"," 'bleu-4': 17.513316671216046,\n"," 'unigram f1': 0.3656930567072274,\n"," 'unigram f1 (non-stop words)': 0.36456219281235724,\n"," 'rouge1': 39.1982724920493,\n"," 'rouge2': 20.825159884632743,\n"," 'rougeL': 34.98278542180112,\n"," 'meteor': 0.3405671227693821,\n"," 'distinct-1': 0.07838670580160921,\n"," 'distinct-2': 0.29689084413659694}"]},"execution_count":8,"metadata":{},"output_type":"execute_result"}],"source":["metric.compute(predictions=[d['predictions'] for d in origin], references=[d['response'] for d in origin])"]},{"cell_type":"code","execution_count":34,"metadata":{},"outputs":[{"data":{"text/plain":["{'bleu-1': 47.9848465486215,\n"," 'bleu-2': 37.18000679532912,\n"," 'bleu-3': 29.346646172092814,\n"," 'bleu-4': 23.410526740211363,\n"," 'unigram f1': 0.4999850046010773,\n"," 'unigram f1 (non-stop words)': 0.5150265227462978,\n"," 'rouge1': AggregateScore(low=Score(precision=0.5301926525013549, recall=0.4821419251082986, fmeasure=0.48565655175230005), mid=Score(precision=0.5513392693168799, recall=0.50235850981064, fmeasure=0.5053664257869219), high=Score(precision=0.5760132731228504, recall=0.5268580272115051, fmeasure=0.5279111393835526)),\n"," 'rouge2': AggregateScore(low=Score(precision=0.34772127155901306, recall=0.30411953889228, fmeasure=0.31029658993105447), mid=Score(precision=0.3696898381097765, recall=0.32612705034192035, fmeasure=0.3310681789367832), high=Score(precision=0.3947745596965405, recall=0.34880792116864995, fmeasure=0.35356317521641434)),\n"," 'rougeL': AggregateScore(low=Score(precision=0.4874189522136045, recall=0.4413343070361347, fmeasure=0.4464463084888409), mid=Score(precision=0.5108530997712726, recall=0.4642203560120527, fmeasure=0.46847029131637785), high=Score(precision=0.5350154077389535, recall=0.4855131911095939, fmeasure=0.4899950876629784)),\n"," 'rougeLsum': AggregateScore(low=Score(precision=0.4871840444049138, recall=0.44081531444183386, fmeasure=0.44514075751478493), mid=Score(precision=0.5105975305923949, recall=0.4639265647317744, fmeasure=0.46779186414456864), high=Score(precision=0.5348015149575474, recall=0.48693312722760357, fmeasure=0.4918651382986408))}"]},"execution_count":34,"metadata":{},"output_type":"execute_result"}],"source":["metric.compute(predictions=[d['predictions'] for d in key2gen], references=[d['response'] for d in key2gen])"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":[]}],"metadata":{"interpreter":{"hash":"0f9333403d680bc010aa5ce5a2f27ba398c9e47e92ba3724506306aa234cd07d"},"kernelspec":{"display_name":"Python 3.8.12 ('py38')","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.8.12"},"orig_nbformat":4},"nbformat":4,"nbformat_minor":2}
diff --git a/convlab2/base_models/t5/key2gen/metric.py b/convlab2/base_models/t5/key2gen/metric.py
new file mode 100644
index 0000000000000000000000000000000000000000..050db04d120383bece5bfd5e9cb3362f66c8ecac
--- /dev/null
+++ b/convlab2/base_models/t5/key2gen/metric.py
@@ -0,0 +1,188 @@
+# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""NLG Metric"""
+
+import datasets
+from sacrebleu.metrics import BLEU
+from sacrebleu.utils import sum_of_lists
+import re
+from collections import Counter
+import numpy as np
+from nltk.corpus import stopwords
+from nltk import sent_tokenize
+from rouge_score import rouge_scorer, scoring
+from nltk.translate import meteor_score
+from datasets.config import importlib_metadata, version
+
+
+NLTK_VERSION = version.parse(importlib_metadata.version("nltk"))
+if NLTK_VERSION >= version.Version("3.6.5"):
+    from nltk import word_tokenize
+
+
+# TODO: Add BibTeX citation
+_CITATION = """\
+@inproceedings{post-2018-call,
+    title = "A Call for Clarity in Reporting {BLEU} Scores",
+    author = "Post, Matt",
+    booktitle = "Proceedings of the Third Conference on Machine Translation: Research Papers",
+    month = oct,
+    year = "2018",
+    address = "Belgium, Brussels",
+    publisher = "Association for Computational Linguistics",
+    url = "https://www.aclweb.org/anthology/W18-6319",
+    pages = "186--191",
+}
+"""
+
+_DESCRIPTION = """\
+Metric to evaluate text-to-text models on the natural language generation task.
+"""
+
+_KWARGS_DESCRIPTION = """
+Calculates corpus BLEU-1/2/3/4, unigram F1 (with and without stop words), ROUGE-1/2/L, METEOR, and distinct-1/2.
+Args:
+    predictions: list of predictions to score. Each prediction
+        should be a string.
+    references: list of references, one per prediction. Each
+        reference should be a string.
+Returns:
+    a dict mapping each metric name (e.g. "bleu-4", "unigram f1", "rougeL",
+    "meteor", "distinct-2") to its corpus-level score.
+Examples:
+
+    >>> nlg_metric = datasets.load_metric("metric.py")
+    >>> predictions = ["hello there general kenobi", "foo bar foobar"]
+    >>> references = ["hello there kenobi", "foo bar foobar"]
+    >>> results = nlg_metric.compute(predictions=predictions, references=references)
+"""
+
+
+@datasets.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
+class NLGMetrics(datasets.Metric):
+    """Metric to evaluate text-to-text models on the natural language generation task."""
+    def _info(self):
+        return datasets.MetricInfo(
+            description=_DESCRIPTION,
+            citation=_CITATION,
+            inputs_description=_KWARGS_DESCRIPTION,
+            # This defines the format of each prediction and reference
+            features=datasets.Features({
+                'predictions': datasets.Value('string'),
+                'references': datasets.Value('string'),
+            })
+        )
+
+    # def _download_and_prepare(self, dl_manager):
+    #     import nltk
+    #     nltk.download("wordnet")
+    #     if NLTK_VERSION >= version.Version("3.6.5"):
+    #         nltk.download("punkt")
+    #     if NLTK_VERSION >= version.Version("3.6.6"):
+    #         nltk.download("omw-1.4")
+
+    def _compute(self, predictions, references):
+        """Returns the scores: bleu"""
+        metrics = {}
+        # bleu
+        bleu = BLEU(lowercase=True, force=False, tokenize=BLEU.TOKENIZER_DEFAULT, smooth_method='exp', smooth_value=None, effective_order=False)
+        stats = sum_of_lists(bleu._extract_corpus_statistics(predictions, [references]))
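+        # stats layout: [sys_len, ref_len, correct_1..correct_4, total_1..total_4]; reuse the same counts to report BLEU-1..4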
+        for n in range(1,5):
+            metrics[f'bleu-{n}'] = bleu.compute_bleu(
+                correct=stats[2: 2 + bleu.max_ngram_order],
+                total=stats[2 + bleu.max_ngram_order:],
+                sys_len=int(stats[0]), ref_len=int(stats[1]),
+                smooth_method=bleu.smooth_method, smooth_value=bleu.smooth_value,
+                effective_order=bleu.effective_order,
+                max_ngram_order=n).score
+                
+        # unigram f1
+        re_art = re.compile(r'\b(a|an|the)\b')
+        re_punc = re.compile(r'[!"#$%&()*+,-./:;<=>?@\[\]\\^`{|}~_\']')
+        stop_words = set(stopwords.words('english'))
+        def utt2words(s):
+            """Lower text and remove punctuation, articles and extra whitespace."""
+            s = s.lower()
+            s = re_punc.sub(' ', s)
+            s = re_art.sub(' ', s)
+            return s.split()
+
+        metrics['unigram f1'] = []
+        metrics['unigram f1 (non-stop words)'] = []
+        for prediction, reference in zip(predictions, references):
+            pred_items = utt2words(prediction)
+            gold_items = utt2words(reference)
+            for remove_stopwords in [False, True]:
+                if remove_stopwords:
+                    pred_items = [w for w in pred_items if w not in stop_words]
+                    gold_items = [w for w in gold_items if w not in stop_words]
+                common = Counter(pred_items) & Counter(gold_items)
+                num_same = sum(common.values())
+                if num_same == 0:
+                    f1 = 0
+                else:
+                    precision = 1.0 * num_same / len(pred_items)
+                    recall = 1.0 * num_same / len(gold_items)
+                    f1 = (2 * precision * recall) / (precision + recall)
+                if not remove_stopwords:
+                    metrics['unigram f1'].append(f1)
+                else:
+                    metrics['unigram f1 (non-stop words)'].append(f1)
+        metrics['unigram f1'] = np.mean(metrics['unigram f1'])
+        metrics['unigram f1 (non-stop words)'] = np.mean(metrics['unigram f1 (non-stop words)'])
+
+        # rouge-1/2/L-fmeasure
+        rouge_types=["rouge1", "rouge2", "rougeL"]
+        scorer = rouge_scorer.RougeScorer(rouge_types=rouge_types, use_stemmer=True)
+        aggregator = scoring.BootstrapAggregator()
+
+        for prediction, reference in zip(predictions, references):
+            score = scorer.score(reference, prediction)
+            aggregator.add_scores(score)
+
+        result = aggregator.aggregate()
+        metrics.update({key: value.mid.fmeasure * 100 for key, value in result.items()})
+
+        # meteor
+        alpha=0.9
+        beta=3
+        gamma=0.5
+        if NLTK_VERSION >= version.Version("3.6.5"):
+            scores = [
+                meteor_score.single_meteor_score(
+                    word_tokenize(ref), word_tokenize(pred), alpha=alpha, beta=beta, gamma=gamma
+                )
+                for ref, pred in zip(references, predictions)
+            ]
+        else:
+            scores = [
+                meteor_score.single_meteor_score(ref, pred, alpha=alpha, beta=beta, gamma=gamma)
+                for ref, pred in zip(references, predictions)
+            ]
+        metrics.update({"meteor": np.mean(scores)})
+
+        # inter-distinct-1/2: unique n-grams over total n-grams across all predictions
+        def _ngram(seq, n):
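+            # yield all overlapping n-grams of seq as tuples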
+            for i in range(len(seq) - n + 1):
+                yield tuple(seq[i : i + n])
+        
+        for k in [1, 2]:
+            inter_cnt = Counter()
+            for prediction in predictions:
+                ngram = Counter(_ngram(utt2words(prediction), k))
+                inter_cnt += ngram
+            metrics[f'distinct-{k}'] = max(len(inter_cnt), 1e-12) / max(sum(inter_cnt.values()), 1e-5)
+
+        return metrics
diff --git a/convlab2/base_models/t5/key2gen/run.sh b/convlab2/base_models/t5/key2gen/run.sh
new file mode 100644
index 0000000000000000000000000000000000000000..74e44434ea1ab9e0099e680f2862521f7063049b
--- /dev/null
+++ b/convlab2/base_models/t5/key2gen/run.sh
@@ -0,0 +1,7 @@
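+# sweep few-shot PersonaChat key2gen runs over data ratios {0.1, 0.01} and dialogue-id orders {0, 1, 2}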
+for ratio in 0.1 0.01
+do
+    for dial_ids_order in 0 1 2
+    do
+        bash run_persona_fewshot_key2gen.sh ${ratio} ${dial_ids_order}
+    done
+done
\ No newline at end of file
diff --git a/convlab2/base_models/t5/key2gen/run2.sh b/convlab2/base_models/t5/key2gen/run2.sh
new file mode 100644
index 0000000000000000000000000000000000000000..7e7bf5d6a75e72d6c236fbf738a8c7d010f0e18b
--- /dev/null
+++ b/convlab2/base_models/t5/key2gen/run2.sh
@@ -0,0 +1,7 @@
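+# sweep few-shot Wizard-of-Wikipedia key2gen runs over data ratios {0.1, 0.01} and dialogue-id orders {0, 1, 2}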
+for ratio in 0.1 0.01
+do
+    for dial_ids_order in 0 1 2
+    do
+        bash run_wow_fewshot_key2gen.sh ${ratio} ${dial_ids_order}
+    done
+done
\ No newline at end of file
diff --git a/convlab2/base_models/t5/key2gen/run_commongen.sh b/convlab2/base_models/t5/key2gen/run_commongen.sh
new file mode 100644
index 0000000000000000000000000000000000000000..5952725709afa1848695158b76f04aa81ff328a6
--- /dev/null
+++ b/convlab2/base_models/t5/key2gen/run_commongen.sh
@@ -0,0 +1,85 @@
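+# baseline: fine-tune t5-small on CommonGen (concepts -> text) and evaluate on the validation split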
+n_gpus=1
+task_name="commongen"
+dataset_name="commongen"
+speaker="system"
+context_window_size=0
+data_dir="data/${task_name}/${dataset_name}"
+output_dir="output/${task_name}/${dataset_name}"
+cache_dir="../cache"
+logging_dir="${output_dir}/runs"
+train_file="${data_dir}/train.json"
+validation_file="${data_dir}/validation.json"
+test_file="${data_dir}/validation.json"
+metric_name_or_path="../nlg/nlg_metric.py"
+metric_for_best_model="bleu"
+source_column="concepts"
+target_column="text"
+truncation_side="left"
+max_source_length=512
+max_target_length=512
+model_name_or_path="t5-small"
+per_device_train_batch_size=128
+per_device_eval_batch_size=64
+gradient_accumulation_steps=4
+lr=1e-3
+num_train_epochs=10
+
+python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -s ${speaker} -c ${context_window_size}
+
+python ../run_seq2seq.py \
+    --task_name ${task_name} \
+    --train_file ${train_file} \
+    --validation_file ${validation_file} \
+    --source_column ${source_column} \
+    --target_column ${target_column} \
+    --max_source_length ${max_source_length} \
+    --max_target_length ${max_target_length} \
+    --truncation_side ${truncation_side} \
+    --model_name_or_path ${model_name_or_path} \
+    --do_train \
+    --do_eval \
+    --save_strategy epoch \
+    --evaluation_strategy epoch \
+    --save_total_limit 3 \
+    --prediction_loss_only \
+    --cache_dir ${cache_dir} \
+    --output_dir ${output_dir} \
+    --logging_dir ${logging_dir} \
+    --overwrite_output_dir \
+    --preprocessing_num_workers 4 \
+    --per_device_train_batch_size ${per_device_train_batch_size} \
+    --per_device_eval_batch_size ${per_device_eval_batch_size} \
+    --gradient_accumulation_steps ${gradient_accumulation_steps} \
+    --learning_rate ${lr} \
+    --num_train_epochs ${num_train_epochs} \
+    --adafactor \
+    --gradient_checkpointing
+
+python ../run_seq2seq.py \
+    --task_name ${task_name} \
+    --test_file ${test_file} \
+    --source_column ${source_column} \
+    --target_column ${target_column} \
+    --max_source_length ${max_source_length} \
+    --max_target_length ${max_target_length} \
+    --truncation_side ${truncation_side} \
+    --model_name_or_path ${output_dir} \
+    --do_predict \
+    --predict_with_generate \
+    --metric_name_or_path ${metric_name_or_path} \
+    --cache_dir ${cache_dir} \
+    --output_dir ${output_dir} \
+    --logging_dir ${logging_dir} \
+    --overwrite_output_dir \
+    --preprocessing_num_workers 4 \
+    --per_device_train_batch_size ${per_device_train_batch_size} \
+    --per_device_eval_batch_size ${per_device_eval_batch_size} \
+    --gradient_accumulation_steps ${gradient_accumulation_steps} \
+    --learning_rate ${lr} \
+    --num_train_epochs ${num_train_epochs} \
+    --adafactor \
+    --gradient_checkpointing
+
+python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
+
+python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
diff --git a/convlab2/base_models/t5/key2gen/run_commongen_key2gen.sh b/convlab2/base_models/t5/key2gen/run_commongen_key2gen.sh
new file mode 100644
index 0000000000000000000000000000000000000000..e09d3b464e2685dd8e0d774539f8c8dc4808f99c
--- /dev/null
+++ b/convlab2/base_models/t5/key2gen/run_commongen_key2gen.sh
@@ -0,0 +1,88 @@
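+# key2gen variant: fine-tune the key2gen_shuffle_noisy checkpoint on CommonGen with a "keywords: " source prefix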
+n_gpus=1
+task_name="commongen"
+dataset_name="commongen"
+speaker="system"
+context_window_size=0
+data_dir="data/${task_name}/key2gen_${dataset_name}"
+output_dir="output/${task_name}/key2gen_${dataset_name}"
+cache_dir="../cache"
+logging_dir="${output_dir}/runs"
+train_file="${data_dir}/train.json"
+validation_file="${data_dir}/validation.json"
+test_file="${data_dir}/validation.json"
+metric_name_or_path="../nlg/nlg_metric.py"
+metric_for_best_model="bleu"
+source_column="concepts"
+target_column="text"
+source_prefix="keywords: "
+truncation_side="left"
+max_source_length=512
+max_target_length=512
+model_name_or_path="../../gpt/keyword_extraction/output/key2gen_shuffle_noisy/gpt/dailydialog+metalwoz+sgd+tm1+tm2+tm3"
+per_device_train_batch_size=128
+per_device_eval_batch_size=64
+gradient_accumulation_steps=4
+lr=1e-3
+num_train_epochs=10
+
+python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -s ${speaker} -c ${context_window_size} --key2gen
+
+python ../run_seq2seq.py \
+    --task_name ${task_name} \
+    --train_file ${train_file} \
+    --validation_file ${validation_file} \
+    --source_column ${source_column} \
+    --target_column ${target_column} \
+    --source_prefix ${source_prefix} \
+    --max_source_length ${max_source_length} \
+    --max_target_length ${max_target_length} \
+    --truncation_side ${truncation_side} \
+    --model_name_or_path ${model_name_or_path} \
+    --do_train \
+    --do_eval \
+    --save_strategy epoch \
+    --evaluation_strategy epoch \
+    --save_total_limit 3 \
+    --prediction_loss_only \
+    --cache_dir ${cache_dir} \
+    --output_dir ${output_dir} \
+    --logging_dir ${logging_dir} \
+    --overwrite_output_dir \
+    --preprocessing_num_workers 4 \
+    --per_device_train_batch_size ${per_device_train_batch_size} \
+    --per_device_eval_batch_size ${per_device_eval_batch_size} \
+    --gradient_accumulation_steps ${gradient_accumulation_steps} \
+    --learning_rate ${lr} \
+    --num_train_epochs ${num_train_epochs} \
+    --adafactor \
+    --gradient_checkpointing
+
+python ../run_seq2seq.py \
+    --task_name ${task_name} \
+    --test_file ${test_file} \
+    --source_column ${source_column} \
+    --target_column ${target_column} \
+    --source_prefix ${source_prefix} \
+    --max_source_length ${max_source_length} \
+    --max_target_length ${max_target_length} \
+    --truncation_side ${truncation_side} \
+    --model_name_or_path ${output_dir} \
+    --do_predict \
+    --predict_with_generate \
+    --metric_name_or_path ${metric_name_or_path} \
+    --cache_dir ${cache_dir} \
+    --output_dir ${output_dir} \
+    --logging_dir ${logging_dir} \
+    --overwrite_output_dir \
+    --preprocessing_num_workers 4 \
+    --per_device_train_batch_size ${per_device_train_batch_size} \
+    --per_device_eval_batch_size ${per_device_eval_batch_size} \
+    --gradient_accumulation_steps ${gradient_accumulation_steps} \
+    --learning_rate ${lr} \
+    --num_train_epochs ${num_train_epochs} \
+    --adafactor \
+    --gradient_checkpointing
+
+python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
+
+python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
diff --git a/convlab2/base_models/t5/key2gen/run_dart.sh b/convlab2/base_models/t5/key2gen/run_dart.sh
index 6c346408c18c3ead330018cae58effd97fb45265..62eb3236f0c468c6b185eb9c5c39e807e3a92800 100644
--- a/convlab2/base_models/t5/key2gen/run_dart.sh
+++ b/convlab2/base_models/t5/key2gen/run_dart.sh
@@ -3,7 +3,7 @@ task_name="dart"
 dataset_name="dart"
 speaker="system"
 context_window_size=0
-data_dir="data/${task_name}/${dataset_name}"
+data_dir="data/${task_name}/key2gen_${dataset_name}"
 output_dir="output/${task_name}/${dataset_name}"
 cache_dir="../cache"
 logging_dir="${output_dir}/runs"
@@ -24,36 +24,36 @@ gradient_accumulation_steps=4
 lr=1e-3
 num_train_epochs=10
 
-python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -s ${speaker} -c ${context_window_size}
+# python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -s ${speaker} -c ${context_window_size}
 
-python ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --train_file ${train_file} \
-    --validation_file ${validation_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --max_source_length ${max_source_length} \
-    --max_target_length ${max_target_length} \
-    --truncation_side ${truncation_side} \
-    --model_name_or_path ${model_name_or_path} \
-    --do_train \
-    --do_eval \
-    --save_strategy epoch \
-    --evaluation_strategy epoch \
-    --save_total_limit 3 \
-    --prediction_loss_only \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_train_batch_size ${per_device_train_batch_size} \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
-    --gradient_accumulation_steps ${gradient_accumulation_steps} \
-    --learning_rate ${lr} \
-    --num_train_epochs ${num_train_epochs} \
-    --adafactor \
-    --gradient_checkpointing
+# python ../run_seq2seq.py \
+#     --task_name ${task_name} \
+#     --train_file ${train_file} \
+#     --validation_file ${validation_file} \
+#     --source_column ${source_column} \
+#     --target_column ${target_column} \
+#     --max_source_length ${max_source_length} \
+#     --max_target_length ${max_target_length} \
+#     --truncation_side ${truncation_side} \
+#     --model_name_or_path ${model_name_or_path} \
+#     --do_train \
+#     --do_eval \
+#     --save_strategy epoch \
+#     --evaluation_strategy epoch \
+#     --save_total_limit 3 \
+#     --prediction_loss_only \
+#     --cache_dir ${cache_dir} \
+#     --output_dir ${output_dir} \
+#     --logging_dir ${logging_dir} \
+#     --overwrite_output_dir \
+#     --preprocessing_num_workers 4 \
+#     --per_device_train_batch_size ${per_device_train_batch_size} \
+#     --per_device_eval_batch_size ${per_device_eval_batch_size} \
+#     --gradient_accumulation_steps ${gradient_accumulation_steps} \
+#     --learning_rate ${lr} \
+#     --num_train_epochs ${num_train_epochs} \
+#     --adafactor \
+#     --gradient_checkpointing
 
 python ../run_seq2seq.py \
     --task_name ${task_name} \
@@ -63,7 +63,7 @@ python ../run_seq2seq.py \
     --max_source_length ${max_source_length} \
     --max_target_length ${max_target_length} \
     --truncation_side ${truncation_side} \
-    --model_name_or_path ${output_dir} \
+    --model_name_or_path ${model_name_or_path} \
     --do_predict \
     --predict_with_generate \
     --metric_name_or_path ${metric_name_or_path} \
@@ -80,6 +80,6 @@ python ../run_seq2seq.py \
     --adafactor \
     --gradient_checkpointing
 
-python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
+# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
 
-python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
+# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
diff --git a/convlab2/base_models/t5/key2gen/run_dart_key2gen.sh b/convlab2/base_models/t5/key2gen/run_dart_key2gen.sh
index 8f539f469d58ff169ad7a6c71f0a38a894acf566..b729da6fdf27ed61dad7a2a48120fbbbaef49afc 100644
--- a/convlab2/base_models/t5/key2gen/run_dart_key2gen.sh
+++ b/convlab2/base_models/t5/key2gen/run_dart_key2gen.sh
@@ -3,7 +3,7 @@ task_name="dart"
 dataset_name="dart"
 speaker="system"
 context_window_size=0
-data_dir="data/${task_name}/key2gen_${dataset_name}"
+data_dir="data/${task_name}/${dataset_name}"
 output_dir="output/${task_name}/key2gen_${dataset_name}"
 cache_dir="../cache"
 logging_dir="${output_dir}/runs"
@@ -14,56 +14,59 @@ metric_name_or_path="../nlg/nlg_metric.py"
 metric_for_best_model="bleu"
 source_column="triples"
 target_column="text"
+source_prefix="possible keywords: "
 truncation_side="left"
 max_source_length=512
 max_target_length=512
-model_name_or_path="t5-small"
+model_name_or_path="../../gpt/keyword_extraction/output/key2gen_shuffle_noisy/gpt/dailydialog+metalwoz+sgd+tm1+tm2+tm3"
 per_device_train_batch_size=128
 per_device_eval_batch_size=64
 gradient_accumulation_steps=4
 lr=1e-3
 num_train_epochs=10
 
-python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -s ${speaker} -c ${context_window_size} --key2gen
+# python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -s ${speaker} -c ${context_window_size} --key2gen
 
-python ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --train_file ${train_file} \
-    --validation_file ${validation_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --max_source_length ${max_source_length} \
-    --max_target_length ${max_target_length} \
-    --truncation_side ${truncation_side} \
-    --model_name_or_path ${model_name_or_path} \
-    --do_train \
-    --do_eval \
-    --save_strategy epoch \
-    --evaluation_strategy epoch \
-    --save_total_limit 3 \
-    --prediction_loss_only \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_train_batch_size ${per_device_train_batch_size} \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
-    --gradient_accumulation_steps ${gradient_accumulation_steps} \
-    --learning_rate ${lr} \
-    --num_train_epochs ${num_train_epochs} \
-    --adafactor \
-    --gradient_checkpointing
+# python ../run_seq2seq.py \
+#     --task_name ${task_name} \
+#     --train_file ${train_file} \
+#     --validation_file ${validation_file} \
+#     --source_column ${source_column} \
+#     --target_column ${target_column} \
+#     --source_prefix "${source_prefix}" \
+#     --max_source_length ${max_source_length} \
+#     --max_target_length ${max_target_length} \
+#     --truncation_side ${truncation_side} \
+#     --model_name_or_path ${model_name_or_path} \
+#     --do_train \
+#     --do_eval \
+#     --save_strategy epoch \
+#     --evaluation_strategy epoch \
+#     --save_total_limit 3 \
+#     --prediction_loss_only \
+#     --cache_dir ${cache_dir} \
+#     --output_dir ${output_dir} \
+#     --logging_dir ${logging_dir} \
+#     --overwrite_output_dir \
+#     --preprocessing_num_workers 4 \
+#     --per_device_train_batch_size ${per_device_train_batch_size} \
+#     --per_device_eval_batch_size ${per_device_eval_batch_size} \
+#     --gradient_accumulation_steps ${gradient_accumulation_steps} \
+#     --learning_rate ${lr} \
+#     --num_train_epochs ${num_train_epochs} \
+#     --adafactor \
+#     --gradient_checkpointing
 
 python ../run_seq2seq.py \
     --task_name ${task_name} \
     --test_file ${test_file} \
     --source_column ${source_column} \
     --target_column ${target_column} \
+    --source_prefix "${source_prefix}" \
     --max_source_length ${max_source_length} \
     --max_target_length ${max_target_length} \
     --truncation_side ${truncation_side} \
-    --model_name_or_path ${output_dir} \
+    --model_name_or_path ${model_name_or_path} \
     --do_predict \
     --predict_with_generate \
     --metric_name_or_path ${metric_name_or_path} \
@@ -80,6 +83,6 @@ python ../run_seq2seq.py \
     --adafactor \
     --gradient_checkpointing
 
-python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
+# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
 
-python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
+# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
diff --git a/convlab2/base_models/t5/key2gen/run_kvret.sh b/convlab2/base_models/t5/key2gen/run_kvret.sh
new file mode 100644
index 0000000000000000000000000000000000000000..b960ed18735d804f82116b4307223b4245f2b368
--- /dev/null
+++ b/convlab2/base_models/t5/key2gen/run_kvret.sh
@@ -0,0 +1,85 @@
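+# baseline: fine-tune t5-small on KVRET (dialogue context + serialized db results -> response)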
+n_gpus=1
+task_name="kvret"
+dataset_name="kvret"
+speaker="system"
+data_dir="data/${task_name}/${dataset_name}"
+output_dir="output/${task_name}/${dataset_name}"
+cache_dir="../cache"
+logging_dir="${output_dir}/runs"
+train_file="${data_dir}/train.json"
+validation_file="${data_dir}/validation.json"
+test_file="${data_dir}/test.json"
+metric_name_or_path="../nlg/nlg_metric.py"
+metric_for_best_model="bleu"
+source_column="context+db"
+target_column="response"
+truncation_side="left"
+max_source_length=1024
+max_target_length=512
+model_name_or_path="t5-small"
+per_device_train_batch_size=32
+per_device_eval_batch_size=64
+gradient_accumulation_steps=4
+lr=1e-3
+num_train_epochs=10
+
+python create_data_key2gen.py -t ${task_name} -d ${dataset_name}
+
+python ../run_seq2seq.py \
+    --task_name ${task_name} \
+    --train_file ${train_file} \
+    --validation_file ${validation_file} \
+    --source_column ${source_column} \
+    --target_column ${target_column} \
+    --max_source_length ${max_source_length} \
+    --max_target_length ${max_target_length} \
+    --truncation_side ${truncation_side} \
+    --model_name_or_path ${model_name_or_path} \
+    --do_train \
+    --do_eval \
+    --save_strategy epoch \
+    --evaluation_strategy epoch \
+    --save_total_limit 1 \
+    --prediction_loss_only \
+    --load_best_model_at_end \
+    --cache_dir ${cache_dir} \
+    --output_dir ${output_dir} \
+    --logging_dir ${logging_dir} \
+    --overwrite_output_dir \
+    --preprocessing_num_workers 4 \
+    --per_device_train_batch_size ${per_device_train_batch_size} \
+    --per_device_eval_batch_size ${per_device_eval_batch_size} \
+    --gradient_accumulation_steps ${gradient_accumulation_steps} \
+    --learning_rate ${lr} \
+    --num_train_epochs ${num_train_epochs} \
+    --adafactor \
+    --gradient_checkpointing
+
+python ../run_seq2seq.py \
+    --task_name ${task_name} \
+    --test_file ${test_file} \
+    --source_column ${source_column} \
+    --target_column ${target_column} \
+    --max_source_length ${max_source_length} \
+    --max_target_length ${max_target_length} \
+    --truncation_side ${truncation_side} \
+    --model_name_or_path ${output_dir} \
+    --do_predict \
+    --predict_with_generate \
+    --metric_name_or_path ${metric_name_or_path} \
+    --cache_dir ${cache_dir} \
+    --output_dir ${output_dir} \
+    --logging_dir ${logging_dir} \
+    --overwrite_output_dir \
+    --preprocessing_num_workers 4 \
+    --per_device_train_batch_size ${per_device_train_batch_size} \
+    --per_device_eval_batch_size ${per_device_eval_batch_size} \
+    --gradient_accumulation_steps ${gradient_accumulation_steps} \
+    --learning_rate ${lr} \
+    --num_train_epochs ${num_train_epochs} \
+    --adafactor \
+    --gradient_checkpointing
+
+# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
+
+# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
diff --git a/convlab2/base_models/t5/key2gen/run_kvret_fewshot.sh b/convlab2/base_models/t5/key2gen/run_kvret_fewshot.sh
new file mode 100644
index 0000000000000000000000000000000000000000..a6523d6cea506b6433f2cd853ca6255a28cfd8a6
--- /dev/null
+++ b/convlab2/base_models/t5/key2gen/run_kvret_fewshot.sh
@@ -0,0 +1,87 @@
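+# few-shot KVRET baseline; usage: bash run_kvret_fewshot.sh <ratio> <dial_ids_order>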
+n_gpus=1
+task_name="kvret"
+dataset_name="kvret"
+speaker="system"
+ratio=$1
+dial_ids_order=$2
+data_dir="data/${task_name}/${dataset_name}_${ratio}_order${dial_ids_order}"
+output_dir="output/${task_name}/${dataset_name}_${ratio}_order${dial_ids_order}"
+cache_dir="../cache"
+logging_dir="${output_dir}/runs"
+train_file="${data_dir}/train.json"
+validation_file="${data_dir}/validation.json"
+test_file="${data_dir}/test.json"
+metric_name_or_path="../nlg/nlg_metric.py"
+metric_for_best_model="bleu"
+source_column="context+db"
+target_column="response"
+truncation_side="left"
+max_source_length=1024
+max_target_length=512
+model_name_or_path="t5-small"
+per_device_train_batch_size=32
+per_device_eval_batch_size=64
+gradient_accumulation_steps=4
+lr=1e-3
+num_train_epochs=100
+
+python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -r ${ratio} -o ${dial_ids_order}
+
+python ../run_seq2seq.py \
+    --task_name ${task_name} \
+    --train_file ${train_file} \
+    --validation_file ${validation_file} \
+    --source_column ${source_column} \
+    --target_column ${target_column} \
+    --max_source_length ${max_source_length} \
+    --max_target_length ${max_target_length} \
+    --truncation_side ${truncation_side} \
+    --model_name_or_path ${model_name_or_path} \
+    --do_train \
+    --do_eval \
+    --save_strategy epoch \
+    --evaluation_strategy epoch \
+    --save_total_limit 1 \
+    --prediction_loss_only \
+    --load_best_model_at_end \
+    --cache_dir ${cache_dir} \
+    --output_dir ${output_dir} \
+    --logging_dir ${logging_dir} \
+    --overwrite_output_dir \
+    --preprocessing_num_workers 4 \
+    --per_device_train_batch_size ${per_device_train_batch_size} \
+    --per_device_eval_batch_size ${per_device_eval_batch_size} \
+    --gradient_accumulation_steps ${gradient_accumulation_steps} \
+    --learning_rate ${lr} \
+    --num_train_epochs ${num_train_epochs} \
+    --adafactor \
+    --gradient_checkpointing
+
+python ../run_seq2seq.py \
+    --task_name ${task_name} \
+    --test_file ${test_file} \
+    --source_column ${source_column} \
+    --target_column ${target_column} \
+    --max_source_length ${max_source_length} \
+    --max_target_length ${max_target_length} \
+    --truncation_side ${truncation_side} \
+    --model_name_or_path ${output_dir} \
+    --do_predict \
+    --predict_with_generate \
+    --metric_name_or_path ${metric_name_or_path} \
+    --cache_dir ${cache_dir} \
+    --output_dir ${output_dir} \
+    --logging_dir ${logging_dir} \
+    --overwrite_output_dir \
+    --preprocessing_num_workers 4 \
+    --per_device_train_batch_size ${per_device_train_batch_size} \
+    --per_device_eval_batch_size ${per_device_eval_batch_size} \
+    --gradient_accumulation_steps ${gradient_accumulation_steps} \
+    --learning_rate ${lr} \
+    --num_train_epochs ${num_train_epochs} \
+    --adafactor \
+    --gradient_checkpointing
+
+# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
+
+# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
diff --git a/convlab2/base_models/t5/key2gen/run_kvret_fewshot_key2gen.sh b/convlab2/base_models/t5/key2gen/run_kvret_fewshot_key2gen.sh
new file mode 100644
index 0000000000000000000000000000000000000000..d19ae9c1ca3890896491839c76e3d7beff9bdbfc
--- /dev/null
+++ b/convlab2/base_models/t5/key2gen/run_kvret_fewshot_key2gen.sh
@@ -0,0 +1,87 @@
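+# few-shot KVRET starting from the key2gen_noisy checkpoint; usage: bash run_kvret_fewshot_key2gen.sh <ratio> <dial_ids_order>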
+n_gpus=1
+task_name="kvret"
+dataset_name="kvret"
+speaker="system"
+ratio=$1
+dial_ids_order=$2
+data_dir="data/${task_name}/key2gen_${dataset_name}_${ratio}_order${dial_ids_order}"
+output_dir="output/${task_name}/key2gen_${dataset_name}_${ratio}_order${dial_ids_order}"
+cache_dir="../cache"
+logging_dir="${output_dir}/runs"
+train_file="${data_dir}/train.json"
+validation_file="${data_dir}/validation.json"
+test_file="${data_dir}/test.json"
+metric_name_or_path="../nlg/nlg_metric.py"
+metric_for_best_model="bleu"
+source_column="context+db"
+target_column="response"
+truncation_side="left"
+max_source_length=1024
+max_target_length=512
+model_name_or_path="../../gpt/keyword_extraction/output/key2gen_noisy/gpt/dailydialog+metalwoz+tm1+tm2+tm3"
+per_device_train_batch_size=32
+per_device_eval_batch_size=64
+gradient_accumulation_steps=4
+lr=1e-3
+num_train_epochs=100
+
+python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -r ${ratio} -o ${dial_ids_order} --key2gen
+
+python ../run_seq2seq.py \
+    --task_name ${task_name} \
+    --train_file ${train_file} \
+    --validation_file ${validation_file} \
+    --source_column ${source_column} \
+    --target_column ${target_column} \
+    --max_source_length ${max_source_length} \
+    --max_target_length ${max_target_length} \
+    --truncation_side ${truncation_side} \
+    --model_name_or_path ${model_name_or_path} \
+    --do_train \
+    --do_eval \
+    --save_strategy epoch \
+    --evaluation_strategy epoch \
+    --save_total_limit 1 \
+    --prediction_loss_only \
+    --load_best_model_at_end \
+    --cache_dir ${cache_dir} \
+    --output_dir ${output_dir} \
+    --logging_dir ${logging_dir} \
+    --overwrite_output_dir \
+    --preprocessing_num_workers 4 \
+    --per_device_train_batch_size ${per_device_train_batch_size} \
+    --per_device_eval_batch_size ${per_device_eval_batch_size} \
+    --gradient_accumulation_steps ${gradient_accumulation_steps} \
+    --learning_rate ${lr} \
+    --num_train_epochs ${num_train_epochs} \
+    --adafactor \
+    --gradient_checkpointing
+
+python ../run_seq2seq.py \
+    --task_name ${task_name} \
+    --test_file ${test_file} \
+    --source_column ${source_column} \
+    --target_column ${target_column} \
+    --max_source_length ${max_source_length} \
+    --max_target_length ${max_target_length} \
+    --truncation_side ${truncation_side} \
+    --model_name_or_path ${output_dir} \
+    --do_predict \
+    --predict_with_generate \
+    --metric_name_or_path ${metric_name_or_path} \
+    --cache_dir ${cache_dir} \
+    --output_dir ${output_dir} \
+    --logging_dir ${logging_dir} \
+    --overwrite_output_dir \
+    --preprocessing_num_workers 4 \
+    --per_device_train_batch_size ${per_device_train_batch_size} \
+    --per_device_eval_batch_size ${per_device_eval_batch_size} \
+    --gradient_accumulation_steps ${gradient_accumulation_steps} \
+    --learning_rate ${lr} \
+    --num_train_epochs ${num_train_epochs} \
+    --adafactor \
+    --gradient_checkpointing
+
+# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
+
+# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
diff --git a/convlab2/base_models/t5/key2gen/run_kvret_key2gen.sh b/convlab2/base_models/t5/key2gen/run_kvret_key2gen.sh
new file mode 100644
index 0000000000000000000000000000000000000000..44f8dbc6732f8ccd3470b8d5fd62a886898b949c
--- /dev/null
+++ b/convlab2/base_models/t5/key2gen/run_kvret_key2gen.sh
@@ -0,0 +1,85 @@
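+# Full-data KVRET response generation with the key2gen-pretrained checkpoint: build key2gen-formatted data, fine-tune, then decode the test set.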
+n_gpus=1
+task_name="kvret"
+dataset_name="kvret"
+speaker="system"
+data_dir="data/${task_name}/${dataset_name}"
+output_dir="output/${task_name}/key2gen_${dataset_name}"
+cache_dir="../cache"
+logging_dir="${output_dir}/runs"
+train_file="${data_dir}/train.json"
+validation_file="${data_dir}/validation.json"
+test_file="${data_dir}/test.json"
+metric_name_or_path="../nlg/nlg_metric.py"
+metric_for_best_model="bleu"
+source_column="context+db"
+target_column="response"
+truncation_side="left"
+max_source_length=1024
+max_target_length=512
+model_name_or_path="../../gpt/keyword_extraction/output/key2gen_noisy/gpt/dailydialog+metalwoz+tm1+tm2+tm3"
+per_device_train_batch_size=32
+per_device_eval_batch_size=64
+gradient_accumulation_steps=4
+lr=1e-3
+num_train_epochs=10
+
+python create_data_key2gen.py -t ${task_name} -d ${dataset_name} --key2gen
+
+python ../run_seq2seq.py \
+    --task_name ${task_name} \
+    --train_file ${train_file} \
+    --validation_file ${validation_file} \
+    --source_column ${source_column} \
+    --target_column ${target_column} \
+    --max_source_length ${max_source_length} \
+    --max_target_length ${max_target_length} \
+    --truncation_side ${truncation_side} \
+    --model_name_or_path ${model_name_or_path} \
+    --do_train \
+    --do_eval \
+    --save_strategy epoch \
+    --evaluation_strategy epoch \
+    --save_total_limit 1 \
+    --prediction_loss_only \
+    --load_best_model_at_end \
+    --cache_dir ${cache_dir} \
+    --output_dir ${output_dir} \
+    --logging_dir ${logging_dir} \
+    --overwrite_output_dir \
+    --preprocessing_num_workers 4 \
+    --per_device_train_batch_size ${per_device_train_batch_size} \
+    --per_device_eval_batch_size ${per_device_eval_batch_size} \
+    --gradient_accumulation_steps ${gradient_accumulation_steps} \
+    --learning_rate ${lr} \
+    --num_train_epochs ${num_train_epochs} \
+    --adafactor \
+    --gradient_checkpointing
+
+python ../run_seq2seq.py \
+    --task_name ${task_name} \
+    --test_file ${test_file} \
+    --source_column ${source_column} \
+    --target_column ${target_column} \
+    --max_source_length ${max_source_length} \
+    --max_target_length ${max_target_length} \
+    --truncation_side ${truncation_side} \
+    --model_name_or_path ${output_dir} \
+    --do_predict \
+    --predict_with_generate \
+    --metric_name_or_path ${metric_name_or_path} \
+    --cache_dir ${cache_dir} \
+    --output_dir ${output_dir} \
+    --logging_dir ${logging_dir} \
+    --overwrite_output_dir \
+    --preprocessing_num_workers 4 \
+    --per_device_train_batch_size ${per_device_train_batch_size} \
+    --per_device_eval_batch_size ${per_device_eval_batch_size} \
+    --gradient_accumulation_steps ${gradient_accumulation_steps} \
+    --learning_rate ${lr} \
+    --num_train_epochs ${num_train_epochs} \
+    --adafactor \
+    --gradient_checkpointing
+
+# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
+
+# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
diff --git a/convlab2/base_models/t5/key2gen/run_nlg.sh b/convlab2/base_models/t5/key2gen/run_nlg.sh
index 65140ce334b4b7e7de1201eeef628ad017d03165..ec225ef719f25e970bd00ca662491fe76fd6b8af 100644
--- a/convlab2/base_models/t5/key2gen/run_nlg.sh
+++ b/convlab2/base_models/t5/key2gen/run_nlg.sh
@@ -24,61 +24,62 @@ gradient_accumulation_steps=4
 lr=1e-3
 num_train_epochs=10
 
-python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -s ${speaker} -c ${context_window_size}
+# python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -s ${speaker} -c ${context_window_size}
 
-python ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --train_file ${train_file} \
-    --validation_file ${validation_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --max_source_length ${max_source_length} \
-    --max_target_length ${max_target_length} \
-    --truncation_side ${truncation_side} \
-    --model_name_or_path ${model_name_or_path} \
-    --do_train \
-    --do_eval \
-    --save_strategy epoch \
-    --evaluation_strategy epoch \
-    --save_total_limit 3 \
-    --prediction_loss_only \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_train_batch_size ${per_device_train_batch_size} \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
-    --gradient_accumulation_steps ${gradient_accumulation_steps} \
-    --learning_rate ${lr} \
-    --num_train_epochs ${num_train_epochs} \
-    --adafactor \
-    --gradient_checkpointing
+# python ../run_seq2seq.py \
+#     --task_name ${task_name} \
+#     --train_file ${train_file} \
+#     --validation_file ${validation_file} \
+#     --source_column ${source_column} \
+#     --target_column ${target_column} \
+#     --max_source_length ${max_source_length} \
+#     --max_target_length ${max_target_length} \
+#     --truncation_side ${truncation_side} \
+#     --model_name_or_path ${model_name_or_path} \
+#     --do_train \
+#     --do_eval \
+#     --save_strategy epoch \
+#     --evaluation_strategy epoch \
+#     --save_total_limit 1 \
+#     --prediction_loss_only \
+#     --load_best_model_at_end \
+#     --cache_dir ${cache_dir} \
+#     --output_dir ${output_dir} \
+#     --logging_dir ${logging_dir} \
+#     --overwrite_output_dir \
+#     --preprocessing_num_workers 4 \
+#     --per_device_train_batch_size ${per_device_train_batch_size} \
+#     --per_device_eval_batch_size ${per_device_eval_batch_size} \
+#     --gradient_accumulation_steps ${gradient_accumulation_steps} \
+#     --learning_rate ${lr} \
+#     --num_train_epochs ${num_train_epochs} \
+#     --adafactor \
+#     --gradient_checkpointing
 
-python ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --test_file ${test_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --max_source_length ${max_source_length} \
-    --max_target_length ${max_target_length} \
-    --truncation_side ${truncation_side} \
-    --model_name_or_path ${output_dir} \
-    --do_predict \
-    --predict_with_generate \
-    --metric_name_or_path ${metric_name_or_path} \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_train_batch_size ${per_device_train_batch_size} \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
-    --gradient_accumulation_steps ${gradient_accumulation_steps} \
-    --learning_rate ${lr} \
-    --num_train_epochs ${num_train_epochs} \
-    --adafactor \
-    --gradient_checkpointing
+# python ../run_seq2seq.py \
+#     --task_name ${task_name} \
+#     --test_file ${test_file} \
+#     --source_column ${source_column} \
+#     --target_column ${target_column} \
+#     --max_source_length ${max_source_length} \
+#     --max_target_length ${max_target_length} \
+#     --truncation_side ${truncation_side} \
+#     --model_name_or_path ${output_dir} \
+#     --do_predict \
+#     --predict_with_generate \
+#     --metric_name_or_path ${metric_name_or_path} \
+#     --cache_dir ${cache_dir} \
+#     --output_dir ${output_dir} \
+#     --logging_dir ${logging_dir} \
+#     --overwrite_output_dir \
+#     --preprocessing_num_workers 4 \
+#     --per_device_train_batch_size ${per_device_train_batch_size} \
+#     --per_device_eval_batch_size ${per_device_eval_batch_size} \
+#     --gradient_accumulation_steps ${gradient_accumulation_steps} \
+#     --learning_rate ${lr} \
+#     --num_train_epochs ${num_train_epochs} \
+#     --adafactor \
+#     --gradient_checkpointing
 
 python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
 
diff --git a/convlab2/base_models/t5/key2gen/run_nlg_fewshot.sh b/convlab2/base_models/t5/key2gen/run_nlg_fewshot.sh
index b831beef20ca0d44eb8809c8823d2fdb56b656f7..f8ab2ee0f5969f2c27552de4fcf82ede5526c384 100644
--- a/convlab2/base_models/t5/key2gen/run_nlg_fewshot.sh
+++ b/convlab2/base_models/t5/key2gen/run_nlg_fewshot.sh
@@ -26,62 +26,62 @@ gradient_accumulation_steps=4
 lr=1e-3
 num_train_epochs=100
 
-python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -s ${speaker} -c ${context_window_size} -r ${ratio} -o ${dial_ids_order}
+# python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -s ${speaker} -c ${context_window_size} -r ${ratio} -o ${dial_ids_order}
 
-python ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --train_file ${train_file} \
-    --validation_file ${validation_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --max_source_length ${max_source_length} \
-    --max_target_length ${max_target_length} \
-    --truncation_side ${truncation_side} \
-    --model_name_or_path ${model_name_or_path} \
-    --do_train \
-    --do_eval \
-    --save_strategy epoch \
-    --evaluation_strategy epoch \
-    --save_total_limit 3 \
-    --prediction_loss_only \
-    --load_best_model_at_end \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_train_batch_size ${per_device_train_batch_size} \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
-    --gradient_accumulation_steps ${gradient_accumulation_steps} \
-    --learning_rate ${lr} \
-    --num_train_epochs ${num_train_epochs} \
-    --adafactor \
-    --gradient_checkpointing
+# python ../run_seq2seq.py \
+#     --task_name ${task_name} \
+#     --train_file ${train_file} \
+#     --validation_file ${validation_file} \
+#     --source_column ${source_column} \
+#     --target_column ${target_column} \
+#     --max_source_length ${max_source_length} \
+#     --max_target_length ${max_target_length} \
+#     --truncation_side ${truncation_side} \
+#     --model_name_or_path ${model_name_or_path} \
+#     --do_train \
+#     --do_eval \
+#     --save_strategy epoch \
+#     --evaluation_strategy epoch \
+#     --save_total_limit 1 \
+#     --prediction_loss_only \
+#     --load_best_model_at_end \
+#     --cache_dir ${cache_dir} \
+#     --output_dir ${output_dir} \
+#     --logging_dir ${logging_dir} \
+#     --overwrite_output_dir \
+#     --preprocessing_num_workers 4 \
+#     --per_device_train_batch_size ${per_device_train_batch_size} \
+#     --per_device_eval_batch_size ${per_device_eval_batch_size} \
+#     --gradient_accumulation_steps ${gradient_accumulation_steps} \
+#     --learning_rate ${lr} \
+#     --num_train_epochs ${num_train_epochs} \
+#     --adafactor \
+#     --gradient_checkpointing
 
-python ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --test_file ${test_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --max_source_length ${max_source_length} \
-    --max_target_length ${max_target_length} \
-    --truncation_side ${truncation_side} \
-    --model_name_or_path ${output_dir} \
-    --do_predict \
-    --predict_with_generate \
-    --metric_name_or_path ${metric_name_or_path} \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_train_batch_size ${per_device_train_batch_size} \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
-    --gradient_accumulation_steps ${gradient_accumulation_steps} \
-    --learning_rate ${lr} \
-    --num_train_epochs ${num_train_epochs} \
-    --adafactor \
-    --gradient_checkpointing
+# python ../run_seq2seq.py \
+#     --task_name ${task_name} \
+#     --test_file ${test_file} \
+#     --source_column ${source_column} \
+#     --target_column ${target_column} \
+#     --max_source_length ${max_source_length} \
+#     --max_target_length ${max_target_length} \
+#     --truncation_side ${truncation_side} \
+#     --model_name_or_path ${output_dir} \
+#     --do_predict \
+#     --predict_with_generate \
+#     --metric_name_or_path ${metric_name_or_path} \
+#     --cache_dir ${cache_dir} \
+#     --output_dir ${output_dir} \
+#     --logging_dir ${logging_dir} \
+#     --overwrite_output_dir \
+#     --preprocessing_num_workers 4 \
+#     --per_device_train_batch_size ${per_device_train_batch_size} \
+#     --per_device_eval_batch_size ${per_device_eval_batch_size} \
+#     --gradient_accumulation_steps ${gradient_accumulation_steps} \
+#     --learning_rate ${lr} \
+#     --num_train_epochs ${num_train_epochs} \
+#     --adafactor \
+#     --gradient_checkpointing
 
 python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json -o ${dial_ids_order}
 
diff --git a/convlab2/base_models/t5/key2gen/run_nlg_fewshot_key2gen.sh b/convlab2/base_models/t5/key2gen/run_nlg_fewshot_key2gen.sh
index 1ffcc869e554e6002cb4408e7eb3607acb9f1119..c13a187799bcfb6364d5d9d8a22f4ede96c87897 100644
--- a/convlab2/base_models/t5/key2gen/run_nlg_fewshot_key2gen.sh
+++ b/convlab2/base_models/t5/key2gen/run_nlg_fewshot_key2gen.sh
@@ -19,69 +19,69 @@ target_column="response"
 truncation_side="left"
 max_source_length=512
 max_target_length=512
-model_name_or_path="t5-small"
+model_name_or_path="../../gpt/keyword_extraction/output/key2gen_noisy/gpt/dailydialog+metalwoz+tm1+tm2+tm3"
 per_device_train_batch_size=128
 per_device_eval_batch_size=64
 gradient_accumulation_steps=4
 lr=1e-3
 num_train_epochs=100
 
-python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -s ${speaker} -c ${context_window_size} -r ${ratio} -o ${dial_ids_order} --key2gen
+# python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -s ${speaker} -c ${context_window_size} -r ${ratio} -o ${dial_ids_order} --key2gen
 
-python ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --train_file ${train_file} \
-    --validation_file ${validation_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --max_source_length ${max_source_length} \
-    --max_target_length ${max_target_length} \
-    --truncation_side ${truncation_side} \
-    --model_name_or_path ${model_name_or_path} \
-    --do_train \
-    --do_eval \
-    --save_strategy epoch \
-    --evaluation_strategy epoch \
-    --save_total_limit 3 \
-    --prediction_loss_only \
-    --load_best_model_at_end \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_train_batch_size ${per_device_train_batch_size} \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
-    --gradient_accumulation_steps ${gradient_accumulation_steps} \
-    --learning_rate ${lr} \
-    --num_train_epochs ${num_train_epochs} \
-    --adafactor \
-    --gradient_checkpointing
+# python ../run_seq2seq.py \
+#     --task_name ${task_name} \
+#     --train_file ${train_file} \
+#     --validation_file ${validation_file} \
+#     --source_column ${source_column} \
+#     --target_column ${target_column} \
+#     --max_source_length ${max_source_length} \
+#     --max_target_length ${max_target_length} \
+#     --truncation_side ${truncation_side} \
+#     --model_name_or_path ${model_name_or_path} \
+#     --do_train \
+#     --do_eval \
+#     --save_strategy epoch \
+#     --evaluation_strategy epoch \
+#     --save_total_limit 1 \
+#     --prediction_loss_only \
+#     --load_best_model_at_end \
+#     --cache_dir ${cache_dir} \
+#     --output_dir ${output_dir} \
+#     --logging_dir ${logging_dir} \
+#     --overwrite_output_dir \
+#     --preprocessing_num_workers 4 \
+#     --per_device_train_batch_size ${per_device_train_batch_size} \
+#     --per_device_eval_batch_size ${per_device_eval_batch_size} \
+#     --gradient_accumulation_steps ${gradient_accumulation_steps} \
+#     --learning_rate ${lr} \
+#     --num_train_epochs ${num_train_epochs} \
+#     --adafactor \
+#     --gradient_checkpointing
 
-python ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --test_file ${test_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --max_source_length ${max_source_length} \
-    --max_target_length ${max_target_length} \
-    --truncation_side ${truncation_side} \
-    --model_name_or_path ${output_dir} \
-    --do_predict \
-    --predict_with_generate \
-    --metric_name_or_path ${metric_name_or_path} \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_train_batch_size ${per_device_train_batch_size} \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
-    --gradient_accumulation_steps ${gradient_accumulation_steps} \
-    --learning_rate ${lr} \
-    --num_train_epochs ${num_train_epochs} \
-    --adafactor \
-    --gradient_checkpointing
+# python ../run_seq2seq.py \
+#     --task_name ${task_name} \
+#     --test_file ${test_file} \
+#     --source_column ${source_column} \
+#     --target_column ${target_column} \
+#     --max_source_length ${max_source_length} \
+#     --max_target_length ${max_target_length} \
+#     --truncation_side ${truncation_side} \
+#     --model_name_or_path ${output_dir} \
+#     --do_predict \
+#     --predict_with_generate \
+#     --metric_name_or_path ${metric_name_or_path} \
+#     --cache_dir ${cache_dir} \
+#     --output_dir ${output_dir} \
+#     --logging_dir ${logging_dir} \
+#     --overwrite_output_dir \
+#     --preprocessing_num_workers 4 \
+#     --per_device_train_batch_size ${per_device_train_batch_size} \
+#     --per_device_eval_batch_size ${per_device_eval_batch_size} \
+#     --gradient_accumulation_steps ${gradient_accumulation_steps} \
+#     --learning_rate ${lr} \
+#     --num_train_epochs ${num_train_epochs} \
+#     --adafactor \
+#     --gradient_checkpointing
 
 python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json -o ${dial_ids_order}
 
diff --git a/convlab2/base_models/t5/key2gen/run_nlg_key2gen.sh b/convlab2/base_models/t5/key2gen/run_nlg_key2gen.sh
index 38807e9a97027a636a7cd54a552f60283fceda0a..f9b89aae85ca2963a162acde2f670caee0f1790e 100644
--- a/convlab2/base_models/t5/key2gen/run_nlg_key2gen.sh
+++ b/convlab2/base_models/t5/key2gen/run_nlg_key2gen.sh
@@ -17,68 +17,69 @@ target_column="response"
 truncation_side="left"
 max_source_length=512
 max_target_length=512
-model_name_or_path="t5-small"
+model_name_or_path="../../gpt/keyword_extraction/output/key2gen_noisy/gpt/dailydialog+metalwoz+tm1+tm2+tm3"
 per_device_train_batch_size=128
 per_device_eval_batch_size=64
 gradient_accumulation_steps=4
 lr=1e-3
 num_train_epochs=10
 
-python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -s ${speaker} -c ${context_window_size} --key2gen
+# python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -s ${speaker} -c ${context_window_size} --key2gen
 
-python ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --train_file ${train_file} \
-    --validation_file ${validation_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --max_source_length ${max_source_length} \
-    --max_target_length ${max_target_length} \
-    --truncation_side ${truncation_side} \
-    --model_name_or_path ${model_name_or_path} \
-    --do_train \
-    --do_eval \
-    --save_strategy epoch \
-    --evaluation_strategy epoch \
-    --save_total_limit 3 \
-    --prediction_loss_only \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_train_batch_size ${per_device_train_batch_size} \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
-    --gradient_accumulation_steps ${gradient_accumulation_steps} \
-    --learning_rate ${lr} \
-    --num_train_epochs ${num_train_epochs} \
-    --adafactor \
-    --gradient_checkpointing
+# python ../run_seq2seq.py \
+#     --task_name ${task_name} \
+#     --train_file ${train_file} \
+#     --validation_file ${validation_file} \
+#     --source_column ${source_column} \
+#     --target_column ${target_column} \
+#     --max_source_length ${max_source_length} \
+#     --max_target_length ${max_target_length} \
+#     --truncation_side ${truncation_side} \
+#     --model_name_or_path ${model_name_or_path} \
+#     --do_train \
+#     --do_eval \
+#     --save_strategy epoch \
+#     --evaluation_strategy epoch \
+#     --save_total_limit 1 \
+#     --prediction_loss_only \
+#     --load_best_model_at_end \
+#     --cache_dir ${cache_dir} \
+#     --output_dir ${output_dir} \
+#     --logging_dir ${logging_dir} \
+#     --overwrite_output_dir \
+#     --preprocessing_num_workers 4 \
+#     --per_device_train_batch_size ${per_device_train_batch_size} \
+#     --per_device_eval_batch_size ${per_device_eval_batch_size} \
+#     --gradient_accumulation_steps ${gradient_accumulation_steps} \
+#     --learning_rate ${lr} \
+#     --num_train_epochs ${num_train_epochs} \
+#     --adafactor \
+#     --gradient_checkpointing
 
-python ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --test_file ${test_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --max_source_length ${max_source_length} \
-    --max_target_length ${max_target_length} \
-    --truncation_side ${truncation_side} \
-    --model_name_or_path ${output_dir} \
-    --do_predict \
-    --predict_with_generate \
-    --metric_name_or_path ${metric_name_or_path} \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_train_batch_size ${per_device_train_batch_size} \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
-    --gradient_accumulation_steps ${gradient_accumulation_steps} \
-    --learning_rate ${lr} \
-    --num_train_epochs ${num_train_epochs} \
-    --adafactor \
-    --gradient_checkpointing
+# python ../run_seq2seq.py \
+#     --task_name ${task_name} \
+#     --test_file ${test_file} \
+#     --source_column ${source_column} \
+#     --target_column ${target_column} \
+#     --max_source_length ${max_source_length} \
+#     --max_target_length ${max_target_length} \
+#     --truncation_side ${truncation_side} \
+#     --model_name_or_path ${output_dir} \
+#     --do_predict \
+#     --predict_with_generate \
+#     --metric_name_or_path ${metric_name_or_path} \
+#     --cache_dir ${cache_dir} \
+#     --output_dir ${output_dir} \
+#     --logging_dir ${logging_dir} \
+#     --overwrite_output_dir \
+#     --preprocessing_num_workers 4 \
+#     --per_device_train_batch_size ${per_device_train_batch_size} \
+#     --per_device_eval_batch_size ${per_device_eval_batch_size} \
+#     --gradient_accumulation_steps ${gradient_accumulation_steps} \
+#     --learning_rate ${lr} \
+#     --num_train_epochs ${num_train_epochs} \
+#     --adafactor \
+#     --gradient_checkpointing
 
 python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
 
diff --git a/convlab2/base_models/t5/key2gen/run_persona.sh b/convlab2/base_models/t5/key2gen/run_persona.sh
new file mode 100644
index 0000000000000000000000000000000000000000..20ae23608ca8ebcd05cb937b53a088df924e0d66
--- /dev/null
+++ b/convlab2/base_models/t5/key2gen/run_persona.sh
@@ -0,0 +1,85 @@
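+# PersonaChat baseline: fine-tune t5-small to map context+persona to the response, then decode the test set.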
+n_gpus=1
+task_name="personachat"
+dataset_name="personachat"
+speaker="system"
+data_dir="data/${task_name}/${dataset_name}"
+output_dir="output/${task_name}/${dataset_name}"
+cache_dir="../cache"
+logging_dir="${output_dir}/runs"
+train_file="${data_dir}/train.json"
+validation_file="${data_dir}/validation.json"
+test_file="${data_dir}/test.json"
+metric_name_or_path="metric.py"
+metric_for_best_model="bleu"
+source_column="context+persona"
+target_column="response"
+truncation_side="left"
+max_source_length=1024
+max_target_length=512
+model_name_or_path="t5-small"
+per_device_train_batch_size=32
+per_device_eval_batch_size=64
+gradient_accumulation_steps=4
+lr=1e-3
+num_train_epochs=10
+
+python create_data_key2gen.py -t ${task_name} -d ${dataset_name}
+
+python ../run_seq2seq.py \
+    --task_name ${task_name} \
+    --train_file ${train_file} \
+    --validation_file ${validation_file} \
+    --source_column ${source_column} \
+    --target_column ${target_column} \
+    --max_source_length ${max_source_length} \
+    --max_target_length ${max_target_length} \
+    --truncation_side ${truncation_side} \
+    --model_name_or_path ${model_name_or_path} \
+    --do_train \
+    --do_eval \
+    --save_strategy epoch \
+    --evaluation_strategy epoch \
+    --save_total_limit 1 \
+    --prediction_loss_only \
+    --load_best_model_at_end \
+    --cache_dir ${cache_dir} \
+    --output_dir ${output_dir} \
+    --logging_dir ${logging_dir} \
+    --overwrite_output_dir \
+    --preprocessing_num_workers 4 \
+    --per_device_train_batch_size ${per_device_train_batch_size} \
+    --per_device_eval_batch_size ${per_device_eval_batch_size} \
+    --gradient_accumulation_steps ${gradient_accumulation_steps} \
+    --learning_rate ${lr} \
+    --num_train_epochs ${num_train_epochs} \
+    --adafactor \
+    --gradient_checkpointing
+
+python ../run_seq2seq.py \
+    --task_name ${task_name} \
+    --test_file ${test_file} \
+    --source_column ${source_column} \
+    --target_column ${target_column} \
+    --max_source_length ${max_source_length} \
+    --max_target_length ${max_target_length} \
+    --truncation_side ${truncation_side} \
+    --model_name_or_path ${output_dir} \
+    --do_predict \
+    --predict_with_generate \
+    --metric_name_or_path ${metric_name_or_path} \
+    --cache_dir ${cache_dir} \
+    --output_dir ${output_dir} \
+    --logging_dir ${logging_dir} \
+    --overwrite_output_dir \
+    --preprocessing_num_workers 4 \
+    --per_device_train_batch_size ${per_device_train_batch_size} \
+    --per_device_eval_batch_size ${per_device_eval_batch_size} \
+    --gradient_accumulation_steps ${gradient_accumulation_steps} \
+    --learning_rate ${lr} \
+    --num_train_epochs ${num_train_epochs} \
+    --adafactor \
+    --gradient_checkpointing
+
+# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
+
+# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
diff --git a/convlab2/base_models/t5/key2gen/run_persona_fewshot.sh b/convlab2/base_models/t5/key2gen/run_persona_fewshot.sh
new file mode 100644
index 0000000000000000000000000000000000000000..4ecf435e52293d2d8cc03411940ce55f5e73f3ab
--- /dev/null
+++ b/convlab2/base_models/t5/key2gen/run_persona_fewshot.sh
@@ -0,0 +1,87 @@
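+# Few-shot PersonaChat baseline: fine-tune t5-small on a ${ratio} fraction of training dialogs (split order ${dial_ids_order}).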
+n_gpus=1
+task_name="personachat"
+dataset_name="personachat"
+speaker="system"
+ratio=$1
+dial_ids_order=$2
+data_dir="data/${task_name}/${dataset_name}_${ratio}_order${dial_ids_order}"
+output_dir="output/${task_name}/${dataset_name}_${ratio}_order${dial_ids_order}"
+cache_dir="../cache"
+logging_dir="${output_dir}/runs"
+train_file="${data_dir}/train.json"
+validation_file="${data_dir}/validation.json"
+test_file="${data_dir}/test.json"
+metric_name_or_path="metric.py"
+metric_for_best_model="bleu"
+source_column="context+persona"
+target_column="response"
+truncation_side="left"
+max_source_length=1024
+max_target_length=512
+model_name_or_path="t5-small"
+per_device_train_batch_size=32
+per_device_eval_batch_size=64
+gradient_accumulation_steps=4
+lr=1e-3
+num_train_epochs=100
+
+python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -r ${ratio} -o ${dial_ids_order}
+
+python ../run_seq2seq.py \
+    --task_name ${task_name} \
+    --train_file ${train_file} \
+    --validation_file ${validation_file} \
+    --source_column ${source_column} \
+    --target_column ${target_column} \
+    --max_source_length ${max_source_length} \
+    --max_target_length ${max_target_length} \
+    --truncation_side ${truncation_side} \
+    --model_name_or_path ${model_name_or_path} \
+    --do_train \
+    --do_eval \
+    --save_strategy epoch \
+    --evaluation_strategy epoch \
+    --save_total_limit 1 \
+    --prediction_loss_only \
+    --load_best_model_at_end \
+    --cache_dir ${cache_dir} \
+    --output_dir ${output_dir} \
+    --logging_dir ${logging_dir} \
+    --overwrite_output_dir \
+    --preprocessing_num_workers 4 \
+    --per_device_train_batch_size ${per_device_train_batch_size} \
+    --per_device_eval_batch_size ${per_device_eval_batch_size} \
+    --gradient_accumulation_steps ${gradient_accumulation_steps} \
+    --learning_rate ${lr} \
+    --num_train_epochs ${num_train_epochs} \
+    --adafactor \
+    --gradient_checkpointing
+
+python ../run_seq2seq.py \
+    --task_name ${task_name} \
+    --test_file ${test_file} \
+    --source_column ${source_column} \
+    --target_column ${target_column} \
+    --max_source_length ${max_source_length} \
+    --max_target_length ${max_target_length} \
+    --truncation_side ${truncation_side} \
+    --model_name_or_path ${output_dir} \
+    --do_predict \
+    --predict_with_generate \
+    --metric_name_or_path ${metric_name_or_path} \
+    --cache_dir ${cache_dir} \
+    --output_dir ${output_dir} \
+    --logging_dir ${logging_dir} \
+    --overwrite_output_dir \
+    --preprocessing_num_workers 4 \
+    --per_device_train_batch_size ${per_device_train_batch_size} \
+    --per_device_eval_batch_size ${per_device_eval_batch_size} \
+    --gradient_accumulation_steps ${gradient_accumulation_steps} \
+    --learning_rate ${lr} \
+    --num_train_epochs ${num_train_epochs} \
+    --adafactor \
+    --gradient_checkpointing
+
+# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
+
+# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
diff --git a/convlab2/base_models/t5/key2gen/run_persona_fewshot_key2gen.sh b/convlab2/base_models/t5/key2gen/run_persona_fewshot_key2gen.sh
new file mode 100644
index 0000000000000000000000000000000000000000..6b95839a376e4857c387535ef0a47a5317530498
--- /dev/null
+++ b/convlab2/base_models/t5/key2gen/run_persona_fewshot_key2gen.sh
@@ -0,0 +1,87 @@
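+# Few-shot PersonaChat with the key2gen-pretrained checkpoint (dailydialog+metalwoz+tm1+tm2+tm3+wikidialog); data is serialized with --key2gen.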
+n_gpus=1
+task_name="personachat"
+dataset_name="personachat"
+speaker="system"
+ratio=$1
+dial_ids_order=$2
+data_dir="data/${task_name}/key2gen_${dataset_name}_${ratio}_order${dial_ids_order}"
+output_dir="output/${task_name}/key2gen_${dataset_name}_${ratio}_order${dial_ids_order}"
+cache_dir="../cache"
+logging_dir="${output_dir}/runs"
+train_file="${data_dir}/train.json"
+validation_file="${data_dir}/validation.json"
+test_file="${data_dir}/test.json"
+metric_name_or_path="metric.py"
+metric_for_best_model="bleu"
+source_column="context+persona"
+target_column="response"
+truncation_side="left"
+max_source_length=1024
+max_target_length=512
+model_name_or_path="../../gpt/keyword_extraction/output/key2gen_noisy/gpt/dailydialog+metalwoz+tm1+tm2+tm3+wikidialog"
+per_device_train_batch_size=32
+per_device_eval_batch_size=64
+gradient_accumulation_steps=4
+lr=1e-3
+num_train_epochs=100
+
+python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -r ${ratio} -o ${dial_ids_order} --key2gen
+
+python ../run_seq2seq.py \
+    --task_name ${task_name} \
+    --train_file ${train_file} \
+    --validation_file ${validation_file} \
+    --source_column ${source_column} \
+    --target_column ${target_column} \
+    --max_source_length ${max_source_length} \
+    --max_target_length ${max_target_length} \
+    --truncation_side ${truncation_side} \
+    --model_name_or_path ${model_name_or_path} \
+    --do_train \
+    --do_eval \
+    --save_strategy epoch \
+    --evaluation_strategy epoch \
+    --save_total_limit 1 \
+    --prediction_loss_only \
+    --load_best_model_at_end \
+    --cache_dir ${cache_dir} \
+    --output_dir ${output_dir} \
+    --logging_dir ${logging_dir} \
+    --overwrite_output_dir \
+    --preprocessing_num_workers 4 \
+    --per_device_train_batch_size ${per_device_train_batch_size} \
+    --per_device_eval_batch_size ${per_device_eval_batch_size} \
+    --gradient_accumulation_steps ${gradient_accumulation_steps} \
+    --learning_rate ${lr} \
+    --num_train_epochs ${num_train_epochs} \
+    --adafactor \
+    --gradient_checkpointing
+
+python ../run_seq2seq.py \
+    --task_name ${task_name} \
+    --test_file ${test_file} \
+    --source_column ${source_column} \
+    --target_column ${target_column} \
+    --max_source_length ${max_source_length} \
+    --max_target_length ${max_target_length} \
+    --truncation_side ${truncation_side} \
+    --model_name_or_path ${output_dir} \
+    --do_predict \
+    --predict_with_generate \
+    --metric_name_or_path ${metric_name_or_path} \
+    --cache_dir ${cache_dir} \
+    --output_dir ${output_dir} \
+    --logging_dir ${logging_dir} \
+    --overwrite_output_dir \
+    --preprocessing_num_workers 4 \
+    --per_device_train_batch_size ${per_device_train_batch_size} \
+    --per_device_eval_batch_size ${per_device_eval_batch_size} \
+    --gradient_accumulation_steps ${gradient_accumulation_steps} \
+    --learning_rate ${lr} \
+    --num_train_epochs ${num_train_epochs} \
+    --adafactor \
+    --gradient_checkpointing
+
+# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
+
+# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
diff --git a/convlab2/base_models/t5/key2gen/run_persona_key2gen.sh b/convlab2/base_models/t5/key2gen/run_persona_key2gen.sh
new file mode 100644
index 0000000000000000000000000000000000000000..a2fea9b1fea8540731fd7e3d499156bfe7025690
--- /dev/null
+++ b/convlab2/base_models/t5/key2gen/run_persona_key2gen.sh
@@ -0,0 +1,85 @@
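+# Full-data PersonaChat with the key2gen-pretrained checkpoint; data is serialized with --key2gen.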
+n_gpus=1
+task_name="personachat"
+dataset_name="personachat"
+speaker="system"
+data_dir="data/${task_name}/key2gen_${dataset_name}"
+output_dir="output/${task_name}/key2gen_${dataset_name}"
+cache_dir="../cache"
+logging_dir="${output_dir}/runs"
+train_file="${data_dir}/train.json"
+validation_file="${data_dir}/validation.json"
+test_file="${data_dir}/test.json"
+metric_name_or_path="metric.py"
+metric_for_best_model="bleu"
+source_column="context+persona"
+target_column="response"
+truncation_side="left"
+max_source_length=1024
+max_target_length=512
+model_name_or_path="../../gpt/keyword_extraction/output/key2gen_noisy/gpt/dailydialog+metalwoz+tm1+tm2+tm3+wikidialog"
+per_device_train_batch_size=32
+per_device_eval_batch_size=64
+gradient_accumulation_steps=4
+lr=1e-3
+num_train_epochs=10
+
+python create_data_key2gen.py -t ${task_name} -d ${dataset_name} --key2gen
+
+python ../run_seq2seq.py \
+    --task_name ${task_name} \
+    --train_file ${train_file} \
+    --validation_file ${validation_file} \
+    --source_column ${source_column} \
+    --target_column ${target_column} \
+    --max_source_length ${max_source_length} \
+    --max_target_length ${max_target_length} \
+    --truncation_side ${truncation_side} \
+    --model_name_or_path ${model_name_or_path} \
+    --do_train \
+    --do_eval \
+    --save_strategy epoch \
+    --evaluation_strategy epoch \
+    --save_total_limit 1 \
+    --prediction_loss_only \
+    --load_best_model_at_end \
+    --cache_dir ${cache_dir} \
+    --output_dir ${output_dir} \
+    --logging_dir ${logging_dir} \
+    --overwrite_output_dir \
+    --preprocessing_num_workers 4 \
+    --per_device_train_batch_size ${per_device_train_batch_size} \
+    --per_device_eval_batch_size ${per_device_eval_batch_size} \
+    --gradient_accumulation_steps ${gradient_accumulation_steps} \
+    --learning_rate ${lr} \
+    --num_train_epochs ${num_train_epochs} \
+    --adafactor \
+    --gradient_checkpointing
+
+python ../run_seq2seq.py \
+    --task_name ${task_name} \
+    --test_file ${test_file} \
+    --source_column ${source_column} \
+    --target_column ${target_column} \
+    --max_source_length ${max_source_length} \
+    --max_target_length ${max_target_length} \
+    --truncation_side ${truncation_side} \
+    --model_name_or_path ${output_dir} \
+    --do_predict \
+    --predict_with_generate \
+    --metric_name_or_path ${metric_name_or_path} \
+    --cache_dir ${cache_dir} \
+    --output_dir ${output_dir} \
+    --logging_dir ${logging_dir} \
+    --overwrite_output_dir \
+    --preprocessing_num_workers 4 \
+    --per_device_train_batch_size ${per_device_train_batch_size} \
+    --per_device_eval_batch_size ${per_device_eval_batch_size} \
+    --gradient_accumulation_steps ${gradient_accumulation_steps} \
+    --learning_rate ${lr} \
+    --num_train_epochs ${num_train_epochs} \
+    --adafactor \
+    --gradient_checkpointing
+
+# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
+
+# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
diff --git a/convlab2/base_models/t5/key2gen/run_wow.sh b/convlab2/base_models/t5/key2gen/run_wow.sh
new file mode 100644
index 0000000000000000000000000000000000000000..0fd9fb78ea5e260ef30fdf00658d4a390421f900
--- /dev/null
+++ b/convlab2/base_models/t5/key2gen/run_wow.sh
@@ -0,0 +1,85 @@
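+# Wizard-of-Wikipedia baseline: fine-tune t5-small to map context+knowledge to the response; prediction uses the unseen test split.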
+n_gpus=1
+task_name="wow"
+dataset_name="wow"
+speaker="system"
+data_dir="data/${task_name}/${dataset_name}"
+output_dir="output/${task_name}/${dataset_name}"
+cache_dir="../cache"
+logging_dir="${output_dir}/runs"
+train_file="${data_dir}/train.json"
+validation_file="${data_dir}/validation.json"
+test_file="${data_dir}/test_unseen.json"
+metric_name_or_path="metric.py"
+metric_for_best_model="bleu"
+source_column="context+knowledge"
+target_column="response"
+truncation_side="left"
+max_source_length=1024
+max_target_length=512
+model_name_or_path="t5-small"
+per_device_train_batch_size=32
+per_device_eval_batch_size=64
+gradient_accumulation_steps=4
+lr=1e-3
+num_train_epochs=10
+
+python create_data_key2gen.py -t ${task_name} -d ${dataset_name}
+
+python ../run_seq2seq.py \
+    --task_name ${task_name} \
+    --train_file ${train_file} \
+    --validation_file ${validation_file} \
+    --source_column ${source_column} \
+    --target_column ${target_column} \
+    --max_source_length ${max_source_length} \
+    --max_target_length ${max_target_length} \
+    --truncation_side ${truncation_side} \
+    --model_name_or_path ${model_name_or_path} \
+    --do_train \
+    --do_eval \
+    --save_strategy epoch \
+    --evaluation_strategy epoch \
+    --save_total_limit 1 \
+    --prediction_loss_only \
+    --load_best_model_at_end \
+    --cache_dir ${cache_dir} \
+    --output_dir ${output_dir} \
+    --logging_dir ${logging_dir} \
+    --overwrite_output_dir \
+    --preprocessing_num_workers 4 \
+    --per_device_train_batch_size ${per_device_train_batch_size} \
+    --per_device_eval_batch_size ${per_device_eval_batch_size} \
+    --gradient_accumulation_steps ${gradient_accumulation_steps} \
+    --learning_rate ${lr} \
+    --num_train_epochs ${num_train_epochs} \
+    --adafactor \
+    --gradient_checkpointing
+
+python ../run_seq2seq.py \
+    --task_name ${task_name} \
+    --test_file ${test_file} \
+    --source_column ${source_column} \
+    --target_column ${target_column} \
+    --max_source_length ${max_source_length} \
+    --max_target_length ${max_target_length} \
+    --truncation_side ${truncation_side} \
+    --model_name_or_path ${output_dir} \
+    --do_predict \
+    --predict_with_generate \
+    --metric_name_or_path ${metric_name_or_path} \
+    --cache_dir ${cache_dir} \
+    --output_dir ${output_dir} \
+    --logging_dir ${logging_dir} \
+    --overwrite_output_dir \
+    --preprocessing_num_workers 4 \
+    --per_device_train_batch_size ${per_device_train_batch_size} \
+    --per_device_eval_batch_size ${per_device_eval_batch_size} \
+    --gradient_accumulation_steps ${gradient_accumulation_steps} \
+    --learning_rate ${lr} \
+    --num_train_epochs ${num_train_epochs} \
+    --adafactor \
+    --gradient_checkpointing
+
+# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
+
+# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
diff --git a/convlab2/base_models/t5/key2gen/run_wow_fewshot.sh b/convlab2/base_models/t5/key2gen/run_wow_fewshot.sh
new file mode 100644
index 0000000000000000000000000000000000000000..c9cdbe645d771d099403c9f840a08114e005a18d
--- /dev/null
+++ b/convlab2/base_models/t5/key2gen/run_wow_fewshot.sh
@@ -0,0 +1,87 @@
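+# Few-shot Wizard-of-Wikipedia baseline: fine-tune t5-small on a ${ratio} fraction of training dialogs (split order ${dial_ids_order}).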
+n_gpus=1
+task_name="wow"
+dataset_name="wow"
+speaker="system"
+ratio=$1
+dial_ids_order=$2
+data_dir="data/${task_name}/${dataset_name}_${ratio}_order${dial_ids_order}"
+output_dir="output/${task_name}/${dataset_name}_${ratio}_order${dial_ids_order}"
+cache_dir="../cache"
+logging_dir="${output_dir}/runs"
+train_file="${data_dir}/train.json"
+validation_file="${data_dir}/validation.json"
+test_file="${data_dir}/test_unseen.json"
+metric_name_or_path="metric.py"
+metric_for_best_model="bleu"
+source_column="context+knowledge"
+target_column="response"
+truncation_side="left"
+max_source_length=1024
+max_target_length=512
+model_name_or_path="t5-small"
+per_device_train_batch_size=32
+per_device_eval_batch_size=64
+gradient_accumulation_steps=4
+lr=1e-3
+num_train_epochs=100
+
+python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -r ${ratio} -o ${dial_ids_order}
+
+python ../run_seq2seq.py \
+    --task_name ${task_name} \
+    --train_file ${train_file} \
+    --validation_file ${validation_file} \
+    --source_column ${source_column} \
+    --target_column ${target_column} \
+    --max_source_length ${max_source_length} \
+    --max_target_length ${max_target_length} \
+    --truncation_side ${truncation_side} \
+    --model_name_or_path ${model_name_or_path} \
+    --do_train \
+    --do_eval \
+    --save_strategy epoch \
+    --evaluation_strategy epoch \
+    --save_total_limit 1 \
+    --prediction_loss_only \
+    --load_best_model_at_end \
+    --cache_dir ${cache_dir} \
+    --output_dir ${output_dir} \
+    --logging_dir ${logging_dir} \
+    --overwrite_output_dir \
+    --preprocessing_num_workers 4 \
+    --per_device_train_batch_size ${per_device_train_batch_size} \
+    --per_device_eval_batch_size ${per_device_eval_batch_size} \
+    --gradient_accumulation_steps ${gradient_accumulation_steps} \
+    --learning_rate ${lr} \
+    --num_train_epochs ${num_train_epochs} \
+    --adafactor \
+    --gradient_checkpointing
+
+python ../run_seq2seq.py \
+    --task_name ${task_name} \
+    --test_file ${test_file} \
+    --source_column ${source_column} \
+    --target_column ${target_column} \
+    --max_source_length ${max_source_length} \
+    --max_target_length ${max_target_length} \
+    --truncation_side ${truncation_side} \
+    --model_name_or_path ${output_dir} \
+    --do_predict \
+    --predict_with_generate \
+    --metric_name_or_path ${metric_name_or_path} \
+    --cache_dir ${cache_dir} \
+    --output_dir ${output_dir} \
+    --logging_dir ${logging_dir} \
+    --overwrite_output_dir \
+    --preprocessing_num_workers 4 \
+    --per_device_train_batch_size ${per_device_train_batch_size} \
+    --per_device_eval_batch_size ${per_device_eval_batch_size} \
+    --gradient_accumulation_steps ${gradient_accumulation_steps} \
+    --learning_rate ${lr} \
+    --num_train_epochs ${num_train_epochs} \
+    --adafactor \
+    --gradient_checkpointing
+
+# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
+
+# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
diff --git a/convlab2/base_models/t5/key2gen/run_wow_fewshot_key2gen.sh b/convlab2/base_models/t5/key2gen/run_wow_fewshot_key2gen.sh
new file mode 100644
index 0000000000000000000000000000000000000000..6aaf8ca4dd160f8c108281413441e021b5bd0571
--- /dev/null
+++ b/convlab2/base_models/t5/key2gen/run_wow_fewshot_key2gen.sh
@@ -0,0 +1,87 @@
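+# Few-shot Wizard-of-Wikipedia with the key2gen-pretrained checkpoint; data is serialized with --key2gen.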
+n_gpus=1
+task_name="wow"
+dataset_name="wow"
+speaker="system"
+ratio=$1
+dial_ids_order=$2
+data_dir="data/${task_name}/key2gen_${dataset_name}_${ratio}_order${dial_ids_order}"
+output_dir="output/${task_name}/key2gen_${dataset_name}_${ratio}_order${dial_ids_order}"
+cache_dir="../cache"
+logging_dir="${output_dir}/runs"
+train_file="${data_dir}/train.json"
+validation_file="${data_dir}/validation.json"
+test_file="${data_dir}/test_unseen.json"
+metric_name_or_path="metric.py"
+metric_for_best_model="bleu"
+source_column="context+knowledge"
+target_column="response"
+truncation_side="left"
+max_source_length=1024
+max_target_length=512
+model_name_or_path="../../gpt/keyword_extraction/output/key2gen_noisy/gpt/dailydialog+metalwoz+tm1+tm2+tm3+wikidialog"
+per_device_train_batch_size=32
+per_device_eval_batch_size=64
+gradient_accumulation_steps=4
+lr=1e-3
+num_train_epochs=100
+
+python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -r ${ratio} -o ${dial_ids_order} --key2gen
+
+python ../run_seq2seq.py \
+    --task_name ${task_name} \
+    --train_file ${train_file} \
+    --validation_file ${validation_file} \
+    --source_column ${source_column} \
+    --target_column ${target_column} \
+    --max_source_length ${max_source_length} \
+    --max_target_length ${max_target_length} \
+    --truncation_side ${truncation_side} \
+    --model_name_or_path ${model_name_or_path} \
+    --do_train \
+    --do_eval \
+    --save_strategy epoch \
+    --evaluation_strategy epoch \
+    --save_total_limit 1 \
+    --prediction_loss_only \
+    --load_best_model_at_end \
+    --cache_dir ${cache_dir} \
+    --output_dir ${output_dir} \
+    --logging_dir ${logging_dir} \
+    --overwrite_output_dir \
+    --preprocessing_num_workers 4 \
+    --per_device_train_batch_size ${per_device_train_batch_size} \
+    --per_device_eval_batch_size ${per_device_eval_batch_size} \
+    --gradient_accumulation_steps ${gradient_accumulation_steps} \
+    --learning_rate ${lr} \
+    --num_train_epochs ${num_train_epochs} \
+    --adafactor \
+    --gradient_checkpointing
+
+python ../run_seq2seq.py \
+    --task_name ${task_name} \
+    --test_file ${test_file} \
+    --source_column ${source_column} \
+    --target_column ${target_column} \
+    --max_source_length ${max_source_length} \
+    --max_target_length ${max_target_length} \
+    --truncation_side ${truncation_side} \
+    --model_name_or_path ${output_dir} \
+    --do_predict \
+    --predict_with_generate \
+    --metric_name_or_path ${metric_name_or_path} \
+    --cache_dir ${cache_dir} \
+    --output_dir ${output_dir} \
+    --logging_dir ${logging_dir} \
+    --overwrite_output_dir \
+    --preprocessing_num_workers 4 \
+    --per_device_train_batch_size ${per_device_train_batch_size} \
+    --per_device_eval_batch_size ${per_device_eval_batch_size} \
+    --gradient_accumulation_steps ${gradient_accumulation_steps} \
+    --learning_rate ${lr} \
+    --num_train_epochs ${num_train_epochs} \
+    --adafactor \
+    --gradient_checkpointing
+
+# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
+
+# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
diff --git a/convlab2/base_models/t5/key2gen/run_wow_key2gen.sh b/convlab2/base_models/t5/key2gen/run_wow_key2gen.sh
new file mode 100644
index 0000000000000000000000000000000000000000..b5badb7f7fefb979f6d9dea201d87e384a403493
--- /dev/null
+++ b/convlab2/base_models/t5/key2gen/run_wow_key2gen.sh
@@ -0,0 +1,85 @@
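+# Full-data Wizard-of-Wikipedia with the key2gen-pretrained checkpoint; data is serialized with --key2gen.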
+n_gpus=1
+task_name="wow"
+dataset_name="wow"
+speaker="system"
+data_dir="data/${task_name}/key2gen_${dataset_name}"
+output_dir="output/${task_name}/key2gen_${dataset_name}"
+cache_dir="../cache"
+logging_dir="${output_dir}/runs"
+train_file="${data_dir}/train.json"
+validation_file="${data_dir}/validation.json"
+test_file="${data_dir}/test_unseen.json"
+metric_name_or_path="metric.py"
+metric_for_best_model="bleu"
+source_column="context+knowledge"
+target_column="response"
+truncation_side="left"
+max_source_length=1024
+max_target_length=512
+model_name_or_path="../../gpt/keyword_extraction/output/key2gen_noisy/gpt/dailydialog+metalwoz+tm1+tm2+tm3+wikidialog"
+per_device_train_batch_size=32
+per_device_eval_batch_size=64
+gradient_accumulation_steps=4
+lr=1e-3
+num_train_epochs=10
+
+python create_data_key2gen.py -t ${task_name} -d ${dataset_name} --key2gen
+
+python ../run_seq2seq.py \
+    --task_name ${task_name} \
+    --train_file ${train_file} \
+    --validation_file ${validation_file} \
+    --source_column ${source_column} \
+    --target_column ${target_column} \
+    --max_source_length ${max_source_length} \
+    --max_target_length ${max_target_length} \
+    --truncation_side ${truncation_side} \
+    --model_name_or_path ${model_name_or_path} \
+    --do_train \
+    --do_eval \
+    --save_strategy epoch \
+    --evaluation_strategy epoch \
+    --save_total_limit 1 \
+    --prediction_loss_only \
+    --load_best_model_at_end \
+    --cache_dir ${cache_dir} \
+    --output_dir ${output_dir} \
+    --logging_dir ${logging_dir} \
+    --overwrite_output_dir \
+    --preprocessing_num_workers 4 \
+    --per_device_train_batch_size ${per_device_train_batch_size} \
+    --per_device_eval_batch_size ${per_device_eval_batch_size} \
+    --gradient_accumulation_steps ${gradient_accumulation_steps} \
+    --learning_rate ${lr} \
+    --num_train_epochs ${num_train_epochs} \
+    --adafactor \
+    --gradient_checkpointing
+
+python ../run_seq2seq.py \
+    --task_name ${task_name} \
+    --test_file ${test_file} \
+    --source_column ${source_column} \
+    --target_column ${target_column} \
+    --max_source_length ${max_source_length} \
+    --max_target_length ${max_target_length} \
+    --truncation_side ${truncation_side} \
+    --model_name_or_path ${output_dir} \
+    --do_predict \
+    --predict_with_generate \
+    --metric_name_or_path ${metric_name_or_path} \
+    --cache_dir ${cache_dir} \
+    --output_dir ${output_dir} \
+    --logging_dir ${logging_dir} \
+    --overwrite_output_dir \
+    --preprocessing_num_workers 4 \
+    --per_device_train_batch_size ${per_device_train_batch_size} \
+    --per_device_eval_batch_size ${per_device_eval_batch_size} \
+    --gradient_accumulation_steps ${gradient_accumulation_steps} \
+    --learning_rate ${lr} \
+    --num_train_epochs ${num_train_epochs} \
+    --adafactor \
+    --gradient_checkpointing
+
+# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
+
+# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}