diff --git a/convlab/base_models/gpt/keyword_extraction/run.sh b/convlab/base_models/gpt/keyword_extraction/run.sh
deleted file mode 100644
index f35c2403ce21f9450d3d7a84dc8e7076ee6f5f89..0000000000000000000000000000000000000000
--- a/convlab/base_models/gpt/keyword_extraction/run.sh
+++ /dev/null
@@ -1,5 +0,0 @@
-set -e
-for dataset_name in dailydialog metalwoz tm1 tm2 tm3 sgd multiwoz21
-do
-    bash get_keywords.sh ${dataset_name}
-done
\ No newline at end of file
diff --git a/convlab/base_models/gpt/keyword_extraction/test_t5_key2gen.sh b/convlab/base_models/gpt/keyword_extraction/test_t5_key2gen.sh
deleted file mode 100644
index faaef560c20bd1a928f9c99503277780c4e8c26d..0000000000000000000000000000000000000000
--- a/convlab/base_models/gpt/keyword_extraction/test_t5_key2gen.sh
+++ /dev/null
@@ -1,49 +0,0 @@
-set -e
-n_gpus=2
-master_port=23457
-task_name="key2gen_noisy"
-dataset_name=$1
-model_type="gpt"
-data_dir="data/${task_name}/${model_type}/${dataset_name}"
-output_dir="output/${task_name}/${model_type}/${dataset_name}"
-cache_dir="../cache"
-logging_dir="${output_dir}/runs"
-train_file="${data_dir}/train.json"
-validation_file="${data_dir}/validation.json"
-test_file="${data_dir}/test.json"
-source_column="source"
-target_column="target"
-truncation_side="left"
-max_source_length=512
-max_target_length=128
-model_name_or_path="output/${task_name}/${model_type}/dailydialog+metalwoz+tm1+tm2+tm3"
-per_device_train_batch_size=128
-per_device_eval_batch_size=128
-gradient_accumulation_steps=2
-lr=1e-3
-num_train_epochs=3
-
-python -m torch.distributed.launch --master_port ${master_port} \
-    --nproc_per_node ${n_gpus} ../../t5/run_seq2seq.py \
-    --task_name ${task_name} \
-    --test_file ${test_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --max_source_length ${max_source_length} \
-    --max_target_length ${max_target_length} \
-    --truncation_side ${truncation_side} \
-    --model_name_or_path ${model_name_or_path} \
-    --do_predict \
-    --predict_with_generate \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 16 \
-    --per_device_train_batch_size ${per_device_train_batch_size} \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
-    --gradient_accumulation_steps ${gradient_accumulation_steps} \
-    --learning_rate ${lr} \
-    --num_train_epochs ${num_train_epochs} \
-    --adafactor \
-    --gradient_checkpointing
diff --git a/convlab/base_models/gpt/keyword_extraction/train_t5_key2gen.sh b/convlab/base_models/gpt/keyword_extraction/train_t5_key2gen.sh
deleted file mode 100644
index 36e24587d3d7181e4a92b649e5f1b1e47cdcd9f3..0000000000000000000000000000000000000000
--- a/convlab/base_models/gpt/keyword_extraction/train_t5_key2gen.sh
+++ /dev/null
@@ -1,56 +0,0 @@
-set -e
-n_gpus=2
-master_port=23457
-task_name="key2gen_noisy"
-dataset_name="dailydialog+metalwoz+tm1+tm2+tm3+wikidialog"
-model_type="gpt"
-data_dir="data/${task_name}/${model_type}/${dataset_name}"
-output_dir="output/${task_name}/${model_type}/${dataset_name}"
-cache_dir="../cache"
-logging_dir="${output_dir}/runs"
-train_file="${data_dir}/train.json"
-validation_file="${data_dir}/validation.json"
-test_file="${data_dir}/test.json"
-source_column="source"
-target_column="target"
-truncation_side="left"
-max_source_length=512
-max_target_length=128
-model_name_or_path="t5-small"
-per_device_train_batch_size=128
-per_device_eval_batch_size=128
-gradient_accumulation_steps=4
-lr=1e-3
-num_train_epochs=3
-
-python -m torch.distributed.launch --master_port ${master_port} \
-    --nproc_per_node ${n_gpus} ../../t5/run_seq2seq.py \
-    --task_name ${task_name} \
-    --train_file ${train_file} \
-    --validation_file ${validation_file} \
-    --test_file ${test_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --max_source_length ${max_source_length} \
-    --max_target_length ${max_target_length} \
-    --truncation_side ${truncation_side} \
-    --model_name_or_path ${model_name_or_path} \
-    --do_train \
-    --do_eval \
-    --do_predict \
-    --save_strategy epoch \
-    --evaluation_strategy epoch \
-    --load_best_model_at_end \
-    --prediction_loss_only \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 16 \
-    --per_device_train_batch_size ${per_device_train_batch_size} \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
-    --gradient_accumulation_steps ${gradient_accumulation_steps} \
-    --learning_rate ${lr} \
-    --num_train_epochs ${num_train_epochs} \
-    --adafactor \
-    --gradient_checkpointing
diff --git a/convlab/base_models/gpt/keyword_extraction/train_t5_rg.sh b/convlab/base_models/gpt/keyword_extraction/train_t5_rg.sh
index 4d628f7f2c53d766c4a0b92861ac7681b8c80b02..b2de55410064b0234f8416b6338e2a070c79147f 100644
--- a/convlab/base_models/gpt/keyword_extraction/train_t5_rg.sh
+++ b/convlab/base_models/gpt/keyword_extraction/train_t5_rg.sh
@@ -4,8 +4,9 @@ master_port=23456
 task_name="rg"
 dataset_name="dailydialog+metalwoz+tm1+tm2+tm3+sgd+reddit+wikidialog"
 model_type="gpt"
+model_name="t5-small"
 data_dir="data/${task_name}/${model_type}/${dataset_name}"
-output_dir="output/${task_name}/${model_type}/${dataset_name}"
+output_dir="output/${task_name}/${model_name}/${dataset_name}"
 cache_dir="../cache"
 logging_dir="${output_dir}/runs"
 train_file="${data_dir}/train.json"
@@ -14,7 +15,7 @@ target_column="target"
 truncation_side="left"
 max_source_length=512
 max_target_length=128
-model_name_or_path="t5-small"
+model_name_or_path="${model_name}"
 per_device_train_batch_size=64
 per_device_eval_batch_size=128
 gradient_accumulation_steps=1
diff --git a/convlab/base_models/gpt/keyword_extraction/train_t5_multitask.sh b/convlab/base_models/gpt/keyword_extraction/train_t5_rg_key2gen+key2gen_noisy.sh
similarity index 91%
rename from convlab/base_models/gpt/keyword_extraction/train_t5_multitask.sh
rename to convlab/base_models/gpt/keyword_extraction/train_t5_rg_key2gen+key2gen_noisy.sh
index dd8d5a460478ae068143b5d18914500e31bae439..74c418164815cfd538e17cb08cd0de7c24ba7624 100644
--- a/convlab/base_models/gpt/keyword_extraction/train_t5_multitask.sh
+++ b/convlab/base_models/gpt/keyword_extraction/train_t5_rg_key2gen+key2gen_noisy.sh
@@ -4,8 +4,9 @@ master_port=23456
 task_name="key2gen+key2gen_noisy"
 dataset_name="dailydialog+metalwoz+tm1+tm2+tm3+sgd+reddit+wikidialog"
 model_type="gpt"
+model_name="t5-small"
 data_dir="data/${task_name}/${model_type}/${dataset_name}"
-output_dir="output/${task_name}/${model_type}/${dataset_name}"
+output_dir="output/${task_name}/${model_name}/${dataset_name}"
 cache_dir="../cache"
 logging_dir="${output_dir}/runs"
 train_file="${data_dir}/train.json"
@@ -14,7 +15,7 @@ target_column="target"
 truncation_side="left"
 max_source_length=512
 max_target_length=128
-model_name_or_path="output/rg/${model_type}/${dataset_name}"
+model_name_or_path="output/rg/${model_name}/${dataset_name}"
 per_device_train_batch_size=64
 per_device_eval_batch_size=128
 gradient_accumulation_steps=1
diff --git a/convlab/base_models/t5/key2gen/create_data.py b/convlab/base_models/t5/key2gen/create_data.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb4e12c8e720f3031808ec0972c38926a617ef68
--- /dev/null
+++ b/convlab/base_models/t5/key2gen/create_data.py
@@ -0,0 +1,159 @@
+import os
+import json
+from tqdm import tqdm
+from convlab.util import load_dataset, load_unified_data, load_nlu_data
+
+def create_nlg_data(dataset, data_dir, args):
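+    # System responses paired with up to 3 preceding context turns; the dialogue acts are the grounded knowledge.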
+    data_by_split = load_nlu_data(dataset, speaker='system', use_context=True, context_window_size=3)
+    os.makedirs(data_dir, exist_ok=True)
+
+    data_splits = data_by_split.keys()
+    for data_split in data_splits:
+        data = []
+        for sample in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False):
+            context = [(turn['speaker'], turn['utterance']) for turn in sample['context']]
+            response = sample['utterance']
+            if len(context) > 0 and len(response) > 0:
+                knowledge = sample['dialogue_acts']
+                data.append(json.dumps({'context': context, 'knowledge': knowledge, 'response': response}, ensure_ascii=False)+'\n')
+
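+        # Test sets are identical across shot/order settings, so write them to the shared parent task directory.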
+        if 'test' in data_split:
+            file_name = os.path.join(os.path.dirname(data_dir), f"{data_split}.json")
+        else:
+            file_name = os.path.join(data_dir, f"{data_split}.json")
+        with open(file_name, "w", encoding='utf-8') as f:
+            f.writelines(data)
+        data_by_split[data_split] = data
+    return data_by_split
+
+def create_kvret_data(dataset, data_dir, args):
+    data_by_split = load_unified_data(dataset, speaker='system', utterance=True, db_results=True, use_context=True, context_window_size=100)
+    os.makedirs(data_dir, exist_ok=True)
+
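+    # Each KVRET domain names its primary-entity column differently; rename it to a common
+    # 'entity' key so the KB can be linearized uniformly (see dataset_vanilla.py).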
+    domain2entity_col = {'schedule': 'event', 'navigate': 'poi', 'weather': 'location'}
+    data_splits = data_by_split.keys()
+    for data_split in data_splits:
+        data = []
+        for sample in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False):
+            context = [(turn['speaker'], turn['utterance']) for turn in sample['context']]
+            response = sample['utterance']
+            if len(context) > 0 and len(response) > 0:
+                knowledge = sample['db_results']
+                for domain, db_items in knowledge.items():
+                    entity_col = domain2entity_col[domain]
+                    for db_item in db_items:
+                        db_item['entity'] = db_item.pop(entity_col)
+
+                data.append(json.dumps({'context': context, 'knowledge': knowledge, 'response': response}, ensure_ascii=False)+'\n')
+
+        if 'test' in data_split:
+            file_name = os.path.join(os.path.dirname(data_dir), f"{data_split}.json")
+        else:
+            file_name = os.path.join(data_dir, f"{data_split}.json")
+        with open(file_name, "w", encoding='utf-8') as f:
+            f.writelines(data)
+        data_by_split[data_split] = data
+    return data_by_split
+
+def create_personachat_data(dataset, data_dir, args):
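+    # The system speaker's persona sentences are the grounded knowledge for every system turn.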
+    data_by_split = dataset
+    os.makedirs(data_dir, exist_ok=True)
+
+    data_splits = data_by_split.keys()
+    for data_split in data_splits:
+        data = []
+        for dial in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False):
+            knowledge = dial['persona']['system']
+            context = []
+            for turn in dial['turns']:
+                response = turn['utterance']
+                if turn['speaker'] == 'system' and len(context) > 0 and len(response) > 0:
+                    data.append(json.dumps({'context': context, 'knowledge': knowledge, 'response': response}, ensure_ascii=False)+'\n')
+                context.append((turn['speaker'], turn['utterance']))
+
+        if 'test' in data_split:
+            file_name = os.path.join(os.path.dirname(data_dir), f"{data_split}.json")
+        else:
+            file_name = os.path.join(data_dir, f"{data_split}.json")
+        with open(file_name, "w", encoding='utf-8') as f:
+            f.writelines(data)
+        data_by_split[data_split] = data
+    return data_by_split
+
+def create_wow_data(dataset, data_dir, args):
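+    # The wizard's checked passage is the grounded knowledge; it is normalized to a list of strings below.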
+    data_by_split = dataset
+    os.makedirs(data_dir, exist_ok=True)
+
+    data_splits = data_by_split.keys()
+    for data_split in data_splits:
+        data = []
+        for dial in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False):
+            context = []
+            for turn in dial['turns']:
+                response = turn['utterance']
+                if turn['speaker'] == 'system' and len(context) > 0 and len(response) > 0:
+                    knowledge = turn['checked_passage']
+                    if knowledge is None:
+                        knowledge = []
+                    elif isinstance(knowledge, str):
+                        knowledge = [knowledge]
+                    data.append(json.dumps({'context': context, 'knowledge': knowledge, 'response': response}, ensure_ascii=False)+'\n')
+                context.append((turn['speaker'], turn['utterance']))
+
+        if 'test' in data_split:
+            file_name = os.path.join(os.path.dirname(data_dir), f"{data_split}.json")
+        else:
+            file_name = os.path.join(data_dir, f"{data_split}.json")
+        with open(file_name, "w", encoding='utf-8') as f:
+            f.writelines(data)
+        data_by_split[data_split] = data
+    return data_by_split
+
+def create_opendialkg_data(dataset, data_dir, args):
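+    # Keep only system turns annotated with a KG path; the path's triples are the grounded knowledge.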
+    data_by_split = dataset
+    os.makedirs(data_dir, exist_ok=True)
+
+    data_splits = data_by_split.keys()
+    for data_split in data_splits:
+        data = []
+        for dial in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False):
+            context = []
+            for turn in dial['turns']:
+                response = turn['utterance']
+                if turn['speaker'] == 'system' and 'kg_path' in turn and len(context) > 0 and len(response) > 0:
+                    knowledge = turn['kg_path']['triples']
+                    data.append(json.dumps({'context': context, 'knowledge': knowledge, 'response': response}, ensure_ascii=False)+'\n')
+                context.append((turn['speaker'], turn['utterance']))
+
+        if 'test' in data_split:
+            file_name = os.path.join(os.path.dirname(data_dir), f"{data_split}.json")
+        else:
+            file_name = os.path.join(data_dir, f"{data_split}.json")
+        with open(file_name, "w", encoding='utf-8') as f:
+            f.writelines(data)
+        data_by_split[data_split] = data
+    return data_by_split
+
+
+if __name__ == '__main__':
+    from argparse import ArgumentParser
+    parser = ArgumentParser(description="create data for seq2seq training")
+    parser.add_argument('--tasks', '-t', metavar='task_name', nargs='*', choices=['nlg', 'kvret', 'opendialkg', 'personachat', 'wow'], help='names of tasks')
+    parser.add_argument('--datasets', '-d', metavar='dataset_name', nargs='*', help='names of unified datasets')
+    parser.add_argument('--shot', '-s', type=float, default=None, help='amount of data used for training and evaluation: a ratio if < 1, else an absolute number')
+    parser.add_argument('--dial_ids_order', '-o', type=int, default=None, help='which data order is used for experiments')
+    args = parser.parse_args()
+    print(args)
+    for dataset_name in tqdm(args.datasets, desc='datasets'):
+        dataset = load_dataset(dataset_name, dial_ids_order=args.dial_ids_order)
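+        # Few-shot slicing: a shot < 1 is a ratio of dialogues, otherwise an absolute dialogue count.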
+        if args.shot:
+            if args.shot < 1:
+                dataset['train'] = dataset['train'][:round(len(dataset['train'])*args.shot)]
+                dataset['validation'] = dataset['validation'][:round(len(dataset['validation'])*args.shot)]
+            else:
+                args.shot = int(args.shot)
+                dataset['train'] = dataset['train'][:args.shot]
+                dataset['validation'] = dataset['validation'][:args.shot]
+        for task_name in tqdm(args.tasks, desc='tasks', leave=False):
+            data_dir = os.path.join('data', task_name, (dataset_name if not args.shot else f'{dataset_name}_{args.shot}shot_order{args.dial_ids_order}'))
+            data_by_split = globals()[f"create_{task_name}_data"](dataset, data_dir, args)
diff --git a/convlab/base_models/t5/key2gen/create_data_key2gen.py b/convlab/base_models/t5/key2gen/create_data_key2gen.py
deleted file mode 100644
index 6c6d9a0fa6f2435f6b657c44d9ce714d3a66ddfb..0000000000000000000000000000000000000000
--- a/convlab/base_models/t5/key2gen/create_data_key2gen.py
+++ /dev/null
@@ -1,269 +0,0 @@
-import os
-import json
-from tqdm import tqdm
-from transformers import AutoTokenizer
-from convlab.util import load_dataset, load_unified_data, load_nlu_data
-from convlab.base_models.t5.nlu.serialization import serialize_dialogue_acts
-from collections import Counter
-from nltk.tokenize import word_tokenize, sent_tokenize
-from nltk.corpus import stopwords
-from string import punctuation
-
-def create_nlg_data(dataset, data_dir, args):
-    data_by_split = load_nlu_data(dataset, speaker=args.speaker, use_context=args.context_window_size>0, context_window_size=args.context_window_size)
-    data_dir = os.path.join(data_dir, args.speaker, f'context_{args.context_window_size}')
-    os.makedirs(data_dir, exist_ok=True)
-
-    data_splits = data_by_split.keys()
-    for data_split in data_splits:
-        data = []
-        for sample in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False):
-            if args.key2gen:
-                keywords = []
-                for da_type in sample['dialogue_acts']:
-                    for da in sample['dialogue_acts'][da_type]:
-                        intent, domain, slot, value = da['intent'], da['domain'], da['slot'], da.get('value', '')
-                        intent_domain = f'{intent}-{domain}'
-                        keywords.append([intent_domain])
-                        if len(slot) > 0:
-                            keywords[-1].append(slot)
-                        if len(value) > 0:
-                            keywords[-1].append(value)
-                dialogue_acts_seq = '| {} |'.format(' | '.join([' : '.join(da_keywords) for da_keywords in keywords]))
-            else:
-                dialogue_acts_seq = serialize_dialogue_acts(sample['dialogue_acts'])
-
-            if args.context_window_size>0:
-                context = '\n'.join([f"{turn['speaker']}: {turn['utterance']}" for turn in sample['context']]+[f'{sample["speaker"]}: '])
-                context = f'generate a response: grounded knowledge: {dialogue_acts_seq} context:\n\n{context}'
-            else:
-                context = f'generate a response: grounded knowledge: {dialogue_acts_seq} context:\n\n{sample["speaker"]}: '
-
-            data.append(json.dumps({'context+da': context, 'response': sample['utterance']}, ensure_ascii=False)+'\n')
-
-        file_name = os.path.join(data_dir, f"{data_split}.json")
-        with open(file_name, "w", encoding='utf-8') as f:
-            f.writelines(data)
-        data_by_split[data_split] = data
-    return data_by_split
-
-def create_dart_data(dataset, data_dir, args):
-    data_by_split = dataset
-    os.makedirs(data_dir, exist_ok=True)
-
-    data_splits = data_by_split.keys()
-    for data_split in data_splits:
-        data = []
-        for sample in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False):
-            sample = sample['turns'][0]
-            triples = sample['tripleset']
-            if args.key2gen:
-                keywords = [w for triple in triples for w in triple]
-                # DONE: try adding prompt, no improvement
-                entity_cnt = Counter()
-                for triple in triples:
-                    e1, r, e2 = triple
-                    for e in [e1, e2]:
-                        if e.startswith('[') and e.endswith(']'):
-                            continue
-                        entity_cnt[e] += 1
-                        
-                assert len(entity_cnt) > 0
-                common_entity = entity_cnt.most_common(1)[0][0]
-                context = f'{" | ".join(keywords)}\n\ncontext: user: tell me something about {common_entity}. system: '
-            else:
-                triples = [' : '.join(triple) for triple in triples]
-                context = f'{" | ".join(triples)}\n\ncontext: system: '
-
-            data.append(json.dumps({'triples': context, 'text': sample['utterance']}, ensure_ascii=False)+'\n')
-
-        file_name = os.path.join(data_dir, f"{data_split}.json")
-        with open(file_name, "w", encoding='utf-8') as f:
-            f.writelines(data)
-        data_by_split[data_split] = data
-    return data_by_split
-
-def create_commongen_data(dataset, data_dir, args):
-    data_by_split = dataset
-    os.makedirs(data_dir, exist_ok=True)
-
-    data_splits = data_by_split.keys()
-    for data_split in data_splits:
-        data = []
-        for sample in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False):
-            sample = sample['turns'][0]
-            concepts = sample['concepts']
-            context = f'{" | ".join(concepts)}\n\ncontext: system: '
-
-            data.append(json.dumps({'concepts': context, 'text': sample['utterance']}, ensure_ascii=False)+'\n')
-
-        file_name = os.path.join(data_dir, f"{data_split}.json")
-        with open(file_name, "w", encoding='utf-8') as f:
-            f.writelines(data)
-        data_by_split[data_split] = data
-    return data_by_split
-
-def create_kvret_data(dataset, data_dir, args):
-    data_by_split = load_unified_data(dataset, speaker='system', utterance=True, db_results=True, use_context=True, context_window_size=100)
-    os.makedirs(data_dir, exist_ok=True)
-
-    domain2entity_col = {'schedule': 'event' ,'navigate': 'poi', 'weather': 'location'}
-    data_splits = data_by_split.keys()
-    for data_split in data_splits:
-        data = []
-        for sample in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False):
-            if len(sample['utterance']) == 0:
-                continue
-            db_results = sample['db_results']
-            db_seqs = []
-            for domain, db_items in db_results.items():
-                entity_col = domain2entity_col[domain]
-                for db_item in db_items:
-                    entity = db_item[entity_col]
-                    for db_key, db_value in db_item.items():
-                        if db_key == entity_col:
-                            continue
-                        db_seqs.append(' : '.join([entity, db_key, db_value]))
-            db_seq = ' |\n'.join(db_seqs)
-
-            context = '\n'.join([f"{turn['speaker']}: {turn['utterance']}" for turn in sample['context']]+[f'{sample["speaker"]}: '])
-            context = f'generate a response: all knowledge:\n\n| {db_seq} | context:\n\n{context}'
-            data.append(json.dumps({'context+db': context, 'response': sample['utterance']}, ensure_ascii=False)+'\n')
-
-        file_name = os.path.join(data_dir, f"{data_split}.json")
-        with open(file_name, "w", encoding='utf-8') as f:
-            f.writelines(data)
-        data_by_split[data_split] = data
-    return data_by_split
-
-def create_personachat_data(dataset, data_dir, args):
-    data_by_split = dataset
-    os.makedirs(data_dir, exist_ok=True)
-
-    stop_words = set(stopwords.words('english')) | set(punctuation)
-    def sentence2keywords(sentence):
-        index2keyword = {}
-        for i, w in enumerate(word_tokenize(sentence)):
-            if not w.lower() in stop_words:
-                index2keyword[i] = w
-        indexes = sorted(index2keyword.keys())
-        keywords = []
-        for i, index in enumerate(indexes):
-            if i > 0 and index == indexes[i-1] + 1:
-                keywords[-1]+= ' '+index2keyword[index]
-            else:
-                keywords.append(index2keyword[index])
-        return keywords
-
-    data_splits = data_by_split.keys()
-    for data_split in data_splits:
-        data = []
-        for dial in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False):
-            persona = dial['persona']['system']
-            if args.key2gen:
-                persona_seq = ' |\n'.join([' : '.join(sentence2keywords(s)) for s in persona])
-            else:
-                persona_seq = ' | '.join(persona)
-            context = []
-            for turn in dial['turns']:
-                if turn['speaker'] == 'system':
-                    context_seq = '\n'.join([f"{t['speaker']}: {t['utterance']}" for t in context]+[f'{turn["speaker"]}: '])
-                    context_seq = f'generate a response: all knowledge:\n\n| {persona_seq} | context:\n\n{context_seq}'
-                    data.append(json.dumps({'context+persona': context_seq, 'response': turn['utterance']}, ensure_ascii=False)+'\n')
-                context.append({'speaker': turn['speaker'], 'utterance': turn['utterance']})
-
-        file_name = os.path.join(data_dir, f"{data_split}.json")
-        with open(file_name, "w", encoding='utf-8') as f:
-            f.writelines(data)
-        data_by_split[data_split] = data
-    return data_by_split
-
-def create_wow_data(dataset, data_dir, args):
-    data_by_split = dataset
-    os.makedirs(data_dir, exist_ok=True)
-
-    stop_words = set(stopwords.words('english')) | set(punctuation)
-    def sentence2keywords(sentence):
-        index2keyword = {}
-        for i, w in enumerate(word_tokenize(sentence)):
-            if not w.lower() in stop_words:
-                index2keyword[i] = w
-        indexes = sorted(index2keyword.keys())
-        keywords = []
-        for i, index in enumerate(indexes):
-            if i > 0 and index == indexes[i-1] + 1:
-                keywords[-1]+= ' '+index2keyword[index]
-            else:
-                keywords.append(index2keyword[index])
-        return keywords
-
-    def sentences2keywords_seq(sentences):
-        return ' |\n'.join([' : '.join(sentence2keywords(sentence)) for sentence in sent_tokenize(sentences)])
-
-
-    data_splits = data_by_split.keys()
-    for data_split in data_splits:
-        data = []
-        for dial in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False):
-            context = []
-            for turn in dial['turns']:
-                if turn['speaker'] == 'system':
-                    if turn['checked_sentence']:
-                        if args.key2gen:
-                            know_seq = f" | {sentences2keywords_seq(turn['checked_sentence'])} |"
-                        else:
-                            know_seq = turn['checked_sentence']
-                    else:
-                        know_seq = ''
-                    context_seq = '\n'.join([f"{t['speaker']}: {t['utterance']}" for t in context]+[f'{turn["speaker"]}: '])
-                    context_seq = f'generate a response: grounded knowledge:\n\n{know_seq} context:\n\n{context_seq}'
-                    data.append(json.dumps({'context+knowledge': context_seq, 'response': turn['utterance']}, ensure_ascii=False)+'\n')
-                context.append({'speaker': turn['speaker'], 'utterance': turn['utterance']})
-
-        file_name = os.path.join(data_dir, f"{data_split}.json")
-        with open(file_name, "w", encoding='utf-8') as f:
-            f.writelines(data)
-        data_by_split[data_split] = data
-    return data_by_split
-
-def get_max_len(data_by_split, tokenizer):
-    for data_split in data_by_split.keys():
-        seq_len = {}
-        for line in data_by_split[data_split]:
-            item = json.loads(line.strip())
-            for column, seq in item.items():
-                seq_len.setdefault(column, [])
-                seq_len[column].append(len(tokenizer.tokenize(seq)))
-        print(f"data split: {data_split}")
-        for column, lens in seq_len.items():
-            print(f'\t{column}\tmax_len: {max(lens)}\tmean_len: {round(sum(lens)/len(lens),2)}')
-
-
-if __name__ == '__main__':
-    from argparse import ArgumentParser
-    parser = ArgumentParser(description="create data for seq2seq training")
-    parser.add_argument('--tasks', '-t', metavar='task_name', nargs='*', choices=['nlg', 'dart', 'commongen', 'kvret', 'personachat', 'wow'], help='names of tasks')
-    parser.add_argument('--datasets', '-d', metavar='dataset_name', nargs='*', help='names of unified datasets')
-    parser.add_argument('--speaker', '-s', type=str, choices=['user', 'system', 'all'], help='speaker(s)')
-    parser.add_argument('--context_window_size', '-c', type=int, default=0, help='how many contextual utterances are considered')
-    parser.add_argument('--len_tokenizer', '-l', type=str, default=None, help='name or path of tokenizer that used to get seq len')
-    parser.add_argument('--ratio', '-r', type=float, default=None, help='how many data is used for training and evaluation')
-    parser.add_argument('--dial_ids_order', '-o', type=int, default=None, help='which data order is used for experiments')
-    parser.add_argument('--key2gen', '-k', action='store_true', default=False, help='generate data for key2gen models')
-    args = parser.parse_args()
-    print(args)
-    if args.len_tokenizer:
-        tokenizer = AutoTokenizer.from_pretrained(args.len_tokenizer)
-    for dataset_name in tqdm(args.datasets, desc='datasets'):
-        dataset = load_dataset(dataset_name, args.dial_ids_order)
-        if args.ratio:
-            dataset['train'] = dataset['train'][:round(len(dataset['train'])*args.ratio)]
-            dataset['validation'] = dataset['validation'][:round(len(dataset['validation'])*args.ratio)]
-        for task_name in tqdm(args.tasks, desc='tasks', leave=False):
-            if args.key2gen:
-                data_dir = os.path.join('data', task_name, "key2gen_"+(dataset_name if not args.ratio else f'{dataset_name}_{args.ratio}_order{args.dial_ids_order}'))
-            else:
-                data_dir = os.path.join('data', task_name, (dataset_name if not args.ratio else f'{dataset_name}_{args.ratio}_order{args.dial_ids_order}'))
-            data_by_split = eval(f"create_{task_name}_data")(dataset, data_dir, args)
-            if args.len_tokenizer:
-                get_max_len(data_by_split, tokenizer)
diff --git a/convlab/base_models/t5/key2gen/dataset_vanilla.py b/convlab/base_models/t5/key2gen/dataset_vanilla.py
new file mode 100644
index 0000000000000000000000000000000000000000..083c277f2c0f0af346d5b33e2248c40bb010cec0
--- /dev/null
+++ b/convlab/base_models/t5/key2gen/dataset_vanilla.py
@@ -0,0 +1,123 @@
+# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Data processing for vanilla generator"""
+
+import json
+import datasets
+from convlab.base_models.t5.key2gen.features import FEATURES
+from copy import deepcopy
+
+
+class VanillaDataset(datasets.GeneratorBasedBuilder):
+    """Dataset for vanilla generator (e.g., t5)"""
+
+    VERSION = datasets.Version("1.18.0")
+
+    BUILDER_CONFIGS = [
+        datasets.BuilderConfig(name="nlg", version=VERSION, description="DA grounded generation task"),
+        datasets.BuilderConfig(name="kvret", version=VERSION, description="KB grounded generation task"),
+        datasets.BuilderConfig(name="opendialkg", version=VERSION, description="KG grounded generation task"),
+        datasets.BuilderConfig(name="wow", version=VERSION, description="Passage grounded generation task"),
+        datasets.BuilderConfig(name="personachat", version=VERSION, description="Persona grounded generation task"),
+    ]
+
+    def _info(self):
+        return datasets.DatasetInfo(
+            description=f"Vanilla Dataset for {self.config.description}",
+            features=datasets.Features(deepcopy(FEATURES[self.config.name]))
+        )
+
+    def _split_generators(self, dl_manager):
+        generators = []
+        if "train" in self.config.data_files:
+            generators.append(datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                gen_kwargs={
+                    "filepath": self.config.data_files["train"][0],
+                    "split": "train",
+                },
+            ))
+        if "validation" in self.config.data_files:
+            generators.append(datasets.SplitGenerator(
+                name=datasets.Split.VALIDATION,
+                gen_kwargs={
+                    "filepath": self.config.data_files["validation"][0],
+                    "split": "validation",
+                },
+            ))
+        if "test" in self.config.data_files:
+            generators.append(datasets.SplitGenerator(
+                name=datasets.Split.TEST,
+                gen_kwargs={
+                    "filepath": self.config.data_files["test"][0],
+                    "split": "test",
+                },
+            ))
+            
+        return generators
+
+    def _generate_examples(self, filepath, split):
+        with open(filepath, encoding="utf-8") as f:
+            for key, row in enumerate(f):
+                item = json.loads(row)
+                if self.config.name == "nlg":
+                    knowledge = item["knowledge"]
+                    triples = []
+                    for da_type in knowledge:
+                        for da in knowledge[da_type]:
+                            intent, domain, slot, value = da["intent"], da["domain"], da["slot"], da.get("value", "")
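+                            # Drop character-span annotations; they are not part of the declared features.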
+                            if 'start' in da:
+                                da.pop('start')
+                                da.pop('end')
+                            intent_domain = f"{intent}-{domain}"
+                            triples.append([intent_domain])
+                            if len(slot) > 0:
+                                triples[-1].append(slot)
+                            if len(value) > 0:
+                                triples[-1].append(value)
+                    knowledge_seq = "| {} |".format(" | ".join([" : ".join(da_keywords) for da_keywords in triples]))
+                    
+                elif self.config.name == "kvret":
+                    knowledge = {"schedule": [], "weather": [], "navigate": []}
+                    triples = []
+                    for domain, db_items in item["knowledge"].items():
+                        knowledge[domain] = db_items
+                        for db_item in db_items:
+                            entity = db_item["entity"]
+                            for db_key, db_value in db_item.items():
+                                if db_key == "entity":
+                                    continue
+                                triples.append([entity, db_key, db_value])
+                    knowledge_seq = "| {} |".format(" | ".join([" : ".join(triple) for triple in triples]))
+
+                elif self.config.name == "opendialkg":
+                    knowledge = item["knowledge"]
+                    knowledge_seq = "| {} |".format(" | ".join([" : ".join(triple) for triple in item["knowledge"]]))
+                
+                elif self.config.name in ["wow", "personachat"]:
+                    knowledge = item["knowledge"]
+                    try:
+                        knowledge_seq = "| {} |".format(" | ".join(item["knowledge"]))
+                    except TypeError:
+                        # Surface the malformed knowledge entry before re-raising.
+                        print([knowledge])
+                        raise
+                
+                context = "\n".join([f"{turn[0]}: {turn[1]}" for turn in item["context"]]+["system: "])
+                context_knowledge = f"generate a response: grounded knowledge: \n\n{knowledge_seq} context:\n\n{context}"
+                
+                yield key, {
+                    "context+knowledge": context_knowledge,
+                    "response": item["response"],
+                    "knowledge": knowledge,
+                }
diff --git a/convlab/base_models/t5/key2gen/evaluate.py b/convlab/base_models/t5/key2gen/evaluate.py
new file mode 100644
index 0000000000000000000000000000000000000000..63bf815de92d5cc5aa1b6adda3d8edfe2321c130
--- /dev/null
+++ b/convlab/base_models/t5/key2gen/evaluate.py
@@ -0,0 +1,78 @@
+from tabulate import tabulate
+import os
+import json
+from tqdm import tqdm
+from datasets import load_metric
+import numpy as np
+
+def evaluate(filename, metric):
+    """
+    It reads the predictions, references, and knowledge from a file, and then computes the metric
+    
+    :param filename: the path to the file containing the predictions
+    :param metric: the metric to use for evaluation
+    :return: The result of the evaluation.
+    """
+    predictions, references, knowledge = [], [], []
+    with open(filename, 'r') as f:
+        for line in f:
+            item = json.loads(line)
+            predictions.append(item['predictions'])
+            references.append(item['response'])
+            knowledge.append(item['knowledge'])
+    result = metric.compute(predictions=predictions, references=references, knowledge=knowledge)
+    return result
+
+
+def avg_result(results):
+    """
+    It takes a list of dictionaries, and returns a dictionary with the same keys, but the values are the
+    mean and standard deviation of the values in the input dictionaries
+    
+    :param results: a list of dictionaries, each dictionary is the result of a single run of the model
+    :return: The average and standard deviation of the results.
+    """
+    ret = {}
+    for k in results[0]:
+        m = round(np.mean([result[k] for result in results]), 2)
+        v = round(np.std([result[k] for result in results], ddof=1), 2) if len(results) > 1 else None
+        ret[k] = f"{m}({v})" if v is not None else f"{m}"
+    return ret
+
+
+if __name__ == '__main__':
+    from argparse import ArgumentParser
+    parser = ArgumentParser(description="evaluate generation results of grounded dialog generation tasks")
+    parser.add_argument("--output_dirs", type=str, nargs='*', required=True)
+    parser.add_argument('--tasks', '-t', type=str, nargs='*', choices=['nlg', 'kvret', 'opendialkg', 'personachat', 'wow'], help='names of tasks')
+    parser.add_argument('--shots', '-s', type=int, nargs='*', help='shot settings (numbers of training dialogues) used in the experiments')
+    parser.add_argument('--dial_ids_orders', '-o', type=int, nargs='*', help='which data order is used for experiments')
+    args = parser.parse_args()
+    print(args)
+    
+    tables = []
+    for task_name in tqdm(args.tasks, desc='tasks'):
+        metric = load_metric("metric.py", task_name)
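+        # The nlg task is evaluated on MultiWOZ 2.1; every other task name doubles as its dataset name.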
+        dataset_name = task_name if task_name != "nlg" else "multiwoz21"
+        for shot in tqdm(args.shots, desc='shots'):
+            for output_dir in tqdm(args.output_dirs, desc='models'):
+                model_name = output_dir.split('/')[-1]
+                if task_name == "wow":
+                    test_splits = ["_seen", "_unseen"]
+                else:
+                    test_splits = [""]
+                for test_split in test_splits:
+                    results = []
+                    for dial_ids_order in tqdm(args.dial_ids_orders, desc='dial_ids_orders'):
+                        filename = os.path.join(output_dir, task_name, f"{dataset_name}_{shot}shot_order{dial_ids_order}/gen{test_split}/generated_predictions.json")
+                        results.append(evaluate(filename, metric))
+                    res = {
+                        "dataset": f"{task_name}-{shot}shot",
+                        "model": f"{model_name}{test_split}",
+                        **avg_result(results)
+                    }
+                    tables.append(res)
+    res = tabulate(tables, headers='keys', tablefmt='github')
+    with open('eval_results.txt', 'w', encoding='utf-8') as f:
+        print(res, file=f)
diff --git a/convlab/base_models/t5/key2gen/features.py b/convlab/base_models/t5/key2gen/features.py
new file mode 100644
index 0000000000000000000000000000000000000000..0ac768b5cbe61d46e430580b025182e515db93ef
--- /dev/null
+++ b/convlab/base_models/t5/key2gen/features.py
@@ -0,0 +1,72 @@
+import datasets
+
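+# Task name -> datasets.Features schema, shared by dataset_vanilla.py and metric.py.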
+FEATURES = {
+    "nlg": {
+        "context+knowledge": datasets.Value("string"),
+        "response": datasets.Value("string"),
+        "knowledge": {
+            "categorical": datasets.Sequence({
+                "intent": datasets.Value("string"),
+                "domain": datasets.Value("string"),
+                "slot": datasets.Value("string"),
+                "value": datasets.Value("string"),
+            }), 
+            "non-categorical": datasets.Sequence({
+                "intent": datasets.Value("string"),
+                "domain": datasets.Value("string"),
+                "slot": datasets.Value("string"),
+                "value": datasets.Value("string"),
+            }), 
+            "binary": datasets.Sequence({
+                "intent": datasets.Value("string"),
+                "domain": datasets.Value("string"),
+                "slot": datasets.Value("string"),
+            })
+        }},
+    "kvret": {
+        "context+knowledge": datasets.Value("string"),
+        "response": datasets.Value("string"),
+        "knowledge": {
+            "schedule": datasets.Sequence({
+                "entity": datasets.Value("string"),
+                "time": datasets.Value("string"),
+                "date": datasets.Value("string"),
+                "party": datasets.Value("string"),
+                "room": datasets.Value("string"),
+                "agenda": datasets.Value("string")
+            }),
+            "weather": datasets.Sequence({
+                "entity": datasets.Value("string"),
+                "today": datasets.Value("string"),
+                "monday": datasets.Value("string"),
+                "tuesday": datasets.Value("string"),
+                "wednesday": datasets.Value("string"),
+                "thursday": datasets.Value("string"),
+                "friday": datasets.Value("string"),
+                "saturday": datasets.Value("string"),
+                "sunday": datasets.Value("string"),
+            }),
+            "navigate": datasets.Sequence({
+                "entity": datasets.Value("string"),
+                "traffic_info": datasets.Value("string"),
+                "poi_type": datasets.Value("string"),
+                "address": datasets.Value("string"),
+                "distance": datasets.Value("string")
+            })
+        }},
+    "opendialkg": {
+        "context+knowledge": datasets.Value("string"),
+        "response": datasets.Value("string"),
+        "knowledge": datasets.Sequence(datasets.Sequence(datasets.Value("string"))),
+        },
+    "wow": {
+        "context+knowledge": datasets.Value("string"),
+        "response": datasets.Value("string"),
+        "knowledge": datasets.Sequence(datasets.Value("string")),
+        },
+    "personachat": {
+        "context+knowledge": datasets.Value("string"),
+        "response": datasets.Value("string"),
+        "knowledge": datasets.Sequence(datasets.Value("string")),
+    }
+}
\ No newline at end of file
diff --git a/convlab/base_models/t5/key2gen/finetune.sh b/convlab/base_models/t5/key2gen/finetune.sh
new file mode 100644
index 0000000000000000000000000000000000000000..390ea1a908dcf9f335ec74d35a422239e9a923ca
--- /dev/null
+++ b/convlab/base_models/t5/key2gen/finetune.sh
@@ -0,0 +1,153 @@
+n_gpus=2
+master_port=23456
+cache_dir="../cache"
+dataset_path="dataset_vanilla.py"
+metric_name_or_path="metric.py"
+source_column="context+knowledge"
+target_column="response"
+truncation_side="left"
+max_source_length=512
+max_target_length=512
+model_name="t5-small"
+model_name_or_path="t5-small"
+per_device_train_batch_size=64
+per_device_eval_batch_size=64
+gradient_accumulation_steps=1
+num_workers=16
+lr=1e-3
+num_train_epochs=100
+
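+# Few-shot sweep: 5 datasets x {50, 100, 200} shots x 5 dialogue-id orders; train, then run inference for each setting.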
+for dataset_name in multiwoz21 kvret opendialkg wow personachat
+do
+    if [ "${dataset_name}" == "multiwoz21" ]
+    then
+        task_name="nlg"
+    else
+        task_name=${dataset_name}
+    fi
+    for shot in 50 100 200
+    do
+        for dial_ids_order in 0 1 2 3 4
+        do
+            python create_data.py -t ${task_name} -d ${dataset_name} -o ${dial_ids_order} -s ${shot}
+
+            data_dir="data/${task_name}/${dataset_name}_${shot}shot_order${dial_ids_order}"
+            output_dir="output/${model_name}/${task_name}/${dataset_name}_${shot}shot_order${dial_ids_order}"
+            logging_dir="${output_dir}/runs"
+            train_file="${data_dir}/train.json"
+            validation_file="${data_dir}/validation.json"
+
+            # training
+            python -m torch.distributed.launch --master_port ${master_port} \
+                --nproc_per_node ${n_gpus} ../run_seq2seq.py \
+                --task_name ${task_name} \
+                --dataset_name ${dataset_path} \
+                --dataset_config_name ${task_name} \
+                --train_file ${train_file} \
+                --validation_file ${validation_file} \
+                --source_column ${source_column} \
+                --target_column ${target_column} \
+                --max_source_length ${max_source_length} \
+                --max_target_length ${max_target_length} \
+                --truncation_side ${truncation_side} \
+                --model_name_or_path ${model_name_or_path} \
+                --do_train \
+                --do_eval \
+                --save_strategy epoch \
+                --evaluation_strategy epoch \
+                --save_total_limit 1 \
+                --prediction_loss_only \
+                --load_best_model_at_end \
+                --overwrite_output_dir \
+                --cache_dir ${cache_dir} \
+                --output_dir ${output_dir} \
+                --logging_dir ${logging_dir} \
+                --preprocessing_num_workers ${num_workers} \
+                --dataloader_num_workers ${num_workers} \
+                --per_device_train_batch_size ${per_device_train_batch_size} \
+                --per_device_eval_batch_size ${per_device_eval_batch_size} \
+                --gradient_accumulation_steps ${gradient_accumulation_steps} \
+                --learning_rate ${lr} \
+                --num_train_epochs ${num_train_epochs} \
+                --optim adafactor \
+                --lr_scheduler_type constant \
+                --gradient_checkpointing
+
+            # inference
+            if [ "${dataset_name}" == "wow" ]
+            then
+                for test_split in seen unseen
+                do
+                    test_file="data/${task_name}/test_${test_split}.json"
+                    gen_output_dir="${output_dir}/gen_${test_split}"
+
+                    python -m torch.distributed.launch --master_port ${master_port} \
+                        --nproc_per_node ${n_gpus} ../run_seq2seq.py \
+                        --task_name ${task_name} \
+                        --dataset_name ${dataset_path} \
+                        --dataset_config_name ${task_name} \
+                        --test_file ${test_file} \
+                        --source_column ${source_column} \
+                        --target_column ${target_column} \
+                        --max_source_length ${max_source_length} \
+                        --max_target_length ${max_target_length} \
+                        --truncation_side ${truncation_side} \
+                        --model_name_or_path ${output_dir} \
+                        --do_predict \
+                        --predict_with_generate \
+                        --cache_dir ${cache_dir} \
+                        --output_dir ${gen_output_dir} \
+                        --logging_dir ${logging_dir} \
+                        --overwrite_output_dir \
+                        --preprocessing_num_workers ${num_workers} \
+                        --dataloader_num_workers ${num_workers} \
+                        --per_device_train_batch_size ${per_device_train_batch_size} \
+                        --per_device_eval_batch_size ${per_device_eval_batch_size} \
+                        --gradient_accumulation_steps ${gradient_accumulation_steps} \
+                        --learning_rate ${lr} \
+                        --num_train_epochs ${num_train_epochs} \
+                        --optim adafactor \
+                        --lr_scheduler_type constant \
+                        --gradient_checkpointing
+                done
+            else
+                test_file="data/${task_name}/test.json"
+                gen_output_dir="${output_dir}/gen"
+            
+                python -m torch.distributed.launch --master_port ${master_port} \
+                    --nproc_per_node ${n_gpus} ../run_seq2seq.py \
+                    --task_name ${task_name} \
+                    --dataset_name ${dataset_path} \
+                    --dataset_config_name ${task_name} \
+                    --metric_name_or_path ${metric_name_or_path} \
+                    --metric_config_name ${task_name} \
+                    --test_file ${test_file} \
+                    --source_column ${source_column} \
+                    --target_column ${target_column} \
+                    --max_source_length ${max_source_length} \
+                    --max_target_length ${max_target_length} \
+                    --truncation_side ${truncation_side} \
+                    --model_name_or_path ${output_dir} \
+                    --do_predict \
+                    --predict_with_generate \
+                    --cache_dir ${cache_dir} \
+                    --output_dir ${gen_output_dir} \
+                    --logging_dir ${logging_dir} \
+                    --overwrite_output_dir \
+                    --preprocessing_num_workers ${num_workers} \
+                    --dataloader_num_workers ${num_workers} \
+                    --per_device_train_batch_size ${per_device_train_batch_size} \
+                    --per_device_eval_batch_size ${per_device_eval_batch_size} \
+                    --gradient_accumulation_steps ${gradient_accumulation_steps} \
+                    --learning_rate ${lr} \
+                    --num_train_epochs ${num_train_epochs} \
+                    --optim adafactor \
+                    --lr_scheduler_type constant \
+                    --gradient_checkpointing
+            fi
+        done
+    done
+done
+
+# evaluation
+python evaluate.py --output_dirs output/${model_name} -t nlg kvret opendialkg personachat wow -s 50 100 200 -o 0 1 2 3 4
\ No newline at end of file
diff --git a/convlab/base_models/t5/key2gen/metric.py b/convlab/base_models/t5/key2gen/metric.py
index 050db04d120383bece5bfd5e9cb3362f66c8ecac..d3e493188194639adad04539619bb63f14284841 100644
--- a/convlab/base_models/t5/key2gen/metric.py
+++ b/convlab/base_models/t5/key2gen/metric.py
@@ -11,8 +11,9 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""NLG Metric"""
+"""Grounded Dialog Generation Metric"""
 
 import datasets
 from sacrebleu.metrics import BLEU
 from sacrebleu.utils import sum_of_lists
@@ -20,169 +21,423 @@ import re
 from collections import Counter
 import numpy as np
 from nltk.corpus import stopwords
-from nltk import sent_tokenize
 from rouge_score import rouge_scorer, scoring
 from nltk.translate import meteor_score
 from datasets.config import importlib_metadata, version
+from convlab.base_models.t5.key2gen.features import FEATURES
+from convlab.util import load_ontology
+from copy import deepcopy
 
 
 NLTK_VERSION = version.parse(importlib_metadata.version("nltk"))
 if NLTK_VERSION >= version.Version("3.6.5"):
     from nltk import word_tokenize
 
+# Uncomment to download nltk_data when running for the first time.
+# import nltk
+# nltk.download("wordnet")
+# if NLTK_VERSION >= version.Version("3.6.5"):
+#     nltk.download("punkt")
+# if NLTK_VERSION >= version.Version("3.6.6"):
+#     nltk.download("omw-1.4")
 
-# TODO: Add BibTeX citation
-_CITATION = """\
-@inproceedings{post-2018-call,
-    title = "A Call for Clarity in Reporting {BLEU} Scores",
-    author = "Post, Matt",
-    booktitle = "Proceedings of the Third Conference on Machine Translation: Research Papers",
-    month = oct,
-    year = "2018",
-    address = "Belgium, Brussels",
-    publisher = "Association for Computational Linguistics",
-    url = "https://www.aclweb.org/anthology/W18-6319",
-    pages = "186--191",
-}
+
+_CITATION = """
 """
 
 _DESCRIPTION = """\
-Metric to evaluate text-to-text models on the natural language generation task.
+Metric to evaluate text generation models on the grounded dialog generation task.
 """
 
+# TODO
 _KWARGS_DESCRIPTION = """
-Calculates corpus-bleu4
 Args:
     predictions: list of predictions to score. Each predictions
         should be a string.
     references: list of reference for each prediction. Each
         reference should be a string.
+    knowledge: task-specific grounded knowledge
+
 Returns:
-    bleu: corpus-bleu score
+    bleu-1/2/3/4: corpus-bleu score, from sacrebleu
+    rouge-1/2/L: ROUGE-F1, from rouge_score
+    meteor: METEOR, from nltk
+    unigram f1: unigram overlap, from parlai
+    distinct-1/2: from parlai
+    knowledge utility: task-specific metrics of how well the grounded knowledge is used (e.g., slot error rate for nlg)
+
 Examples:
 
-    >>> nlg_metric = datasets.load_metric("nlg_metric.py")
+    >>> nlg_metric = datasets.load_metric("metric.py", "nlg")
     >>> predictions = ["hello there general kenobi", "foo bar foobar"]
     >>> references = ["hello there kenobi", "foo bar foobar"]
     >>> results = nlg_metric.compute(predictions=predictions, references=references)
     >>> print(results)
-    {'bleu': 35.35533905932737}
+    {"bleu": 35.35533905932737}
 """
 
+re_art = re.compile(r'\b(a|an|the)\b')
+re_punc = re.compile(r'[!"#$%&()*+,-./:;<=>?@\[\]\\^`{|}~_\']')
+stop_words = set(stopwords.words("english"))
+def utt2words(s):
+    """Lower text and remove punctuation, articles and extra whitespace.
+    from parlai https://github.com/facebookresearch/ParlAI/blob/9daae69320c07104493486e022c0e46a7871b253/parlai/core/metrics.py#L810"""
+    s = s.lower()
+    s = re_punc.sub(' ', s)
+    s = re_art.sub(' ', s)
+    return s.split()
+
+
+def get_bleu(predictions, references):
+    """bleu-1/2/3/4 from sacrebleu"""
+    references = [" " if ref=="" else ref for ref in references]
+    metrics = {}
+    bleu = BLEU(lowercase=True, force=False, tokenize=BLEU.TOKENIZER_DEFAULT, smooth_method="exp", smooth_value=None, effective_order=False)
+    stats = sum_of_lists(bleu._extract_corpus_statistics(predictions, [references]))
+    for n in range(1,5):
+        metrics[f"bleu-{n}"] = bleu.compute_bleu(
+            correct=stats[2: 2 + bleu.max_ngram_order],
+            total=stats[2 + bleu.max_ngram_order:],
+            sys_len=int(stats[0]), ref_len=int(stats[1]),
+            smooth_method=bleu.smooth_method, smooth_value=bleu.smooth_value,
+            effective_order=bleu.effective_order,
+            max_ngram_order=n).score
+    return metrics
+
+
+def get_unigram_f1(predictions, references):
+    """unigram f1 between prediction and reference, from parlai"""
+    metrics = {}
+    metrics["unigram f1"] = []
+    metrics["unigram f1 (non-stop words)"] = []
+    for prediction, reference in zip(predictions, references):
+        pred_items = utt2words(prediction)
+        gold_items = utt2words(reference)
+        for remove_stopwords in [False, True]:
+            if remove_stopwords:
+                pred_items = [w for w in pred_items if w not in stop_words]
+                gold_items = [w for w in gold_items if w not in stop_words]
+            common = Counter(pred_items) & Counter(gold_items)
+            num_same = sum(common.values())
+            if num_same == 0:
+                f1 = 0
+            else:
+                precision = 1.0 * num_same / len(pred_items)
+                recall = 1.0 * num_same / len(gold_items)
+                f1 = (2 * precision * recall) / (precision + recall)
+            if not remove_stopwords:
+                metrics["unigram f1"].append(f1)
+            else:
+                metrics["unigram f1 (non-stop words)"].append(f1)
+    metrics["unigram f1"] = np.mean(metrics["unigram f1"]) * 100
+    metrics["unigram f1 (non-stop words)"] = np.mean(metrics["unigram f1 (non-stop words)"]) * 100
+    return metrics
+
+
+def get_rouge(predictions, references):
+    """rouge-1/2/L from rouge-score"""
+    rouge_types=["rouge1", "rouge2", "rougeL"]
+    scorer = rouge_scorer.RougeScorer(rouge_types=rouge_types, use_stemmer=True)
+    aggregator = scoring.BootstrapAggregator()
+
+    for prediction, reference in zip(predictions, references):
+        score = scorer.score(reference, prediction)
+        aggregator.add_scores(score)
+
+    # report recall for rouge-1/2 and F1 for rouge-L, scaled to 0-100
+    return {key: 100 * (value.mid.fmeasure if key == "rougeL" else value.mid.recall) for key, value in aggregator.aggregate().items()}
+
+
+def get_meteor(predictions, references):
+    """meteor from nltk"""
+    alpha, beta, gamma = 0.9, 3, 0.5  # standard METEOR hyperparameters
+    if NLTK_VERSION >= version.Version("3.6.5"):
+        # newer nltk expects pre-tokenized hypotheses and references
+        scores = [
+            meteor_score.single_meteor_score(
+                word_tokenize(ref), word_tokenize(pred), alpha=alpha, beta=beta, gamma=gamma
+            )
+            for ref, pred in zip(references, predictions)
+        ]
+    else:
+        scores = [
+            meteor_score.single_meteor_score(ref, pred, alpha=alpha, beta=beta, gamma=gamma)
+            for ref, pred in zip(references, predictions)
+        ]
+    return {"meteor": np.mean(scores) * 100}
+
+
+def get_distinct(predictions):
+    """distinct-1/2 
+    from parlai https://github.com/facebookresearch/ParlAI/blob/9daae69320c07104493486e022c0e46a7871b253/parlai/core/metrics.py#L781"""
+    def _ngram(seq, n):
+        for i in range(len(seq) - n + 1):
+            yield tuple(seq[i : i + n])
+    
+    metrics = {}
+    for k in [1, 2]:
+        inter_cnt = Counter()
+        for prediction in predictions:
+            ngram = Counter(_ngram(utt2words(prediction), k))
+            inter_cnt += ngram
+        metrics[f"distinct-{k}"] = max(len(inter_cnt), 1e-12) / max(sum(inter_cnt.values()), 1e-5) * 100
+    return metrics
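+# Worked example (a sketch): for predictions ["hi hi there"] the unigram counts
+# are {hi: 2, there: 1}, so distinct-1 = 2 / 3 * 100 ≈ 66.67; both bigrams
+# ("hi", "hi") and ("hi", "there") are unique, so distinct-2 = 100.0.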
+
+
+def get_nlg_slot_err(predictions, knowledge):
+    """slot error rate: (missing_count + redundant_count) / all_count for value in dialog acts"""
+    val2ds_dict = {}
+    ontology = load_ontology("multiwoz21")
+    for domain_name in ontology["domains"]:
+        domain = ontology["domains"][domain_name]
+        for slot_name in domain["slots"]:
+            slot = domain["slots"][slot_name]
+            if "possible_values" not in slot:
+                continue
+            possible_vals = slot["possible_values"]
+            if len(possible_vals) > 0:
+                for val in possible_vals:
+                    val2ds_dict[val] = f"{domain_name}-{slot_name}"
+    score_list = []
+    for utterance, da in zip(predictions, knowledge):
+        missing_count = 0
+        redundant_count = 0
+        all_count = 0
+        all_values = set()
+        # missing values: dialog-act values that never appear in the utterance
+        for key in ['categorical', 'non-categorical']:
+            for value in da[key]['value']:
+                if len(value) > 0:
+                    all_values.add(value.strip().lower())
+                    if value.strip().lower() not in utterance.lower():
+                        missing_count += 1
+                    all_count += 1
+        if all_count == 0:
+            continue
+        # redundant values: ontology values verbalised although the dialog acts do not request them
+        for val in val2ds_dict:
+            if f" {val.strip().lower()} " in f" {utterance.strip().lower()} " and val.strip().lower() not in all_values:
+                wlist = val2ds_dict[val].split("-")
+                domain, slot = wlist[0], wlist[1]
+                # only count as redundant if the slot name itself is also mentioned
+                if f" {slot.strip().lower()}" in f" {utterance.strip().lower()} ":
+                    redundant_count += 1
+        item_score = float(missing_count + redundant_count) / all_count
+        score_list.append(item_score)
+    return {"err": np.mean(score_list) * 100}
+
+
+def load_entities():
+    """modified (load from unified ontology) from UnifiedSKG
+    https://github.com/HKUNLP/UnifiedSKG/blob/49a2ff950bb312b980c22ad72b11520db72ab6a3/metrics/kvret/evaluator.py#L8"""
+
+    ontology = load_ontology("kvret")
+    all_entities = set()
+    for domain in ontology["domains"]:
+        for slot in ontology["domains"][domain]["slots"]:
+            all_entities |= set(ontology["domains"][domain]["slots"][slot]["possible_values"])
+    missed_entities = ["yoga", "tennis", "swimming", "football", " lab ", "doctor", "optometrist", "dentist", "1st",
+                        "2nd", "3rd", "4th", "5th", "6th", "7th", "8th", "9th", "10th",
+                        "11th", "12th", "13th", "14th", "15th", "16th", "17th", "18th", "19th", "20th", "Jill",
+                        "Jack"]
+    all_entities |= set(missed_entities)
+    all_entities.remove("HR")
+    all_entities.add(" HR ")
+    all_entities = sorted(list(all_entities), key=lambda i: len(i), reverse=True)
+    return all_entities
+
+
+def check_sub_str(str_list: list, sub_str: str):
+    """
+    It takes a list of strings and a substring as input, and returns True if the substring is found
+    in any of the strings in the list, and False otherwise
+    """
+    for str_item in str_list:
+        if sub_str in str_item or sub_str.lower() in str_item.lower():
+            return True
+    return False
+
+
+def extract_entities_from_utterance(utterance, sorted_entities):
+    """modified (remove underscore) from UnifiedSKG
+    https://github.com/HKUNLP/UnifiedSKG/blob/49a2ff950bb312b980c22ad72b11520db72ab6a3/metrics/kvret/response_entity_hit.py#L45"""
+
+    utterance = " {} ".format(utterance)  # for entity matching
+    for h in range(0, 13): # for formulating am & pm
+        utterance = utterance.replace("{} am".format(h), "{}am".format(h))
+        utterance = utterance.replace("{} pm".format(h), "{}pm".format(h))
+    for entity_item_a in [20, 30, 40, 50, 60, 70, 80, 90, 100]:
+        for entity_item_b in [20, 30, 40, 50, 60, 70, 80, 90, 100]:
+            utterance = utterance.replace("{}-{}f".format(str(entity_item_a), str(entity_item_b)), "{}f-{}f".format(str(entity_item_a), str(entity_item_b)))
+    entities_in_this_utterance = []
+    for entity in sorted_entities:
+        # sorted_entities is ordered by decreasing length, so longer entities match first
+        if (entity in utterance) or (entity.lower() in utterance.lower()):
+            if not check_sub_str(entities_in_this_utterance, entity):
+                # in case of "week & weekend", "week & next_week" etc
+                entities_in_this_utterance.append(entity)
+    return entities_in_this_utterance
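+# E.g. (a sketch): with sorted_entities ["next week", "week"], the utterance
+# "see you next week" yields only ["next week"]; check_sub_str keeps the
+# shorter "week" from being double-counted.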
+
+
+def f1_score(y_pred, y_true, average="micro"):
+    """micro/marco-F1 score, modified from UnifiedSKG
+    https://github.com/HKUNLP/UnifiedSKG/blob/49a2ff950bb312b980c22ad72b11520db72ab6a3/metrics/kvret/response_entity_hit.py#L76"""
+
+    assert len(y_pred) == len(y_true)
+
+    def _compute_F1(precision, recall):
+        return 2 * precision * recall / float(precision + recall) if (precision + recall) != 0 else 0
+
+    def _compute_prf(gold, pred):
+        TP, FP, FN = 0, 0, 0
+        if len(gold) != 0:
+            count = 1
+            for g in gold:
+                if g in pred:
+                    TP += 1
+                else:
+                    FN += 1
+            for p in set(pred):
+                if p not in gold:
+                    FP += 1
+            precision = TP / float(TP + FP) if (TP + FP) != 0 else 0
+            recall = TP / float(TP + FN) if (TP + FN) != 0 else 0
+            F1 = _compute_F1(precision, recall)
+        else:
+            precision, recall, F1, count = 0, 0, 0, 0
+        return TP, FP, FN, F1, count
+
+    F1_pred, F1_count, TP_all, FP_all, FN_all = 0, 0, 0, 0, 0
+
+    for y_true_item, y_pred_item in zip(y_true, y_pred):
+        single_tp, single_fp, single_fn, single_f1, count = _compute_prf(y_true_item, y_pred_item)
+        F1_pred += single_f1
+        F1_count += count
+        TP_all += single_tp
+        FP_all += single_fp
+        FN_all += single_fn
+
+    if average == "macro":
+        F1_macro_score = F1_pred / float(F1_count) if F1_count != 0 else 0
+        return F1_macro_score
+    elif average == "micro":
+        P_score = TP_all / float(TP_all + FP_all) if (TP_all + FP_all) != 0 else 0
+        R_score = TP_all / float(TP_all + FN_all) if (TP_all + FN_all) != 0 else 0
+        F1_micro_score = _compute_F1(P_score, R_score)
+        return F1_micro_score
+    else:
+        raise ValueError("Options other than micro/macro are not supported.")
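+# Worked example (a sketch):
+#     f1_score(y_pred=[["a"], ["d"]], y_true=[["a", "b", "c"], ["d"]], average="macro")
+#     -> (0.5 + 1.0) / 2 = 0.75
+# with average="micro": TP=2, FP=0, FN=2, so precision 1.0, recall 0.5, F1 ≈ 0.667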
+
+
+def get_kvret_entity_f1(predictions, references, knowledge):
+    """entity f1 for kvret, modified from
+    https://github.com/HKUNLP/UnifiedSKG/blob/49a2ff950bb312b980c22ad72b11520db72ab6a3/metrics/kvret/response_entity_hit.py#L178"""
+
+    global_entities = load_entities()
+    F1_scores = {}
+    entities_from_predictions_and_references = {
+        d: {"predictions_entities": [], "references_entities": []} for d in ["all", "schedule", "weather", "navigate"]
+    }
+    for prediction, reference, kb in zip(predictions, references, knowledge):
+        prediction_entities = extract_entities_from_utterance(utterance=prediction, sorted_entities=global_entities)
+        reference_entities = extract_entities_from_utterance(utterance=reference, sorted_entities=global_entities)
+        entities_from_predictions_and_references["all"]["predictions_entities"].append(prediction_entities)
+        entities_from_predictions_and_references["all"]["references_entities"].append(reference_entities)
+        domain = "schedule"
+        for d in kb:
+            if len(kb[d]["entity"]) > 0:
+                domain = d
+                break
+        entities_from_predictions_and_references[domain]["predictions_entities"].append(prediction_entities)
+        entities_from_predictions_and_references[domain]["references_entities"].append(reference_entities)
+    
+    for category in entities_from_predictions_and_references.keys():
+        predictions_entities = entities_from_predictions_and_references[category]["predictions_entities"]
+        references_entities = entities_from_predictions_and_references[category]["references_entities"]
+        F1_scores["{} micro entity F1".format(category)] = f1_score(y_pred=predictions_entities, y_true=references_entities, average="micro")
+        F1_scores["{} macro entity F1".format(category)] = f1_score(y_pred=predictions_entities, y_true=references_entities, average="macro")
+
+    return F1_scores
+
+
+def get_opendialkg_entity_f1(predictions, references, knowledge):
+    """micro/macro entity f1 over entities appearing on the grounded kg path"""
+    predictions_entities, references_entities = [], []
+    for prediction, reference, kg_path in zip(predictions, references, knowledge):
+        kg_entities = set()
+        for kg_triple in kg_path:
+            # add head and tail entities
+            kg_entities.add(kg_triple[0])
+            kg_entities.add(kg_triple[-1])
+        kg_entities = sorted(kg_entities, key=len, reverse=True)  # match longer entities first
+        
+        for utterance, entities in zip([prediction, reference], [predictions_entities, references_entities]):
+            entities_in_this_utterance = []
+            for entity in kg_entities:
+                if (entity in utterance) or (entity.lower() in utterance.lower()):
+                    if not check_sub_str(entities_in_this_utterance, entity):
+                        # in case of "week & weekend", "week & next_week" etc
+                        entities_in_this_utterance.append(entity)
+            entities.append(entities_in_this_utterance)
+
+    return {
+        "micro entity f1": f1_score(y_pred=predictions_entities, y_true=references_entities, average="micro"),
+        "macro entity f1": f1_score(y_pred=predictions_entities, y_true=references_entities, average="macro")
+    }
+
+def get_knowledge_sentences_f1(predictions, knowledge):
+    """unigram f1 between predictions and the concatenated knowledge sentences"""
+    knowledge_reference = [' '.join(k_sens) for k_sens in knowledge]
+    scores = get_unigram_f1(predictions, knowledge_reference)
+    return {f"knowledge {k}": v for k, v in scores.items()}
+
 
 @datasets.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
-class NLGMetrics(datasets.Metric):
-    """Metric to evaluate text-to-text models on the natural language generation task."""
+class GroundedDialogGenerationMetrics(datasets.Metric):
+    """Metric to evaluate text generation models on the grounded dialog generation task."""
     def _info(self):
         return datasets.MetricInfo(
             description=_DESCRIPTION,
             citation=_CITATION,
             inputs_description=_KWARGS_DESCRIPTION,
-            # This defines the format of each prediction and reference
             features=datasets.Features({
-                'predictions': datasets.Value('string'),
-                'references': datasets.Value('string'),
+                "predictions": datasets.Value("string"),
+                "references": datasets.Value("string"),
+                "knowledge": deepcopy(FEATURES[self.config_name]["knowledge"])
             })
         )
 
-    # def _download_and_prepare(self, dl_manager):
-    #     import nltk
-    #     nltk.download("wordnet")
-    #     if NLTK_VERSION >= version.Version("3.6.5"):
-    #         nltk.download("punkt")
-    #     if NLTK_VERSION >= version.Version("3.6.6"):
-    #         nltk.download("omw-1.4")
-
-    def _compute(self, predictions, references):
+    def compute(self, predictions, references, knowledge=None):
         """Returns the scores: bleu"""
         metrics = {}
+
         # bleu
-        bleu = BLEU(lowercase=True, force=False, tokenize=BLEU.TOKENIZER_DEFAULT, smooth_method='exp', smooth_value=None, effective_order=False)
-        stats = sum_of_lists(bleu._extract_corpus_statistics(predictions, [references]))
-        for n in range(1,5):
-            metrics[f'bleu-{n}'] = bleu.compute_bleu(
-                correct=stats[2: 2 + bleu.max_ngram_order],
-                total=stats[2 + bleu.max_ngram_order:],
-                sys_len=int(stats[0]), ref_len=int(stats[1]),
-                smooth_method=bleu.smooth_method, smooth_value=bleu.smooth_value,
-                effective_order=bleu.effective_order,
-                max_ngram_order=n).score
+        metrics.update(get_bleu(predictions, references))
                 
         # unigram f1
-        re_art = re.compile(r'\b(a|an|the)\b')
-        re_punc = re.compile(r'[!"#$%&()*+,-./:;<=>?@\[\]\\^`{|}~_\']')
-        stop_words = set(stopwords.words('english'))
-        def utt2words(s):
-            """Lower text and remove punctuation, articles and extra whitespace."""
-            s = s.lower()
-            s = re_punc.sub(' ', s)
-            s = re_art.sub(' ', s)
-            return s.split()
-
-        metrics['unigram f1'] = []
-        metrics['unigram f1 (non-stop words)'] = []
-        for prediction, reference in zip(predictions, references):
-            pred_items = utt2words(prediction)
-            gold_items = utt2words(reference)
-            for remove_stopwords in [False, True]:
-                if remove_stopwords:
-                    pred_items = [w for w in pred_items if w not in stop_words]
-                    gold_items = [w for w in gold_items if w not in stop_words]
-                common = Counter(pred_items) & Counter(gold_items)
-                num_same = sum(common.values())
-                if num_same == 0:
-                    f1 = 0
-                else:
-                    precision = 1.0 * num_same / len(pred_items)
-                    recall = 1.0 * num_same / len(gold_items)
-                    f1 = (2 * precision * recall) / (precision + recall)
-                if not remove_stopwords:
-                    metrics['unigram f1'].append(f1)
-                else:
-                    metrics['unigram f1 (non-stop words)'].append(f1)
-        metrics['unigram f1'] = np.mean(metrics['unigram f1'])
-        metrics['unigram f1 (non-stop words)'] = np.mean(metrics['unigram f1 (non-stop words)'])
-
+        metrics.update(get_unigram_f1(predictions, references))
+        
         # rouge-1/2/L-fmeasure
-        rouge_types=["rouge1", "rouge2", "rougeL"]
-        scorer = rouge_scorer.RougeScorer(rouge_types=rouge_types, use_stemmer=True)
-        aggregator = scoring.BootstrapAggregator()
-
-        for prediction, reference in zip(predictions, references):
-            score = scorer.score(reference, prediction)
-            aggregator.add_scores(score)
-
-        result = aggregator.aggregate()
-        metrics.update({key: value.mid.fmeasure * 100 for key, value in result.items()})
+        metrics.update(get_rouge(predictions, references))
 
         # meteor
-        alpha=0.9
-        beta=3
-        gamma=0.5
-        if NLTK_VERSION >= version.Version("3.6.5"):
-            scores = [
-                meteor_score.single_meteor_score(
-                    word_tokenize(ref), word_tokenize(pred), alpha=alpha, beta=beta, gamma=gamma
-                )
-                for ref, pred in zip(references, predictions)
-            ]
-        else:
-            scores = [
-                meteor_score.single_meteor_score(ref, pred, alpha=alpha, beta=beta, gamma=gamma)
-                for ref, pred in zip(references, predictions)
-            ]
-        metrics.update({"meteor": np.mean(scores)})
-
-        # inter/intra-distinct-1/2
-        def _ngram(seq, n):
-            for i in range(len(seq) - n + 1):
-                yield tuple(seq[i : i + n])
+        metrics.update(get_meteor(predictions, references))
+
+        # inter-distinct-1/2
+        metrics.update(get_distinct(predictions))
         
-        for k in [1, 2]:
-            inter_cnt = Counter()
-            for prediction in predictions:
-                ngram = Counter(_ngram(utt2words(prediction), k))
-                inter_cnt += ngram
-            metrics[f'distinct-{k}'] = max(len(inter_cnt), 1e-12) / max(sum(inter_cnt.values()), 1e-5)
+        if knowledge is not None:
+            if self.config_name == "nlg":
+                metrics.update(get_nlg_slot_err(predictions, knowledge))
+            elif self.config_name == "kvret":
+                metrics.update(get_kvret_entity_f1(predictions, references, knowledge))
+            elif self.config_name == "opendialkg":
+                metrics.update(get_opendialkg_entity_f1(predictions, references, knowledge))
+            elif self.config_name in ["wow", "personachat"]:
+                metrics.update(get_knowledge_sentences_f1(predictions, knowledge))
 
         return metrics
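+# A usage sketch (assuming the FEATURES mapping defined above):
+#     metric = datasets.load_metric("metric.py", "kvret")
+#     metric.compute(predictions=preds, references=refs, knowledge=kbs)
+# which adds the per-domain entity F1 scores on top of the surface metrics.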
diff --git a/convlab/base_models/t5/key2gen/run.sh b/convlab/base_models/t5/key2gen/run.sh
deleted file mode 100644
index 74e44434ea1ab9e0099e680f2862521f7063049b..0000000000000000000000000000000000000000
--- a/convlab/base_models/t5/key2gen/run.sh
+++ /dev/null
@@ -1,7 +0,0 @@
-for ratio in 0.1 0.01
-do
-    for dial_ids_order in 0 1 2
-    do
-        bash run_persona_fewshot_key2gen.sh ${ratio} ${dial_ids_order}
-    done
-done
\ No newline at end of file
diff --git a/convlab/base_models/t5/key2gen/run2.sh b/convlab/base_models/t5/key2gen/run2.sh
deleted file mode 100644
index 7e7bf5d6a75e72d6c236fbf738a8c7d010f0e18b..0000000000000000000000000000000000000000
--- a/convlab/base_models/t5/key2gen/run2.sh
+++ /dev/null
@@ -1,7 +0,0 @@
-for ratio in 0.1 0.01
-do
-    for dial_ids_order in 0 1 2
-    do
-        bash run_wow_fewshot_key2gen.sh ${ratio} ${dial_ids_order}
-    done
-done
\ No newline at end of file
diff --git a/convlab/base_models/t5/key2gen/run_commongen.sh b/convlab/base_models/t5/key2gen/run_commongen.sh
deleted file mode 100644
index 5952725709afa1848695158b76f04aa81ff328a6..0000000000000000000000000000000000000000
--- a/convlab/base_models/t5/key2gen/run_commongen.sh
+++ /dev/null
@@ -1,85 +0,0 @@
-n_gpus=1
-task_name="commongen"
-dataset_name="commongen"
-speaker="system"
-context_window_size=0
-data_dir="data/${task_name}/${dataset_name}"
-output_dir="output/${task_name}/${dataset_name}"
-cache_dir="../cache"
-logging_dir="${output_dir}/runs"
-train_file="${data_dir}/train.json"
-validation_file="${data_dir}/validation.json"
-test_file="${data_dir}/validation.json"
-metric_name_or_path="../nlg/nlg_metric.py"
-metric_for_best_model="bleu"
-source_column="concepts"
-target_column="text"
-truncation_side="left"
-max_source_length=512
-max_target_length=512
-model_name_or_path="t5-small"
-per_device_train_batch_size=128
-per_device_eval_batch_size=64
-gradient_accumulation_steps=4
-lr=1e-3
-num_train_epochs=10
-
-python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -s ${speaker} -c ${context_window_size}
-
-python ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --train_file ${train_file} \
-    --validation_file ${validation_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --max_source_length ${max_source_length} \
-    --max_target_length ${max_target_length} \
-    --truncation_side ${truncation_side} \
-    --model_name_or_path ${model_name_or_path} \
-    --do_train \
-    --do_eval \
-    --save_strategy epoch \
-    --evaluation_strategy epoch \
-    --save_total_limit 3 \
-    --prediction_loss_only \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_train_batch_size ${per_device_train_batch_size} \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
-    --gradient_accumulation_steps ${gradient_accumulation_steps} \
-    --learning_rate ${lr} \
-    --num_train_epochs ${num_train_epochs} \
-    --adafactor \
-    --gradient_checkpointing
-
-python ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --test_file ${test_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --max_source_length ${max_source_length} \
-    --max_target_length ${max_target_length} \
-    --truncation_side ${truncation_side} \
-    --model_name_or_path ${output_dir} \
-    --do_predict \
-    --predict_with_generate \
-    --metric_name_or_path ${metric_name_or_path} \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_train_batch_size ${per_device_train_batch_size} \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
-    --gradient_accumulation_steps ${gradient_accumulation_steps} \
-    --learning_rate ${lr} \
-    --num_train_epochs ${num_train_epochs} \
-    --adafactor \
-    --gradient_checkpointing
-
-python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
-
-python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
diff --git a/convlab/base_models/t5/key2gen/run_commongen_key2gen.sh b/convlab/base_models/t5/key2gen/run_commongen_key2gen.sh
deleted file mode 100644
index e09d3b464e2685dd8e0d774539f8c8dc4808f99c..0000000000000000000000000000000000000000
--- a/convlab/base_models/t5/key2gen/run_commongen_key2gen.sh
+++ /dev/null
@@ -1,88 +0,0 @@
-n_gpus=1
-task_name="commongen"
-dataset_name="commongen"
-speaker="system"
-context_window_size=0
-data_dir="data/${task_name}/key2gen_${dataset_name}"
-output_dir="output/${task_name}/key2gen_${dataset_name}"
-cache_dir="../cache"
-logging_dir="${output_dir}/runs"
-train_file="${data_dir}/train.json"
-validation_file="${data_dir}/validation.json"
-test_file="${data_dir}/validation.json"
-metric_name_or_path="../nlg/nlg_metric.py"
-metric_for_best_model="bleu"
-source_column="concepts"
-target_column="text"
-source_prefix="keywords: "
-truncation_side="left"
-max_source_length=512
-max_target_length=512
-model_name_or_path="../../gpt/keyword_extraction/output/key2gen_shuffle_noisy/gpt/dailydialog+metalwoz+sgd+tm1+tm2+tm3"
-per_device_train_batch_size=128
-per_device_eval_batch_size=64
-gradient_accumulation_steps=4
-lr=1e-3
-num_train_epochs=10
-
-python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -s ${speaker} -c ${context_window_size} --key2gen
-
-python ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --train_file ${train_file} \
-    --validation_file ${validation_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --source_prefix ${source_prefix} \
-    --max_source_length ${max_source_length} \
-    --max_target_length ${max_target_length} \
-    --truncation_side ${truncation_side} \
-    --model_name_or_path ${model_name_or_path} \
-    --do_train \
-    --do_eval \
-    --save_strategy epoch \
-    --evaluation_strategy epoch \
-    --save_total_limit 3 \
-    --prediction_loss_only \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_train_batch_size ${per_device_train_batch_size} \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
-    --gradient_accumulation_steps ${gradient_accumulation_steps} \
-    --learning_rate ${lr} \
-    --num_train_epochs ${num_train_epochs} \
-    --adafactor \
-    --gradient_checkpointing
-
-python ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --test_file ${test_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --source_prefix ${source_prefix} \
-    --max_source_length ${max_source_length} \
-    --max_target_length ${max_target_length} \
-    --truncation_side ${truncation_side} \
-    --model_name_or_path ${output_dir} \
-    --do_predict \
-    --predict_with_generate \
-    --metric_name_or_path ${metric_name_or_path} \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_train_batch_size ${per_device_train_batch_size} \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
-    --gradient_accumulation_steps ${gradient_accumulation_steps} \
-    --learning_rate ${lr} \
-    --num_train_epochs ${num_train_epochs} \
-    --adafactor \
-    --gradient_checkpointing
-
-python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
-
-python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
diff --git a/convlab/base_models/t5/key2gen/run_dart.sh b/convlab/base_models/t5/key2gen/run_dart.sh
deleted file mode 100644
index 62eb3236f0c468c6b185eb9c5c39e807e3a92800..0000000000000000000000000000000000000000
--- a/convlab/base_models/t5/key2gen/run_dart.sh
+++ /dev/null
@@ -1,85 +0,0 @@
-n_gpus=1
-task_name="dart"
-dataset_name="dart"
-speaker="system"
-context_window_size=0
-data_dir="data/${task_name}/key2gen_${dataset_name}"
-output_dir="output/${task_name}/${dataset_name}"
-cache_dir="../cache"
-logging_dir="${output_dir}/runs"
-train_file="${data_dir}/train.json"
-validation_file="${data_dir}/validation.json"
-test_file="${data_dir}/test.json"
-metric_name_or_path="../nlg/nlg_metric.py"
-metric_for_best_model="bleu"
-source_column="triples"
-target_column="text"
-truncation_side="left"
-max_source_length=512
-max_target_length=512
-model_name_or_path="t5-small"
-per_device_train_batch_size=128
-per_device_eval_batch_size=64
-gradient_accumulation_steps=4
-lr=1e-3
-num_train_epochs=10
-
-# python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -s ${speaker} -c ${context_window_size}
-
-# python ../run_seq2seq.py \
-#     --task_name ${task_name} \
-#     --train_file ${train_file} \
-#     --validation_file ${validation_file} \
-#     --source_column ${source_column} \
-#     --target_column ${target_column} \
-#     --max_source_length ${max_source_length} \
-#     --max_target_length ${max_target_length} \
-#     --truncation_side ${truncation_side} \
-#     --model_name_or_path ${model_name_or_path} \
-#     --do_train \
-#     --do_eval \
-#     --save_strategy epoch \
-#     --evaluation_strategy epoch \
-#     --save_total_limit 3 \
-#     --prediction_loss_only \
-#     --cache_dir ${cache_dir} \
-#     --output_dir ${output_dir} \
-#     --logging_dir ${logging_dir} \
-#     --overwrite_output_dir \
-#     --preprocessing_num_workers 4 \
-#     --per_device_train_batch_size ${per_device_train_batch_size} \
-#     --per_device_eval_batch_size ${per_device_eval_batch_size} \
-#     --gradient_accumulation_steps ${gradient_accumulation_steps} \
-#     --learning_rate ${lr} \
-#     --num_train_epochs ${num_train_epochs} \
-#     --adafactor \
-#     --gradient_checkpointing
-
-python ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --test_file ${test_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --max_source_length ${max_source_length} \
-    --max_target_length ${max_target_length} \
-    --truncation_side ${truncation_side} \
-    --model_name_or_path ${model_name_or_path} \
-    --do_predict \
-    --predict_with_generate \
-    --metric_name_or_path ${metric_name_or_path} \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_train_batch_size ${per_device_train_batch_size} \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
-    --gradient_accumulation_steps ${gradient_accumulation_steps} \
-    --learning_rate ${lr} \
-    --num_train_epochs ${num_train_epochs} \
-    --adafactor \
-    --gradient_checkpointing
-
-# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
-
-# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
diff --git a/convlab/base_models/t5/key2gen/run_dart_key2gen.sh b/convlab/base_models/t5/key2gen/run_dart_key2gen.sh
deleted file mode 100644
index b729da6fdf27ed61dad7a2a48120fbbbaef49afc..0000000000000000000000000000000000000000
--- a/convlab/base_models/t5/key2gen/run_dart_key2gen.sh
+++ /dev/null
@@ -1,88 +0,0 @@
-n_gpus=1
-task_name="dart"
-dataset_name="dart"
-speaker="system"
-context_window_size=0
-data_dir="data/${task_name}/${dataset_name}"
-output_dir="output/${task_name}/key2gen_${dataset_name}"
-cache_dir="../cache"
-logging_dir="${output_dir}/runs"
-train_file="${data_dir}/train.json"
-validation_file="${data_dir}/validation.json"
-test_file="${data_dir}/test.json"
-metric_name_or_path="../nlg/nlg_metric.py"
-metric_for_best_model="bleu"
-source_column="triples"
-target_column="text"
-source_prefix="possible keywords: "
-truncation_side="left"
-max_source_length=512
-max_target_length=512
-model_name_or_path="../../gpt/keyword_extraction/output/key2gen_shuffle_noisy/gpt/dailydialog+metalwoz+sgd+tm1+tm2+tm3"
-per_device_train_batch_size=128
-per_device_eval_batch_size=64
-gradient_accumulation_steps=4
-lr=1e-3
-num_train_epochs=10
-
-# python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -s ${speaker} -c ${context_window_size} --key2gen
-
-# python ../run_seq2seq.py \
-#     --task_name ${task_name} \
-#     --train_file ${train_file} \
-#     --validation_file ${validation_file} \
-#     --source_column ${source_column} \
-#     --target_column ${target_column} \
-#     --source_prefix "${source_prefix}" \
-#     --max_source_length ${max_source_length} \
-#     --max_target_length ${max_target_length} \
-#     --truncation_side ${truncation_side} \
-#     --model_name_or_path ${model_name_or_path} \
-#     --do_train \
-#     --do_eval \
-#     --save_strategy epoch \
-#     --evaluation_strategy epoch \
-#     --save_total_limit 3 \
-#     --prediction_loss_only \
-#     --cache_dir ${cache_dir} \
-#     --output_dir ${output_dir} \
-#     --logging_dir ${logging_dir} \
-#     --overwrite_output_dir \
-#     --preprocessing_num_workers 4 \
-#     --per_device_train_batch_size ${per_device_train_batch_size} \
-#     --per_device_eval_batch_size ${per_device_eval_batch_size} \
-#     --gradient_accumulation_steps ${gradient_accumulation_steps} \
-#     --learning_rate ${lr} \
-#     --num_train_epochs ${num_train_epochs} \
-#     --adafactor \
-#     --gradient_checkpointing
-
-python ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --test_file ${test_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --source_prefix "${source_prefix}" \
-    --max_source_length ${max_source_length} \
-    --max_target_length ${max_target_length} \
-    --truncation_side ${truncation_side} \
-    --model_name_or_path ${model_name_or_path} \
-    --do_predict \
-    --predict_with_generate \
-    --metric_name_or_path ${metric_name_or_path} \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_train_batch_size ${per_device_train_batch_size} \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
-    --gradient_accumulation_steps ${gradient_accumulation_steps} \
-    --learning_rate ${lr} \
-    --num_train_epochs ${num_train_epochs} \
-    --adafactor \
-    --gradient_checkpointing
-
-# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
-
-# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
diff --git a/convlab/base_models/t5/key2gen/run_kvret.sh b/convlab/base_models/t5/key2gen/run_kvret.sh
deleted file mode 100644
index b960ed18735d804f82116b4307223b4245f2b368..0000000000000000000000000000000000000000
--- a/convlab/base_models/t5/key2gen/run_kvret.sh
+++ /dev/null
@@ -1,85 +0,0 @@
-n_gpus=1
-task_name="kvret"
-dataset_name="kvret"
-speaker="system"
-data_dir="data/${task_name}/${dataset_name}"
-output_dir="output/${task_name}/${dataset_name}"
-cache_dir="../cache"
-logging_dir="${output_dir}/runs"
-train_file="${data_dir}/train.json"
-validation_file="${data_dir}/validation.json"
-test_file="${data_dir}/test.json"
-metric_name_or_path="../nlg/nlg_metric.py"
-metric_for_best_model="bleu"
-source_column="context+db"
-target_column="response"
-truncation_side="left"
-max_source_length=1024
-max_target_length=512
-model_name_or_path="t5-small"
-per_device_train_batch_size=32
-per_device_eval_batch_size=64
-gradient_accumulation_steps=4
-lr=1e-3
-num_train_epochs=10
-
-python create_data_key2gen.py -t ${task_name} -d ${dataset_name}
-
-python ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --train_file ${train_file} \
-    --validation_file ${validation_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --max_source_length ${max_source_length} \
-    --max_target_length ${max_target_length} \
-    --truncation_side ${truncation_side} \
-    --model_name_or_path ${model_name_or_path} \
-    --do_train \
-    --do_eval \
-    --save_strategy epoch \
-    --evaluation_strategy epoch \
-    --save_total_limit 1 \
-    --prediction_loss_only \
-    --load_best_model_at_end \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_train_batch_size ${per_device_train_batch_size} \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
-    --gradient_accumulation_steps ${gradient_accumulation_steps} \
-    --learning_rate ${lr} \
-    --num_train_epochs ${num_train_epochs} \
-    --adafactor \
-    --gradient_checkpointing
-
-python ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --test_file ${test_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --max_source_length ${max_source_length} \
-    --max_target_length ${max_target_length} \
-    --truncation_side ${truncation_side} \
-    --model_name_or_path ${output_dir} \
-    --do_predict \
-    --predict_with_generate \
-    --metric_name_or_path ${metric_name_or_path} \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_train_batch_size ${per_device_train_batch_size} \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
-    --gradient_accumulation_steps ${gradient_accumulation_steps} \
-    --learning_rate ${lr} \
-    --num_train_epochs ${num_train_epochs} \
-    --adafactor \
-    --gradient_checkpointing
-
-# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
-
-# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
diff --git a/convlab/base_models/t5/key2gen/run_kvret_fewshot.sh b/convlab/base_models/t5/key2gen/run_kvret_fewshot.sh
deleted file mode 100644
index a6523d6cea506b6433f2cd853ca6255a28cfd8a6..0000000000000000000000000000000000000000
--- a/convlab/base_models/t5/key2gen/run_kvret_fewshot.sh
+++ /dev/null
@@ -1,87 +0,0 @@
-n_gpus=1
-task_name="kvret"
-dataset_name="kvret"
-speaker="system"
-ratio=$1
-dial_ids_order=$2
-data_dir="data/${task_name}/${dataset_name}_${ratio}_order${dial_ids_order}"
-output_dir="output/${task_name}/${dataset_name}_${ratio}_order${dial_ids_order}"
-cache_dir="../cache"
-logging_dir="${output_dir}/runs"
-train_file="${data_dir}/train.json"
-validation_file="${data_dir}/validation.json"
-test_file="${data_dir}/test.json"
-metric_name_or_path="../nlg/nlg_metric.py"
-metric_for_best_model="bleu"
-source_column="context+db"
-target_column="response"
-truncation_side="left"
-max_source_length=1024
-max_target_length=512
-model_name_or_path="t5-small"
-per_device_train_batch_size=32
-per_device_eval_batch_size=64
-gradient_accumulation_steps=4
-lr=1e-3
-num_train_epochs=100
-
-python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -r ${ratio} -o ${dial_ids_order}
-
-python ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --train_file ${train_file} \
-    --validation_file ${validation_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --max_source_length ${max_source_length} \
-    --max_target_length ${max_target_length} \
-    --truncation_side ${truncation_side} \
-    --model_name_or_path ${model_name_or_path} \
-    --do_train \
-    --do_eval \
-    --save_strategy epoch \
-    --evaluation_strategy epoch \
-    --save_total_limit 1 \
-    --prediction_loss_only \
-    --load_best_model_at_end \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_train_batch_size ${per_device_train_batch_size} \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
-    --gradient_accumulation_steps ${gradient_accumulation_steps} \
-    --learning_rate ${lr} \
-    --num_train_epochs ${num_train_epochs} \
-    --adafactor \
-    --gradient_checkpointing
-
-python ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --test_file ${test_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --max_source_length ${max_source_length} \
-    --max_target_length ${max_target_length} \
-    --truncation_side ${truncation_side} \
-    --model_name_or_path ${output_dir} \
-    --do_predict \
-    --predict_with_generate \
-    --metric_name_or_path ${metric_name_or_path} \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_train_batch_size ${per_device_train_batch_size} \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
-    --gradient_accumulation_steps ${gradient_accumulation_steps} \
-    --learning_rate ${lr} \
-    --num_train_epochs ${num_train_epochs} \
-    --adafactor \
-    --gradient_checkpointing
-
-# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
-
-# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
diff --git a/convlab/base_models/t5/key2gen/run_kvret_fewshot_key2gen.sh b/convlab/base_models/t5/key2gen/run_kvret_fewshot_key2gen.sh
deleted file mode 100644
index d19ae9c1ca3890896491839c76e3d7beff9bdbfc..0000000000000000000000000000000000000000
--- a/convlab/base_models/t5/key2gen/run_kvret_fewshot_key2gen.sh
+++ /dev/null
@@ -1,87 +0,0 @@
-n_gpus=1
-task_name="kvret"
-dataset_name="kvret"
-speaker="system"
-ratio=$1
-dial_ids_order=$2
-data_dir="data/${task_name}/key2gen_${dataset_name}_${ratio}_order${dial_ids_order}"
-output_dir="output/${task_name}/key2gen_${dataset_name}_${ratio}_order${dial_ids_order}"
-cache_dir="../cache"
-logging_dir="${output_dir}/runs"
-train_file="${data_dir}/train.json"
-validation_file="${data_dir}/validation.json"
-test_file="${data_dir}/test.json"
-metric_name_or_path="../nlg/nlg_metric.py"
-metric_for_best_model="bleu"
-source_column="context+db"
-target_column="response"
-truncation_side="left"
-max_source_length=1024
-max_target_length=512
-model_name_or_path="../../gpt/keyword_extraction/output/key2gen_noisy/gpt/dailydialog+metalwoz+tm1+tm2+tm3"
-per_device_train_batch_size=32
-per_device_eval_batch_size=64
-gradient_accumulation_steps=4
-lr=1e-3
-num_train_epochs=100
-
-python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -r ${ratio} -o ${dial_ids_order} --key2gen
-
-python ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --train_file ${train_file} \
-    --validation_file ${validation_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --max_source_length ${max_source_length} \
-    --max_target_length ${max_target_length} \
-    --truncation_side ${truncation_side} \
-    --model_name_or_path ${model_name_or_path} \
-    --do_train \
-    --do_eval \
-    --save_strategy epoch \
-    --evaluation_strategy epoch \
-    --save_total_limit 1 \
-    --prediction_loss_only \
-    --load_best_model_at_end \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_train_batch_size ${per_device_train_batch_size} \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
-    --gradient_accumulation_steps ${gradient_accumulation_steps} \
-    --learning_rate ${lr} \
-    --num_train_epochs ${num_train_epochs} \
-    --adafactor \
-    --gradient_checkpointing
-
-python ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --test_file ${test_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --max_source_length ${max_source_length} \
-    --max_target_length ${max_target_length} \
-    --truncation_side ${truncation_side} \
-    --model_name_or_path ${output_dir} \
-    --do_predict \
-    --predict_with_generate \
-    --metric_name_or_path ${metric_name_or_path} \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_train_batch_size ${per_device_train_batch_size} \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
-    --gradient_accumulation_steps ${gradient_accumulation_steps} \
-    --learning_rate ${lr} \
-    --num_train_epochs ${num_train_epochs} \
-    --adafactor \
-    --gradient_checkpointing
-
-# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
-
-# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
diff --git a/convlab/base_models/t5/key2gen/run_kvret_key2gen.sh b/convlab/base_models/t5/key2gen/run_kvret_key2gen.sh
deleted file mode 100644
index 44f8dbc6732f8ccd3470b8d5fd62a886898b949c..0000000000000000000000000000000000000000
--- a/convlab/base_models/t5/key2gen/run_kvret_key2gen.sh
+++ /dev/null
@@ -1,85 +0,0 @@
-n_gpus=1
-task_name="kvret"
-dataset_name="kvret"
-speaker="system"
-data_dir="data/${task_name}/${dataset_name}"
-output_dir="output/${task_name}/key2gen_${dataset_name}"
-cache_dir="../cache"
-logging_dir="${output_dir}/runs"
-train_file="${data_dir}/train.json"
-validation_file="${data_dir}/validation.json"
-test_file="${data_dir}/test.json"
-metric_name_or_path="../nlg/nlg_metric.py"
-metric_for_best_model="bleu"
-source_column="context+db"
-target_column="response"
-truncation_side="left"
-max_source_length=1024
-max_target_length=512
-model_name_or_path="../../gpt/keyword_extraction/output/key2gen_noisy/gpt/dailydialog+metalwoz+tm1+tm2+tm3"
-per_device_train_batch_size=32
-per_device_eval_batch_size=64
-gradient_accumulation_steps=4
-lr=1e-3
-num_train_epochs=10
-
-python create_data_key2gen.py -t ${task_name} -d ${dataset_name}
-
-python ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --train_file ${train_file} \
-    --validation_file ${validation_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --max_source_length ${max_source_length} \
-    --max_target_length ${max_target_length} \
-    --truncation_side ${truncation_side} \
-    --model_name_or_path ${model_name_or_path} \
-    --do_train \
-    --do_eval \
-    --save_strategy epoch \
-    --evaluation_strategy epoch \
-    --save_total_limit 1 \
-    --prediction_loss_only \
-    --load_best_model_at_end \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_train_batch_size ${per_device_train_batch_size} \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
-    --gradient_accumulation_steps ${gradient_accumulation_steps} \
-    --learning_rate ${lr} \
-    --num_train_epochs ${num_train_epochs} \
-    --adafactor \
-    --gradient_checkpointing
-
-python ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --test_file ${test_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --max_source_length ${max_source_length} \
-    --max_target_length ${max_target_length} \
-    --truncation_side ${truncation_side} \
-    --model_name_or_path ${output_dir} \
-    --do_predict \
-    --predict_with_generate \
-    --metric_name_or_path ${metric_name_or_path} \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_train_batch_size ${per_device_train_batch_size} \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
-    --gradient_accumulation_steps ${gradient_accumulation_steps} \
-    --learning_rate ${lr} \
-    --num_train_epochs ${num_train_epochs} \
-    --adafactor \
-    --gradient_checkpointing
-
-# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
-
-# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
diff --git a/convlab/base_models/t5/key2gen/run_nlg.sh b/convlab/base_models/t5/key2gen/run_nlg.sh
deleted file mode 100644
index ec225ef719f25e970bd00ca662491fe76fd6b8af..0000000000000000000000000000000000000000
--- a/convlab/base_models/t5/key2gen/run_nlg.sh
+++ /dev/null
@@ -1,86 +0,0 @@
-n_gpus=1
-task_name="nlg"
-dataset_name=$1
-speaker="system"
-context_window_size=$2
-data_dir="data/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
-output_dir="output/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
-cache_dir="../cache"
-logging_dir="${output_dir}/runs"
-train_file="${data_dir}/train.json"
-validation_file="${data_dir}/validation.json"
-test_file="${data_dir}/test.json"
-metric_name_or_path="../nlg/nlg_metric.py"
-metric_for_best_model="bleu"
-source_column="context+da"
-target_column="response"
-truncation_side="left"
-max_source_length=512
-max_target_length=512
-model_name_or_path="t5-small"
-per_device_train_batch_size=128
-per_device_eval_batch_size=64
-gradient_accumulation_steps=4
-lr=1e-3
-num_train_epochs=10
-
-# python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -s ${speaker} -c ${context_window_size}
-
-# python ../run_seq2seq.py \
-#     --task_name ${task_name} \
-#     --train_file ${train_file} \
-#     --validation_file ${validation_file} \
-#     --source_column ${source_column} \
-#     --target_column ${target_column} \
-#     --max_source_length ${max_source_length} \
-#     --max_target_length ${max_target_length} \
-#     --truncation_side ${truncation_side} \
-#     --model_name_or_path ${model_name_or_path} \
-#     --do_train \
-#     --do_eval \
-#     --save_strategy epoch \
-#     --evaluation_strategy epoch \
-#     --save_total_limit 1 \
-#     --prediction_loss_only \
-#     --load_best_model_at_end \
-#     --cache_dir ${cache_dir} \
-#     --output_dir ${output_dir} \
-#     --logging_dir ${logging_dir} \
-#     --overwrite_output_dir \
-#     --preprocessing_num_workers 4 \
-#     --per_device_train_batch_size ${per_device_train_batch_size} \
-#     --per_device_eval_batch_size ${per_device_eval_batch_size} \
-#     --gradient_accumulation_steps ${gradient_accumulation_steps} \
-#     --learning_rate ${lr} \
-#     --num_train_epochs ${num_train_epochs} \
-#     --adafactor \
-#     --gradient_checkpointing
-
-# python ../run_seq2seq.py \
-#     --task_name ${task_name} \
-#     --test_file ${test_file} \
-#     --source_column ${source_column} \
-#     --target_column ${target_column} \
-#     --max_source_length ${max_source_length} \
-#     --max_target_length ${max_target_length} \
-#     --truncation_side ${truncation_side} \
-#     --model_name_or_path ${output_dir} \
-#     --do_predict \
-#     --predict_with_generate \
-#     --metric_name_or_path ${metric_name_or_path} \
-#     --cache_dir ${cache_dir} \
-#     --output_dir ${output_dir} \
-#     --logging_dir ${logging_dir} \
-#     --overwrite_output_dir \
-#     --preprocessing_num_workers 4 \
-#     --per_device_train_batch_size ${per_device_train_batch_size} \
-#     --per_device_eval_batch_size ${per_device_eval_batch_size} \
-#     --gradient_accumulation_steps ${gradient_accumulation_steps} \
-#     --learning_rate ${lr} \
-#     --num_train_epochs ${num_train_epochs} \
-#     --adafactor \
-#     --gradient_checkpointing
-
-python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
-
-python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
diff --git a/convlab/base_models/t5/key2gen/run_nlg_fewshot.sh b/convlab/base_models/t5/key2gen/run_nlg_fewshot.sh
deleted file mode 100644
index f8ab2ee0f5969f2c27552de4fcf82ede5526c384..0000000000000000000000000000000000000000
--- a/convlab/base_models/t5/key2gen/run_nlg_fewshot.sh
+++ /dev/null
@@ -1,88 +0,0 @@
-n_gpus=1
-task_name="nlg"
-dataset_name=$1
-speaker="system"
-context_window_size=$2
-ratio=$3
-dial_ids_order=$4
-data_dir="data/${task_name}/${dataset_name}_${ratio}_order${dial_ids_order}/${speaker}/context_${context_window_size}"
-output_dir="output/${task_name}/${dataset_name}_${ratio}_order${dial_ids_order}/${speaker}/context_${context_window_size}"
-cache_dir="../cache"
-logging_dir="${output_dir}/runs"
-train_file="${data_dir}/train.json"
-validation_file="${data_dir}/validation.json"
-test_file="${data_dir}/test.json"
-metric_name_or_path="../nlg/nlg_metric.py"
-metric_for_best_model="bleu"
-source_column="context+da"
-target_column="response"
-truncation_side="left"
-max_source_length=512
-max_target_length=512
-model_name_or_path="t5-small"
-per_device_train_batch_size=128
-per_device_eval_batch_size=64
-gradient_accumulation_steps=4
-lr=1e-3
-num_train_epochs=100
-
-# python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -s ${speaker} -c ${context_window_size} -r ${ratio} -o ${dial_ids_order}
-
-# python ../run_seq2seq.py \
-#     --task_name ${task_name} \
-#     --train_file ${train_file} \
-#     --validation_file ${validation_file} \
-#     --source_column ${source_column} \
-#     --target_column ${target_column} \
-#     --max_source_length ${max_source_length} \
-#     --max_target_length ${max_target_length} \
-#     --truncation_side ${truncation_side} \
-#     --model_name_or_path ${model_name_or_path} \
-#     --do_train \
-#     --do_eval \
-#     --save_strategy epoch \
-#     --evaluation_strategy epoch \
-#     --save_total_limit 1 \
-#     --prediction_loss_only \
-#     --load_best_model_at_end \
-#     --cache_dir ${cache_dir} \
-#     --output_dir ${output_dir} \
-#     --logging_dir ${logging_dir} \
-#     --overwrite_output_dir \
-#     --preprocessing_num_workers 4 \
-#     --per_device_train_batch_size ${per_device_train_batch_size} \
-#     --per_device_eval_batch_size ${per_device_eval_batch_size} \
-#     --gradient_accumulation_steps ${gradient_accumulation_steps} \
-#     --learning_rate ${lr} \
-#     --num_train_epochs ${num_train_epochs} \
-#     --adafactor \
-#     --gradient_checkpointing
-
-# python ../run_seq2seq.py \
-#     --task_name ${task_name} \
-#     --test_file ${test_file} \
-#     --source_column ${source_column} \
-#     --target_column ${target_column} \
-#     --max_source_length ${max_source_length} \
-#     --max_target_length ${max_target_length} \
-#     --truncation_side ${truncation_side} \
-#     --model_name_or_path ${output_dir} \
-#     --do_predict \
-#     --predict_with_generate \
-#     --metric_name_or_path ${metric_name_or_path} \
-#     --cache_dir ${cache_dir} \
-#     --output_dir ${output_dir} \
-#     --logging_dir ${logging_dir} \
-#     --overwrite_output_dir \
-#     --preprocessing_num_workers 4 \
-#     --per_device_train_batch_size ${per_device_train_batch_size} \
-#     --per_device_eval_batch_size ${per_device_eval_batch_size} \
-#     --gradient_accumulation_steps ${gradient_accumulation_steps} \
-#     --learning_rate ${lr} \
-#     --num_train_epochs ${num_train_epochs} \
-#     --adafactor \
-#     --gradient_checkpointing
-
-python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json -o ${dial_ids_order}
-
-python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
diff --git a/convlab/base_models/t5/key2gen/run_nlg_fewshot_key2gen.sh b/convlab/base_models/t5/key2gen/run_nlg_fewshot_key2gen.sh
deleted file mode 100644
index c13a187799bcfb6364d5d9d8a22f4ede96c87897..0000000000000000000000000000000000000000
--- a/convlab/base_models/t5/key2gen/run_nlg_fewshot_key2gen.sh
+++ /dev/null
@@ -1,88 +0,0 @@
-n_gpus=1
-task_name="nlg"
-dataset_name=$1
-speaker="system"
-context_window_size=$2
-ratio=$3
-dial_ids_order=$4
-data_dir="data/${task_name}/key2gen_${dataset_name}_${ratio}_order${dial_ids_order}/${speaker}/context_${context_window_size}"
-output_dir="output/${task_name}/key2gen_${dataset_name}_${ratio}_order${dial_ids_order}/${speaker}/context_${context_window_size}"
-cache_dir="../cache"
-logging_dir="${output_dir}/runs"
-train_file="${data_dir}/train.json"
-validation_file="${data_dir}/validation.json"
-test_file="${data_dir}/test.json"
-metric_name_or_path="../nlg/nlg_metric.py"
-metric_for_best_model="bleu"
-source_column="context+da"
-target_column="response"
-truncation_side="left"
-max_source_length=512
-max_target_length=512
-model_name_or_path="../../gpt/keyword_extraction/output/key2gen_noisy/gpt/dailydialog+metalwoz+tm1+tm2+tm3"
-per_device_train_batch_size=128
-per_device_eval_batch_size=64
-gradient_accumulation_steps=4
-lr=1e-3
-num_train_epochs=100
-
-# python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -s ${speaker} -c ${context_window_size} -r ${ratio} -o ${dial_ids_order} --key2gen
-
-# python ../run_seq2seq.py \
-#     --task_name ${task_name} \
-#     --train_file ${train_file} \
-#     --validation_file ${validation_file} \
-#     --source_column ${source_column} \
-#     --target_column ${target_column} \
-#     --max_source_length ${max_source_length} \
-#     --max_target_length ${max_target_length} \
-#     --truncation_side ${truncation_side} \
-#     --model_name_or_path ${model_name_or_path} \
-#     --do_train \
-#     --do_eval \
-#     --save_strategy epoch \
-#     --evaluation_strategy epoch \
-#     --save_total_limit 1 \
-#     --prediction_loss_only \
-#     --load_best_model_at_end \
-#     --cache_dir ${cache_dir} \
-#     --output_dir ${output_dir} \
-#     --logging_dir ${logging_dir} \
-#     --overwrite_output_dir \
-#     --preprocessing_num_workers 4 \
-#     --per_device_train_batch_size ${per_device_train_batch_size} \
-#     --per_device_eval_batch_size ${per_device_eval_batch_size} \
-#     --gradient_accumulation_steps ${gradient_accumulation_steps} \
-#     --learning_rate ${lr} \
-#     --num_train_epochs ${num_train_epochs} \
-#     --adafactor \
-#     --gradient_checkpointing
-
-# python ../run_seq2seq.py \
-#     --task_name ${task_name} \
-#     --test_file ${test_file} \
-#     --source_column ${source_column} \
-#     --target_column ${target_column} \
-#     --max_source_length ${max_source_length} \
-#     --max_target_length ${max_target_length} \
-#     --truncation_side ${truncation_side} \
-#     --model_name_or_path ${output_dir} \
-#     --do_predict \
-#     --predict_with_generate \
-#     --metric_name_or_path ${metric_name_or_path} \
-#     --cache_dir ${cache_dir} \
-#     --output_dir ${output_dir} \
-#     --logging_dir ${logging_dir} \
-#     --overwrite_output_dir \
-#     --preprocessing_num_workers 4 \
-#     --per_device_train_batch_size ${per_device_train_batch_size} \
-#     --per_device_eval_batch_size ${per_device_eval_batch_size} \
-#     --gradient_accumulation_steps ${gradient_accumulation_steps} \
-#     --learning_rate ${lr} \
-#     --num_train_epochs ${num_train_epochs} \
-#     --adafactor \
-#     --gradient_checkpointing
-
-python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json -o ${dial_ids_order}
-
-python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
diff --git a/convlab/base_models/t5/key2gen/run_nlg_key2gen.sh b/convlab/base_models/t5/key2gen/run_nlg_key2gen.sh
deleted file mode 100644
index f9b89aae85ca2963a162acde2f670caee0f1790e..0000000000000000000000000000000000000000
--- a/convlab/base_models/t5/key2gen/run_nlg_key2gen.sh
+++ /dev/null
@@ -1,86 +0,0 @@
-n_gpus=1
-task_name="nlg"
-dataset_name=$1
-speaker="system"
-context_window_size=$2
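-# Usage: bash run_nlg_key2gen.sh <dataset_name> <context_window_size>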
-data_dir="data/${task_name}/key2gen_${dataset_name}/${speaker}/context_${context_window_size}"
-output_dir="output/${task_name}/key2gen_${dataset_name}/${speaker}/context_${context_window_size}"
-cache_dir="../cache"
-logging_dir="${output_dir}/runs"
-train_file="${data_dir}/train.json"
-validation_file="${data_dir}/validation.json"
-test_file="${data_dir}/test.json"
-metric_name_or_path="../nlg/nlg_metric.py"
-metric_for_best_model="bleu"
-source_column="context+da"
-target_column="response"
-truncation_side="left"
-max_source_length=512
-max_target_length=512
-model_name_or_path="../../gpt/keyword_extraction/output/key2gen_noisy/gpt/dailydialog+metalwoz+tm1+tm2+tm3"
-per_device_train_batch_size=128
-per_device_eval_batch_size=64
-gradient_accumulation_steps=4
-lr=1e-3
-num_train_epochs=10
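-# 10 epochs for the full-data setting; the few-shot counterpart above uses 100,
-# presumably to compensate for its much smaller training set.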
-
-# python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -s ${speaker} -c ${context_window_size} --key2gen
-
-# python ../run_seq2seq.py \
-#     --task_name ${task_name} \
-#     --train_file ${train_file} \
-#     --validation_file ${validation_file} \
-#     --source_column ${source_column} \
-#     --target_column ${target_column} \
-#     --max_source_length ${max_source_length} \
-#     --max_target_length ${max_target_length} \
-#     --truncation_side ${truncation_side} \
-#     --model_name_or_path ${model_name_or_path} \
-#     --do_train \
-#     --do_eval \
-#     --save_strategy epoch \
-#     --evaluation_strategy epoch \
-#     --save_total_limit 1 \
-#     --prediction_loss_only \
-#     --load_best_model_at_end \
-#     --cache_dir ${cache_dir} \
-#     --output_dir ${output_dir} \
-#     --logging_dir ${logging_dir} \
-#     --overwrite_output_dir \
-#     --preprocessing_num_workers 4 \
-#     --per_device_train_batch_size ${per_device_train_batch_size} \
-#     --per_device_eval_batch_size ${per_device_eval_batch_size} \
-#     --gradient_accumulation_steps ${gradient_accumulation_steps} \
-#     --learning_rate ${lr} \
-#     --num_train_epochs ${num_train_epochs} \
-#     --adafactor \
-#     --gradient_checkpointing
-
-# python ../run_seq2seq.py \
-#     --task_name ${task_name} \
-#     --test_file ${test_file} \
-#     --source_column ${source_column} \
-#     --target_column ${target_column} \
-#     --max_source_length ${max_source_length} \
-#     --max_target_length ${max_target_length} \
-#     --truncation_side ${truncation_side} \
-#     --model_name_or_path ${output_dir} \
-#     --do_predict \
-#     --predict_with_generate \
-#     --metric_name_or_path ${metric_name_or_path} \
-#     --cache_dir ${cache_dir} \
-#     --output_dir ${output_dir} \
-#     --logging_dir ${logging_dir} \
-#     --overwrite_output_dir \
-#     --preprocessing_num_workers 4 \
-#     --per_device_train_batch_size ${per_device_train_batch_size} \
-#     --per_device_eval_batch_size ${per_device_eval_batch_size} \
-#     --gradient_accumulation_steps ${gradient_accumulation_steps} \
-#     --learning_rate ${lr} \
-#     --num_train_epochs ${num_train_epochs} \
-#     --adafactor \
-#     --gradient_checkpointing
-
-python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
-
-python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
diff --git a/convlab/base_models/t5/key2gen/run_persona.sh b/convlab/base_models/t5/key2gen/run_persona.sh
deleted file mode 100644
index 20ae23608ca8ebcd05cb937b53a088df924e0d66..0000000000000000000000000000000000000000
--- a/convlab/base_models/t5/key2gen/run_persona.sh
+++ /dev/null
@@ -1,85 +0,0 @@
-n_gpus=1
-task_name="personachat"
-dataset_name="personachat"
-speaker="system"
-data_dir="data/${task_name}/${dataset_name}"
-output_dir="output/${task_name}/${dataset_name}"
-cache_dir="../cache"
-logging_dir="${output_dir}/runs"
-train_file="${data_dir}/train.json"
-validation_file="${data_dir}/validation.json"
-test_file="${data_dir}/test.json"
-metric_name_or_path="metric.py"
-metric_for_best_model="bleu"
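-# Note: metric_for_best_model is set here (as in the sibling scripts) but never
-# passed to run_seq2seq.py, so --load_best_model_at_end selects checkpoints by
-# eval loss, the Hugging Face Trainer default.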
-source_column="context+persona"
-target_column="response"
-truncation_side="left"
-max_source_length=1024
-max_target_length=512
-model_name_or_path="t5-small"
-per_device_train_batch_size=32
-per_device_eval_batch_size=64
-gradient_accumulation_steps=4
-lr=1e-3
-num_train_epochs=10
-
-python create_data_key2gen.py -t ${task_name} -d ${dataset_name}
-
-python ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --train_file ${train_file} \
-    --validation_file ${validation_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --max_source_length ${max_source_length} \
-    --max_target_length ${max_target_length} \
-    --truncation_side ${truncation_side} \
-    --model_name_or_path ${model_name_or_path} \
-    --do_train \
-    --do_eval \
-    --save_strategy epoch \
-    --evaluation_strategy epoch \
-    --save_total_limit 1 \
-    --prediction_loss_only \
-    --load_best_model_at_end \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_train_batch_size ${per_device_train_batch_size} \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
-    --gradient_accumulation_steps ${gradient_accumulation_steps} \
-    --learning_rate ${lr} \
-    --num_train_epochs ${num_train_epochs} \
-    --adafactor \
-    --gradient_checkpointing
-
-python ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --test_file ${test_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --max_source_length ${max_source_length} \
-    --max_target_length ${max_target_length} \
-    --truncation_side ${truncation_side} \
-    --model_name_or_path ${output_dir} \
-    --do_predict \
-    --predict_with_generate \
-    --metric_name_or_path ${metric_name_or_path} \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_train_batch_size ${per_device_train_batch_size} \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
-    --gradient_accumulation_steps ${gradient_accumulation_steps} \
-    --learning_rate ${lr} \
-    --num_train_epochs ${num_train_epochs} \
-    --adafactor \
-    --gradient_checkpointing
-
-# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
-
-# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
diff --git a/convlab/base_models/t5/key2gen/run_persona_fewshot.sh b/convlab/base_models/t5/key2gen/run_persona_fewshot.sh
deleted file mode 100644
index 4ecf435e52293d2d8cc03411940ce55f5e73f3ab..0000000000000000000000000000000000000000
--- a/convlab/base_models/t5/key2gen/run_persona_fewshot.sh
+++ /dev/null
@@ -1,87 +0,0 @@
-n_gpus=1
-task_name="personachat"
-dataset_name="personachat"
-speaker="system"
-ratio=$1
-dial_ids_order=$2
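-# Usage: bash run_persona_fewshot.sh <ratio> <dial_ids_order>
-# ratio is presumably the fraction of training dialogs kept, and dial_ids_order
-# the index of the random ordering used to sample them.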
-data_dir="data/${task_name}/${dataset_name}_${ratio}_order${dial_ids_order}"
-output_dir="output/${task_name}/${dataset_name}_${ratio}_order${dial_ids_order}"
-cache_dir="../cache"
-logging_dir="${output_dir}/runs"
-train_file="${data_dir}/train.json"
-validation_file="${data_dir}/validation.json"
-test_file="${data_dir}/test.json"
-metric_name_or_path="metric.py"
-metric_for_best_model="bleu"
-source_column="context+persona"
-target_column="response"
-truncation_side="left"
-max_source_length=1024
-max_target_length=512
-model_name_or_path="t5-small"
-per_device_train_batch_size=32
-per_device_eval_batch_size=64
-gradient_accumulation_steps=4
-lr=1e-3
-num_train_epochs=100
-
-python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -r ${ratio} -o ${dial_ids_order}
-
-python ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --train_file ${train_file} \
-    --validation_file ${validation_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --max_source_length ${max_source_length} \
-    --max_target_length ${max_target_length} \
-    --truncation_side ${truncation_side} \
-    --model_name_or_path ${model_name_or_path} \
-    --do_train \
-    --do_eval \
-    --save_strategy epoch \
-    --evaluation_strategy epoch \
-    --save_total_limit 1 \
-    --prediction_loss_only \
-    --load_best_model_at_end \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_train_batch_size ${per_device_train_batch_size} \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
-    --gradient_accumulation_steps ${gradient_accumulation_steps} \
-    --learning_rate ${lr} \
-    --num_train_epochs ${num_train_epochs} \
-    --adafactor \
-    --gradient_checkpointing
-
-python ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --test_file ${test_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --max_source_length ${max_source_length} \
-    --max_target_length ${max_target_length} \
-    --truncation_side ${truncation_side} \
-    --model_name_or_path ${output_dir} \
-    --do_predict \
-    --predict_with_generate \
-    --metric_name_or_path ${metric_name_or_path} \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_train_batch_size ${per_device_train_batch_size} \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
-    --gradient_accumulation_steps ${gradient_accumulation_steps} \
-    --learning_rate ${lr} \
-    --num_train_epochs ${num_train_epochs} \
-    --adafactor \
-    --gradient_checkpointing
-
-# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
-
-# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
diff --git a/convlab/base_models/t5/key2gen/run_persona_fewshot_key2gen.sh b/convlab/base_models/t5/key2gen/run_persona_fewshot_key2gen.sh
deleted file mode 100644
index 6b95839a376e4857c387535ef0a47a5317530498..0000000000000000000000000000000000000000
--- a/convlab/base_models/t5/key2gen/run_persona_fewshot_key2gen.sh
+++ /dev/null
@@ -1,87 +0,0 @@
-n_gpus=1
-task_name="personachat"
-dataset_name="personachat"
-speaker="system"
-ratio=$1
-dial_ids_order=$2
-data_dir="data/${task_name}/key2gen_${dataset_name}_${ratio}_order${dial_ids_order}"
-output_dir="output/${task_name}/key2gen_${dataset_name}_${ratio}_order${dial_ids_order}"
-cache_dir="../cache"
-logging_dir="${output_dir}/runs"
-train_file="${data_dir}/train.json"
-validation_file="${data_dir}/validation.json"
-test_file="${data_dir}/test.json"
-metric_name_or_path="metric.py"
-metric_for_best_model="bleu"
-source_column="context+persona"
-target_column="response"
-truncation_side="left"
-max_source_length=1024
-max_target_length=512
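-# Unlike the NLG key2gen scripts above, this checkpoint's pretraining mixture
-# also includes wikidialog.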
-model_name_or_path="../../gpt/keyword_extraction/output/key2gen_noisy/gpt/dailydialog+metalwoz+tm1+tm2+tm3+wikidialog"
-per_device_train_batch_size=32
-per_device_eval_batch_size=64
-gradient_accumulation_steps=4
-lr=1e-3
-num_train_epochs=100
-
-python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -r ${ratio} -o ${dial_ids_order} --key2gen
-
-python ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --train_file ${train_file} \
-    --validation_file ${validation_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --max_source_length ${max_source_length} \
-    --max_target_length ${max_target_length} \
-    --truncation_side ${truncation_side} \
-    --model_name_or_path ${model_name_or_path} \
-    --do_train \
-    --do_eval \
-    --save_strategy epoch \
-    --evaluation_strategy epoch \
-    --save_total_limit 1 \
-    --prediction_loss_only \
-    --load_best_model_at_end \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_train_batch_size ${per_device_train_batch_size} \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
-    --gradient_accumulation_steps ${gradient_accumulation_steps} \
-    --learning_rate ${lr} \
-    --num_train_epochs ${num_train_epochs} \
-    --adafactor \
-    --gradient_checkpointing
-
-python ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --test_file ${test_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --max_source_length ${max_source_length} \
-    --max_target_length ${max_target_length} \
-    --truncation_side ${truncation_side} \
-    --model_name_or_path ${output_dir} \
-    --do_predict \
-    --predict_with_generate \
-    --metric_name_or_path ${metric_name_or_path} \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_train_batch_size ${per_device_train_batch_size} \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
-    --gradient_accumulation_steps ${gradient_accumulation_steps} \
-    --learning_rate ${lr} \
-    --num_train_epochs ${num_train_epochs} \
-    --adafactor \
-    --gradient_checkpointing
-
-# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
-
-# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
diff --git a/convlab/base_models/t5/key2gen/run_persona_key2gen.sh b/convlab/base_models/t5/key2gen/run_persona_key2gen.sh
deleted file mode 100644
index a2fea9b1fea8540731fd7e3d499156bfe7025690..0000000000000000000000000000000000000000
--- a/convlab/base_models/t5/key2gen/run_persona_key2gen.sh
+++ /dev/null
@@ -1,85 +0,0 @@
-n_gpus=1
-task_name="personachat"
-dataset_name="personachat"
-speaker="system"
-data_dir="data/${task_name}/key2gen_${dataset_name}"
-output_dir="output/${task_name}/key2gen_${dataset_name}"
-cache_dir="../cache"
-logging_dir="${output_dir}/runs"
-train_file="${data_dir}/train.json"
-validation_file="${data_dir}/validation.json"
-test_file="${data_dir}/test.json"
-metric_name_or_path="metric.py"
-metric_for_best_model="bleu"
-source_column="context+persona"
-target_column="response"
-truncation_side="left"
-max_source_length=1024
-max_target_length=512
-model_name_or_path="../../gpt/keyword_extraction/output/key2gen_noisy/gpt/dailydialog+metalwoz+tm1+tm2+tm3+wikidialog"
-per_device_train_batch_size=32
-per_device_eval_batch_size=64
-gradient_accumulation_steps=4
-lr=1e-3
-num_train_epochs=10
-
-python create_data_key2gen.py -t ${task_name} -d ${dataset_name} --key2gen
-
-python ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --train_file ${train_file} \
-    --validation_file ${validation_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --max_source_length ${max_source_length} \
-    --max_target_length ${max_target_length} \
-    --truncation_side ${truncation_side} \
-    --model_name_or_path ${model_name_or_path} \
-    --do_train \
-    --do_eval \
-    --save_strategy epoch \
-    --evaluation_strategy epoch \
-    --save_total_limit 1 \
-    --prediction_loss_only \
-    --load_best_model_at_end \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_train_batch_size ${per_device_train_batch_size} \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
-    --gradient_accumulation_steps ${gradient_accumulation_steps} \
-    --learning_rate ${lr} \
-    --num_train_epochs ${num_train_epochs} \
-    --adafactor \
-    --gradient_checkpointing
-
-python ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --test_file ${test_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --max_source_length ${max_source_length} \
-    --max_target_length ${max_target_length} \
-    --truncation_side ${truncation_side} \
-    --model_name_or_path ${output_dir} \
-    --do_predict \
-    --predict_with_generate \
-    --metric_name_or_path ${metric_name_or_path} \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_train_batch_size ${per_device_train_batch_size} \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
-    --gradient_accumulation_steps ${gradient_accumulation_steps} \
-    --learning_rate ${lr} \
-    --num_train_epochs ${num_train_epochs} \
-    --adafactor \
-    --gradient_checkpointing
-
-# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
-
-# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
diff --git a/convlab/base_models/t5/key2gen/run_wow.sh b/convlab/base_models/t5/key2gen/run_wow.sh
deleted file mode 100644
index 0fd9fb78ea5e260ef30fdf00658d4a390421f900..0000000000000000000000000000000000000000
--- a/convlab/base_models/t5/key2gen/run_wow.sh
+++ /dev/null
@@ -1,85 +0,0 @@
-n_gpus=1
-task_name="wow"
-dataset_name="wow"
-speaker="system"
-data_dir="data/${task_name}/${dataset_name}"
-output_dir="output/${task_name}/${dataset_name}"
-cache_dir="../cache"
-logging_dir="${output_dir}/runs"
-train_file="${data_dir}/train.json"
-validation_file="${data_dir}/validation.json"
-test_file="${data_dir}/test_unseen.json"
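-# Wizard-of-Wikipedia is evaluated on its unseen-topic test split here.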
-metric_name_or_path="metric.py"
-metric_for_best_model="bleu"
-source_column="context+knowledge"
-target_column="response"
-truncation_side="left"
-max_source_length=1024
-max_target_length=512
-model_name_or_path="t5-small"
-per_device_train_batch_size=32
-per_device_eval_batch_size=64
-gradient_accumulation_steps=4
-lr=1e-3
-num_train_epochs=10
-
-python create_data_key2gen.py -t ${task_name} -d ${dataset_name}
-
-python ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --train_file ${train_file} \
-    --validation_file ${validation_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --max_source_length ${max_source_length} \
-    --max_target_length ${max_target_length} \
-    --truncation_side ${truncation_side} \
-    --model_name_or_path ${model_name_or_path} \
-    --do_train \
-    --do_eval \
-    --save_strategy epoch \
-    --evaluation_strategy epoch \
-    --save_total_limit 1 \
-    --prediction_loss_only \
-    --load_best_model_at_end \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_train_batch_size ${per_device_train_batch_size} \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
-    --gradient_accumulation_steps ${gradient_accumulation_steps} \
-    --learning_rate ${lr} \
-    --num_train_epochs ${num_train_epochs} \
-    --adafactor \
-    --gradient_checkpointing
-
-python ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --test_file ${test_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --max_source_length ${max_source_length} \
-    --max_target_length ${max_target_length} \
-    --truncation_side ${truncation_side} \
-    --model_name_or_path ${output_dir} \
-    --do_predict \
-    --predict_with_generate \
-    --metric_name_or_path ${metric_name_or_path} \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_train_batch_size ${per_device_train_batch_size} \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
-    --gradient_accumulation_steps ${gradient_accumulation_steps} \
-    --learning_rate ${lr} \
-    --num_train_epochs ${num_train_epochs} \
-    --adafactor \
-    --gradient_checkpointing
-
-# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
-
-# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
diff --git a/convlab/base_models/t5/key2gen/run_wow_fewshot.sh b/convlab/base_models/t5/key2gen/run_wow_fewshot.sh
deleted file mode 100644
index c9cdbe645d771d099403c9f840a08114e005a18d..0000000000000000000000000000000000000000
--- a/convlab/base_models/t5/key2gen/run_wow_fewshot.sh
+++ /dev/null
@@ -1,87 +0,0 @@
-n_gpus=1
-task_name="wow"
-dataset_name="wow"
-speaker="system"
-ratio=$1
-dial_ids_order=$2
-data_dir="data/${task_name}/${dataset_name}_${ratio}_order${dial_ids_order}"
-output_dir="output/${task_name}/${dataset_name}_${ratio}_order${dial_ids_order}"
-cache_dir="../cache"
-logging_dir="${output_dir}/runs"
-train_file="${data_dir}/train.json"
-validation_file="${data_dir}/validation.json"
-test_file="${data_dir}/test_unseen.json"
-metric_name_or_path="metric.py"
-metric_for_best_model="bleu"
-source_column="context+knowledge"
-target_column="response"
-truncation_side="left"
-max_source_length=1024
-max_target_length=512
-model_name_or_path="t5-small"
-per_device_train_batch_size=32
-per_device_eval_batch_size=64
-gradient_accumulation_steps=4
-lr=1e-3
-num_train_epochs=100
-
-python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -r ${ratio} -o ${dial_ids_order}
-
-python ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --train_file ${train_file} \
-    --validation_file ${validation_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --max_source_length ${max_source_length} \
-    --max_target_length ${max_target_length} \
-    --truncation_side ${truncation_side} \
-    --model_name_or_path ${model_name_or_path} \
-    --do_train \
-    --do_eval \
-    --save_strategy epoch \
-    --evaluation_strategy epoch \
-    --save_total_limit 1 \
-    --prediction_loss_only \
-    --load_best_model_at_end \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_train_batch_size ${per_device_train_batch_size} \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
-    --gradient_accumulation_steps ${gradient_accumulation_steps} \
-    --learning_rate ${lr} \
-    --num_train_epochs ${num_train_epochs} \
-    --adafactor \
-    --gradient_checkpointing
-
-python ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --test_file ${test_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --max_source_length ${max_source_length} \
-    --max_target_length ${max_target_length} \
-    --truncation_side ${truncation_side} \
-    --model_name_or_path ${output_dir} \
-    --do_predict \
-    --predict_with_generate \
-    --metric_name_or_path ${metric_name_or_path} \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_train_batch_size ${per_device_train_batch_size} \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
-    --gradient_accumulation_steps ${gradient_accumulation_steps} \
-    --learning_rate ${lr} \
-    --num_train_epochs ${num_train_epochs} \
-    --adafactor \
-    --gradient_checkpointing
-
-# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
-
-# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
diff --git a/convlab/base_models/t5/key2gen/run_wow_fewshot_key2gen.sh b/convlab/base_models/t5/key2gen/run_wow_fewshot_key2gen.sh
deleted file mode 100644
index 6aaf8ca4dd160f8c108281413441e021b5bd0571..0000000000000000000000000000000000000000
--- a/convlab/base_models/t5/key2gen/run_wow_fewshot_key2gen.sh
+++ /dev/null
@@ -1,87 +0,0 @@
-n_gpus=1
-task_name="wow"
-dataset_name="wow"
-speaker="system"
-ratio=$1
-dial_ids_order=$2
-data_dir="data/${task_name}/key2gen_${dataset_name}_${ratio}_order${dial_ids_order}"
-output_dir="output/${task_name}/key2gen_${dataset_name}_${ratio}_order${dial_ids_order}"
-cache_dir="../cache"
-logging_dir="${output_dir}/runs"
-train_file="${data_dir}/train.json"
-validation_file="${data_dir}/validation.json"
-test_file="${data_dir}/test_unseen.json"
-metric_name_or_path="metric.py"
-metric_for_best_model="bleu"
-source_column="context+knowledge"
-target_column="response"
-truncation_side="left"
-max_source_length=1024
-max_target_length=512
-model_name_or_path="../../gpt/keyword_extraction/output/key2gen_noisy/gpt/dailydialog+metalwoz+tm1+tm2+tm3+wikidialog"
-per_device_train_batch_size=32
-per_device_eval_batch_size=64
-gradient_accumulation_steps=4
-lr=1e-3
-num_train_epochs=100
-
-python create_data_key2gen.py -t ${task_name} -d ${dataset_name} -r ${ratio} -o ${dial_ids_order} --key2gen
-
-python ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --train_file ${train_file} \
-    --validation_file ${validation_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --max_source_length ${max_source_length} \
-    --max_target_length ${max_target_length} \
-    --truncation_side ${truncation_side} \
-    --model_name_or_path ${model_name_or_path} \
-    --do_train \
-    --do_eval \
-    --save_strategy epoch \
-    --evaluation_strategy epoch \
-    --save_total_limit 1 \
-    --prediction_loss_only \
-    --load_best_model_at_end \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_train_batch_size ${per_device_train_batch_size} \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
-    --gradient_accumulation_steps ${gradient_accumulation_steps} \
-    --learning_rate ${lr} \
-    --num_train_epochs ${num_train_epochs} \
-    --adafactor \
-    --gradient_checkpointing
-
-python ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --test_file ${test_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --max_source_length ${max_source_length} \
-    --max_target_length ${max_target_length} \
-    --truncation_side ${truncation_side} \
-    --model_name_or_path ${output_dir} \
-    --do_predict \
-    --predict_with_generate \
-    --metric_name_or_path ${metric_name_or_path} \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_train_batch_size ${per_device_train_batch_size} \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
-    --gradient_accumulation_steps ${gradient_accumulation_steps} \
-    --learning_rate ${lr} \
-    --num_train_epochs ${num_train_epochs} \
-    --adafactor \
-    --gradient_checkpointing
-
-# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
-
-# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
diff --git a/convlab/base_models/t5/key2gen/run_wow_key2gen.sh b/convlab/base_models/t5/key2gen/run_wow_key2gen.sh
deleted file mode 100644
index b5badb7f7fefb979f6d9dea201d87e384a403493..0000000000000000000000000000000000000000
--- a/convlab/base_models/t5/key2gen/run_wow_key2gen.sh
+++ /dev/null
@@ -1,85 +0,0 @@
-n_gpus=1
-task_name="wow"
-dataset_name="wow"
-speaker="system"
-data_dir="data/${task_name}/key2gen_${dataset_name}"
-output_dir="output/${task_name}/key2gen_${dataset_name}"
-cache_dir="../cache"
-logging_dir="${output_dir}/runs"
-train_file="${data_dir}/train.json"
-validation_file="${data_dir}/validation.json"
-test_file="${data_dir}/test_unseen.json"
-metric_name_or_path="metric.py"
-metric_for_best_model="bleu"
-source_column="context+knowledge"
-target_column="response"
-truncation_side="left"
-max_source_length=1024
-max_target_length=512
-model_name_or_path="../../gpt/keyword_extraction/output/key2gen_noisy/gpt/dailydialog+metalwoz+tm1+tm2+tm3+wikidialog"
-per_device_train_batch_size=32
-per_device_eval_batch_size=64
-gradient_accumulation_steps=4
-lr=1e-3
-num_train_epochs=10
-
-python create_data_key2gen.py -t ${task_name} -d ${dataset_name} --key2gen
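-# --key2gen presumably rewrites the grounding knowledge into keyword-style
-# prompts; see create_data_key2gen.py for the exact preprocessing.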
-
-python ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --train_file ${train_file} \
-    --validation_file ${validation_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --max_source_length ${max_source_length} \
-    --max_target_length ${max_target_length} \
-    --truncation_side ${truncation_side} \
-    --model_name_or_path ${model_name_or_path} \
-    --do_train \
-    --do_eval \
-    --save_strategy epoch \
-    --evaluation_strategy epoch \
-    --save_total_limit 1 \
-    --prediction_loss_only \
-    --load_best_model_at_end \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_train_batch_size ${per_device_train_batch_size} \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
-    --gradient_accumulation_steps ${gradient_accumulation_steps} \
-    --learning_rate ${lr} \
-    --num_train_epochs ${num_train_epochs} \
-    --adafactor \
-    --gradient_checkpointing
-
-python ../run_seq2seq.py \
-    --task_name ${task_name} \
-    --test_file ${test_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --max_source_length ${max_source_length} \
-    --max_target_length ${max_target_length} \
-    --truncation_side ${truncation_side} \
-    --model_name_or_path ${output_dir} \
-    --do_predict \
-    --predict_with_generate \
-    --metric_name_or_path ${metric_name_or_path} \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_train_batch_size ${per_device_train_batch_size} \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
-    --gradient_accumulation_steps ${gradient_accumulation_steps} \
-    --learning_rate ${lr} \
-    --num_train_epochs ${num_train_epochs} \
-    --adafactor \
-    --gradient_checkpointing
-
-# python ../nlg/merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
-
-# python ../../../nlg/evaluate_unified_datasets.py -p ${output_dir}/predictions.json --dataset_name ${dataset_name}
diff --git a/convlab/base_models/t5/run_seq2seq.py b/convlab/base_models/t5/run_seq2seq.py
index 7aac3c70746e877469fc34892cd3f93f9fd01f22..5fa921f0d4c855dc17b7f3b5d1daa8cc404f957c 100644
--- a/convlab/base_models/t5/run_seq2seq.py
+++ b/convlab/base_models/t5/run_seq2seq.py
@@ -149,6 +149,9 @@ class DataTrainingArguments:
             "help": "An optional metric name or file to evaluate the model."
         },
     )
+    metric_config_name: Optional[str] = field(
+        default=None, metadata={"help": "The configuration name of the metric to use (via the datasets library)."}
+    )
     overwrite_cache: bool = field(
         default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
     )
@@ -317,8 +320,17 @@ def main():
     # download the dataset.
     if data_args.dataset_name is not None:
         # Downloading and loading a dataset from the hub.
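+        # Collect any explicitly given split files so they can override the
+        # corresponding splits of the hub dataset.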
+        data_files = {}
+        if data_args.train_file is not None:
+            data_files["train"] = data_args.train_file
+        if data_args.validation_file is not None:
+            data_files["validation"] = data_args.validation_file
+        if data_args.test_file is not None:
+            data_files["test"] = data_args.test_file
+
         raw_datasets = load_dataset(
-            data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir
+            data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir,
+            data_files=data_files if len(data_files) > 0 else None
         )
     else:
         data_files = {}
@@ -528,7 +540,7 @@ def main():
 
     # compute custom metric at evaluation.
     if data_args.metric_name_or_path:
-        metric = load_metric(data_args.metric_name_or_path)
+        metric = load_metric(data_args.metric_name_or_path, data_args.metric_config_name)
     # Must take an EvalPrediction and return a dictionary mapping metric names to values.
     def compute_metrics(p: EvalPrediction):
         preds, labels = p.predictions, p.label_ids