diff --git a/convlab/base_models/gpt/__init__.py b/convlab/base_models/gpt/__init__.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/convlab/base_models/gpt/create_data.py b/convlab/base_models/gpt/create_data.py
deleted file mode 100644
index e6c4d67bb4babd509feb026d256264c6ff1c0051..0000000000000000000000000000000000000000
--- a/convlab/base_models/gpt/create_data.py
+++ /dev/null
@@ -1,39 +0,0 @@
-import os
-import json
-from tqdm import tqdm
-import re
-from convlab.util import load_dataset
-
-
-def create_lm_data(dataset, data_dir, args):
-    data_by_split = dataset
-    os.makedirs(data_dir, exist_ok=True)
-
-    data_splits = data_by_split.keys()
-    for data_split in data_splits:
-        data = []
-        for sample in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False):
-            if args.model_type == 'dialogpt':
-                dialogue = ' <|endoftext|> '.join([turn['utterance'] for turn in sample['turns']]) + ' <|endoftext|>'
-            else:
-                dialogue = '\n'.join([f"{turn['speaker']}: {turn['utterance']}" for turn in sample['turns']])
-            data.append(json.dumps({'dialogue': dialogue}, ensure_ascii=False)+'\n')
-
-        file_name = os.path.join(data_dir, f"{data_split}.json")
-        with open(file_name, "w", encoding='utf-8') as f:
-            f.writelines(data)
-
-
-if __name__ == '__main__':
-    from argparse import ArgumentParser
-    parser = ArgumentParser(description="create data for language model training")
-    parser.add_argument('--tasks', '-t', metavar='task_name', nargs='*', choices=['lm'], help='names of tasks')
-    parser.add_argument('--datasets', '-d', metavar='dataset_name', nargs='*', help='names of unified datasets')
-    parser.add_argument('--model_type', '-m', metavar='model_type', help='type of the language model: gpt, dialogpt, ..')
-    args = parser.parse_args()
-    print(args)
-    for dataset_name in tqdm(args.datasets, desc='datasets'):
-        dataset = load_dataset(dataset_name)
-        for task_name in tqdm(args.tasks, desc='tasks', leave=False):
-            data_dir = os.path.join('data', task_name, args.model_type, dataset_name)
-            eval(f"create_{task_name}_data")(dataset, data_dir, args)
diff --git a/convlab/base_models/gpt/keyword_extraction/eval_key2gen.py b/convlab/base_models/gpt/keyword_extraction/eval_key2gen.py
deleted file mode 100644
index 6b1068cef045550f57621fe0ab4aad8a4047cfbb..0000000000000000000000000000000000000000
--- a/convlab/base_models/gpt/keyword_extraction/eval_key2gen.py
+++ /dev/null
@@ -1,61 +0,0 @@
-import json
-import datasets
-from tabulate import tabulate
-
-def main(predict_result):
-    data = {
-        "grounded keywords": {
-            "positive_keywords": [], "negative_keywords": None,
-            "predictions": [], "references": []
-        },
-        "all keywords": {
-            "positive_keywords": [], "negative_keywords": [],
-            "predictions": [], "references": []
-        },
-        "no keywords": {
-            "positive_keywords": None, "negative_keywords": None,
-            "predictions": [], "references": []
-        }
-    }
-    with open(predict_result) as f:
-        for line in f:
-            item = json.loads(line)
-            prediction = item['predictions'].strip()
-            reference = item['target'].strip()
-            if 'all_keywords' in item and item['all_keywords']:
-                sample_type = 'all keywords'
-
-                positive_keywords = [k for g in item['keywords'] for k in g]
-                data[sample_type]["positive_keywords"].append(positive_keywords)
-
-                all_keywords = [k for g in item['all_keywords'] for k in g]
-                for keyword in positive_keywords:
-                    all_keywords.remove(keyword)
-                data[sample_type]["negative_keywords"].append(all_keywords)
-
-            elif 'keywords' in item and item['keywords']:
-                sample_type = 'grounded keywords'
-
-                positive_keywords = [k for g in item['keywords'] for k in g]
-                data[sample_type]["positive_keywords"].append(positive_keywords)
-            
-            else:
-                sample_type = 'no keywords'
-
-            data[sample_type]["predictions"].append(prediction)
-            data[sample_type]["references"].append(reference)
-
-    metric = datasets.load_metric('./key2gen_metric.py')
-    table = []
-    for sample_type in data:
-        table.append({'sample_type': sample_type, **metric.compute(**data[sample_type])})
-    print(tabulate(table, headers='keys', tablefmt='github'))
-
-
-if __name__ == '__main__':
-    from argparse import ArgumentParser
-    parser = ArgumentParser(description="evaluate keyword-grounded response generation (key2gen) performance")
-    parser.add_argument('--predict_result', '-p', type=str, required=True, help='path to the output file generated_predictions.json')
-    args = parser.parse_args()
-    print(args)
-    main(args.predict_result)
diff --git a/convlab/base_models/gpt/keyword_extraction/gen_pretraining_data.py b/convlab/base_models/gpt/keyword_extraction/gen_pretraining_data.py
deleted file mode 100644
index b6ef65db298378b744a45130fd71c072243bcfca..0000000000000000000000000000000000000000
--- a/convlab/base_models/gpt/keyword_extraction/gen_pretraining_data.py
+++ /dev/null
@@ -1,88 +0,0 @@
-import json
-import json_lines
-import os
-import random
-from tqdm import tqdm
-from nltk import sent_tokenize
-
-def main(args):
-    random.seed(42)
-    os.makedirs(args.output_dir, exist_ok=True)
-    filenames = [os.path.join(args.input_dir, f) for (_, _, fs) in os.walk(args.input_dir) for f in fs if 'keywords' in f]
-    for filename in filenames:
-        dataset_name = filename.split('/')[-2]
-        data_split = filename.split('/')[-1].split('_')[-1].split('.')[0]
-        output_file = os.path.join(args.output_dir, f"{filename.split('/')[-1].split('_')[-1]}")
-        print(f'processing {dataset_name}: {filename} => {output_file}')
-        with open(filename, 'rb') as fin, open(output_file, 'w', encoding='utf-8') as fout:
-            for dial in tqdm(json_lines.reader(fin)):
-                context = []
-                turns_keywords = [turn['keywords'] for turn in dial]
-                for i, turn in enumerate(dial):
-                    if dataset_name == 'wikidialog':
-                        # skip user turns that are generated by T5 in wikidialog
-                        speaker = 'user' if i % 2 == 1 else 'system'
-                    else:
-                        speaker = 'user' if i % 2 == 0 else 'system'
-                    utt = turn['utterance']
-                    context_seq = '\n'.join([f"{turn['speaker']}: {turn['utt']}" for turn in context]+[f'{speaker}: '])
-                    context.append({'speaker': speaker, 'utt': utt})
-                    if i == 0 or (dataset_name == 'wikidialog' and speaker == 'user'):
-                        continue
-                    
-                    if args.mode == 'rg':
-                        input_seq = f'generate a response: all knowledge: | | context:\n\n{context_seq}'
-                        fout.write(json.dumps({
-                            'dataset': dataset_name,
-                            'source': input_seq, 
-                            'target': utt
-                            }, ensure_ascii=False)+'\n')
-                        continue
-
-                    if args.mode == 'key2gen':
-                        random.shuffle(turn['keywords'])
-                        for j in range(len(turn['keywords'])):
-                            random.shuffle(turn['keywords'][j])
-                        keywords = ' | '.join([' : '.join(sent_keywords) for sent_keywords in turn['keywords']])
-                        input_seq = f'generate a response: grounded knowledge: | {keywords} | context:\n\n{context_seq}'
-                        json2dump = {
-                            'dataset': dataset_name,
-                            'source': input_seq, 
-                            'target': utt
-                            }
-                        if data_split == 'validation':
-                            json2dump.update({'keywords': turn['keywords']})
-                        fout.write(json.dumps(json2dump, ensure_ascii=False)+'\n')
-                        continue
-
-                    if args.mode == 'key2gen_noisy':
-                        if random.random() < 0.8:
-                            possible_keywords_sents = turn['keywords'][:]
-                        else:
-                            possible_keywords_sents = []
-                        num_possible_keywords_turns = min(random.randint(1, 5), len(turns_keywords) - 1)
-                        for turn_keywords in random.sample(turns_keywords[:i] + turns_keywords[i+1:], num_possible_keywords_turns):
-                            possible_keywords_sents.extend(turn_keywords)
-                        random.shuffle(possible_keywords_sents)
-                        possible_keywords = ' | '.join([' : '.join(sent_keywords) for sent_keywords in possible_keywords_sents])
-                        input_seq = f'generate a response: all knowledge: | {possible_keywords} | context:\n\n{context_seq}'
-                        json2dump = {
-                            'dataset': dataset_name,
-                            'source': input_seq, 
-                            'target': utt
-                            }
-                        if data_split == 'validation':
-                            json2dump.update({'keywords': turn['keywords'], 'all_keywords': possible_keywords_sents})
-                        fout.write(json.dumps(json2dump, ensure_ascii=False)+'\n')
-                        continue
-    
-
-if __name__ == '__main__':
-    from argparse import ArgumentParser
-    parser = ArgumentParser(description="generate pretraining data for the rg, key2gen, and key2gen_noisy tasks")
-    parser.add_argument('--input_dir', '-i', type=str, help='path to the input files')
-    parser.add_argument('--output_dir', '-o', type=str, help='path to the output files')
-    parser.add_argument('--mode', '-m', type=str, choices=['rg', 'key2gen', 'key2gen_noisy'], help='which task to perform')
-    args = parser.parse_args()
-    print(args)
-    main(args)
diff --git a/convlab/base_models/gpt/keyword_extraction/gen_pretraining_data.sh b/convlab/base_models/gpt/keyword_extraction/gen_pretraining_data.sh
deleted file mode 100644
index eb67a18b22e0480323f132abbf42a2d1508755b9..0000000000000000000000000000000000000000
--- a/convlab/base_models/gpt/keyword_extraction/gen_pretraining_data.sh
+++ /dev/null
@@ -1,40 +0,0 @@
-# generate data for response generation, key2gen, key2gen_noisy
-for task_name in rg key2gen key2gen_noisy
-do
-    dataset_name="dailydialog+metalwoz+tm1+tm2+tm3+sgd+reddit+wikidialog"
-    names=$(echo ${dataset_name} | tr "+" "\n")
-    model_type="gpt"
-    data_dir=data/${task_name}/${model_type}/${dataset_name}
-    mkdir -p ${data_dir}
-    train_file="${data_dir}/train.json"
-    validation_file="${data_dir}/validation.json"
-    rm -f ${train_file} ${validation_file}
-    for name in ${names}
-    do
-        echo "preprocessing ${name}"
-        python gen_pretraining_data.py -i data/lm/${model_type}/${name} -o data/${task_name}/${model_type}/${name} -m ${task_name}
-        if [ "${name}" != "${dataset_name}" ]; then
-            cat "data/${task_name}/${model_type}/${name}/train.json" >> ${train_file}
-            cat "data/${task_name}/${model_type}/${name}/validation.json" >> ${validation_file}
-        fi
-    done
-done
-
-# merge key2gen+key2gen_noisy data
-task_name="key2gen+key2gen_noisy"
-dataset_name="dailydialog+metalwoz+tm1+tm2+tm3+sgd+reddit+wikidialog"
-names=$(echo ${task_name} | tr "+" "\n")
-model_type="gpt"
-data_dir=data/${task_name}/${model_type}/${dataset_name}
-mkdir -p ${data_dir}
-train_file="${data_dir}/train.json"
-validation_file="${data_dir}/validation.json"
-rm -f ${train_file} ${validation_file}
-for name in ${names}
-do
-    echo "preprocessing ${name}"
-    if [ "${name}" != "${task_name}" ]; then
-        cat "data/${name}/${model_type}/${dataset_name}/train.json" >> ${train_file}
-        cat "data/${name}/${model_type}/${dataset_name}/validation.json" >> ${validation_file}
-    fi
-done
\ No newline at end of file
diff --git a/convlab/base_models/gpt/keyword_extraction/get_keywords.sh b/convlab/base_models/gpt/keyword_extraction/get_keywords.sh
deleted file mode 100644
index d3051ba65b2458bfad7af288746a28456f4936e5..0000000000000000000000000000000000000000
--- a/convlab/base_models/gpt/keyword_extraction/get_keywords.sh
+++ /dev/null
@@ -1,25 +0,0 @@
-task_name="lm"
-model_type="gpt"
-model_name_or_path="gpt2-large"
-keywords_num=100
-keywords_ratio=0.3
-keywords_loss_th=0
-stopwords=True
-for dataset_name in dailydialog metalwoz tm1 tm2 tm3 sgd reddit wikidialog
-do
-    data_dir="data/${task_name}/${model_type}/${dataset_name}"
-    for data_split in validation train
-    do
-        token_loss_file="${data_dir}/token_loss_${data_split}.json"
-        output_file="${data_dir}/keywords_${data_split}.json"
-        python lmloss2keywords.py \
-            --model_type ${model_type} \
-            --model_name_or_path ${model_name_or_path} \
-            --token_loss_file ${token_loss_file} \
-            --keywords_num ${keywords_num} \
-            --keywords_ratio ${keywords_ratio} \
-            --keywords_loss_th ${keywords_loss_th} \
-            --stopwords ${stopwords} \
-            --output_file ${output_file}
-    done
-done
\ No newline at end of file
diff --git a/convlab/base_models/gpt/keyword_extraction/get_token_loss.sh b/convlab/base_models/gpt/keyword_extraction/get_token_loss.sh
deleted file mode 100644
index 7c2b57dafa89d0bd711c927a0e97994d43d15bfc..0000000000000000000000000000000000000000
--- a/convlab/base_models/gpt/keyword_extraction/get_token_loss.sh
+++ /dev/null
@@ -1,35 +0,0 @@
-n_gpus=4
-master_port=23456
-task_name="lm"
-model_type="gpt"
-cache_dir="../cache"
-source_column="dialogue"
-max_length=512
-model_name_or_path="gpt2-large"
-per_device_eval_batch_size=16
-
-for dataset_name in dailydialog metalwoz tm1 tm2 tm3 sgd reddit wikidialog
-do
-    data_dir="data/${task_name}/${model_type}/${dataset_name}"
-    output_dir="output/${task_name}/${model_type}/${dataset_name}"
-
-    python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --model_type ${model_type}
-    for data_split in validation train
-    do
-        validation_file="${data_dir}/${data_split}.json"
-        dump_eval_loss_to="${data_dir}/token_loss_${data_split}.json"
-        rm -f ${dump_eval_loss_to}
-        python -m torch.distributed.launch --master_port ${master_port} \
-            --nproc_per_node ${n_gpus} ../run_clm.py \
-            --dump_eval_loss_to ${dump_eval_loss_to} \
-            --model_name_or_path ${model_name_or_path} \
-            --output_dir ${data_dir} \
-            --validation_file ${validation_file} \
-            --source_column ${source_column} \
-            --max_length ${max_length} \
-            --do_eval \
-            --cache_dir ${cache_dir} \
-            --preprocessing_num_workers 4 \
-            --per_device_eval_batch_size ${per_device_eval_batch_size}
-    done
-done
diff --git a/convlab/base_models/gpt/keyword_extraction/key2gen_metric.py b/convlab/base_models/gpt/keyword_extraction/key2gen_metric.py
deleted file mode 100644
index d9722d96ca71a961dc7ad837191fa202848111f3..0000000000000000000000000000000000000000
--- a/convlab/base_models/gpt/keyword_extraction/key2gen_metric.py
+++ /dev/null
@@ -1,96 +0,0 @@
-# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""key2gen Metric"""
-
-import datasets
-import sacrebleu
-
-# TODO: Add BibTeX citation
-_CITATION = """\
-@inproceedings{post-2018-call,
-    title = "A Call for Clarity in Reporting {BLEU} Scores",
-    author = "Post, Matt",
-    booktitle = "Proceedings of the Third Conference on Machine Translation: Research Papers",
-    month = oct,
-    year = "2018",
-    address = "Belgium, Brussels",
-    publisher = "Association for Computational Linguistics",
-    url = "https://www.aclweb.org/anthology/W18-6319",
-    pages = "186--191",
-}
-"""
-
-_DESCRIPTION = """\
-Metric to evaluate text-to-text models on the keywords grounded generation task.
-"""
-
-_KWARGS_DESCRIPTION = """
-Calculates corpus-level BLEU-4, positive keywords recall, and negative keywords recall.
-Args:
-    positive_keywords: list of keywords (list of strings) in the ground truth references
-    negative_keywords: list of keywords (list of strings) in the randomly sampled references
-    predictions: list of predictions to score. Each prediction
-        should be a string.
-    references: list of references, one per prediction. Each
-        reference should be a string.
-Returns:
-    bleu: corpus-level BLEU score
-    positive_keywords_recall: fraction of keywords from the ground truth response that appear in the prediction, micro-averaged
-    negative_keywords_recall: fraction of keywords from the randomly sampled responses that appear in the prediction, micro-averaged
-"""
-
-
-@datasets.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
-class Key2GenMetrics(datasets.Metric):
-    """Metric to evaluate text-to-text models on the keywords grounded generation task."""
-
-    def _info(self):
-        return datasets.MetricInfo(
-            description=_DESCRIPTION,
-            citation=_CITATION,
-            inputs_description=_KWARGS_DESCRIPTION,
-            # This defines the format of each prediction and reference
-            features=datasets.Features({
-                'predictions': datasets.Value('string'),
-                'references': datasets.Value('string'),
-            })
-        )
-
-    def _compute(self, predictions, references, positive_keywords, negative_keywords=None):
-        """Returns the scores: bleu, positive_keywords_recall, negative_keywords_recall"""
-        bleu = sacrebleu.corpus_bleu(predictions, [references], lowercase=True).score
-        cnt = {'pos': 0, 'neg': 0, 'pos_recall': 0, 'neg_recall': 0}
-        if positive_keywords:
-            if not negative_keywords:
-                negative_keywords = [[]] * len(positive_keywords)
-            for poskeys, negkeys, prediction in zip(positive_keywords, negative_keywords, predictions):
-                cnt['pos'] += len(poskeys)
-                cnt['neg'] += len(negkeys)
-
-                prediction = prediction.lower()
-                for key in poskeys:
-                    key = key.lower()
-                    if key in prediction:
-                        cnt['pos_recall'] += 1
-                
-                for key in negkeys:
-                    key = key.lower()
-                    if key in prediction:
-                        cnt['neg_recall'] += 1
-            
-        return {
-            "bleu": bleu,
-            "positive_keywords_recall": cnt['pos_recall']/cnt['pos'] if cnt['pos'] > 0 else 0,
-            "negative_keywords_recall": cnt['neg_recall']/cnt['neg'] if cnt['neg'] > 0 else 0,
-        }
diff --git a/convlab/base_models/gpt/keyword_extraction/lmloss2keywords.py b/convlab/base_models/gpt/keyword_extraction/lmloss2keywords.py
deleted file mode 100644
index bb221f6d78b026c61f10846c385b5fa903c64e7f..0000000000000000000000000000000000000000
--- a/convlab/base_models/gpt/keyword_extraction/lmloss2keywords.py
+++ /dev/null
@@ -1,174 +0,0 @@
-import json
-import json_lines
-from pprint import pprint
-import os
-from tqdm import tqdm
-import numpy as np
-from nltk.corpus import stopwords
-from nltk.tokenize import word_tokenize, PunktSentenceTokenizer
-from transformers import GPT2Tokenizer
-from string import punctuation
-
-
-def merge_tokens(tokens, losses):
-    """Merge tokens into words"""
-    res = []
-    i = 0
-    while i < len(tokens):
-        token = tokens[i]
-        loss = losses[i]
-        if token in ['Ġ', 'Ċ']:
-            # "Ġ" means " ", "Ċ" means "\n"
-            if token == 'Ċ' and i < len(tokens) - 1 and not tokens[i+1].startswith('Ġ'):
-                tokens[i+1] = 'Ġ'+tokens[i+1]
-            i += 1
-            continue
-        if token in ['user', 'system', 'Ġuser', 'Ġsystem'] and i < len(tokens)-1 and tokens[i+1] == ':':
-            if i > 0:
-                tokens[i+1] = '<|endoftext|>'
-                i += 1
-            else:
-                i += 2
-            continue
-        if token.startswith('Ġ'):
-            # token = token.replace("Ġ", "")
-            res.append([[token], [loss]])
-        elif token == '<|endoftext|>':
-            res.append([[token], [0.]])
-        else:
-            assert 'Ġ' not in token
-            if len(res) > 0:
-                res[-1][0].append(token)
-                res[-1][1].append(loss)
-            else:
-                res.append([[token], [loss]])
-        i += 1
-    return res
-
-
-def convert_token_loss2word_loss(token_loss_file):
-    """generate a word loss file according to the token loss file"""
-    word_loss_file = os.path.join(os.path.dirname(token_loss_file), token_loss_file.split('/')[-1].replace('token', 'word'))
-    fin = open(token_loss_file, 'rb')
-    fout = open(word_loss_file, 'w', encoding='utf-8')
-
-    for item in tqdm(json_lines.reader(fin)):
-        tokens, losses = item['tokens'], item['losses']
-        assert len(tokens) == len(losses)
-        word2losses = merge_tokens(tokens, losses)
-        fout.write(json.dumps({"words": [x[0] for x in word2losses], "losses": [x[1] for x in word2losses]}, ensure_ascii=False)+'\n')
-
-    fin.close()
-    fout.close()
-    return word_loss_file
-
-def main(args):
-    if not args.word_loss_file:
-        word_loss_file = convert_token_loss2word_loss(args.token_loss_file)
-    else:
-        word_loss_file = args.word_loss_file
-
-    if not args.output_file:
-        return
-
-    stop_words = set(stopwords.words('english'))
-    tokenizer = GPT2Tokenizer.from_pretrained(args.model_name_or_path)
-    sent_tokenizer = PunktSentenceTokenizer()
-
-    def keywords_filter(words, losses):
-        word_loss_pairs = list(zip(words, losses))
-        index2keyword = {}
-        index2turn_sent = {}
-        num_turns = 0
-        turns_sent_spans = [list(sent_tokenizer.span_tokenize(utt)) for utt in ''.join(words).strip().split('<|endoftext|>')]
-        utt = ''
-        for i, word_loss_pair in enumerate(word_loss_pairs):
-            if word_loss_pair[0].startswith('<|endoftext|>'):
-                num_turns += 1
-                utt = ''
-                continue
-            utt += word_loss_pair[0]
-            word_tokens = word_tokenize(word_loss_pair[0])
-            if args.stopwords and any([w.lower() in stop_words for w in word_tokens]):
-                # skip stopwords
-                continue
-            if word_loss_pair[1] <= args.keywords_loss_th:
-                # skip if loss is too small
-                continue
-            # strip punctuation
-            strip_punctuation = word_loss_pair[0].strip(punctuation).strip()
-            if len(strip_punctuation) == 0:
-                # skip punctuation
-                continue
-            index2keyword[i] = strip_punctuation
-            for sent_idx, (sent_start, sent_end) in enumerate(turns_sent_spans[num_turns]):
-                if len(utt.strip()) <= sent_end:
-                    index2turn_sent[i] = (num_turns, sent_idx)
-                    break
-        candidate_indexes = list(index2keyword.keys())
-        topk = min(round(args.keywords_ratio*(len(word_loss_pairs)-num_turns)), args.keywords_num)
-        topk_indexes = sorted(candidate_indexes, key=lambda x: word_loss_pairs[x][1], reverse=True)[:topk]
-        topk_indexes = sorted(topk_indexes)
-        keywords = []
-        keywords_turn_sent2idx = {}
-        for i, index in enumerate(topk_indexes):
-            if i > 0 and index == topk_indexes[i-1] + 1 and \
-                word_loss_pairs[index][0].strip().startswith(index2keyword[index]) and \
-                word_loss_pairs[topk_indexes[i-1]][0].strip().endswith(index2keyword[topk_indexes[i-1]]):
-                keywords[-1]+= ' '+index2keyword[index]
-            else:
-                keywords_turn_sent2idx.setdefault(index2turn_sent[index][0], {})
-                keywords_turn_sent2idx[index2turn_sent[index][0]].setdefault(index2turn_sent[index][1], [])
-                keywords_turn_sent2idx[index2turn_sent[index][0]][index2turn_sent[index][1]].append(len(keywords))
-                keywords.append(index2keyword[index])
-
-        return keywords, keywords_turn_sent2idx
-
-    fin = open(word_loss_file, 'rb')
-    fout = open(args.output_file, 'w', encoding='utf-8')
-
-    for item in tqdm(json_lines.reader(fin)):
-        words = [tokenizer.convert_tokens_to_string(tokens) for tokens in item['words']]
-        losses = [np.mean(loss) for loss in item['losses']]
-        dialog_keywords, keywords_turn_sent2idx = keywords_filter(words, losses)
-        # print(keywords_turn_sent2idx)
-        turns = []
-        turn = {'words': [], 'losses': []}
-        for i, (word, loss) in enumerate(zip(words, losses)):
-            if word != '<|endoftext|>':
-                turn['words'].append(word)
-                turn['losses'].append(loss)
-            if word == '<|endoftext|>' or i == len(words) - 1:
-                # switch turn
-                turn['utterance'] = ''.join(turn['words']).strip()
-                # 1) extract keywords according to LM loss within the turn
-                # keywords, _ = keywords_filter(turn['words'], turn['losses'])
-                # turn['turn-level_keywords'] = keywords
-                # 2) extract keywords according to LM loss over the dialog, and group them by sentence
-                turn['keywords'] = [[dialog_keywords[idx] for idx in k_idxes] for sent_idx, k_idxes in keywords_turn_sent2idx.get(len(turns), {}).items()]
-                turn.pop('words')
-                turn.pop('losses')
-                turns.append(turn)
-                turn = {'words': [], 'losses': []}
-                
-        fout.write(json.dumps(turns, ensure_ascii=False)+'\n')
-    
-    fin.close()
-    fout.close()
-
-
-if __name__ == '__main__':
-    from argparse import ArgumentParser
-    parser = ArgumentParser(description="extract keywords according to lm loss")
-    parser.add_argument('--model_type', '-m', type=str, help='gpt or dialogpt')
-    parser.add_argument('--model_name_or_path', type=str, help='model name or path')
-    parser.add_argument('--token_loss_file', '-t', type=str, help='path to the token loss file that contains two columns: [tokens, losses]')
-    parser.add_argument('--word_loss_file', '-w', type=str, help='path to the word loss file that contains two columns: [words, losses]')
-    parser.add_argument('--output_file', '-o', type=str, help='path to the output file')
-    parser.add_argument('--keywords_num', '-n', type=int, default=100, help='maximum number of keywords to extract from a dialog')
-    parser.add_argument('--keywords_ratio', '-r', type=float, default=1.0, help='maximum ratio of words in a dialog that serve as keywords')
-    parser.add_argument('--keywords_loss_th', '-th', type=float, default=0., help='loss threshold for the keywords')
-    parser.add_argument('--stopwords', '-s', type=lambda x: bool(eval(x)), default=True, help='filter out stopwords')
-    args = parser.parse_args()
-    print(args)
-    main(args)
diff --git a/convlab/base_models/gpt/keyword_extraction/merge_keywords_res.py b/convlab/base_models/gpt/keyword_extraction/merge_keywords_res.py
deleted file mode 100644
index 94af288a38845f1cf72470da4af916be5a6f0dda..0000000000000000000000000000000000000000
--- a/convlab/base_models/gpt/keyword_extraction/merge_keywords_res.py
+++ /dev/null
@@ -1,29 +0,0 @@
-import json
-
-def main(args):
-    filename2data = {f.split('/')[-1]: json.load(open(f)) for f in args.keywords_files}
-    first_filename = args.keywords_files[0].split('/')[-1]
-    dialogs = []
-    for i in range(len(filename2data[first_filename])):
-        turns = []
-        for j in range(min([len(filename2data[filename][i]) for filename in filename2data])):
-            utt = filename2data[first_filename][i][j]['utterance']
-            keywords = {filename.split('_')[3]+'_nonstopword'+filename.split('_')[-1]: ' | '.join(filename2data[filename][i][j]['keywords']) for filename in filename2data}
-            turns.append({
-                "utterance": utt,
-                **keywords
-            })
-        dialogs.append(turns)
-    json.dump(dialogs, open(args.output_file, "w", encoding='utf-8'), indent=2, ensure_ascii=False)
-
-
-    
-
-if __name__ == '__main__':
-    from argparse import ArgumentParser
-    parser = ArgumentParser(description="merge keywords files extracted under different settings into one file")
-    parser.add_argument('--keywords_files', '-f', metavar='keywords_files', nargs='*', help='keywords files')
-    parser.add_argument('--output_file', '-o', type=str, help='path to the output file')
-    args = parser.parse_args()
-    print(args)
-    main(args)
diff --git a/convlab/base_models/gpt/keyword_extraction/train_lm_dialogpt.sh b/convlab/base_models/gpt/keyword_extraction/train_lm_dialogpt.sh
deleted file mode 100644
index f260f7071529e6837f9c7807d6d5ecf2469494a2..0000000000000000000000000000000000000000
--- a/convlab/base_models/gpt/keyword_extraction/train_lm_dialogpt.sh
+++ /dev/null
@@ -1,46 +0,0 @@
-set -e
-n_gpus=1
-task_name="lm"
-dataset_name="multiwoz21"
-model_type="dialogpt"
-data_dir="data/${task_name}/${model_type}/${dataset_name}"
-output_dir="output/${task_name}/${dataset_name}/${model_type}"
-cache_dir="../cache"
-logging_dir="${output_dir}/runs"
-train_file="${data_dir}/train.json"
-validation_file="${data_dir}/validation.json"
-test_file="${data_dir}/test.json"
-source_column="dialogue"
-max_length=512
-model_name_or_path="microsoft/DialoGPT-large"
-per_device_train_batch_size=16
-per_device_eval_batch_size=16
-gradient_accumulation_steps=4
-lr=5e-5
-num_train_epochs=3
-
-python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --model_type ${model_type}
-
-python ../run_clm.py \
-    --model_name_or_path ${model_name_or_path} \
-    --train_file ${train_file} \
-    --validation_file ${validation_file} \
-    --source_column ${source_column} \
-    --max_length ${max_length} \
-    --do_train \
-    --do_eval \
-    --save_strategy epoch \
-    --evaluation_strategy epoch \
-    --load_best_model_at_end \
-    --prediction_loss_only \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_train_batch_size ${per_device_train_batch_size} \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
-    --gradient_accumulation_steps ${gradient_accumulation_steps} \
-    --learning_rate ${lr} \
-    --num_train_epochs ${num_train_epochs} \
-    --gradient_checkpointing
diff --git a/convlab/base_models/gpt/keyword_extraction/train_lm_gpt.sh b/convlab/base_models/gpt/keyword_extraction/train_lm_gpt.sh
deleted file mode 100644
index 82c63a1f4c4a1633ad5e7d4a721a3bbac558cefb..0000000000000000000000000000000000000000
--- a/convlab/base_models/gpt/keyword_extraction/train_lm_gpt.sh
+++ /dev/null
@@ -1,46 +0,0 @@
-set -e
-n_gpus=1
-task_name="lm"
-dataset_name="multiwoz21"
-model_type="gpt"
-data_dir="data/${task_name}/${model_type}/${dataset_name}"
-output_dir="output/${task_name}/${dataset_name}/${model_type}"
-cache_dir="../cache"
-logging_dir="${output_dir}/runs"
-train_file="${data_dir}/train.json"
-validation_file="${data_dir}/validation.json"
-test_file="${data_dir}/test.json"
-source_column="dialogue"
-max_length=512
-model_name_or_path="gpt2-large"
-per_device_train_batch_size=16
-per_device_eval_batch_size=16
-gradient_accumulation_steps=4
-lr=5e-5
-num_train_epochs=3
-
-python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --model_type ${model_type}
-
-python ../run_clm.py \
-    --model_name_or_path ${model_name_or_path} \
-    --train_file ${train_file} \
-    --validation_file ${validation_file} \
-    --source_column ${source_column} \
-    --max_length ${max_length} \
-    --do_train \
-    --do_eval \
-    --save_strategy epoch \
-    --evaluation_strategy epoch \
-    --load_best_model_at_end \
-    --prediction_loss_only \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
-    --preprocessing_num_workers 4 \
-    --per_device_train_batch_size ${per_device_train_batch_size} \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
-    --gradient_accumulation_steps ${gradient_accumulation_steps} \
-    --learning_rate ${lr} \
-    --num_train_epochs ${num_train_epochs} \
-    --gradient_checkpointing
diff --git a/convlab/base_models/gpt/keyword_extraction/train_t5_key2gen+key2gen_noisy.sh b/convlab/base_models/gpt/keyword_extraction/train_t5_key2gen+key2gen_noisy.sh
deleted file mode 100644
index 8e0b3617210408d3226bd7da9f675534c9458398..0000000000000000000000000000000000000000
--- a/convlab/base_models/gpt/keyword_extraction/train_t5_key2gen+key2gen_noisy.sh
+++ /dev/null
@@ -1,51 +0,0 @@
-set -e
-n_gpus=8
-master_port=23456
-task_name="key2gen+key2gen_noisy"
-dataset_name="dailydialog+metalwoz+tm1+tm2+tm3+sgd+reddit+wikidialog"
-model_type="gpt"
-model_name="t5-small"
-data_dir="data/${task_name}/${model_type}/${dataset_name}"
-output_dir="output/${task_name}/${model_name}/${dataset_name}"
-cache_dir="../cache"
-logging_dir="${output_dir}/runs"
-train_file="${data_dir}/train.json"
-source_column="source"
-target_column="target"
-truncation_side="left"
-max_source_length=512
-max_target_length=128
-model_name_or_path="${model_name}"
-per_device_train_batch_size=64
-per_device_eval_batch_size=128
-gradient_accumulation_steps=1
-num_workers=16
-lr=1e-3
-num_train_epochs=1
-
-python -m torch.distributed.launch --master_port ${master_port} \
-    --nproc_per_node ${n_gpus} ../../t5/run_seq2seq.py \
-    --task_name ${task_name} \
-    --train_file ${train_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --max_source_length ${max_source_length} \
-    --max_target_length ${max_target_length} \
-    --truncation_side ${truncation_side} \
-    --model_name_or_path ${model_name_or_path} \
-    --do_train \
-    --save_steps 5000 \
-    --save_total_limit 1 \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --preprocessing_num_workers ${num_workers} \
-    --dataloader_num_workers ${num_workers} \
-    --per_device_train_batch_size ${per_device_train_batch_size} \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
-    --gradient_accumulation_steps ${gradient_accumulation_steps} \
-    --learning_rate ${lr} \
-    --num_train_epochs ${num_train_epochs} \
-    --optim adafactor \
-    --lr_scheduler_type constant \
-    --gradient_checkpointing
diff --git a/convlab/base_models/gpt/keyword_extraction/train_t5_rg.sh b/convlab/base_models/gpt/keyword_extraction/train_t5_rg.sh
deleted file mode 100644
index 8d9a019bd0fa10d63586c023705807a3eafd5ff0..0000000000000000000000000000000000000000
--- a/convlab/base_models/gpt/keyword_extraction/train_t5_rg.sh
+++ /dev/null
@@ -1,51 +0,0 @@
-set -e
-n_gpus=8
-master_port=23456
-task_name="rg"
-dataset_name="dailydialog+metalwoz+tm1+tm2+tm3+sgd+reddit+wikidialog"
-model_type="gpt"
-model_name="t5-small"
-data_dir="data/${task_name}/${model_type}/${dataset_name}"
-output_dir="output/${task_name}/${model_name}/${dataset_name}"
-cache_dir="../cache"
-logging_dir="${output_dir}/runs"
-train_file="${data_dir}/train.json"
-source_column="source"
-target_column="target"
-truncation_side="left"
-max_source_length=512
-max_target_length=128
-model_name_or_path="${model_name}"
-per_device_train_batch_size=64
-per_device_eval_batch_size=128
-gradient_accumulation_steps=1
-num_workers=16
-lr=1e-3
-num_train_epochs=1
-
-python -m torch.distributed.launch --master_port ${master_port} \
-    --nproc_per_node ${n_gpus} ../../t5/run_seq2seq.py \
-    --task_name ${task_name} \
-    --train_file ${train_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --max_source_length ${max_source_length} \
-    --max_target_length ${max_target_length} \
-    --truncation_side ${truncation_side} \
-    --model_name_or_path ${model_name_or_path} \
-    --do_train \
-    --save_steps 5000 \
-    --save_total_limit 1 \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --preprocessing_num_workers ${num_workers} \
-    --dataloader_num_workers ${num_workers} \
-    --per_device_train_batch_size ${per_device_train_batch_size} \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
-    --gradient_accumulation_steps ${gradient_accumulation_steps} \
-    --learning_rate ${lr} \
-    --num_train_epochs ${num_train_epochs} \
-    --optim adafactor \
-    --lr_scheduler_type constant \
-    --gradient_checkpointing
diff --git a/convlab/base_models/gpt/keyword_extraction/train_t5_rg_key2gen+key2gen_noisy.sh b/convlab/base_models/gpt/keyword_extraction/train_t5_rg_key2gen+key2gen_noisy.sh
deleted file mode 100644
index 75b79932bb94b0699d2e2349a4c8cb8846915cb3..0000000000000000000000000000000000000000
--- a/convlab/base_models/gpt/keyword_extraction/train_t5_rg_key2gen+key2gen_noisy.sh
+++ /dev/null
@@ -1,51 +0,0 @@
-set -e
-n_gpus=8
-master_port=23456
-task_name="key2gen+key2gen_noisy"
-dataset_name="dailydialog+metalwoz+tm1+tm2+tm3+sgd+reddit+wikidialog"
-model_type="gpt"
-model_name="t5-small"
-data_dir="data/${task_name}/${model_type}/${dataset_name}"
-output_dir="output/${task_name}/${model_name}/${dataset_name}"
-cache_dir="../cache"
-logging_dir="${output_dir}/runs"
-train_file="${data_dir}/train.json"
-source_column="source"
-target_column="target"
-truncation_side="left"
-max_source_length=512
-max_target_length=128
-model_name_or_path="output/rg/${model_name}/${dataset_name}"
-per_device_train_batch_size=64
-per_device_eval_batch_size=128
-gradient_accumulation_steps=1
-num_workers=16
-lr=1e-3
-num_train_epochs=1
-
-python -m torch.distributed.launch --master_port ${master_port} \
-    --nproc_per_node ${n_gpus} ../../t5/run_seq2seq.py \
-    --task_name ${task_name} \
-    --train_file ${train_file} \
-    --source_column ${source_column} \
-    --target_column ${target_column} \
-    --max_source_length ${max_source_length} \
-    --max_target_length ${max_target_length} \
-    --truncation_side ${truncation_side} \
-    --model_name_or_path ${model_name_or_path} \
-    --do_train \
-    --save_steps 5000 \
-    --save_total_limit 1 \
-    --cache_dir ${cache_dir} \
-    --output_dir ${output_dir} \
-    --logging_dir ${logging_dir} \
-    --preprocessing_num_workers ${num_workers} \
-    --dataloader_num_workers ${num_workers} \
-    --per_device_train_batch_size ${per_device_train_batch_size} \
-    --per_device_eval_batch_size ${per_device_eval_batch_size} \
-    --gradient_accumulation_steps ${gradient_accumulation_steps} \
-    --learning_rate ${lr} \
-    --num_train_epochs ${num_train_epochs} \
-    --optim adafactor \
-    --lr_scheduler_type constant \
-    --gradient_checkpointing
diff --git a/convlab/base_models/gpt/run_clm.py b/convlab/base_models/gpt/run_clm.py
deleted file mode 100644
index ace68609af00bb6a05d3b6c45378719a98732414..0000000000000000000000000000000000000000
--- a/convlab/base_models/gpt/run_clm.py
+++ /dev/null
@@ -1,564 +0,0 @@
-#!/usr/bin/env python
-# coding=utf-8
-# Copyright 2020 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Fine-tuning the library models for causal language modeling (GPT, GPT-2, CTRL, ...) on a text file or a dataset.
-Modified from https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_clm.py
-Here is the full list of checkpoints on the hub that can be fine-tuned by this script:
-https://huggingface.co/models?filter=text-generation
-"""
-# You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments.
-
-import logging
-import math
-import os
-import sys
-from dataclasses import dataclass, field
-from itertools import chain
-from typing import Optional
-
-import datasets
-from datasets import load_dataset
-from tqdm import tqdm
-from torch.utils.data import DataLoader
-import torch
-import json
-
-import transformers
-from transformers import (
-    CONFIG_MAPPING,
-    MODEL_FOR_CAUSAL_LM_MAPPING,
-    AutoConfig,
-    AutoModelForCausalLM,
-    AutoTokenizer,
-    HfArgumentParser,
-    TrainingArguments,
-    DataCollatorForTokenClassification,
-    is_torch_tpu_available,
-    set_seed,
-)
-from transformers.trainer_utils import get_last_checkpoint
-from transformers.utils import check_min_version
-from transformers.utils.versions import require_version
-from convlab.base_models.gpt.trainer import DumpTokenLossTrainer
-
-
-# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.17.0")
-
-require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
-
-logger = logging.getLogger(__name__)
-
-
-MODEL_CONFIG_CLASSES = list(MODEL_FOR_CAUSAL_LM_MAPPING.keys())
-MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
-
-
-@dataclass
-class ModelArguments:
-    """
-    Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
-    """
-
-    model_name_or_path: Optional[str] = field(
-        default=None,
-        metadata={
-            "help": "The model checkpoint for weights initialization. "
-            "Don't set if you want to train a model from scratch."
-        },
-    )
-    model_type: Optional[str] = field(
-        default=None,
-        metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)},
-    )
-    config_overrides: Optional[str] = field(
-        default=None,
-        metadata={
-            "help": "Override some existing default config settings when a model is trained from scratch. Example: "
-            "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index"
-        },
-    )
-    config_name: Optional[str] = field(
-        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
-    )
-    tokenizer_name: Optional[str] = field(
-        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
-    )
-    cache_dir: Optional[str] = field(
-        default=None,
-        metadata={"help": "Where to store the pretrained models downloaded from huggingface.co"},
-    )
-    use_fast_tokenizer: bool = field(
-        default=True,
-        metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
-    )
-    truncation_side: Optional[str] = field(
-        default="right",
-        metadata={"help": "Which side to truncate, left or right."}
-    )
-    model_revision: str = field(
-        default="main",
-        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
-    )
-    use_auth_token: bool = field(
-        default=False,
-        metadata={
-            "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script "
-            "with private models)."
-        },
-    )
-    resize_position_embeddings: Optional[bool] = field(
-        default=None,
-        metadata={
-            "help": "Whether to automatically resize the position embeddings if `max_source_length` exceeds "
-                    "the model's position embeddings."
-        },
-    )
-
-    def __post_init__(self):
-        if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None):
-            raise ValueError(
-                "--config_overrides can't be used in combination with --config_name or --model_name_or_path"
-            )
-
-
-@dataclass
-class DataTrainingArguments:
-    """
-    Arguments pertaining to what data we are going to input our model for training and eval.
-    """
-
-    dataset_name: Optional[str] = field(
-        default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
-    )
-    dataset_config_name: Optional[str] = field(
-        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
-    )
-    source_column: Optional[str] = field(
-        default=None,
-        metadata={"help": "The name of the column in the datasets containing the texts."},
-    )
-    train_file: Optional[str] = field(
-        default=None, metadata={"help": "The input training data file (a text, jsonlines or csv file)."}
-    )
-    validation_file: Optional[str] = field(
-        default=None,
-        metadata={
-            "help": "An optional input evaluation data file to evaluate the metrics on (a text, jsonlines or csv file)."
-        },
-    )
-    dump_eval_loss_to: Optional[str] = field(
-        default=None, metadata={"help": "Where to dump per-token losses on the evaluation data; by default they are not dumped"}
-    )
-    overwrite_cache: bool = field(
-        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
-    )
-    preprocessing_num_workers: Optional[int] = field(
-        default=None,
-        metadata={"help": "The number of processes to use for the preprocessing."},
-    )
-    max_length: Optional[int] = field(
-        default=1024,
-        metadata={
-            "help": "The maximum total input sequence length after tokenization. Sequences longer "
-                    "than this will be truncated, sequences shorter will be padded."
-        },
-    )
-    pad_to_max_length: bool = field(
-        default=False,
-        metadata={
-            "help": "Whether to pad all samples to model maximum sentence length. "
-                    "If False, will pad the samples dynamically when batching to the maximum length in the batch. More "
-                    "efficient on GPU but very bad for TPU."
-        },
-    )
-    max_train_samples: Optional[int] = field(
-        default=None,
-        metadata={
-            "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
-            "value if set."
-        },
-    )
-    max_eval_samples: Optional[int] = field(
-        default=None,
-        metadata={
-            "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
-            "value if set."
-        },
-    )
-    ignore_pad_token_for_loss: bool = field(
-        default=True,
-        metadata={
-            "help": "Whether to ignore the tokens corresponding to padded labels in the loss computation or not."
-        },
-    )
-    validation_split_percentage: Optional[int] = field(
-        default=5,
-        metadata={
-            "help": "The percentage of the train set used as validation set in case there's no validation split"
-        },
-    )
-    keep_linebreaks: bool = field(
-        default=True, metadata={"help": "Whether to keep line breaks when using TXT files or not."}
-    )
-
-    def __post_init__(self):
-        if self.dataset_name is None and self.train_file is None and self.validation_file is None:
-            raise ValueError("Need either a dataset name or a training/validation file.")
-        else:
-            if self.train_file is not None:
-                extension = self.train_file.split(".")[-1]
-                assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file."
-            if self.validation_file is not None:
-                extension = self.validation_file.split(".")[-1]
-                assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file."
-
-
-def main():
-    # See all possible arguments in src/transformers/training_args.py
-    # or by passing the --help flag to this script.
-    # We now keep distinct sets of args, for a cleaner separation of concerns.
-
-    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
-    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
-        # If we pass only one argument to the script and it's the path to a json file,
-        # let's parse it to get our arguments.
-        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
-    else:
-        model_args, data_args, training_args = parser.parse_args_into_dataclasses()
-
-    # Setup logging
-    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
-        datefmt="%m/%d/%Y %H:%M:%S",
-        handlers=[logging.StreamHandler(sys.stdout)],
-    )
-    log_level = training_args.get_process_log_level()
-    logger.setLevel(log_level)
-    datasets.utils.logging.set_verbosity(log_level)
-    transformers.utils.logging.set_verbosity(log_level)
-    transformers.utils.logging.enable_default_handler()
-    transformers.utils.logging.enable_explicit_format()
-
-    # Log on each process the small summary:
-    logger.warning(
-        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
-        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
-    )
-    logger.info(f"Training/evaluation parameters {training_args}")
-
-    # Detecting last checkpoint.
-    last_checkpoint = None
-    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
-        last_checkpoint = get_last_checkpoint(training_args.output_dir)
-        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
-            raise ValueError(
-                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
-                "Use --overwrite_output_dir to overcome."
-            )
-        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
-            logger.info(
-                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
-                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
-            )
-
-    # Set seed before initializing model.
-    set_seed(training_args.seed)
-
-    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
-    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
-    # (the dataset will be downloaded automatically from the datasets Hub).
-    #
-    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
-    # 'text' is found. You can easily tweak this behavior (see below).
-    #
-    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
-    # download the dataset.
-    if data_args.dataset_name is not None:
-        # Downloading and loading a dataset from the hub.
-        raw_datasets = load_dataset(
-            data_args.dataset_name,
-            data_args.dataset_config_name,
-            cache_dir=model_args.cache_dir,
-            use_auth_token=True if model_args.use_auth_token else None,
-        )
-        if "validation" not in raw_datasets.keys():
-            raw_datasets["validation"] = load_dataset(
-                data_args.dataset_name,
-                data_args.dataset_config_name,
-                split=f"train[:{data_args.validation_split_percentage}%]",
-                cache_dir=model_args.cache_dir,
-                use_auth_token=True if model_args.use_auth_token else None,
-            )
-            raw_datasets["train"] = load_dataset(
-                data_args.dataset_name,
-                data_args.dataset_config_name,
-                split=f"train[{data_args.validation_split_percentage}%:]",
-                cache_dir=model_args.cache_dir,
-                use_auth_token=True if model_args.use_auth_token else None,
-            )
-    else:
-        data_files = {}
-        dataset_args = {}
-        if data_args.train_file is not None:
-            data_files["train"] = data_args.train_file
-        if data_args.validation_file is not None:
-            data_files["validation"] = data_args.validation_file
-        extension = (
-            data_args.train_file.split(".")[-1]
-            if data_args.train_file is not None
-            else data_args.validation_file.split(".")[-1]
-        )
-        if extension == "txt":
-            extension = "text"
-            dataset_args["keep_linebreaks"] = data_args.keep_linebreaks
-        raw_datasets = load_dataset(
-            extension,
-            data_files=data_files,
-            cache_dir=model_args.cache_dir,
-            use_auth_token=True if model_args.use_auth_token else None,
-            **dataset_args,
-        )
-        # If no validation data is there, validation_split_percentage will be used to divide the dataset.
-        if "validation" not in raw_datasets.keys():
-            raw_datasets["validation"] = load_dataset(
-                extension,
-                data_files=data_files,
-                split=f"train[:{data_args.validation_split_percentage}%]",
-                cache_dir=model_args.cache_dir,
-                use_auth_token=True if model_args.use_auth_token else None,
-                **dataset_args,
-            )
-            raw_datasets["train"] = load_dataset(
-                extension,
-                data_files=data_files,
-                split=f"train[{data_args.validation_split_percentage}%:]",
-                cache_dir=model_args.cache_dir,
-                use_auth_token=True if model_args.use_auth_token else None,
-                **dataset_args,
-            )
-
-    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-    # https://huggingface.co/docs/datasets/loading_datasets.html.
-
-    # Load pretrained model and tokenizer
-    #
-    # Distributed training:
-    # The .from_pretrained methods guarantee that only one local process can concurrently
-    # download model & vocab.
-    config_kwargs = {
-        "cache_dir": model_args.cache_dir,
-        "revision": model_args.model_revision,
-        "use_auth_token": True if model_args.use_auth_token else None,
-    }
-    if model_args.config_name:
-        config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs)
-    elif model_args.model_name_or_path:
-        config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs)
-    else:
-        config = CONFIG_MAPPING[model_args.model_type]()
-        logger.warning("You are instantiating a new config instance from scratch.")
-        if model_args.config_overrides is not None:
-            logger.info(f"Overriding config: {model_args.config_overrides}")
-            config.update_from_string(model_args.config_overrides)
-            logger.info(f"New config: {config}")
-
-    tokenizer_kwargs = {
-        "cache_dir": model_args.cache_dir,
-        "use_fast": model_args.use_fast_tokenizer,
-        "truncation_side": model_args.truncation_side,
-        "revision": model_args.model_revision,
-        "use_auth_token": True if model_args.use_auth_token else None,
-    }
-    if model_args.tokenizer_name:
-        tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs)
-    elif model_args.model_name_or_path:
-        tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs)
-    else:
-        raise ValueError(
-            "You are instantiating a new tokenizer from scratch. This is not supported by this script."
-            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
-        )
-
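-    # GPT-2-style tokenizers ship without a pad token; fall back to the EOS token so padding and batching work.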
-    if not tokenizer.pad_token:
-        tokenizer.pad_token = tokenizer.eos_token
-
-    if model_args.model_name_or_path:
-        model = AutoModelForCausalLM.from_pretrained(
-            model_args.model_name_or_path,
-            from_tf=bool(".ckpt" in model_args.model_name_or_path),
-            config=config,
-            cache_dir=model_args.cache_dir,
-            revision=model_args.model_revision,
-            use_auth_token=True if model_args.use_auth_token else None,
-        )
-    else:
-        model = AutoModelForCausalLM.from_config(config)
-        n_params = sum(dict((p.data_ptr(), p.numel()) for p in model.parameters()).values())
-        logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params")
-
-    model.resize_token_embeddings(len(tokenizer))
-
-    if training_args.gradient_checkpointing:
-        # use_cache=True is incompatible with gradient checkpointing.
-        config.use_cache = False
-
-    # Preprocessing the datasets.
-    # First we tokenize all the texts.
-    if training_args.do_train:
-        column_names = raw_datasets["train"].column_names
-    elif training_args.do_eval:
-        column_names = raw_datasets["validation"].column_names
-    else:
-        logger.info("There is nothing to do. Please pass `do_train` and/or `do_eval`.")
-        return
-    if data_args.source_column is None:
-        source_column = column_names[0]
-    else:
-        source_column = data_args.source_column
-        if source_column not in column_names:
-            raise ValueError(
-                f"--source_column' value '{data_args.source_column}' needs to be one of: {', '.join(column_names)}"
-            )
-
-    def preprocess_function(examples):
-        # Keep only non-empty source texts; empty strings would otherwise become empty training samples.
-        inputs = []
-        for i in range(len(examples[source_column])):
-            if len(examples[source_column][i]) > 0:
-                inputs.append(examples[source_column][i])
-
-        padding = "max_length" if data_args.pad_to_max_length else False
-        model_inputs = tokenizer(inputs, max_length=data_args.max_length, padding=padding, truncation=True)
-
-        # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
-        # padding in the loss. Else pad in data_collator.
-        if padding == "max_length" and data_args.ignore_pad_token_for_loss:
-            model_inputs["labels"] = [
-                [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in model_inputs["input_ids"]
-            ]
-        else:
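-            # For causal LM, the labels are simply the input ids; the model shifts them internally when computing the loss.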
-            model_inputs["labels"] = model_inputs["input_ids"].copy()
-
-        return model_inputs
-
-    with training_args.main_process_first(desc="dataset map tokenization"):
-        tokenized_datasets = raw_datasets.map(
-            preprocess_function,
-            batched=True,
-            num_proc=data_args.preprocessing_num_workers,
-            remove_columns=column_names,
-            load_from_cache_file=not data_args.overwrite_cache,
-            desc="Running tokenizer on dataset",
-        )
-
-    lm_datasets = tokenized_datasets
-
-    if training_args.do_train:
-        if "train" not in tokenized_datasets:
-            raise ValueError("--do_train requires a train dataset")
-        train_dataset = lm_datasets["train"]
-        if data_args.max_train_samples is not None:
-            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
-            train_dataset = train_dataset.select(range(max_train_samples))
-
-    if training_args.do_eval:
-        if "validation" not in tokenized_datasets:
-            raise ValueError("--do_eval requires a validation dataset")
-        eval_dataset = lm_datasets["validation"]
-        if data_args.max_eval_samples is not None:
-            max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
-            eval_dataset = eval_dataset.select(range(max_eval_samples))
-
-    # Data collator
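-    # DataCollatorForTokenClassification is used because it dynamically pads both `input_ids` and `labels`,
-    # filling padded label positions with `label_pad_token_id` (-100 excludes them from the loss).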
-    label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id
-    data_collator = DataCollatorForTokenClassification(
-        tokenizer,
-        label_pad_token_id=label_pad_token_id,
-        pad_to_multiple_of=8 if training_args.fp16 else None,
-    )
-
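-    # Stash the dump path on the training arguments so DumpTokenLossTrainer.evaluation_loop can read it.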
-    training_args.dump_eval_loss_to = data_args.dump_eval_loss_to
-
-    # Initialize our Trainer
-    trainer = DumpTokenLossTrainer(
-        model=model,
-        args=training_args,
-        train_dataset=train_dataset if training_args.do_train else None,
-        eval_dataset=eval_dataset if training_args.do_eval else None,
-        tokenizer=tokenizer,
-        # Data collator will default to DataCollatorWithPadding, so we change it.
-        data_collator=data_collator,
-    )
-
-    # Training
-    if training_args.do_train:
-        checkpoint = None
-        if training_args.resume_from_checkpoint is not None:
-            checkpoint = training_args.resume_from_checkpoint
-        elif last_checkpoint is not None:
-            checkpoint = last_checkpoint
-        train_result = trainer.train(resume_from_checkpoint=checkpoint)
-        trainer.save_model()  # Saves the tokenizer too for easy upload
-
-        metrics = train_result.metrics
-        max_train_samples = (
-            data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
-        )
-        metrics["train_samples"] = min(max_train_samples, len(train_dataset))
-
-        trainer.log_metrics("train", metrics)
-        trainer.save_metrics("train", metrics)
-        trainer.save_state()
-
-    # Evaluation
-    if training_args.do_eval:
-        logger.info("*** Evaluate ***")
-        metrics = trainer.evaluate(metric_key_prefix="eval")
-        max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
-        metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
-        try:
-            perplexity = math.exp(metrics["eval_loss"])
-        except OverflowError:
-            perplexity = float("inf")
-        metrics["eval_perplexity"] = perplexity
-        logger.info(f"eval_perplexity: {perplexity}")
-
-        trainer.log_metrics("eval", metrics)
-        trainer.save_metrics("eval", metrics)
-
-    kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-generation"}
-    if data_args.dataset_name is not None:
-        kwargs["dataset_tags"] = data_args.dataset_name
-        if data_args.dataset_config_name is not None:
-            kwargs["dataset_args"] = data_args.dataset_config_name
-            kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
-        else:
-            kwargs["dataset"] = data_args.dataset_name
-
-    if training_args.push_to_hub:
-        trainer.push_to_hub(**kwargs)
-    else:
-        trainer.create_model_card(**kwargs)
-
-
-def _mp_fn(index):
-    # For xla_spawn (TPUs)
-    main()
-
-
-if __name__ == "__main__":
-    main()
diff --git a/convlab/base_models/gpt/trainer.py b/convlab/base_models/gpt/trainer.py
deleted file mode 100644
index 5a8ed11c6566e897a5d3ef7a0d16a130968cd6aa..0000000000000000000000000000000000000000
--- a/convlab/base_models/gpt/trainer.py
+++ /dev/null
@@ -1,243 +0,0 @@
-from transformers import Trainer
-from transformers.trainer_utils import EvalLoopOutput, has_length
-from transformers.deepspeed import deepspeed_init
-from transformers.utils import logging
-from transformers.trainer_pt_utils import find_batch_size, nested_concat, nested_numpify, IterableDatasetShard, nested_truncate
-from transformers.trainer_utils import EvalPrediction, denumpify_detensorize
-import torch
-from torch.utils.data import DataLoader
-import numpy as np
-from typing import List, Optional
-import json
-
-
-logger = logging.get_logger(__name__)
-
-class DumpTokenLossTrainer(Trainer):
-    def evaluation_loop(
-        self,
-        dataloader: DataLoader,
-        description: str,
-        prediction_loss_only: Optional[bool] = None,
-        ignore_keys: Optional[List[str]] = None,
-        metric_key_prefix: str = "eval",
-    ) -> EvalLoopOutput:
-        """
-        Prediction/evaluation loop, shared by `Trainer.evaluate()` and `Trainer.predict()`.
-        Works both with or without labels.
-        """
-        args = self.args
-
-        prediction_loss_only = args.prediction_loss_only
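-        # The argument passed to this method is overridden by the value from TrainingArguments. Note that
-        # dumping token-level losses requires logits, so `prediction_loss_only` should be False in that case.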
-
-        # if eval is called w/o train init deepspeed here
-        if args.deepspeed and not self.deepspeed:
-
-            # XXX: eval doesn't have `resume_from_checkpoint` arg but we should be able to do eval
-            # from the checkpoint eventually
-            deepspeed_engine, _, _ = deepspeed_init(
-                self, num_training_steps=0, resume_from_checkpoint=None, inference=True
-            )
-            self.model = deepspeed_engine.module
-            self.model_wrapped = deepspeed_engine
-            self.deepspeed = deepspeed_engine
-
-        model = self._wrap_model(self.model, training=False, dataloader=dataloader)
-
-        # if full fp16 or bf16 eval is wanted and this ``evaluation`` or ``predict`` isn't called
-        # while ``train`` is running, cast it to the right dtype first and then put on device
-        if not self.is_in_train:
-            if args.fp16_full_eval:
-                model = model.to(dtype=torch.float16, device=args.device)
-            elif args.bf16_full_eval:
-                model = model.to(dtype=torch.bfloat16, device=args.device)
-
-        batch_size = self.args.eval_batch_size
-
-        logger.info(f"***** Running {description} *****")
-        if has_length(dataloader):
-            logger.info(f"  Num examples = {self.num_examples(dataloader)}")
-        else:
-            logger.info("  Num examples: Unknown")
-        logger.info(f"  Batch size = {batch_size}")
-
-        model.eval()
-
-        self.callback_handler.eval_dataloader = dataloader
-        # Do this before wrapping.
-        eval_dataset = getattr(dataloader, "dataset", None)
-
-        if args.past_index >= 0:
-            self._past = None
-
-        # Initialize containers
-        # losses/preds/labels on GPU/TPU (accumulated for eval_accumulation_steps)
-        losses_host = None
-        preds_host = None
-        labels_host = None
-        inputs_host = None
-
-        # losses/preds/labels on CPU (final containers)
-        all_losses = None
-        all_preds = None
-        all_labels = None
-        all_inputs = None
-        # Will be useful when we have an iterable dataset whose length is unknown.
-
-        if args.dump_eval_loss_to:
-            writer = open(args.dump_eval_loss_to, "a", encoding='utf-8')
-            loss_fct = torch.nn.CrossEntropyLoss(reduction='none')
-            num_sample_to_write = len(eval_dataset)
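-            # Per-token losses are appended as JSON lines; only the main process writes, for at most
-            # `len(eval_dataset)` samples.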
-
-        observed_num_examples = 0
-        # Main evaluation loop
-        for step, inputs in enumerate(dataloader):
-            # Update the observed num examples
-            observed_batch_size = find_batch_size(inputs)
-            if observed_batch_size is not None:
-                observed_num_examples += observed_batch_size
-                # For batch samplers, batch_size is not known by the dataloader in advance.
-                if batch_size is None:
-                    batch_size = observed_batch_size
-
-            # Prediction step
-            loss, logits, labels = self.prediction_step(model, inputs, prediction_loss_only, ignore_keys=ignore_keys)
-            inputs_decode = self._prepare_input(inputs["input_ids"]) if args.include_inputs_for_metrics else None
-
-            # Update containers on host
-            if loss is not None:
-                losses = self._nested_gather(loss.repeat(batch_size))
-                losses_host = losses if losses_host is None else torch.cat((losses_host, losses), dim=0)
-            if labels is not None:
-                labels = self._pad_across_processes(labels)
-                labels = self._nested_gather(labels)
-                # labels_host = labels if labels_host is None else nested_concat(labels_host, labels, padding_index=-100)
-            if inputs_decode is not None:
-                inputs_decode = self._pad_across_processes(inputs_decode)
-                inputs_decode = self._nested_gather(inputs_decode)
-                inputs_host = (
-                    inputs_decode
-                    if inputs_host is None
-                    else nested_concat(inputs_host, inputs_decode, padding_index=-100)
-                )
-            if logits is not None:
-                logits = self._pad_across_processes(logits)
-                logits = self._nested_gather(logits)
-                if self.preprocess_logits_for_metrics is not None:
-                    logits = self.preprocess_logits_for_metrics(logits, labels)
-                # preds_host = logits if preds_host is None else nested_concat(preds_host, logits, padding_index=-100)
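-                # preds_host/labels_host accumulation is left commented out above, presumably to keep GPU
-                # memory bounded; per-token losses are written out in the dump block below instead.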
-
-            if args.dump_eval_loss_to:
-                if self.is_world_process_zero() and num_sample_to_write > 0:
-                    assert logits is not None and labels is not None, f"prediction_loss_only={prediction_loss_only}: logits and labels are required to dump token losses"
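-                    # Shift so that tokens < n predict token n (standard causal-LM alignment).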
-                    shift_logits = logits[..., :-1, :].contiguous()
-                    shift_labels = labels[..., 1:].contiguous()
-                    batch_token_loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
-                    batch_token_loss = batch_token_loss.view(shift_labels.size()).tolist()
-                    labels = labels.tolist()
-                    for i in range(len(labels)):
-                        if num_sample_to_write > 0:
-                            num_sample_to_write -= 1
-                        else:
-                            break
-                        token_ids = [x for x in labels[i] if x != -100]
-                        tokens = self.tokenizer.convert_ids_to_tokens(token_ids)
-                        token_losses = [0] + batch_token_loss[i][:len(token_ids)-1]
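-                        # The first token has no next-token loss, so a placeholder 0 is prepended to keep
-                        # `token_losses` aligned with `tokens`.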
-                        writer.write(json.dumps({"tokens": tokens, "losses": token_losses}, ensure_ascii=False)+'\n')
-
-            self.control = self.callback_handler.on_prediction_step(args, self.state, self.control)
-
-            # Gather all tensors and put them back on the CPU if we have done enough accumulation steps.
-            if args.eval_accumulation_steps is not None and (step + 1) % args.eval_accumulation_steps == 0:
-                if losses_host is not None:
-                    losses = nested_numpify(losses_host)
-                    all_losses = losses if all_losses is None else np.concatenate((all_losses, losses), axis=0)
-                if preds_host is not None:
-                    logits = nested_numpify(preds_host)
-                    all_preds = logits if all_preds is None else nested_concat(all_preds, logits, padding_index=-100)
-                if inputs_host is not None:
-                    inputs_decode = nested_numpify(inputs_host)
-                    all_inputs = (
-                        inputs_decode
-                        if all_inputs is None
-                        else nested_concat(all_inputs, inputs_decode, padding_index=-100)
-                    )
-                if labels_host is not None:
-                    labels = nested_numpify(labels_host)
-                    all_labels = (
-                        labels if all_labels is None else nested_concat(all_labels, labels, padding_index=-100)
-                    )
-
-                # Set back to None to begin a new accumulation
-                losses_host, preds_host, inputs_host, labels_host = None, None, None, None
-
-        if args.dump_eval_loss_to:
-            writer.close()
-
-        if args.past_index and hasattr(self, "_past"):
-            # Clean the state at the end of the evaluation loop
-            delattr(self, "_past")
-
-        # Gather all remaining tensors and put them back on the CPU
-        if losses_host is not None:
-            losses = nested_numpify(losses_host)
-            all_losses = losses if all_losses is None else np.concatenate((all_losses, losses), axis=0)
-        if preds_host is not None:
-            logits = nested_numpify(preds_host)
-            all_preds = logits if all_preds is None else nested_concat(all_preds, logits, padding_index=-100)
-        if inputs_host is not None:
-            inputs_decode = nested_numpify(inputs_host)
-            all_inputs = (
-                inputs_decode if all_inputs is None else nested_concat(all_inputs, inputs_decode, padding_index=-100)
-            )
-        if labels_host is not None:
-            labels = nested_numpify(labels_host)
-            all_labels = labels if all_labels is None else nested_concat(all_labels, labels, padding_index=-100)
-
-        # Number of samples
-        if has_length(eval_dataset):
-            num_samples = len(eval_dataset)
-        # The instance check is imperfect: it does not actually verify the type, only that the dataset has the
-        # right methods, so we also make sure it has the `num_examples` attribute.
-        elif isinstance(eval_dataset, IterableDatasetShard) and hasattr(eval_dataset, "num_examples"):
-            num_samples = eval_dataset.num_examples
-        else:
-            if has_length(dataloader):
-                num_samples = self.num_examples(dataloader)
-            else:  # both len(dataloader.dataset) and len(dataloader) fail
-                num_samples = observed_num_examples
-
-        # The number of losses has been rounded to a multiple of batch_size and, in distributed training, the number
-        # of samples has been rounded to a multiple of batch_size as well, so we truncate.
-        if all_losses is not None:
-            all_losses = all_losses[:num_samples]
-        if all_preds is not None:
-            all_preds = nested_truncate(all_preds, num_samples)
-        if all_labels is not None:
-            all_labels = nested_truncate(all_labels, num_samples)
-        if all_inputs is not None:
-            all_inputs = nested_truncate(all_inputs, num_samples)
-
-        # Metrics!
-        if self.compute_metrics is not None and all_preds is not None and all_labels is not None:
-            if args.include_inputs_for_metrics:
-                metrics = self.compute_metrics(
-                    EvalPrediction(predictions=all_preds, label_ids=all_labels, inputs=all_inputs)
-                )
-            else:
-                metrics = self.compute_metrics(EvalPrediction(predictions=all_preds, label_ids=all_labels))
-        else:
-            metrics = {}
-
-        # To be JSON-serializable, we need to remove numpy types or zero-d tensors
-        metrics = denumpify_detensorize(metrics)
-
-        if all_losses is not None:
-            metrics[f"{metric_key_prefix}_loss"] = all_losses.mean().item()
-
-        # Prefix all keys with metric_key_prefix + '_'
-        for key in list(metrics.keys()):
-            if not key.startswith(f"{metric_key_prefix}_"):
-                metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key)
-
-        return EvalLoopOutput(predictions=all_preds, label_ids=all_labels, metrics=metrics, num_samples=num_samples)