From a92c3a00daa33916f2cda5a8c68e4bcc7a9d27a2 Mon Sep 17 00:00:00 2001
From: zqwerty <zhuq96@hotmail.com>
Date: Tue, 29 Nov 2022 20:11:32 +0800
Subject: [PATCH] rm gpt base model

---
 convlab/base_models/gpt/__init__.py           |   0
 convlab/base_models/gpt/create_data.py        |  39 --
 .../gpt/keyword_extraction/eval_key2gen.py    |  61 --
 .../gen_pretraining_data.py                   |  88 ---
 .../gen_pretraining_data.sh                   |  40 --
 .../gpt/keyword_extraction/get_keywords.sh    |  25 -
 .../gpt/keyword_extraction/get_token_loss.sh  |  35 --
 .../gpt/keyword_extraction/key2gen_metric.py  |  96 ---
 .../gpt/keyword_extraction/lmloss2keywords.py | 174 ------
 .../keyword_extraction/merge_keywords_res.py  |  29 -
 .../keyword_extraction/train_lm_dialogpt.sh   |  46 --
 .../gpt/keyword_extraction/train_lm_gpt.sh    |  46 --
 .../train_t5_key2gen+key2gen_noisy.sh         |  51 --
 .../gpt/keyword_extraction/train_t5_rg.sh     |  51 --
 .../train_t5_rg_key2gen+key2gen_noisy.sh      |  51 --
 convlab/base_models/gpt/run_clm.py            | 564 ------------------
 convlab/base_models/gpt/trainer.py            | 243 --------
 17 files changed, 1639 deletions(-)
 delete mode 100644 convlab/base_models/gpt/__init__.py
 delete mode 100644 convlab/base_models/gpt/create_data.py
 delete mode 100644 convlab/base_models/gpt/keyword_extraction/eval_key2gen.py
 delete mode 100644 convlab/base_models/gpt/keyword_extraction/gen_pretraining_data.py
 delete mode 100644 convlab/base_models/gpt/keyword_extraction/gen_pretraining_data.sh
 delete mode 100644 convlab/base_models/gpt/keyword_extraction/get_keywords.sh
 delete mode 100644 convlab/base_models/gpt/keyword_extraction/get_token_loss.sh
 delete mode 100644 convlab/base_models/gpt/keyword_extraction/key2gen_metric.py
 delete mode 100644 convlab/base_models/gpt/keyword_extraction/lmloss2keywords.py
 delete mode 100644 convlab/base_models/gpt/keyword_extraction/merge_keywords_res.py
 delete mode 100644 convlab/base_models/gpt/keyword_extraction/train_lm_dialogpt.sh
 delete mode 100644 convlab/base_models/gpt/keyword_extraction/train_lm_gpt.sh
 delete mode 100644 convlab/base_models/gpt/keyword_extraction/train_t5_key2gen+key2gen_noisy.sh
 delete mode 100644 convlab/base_models/gpt/keyword_extraction/train_t5_rg.sh
 delete mode 100644 convlab/base_models/gpt/keyword_extraction/train_t5_rg_key2gen+key2gen_noisy.sh
 delete mode 100644 convlab/base_models/gpt/run_clm.py
 delete mode 100644 convlab/base_models/gpt/trainer.py

diff --git a/convlab/base_models/gpt/__init__.py b/convlab/base_models/gpt/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/convlab/base_models/gpt/create_data.py b/convlab/base_models/gpt/create_data.py
deleted file mode 100644
index e6c4d67b..00000000
--- a/convlab/base_models/gpt/create_data.py
+++ /dev/null
@@ -1,39 +0,0 @@
-import os
-import json
-from tqdm import tqdm
-import re
-from convlab.util import load_dataset
-
-
-def create_lm_data(dataset, data_dir, args):
-    data_by_split = dataset
-    os.makedirs(data_dir, exist_ok=True)
-
-    data_splits = data_by_split.keys()
-    for data_split in data_splits:
-        data = []
-        for sample in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False):
-            if args.model_type == 'dialogpt':
-                dialogue = ' <|endoftext|> '.join([turn['utterance'] for turn in sample['turns']]) + ' <|endoftext|>'
-            else:
-                dialogue = '\n'.join([f"{turn['speaker']}: {turn['utterance']}" for turn in sample['turns']])
-            data.append(json.dumps({'dialogue': dialogue}, ensure_ascii=False)+'\n')
-
-        file_name = os.path.join(data_dir, f"{data_split}.json")
-        with open(file_name, "w", encoding='utf-8') as f:
-            f.writelines(data)
-
-
-if __name__ == '__main__':
-    from argparse import ArgumentParser
-    parser = ArgumentParser(description="create data for seq2seq training")
-    parser.add_argument('--tasks', '-t', metavar='task_name', nargs='*', choices=['lm'], help='names of tasks')
-    parser.add_argument('--datasets', '-d', metavar='dataset_name', nargs='*', help='names of unified datasets')
-    parser.add_argument('--model_type', '-m', metavar='model_type', help='type of the language model: gpt, dialogpt, ..')
-    args = parser.parse_args()
-    print(args)
-    for dataset_name in tqdm(args.datasets, desc='datasets'):
-        dataset = load_dataset(dataset_name)
-        for task_name in tqdm(args.tasks, desc='tasks', leave=False):
-            data_dir = os.path.join('data', task_name, args.model_type, dataset_name)
-            eval(f"create_{task_name}_data")(dataset, data_dir, args)
diff --git a/convlab/base_models/gpt/keyword_extraction/eval_key2gen.py b/convlab/base_models/gpt/keyword_extraction/eval_key2gen.py
deleted file mode 100644
index 6b1068ce..00000000
--- a/convlab/base_models/gpt/keyword_extraction/eval_key2gen.py
+++ /dev/null
@@ -1,61 +0,0 @@
-import json
-import datasets
-from tabulate import tabulate
-
-def main(predict_result):
-    data = {
-        "grounded keywords": {
-            "positive_keywords": [], "negative_keywords": None,
-            "predictions": [], "references": []
-        },
-        "all keywords": {
-            "positive_keywords": [], "negative_keywords": [],
-            "predictions": [], "references": []
-        },
-        "no keywords": {
-            "positive_keywords": None, "negative_keywords": None,
-            "predictions": [], "references": []
-        }
-    }
-    with open(predict_result) as f:
-        for line in f:
-            item = json.loads(line)
-            prediction = item['predictions'].strip()
-            reference = item['target'].strip()
-            if 'all_keywords' in item and item['all_keywords']:
-                sample_type = 'all keywords'
-
-                positive_keywords = [k for g in item['keywords'] for k in g]
-                data[sample_type]["positive_keywords"].append(positive_keywords)
-
-                all_keywords = [k for g in item['all_keywords'] for k in g]
-                for keyword in positive_keywords:
-                    all_keywords.remove(keyword)
-                data[sample_type]["negative_keywords"].append(all_keywords)
-
-            elif 'keywords' in item and item['keywords']:
-                sample_type = 'grounded keywords'
-
-                positive_keywords = [k for g in item['keywords'] for k in g]
-                data[sample_type]["positive_keywords"].append(positive_keywords)
-
-            else:
-                sample_type = 'no keywords'
-
-            data[sample_type]["predictions"].append(prediction)
-            data[sample_type]["references"].append(reference)
-
-    metric = datasets.load_metric('./key2gen_metric.py')
-    table = []
-    for sample_type in data:
-        table.append({'sample_type': sample_type, **metric.compute(**data[sample_type])})
-    print(tabulate(table, headers='keys', tablefmt='github'))
-
-
-if __name__ == '__main__':
-    from argparse import ArgumentParser
-    parser = ArgumentParser(description="evaluate keywords to response generation performance")
-    parser.add_argument('--predict_result', '-p', type=str, required=True, help='path to the output file generated_predictions.json')
-    args = parser.parse_args()
-    print(args)
-    main(args.predict_result)
diff --git a/convlab/base_models/gpt/keyword_extraction/gen_pretraining_data.py b/convlab/base_models/gpt/keyword_extraction/gen_pretraining_data.py
deleted file mode 100644
index b6ef65db..00000000
--- a/convlab/base_models/gpt/keyword_extraction/gen_pretraining_data.py
+++ /dev/null
@@ -1,88 +0,0 @@
-import json
-import json_lines
-import os
-import random
-from tqdm import tqdm
-from nltk import sent_tokenize
-
-def
main(args): - random.seed(42) - os.makedirs(args.output_dir, exist_ok=True) - filenames = [os.path.join(args.input_dir, f) for (_, _, fs) in os.walk(args.input_dir) for f in fs if 'keywords' in f] - for filename in filenames: - dataset_name = filename.split('/')[-2] - data_split = filename.split('/')[-1].split('_')[-1].split('.')[0] - output_file = os.path.join(args.output_dir, f"{filename.split('/')[-1].split('_')[-1]}") - print(f'processing {dataset_name}: {filename} => {output_file}') - with open(filename, 'rb') as fin, open(output_file, 'w', encoding='utf-8') as fout: - for dial in tqdm(json_lines.reader(fin)): - context = [] - turns_keywords = [turn['keywords'] for turn in dial] - for i, turn in enumerate(dial): - if dataset_name == 'wikidialog': - # skip user turns that generated by T5 in wikidialog - speaker = 'user' if i % 2 == 1 else 'system' - else: - speaker = 'user' if i % 2 == 0 else 'system' - utt = turn['utterance'] - context_seq = '\n'.join([f"{turn['speaker']}: {turn['utt']}" for turn in context]+[f'{speaker}: ']) - context.append({'speaker': speaker, 'utt': utt}) - if i == 0 or (dataset_name == 'wikidialog' and speaker == 'user'): - continue - - if args.mode == 'rg': - input_seq = f'generate a response: all knowledge: | | context:\n\n{context_seq}' - fout.write(json.dumps({ - 'dataset': dataset_name, - 'source': input_seq, - 'target': utt - }, ensure_ascii=False)+'\n') - continue - - if args.mode == 'key2gen': - random.shuffle(turn['keywords']) - for j in range(len(turn['keywords'])): - random.shuffle(turn['keywords'][j]) - keywords = ' | '.join([' : '.join(sent_keywords) for sent_keywords in turn['keywords']]) - input_seq = f'generate a response: grounded knowledge: | {keywords} | context:\n\n{context_seq}' - json2dump = { - 'dataset': dataset_name, - 'source': input_seq, - 'target': utt - } - if data_split == 'validation': - json2dump.update({'keywords': turn['keywords']}) - fout.write(json.dumps(json2dump, ensure_ascii=False)+'\n') - continue - - if args.mode == 'key2gen_noisy': - if random.random() < 0.8: - possible_keywords_sents = turn['keywords'][:] - else: - possible_keywords_sents = [] - num_possible_keywords_turns = min(random.randint(1, 5), len(turns_keywords) - 1) - for turn_keywords in random.sample(turns_keywords[:i] + turns_keywords[i+1:], num_possible_keywords_turns): - possible_keywords_sents.extend(turn_keywords) - random.shuffle(possible_keywords_sents) - possible_keywords = ' | '.join([' : '.join(sent_keywords) for sent_keywords in possible_keywords_sents]) - input_seq = f'generate a response: all knowledge: | {possible_keywords} | context:\n\n{context_seq}' - json2dump = { - 'dataset': dataset_name, - 'source': input_seq, - 'target': utt - } - if data_split == 'validation': - json2dump.update({'keywords': turn['keywords'], 'all_keywords': possible_keywords_sents}) - fout.write(json.dumps(json2dump, ensure_ascii=False)+'\n') - continue - - -if __name__ == '__main__': - from argparse import ArgumentParser - parser = ArgumentParser(description="calculate NLU metrics for unified datasets") - parser.add_argument('--input_dir', '-i', type=str, help='path to the input files') - parser.add_argument('--output_dir', '-o', type=str, help='path to the output files') - parser.add_argument('--mode', '-m', type=str, choices=['rg', 'key2gen', 'key2gen_noisy'], help='which task to perform') - args = parser.parse_args() - print(args) - main(args) diff --git a/convlab/base_models/gpt/keyword_extraction/gen_pretraining_data.sh 
b/convlab/base_models/gpt/keyword_extraction/gen_pretraining_data.sh deleted file mode 100644 index eb67a18b..00000000 --- a/convlab/base_models/gpt/keyword_extraction/gen_pretraining_data.sh +++ /dev/null @@ -1,40 +0,0 @@ -# generate data for response generation, key2gen, key2gen_noisy -for task_name in rg key2gen key2gen_noisy -do - dataset_name="dailydialog+metalwoz+tm1+tm2+tm3+sgd+reddit+wikidialog" - names=$(echo ${dataset_name} | tr "+" "\n") - model_type="gpt" - data_dir=data/${task_name}/${model_type}/${dataset_name} - mkdir -p ${data_dir} - train_file="${data_dir}/train.json" - validation_file="${data_dir}/validation.json" - rm ${train_file} ${validation_file} - for name in ${names} - do - echo "preprocessing ${name}" - python gen_pretraining_data.py -i data/lm/${model_type}/${name} -o data/${task_name}/${model_type}/${name} -m ${task_name} - if [ "${name}" != "${dataset_name}" ]; then - cat "data/${task_name}/${model_type}/${name}/train.json" >> ${train_file} - cat "data/${task_name}/${model_type}/${name}/validation.json" >> ${validation_file} - fi - done -done - -# merge key2gen+key2gen_noisy data -task_name="key2gen+key2gen_noisy" -dataset_name="dailydialog+metalwoz+tm1+tm2+tm3+sgd+reddit+wikidialog" -names=$(echo ${task_name} | tr "+" "\n") -model_type="gpt" -data_dir=data/${task_name}/${model_type}/${dataset_name} -mkdir -p ${data_dir} -train_file="${data_dir}/train.json" -validation_file="${data_dir}/validation.json" -rm ${train_file} ${validation_file} -for name in ${names} -do - echo "preprocessing ${name}" - if [ "${name}" != "${task_name}" ]; then - cat "data/${name}/${model_type}/${dataset_name}/train.json" >> ${train_file} - cat "data/${name}/${model_type}/${dataset_name}/validation.json" >> ${validation_file} - fi -done \ No newline at end of file diff --git a/convlab/base_models/gpt/keyword_extraction/get_keywords.sh b/convlab/base_models/gpt/keyword_extraction/get_keywords.sh deleted file mode 100644 index d3051ba6..00000000 --- a/convlab/base_models/gpt/keyword_extraction/get_keywords.sh +++ /dev/null @@ -1,25 +0,0 @@ -task_name="lm" -model_type="gpt" -model_name_or_path="gpt2-large" -keywords_num=100 -keywords_ratio=0.3 -keywords_loss_th=0 -stopwords=True -for dataset_name in dailydialog metalwoz tm1 tm2 tm3 sgd reddit wikidialog -do - data_dir="data/${task_name}/${model_type}/${dataset_name}" - for data_split in validation train - do - token_loss_file="${data_dir}/token_loss_${data_split}.json" - output_file="${data_dir}/keywords_${data_split}.json" - python lmloss2keywords.py \ - --model_type ${model_type} \ - --model_name_or_path ${model_name_or_path} \ - --token_loss_file ${token_loss_file} \ - --keywords_num ${keywords_num} \ - --keywords_ratio ${keywords_ratio} \ - --keywords_loss_th ${keywords_loss_th} \ - --stopwords ${stopwords} \ - --output_file ${output_file} - done -done \ No newline at end of file diff --git a/convlab/base_models/gpt/keyword_extraction/get_token_loss.sh b/convlab/base_models/gpt/keyword_extraction/get_token_loss.sh deleted file mode 100644 index 7c2b57da..00000000 --- a/convlab/base_models/gpt/keyword_extraction/get_token_loss.sh +++ /dev/null @@ -1,35 +0,0 @@ -n_gpus=4 -master_port=23456 -task_name="lm" -model_type="gpt" -cache_dir="../cache" -source_column="dialogue" -max_length=512 -model_name_or_path="gpt2-large" -per_device_eval_batch_size=16 - -for dataset_name in dailydialog metalwoz tm1 tm2 tm3 sgd reddit wikidialog -do - data_dir="data/${task_name}/${model_type}/${dataset_name}" - 
output_dir="output/${task_name}/${model_type}/${dataset_name}" - - python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --model_type ${model_type} - for data_split in validation train - do - validation_file="${data_dir}/${data_split}.json" - dump_eval_loss_to="${data_dir}/token_loss_${data_split}.json" - rm ${dump_eval_loss_to} - python -m torch.distributed.launch --master_port ${master_port} \ - --nproc_per_node ${n_gpus} ../run_clm.py \ - --dump_eval_loss_to ${dump_eval_loss_to}\ - --model_name_or_path ${model_name_or_path} \ - --output_dir ${data_dir} \ - --validation_file ${validation_file} \ - --source_column ${source_column} \ - --max_length ${max_length} \ - --do_eval \ - --cache_dir ${cache_dir} \ - --preprocessing_num_workers 4 \ - --per_device_eval_batch_size ${per_device_eval_batch_size} - done -done diff --git a/convlab/base_models/gpt/keyword_extraction/key2gen_metric.py b/convlab/base_models/gpt/keyword_extraction/key2gen_metric.py deleted file mode 100644 index d9722d96..00000000 --- a/convlab/base_models/gpt/keyword_extraction/key2gen_metric.py +++ /dev/null @@ -1,96 +0,0 @@ -# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""key2gen Metric""" - -import datasets -import sacrebleu - -# TODO: Add BibTeX citation -_CITATION = """\ -@inproceedings{post-2018-call, - title = "A Call for Clarity in Reporting {BLEU} Scores", - author = "Post, Matt", - booktitle = "Proceedings of the Third Conference on Machine Translation: Research Papers", - month = oct, - year = "2018", - address = "Belgium, Brussels", - publisher = "Association for Computational Linguistics", - url = "https://www.aclweb.org/anthology/W18-6319", - pages = "186--191", -} -""" - -_DESCRIPTION = """\ -Metric to evaluate text-to-text models on the keywords grounded generation task. -""" - -_KWARGS_DESCRIPTION = """ -Calculates corpus-bleu4, positive keywords recall, negative keywords recall -Args: - positive_keywords: list of keywords (list of string) in the ground truth references - negative_keywords: list of keywords (list of string) in the random sampled references - predictions: list of predictions to score. Each predictions - should be a string. - references: list of reference for each prediction. Each - reference should be a string. 
-Returns: - bleu: corpus-bleu score - positive_keywords_recall: how many keywords in the ground truth response are generated, micro-averaged - negative_keywords_recall: how many keywords in the random sampled response are generated, micro-averaged -""" - - -@datasets.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) -class Key2GenMetrics(datasets.Metric): - """Metric to evaluate text-to-text models on the keywords grounded generation task.""" - - def _info(self): - return datasets.MetricInfo( - description=_DESCRIPTION, - citation=_CITATION, - inputs_description=_KWARGS_DESCRIPTION, - # This defines the format of each prediction and reference - features=datasets.Features({ - 'predictions': datasets.Value('string'), - 'references': datasets.Value('string'), - }) - ) - - def _compute(self, predictions, references, positive_keywords, negative_keywords=None): - """Returns the scores: bleu, positive_keywords_recall, negative_keywords_recall""" - bleu = sacrebleu.corpus_bleu(predictions, [references], lowercase=True).score - cnt = {'pos': 0, 'neg': 0, 'pos_recall': 0, 'neg_recall': 0} - if positive_keywords: - if not negative_keywords: - negative_keywords = [[]] * len(positive_keywords) - for poskeys, negkeys, prediction in zip(positive_keywords, negative_keywords, predictions): - cnt['pos'] += len(poskeys) - cnt['neg'] += len(negkeys) - - prediction = prediction.lower() - for key in poskeys: - key = key.lower() - if key in prediction: - cnt['pos_recall'] += 1 - - for key in negkeys: - key = key.lower() - if key in prediction: - cnt['neg_recall'] += 1 - - return { - "bleu": bleu, - "positive_keywords_recall": cnt['pos_recall']/cnt['pos'] if cnt['pos'] > 0 else 0, - "negative_keywords_recall": cnt['neg_recall']/cnt['neg'] if cnt['neg'] > 0 else 0, - } diff --git a/convlab/base_models/gpt/keyword_extraction/lmloss2keywords.py b/convlab/base_models/gpt/keyword_extraction/lmloss2keywords.py deleted file mode 100644 index bb221f6d..00000000 --- a/convlab/base_models/gpt/keyword_extraction/lmloss2keywords.py +++ /dev/null @@ -1,174 +0,0 @@ -import json -import json_lines -from pprint import pprint -import os -from tqdm import tqdm -import numpy as np -from nltk.corpus import stopwords -from nltk.tokenize import word_tokenize, PunktSentenceTokenizer -from transformers import GPT2Tokenizer -from string import punctuation - - -def merge_tokens(tokens, losses): - """Merge tokens into words""" - res = [] - i = 0 - while i < len(tokens): - token = tokens[i] - loss = losses[i] - if token in ['Ġ', 'Ċ']: - # "Ġ" means " ", "Ċ" means "\n" - if token == 'Ċ' and i < len(tokens) - 1 and not tokens[i+1].startswith('Ġ'): - tokens[i+1] = 'Ġ'+tokens[i+1] - i += 1 - continue - if token in ['user', 'system', 'Ġuser', 'Ġsystem'] and i < len(tokens)-1 and tokens[i+1] == ':': - if i > 0: - tokens[i+1] = '<|endoftext|>' - i += 1 - else: - i += 2 - continue - if token.startswith('Ġ'): - # token = token.replace("Ġ", "") - res.append([[token], [loss]]) - elif token == '<|endoftext|>': - res.append([[token], [0.]]) - else: - assert 'Ġ' not in token - if len(res) > 0: - res[-1][0].append(token) - res[-1][1].append(loss) - else: - res.append([[token], [loss]]) - i += 1 - return res - - -def convert_token_loss2word_loss(token_loss_file): - """generate a word loss file according to the token loss file""" - word_loss_file = os.path.join(os.path.dirname(token_loss_file), token_loss_file.split('/')[-1].replace('token', 'word')) - fin = open(token_loss_file, 'rb') - fout = open(word_loss_file, 'w', 
encoding='utf-8') - - for item in tqdm(json_lines.reader(fin)): - tokens, losses = item['tokens'], item['losses'] - assert len(tokens) == len(losses) - word2losses = merge_tokens(tokens, losses) - fout.write(json.dumps({"words": [x[0] for x in word2losses], "losses": [x[1] for x in word2losses]}, ensure_ascii=False)+'\n') - - fin.close() - fout.close() - return word_loss_file - -def main(args): - if not args.word_loss_file: - word_loss_file = convert_token_loss2word_loss(args.token_loss_file) - else: - word_loss_file = args.word_loss_file - - if not args.output_file: - return - - stop_words = set(stopwords.words('english')) - tokenizer = GPT2Tokenizer.from_pretrained(args.model_name_or_path) - sent_tokenizer = PunktSentenceTokenizer() - - def keywords_filter(words, losses): - word_loss_pairs = list(zip(words, losses)) - index2keyword = {} - index2turn_sent = {} - num_turns = 0 - turns_sent_spans = [list(sent_tokenizer.span_tokenize(utt)) for utt in ''.join(words).strip().split('<|endoftext|>')] - utt = '' - for i, word_loss_pair in enumerate(word_loss_pairs): - if word_loss_pair[0].startswith('<|endoftext|>'): - num_turns += 1 - utt = '' - continue - utt += word_loss_pair[0] - words = word_tokenize(word_loss_pair[0]) - if args.stopwords and any([w.lower() in stop_words for w in words]): - # skip stopwords - continue - if word_loss_pair[1] <= args.keywords_loss_th: - # skip if loss is too small - continue - # strip punctuation - strip_punctuation = word_loss_pair[0].strip(punctuation).strip() - if len(strip_punctuation) == 0: - # skip punctuation - continue - index2keyword[i] = strip_punctuation - for sent_idx, (sent_start, sent_end) in enumerate(turns_sent_spans[num_turns]): - if len(utt.strip()) <= sent_end: - index2turn_sent[i] = (num_turns, sent_idx) - break - candidate_indexes = list(index2keyword.keys()) - topk = min(round(args.keywords_ratio*(len(word_loss_pairs)-num_turns)), args.keywords_num) - topk_indexes = sorted(candidate_indexes, key=lambda x: word_loss_pairs[x][1], reverse=True)[:topk] - topk_indexes = sorted(topk_indexes) - keywords = [] - keywords_turn_sent2idx = {} - for i, index in enumerate(topk_indexes): - if i > 0 and index == topk_indexes[i-1] + 1 and \ - word_loss_pairs[index][0].strip().startswith(index2keyword[index]) and \ - word_loss_pairs[topk_indexes[i-1]][0].strip().endswith(index2keyword[topk_indexes[i-1]]): - keywords[-1]+= ' '+index2keyword[index] - else: - keywords_turn_sent2idx.setdefault(index2turn_sent[index][0], {}) - keywords_turn_sent2idx[index2turn_sent[index][0]].setdefault(index2turn_sent[index][1], []) - keywords_turn_sent2idx[index2turn_sent[index][0]][index2turn_sent[index][1]].append(len(keywords)) - keywords.append(index2keyword[index]) - - return keywords, keywords_turn_sent2idx - - fin = open(word_loss_file, 'rb') - fout = open(args.output_file, 'w', encoding='utf-8') - - for item in tqdm(json_lines.reader(fin)): - words = [tokenizer.convert_tokens_to_string(tokens) for tokens in item['words']] - losses = [np.mean(loss) for loss in item['losses']] - dialog_keywords, keywords_turn_sent2idx = keywords_filter(words, losses) - # print(keywords_turn_sent2idx) - turns = [] - turn = {'words': [], 'losses': []} - for i, (word, loss) in enumerate(zip(words, losses)): - if word != '<|endoftext|>': - turn['words'].append(word) - turn['losses'].append(loss) - if word == '<|endoftext|>' or i == len(words) - 1: - # switch turn - turn['utterance'] = ''.join(turn['words']).strip() - # 1) extract keywords according to LM loss within the turn - # keywords, _ 
= keywords_filter(turn['words'], turn['losses']) - # turn['turn-level_keywords'] = keywords - # 1) extract keywords according to LM loss over the dialog, and group them by sentence - turn['keywords'] = [[dialog_keywords[idx] for idx in k_idxes] for sent_idx, k_idxes in keywords_turn_sent2idx.get(len(turns), {}).items()] - turn.pop('words') - turn.pop('losses') - turns.append(turn) - turn = {'words': [], 'losses': []} - - fout.write(json.dumps(turns, ensure_ascii=False)+'\n') - - fin.close() - fout.close() - - -if __name__ == '__main__': - from argparse import ArgumentParser - parser = ArgumentParser(description="extract keywords according to lm loss") - parser.add_argument('--model_type', '-m', type=str, help='gpt or dialogpt') - parser.add_argument('--model_name_or_path', type=str, help='model name or path') - parser.add_argument('--token_loss_file', '-t', type=str, help='path to the token loss file that contains two columns: [tokens, losses]') - parser.add_argument('--word_loss_file', '-w', type=str, help='path to the token loss file that contains two columns: [tokens, losses]') - parser.add_argument('--output_file', '-o', type=str, help='path to the output file') - parser.add_argument('--keywords_num', '-n', type=int, default=100, help='how many words in an utterance serve as keywords') - parser.add_argument('--keywords_ratio', '-r', type=float, default=1.0, help='how many words (in ratio) in an utterance serve as keywords') - parser.add_argument('--keywords_loss_th', '-th', type=float, default=0., help='loss threshold for the keywords') - parser.add_argument('--stopwords', '-s', type=lambda x: bool(eval(x)), default=True, help='filter out stopwords') - args = parser.parse_args() - print(args) - main(args) diff --git a/convlab/base_models/gpt/keyword_extraction/merge_keywords_res.py b/convlab/base_models/gpt/keyword_extraction/merge_keywords_res.py deleted file mode 100644 index 94af288a..00000000 --- a/convlab/base_models/gpt/keyword_extraction/merge_keywords_res.py +++ /dev/null @@ -1,29 +0,0 @@ -import json - -def main(args): - filename2data = {f.split('/')[-1]: json.load(open(f)) for f in args.keywords_files} - first_filename = args.keywords_files[0].split('/')[-1] - dialogs = [] - for i in range(len(filename2data[first_filename])): - turns = [] - for j in range(min([len(filename2data[filename][i]) for filename in filename2data])): - utt = filename2data[first_filename][i][j]['utterance'] - keywords = {filename.split('_')[3]+'_nonstopword'+filename.split('_')[-1]: ' | '.join(filename2data[filename][i][j]['keywords']) for filename in filename2data} - turns.append({ - "utterance": utt, - **keywords - }) - dialogs.append(turns) - json.dump(dialogs, open(args.output_file, "w", encoding='utf-8'), indent=2, ensure_ascii=False) - - - - -if __name__ == '__main__': - from argparse import ArgumentParser - parser = ArgumentParser(description="calculate NLU metrics for unified datasets") - parser.add_argument('--keywords_files', '-f', metavar='keywords_files', nargs='*', help='keywords files') - parser.add_argument('--output_file', '-o', type=str, help='path to the output file') - args = parser.parse_args() - print(args) - main(args) diff --git a/convlab/base_models/gpt/keyword_extraction/train_lm_dialogpt.sh b/convlab/base_models/gpt/keyword_extraction/train_lm_dialogpt.sh deleted file mode 100644 index f260f707..00000000 --- a/convlab/base_models/gpt/keyword_extraction/train_lm_dialogpt.sh +++ /dev/null @@ -1,46 +0,0 @@ -set -e -n_gpus=1 -task_name="lm" -dataset_name="multiwoz21" 
-model_type="dialogpt" -data_dir="data/${task_name}/${dataset_name}/${model_type}" -output_dir="output/${task_name}/${dataset_name}/${model_type}" -cache_dir="../cache" -logging_dir="${output_dir}/runs" -train_file="${data_dir}/train.json" -validation_file="${data_dir}/validation.json" -test_file="${data_dir}/test.json" -source_column="dialogue" -max_length=512 -model_name_or_path="microsoft/DialoGPT-large" -per_device_train_batch_size=16 -per_device_eval_batch_size=16 -gradient_accumulation_steps=4 -lr=5e-5 -num_train_epochs=3 - -python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --model_type ${model_type} - -python ../run_clm.py \ - --model_name_or_path ${model_name_or_path} \ - --train_file ${train_file} \ - --validation_file ${validation_file} \ - --source_column ${source_column} \ - --max_length ${max_length} \ - --do_train \ - --do_eval \ - --save_strategy epoch \ - --evaluation_strategy epoch \ - --load_best_model_at_end \ - --prediction_loss_only \ - --cache_dir ${cache_dir} \ - --output_dir ${output_dir} \ - --logging_dir ${logging_dir} \ - --overwrite_output_dir \ - --preprocessing_num_workers 4 \ - --per_device_train_batch_size ${per_device_train_batch_size} \ - --per_device_eval_batch_size ${per_device_eval_batch_size} \ - --gradient_accumulation_steps ${gradient_accumulation_steps} \ - --learning_rate ${lr} \ - --num_train_epochs ${num_train_epochs} \ - --gradient_checkpointing diff --git a/convlab/base_models/gpt/keyword_extraction/train_lm_gpt.sh b/convlab/base_models/gpt/keyword_extraction/train_lm_gpt.sh deleted file mode 100644 index 82c63a1f..00000000 --- a/convlab/base_models/gpt/keyword_extraction/train_lm_gpt.sh +++ /dev/null @@ -1,46 +0,0 @@ -set -e -n_gpus=1 -task_name="lm" -dataset_name="multiwoz21" -model_type="gpt" -data_dir="data/${task_name}/${dataset_name}/${model_type}" -output_dir="output/${task_name}/${dataset_name}/${model_type}" -cache_dir="../cache" -logging_dir="${output_dir}/runs" -train_file="${data_dir}/train.json" -validation_file="${data_dir}/validation.json" -test_file="${data_dir}/test.json" -source_column="dialogue" -max_length=512 -model_name_or_path="gpt2-large" -per_device_train_batch_size=16 -per_device_eval_batch_size=16 -gradient_accumulation_steps=4 -lr=5e-5 -num_train_epochs=3 - -python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --model_type ${model_type} - -python ../run_clm.py \ - --model_name_or_path ${model_name_or_path} \ - --train_file ${train_file} \ - --validation_file ${validation_file} \ - --source_column ${source_column} \ - --max_length ${max_length} \ - --do_train \ - --do_eval \ - --save_strategy epoch \ - --evaluation_strategy epoch \ - --load_best_model_at_end \ - --prediction_loss_only \ - --cache_dir ${cache_dir} \ - --output_dir ${output_dir} \ - --logging_dir ${logging_dir} \ - --overwrite_output_dir \ - --preprocessing_num_workers 4 \ - --per_device_train_batch_size ${per_device_train_batch_size} \ - --per_device_eval_batch_size ${per_device_eval_batch_size} \ - --gradient_accumulation_steps ${gradient_accumulation_steps} \ - --learning_rate ${lr} \ - --num_train_epochs ${num_train_epochs} \ - --gradient_checkpointing diff --git a/convlab/base_models/gpt/keyword_extraction/train_t5_key2gen+key2gen_noisy.sh b/convlab/base_models/gpt/keyword_extraction/train_t5_key2gen+key2gen_noisy.sh deleted file mode 100644 index 8e0b3617..00000000 --- a/convlab/base_models/gpt/keyword_extraction/train_t5_key2gen+key2gen_noisy.sh +++ /dev/null @@ -1,51 +0,0 @@ -set -e -n_gpus=8 
-master_port=23456 -task_name="key2gen+key2gen_noisy" -dataset_name="dailydialog+metalwoz+tm1+tm2+tm3+sgd+reddit+wikidialog" -model_type="gpt" -model_name="t5-small" -data_dir="data/${task_name}/${model_type}/${dataset_name}" -output_dir="output/${task_name}/${model_name}/${dataset_name}" -cache_dir="../cache" -logging_dir="${output_dir}/runs" -train_file="${data_dir}/train.json" -source_column="source" -target_column="target" -truncation_side="left" -max_source_length=512 -max_target_length=128 -model_name_or_path="${model_name}" -per_device_train_batch_size=64 -per_device_eval_batch_size=128 -gradient_accumulation_steps=1 -num_workers=16 -lr=1e-3 -num_train_epochs=1 - -python -m torch.distributed.launch --master_port ${master_port} \ - --nproc_per_node ${n_gpus} ../../t5/run_seq2seq.py \ - --task_name ${task_name} \ - --train_file ${train_file} \ - --source_column ${source_column} \ - --target_column ${target_column} \ - --max_source_length ${max_source_length} \ - --max_target_length ${max_target_length} \ - --truncation_side ${truncation_side} \ - --model_name_or_path ${model_name_or_path} \ - --do_train \ - --save_steps 5000 \ - --save_total_limit 1 \ - --cache_dir ${cache_dir} \ - --output_dir ${output_dir} \ - --logging_dir ${logging_dir} \ - --preprocessing_num_workers ${num_workers} \ - --dataloader_num_workers ${num_workers} \ - --per_device_train_batch_size ${per_device_train_batch_size} \ - --per_device_eval_batch_size ${per_device_eval_batch_size} \ - --gradient_accumulation_steps ${gradient_accumulation_steps} \ - --learning_rate ${lr} \ - --num_train_epochs ${num_train_epochs} \ - --optim adafactor \ - --lr_scheduler_type constant \ - --gradient_checkpointing diff --git a/convlab/base_models/gpt/keyword_extraction/train_t5_rg.sh b/convlab/base_models/gpt/keyword_extraction/train_t5_rg.sh deleted file mode 100644 index 8d9a019b..00000000 --- a/convlab/base_models/gpt/keyword_extraction/train_t5_rg.sh +++ /dev/null @@ -1,51 +0,0 @@ -set -e -n_gpus=8 -master_port=23456 -task_name="rg" -dataset_name="dailydialog+metalwoz+tm1+tm2+tm3+sgd+reddit+wikidialog" -model_type="gpt" -model_name="t5-small" -data_dir="data/${task_name}/${model_type}/${dataset_name}" -output_dir="output/${task_name}/${model_name}/${dataset_name}" -cache_dir="../cache" -logging_dir="${output_dir}/runs" -train_file="${data_dir}/train.json" -source_column="source" -target_column="target" -truncation_side="left" -max_source_length=512 -max_target_length=128 -model_name_or_path="${model_name}" -per_device_train_batch_size=64 -per_device_eval_batch_size=128 -gradient_accumulation_steps=1 -num_workers=16 -lr=1e-3 -num_train_epochs=1 - -python -m torch.distributed.launch --master_port ${master_port} \ - --nproc_per_node ${n_gpus} ../../t5/run_seq2seq.py \ - --task_name ${task_name} \ - --train_file ${train_file} \ - --source_column ${source_column} \ - --target_column ${target_column} \ - --max_source_length ${max_source_length} \ - --max_target_length ${max_target_length} \ - --truncation_side ${truncation_side} \ - --model_name_or_path ${model_name_or_path} \ - --do_train \ - --save_steps 5000 \ - --save_total_limit 1 \ - --cache_dir ${cache_dir} \ - --output_dir ${output_dir} \ - --logging_dir ${logging_dir} \ - --preprocessing_num_workers ${num_workers} \ - --dataloader_num_workers ${num_workers} \ - --per_device_train_batch_size ${per_device_train_batch_size} \ - --per_device_eval_batch_size ${per_device_eval_batch_size} \ - --gradient_accumulation_steps ${gradient_accumulation_steps} \ - --learning_rate ${lr} 
\ - --num_train_epochs ${num_train_epochs} \ - --optim adafactor \ - --lr_scheduler_type constant \ - --gradient_checkpointing diff --git a/convlab/base_models/gpt/keyword_extraction/train_t5_rg_key2gen+key2gen_noisy.sh b/convlab/base_models/gpt/keyword_extraction/train_t5_rg_key2gen+key2gen_noisy.sh deleted file mode 100644 index 75b79932..00000000 --- a/convlab/base_models/gpt/keyword_extraction/train_t5_rg_key2gen+key2gen_noisy.sh +++ /dev/null @@ -1,51 +0,0 @@ -set -e -n_gpus=8 -master_port=23456 -task_name="key2gen+key2gen_noisy" -dataset_name="dailydialog+metalwoz+tm1+tm2+tm3+sgd+reddit+wikidialog" -model_type="gpt" -model_name="t5-small" -data_dir="data/${task_name}/${model_type}/${dataset_name}" -output_dir="output/${task_name}/${model_name}/${dataset_name}" -cache_dir="../cache" -logging_dir="${output_dir}/runs" -train_file="${data_dir}/train.json" -source_column="source" -target_column="target" -truncation_side="left" -max_source_length=512 -max_target_length=128 -model_name_or_path="output/rg/${model_name}/${dataset_name}" -per_device_train_batch_size=64 -per_device_eval_batch_size=128 -gradient_accumulation_steps=1 -num_workers=16 -lr=1e-3 -num_train_epochs=1 - -python -m torch.distributed.launch --master_port ${master_port} \ - --nproc_per_node ${n_gpus} ../../t5/run_seq2seq.py \ - --task_name ${task_name} \ - --train_file ${train_file} \ - --source_column ${source_column} \ - --target_column ${target_column} \ - --max_source_length ${max_source_length} \ - --max_target_length ${max_target_length} \ - --truncation_side ${truncation_side} \ - --model_name_or_path ${model_name_or_path} \ - --do_train \ - --save_steps 5000 \ - --save_total_limit 1 \ - --cache_dir ${cache_dir} \ - --output_dir ${output_dir} \ - --logging_dir ${logging_dir} \ - --preprocessing_num_workers ${num_workers} \ - --dataloader_num_workers ${num_workers} \ - --per_device_train_batch_size ${per_device_train_batch_size} \ - --per_device_eval_batch_size ${per_device_eval_batch_size} \ - --gradient_accumulation_steps ${gradient_accumulation_steps} \ - --learning_rate ${lr} \ - --num_train_epochs ${num_train_epochs} \ - --optim adafactor \ - --lr_scheduler_type constant \ - --gradient_checkpointing diff --git a/convlab/base_models/gpt/run_clm.py b/convlab/base_models/gpt/run_clm.py deleted file mode 100644 index ace68609..00000000 --- a/convlab/base_models/gpt/run_clm.py +++ /dev/null @@ -1,564 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright 2020 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Fine-tuning the library models for causal language modeling (GPT, GPT-2, CTRL, ...) on a text file or a dataset. -Modified from https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_clm.py -Here is the full list of checkpoints on the hub that can be fine-tuned by this script: -https://huggingface.co/models?filter=text-generation -""" -# You can also adapt this script on your own causal language modeling task. 
Pointers for this are left as comments. - -import logging -import math -import os -import sys -from dataclasses import dataclass, field -from itertools import chain -from typing import Optional - -import datasets -from datasets import load_dataset -from tqdm import tqdm -from torch.utils.data import DataLoader -import torch -import json - -import transformers -from transformers import ( - CONFIG_MAPPING, - MODEL_FOR_CAUSAL_LM_MAPPING, - AutoConfig, - AutoModelForCausalLM, - AutoTokenizer, - HfArgumentParser, - TrainingArguments, - DataCollatorForTokenClassification, - is_torch_tpu_available, - set_seed, -) -from transformers.trainer_utils import get_last_checkpoint -from transformers.utils import check_min_version -from transformers.utils.versions import require_version -from convlab.base_models.gpt.trainer import DumpTokenLossTrainer - - -# Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.17.0") - -require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") - -logger = logging.getLogger(__name__) - - -MODEL_CONFIG_CLASSES = list(MODEL_FOR_CAUSAL_LM_MAPPING.keys()) -MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) - - -@dataclass -class ModelArguments: - """ - Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. - """ - - model_name_or_path: Optional[str] = field( - default=None, - metadata={ - "help": "The model checkpoint for weights initialization." - "Don't set if you want to train a model from scratch." - }, - ) - model_type: Optional[str] = field( - default=None, - metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)}, - ) - config_overrides: Optional[str] = field( - default=None, - metadata={ - "help": "Override some existing default config settings when a model is trained from scratch. Example: " - "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index" - }, - ) - config_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} - ) - tokenizer_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} - ) - cache_dir: Optional[str] = field( - default=None, - metadata={"help": "Where to store the pretrained models downloaded from huggingface.co"}, - ) - use_fast_tokenizer: bool = field( - default=True, - metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, - ) - truncation_side: Optional[str] = field( - default="right", - metadata={"help": "Which side to truncate, left or right."} - ) - model_revision: str = field( - default="main", - metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, - ) - use_auth_token: bool = field( - default=False, - metadata={ - "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script " - "with private models)." - }, - ) - resize_position_embeddings: Optional[bool] = field( - default=None, - metadata={ - "help": "Whether to automatically resize the position embeddings if `max_source_length` exceeds " - "the model's position embeddings." 
- }, - ) - - def __post_init__(self): - if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None): - raise ValueError( - "--config_overrides can't be used in combination with --config_name or --model_name_or_path" - ) - - -@dataclass -class DataTrainingArguments: - """ - Arguments pertaining to what data we are going to input our model for training and eval. - """ - - dataset_name: Optional[str] = field( - default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} - ) - dataset_config_name: Optional[str] = field( - default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} - ) - source_column: Optional[str] = field( - default=None, - metadata={"help": "The name of the column in the datasets containing the texts."}, - ) - train_file: Optional[str] = field( - default=None, metadata={"help": "The input training data file (a text, jsonlines or csv file)."} - ) - validation_file: Optional[str] = field( - default=None, - metadata={ - "help": "An optional input evaluation data file to evaluate the metrics on (a text, jsonlines or csv file)." - }, - ) - dump_eval_loss_to: Optional[str] = field( - default=None, metadata={"help": "Where to dump the tokens' losses in the evaluation data, default not to"} - ) - overwrite_cache: bool = field( - default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} - ) - preprocessing_num_workers: Optional[int] = field( - default=None, - metadata={"help": "The number of processes to use for the preprocessing."}, - ) - max_length: Optional[int] = field( - default=1024, - metadata={ - "help": "The maximum total input sequence length after tokenization. Sequences longer " - "than this will be truncated, sequences shorter will be padded." - }, - ) - pad_to_max_length: bool = field( - default=False, - metadata={ - "help": "Whether to pad all samples to model maximum sentence length. " - "If False, will pad the samples dynamically when batching to the maximum length in the batch. More " - "efficient on GPU but very bad for TPU." - }, - ) - max_train_samples: Optional[int] = field( - default=None, - metadata={ - "help": "For debugging purposes or quicker training, truncate the number of training examples to this " - "value if set." - }, - ) - max_eval_samples: Optional[int] = field( - default=None, - metadata={ - "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this " - "value if set." - }, - ) - ignore_pad_token_for_loss: bool = field( - default=True, - metadata={ - "help": "Whether to ignore the tokens corresponding to padded labels in the loss computation or not." - }, - ) - validation_split_percentage: Optional[int] = field( - default=5, - metadata={ - "help": "The percentage of the train set used as validation set in case there's no validation split" - }, - ) - keep_linebreaks: bool = field( - default=True, metadata={"help": "Whether to keep line breaks when using TXT files or not."} - ) - - def __post_init__(self): - if self.dataset_name is None and self.train_file is None and self.validation_file is None: - raise ValueError("Need either a dataset name or a training/validation file.") - else: - if self.train_file is not None: - extension = self.train_file.split(".")[-1] - assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file." 
- if self.validation_file is not None: - extension = self.validation_file.split(".")[-1] - assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file." - - -def main(): - # See all possible arguments in src/transformers/training_args.py - # or by passing the --help flag to this script. - # We now keep distinct sets of args, for a cleaner separation of concerns. - - parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) - if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): - # If we pass only one argument to the script and it's the path to a json file, - # let's parse it to get our arguments. - model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) - else: - model_args, data_args, training_args = parser.parse_args_into_dataclasses() - - # Setup logging - logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - handlers=[logging.StreamHandler(sys.stdout)], - ) - log_level = training_args.get_process_log_level() - logger.setLevel(log_level) - datasets.utils.logging.set_verbosity(log_level) - transformers.utils.logging.set_verbosity(log_level) - transformers.utils.logging.enable_default_handler() - transformers.utils.logging.enable_explicit_format() - - # Log on each process the small summary: - logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" - + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" - ) - logger.info(f"Training/evaluation parameters {training_args}") - - # Detecting last checkpoint. - last_checkpoint = None - if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: - last_checkpoint = get_last_checkpoint(training_args.output_dir) - if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: - raise ValueError( - f"Output directory ({training_args.output_dir}) already exists and is not empty. " - "Use --overwrite_output_dir to overcome." - ) - elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: - logger.info( - f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " - "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." - ) - - # Set seed before initializing model. - set_seed(training_args.seed) - - # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) - # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ - # (the dataset will be downloaded automatically from the datasets Hub). - # - # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called - # 'text' is found. You can easily tweak this behavior (see below). - # - # In distributed training, the load_dataset function guarantee that only one local process can concurrently - # download the dataset. - if data_args.dataset_name is not None: - # Downloading and loading a dataset from the hub. 
- raw_datasets = load_dataset( - data_args.dataset_name, - data_args.dataset_config_name, - cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, - ) - if "validation" not in raw_datasets.keys(): - raw_datasets["validation"] = load_dataset( - data_args.dataset_name, - data_args.dataset_config_name, - split=f"train[:{data_args.validation_split_percentage}%]", - cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, - ) - raw_datasets["train"] = load_dataset( - data_args.dataset_name, - data_args.dataset_config_name, - split=f"train[{data_args.validation_split_percentage}%:]", - cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, - ) - else: - data_files = {} - dataset_args = {} - if data_args.train_file is not None: - data_files["train"] = data_args.train_file - if data_args.validation_file is not None: - data_files["validation"] = data_args.validation_file - extension = ( - data_args.train_file.split(".")[-1] - if data_args.train_file is not None - else data_args.validation_file.split(".")[-1] - ) - if extension == "txt": - extension = "text" - dataset_args["keep_linebreaks"] = data_args.keep_linebreaks - raw_datasets = load_dataset( - extension, - data_files=data_files, - cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, - **dataset_args, - ) - # If no validation data is there, validation_split_percentage will be used to divide the dataset. - if "validation" not in raw_datasets.keys(): - raw_datasets["validation"] = load_dataset( - extension, - data_files=data_files, - split=f"train[:{data_args.validation_split_percentage}%]", - cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, - **dataset_args, - ) - raw_datasets["train"] = load_dataset( - extension, - data_files=data_files, - split=f"train[{data_args.validation_split_percentage}%:]", - cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, - **dataset_args, - ) - - # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at - # https://huggingface.co/docs/datasets/loading_datasets.html. - - # Load pretrained model and tokenizer - # - # Distributed training: - # The .from_pretrained methods guarantee that only one local process can concurrently - # download model & vocab. 
- config_kwargs = { - "cache_dir": model_args.cache_dir, - "revision": model_args.model_revision, - "use_auth_token": True if model_args.use_auth_token else None, - } - if model_args.config_name: - config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs) - elif model_args.model_name_or_path: - config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs) - else: - config = CONFIG_MAPPING[model_args.model_type]() - logger.warning("You are instantiating a new config instance from scratch.") - if model_args.config_overrides is not None: - logger.info(f"Overriding config: {model_args.config_overrides}") - config.update_from_string(model_args.config_overrides) - logger.info(f"New config: {config}") - - tokenizer_kwargs = { - "cache_dir": model_args.cache_dir, - "use_fast": model_args.use_fast_tokenizer, - "truncation_side": model_args.truncation_side, - "revision": model_args.model_revision, - "use_auth_token": True if model_args.use_auth_token else None, - } - if model_args.tokenizer_name: - tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs) - elif model_args.model_name_or_path: - tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs) - else: - raise ValueError( - "You are instantiating a new tokenizer from scratch. This is not supported by this script." - "You can do it from another script, save it, and load it from here, using --tokenizer_name." - ) - - if not tokenizer.pad_token: - tokenizer.pad_token = tokenizer.eos_token - - if model_args.model_name_or_path: - model = AutoModelForCausalLM.from_pretrained( - model_args.model_name_or_path, - from_tf=bool(".ckpt" in model_args.model_name_or_path), - config=config, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, - ) - else: - model = AutoModelForCausalLM.from_config(config) - n_params = sum(dict((p.data_ptr(), p.numel()) for p in model.parameters()).values()) - logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params") - - model.resize_token_embeddings(len(tokenizer)) - - if training_args.gradient_checkpointing: - # use_cache=True is incompatible with gradient checkpointing. - config.use_cache = False - - # Preprocessing the datasets. - # First we tokenize all the texts. - if training_args.do_train: - column_names = raw_datasets["train"].column_names - elif training_args.do_eval: - column_names = raw_datasets["validation"].column_names - else: - logger.info("There is nothing to do. Please pass `do_train` and/or `do_eval`.") - return - if data_args.source_column is None: - source_column = column_names[0] - else: - source_column = data_args.source_column - if source_column not in column_names: - raise ValueError( - f"--source_column' value '{data_args.source_column}' needs to be one of: {', '.join(column_names)}" - ) - - def preprocess_function(examples): - - inputs = [] - for i in range(len(examples[source_column])): - if len(examples[source_column][i]) > 0: - inputs.append(examples[source_column][i]) - - padding = "max_length" if data_args.pad_to_max_length else False - model_inputs = tokenizer(inputs, max_length=data_args.max_length, padding=padding, truncation=True) - - # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore - # padding in the loss. Else pad in data_collator. 
- if padding == "max_length" and data_args.ignore_pad_token_for_loss: - model_inputs["labels"] = [ - [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in model_inputs["input_ids"] - ] - else: - model_inputs["labels"] = model_inputs["input_ids"].copy() - - return model_inputs - - with training_args.main_process_first(desc="dataset map tokenization"): - tokenized_datasets = raw_datasets.map( - preprocess_function, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=column_names, - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on dataset", - ) - - lm_datasets = tokenized_datasets - - if training_args.do_train: - if "train" not in tokenized_datasets: - raise ValueError("--do_train requires a train dataset") - train_dataset = lm_datasets["train"] - if data_args.max_train_samples is not None: - max_train_samples = min(len(train_dataset), data_args.max_train_samples) - train_dataset = train_dataset.select(range(max_train_samples)) - - if training_args.do_eval: - if "validation" not in tokenized_datasets: - raise ValueError("--do_eval requires a validation dataset") - eval_dataset = lm_datasets["validation"] - if data_args.max_eval_samples is not None: - max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) - eval_dataset = eval_dataset.select(range(max_eval_samples)) - - # Data collator - label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id - data_collator = DataCollatorForTokenClassification( - tokenizer, - label_pad_token_id=label_pad_token_id, - pad_to_multiple_of=8 if training_args.fp16 else None, - ) - - training_args.dump_eval_loss_to = data_args.dump_eval_loss_to - - # Initialize our Trainer - trainer = DumpTokenLossTrainer( - model=model, - args=training_args, - train_dataset=train_dataset if training_args.do_train else None, - eval_dataset=eval_dataset if training_args.do_eval else None, - tokenizer=tokenizer, - # Data collator will default to DataCollatorWithPadding, so we change it. 
- data_collator=data_collator, - ) - - # Training - if training_args.do_train: - checkpoint = None - if training_args.resume_from_checkpoint is not None: - checkpoint = training_args.resume_from_checkpoint - elif last_checkpoint is not None: - checkpoint = last_checkpoint - train_result = trainer.train(resume_from_checkpoint=checkpoint) - trainer.save_model() # Saves the tokenizer too for easy upload - - metrics = train_result.metrics - max_train_samples = ( - data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) - ) - metrics["train_samples"] = min(max_train_samples, len(train_dataset)) - - trainer.log_metrics("train", metrics) - trainer.save_metrics("train", metrics) - trainer.save_state() - - # Evaluation - if training_args.do_eval: - logger.info("*** Evaluate ***") - metrics = trainer.evaluate(metric_key_prefix="eval") - max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) - metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) - try: - perplexity = math.exp(metrics["eval_loss"]) - except OverflowError: - perplexity = float("inf") - metrics["eval_perplexity"] = perplexity - logger.info(f"eval_perplexity: {perplexity}") - - trainer.log_metrics("eval", metrics) - trainer.save_metrics("eval", metrics) - - kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-generation"} - if data_args.dataset_name is not None: - kwargs["dataset_tags"] = data_args.dataset_name - if data_args.dataset_config_name is not None: - kwargs["dataset_args"] = data_args.dataset_config_name - kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" - else: - kwargs["dataset"] = data_args.dataset_name - - if training_args.push_to_hub: - trainer.push_to_hub(**kwargs) - else: - trainer.create_model_card(**kwargs) - - -def _mp_fn(index): - # For xla_spawn (TPUs) - main() - - -if __name__ == "__main__": - main() diff --git a/convlab/base_models/gpt/trainer.py b/convlab/base_models/gpt/trainer.py deleted file mode 100644 index 5a8ed11c..00000000 --- a/convlab/base_models/gpt/trainer.py +++ /dev/null @@ -1,243 +0,0 @@ -from transformers import Trainer -from transformers.trainer_utils import EvalLoopOutput, has_length -from transformers.deepspeed import deepspeed_init -from transformers.utils import logging -from transformers.trainer_pt_utils import find_batch_size, nested_concat, nested_numpify, IterableDatasetShard, nested_truncate -from transformers.trainer_utils import EvalPrediction, denumpify_detensorize -import torch -from torch.utils.data import DataLoader -import numpy as np -from typing import List, Optional -import json - - -logger = logging.get_logger(__name__) - -class DumpTokenLossTrainer(Trainer): - def evaluation_loop( - self, - dataloader: DataLoader, - description: str, - prediction_loss_only: Optional[bool] = None, - ignore_keys: Optional[List[str]] = None, - metric_key_prefix: str = "eval", - ) -> EvalLoopOutput: - """ - Prediction/evaluation loop, shared by `Trainer.evaluate()` and `Trainer.predict()`. - Works both with or without labels. 
- """ - args = self.args - - prediction_loss_only = args.prediction_loss_only - - # if eval is called w/o train init deepspeed here - if args.deepspeed and not self.deepspeed: - - # XXX: eval doesn't have `resume_from_checkpoint` arg but we should be able to do eval - # from the checkpoint eventually - deepspeed_engine, _, _ = deepspeed_init( - self, num_training_steps=0, resume_from_checkpoint=None, inference=True - ) - self.model = deepspeed_engine.module - self.model_wrapped = deepspeed_engine - self.deepspeed = deepspeed_engine - - model = self._wrap_model(self.model, training=False, dataloader=dataloader) - - # if full fp16 or bf16 eval is wanted and this ``evaluation`` or ``predict`` isn't called - # while ``train`` is running, cast it to the right dtype first and then put on device - if not self.is_in_train: - if args.fp16_full_eval: - model = model.to(dtype=torch.float16, device=args.device) - elif args.bf16_full_eval: - model = model.to(dtype=torch.bfloat16, device=args.device) - - batch_size = self.args.eval_batch_size - - logger.info(f"***** Running {description} *****") - if has_length(dataloader): - logger.info(f" Num examples = {self.num_examples(dataloader)}") - else: - logger.info(" Num examples: Unknown") - logger.info(f" Batch size = {batch_size}") - - model.eval() - - self.callback_handler.eval_dataloader = dataloader - # Do this before wrapping. - eval_dataset = getattr(dataloader, "dataset", None) - - if args.past_index >= 0: - self._past = None - - # Initialize containers - # losses/preds/labels on GPU/TPU (accumulated for eval_accumulation_steps) - losses_host = None - preds_host = None - labels_host = None - inputs_host = None - - # losses/preds/labels on CPU (final containers) - all_losses = None - all_preds = None - all_labels = None - all_inputs = None - # Will be useful when we have an iterable dataset so don't know its length. - - if args.dump_eval_loss_to: - writer = open(args.dump_eval_loss_to, "a", encoding='utf-8') - loss_fct = torch.nn.CrossEntropyLoss(reduction='none') - num_sample_to_write = len(eval_dataset) - - observed_num_examples = 0 - # Main evaluation loop - for step, inputs in enumerate(dataloader): - # Update the observed num examples - observed_batch_size = find_batch_size(inputs) - if observed_batch_size is not None: - observed_num_examples += observed_batch_size - # For batch samplers, batch_size is not known by the dataloader in advance. 
- if batch_size is None: - batch_size = observed_batch_size - - # Prediction step - loss, logits, labels = self.prediction_step(model, inputs, prediction_loss_only, ignore_keys=ignore_keys) - inputs_decode = self._prepare_input(inputs["input_ids"]) if args.include_inputs_for_metrics else None - - # Update containers on host - if loss is not None: - losses = self._nested_gather(loss.repeat(batch_size)) - losses_host = losses if losses_host is None else torch.cat((losses_host, losses), dim=0) - if labels is not None: - labels = self._pad_across_processes(labels) - labels = self._nested_gather(labels) - # labels_host = labels if labels_host is None else nested_concat(labels_host, labels, padding_index=-100) - if inputs_decode is not None: - inputs_decode = self._pad_across_processes(inputs_decode) - inputs_decode = self._nested_gather(inputs_decode) - inputs_host = ( - inputs_decode - if inputs_host is None - else nested_concat(inputs_host, inputs_decode, padding_index=-100) - ) - if logits is not None: - logits = self._pad_across_processes(logits) - logits = self._nested_gather(logits) - if self.preprocess_logits_for_metrics is not None: - logits = self.preprocess_logits_for_metrics(logits, labels) - # preds_host = logits if preds_host is None else nested_concat(preds_host, logits, padding_index=-100) - - if args.dump_eval_loss_to: - if self.is_world_process_zero() and num_sample_to_write > 0: - assert logits is not None and labels is not None, print('prediction_loss_only', prediction_loss_only) - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - batch_token_loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) - batch_token_loss = batch_token_loss.view(shift_labels.size()).tolist() - labels = labels.tolist() - for i in range(len(labels)): - if num_sample_to_write > 0: - num_sample_to_write -= 1 - else: - break - token_ids = [x for x in labels[i] if x != -100] - tokens = self.tokenizer.convert_ids_to_tokens(token_ids) - token_losses = [0] + batch_token_loss[i][:len(token_ids)-1] - writer.write(json.dumps({"tokens": tokens, "losses": token_losses}, ensure_ascii=False)+'\n') - - self.control = self.callback_handler.on_prediction_step(args, self.state, self.control) - - # Gather all tensors and put them back on the CPU if we have done enough accumulation steps. 
- if args.eval_accumulation_steps is not None and (step + 1) % args.eval_accumulation_steps == 0: - if losses_host is not None: - losses = nested_numpify(losses_host) - all_losses = losses if all_losses is None else np.concatenate((all_losses, losses), axis=0) - if preds_host is not None: - logits = nested_numpify(preds_host) - all_preds = logits if all_preds is None else nested_concat(all_preds, logits, padding_index=-100) - if inputs_host is not None: - inputs_decode = nested_numpify(inputs_host) - all_inputs = ( - inputs_decode - if all_inputs is None - else nested_concat(all_inputs, inputs_decode, padding_index=-100) - ) - if labels_host is not None: - labels = nested_numpify(labels_host) - all_labels = ( - labels if all_labels is None else nested_concat(all_labels, labels, padding_index=-100) - ) - - # Set back to None to begin a new accumulation - losses_host, preds_host, inputs_host, labels_host = None, None, None, None - - if args.dump_eval_loss_to: - writer.close() - - if args.past_index and hasattr(self, "_past"): - # Clean the state at the end of the evaluation loop - delattr(self, "_past") - - # Gather all remaining tensors and put them back on the CPU - if losses_host is not None: - losses = nested_numpify(losses_host) - all_losses = losses if all_losses is None else np.concatenate((all_losses, losses), axis=0) - if preds_host is not None: - logits = nested_numpify(preds_host) - all_preds = logits if all_preds is None else nested_concat(all_preds, logits, padding_index=-100) - if inputs_host is not None: - inputs_decode = nested_numpify(inputs_host) - all_inputs = ( - inputs_decode if all_inputs is None else nested_concat(all_inputs, inputs_decode, padding_index=-100) - ) - if labels_host is not None: - labels = nested_numpify(labels_host) - all_labels = labels if all_labels is None else nested_concat(all_labels, labels, padding_index=-100) - - # Number of samples - if has_length(eval_dataset): - num_samples = len(eval_dataset) - # The instance check is weird and does not actually check for the type, but whether the dataset has the right - # methods. Therefore we need to make sure it also has the attribute. - elif isinstance(eval_dataset, IterableDatasetShard) and hasattr(eval_dataset, "num_examples"): - num_samples = eval_dataset.num_examples - else: - if has_length(dataloader): - num_samples = self.num_examples(dataloader) - else: # both len(dataloader.dataset) and len(dataloader) fail - num_samples = observed_num_examples - - # Number of losses has been rounded to a multiple of batch_size and in a distributed training, the number of - # samplers has been rounded to a multiple of batch_size, so we truncate. - if all_losses is not None: - all_losses = all_losses[:num_samples] - if all_preds is not None: - all_preds = nested_truncate(all_preds, num_samples) - if all_labels is not None: - all_labels = nested_truncate(all_labels, num_samples) - if all_inputs is not None: - all_inputs = nested_truncate(all_inputs, num_samples) - - # Metrics! 
- if self.compute_metrics is not None and all_preds is not None and all_labels is not None: - if args.include_inputs_for_metrics: - metrics = self.compute_metrics( - EvalPrediction(predictions=all_preds, label_ids=all_labels, inputs=all_inputs) - ) - else: - metrics = self.compute_metrics(EvalPrediction(predictions=all_preds, label_ids=all_labels)) - else: - metrics = {} - - # To be JSON-serializable, we need to remove numpy types or zero-d tensors - metrics = denumpify_detensorize(metrics) - - if all_losses is not None: - metrics[f"{metric_key_prefix}_loss"] = all_losses.mean().item() - - # Prefix all keys with metric_key_prefix + '_' - for key in list(metrics.keys()): - if not key.startswith(f"{metric_key_prefix}_"): - metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key) - - return EvalLoopOutput(predictions=all_preds, label_ids=all_labels, metrics=metrics, num_samples=num_samples) -- GitLab
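
The per-token loss dump is the only non-standard piece of the removed trainer: it shifts logits and labels by one position, applies an unreduced cross-entropy, and writes one JSON line per example with the tokens and their losses. Below is a minimal standalone sketch of that computation outside the Trainer, assuming a generic causal LM from transformers; the model name, input texts, and output path are placeholders for illustration, not part of the removed code.

    # Illustrative sketch: reproduce the per-token loss dump that DumpTokenLossTrainer
    # wrote to `args.dump_eval_loss_to`, without the full evaluation loop.
    import json
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    model_name = "gpt2"  # placeholder; the removed scripts fine-tuned GPT/DialoGPT variants
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
    model.eval()

    loss_fct = torch.nn.CrossEntropyLoss(reduction="none")

    def dump_token_losses(texts, out_file):
        """Write one JSON line per input with its tokens and per-token LM losses."""
        with open(out_file, "w", encoding="utf-8") as writer:
            for text in texts:
                enc = tokenizer(text, return_tensors="pt")
                labels = enc["input_ids"]
                with torch.no_grad():
                    logits = model(**enc).logits  # (1, seq_len, vocab_size)
                # Shift so tokens < n predict token n, as in the removed evaluation loop.
                shift_logits = logits[..., :-1, :].contiguous()
                shift_labels = labels[..., 1:].contiguous()
                token_loss = loss_fct(
                    shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)
                ).view(shift_labels.size())
                tokens = tokenizer.convert_ids_to_tokens(labels[0].tolist())
                # The first token has no prediction loss, mirroring the `[0] + ...` padding above.
                losses = [0.0] + token_loss[0].tolist()
                writer.write(json.dumps({"tokens": tokens, "losses": losses},
                                        ensure_ascii=False) + "\n")

    dump_token_losses(["user: hello", "system: hi , how can i help ?"], "token_loss.json")

The resulting JSON-lines file has the same `{"tokens": [...], "losses": [...]}` layout that the removed `lmloss2keywords.py` consumed downstream.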