From b2e665432f6716bdcd4dfc8092bc1ee62f7e0fdc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?K=C3=BChle=2C=20Laura=20Christine=20=28lakue103=29?= <laura.kuehle@uni-duesseldorf.de> Date: Tue, 7 Dec 2021 18:16:22 +0100 Subject: [PATCH] Added option to compare evaluation of multiple models. --- ANN_Training.py | 103 ++++++++++++++++++++++++++++-------------------- Plotting.py | 20 +++------- Snakefile | 59 ++++++++++----------------- config.yaml | 25 ++++++++---- 4 files changed, 103 insertions(+), 104 deletions(-) diff --git a/ANN_Training.py b/ANN_Training.py index 09ee018..53ee0e2 100644 --- a/ANN_Training.py +++ b/ANN_Training.py @@ -2,7 +2,7 @@ """ @author: Laura C. Kühle, Soraya Terrab (sorayaterrab) -TODO: Give option to compare multiple models +TODO: Give option to compare multiple models -> Done TODO: Add more evaluation measures (AUROC, ROC, F1, training accuracy, boxplot over CVF, etc.) -> Done TODO: Add log to pipeline TODO: Remove object set-up @@ -11,6 +11,7 @@ TODO: Improve maximum selection runtime TODO: Discuss if we want training accuracy/ROC in addition to CFV TODO: Discuss whether to change output to binary TODO: Adapt TCD file to new classification +TODO: Improve classification stat handling """ import numpy as np @@ -33,7 +34,7 @@ class ModelTrainer(object): def _reset(self, config): self._dir = config.pop('dir', 'test_data') self._model_name = config.pop('model_name', '0') - self._read_training_data() + self._training_data = read_training_data(self._dir) self._batch_size = config.pop('batch_size', min(len(self._training_data)//2, 500)) self._num_epochs = config.pop('num_epochs', 1000) @@ -63,13 +64,6 @@ class ModelTrainer(object): self._model.parameters(), **self._optimizer_config) self._validation_loss = torch.zeros(self._num_epochs//10) - def _read_training_data(self): - # Get training dataset from saved file and map to Torch tensor and dataset - input_file = self._dir + '/input_data.npy' - output_file = self._dir + '/output_data.npy' - self._training_data = TensorDataset(*map(torch.tensor, (np.load(input_file), - np.load(output_file)))) - def epoch_training(self, dataset=None, num_epochs=None): # Split data into training and validation set if dataset is None: @@ -108,39 +102,7 @@ class ModelTrainer(object): if valid_loss / len(valid_dl) < self._threshold: break - def test_model(self, num_iterations=100): - classification_stats = [] - for iteration in range(num_iterations): - dataset = self._training_data - for train_index, test_index in KFold(n_splits=5, shuffle=True).split(dataset): - # print("TRAIN:", train_index, "TEST:", test_index) - training_set = TensorDataset(*dataset[train_index]) - test_set = dataset[test_index] - - classification_stats.append(self._test_fold(training_set, test_set)) - - # print(classification_stats) - # print(np.array(classification_stats).mean(axis=0)) - plot_boxplot([self._model_name], *np.array(classification_stats).transpose()) - classification_stats = np.array(classification_stats).mean(axis=0) - - plot_classification_accuracy([self._model_name], *classification_stats) - - # Set paths for plot files if not existing already - plot_dir = self._dir + '/model evaluation' - if not os.path.exists(plot_dir): - os.makedirs(plot_dir) - - # Save plots - for identifier in plt.get_figlabels(): - # Set path for figure directory if not existing already - if not os.path.exists(plot_dir + '/' + identifier): - os.makedirs(plot_dir + '/' + identifier) - - plt.figure(identifier) - plt.savefig(plot_dir + '/' + identifier + '/' + self._model_name + '.pdf') - - def _test_fold(self, training_set, test_set): + def test_model(self, training_set, test_set): self.epoch_training(training_set, num_epochs=100) self._model.eval() @@ -185,6 +147,63 @@ class ModelTrainer(object): pass +def read_training_data(directory): + # Get training dataset from saved file and map to Torch tensor and dataset + input_file = directory + '/input_data.npy' + output_file = directory + '/output_data.npy' + return TensorDataset(*map(torch.tensor, (np.load(input_file), np.load(output_file)))) + + +def evaluate_models(models, directory, num_iterations=100): + dataset = read_training_data(directory) + stats = ['Precision', 'Recall', 'Accuracy', 'F-Score', 'AUROC'] + classification_stats = {model: {name: [] for name in stats} for model in models} + for iteration in range(num_iterations): + for train_index, test_index in KFold(n_splits=5, shuffle=True).split(dataset): + # print("TRAIN:", train_index, "TEST:", test_index) + training_set = TensorDataset(*dataset[train_index]) + test_set = dataset[test_index] + + for model in models: + result = models[model].test_model(training_set, test_set) + count = 0 + for stat in stats: + classification_stats[model][stat].append(result[count]) + count += 1 + + # print(classification_stats) + # print(np.array(classification_stats).mean(axis=0)) + # print(np.array(classification_stats['Adam']['Precision']).shape) + # print(np.array([np.array(classification_stats[model]) for model in models]).transpose().shape) + # print(np.array([np.array(classification_stats[model]).transpose() for model in models]).shape) + # print(np.array([[classification_stats[model][stat] for model in models] for stat in stats]).shape) + # print(np.array([[np.array(classification_stats[model][stat]).mean(axis=0) for model in models] for stat in stats]).shape) + # print(np.array([*(np.array([[classification_stats[model][stat] + # for model in models] for stat in stats]))]).shape) + # print(*(np.array([[classification_stats[model][stat] + # for model in models] for stat in stats]))[0].shape) + plot_boxplot(models.keys(), *(np.array([[classification_stats[model][stat] + for model in models] for stat in stats]))) + classification_stats = [[np.array(classification_stats[model][stat]).mean(axis=0) for model in models] for stat in stats] + # print(*classification_stats) + + plot_classification_accuracy(models.keys(), *classification_stats) + + # Set paths for plot files if not existing already + plot_dir = directory + '/model evaluation' + if not os.path.exists(plot_dir): + os.makedirs(plot_dir) + + # Save plots + for identifier in plt.get_figlabels(): + # Set path for figure directory if not existing already + if not os.path.exists(plot_dir + '/' + identifier): + os.makedirs(plot_dir + '/' + identifier) + + plt.figure(identifier) + plt.savefig(plot_dir + '/' + identifier + '/' + '_'.join(models.keys()) + '.pdf') + + # Loss Functions: BCELoss, BCEWithLogitsLoss, # CrossEntropyLoss (not working), MSELoss (with reduction='sum') # Optimizer: Adam, SGD diff --git a/Plotting.py b/Plotting.py index 813ecb8..078381c 100644 --- a/Plotting.py +++ b/Plotting.py @@ -253,11 +253,6 @@ def plot_classification_accuracy(xlabels, precision, recall, accuracy, fscore, a List of strings for x-axis labels. """ - precision = [precision] - recall = [recall] - accuracy = [accuracy] - fscore = [fscore] - auroc = [auroc] pos = np.arange(len(xlabels)) width = 1/(3*len(xlabels)) fig = plt.figure('classification_accuracy') @@ -278,25 +273,20 @@ def plot_classification_accuracy(xlabels, precision, recall, accuracy, fscore, a def plot_boxplot(xlabels, precision, recall, accuracy, fscore, auroc): - precision = [precision] - recall = [recall] - accuracy = [accuracy] - fscore = [fscore] - auroc = [auroc] fig = plt.figure('boxplot_accuracy') pos = np.arange(len(xlabels)) width = 1/(5*len(xlabels)) ax = fig.add_axes([0.15, 0.1, 0.75, 0.8]) boxplots = [] - boxplots.append(ax.boxplot(fscore, positions=pos - 3*width, widths=width, meanline=True, + boxplots.append(ax.boxplot(fscore.transpose(), positions=pos - 3*width, widths=width, meanline=True, showmeans=True, patch_artist=True)) - boxplots.append(ax.boxplot(precision, positions=pos - 1.5*width, widths=width, meanline=True, + boxplots.append(ax.boxplot(precision.transpose(), positions=pos - 1.5*width, widths=width, meanline=True, showmeans=True, patch_artist=True)) - boxplots.append(ax.boxplot(recall, positions=pos, widths=width, meanline=True, showmeans=True, + boxplots.append(ax.boxplot(recall.transpose(), positions=pos, widths=width, meanline=True, showmeans=True, patch_artist=True)) - boxplots.append(ax.boxplot(accuracy, positions=pos + 1.5*width, widths=width, meanline=True, + boxplots.append(ax.boxplot(accuracy.transpose(), positions=pos + 1.5*width, widths=width, meanline=True, showmeans=True, patch_artist=True)) - boxplots.append(ax.boxplot(auroc, positions=pos + 3*width, widths=width, meanline=True, + boxplots.append(ax.boxplot(auroc.transpose(), positions=pos + 3*width, widths=width, meanline=True, showmeans=True, patch_artist=True)) count = 0 colors = ['red', 'yellow', 'blue', 'tan', 'green'] diff --git a/Snakefile b/Snakefile index d1ab162..ded70a8 100644 --- a/Snakefile +++ b/Snakefile @@ -1,46 +1,37 @@ configfile: 'config.yaml' import ANN_Data_Generator, Initial_Condition, ANN_Training +from ANN_Training import evaluate_models import numpy as np def replace_none(list): return {} if list is None else list DIR = config['data_directory'] +MODELS = config['models'] if config['random_seed'] is not None: np.random.seed(config['random_seed']) rule all: input: - DIR+'/trained models/model__' + config['model_name'] + '.pt', - DIR+'/model evaluation/classification_accuracy/' + config['model_name'] + '.pdf' + expand(DIR+'/trained models/model__{model}.pt', model=MODELS), + DIR+'/model evaluation/classification_accuracy/' + '_'.join(MODELS.keys()) + '.pdf' rule test_model: input: DIR+'/input_data.npy', DIR+'/output_data.npy' - params: - model_name = config['model_name'], - num_epochs = config['num_epochs'], - threshold = config['threshold'], - batch_size = config['batch_size'], - model = config['model'], - model_config = replace_none(config['model_config']), - loss_function = config['loss_function'], - optimizer = config['optimizer'] log: DIR+'/log/test_model.log' output: - DIR+'/model evaluation/classification_accuracy/' + config['model_name'] + '.pdf' + DIR+'/model evaluation/classification_accuracy/' + '_'.join(MODELS.keys()) + '.pdf' run: - trainer= ANN_Training.ModelTrainer({'model_name': params.model_name, - 'num_epochs': params.num_epochs, 'dir': DIR, - 'model_dir': DIR, 'threshold': params.threshold, - 'batch_size': params.batch_size, 'model': params.model, - 'model_config': params.model_config, - 'loss_function': params.loss_function, - 'optimizer': params.optimizer}) - trainer.test_model() + models = {} + for model in MODELS: + trainer= ANN_Training.ModelTrainer({'model_name': model, 'dir': DIR, + 'model_dir': DIR, **MODELS[model]}) + models[model] = trainer + evaluate_models(models, DIR, 2) rule generate_data: output: @@ -74,26 +65,16 @@ rule train_model: DIR+'/input_data.npy', DIR+'/output_data.npy' params: - model_name = config['model_name'], - num_epochs = config['num_epochs'], - threshold = config['threshold'], - batch_size = config['batch_size'], - model = config['model'], - model_config = replace_none(config['model_config']), - loss_function = config['loss_function'], - optimizer = config['optimizer'] + models = MODELS log: DIR+'/log/train_model.log' output: - DIR+'/trained models/model__' + config['model_name'] + '.pt', - DIR+'/trained models/loss__' + config['model_name'] + '.pt' + expand(DIR+'/trained models/model__{model}.pt', model=MODELS), + expand(DIR+'/trained models/loss__{model}.pt', model=MODELS) run: - trainer= ANN_Training.ModelTrainer({'model_name': params.model_name, - 'num_epochs': params.num_epochs, 'dir': DIR, - 'model_dir': DIR, 'threshold': params.threshold, - 'batch_size': params.batch_size, 'model': params.model, - 'model_config': params.model_config, - 'loss_function': params.loss_function, - 'optimizer': params.optimizer}) - trainer.epoch_training() - trainer.save_model() \ No newline at end of file + for model in params.models: + print(model) + trainer= ANN_Training.ModelTrainer({'model_name': model, 'dir': DIR, + 'model_dir': DIR, **params.models[model]}) + trainer.epoch_training() + trainer.save_model() \ No newline at end of file diff --git a/config.yaml b/config.yaml index 1b9ca99..90cb41c 100644 --- a/config.yaml +++ b/config.yaml @@ -24,12 +24,21 @@ functions: adjustment: 0 # Parameter for Model Training -model_name: Test_Name -num_epochs: 1000 -threshold: 1.0e-5 -batch_size: 500 -model: ThreeLayerReLu -model_config: -loss_function: BCELoss -optimizer: Adam +models: + Adam: + num_epochs: 1000 + threshold: 1.0e-5 + batch_size: 500 + model: ThreeLayerReLu + model_config: {} + loss_function: BCELoss + optimizer: Adam + SGD: + num_epochs: 1000 + threshold: 1.0e-5 + batch_size: 500 + model: ThreeLayerReLu + model_config: {} + loss_function: BCELoss + optimizer: SGD -- GitLab