Added option to compare evaluation of multiple models.

b2e66543 · Laura Christine Kühle · 9adbf4a6 · b2e66543 · b2e66543 · b2e66543
Commit b2e66543 authored Dec 7, 2021 by Laura Christine Kühle
--- a/ANN_Training.py
+++ b/ANN_Training.py
@@ -2,7 +2,7 @@
 """
 @author: Laura C. Kühle, Soraya Terrab (sorayaterrab)

-TODO: Give option to compare multiple models
+TODO: Give option to compare multiple models -> Done
 TODO: Add more evaluation measures (AUROC, ROC, F1, training accuracy, boxplot over CVF, etc.) -> Done
 TODO: Add log to pipeline
 TODO: Remove object set-up
@@ -11,6 +11,7 @@ TODO: Improve maximum selection runtime
 TODO: Discuss if we want training accuracy/ROC in addition to CFV
 TODO: Discuss whether to change output to binary
 TODO: Adapt TCD file to new classification
+TODO: Improve classification stat handling

 """
 import numpy as np
@@ -33,7 +34,7 @@ class ModelTrainer(object):
    def _reset(self, config):
        self._dir = config.pop('dir', 'test_data')
        self._model_name = config.pop('model_name', '0')
-        self._read_training_data()
+        self._training_data = read_training_data(self._dir)

        self._batch_size = config.pop('batch_size', min(len(self._training_data)//2, 500))
        self._num_epochs = config.pop('num_epochs', 1000)
@@ -63,13 +64,6 @@ class ModelTrainer(object):
            self._model.parameters(), **self._optimizer_config)
        self._validation_loss = torch.zeros(self._num_epochs//10)

-    def _read_training_data(self):
-        # Get training dataset from saved file and map to Torch tensor and dataset
-        input_file = self._dir + '/input_data.npy'
-        output_file = self._dir + '/output_data.npy'
-        self._training_data = TensorDataset(*map(torch.tensor, (np.load(input_file),
-                                                                np.load(output_file))))
-
    def epoch_training(self, dataset=None, num_epochs=None):
        # Split data into training and validation set
        if dataset is None:
@@ -108,39 +102,7 @@ class ModelTrainer(object):
                if valid_loss / len(valid_dl) < self._threshold:
                    break

-    def test_model(self, num_iterations=100):
-        classification_stats = []
-        for iteration in range(num_iterations):
-            dataset = self._training_data
-            for train_index, test_index in KFold(n_splits=5, shuffle=True).split(dataset):
-                # print("TRAIN:", train_index, "TEST:", test_index)
-                training_set = TensorDataset(*dataset[train_index])
-                test_set = dataset[test_index]
-
-                classification_stats.append(self._test_fold(training_set, test_set))
-
-        # print(classification_stats)
-        # print(np.array(classification_stats).mean(axis=0))
-        plot_boxplot([self._model_name], *np.array(classification_stats).transpose())
-        classification_stats = np.array(classification_stats).mean(axis=0)
-
-        plot_classification_accuracy([self._model_name], *classification_stats)
-
-        # Set paths for plot files if not existing already
-        plot_dir = self._dir + '/model evaluation'
-        if not os.path.exists(plot_dir):
-            os.makedirs(plot_dir)
-
-        # Save plots
-        for identifier in plt.get_figlabels():
-            # Set path for figure directory if not existing already
-            if not os.path.exists(plot_dir + '/' + identifier):
-                os.makedirs(plot_dir + '/' + identifier)
-
-            plt.figure(identifier)
-            plt.savefig(plot_dir + '/' + identifier + '/' + self._model_name + '.pdf')
-
-    def _test_fold(self, training_set, test_set):
+    def test_model(self, training_set, test_set):
        self.epoch_training(training_set, num_epochs=100)
        self._model.eval()

@@ -185,6 +147,63 @@ class ModelTrainer(object):
        pass


+def read_training_data(directory):
+    # Get training dataset from saved file and map to Torch tensor and dataset
+    input_file = directory + '/input_data.npy'
+    output_file = directory + '/output_data.npy'
+    return TensorDataset(*map(torch.tensor, (np.load(input_file), np.load(output_file))))
+
+
+def evaluate_models(models, directory, num_iterations=100):
+    dataset = read_training_data(directory)
+    stats = ['Precision', 'Recall', 'Accuracy', 'F-Score', 'AUROC']
+    classification_stats = {model: {name: [] for name in stats} for model in models}
+    for iteration in range(num_iterations):
+        for train_index, test_index in KFold(n_splits=5, shuffle=True).split(dataset):
+            # print("TRAIN:", train_index, "TEST:", test_index)
+            training_set = TensorDataset(*dataset[train_index])
+            test_set = dataset[test_index]
+
+            for model in models:
+                result = models[model].test_model(training_set, test_set)
+                count = 0
+                for stat in stats:
+                    classification_stats[model][stat].append(result[count])
+                    count += 1
+
+    # print(classification_stats)
+    # print(np.array(classification_stats).mean(axis=0))
+    # print(np.array(classification_stats['Adam']['Precision']).shape)
+    # print(np.array([np.array(classification_stats[model]) for model in models]).transpose().shape)
+    # print(np.array([np.array(classification_stats[model]).transpose() for model in models]).shape)
+    # print(np.array([[classification_stats[model][stat] for model in models] for stat in stats]).shape)
+    # print(np.array([[np.array(classification_stats[model][stat]).mean(axis=0) for model in models] for stat in stats]).shape)
+    # print(np.array([*(np.array([[classification_stats[model][stat]
+    #                                          for model in models] for stat in stats]))]).shape)
+    # print(*(np.array([[classification_stats[model][stat]
+    #                                          for model in models] for stat in stats]))[0].shape)
+    plot_boxplot(models.keys(), *(np.array([[classification_stats[model][stat]
+                                             for model in models] for stat in stats])))
+    classification_stats = [[np.array(classification_stats[model][stat]).mean(axis=0) for model in models] for stat in stats]
+    # print(*classification_stats)
+
+    plot_classification_accuracy(models.keys(), *classification_stats)
+
+    # Set paths for plot files if not existing already
+    plot_dir = directory + '/model evaluation'
+    if not os.path.exists(plot_dir):
+        os.makedirs(plot_dir)
+
+    # Save plots
+    for identifier in plt.get_figlabels():
+        # Set path for figure directory if not existing already
+        if not os.path.exists(plot_dir + '/' + identifier):
+            os.makedirs(plot_dir + '/' + identifier)
+
+        plt.figure(identifier)
+        plt.savefig(plot_dir + '/' + identifier + '/' + '_'.join(models.keys()) + '.pdf')
+
+
 # Loss Functions: BCELoss, BCEWithLogitsLoss,
 # CrossEntropyLoss (not working), MSELoss (with reduction='sum')
 # Optimizer: Adam, SGD

--- a/Plotting.py
+++ b/Plotting.py
@@ -253,11 +253,6 @@ def plot_classification_accuracy(xlabels, precision, recall, accuracy, fscore, a
        List of strings for x-axis labels.

    """
-    precision = [precision]
-    recall = [recall]
-    accuracy = [accuracy]
-    fscore = [fscore]
-    auroc = [auroc]
    pos = np.arange(len(xlabels))
    width = 1/(3*len(xlabels))
    fig = plt.figure('classification_accuracy')
@@ -278,25 +273,20 @@ def plot_classification_accuracy(xlabels, precision, recall, accuracy, fscore, a


 def plot_boxplot(xlabels, precision, recall, accuracy, fscore, auroc):
-    precision = [precision]
-    recall = [recall]
-    accuracy = [accuracy]
-    fscore = [fscore]
-    auroc = [auroc]
    fig = plt.figure('boxplot_accuracy')
    pos = np.arange(len(xlabels))
    width = 1/(5*len(xlabels))
    ax = fig.add_axes([0.15, 0.1, 0.75, 0.8])
    boxplots = []
-    boxplots.append(ax.boxplot(fscore, positions=pos - 3*width, widths=width, meanline=True,
+    boxplots.append(ax.boxplot(fscore.transpose(), positions=pos - 3*width, widths=width, meanline=True,
                               showmeans=True, patch_artist=True))
-    boxplots.append(ax.boxplot(precision, positions=pos - 1.5*width, widths=width, meanline=True,
+    boxplots.append(ax.boxplot(precision.transpose(), positions=pos - 1.5*width, widths=width, meanline=True,
                               showmeans=True, patch_artist=True))
-    boxplots.append(ax.boxplot(recall, positions=pos, widths=width, meanline=True, showmeans=True,
+    boxplots.append(ax.boxplot(recall.transpose(), positions=pos, widths=width, meanline=True, showmeans=True,
                               patch_artist=True))
-    boxplots.append(ax.boxplot(accuracy, positions=pos + 1.5*width, widths=width, meanline=True,
+    boxplots.append(ax.boxplot(accuracy.transpose(), positions=pos + 1.5*width, widths=width, meanline=True,
                               showmeans=True, patch_artist=True))
-    boxplots.append(ax.boxplot(auroc, positions=pos + 3*width, widths=width, meanline=True,
+    boxplots.append(ax.boxplot(auroc.transpose(), positions=pos + 3*width, widths=width, meanline=True,
                               showmeans=True, patch_artist=True))
    count = 0
    colors = ['red', 'yellow', 'blue', 'tan', 'green']

--- a/Snakefile
+++ b/Snakefile
 configfile: 'config.yaml'
 import ANN_Data_Generator, Initial_Condition, ANN_Training
+from ANN_Training import evaluate_models
 import numpy as np

 def replace_none(list):
    # Return an empty dict in place of None; any other value is passed through unchanged
    if list is None:
        return {}
    return list

 DIR = config['data_directory']
+MODELS = config['models']

 if config['random_seed'] is not None:
    np.random.seed(config['random_seed'])

 rule all:
    input:
-        DIR+'/trained models/model__' + config['model_name'] + '.pt',
-        DIR+'/model evaluation/classification_accuracy/' + config['model_name'] + '.pdf'
+        expand(DIR+'/trained models/model__{model}.pt', model=MODELS),
+        DIR+'/model evaluation/classification_accuracy/' + '_'.join(MODELS.keys()) + '.pdf'

 rule test_model:
    input:
        DIR+'/input_data.npy',
        DIR+'/output_data.npy'
-    params:
-        model_name = config['model_name'],
-        num_epochs = config['num_epochs'],
-        threshold = config['threshold'],
-        batch_size = config['batch_size'],
-        model = config['model'],
-        model_config = replace_none(config['model_config']),
-        loss_function = config['loss_function'],
-        optimizer = config['optimizer']
    log:
        DIR+'/log/test_model.log'
    output:
-        DIR+'/model evaluation/classification_accuracy/' + config['model_name'] + '.pdf'
+        DIR+'/model evaluation/classification_accuracy/' + '_'.join(MODELS.keys()) + '.pdf'
    run:
-        trainer= ANN_Training.ModelTrainer({'model_name': params.model_name,
-                                            'num_epochs': params.num_epochs, 'dir': DIR,
-                                            'model_dir': DIR, 'threshold': params.threshold,
-                                            'batch_size': params.batch_size, 'model': params.model,
-                                            'model_config': params.model_config,
-                                            'loss_function': params.loss_function,
-                                            'optimizer': params.optimizer})
-        trainer.test_model()
+        models = {}
+        for model in MODELS:
+            trainer= ANN_Training.ModelTrainer({'model_name': model, 'dir': DIR,
+                                            'model_dir': DIR, **MODELS[model]})
+            models[model] = trainer
+        evaluate_models(models, DIR, 2)

 rule generate_data:
    output:
@@ -74,26 +65,16 @@ rule train_model:
        DIR+'/input_data.npy',
        DIR+'/output_data.npy'
    params:
-        model_name = config['model_name'],
-        num_epochs = config['num_epochs'],
-        threshold = config['threshold'],
-        batch_size = config['batch_size'],
-        model = config['model'],
-        model_config = replace_none(config['model_config']),
-        loss_function = config['loss_function'],
-        optimizer = config['optimizer']
+        models = MODELS
    log:
        DIR+'/log/train_model.log'
    output:
-        DIR+'/trained models/model__' + config['model_name'] + '.pt',
-        DIR+'/trained models/loss__' + config['model_name'] + '.pt'
+        expand(DIR+'/trained models/model__{model}.pt', model=MODELS),
+        expand(DIR+'/trained models/loss__{model}.pt', model=MODELS)
    run:
-        trainer= ANN_Training.ModelTrainer({'model_name': params.model_name,
-                                            'num_epochs': params.num_epochs, 'dir': DIR,
-                                            'model_dir': DIR, 'threshold': params.threshold,
-                                            'batch_size': params.batch_size, 'model': params.model,
-                                            'model_config': params.model_config,
-                                            'loss_function': params.loss_function,
-                                            'optimizer': params.optimizer})
+        for model in params.models:
+            print(model)
+            trainer= ANN_Training.ModelTrainer({'model_name': model, 'dir': DIR,
+                                            'model_dir': DIR, **params.models[model]})
            trainer.epoch_training()
            trainer.save_model()
\ No newline at end of file
--- a/config.yaml
+++ b/config.yaml
@@ -24,12 +24,21 @@ functions:
    adjustment: 0

 # Parameter for Model Training
-model_name: Test_Name
+models:
+  Adam:
    num_epochs: 1000
    threshold: 1.0e-5
    batch_size: 500
    model: ThreeLayerReLu
-model_config:
+    model_config: {}
    loss_function: BCELoss
    optimizer: Adam
+  SGD:
+    num_epochs: 1000
+    threshold: 1.0e-5
+    batch_size: 500
+    model: ThreeLayerReLu
+    model_config: {}
+    loss_function: BCELoss
+    optimizer: SGD