From b2e665432f6716bdcd4dfc8092bc1ee62f7e0fdc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?K=C3=BChle=2C=20Laura=20Christine=20=28lakue103=29?=
 <laura.kuehle@uni-duesseldorf.de>
Date: Tue, 7 Dec 2021 18:16:22 +0100
Subject: [PATCH] Added option to compare evaluation of multiple models.

---
 ANN_Training.py | 103 ++++++++++++++++++++++++++++--------------------
 Plotting.py     |  20 +++-------
 Snakefile       |  59 ++++++++++-----------------
 config.yaml     |  25 ++++++++----
 4 files changed, 103 insertions(+), 104 deletions(-)

diff --git a/ANN_Training.py b/ANN_Training.py
index 09ee018..53ee0e2 100644
--- a/ANN_Training.py
+++ b/ANN_Training.py
@@ -2,7 +2,7 @@
 """
 @author: Laura C. Kühle, Soraya Terrab (sorayaterrab)
 
-TODO: Give option to compare multiple models
+TODO: Give option to compare multiple models -> Done
 TODO: Add more evaluation measures (AUROC, ROC, F1, training accuracy, boxplot over CVF, etc.) -> Done
 TODO: Add log to pipeline
 TODO: Remove object set-up
@@ -11,6 +11,7 @@ TODO: Improve maximum selection runtime
 TODO: Discuss if we want training accuracy/ROC in addition to CFV
 TODO: Discuss whether to change output to binary
 TODO: Adapt TCD file to new classification
+TODO: Improve classification stat handling
 
 """
 import numpy as np
@@ -33,7 +34,7 @@ class ModelTrainer(object):
     def _reset(self, config):
         self._dir = config.pop('dir', 'test_data')
         self._model_name = config.pop('model_name', '0')
-        self._read_training_data()
+        self._training_data = read_training_data(self._dir)
 
         self._batch_size = config.pop('batch_size', min(len(self._training_data)//2, 500))
         self._num_epochs = config.pop('num_epochs', 1000)
@@ -63,13 +64,6 @@ class ModelTrainer(object):
             self._model.parameters(), **self._optimizer_config)
         self._validation_loss = torch.zeros(self._num_epochs//10)
 
-    def _read_training_data(self):
-        # Get training dataset from saved file and map to Torch tensor and dataset
-        input_file = self._dir + '/input_data.npy'
-        output_file = self._dir + '/output_data.npy'
-        self._training_data = TensorDataset(*map(torch.tensor, (np.load(input_file),
-                                                                np.load(output_file))))
-
     def epoch_training(self, dataset=None, num_epochs=None):
         # Split data into training and validation set
         if dataset is None:
@@ -108,39 +102,7 @@ class ModelTrainer(object):
                 if valid_loss / len(valid_dl) < self._threshold:
                     break
 
-    def test_model(self, num_iterations=100):
-        classification_stats = []
-        for iteration in range(num_iterations):
-            dataset = self._training_data
-            for train_index, test_index in KFold(n_splits=5, shuffle=True).split(dataset):
-                # print("TRAIN:", train_index, "TEST:", test_index)
-                training_set = TensorDataset(*dataset[train_index])
-                test_set = dataset[test_index]
-
-                classification_stats.append(self._test_fold(training_set, test_set))
-
-        # print(classification_stats)
-        # print(np.array(classification_stats).mean(axis=0))
-        plot_boxplot([self._model_name], *np.array(classification_stats).transpose())
-        classification_stats = np.array(classification_stats).mean(axis=0)
-
-        plot_classification_accuracy([self._model_name], *classification_stats)
-
-        # Set paths for plot files if not existing already
-        plot_dir = self._dir + '/model evaluation'
-        if not os.path.exists(plot_dir):
-            os.makedirs(plot_dir)
-
-        # Save plots
-        for identifier in plt.get_figlabels():
-            # Set path for figure directory if not existing already
-            if not os.path.exists(plot_dir + '/' + identifier):
-                os.makedirs(plot_dir + '/' + identifier)
-
-            plt.figure(identifier)
-            plt.savefig(plot_dir + '/' + identifier + '/' + self._model_name + '.pdf')
-
-    def _test_fold(self, training_set, test_set):
+    def test_model(self, training_set, test_set):
         self.epoch_training(training_set, num_epochs=100)
         self._model.eval()
 
@@ -185,6 +147,63 @@ class ModelTrainer(object):
         pass
 
 
+def read_training_data(directory):
+    """Load input_data.npy and output_data.npy from `directory` as a TensorDataset."""
+    arrays = (np.load(directory + '/input_data.npy'),
+              np.load(directory + '/output_data.npy'))
+    return TensorDataset(*map(torch.tensor, arrays))
+
+
+def evaluate_models(models, directory, num_iterations=100):
+    """Cross-validate several models on identical folds and plot their statistics.
+
+    Parameters
+    ----------
+    models : dict
+        Maps each model name to a trainer whose `test_model(training_set, test_set)`
+        returns the statistics in the order of `stats` below.
+    directory : str
+        Directory holding the training data; plots go to 'model evaluation' inside it.
+    num_iterations : int, optional
+        Number of times the shuffled 5-fold split is repeated. Default: 100.
+
+    """
+    dataset = read_training_data(directory)
+    stats = ['Precision', 'Recall', 'Accuracy', 'F-Score', 'AUROC']
+    classification_stats = {model: {name: [] for name in stats} for model in models}
+    for _ in range(num_iterations):
+        for train_index, test_index in KFold(n_splits=5, shuffle=True).split(dataset):
+            training_set = TensorDataset(*dataset[train_index])
+            test_set = dataset[test_index]
+
+            # Evaluate every model on the same split so the results are comparable.
+            # NOTE(review): the trainers are reused for every fold; confirm that
+            # `test_model` does not carry learned weights over from earlier folds.
+            for model, trainer in models.items():
+                result = trainer.test_model(training_set, test_set)
+                for index, stat in enumerate(stats):
+                    classification_stats[model][stat].append(result[index])
+
+    # One 2D array per statistic: a row per model, a column per evaluated fold
+    stats_per_fold = [np.array([classification_stats[model][stat] for model in models])
+                      for stat in stats]
+    plot_boxplot(models.keys(), *stats_per_fold)
+
+    # Average over all folds to get a single value per model and statistic
+    mean_stats = [[row.mean(axis=0) for row in values] for values in stats_per_fold]
+    plot_classification_accuracy(models.keys(), *mean_stats)
+
+    # Set path for plot files; exist_ok tolerates directories that already exist
+    plot_dir = directory + '/model evaluation'
+    os.makedirs(plot_dir, exist_ok=True)
+
+    # Save every open figure as '<plot_dir>/<figure label>/<model names>.pdf'
+    for identifier in plt.get_figlabels():
+        os.makedirs(plot_dir + '/' + identifier, exist_ok=True)
+        plt.figure(identifier)
+        plt.savefig(plot_dir + '/' + identifier + '/' + '_'.join(models.keys()) + '.pdf')
+
+
 # Loss Functions: BCELoss, BCEWithLogitsLoss,
 # CrossEntropyLoss (not working), MSELoss (with reduction='sum')
 # Optimizer: Adam, SGD
diff --git a/Plotting.py b/Plotting.py
index 813ecb8..078381c 100644
--- a/Plotting.py
+++ b/Plotting.py
@@ -253,11 +253,6 @@ def plot_classification_accuracy(xlabels, precision, recall, accuracy, fscore, a
         List of strings for x-axis labels.
 
     """
-    precision = [precision]
-    recall = [recall]
-    accuracy = [accuracy]
-    fscore = [fscore]
-    auroc = [auroc]
     pos = np.arange(len(xlabels))
     width = 1/(3*len(xlabels))
     fig = plt.figure('classification_accuracy')
@@ -278,25 +273,20 @@ def plot_classification_accuracy(xlabels, precision, recall, accuracy, fscore, a
 
 
 def plot_boxplot(xlabels, precision, recall, accuracy, fscore, auroc):
-    precision = [precision]
-    recall = [recall]
-    accuracy = [accuracy]
-    fscore = [fscore]
-    auroc = [auroc]
     fig = plt.figure('boxplot_accuracy')
     pos = np.arange(len(xlabels))
     width = 1/(5*len(xlabels))
     ax = fig.add_axes([0.15, 0.1, 0.75, 0.8])
     boxplots = []
-    boxplots.append(ax.boxplot(fscore, positions=pos - 3*width, widths=width, meanline=True,
+    boxplots.append(ax.boxplot(fscore.transpose(), positions=pos - 3*width, widths=width, meanline=True,
                                showmeans=True, patch_artist=True))
-    boxplots.append(ax.boxplot(precision, positions=pos - 1.5*width, widths=width, meanline=True,
+    boxplots.append(ax.boxplot(precision.transpose(), positions=pos - 1.5*width, widths=width, meanline=True,
                                showmeans=True, patch_artist=True))
-    boxplots.append(ax.boxplot(recall, positions=pos, widths=width, meanline=True, showmeans=True,
+    boxplots.append(ax.boxplot(recall.transpose(), positions=pos, widths=width, meanline=True, showmeans=True,
                                patch_artist=True))
-    boxplots.append(ax.boxplot(accuracy, positions=pos + 1.5*width, widths=width, meanline=True,
+    boxplots.append(ax.boxplot(accuracy.transpose(), positions=pos + 1.5*width, widths=width, meanline=True,
                                showmeans=True, patch_artist=True))
-    boxplots.append(ax.boxplot(auroc, positions=pos + 3*width, widths=width, meanline=True,
+    boxplots.append(ax.boxplot(auroc.transpose(), positions=pos + 3*width, widths=width, meanline=True,
                                showmeans=True, patch_artist=True))
     count = 0
     colors = ['red', 'yellow', 'blue', 'tan', 'green']
diff --git a/Snakefile b/Snakefile
index d1ab162..ded70a8 100644
--- a/Snakefile
+++ b/Snakefile
@@ -1,46 +1,37 @@
 configfile: 'config.yaml'
 import ANN_Data_Generator, Initial_Condition, ANN_Training
+from ANN_Training import evaluate_models
 import numpy as np
 
 def replace_none(list):
     return {} if list is None else list
 
 DIR = config['data_directory']
+MODELS = config['models']
 
 if config['random_seed'] is not None:
     np.random.seed(config['random_seed'])
 
 rule all:
     input:
-        DIR+'/trained models/model__' + config['model_name'] + '.pt',
-        DIR+'/model evaluation/classification_accuracy/' + config['model_name'] + '.pdf'
+        expand(DIR+'/trained models/model__{model}.pt', model=MODELS),
+        DIR+'/model evaluation/classification_accuracy/' + '_'.join(MODELS.keys()) + '.pdf'
 
 rule test_model:
     input:
         DIR+'/input_data.npy',
         DIR+'/output_data.npy'
-    params:
-        model_name = config['model_name'],
-        num_epochs = config['num_epochs'],
-        threshold = config['threshold'],
-        batch_size = config['batch_size'],
-        model = config['model'],
-        model_config = replace_none(config['model_config']),
-        loss_function = config['loss_function'],
-        optimizer = config['optimizer']
     log:
         DIR+'/log/test_model.log'
     output:
-        DIR+'/model evaluation/classification_accuracy/' + config['model_name'] + '.pdf'
+        DIR+'/model evaluation/classification_accuracy/' + '_'.join(MODELS.keys()) + '.pdf'
     run:
-        trainer= ANN_Training.ModelTrainer({'model_name': params.model_name,
-                                            'num_epochs': params.num_epochs, 'dir': DIR,
-                                            'model_dir': DIR, 'threshold': params.threshold,
-                                            'batch_size': params.batch_size, 'model': params.model,
-                                            'model_config': params.model_config,
-                                            'loss_function': params.loss_function,
-                                            'optimizer': params.optimizer})
-        trainer.test_model()
+        # One trainer per configured model; all are evaluated on the same folds.
+        models = {name: ANN_Training.ModelTrainer({'model_name': name, 'dir': DIR,
+                                                    'model_dir': DIR, **settings})
+                  for name, settings in MODELS.items()}
+        # Optional config key 'evaluation_iterations' overrides the default of 2.
+        evaluate_models(models, DIR, config.get('evaluation_iterations', 2))
 
 rule generate_data:
     output:
@@ -74,26 +65,16 @@ rule train_model:
         DIR+'/input_data.npy',
         DIR+'/output_data.npy'
     params:
-        model_name = config['model_name'],
-        num_epochs = config['num_epochs'],
-        threshold = config['threshold'],
-        batch_size = config['batch_size'],
-        model = config['model'],
-        model_config = replace_none(config['model_config']),
-        loss_function = config['loss_function'],
-        optimizer = config['optimizer']
+        models = MODELS
     log:
         DIR+'/log/train_model.log'
     output:
-        DIR+'/trained models/model__' + config['model_name'] + '.pt',
-        DIR+'/trained models/loss__' + config['model_name'] + '.pt'
+        expand(DIR+'/trained models/model__{model}.pt', model=MODELS),
+        expand(DIR+'/trained models/loss__{model}.pt', model=MODELS)
     run:
-        trainer= ANN_Training.ModelTrainer({'model_name': params.model_name,
-                                            'num_epochs': params.num_epochs, 'dir': DIR,
-                                            'model_dir': DIR, 'threshold': params.threshold,
-                                            'batch_size': params.batch_size, 'model': params.model,
-                                            'model_config': params.model_config,
-                                            'loss_function': params.loss_function,
-                                            'optimizer': params.optimizer})
-        trainer.epoch_training()
-        trainer.save_model()
\ No newline at end of file
+        for model in params.models:
+            print(model)
+            trainer= ANN_Training.ModelTrainer({'model_name': model, 'dir': DIR,
+                                            'model_dir': DIR, **params.models[model]})
+            trainer.epoch_training()
+            trainer.save_model()
\ No newline at end of file
diff --git a/config.yaml b/config.yaml
index 1b9ca99..90cb41c 100644
--- a/config.yaml
+++ b/config.yaml
@@ -24,12 +24,21 @@ functions:
     adjustment: 0
 
 # Parameter for Model Training
-model_name: Test_Name
-num_epochs: 1000
-threshold: 1.0e-5
-batch_size: 500
-model: ThreeLayerReLu
-model_config:
-loss_function: BCELoss
-optimizer: Adam
+models:
+  Adam:
+    num_epochs: 1000
+    threshold: 1.0e-5
+    batch_size: 500
+    model: ThreeLayerReLu
+    model_config: {}
+    loss_function: BCELoss
+    optimizer: Adam
+  SGD:
+    num_epochs: 1000
+    threshold: 1.0e-5
+    batch_size: 500
+    model: ThreeLayerReLu
+    model_config: {}
+    loss_function: BCELoss
+    optimizer: SGD
 
-- 
GitLab