diff --git a/ANN_Training.py b/ANN_Training.py
index 158c6f52972f08e8cdacadea34fdbc8cb784db1f..27f86b6f3f96ca1921b6ed1df9b5b241b6d93d51 100644
--- a/ANN_Training.py
+++ b/ANN_Training.py
@@ -9,8 +9,9 @@ TODO: Add README for ANN training
 TODO: Fix random seed
 TODO: Write-protect all data and models
 TODO: Put legend outside plot (bbox_to_anchor)
-TODO: Put plotting into separate function
+TODO: Put plotting into separate function -> Done
 TODO: Reduce number of testing epochs to 50 -> Done
+TODO: Rename 'data_directory' to 'data_dir' -> Done
 """
 
 import numpy as np
@@ -105,8 +106,8 @@ class ModelTrainer(object):
         self._optimizer = getattr(torch.optim, optimizer)(
             self._model.parameters(), **optimizer_config)
         self._validation_loss = torch.zeros(self._num_epochs//10)
-        print(type(self._model), type(self._loss_function),
-              type(self._optimizer), type(self._validation_loss))
+        # print(type(self._model), type(self._loss_function),
+        #       type(self._optimizer), type(self._validation_loss))
 
     def epoch_training(self, dataset: torch.utils.data.dataset.TensorDataset,
                        num_epochs: int = None, verbose: bool = True) -> None:
@@ -126,7 +127,7 @@ class ModelTrainer(object):
             Flag whether commentary in console is wanted. Default: False.
 
         """
-        print(type(dataset))
+        # print(type(dataset))
         tic = time.perf_counter()
         if num_epochs is None:
            num_epochs = self._num_epochs
@@ -278,10 +279,11 @@ def read_training_data(directory: str, normalized: bool = True) -> \
 
 
 def evaluate_models(models: dict, directory: str, num_iterations: int = 100,
-                    colors: dict = None,
                     compare_normalization: bool = False) -> None:
     """Evaluates the classification of a given set of models.
 
+    Evaluates the classification and saves the results in a json file.
+
     Parameters
     ----------
     models : dict
@@ -290,34 +292,22 @@ def evaluate_models(models: dict, directory: str, num_iterations: int = 100,
         Path to directory for saving resulting plots.
     num_iterations : int, optional
         Number of iterations for evaluation. Default: 100.
-    colors : dict, optional
-        Dictionary containing plotting colors. If None, set to default colors.
-        Default: None.
     compare_normalization : bool, optional
         Flag whether both normalized and raw data should be evaluated.
         Default: False.
 
     """
     tic = time.perf_counter()
-    if colors is None:
-        colors = {'Accuracy': 'magenta', 'Precision_Smooth': 'red',
-                  'Precision_Troubled': '#8B0000', 'Recall_Smooth': 'blue',
-                  'Recall_Troubled': '#00008B', 'F-Score_Smooth': 'green',
-                  'F-Score_Troubled': '#006400', 'AUROC': 'yellow'}
-
     print('Read normalized training data.')
     datasets = {'normalized': read_training_data(directory)}
     if compare_normalization:
         print('Read raw, non-normalized training data.')
         datasets['raw'] = read_training_data(directory, False)
 
-    classification_stats = {measure: {model + ' (' + dataset + ')': []
-                                      for model in models
-                                      for dataset in datasets}
-                            for measure in colors}
     print('\nTraining models with 5-fold cross validation...')
     print('Number of iterations:', num_iterations)
     tic_train = time.perf_counter()
+    classification_stats = {}
     for iteration in range(num_iterations):
         for train_index, test_index in KFold(
                 n_splits=5, shuffle=True).split(datasets['normalized']):
@@ -327,7 +317,12 @@ def evaluate_models(models: dict, directory: str, num_iterations: int = 100,
                 for model in models:
                     result = models[model].test_model(training_set,
                                                       test_set)
-                    for measure in colors:
+                    for measure in result.keys():
+                        if measure not in classification_stats.keys():
+                            classification_stats[measure] = \
+                                {model + ' (' + dataset + ')': []
+                                 for model in models
+                                 for dataset in datasets}
                         classification_stats[measure][model + ' (' +
                                                       dataset + ')'].append(
                             result[measure])
@@ -337,21 +332,55 @@
     print('Finished training models with 5-fold cross validation!')
     print(f'Training time: {toc_train - tic_train:0.4f}s\n')
 
+    print('Saving evaluation results in json format.')
     with open(directory + '/' + '_'.join(models.keys()) + '.json', 'w')\
             as json_file:
         json_file.write(json.dumps(classification_stats))
 
-    with open(directory + '/' + '_'.join(models.keys()) + '.json')\
-            as json_file:
+    toc = time.perf_counter()
+    print(f'Total runtime: {toc - tic:0.4f}s')
+
+
+def plot_evaluation_results(evaluation_file: str, directory: str,
+                            colors: dict = None) -> None:
+    """Plots given evaluation results of model classifications.
+
+    Plots evaluation results for all measures for which a color is given. If
+    colors is set to None, all measures are plotted with a default color
+    scheme.
+
+    Parameters
+    ----------
+    evaluation_file: str
+        Path to file containing evaluation results.
+    directory : str
+        Path to directory for saving resulting plots.
+    colors : dict, optional
+        Dictionary containing plotting colors. If None, set to default colors.
+        Default: None.
+
+    """
+    tic = time.perf_counter()
+    if colors is None:
+        colors = {'Accuracy': 'magenta', 'Precision_Smooth': 'red',
+                  'Precision_Troubled': '#8B0000', 'Recall_Smooth': 'blue',
+                  'Recall_Troubled': '#00008B', 'F-Score_Smooth': 'green',
+                  'F-Score_Troubled': '#006400', 'AUROC': 'yellow'}
+
+    print('Reading evaluation results.')
+    with open(evaluation_file) as json_file:
         classification_stats = json.load(json_file)
-    print('Plotting evaluation of trained models.')
+    print('\nPlotting evaluation of trained models...')
+    print('Plotting data in boxplot.')
+    models = classification_stats[list(colors.keys())[0]].keys()
     plot_boxplot(classification_stats, colors)
-    classification_stats = {measure: {model + ' (' + dataset + ')': np.array(
-        classification_stats[measure][model + ' (' + dataset + ')']).mean()
-                                      for model in models
-                                      for dataset in datasets}
+    print('Plotting averaged data in barplot.')
+    classification_stats = {measure: {model: np.array(
+        classification_stats[measure][model]).mean()
+                                      for model in models}
                             for measure in colors}
     plot_classification_accuracy(classification_stats, colors)
+    print('Finished plotting evaluation of trained models!\n')
 
     # Set paths for plot files if not existing already
     plot_dir = directory + '/model evaluation'
@@ -360,13 +389,13 @@ def evaluate_models(models: dict, directory: str, num_iterations: int = 100,
 
     # Save plots
     print('Saving plots.')
+    file_name = evaluation_file.split('/')[-1].rstrip('.json')
     for identifier in plt.get_figlabels():
         # Set path for figure directory if not existing already
         if not os.path.exists(plot_dir + '/' + identifier):
             os.makedirs(plot_dir + '/' + identifier)
         plt.figure(identifier)
-        plt.savefig(plot_dir + '/' + identifier + '/' +
-                    '_'.join(models.keys()) + '.pdf')
+        plt.savefig(plot_dir + '/' + identifier + '/' + file_name + '.pdf')
 
     toc = time.perf_counter()
     print(f'Total runtime: {toc - tic:0.4f}s')
diff --git a/workflows/ANN_training.smk b/workflows/ANN_training.smk
index 23004ae9945e3b989f3245fcf3e4321190b2cb29..66de3521b5276f94bb58cf87a527eeefab1c18c7 100644
--- a/workflows/ANN_training.smk
+++ b/workflows/ANN_training.smk
@@ -17,17 +17,39 @@ rule all:
         + '_'.join(MODELS.keys()) + '.pdf'
     default_target: True
 
+rule plot_test_results:
+    input:
+        json_file=DIR+'/'+ '_'.join(MODELS.keys()) + '.json'
+    output:
+        DIR+'/model evaluation/classification_accuracy/'
+        + '_'.join(MODELS.keys())+'.pdf'
+    params:
+        colors = config['classification_colors']
+    log:
+        DIR+'/log/plot_test_results.log'
+    run:
+        models = {}
+        with open(str(log), 'w') as logfile:
+            sys.stdout = logfile
+            sys.stderr = logfile
+            for model in MODELS:
+                trainer= ANN_Training.ModelTrainer(
+                    {'model_name': model, 'dir': DIR, 'model_dir': DIR,
+                     **MODELS[model]})
+                models[model] = trainer
+            plot_evaluation_results(evaluation_file=input.json_file,
+                                    directory=DIR, colors=params.colors)
+
+
 rule test_model:
     input:
         DIR+'/input_data.npy',
         DIR+'/normalized_input_data.npy',
         DIR+'/output_data.npy'
     output:
-        DIR+'/model evaluation/classification_accuracy/'
-        + '_'.join(MODELS.keys()) + '.pdf'
+        DIR+'/'+'_'.join(MODELS.keys())+'.json'
     params:
         num_iterations = config['num_iterations'],
-        colors = config['classification_colors'],
         compare_normalization = config['compare_normalization']
     log:
         DIR+'/log/test_model.log'
@@ -42,7 +64,7 @@ rule test_model:
                     **MODELS[model]})
                 models[model] = trainer
             evaluate_models(models=models, directory=DIR,
-                            num_iterations=params.num_iterations, colors=params.colors,
+                            num_iterations=params.num_iterations,
                             compare_normalization=params.compare_normalization)
 
 rule train_model: