diff --git a/ANN_Training.py b/ANN_Training.py
index 53ee0e27459921eab3397824e3c172901060c975..417c6f937dbf86ba45582bc78809d79763ba050e 100644
--- a/ANN_Training.py
+++ b/ANN_Training.py
@@ -11,7 +11,8 @@ TODO: Improve maximum selection runtime
 TODO: Discuss if we want training accuracy/ROC in addition to CFV
 TODO: Discuss whether to change output to binary
 TODO: Adapt TCD file to new classification
-TODO: Improve classification stat handling
+TODO: Improve classification stat handling -> Done
+TODO: Discuss automatic comparison between (non-)normalized data
 
 """
 import numpy as np
@@ -129,7 +130,8 @@ class ModelTrainer(object):
         # print(roc)
         # plt.plot(fpr, tpr, label="AUC="+str(auroc))
 
-        return [precision[0], recall[0], accuracy, f_score[0], auroc]
+        return {'Precision': precision[0], 'Recall': recall[0], 'Accuracy': accuracy,
+                'F-Score': f_score[0], 'AUROC': auroc}
 
     def save_model(self):
         # Saving Model
@@ -143,8 +145,8 @@ class ModelTrainer(object):
         torch.save(self._model.state_dict(), model_dir + '/model__' + name + '.pt')
         torch.save(self._validation_loss, model_dir + '/loss__' + name + '.pt')
 
-    def _classify(self):
-        pass
+    # def _classify(self):
+    #     pass
 
 
 def read_training_data(directory):
@@ -154,10 +156,12 @@ def read_training_data(directory):
     return TensorDataset(*map(torch.tensor, (np.load(input_file), np.load(output_file))))
 
 
-def evaluate_models(models, directory, num_iterations=100):
+def evaluate_models(models, directory, num_iterations=100, measures=None):
+    if measures is None:
+        measures = ['Accuracy', 'Precision', 'Recall', 'F-Score', 'AUROC']
     dataset = read_training_data(directory)
-    stats = ['Precision', 'Recall', 'Accuracy', 'F-Score', 'AUROC']
-    classification_stats = {model: {name: [] for name in stats} for model in models}
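+    # classification_stats[measure][model] gathers one score per CV fold and iteration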
+    classification_stats = {measure: {model: [] for model in models} for measure in measures}
     for iteration in range(num_iterations):
         for train_index, test_index in KFold(n_splits=5, shuffle=True).split(dataset):
             # print("TRAIN:", train_index, "TEST:", test_index)
@@ -166,28 +170,15 @@
 
             for model in models:
                 result = models[model].test_model(training_set, test_set)
-                count = 0
-                for stat in stats:
-                    classification_stats[model][stat].append(result[count])
-                    count += 1
-
-    # print(classification_stats)
-    # print(np.array(classification_stats).mean(axis=0))
-    # print(np.array(classification_stats['Adam']['Precision']).shape)
-    # print(np.array([np.array(classification_stats[model]) for model in models]).transpose().shape)
-    # print(np.array([np.array(classification_stats[model]).transpose() for model in models]).shape)
-    # print(np.array([[classification_stats[model][stat] for model in models] for stat in stats]).shape)
-    # print(np.array([[np.array(classification_stats[model][stat]).mean(axis=0) for model in models] for stat in stats]).shape)
-    # print(np.array([*(np.array([[classification_stats[model][stat]
-    #                                          for model in models] for stat in stats]))]).shape)
-    # print(*(np.array([[classification_stats[model][stat]
-    #                                          for model in models] for stat in stats]))[0].shape)
-    plot_boxplot(models.keys(), *(np.array([[classification_stats[model][stat]
-                                             for model in models] for stat in stats])))
-    classification_stats = [[np.array(classification_stats[model][stat]).mean(axis=0) for model in models] for stat in stats]
-    # print(*classification_stats)
-
-    plot_classification_accuracy(models.keys(), *classification_stats)
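+                # Record this fold's score for every requested measure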
+                for measure in measures:
+                    classification_stats[measure][model].append(result[measure])
+
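+    # Visualize the raw per-fold scores as box plots, then average them for the bar plot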
+    plot_boxplot(models.keys(), classification_stats)
+    classification_stats = {measure: {model: np.array(classification_stats[measure][model]).mean()
+                                      for model in models} for measure in measures}
+    plot_classification_accuracy(models.keys(), classification_stats)
 
     # Set paths for plot files if not existing already
     plot_dir = directory + '/model evaluation'
diff --git a/Plotting.py b/Plotting.py
index 078381c7845b55386040a71b9db33bd7a07d313d..c9e1c74fa2b62ce226d7f3ce6b46ec0c8911f7b2 100644
--- a/Plotting.py
+++ b/Plotting.py
@@ -3,7 +3,8 @@
 @author: Laura C. Kühle
 
 TODO: Give option to select plotting color
-TODO: Improve classification plotting
+TODO: Improve classification plotting -> Done
+TODO: Add documentation to plot_boxplot()
 
 """
 import numpy as np
@@ -236,7 +237,7 @@ def calculate_exact_solution(mesh, cell_len, wave_speed, final_time, interval_le
     return grid, exact
 
 
-def plot_classification_accuracy(xlabels, precision, recall, accuracy, fscore, auroc):
+def plot_classification_accuracy(model_names, evaluation_dict):
     """Plots classification accuracy.
 
-    Plots the accuracy, precision, and recall in a bar plot.
+    Plots the given evaluation measures for each model in a bar plot.
@@ -253,52 +254,52 @@ def plot_classification_accuracy(xlabels, precision, recall, accuracy, f
         List of strings for x-axis labels.
 
     """
-    pos = np.arange(len(xlabels))
-    width = 1/(3*len(xlabels))
+    pos = np.arange(len(model_names))
+    width = 1/(3*len(model_names))
     fig = plt.figure('classification_accuracy')
     ax = fig.add_axes([0.15, 0.1, 0.75, 0.8])
-    ax.bar(pos - 2*width, fscore, width, label='F-Score')
-    ax.bar(pos - width, precision, width, label='Precision')
-    ax.bar(pos, recall, width, label='Recall')
-    ax.bar(pos + width, accuracy, width, label='Accuracy')
-    ax.bar(pos + 2*width, auroc, width, label='AUROC')
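+    # Offset each measure's bars so the group is centered on its model's tick position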
+    step_len = 1
+    adjustment = -(len(evaluation_dict)//2)*step_len
+    for measure in evaluation_dict:
+        model_eval = [evaluation_dict[measure][model] for model in model_names]
+        ax.bar(pos + adjustment*width, model_eval, width, label=measure)
+        adjustment += step_len
     ax.set_xticks(pos)
-    ax.set_xticklabels(xlabels)
+    ax.set_xticklabels(model_names)
     ax.set_ylabel('Classification (%)')
     ax.set_ylim(bottom=-0.02)
     ax.set_ylim(top=1.02)
-    ax.set_title('Non-Normalized Test Data')
+    ax.set_title('Classification Evaluation (Barplot)')
     ax.legend(loc='upper right')
     # fig.tight_layout()
 
 
-def plot_boxplot(xlabels, precision, recall, accuracy, fscore, auroc):
+def plot_boxplot(model_names, evaluation_dict):
     fig = plt.figure('boxplot_accuracy')
-    pos = np.arange(len(xlabels))
-    width = 1/(5*len(xlabels))
     ax = fig.add_axes([0.15, 0.1, 0.75, 0.8])
+    step_len = 1.5
     boxplots = []
-    boxplots.append(ax.boxplot(fscore.transpose(), positions=pos - 3*width, widths=width, meanline=True,
-                               showmeans=True, patch_artist=True))
-    boxplots.append(ax.boxplot(precision.transpose(), positions=pos - 1.5*width, widths=width, meanline=True,
-                               showmeans=True, patch_artist=True))
-    boxplots.append(ax.boxplot(recall.transpose(), positions=pos, widths=width, meanline=True, showmeans=True,
-                               patch_artist=True))
-    boxplots.append(ax.boxplot(accuracy.transpose(), positions=pos + 1.5*width, widths=width, meanline=True,
-                               showmeans=True, patch_artist=True))
-    boxplots.append(ax.boxplot(auroc.transpose(), positions=pos + 3*width, widths=width, meanline=True,
-                               showmeans=True, patch_artist=True))
-    count = 0
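+    # Offset each measure's boxes so the group is centered on its model's tick position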
+    adjustment = -(len(evaluation_dict)//2)*step_len
+    pos = np.arange(len(model_names))
+    width = 1/(5*len(model_names))
     colors = ['red', 'yellow', 'blue', 'tan', 'green']
-    for bp in boxplots:
-        for patch in bp['boxes']:
+    count = 0
+    for measure in evaluation_dict:
+        model_eval = [evaluation_dict[measure][model] for model in model_names]
+        boxplot = ax.boxplot(model_eval, positions=pos + adjustment*width, widths=width,
+                             meanline=True, showmeans=True, patch_artist=True)
+        for patch in boxplot['boxes']:
             patch.set(facecolor=colors[count])
-        count +=1
+        boxplots.append(boxplot)
+        count += 1
+        adjustment += step_len
+
     ax.set_xticks(pos)
-    ax.set_xticklabels(xlabels)
+    ax.set_xticklabels(model_names)
     ax.set_ylim(bottom=-0.02)
     ax.set_ylim(top=1.02)
     ax.set_ylabel('Classification (%)')
-    ax.set_title('Non-Normalized Test Data')
-    ax.legend([bp["boxes"][0] for bp in boxplots],
-              ['F-Score', 'Precision', 'Recall', 'Accuracy', 'AUROC'], loc='upper right')
+    ax.set_title('Classification Evaluation (Boxplot)')
+    ax.legend([bp["boxes"][0] for bp in boxplots], evaluation_dict.keys(), loc='upper right')