diff --git a/.gitignore b/.gitignore
index 2daddf9843e93cb05d516e15993235059a0b4523..eea8c7e5cc99e3e9f29ca8d09eb593bc4059b990 100644
--- a/.gitignore
+++ b/.gitignore
@@ -18,3 +18,5 @@
 Pipeline/*/
 !Pipeline/envs/
 !Pipeline/scripts/
+
+HOGVAX/*/
diff --git a/ContextEmbedding/binding_affinities_for_embedded_peptide.py b/ContextEmbedding/binding_affinities_for_embedded_peptide.py
new file mode 100644
index 0000000000000000000000000000000000000000..b29f5a30f8eedf4388b139dbc739c7dcf24556a3
--- /dev/null
+++ b/ContextEmbedding/binding_affinities_for_embedded_peptide.py
@@ -0,0 +1,30 @@
+import pandas as pd
+import numpy as np
+from ProcessData import read_peptides
+
+peptide_file = '../../Gifford_Data/peptides/mhcI_peptides_with_mutations_1embedding.pep'
+embedded_epitope_features = '../../Gifford_Data/peptide_context_embedding/1EmbeddedEpitopeFeatures.pkl'
+ba_file = '/home/sara/Documents/VaccinesProject/ivp/Gifford_Data/25June_mhc1_netmhc-4.1_pred_affinity_pivot.pkl.gz'
+
+embedded_peptides = read_peptides.main(peptide_file)
+# embedded_peptides = ['MYSFVSEET', 'MYSFVSEETG', 'YSFVSEETGT', 'SFVSEETGTL']
+all_embedded = pd.read_pickle(embedded_epitope_features)
+ba_affinities = pd.read_pickle(ba_file)
+
+mapped_peptides = pd.DataFrame(all_embedded.loc[embedded_peptides]['peptide']).reset_index(level=0).set_index('peptide')
+print(mapped_peptides)
+same_pep = list(set([p for p in mapped_peptides.index if p in ba_affinities.index]))
+print(same_pep)
+mapped_peptides = mapped_peptides.loc[same_pep]
+filtered = ba_affinities.reset_index()
+filtered.Peptide = filtered.Peptide.apply(lambda x: mapped_peptides.loc[x]['embedded'] if any(x == p for p in same_pep) else np.nan)
+filtered = filtered[filtered['Peptide'].notna()]
+print(filtered)
+filtered = filtered.set_index(['Peptide'])
+for embed in mapped_peptides['embedded']:
+    if embed not in filtered.index:
+        print(embed, ' is not in filtered index!!!')
+        exit(-1)
+
+print(filtered)
+# new_df.to_pickle('../../Gifford_Data/' + str(embed) + 'embedded_25June_mhc1_netmhc-4.1_pred_affinity_pivot.pkl.gz', compression='gzip')
\ No newline at end of file
diff --git a/ContextEmbedding/create_embedded_input_peptides.py b/ContextEmbedding/create_embedded_input_peptides.py
new file mode 100644
index 0000000000000000000000000000000000000000..2eb0d345679f5b73c08660e8ece1fcace9382eb7
--- /dev/null
+++ b/ContextEmbedding/create_embedded_input_peptides.py
@@ -0,0 +1,44 @@
+import pandas as pd
+from ProcessData import read_peptides
+
+max_embed = 10
+mhc = 'mhcII'
+peptide_range = [13, 26] # exclusive end value
+original_feature_file = '../../Gifford_Data/AllEpitopeFeatures.pkl'
+ba_file = '/home/sara/Documents/VaccinesProject/ivp/Gifford_Data/25June_mhc2_netmhcii-4.1_pred_affinity_pivot_v1v2.pkl.gz'
+
+for i in range(1, max_embed+1):
+    embed = str(i)
+    feature_file = '../../Gifford_Data/peptide_context_embedding/' + embed + 'EmbeddedEpitopeFeatures.pkl'
+    out_file_ori = '../../Gifford_Data/peptides/' + mhc + '_peptides_with_mutations.pep'
+    out_file = '../../Gifford_Data/peptides/' + mhc + '_peptides_with_mutations_' + embed + 'embedding.pep'
+
+    ori_features = pd.read_pickle(original_feature_file)
+    features = pd.read_pickle(feature_file)
+
+    # filtering like Gifford
+    selfp = open('../../Gifford_Data/self_pept.txt').read().split('\n')
+    ori_peptides = ori_features[(ori_features['epi_len'].isin(range(*peptide_range))) &
+                                (ori_features['glyco_probs']<=0.0) &
+                                (ori_features['crosses_cleavage']==0) &
+                                (ori_features['Is_self_pept']==False)].index.to_list()
+    print(len(ori_peptides))
+
+    # drop peptides that do not occur in ba predictions
+    ba = pd.read_pickle(ba_file)
+    na = []
+    for pep in ori_peptides:
+        if pep not in ba.index:
+            ori_peptides.remove(pep)
+            na.append(pep)
+    print(len(na), na)
+    print(len(ori_peptides))
+
+    peptides = features[features['peptide'].isin(ori_peptides)].index
+    print(len(peptides))
+
+    with open(out_file, 'w') as ori_out:
+        ori_out.write(('\n').join(peptides))
+
+with open(out_file_ori, 'w') as out:
+    out.write('\n'.join(ori_peptides))
\ No newline at end of file
diff --git a/ContextEmbedding/embedded_epitope_features.py b/ContextEmbedding/embedded_epitope_features.py
new file mode 100644
index 0000000000000000000000000000000000000000..9b1e81effa880ded7f73a47a4db4b9761bc65920
--- /dev/null
+++ b/ContextEmbedding/embedded_epitope_features.py
@@ -0,0 +1,45 @@
+import pandas as pd
+import re
+
+
+def get_embedded_epitope_features(embed, all_epi_feat, min_len):
+    with open('../../Gifford_Data/peptide_context_embedding/base_proteins_with_end_codon.fasta', 'r') as fasta:
+        proteins = {}
+        key = ''
+        for line in fasta:
+            if line.startswith('>'):
+                key = re.match('> Protein: ([A-Z]+\d*[a-z]*) |', line).group(1)
+                continue
+            proteins[key] = line.strip('\n').strip('*')
+
+    df_list = []
+    for peptide in all_epi_feat.index:
+        protein = all_epi_feat.loc[peptide]['protein']
+        if protein == 'S1' or protein == 'S2':
+            protein = 'S'
+        pos = proteins[protein].index(peptide)
+        start_embed = pos - embed
+        end_embed = pos + len(peptide) + embed
+        if start_embed < 0 or end_embed > len(proteins[protein]):
+            if len(peptide) < min_len:
+                embedded = peptide
+            else:
+                continue
+        else:
+            embedded = proteins[protein][start_embed:end_embed]
+        # embedded = proteins[protein][max(pos-embed, 0):min(pos+len(peptide)+embed, len(proteins[protein]))]
+        tmp = pd.concat([pd.Series({'embedded': embedded, 'peptide': peptide, 'embedding_length': embed}), all_epi_feat.loc[peptide]])
+        df_list.append(tmp)
+
+    new_df = pd.concat(df_list, axis=1).T.set_index('embedded')
+    print('Writing ../../Gifford_Data/' + str(embed) + 'EmbeddedEpitopeFeatures.pkl')
+    new_df.to_pickle('../../Gifford_Data/peptide_context_embedding/' + str(embed) + 'EmbeddedEpitopeFeatures.pkl')
+
+
+max_embed = 10
+min_len = 10
+# read in all epitope features of original peptides
+all_epi_feat = pd.read_pickle('../../Gifford_Data/AllEpitopeFeatures.pkl')
+
+for i in range(1, max_embed+1):
+    get_embedded_epitope_features(embed=i, all_epi_feat=all_epi_feat, min_len=min_len)
\ No newline at end of file
diff --git a/ContextEmbedding/reformat_netchop_out.py b/ContextEmbedding/reformat_netchop_out.py
new file mode 100644
index 0000000000000000000000000000000000000000..127ebd594357eb36ed12ccb7a397984ee0fdcf7c
--- /dev/null
+++ b/ContextEmbedding/reformat_netchop_out.py
@@ -0,0 +1,51 @@
+import re
+import os
+
+prefix = '../../Proteasomal_Cleavage_Predictions/24SeptEmbeddingPredictions/MHCI/'
+files = [f for f in os.listdir(prefix) if re.search(r'NetChop3\.1_17Aug_mhcI+(_\d+embedded)*_hogvax(_with_mutations)*\.txt$', f)]
+# files = ['NetChop3.1_17Aug_mhcI_1embedded_hogvax.txt', 'NetChop3.1_17Aug_mhcI_3embedded_hogvax.txt',
+#          'NetChop3.1_17Aug_mhcI_4embedded_hogvax.txt', 'NetChop3.1_17Aug_mhcI_5embedded_hogvax.txt',
+#          'NetChop3.1_17Aug_mhcI_8embedded_hogvax.txt']
+# prefix = '../../Proteasomal_Cleavage_Predictions/SarsCoVCleavage/'
+print(files)
+
+for file in files:
+    keep = []
+    indices = []
+    with open(prefix + file, 'r') as f:
+        for line in f:
+            line = line.lstrip()
+            if line.startswith('#') or line.startswith('-'):
+                continue
+            if line.startswith('pos'):
+                indices.append(len(keep))
+                line = re.sub(pattern='\s+', repl='\t', string=line)
+                line = line.strip() + '\n'
+                keep.append(line)
+                continue
+            if re.match('^\d', line):
+                match = re.search(r'\s+(\D+\d*\D*)(\s*_*\t*)\n', line)
+                if match:
+                    ident = match.group(1)
+                    line = line.replace(match.group(1), '"' + ident + '"')
+                    line = re.sub(pattern='\s+', repl='\t', string=line)
+                    line = line.strip() + '\n'
+                    match2 = re.search(r'(\t_\t\D*)\"', line)
+                    if match2:
+                        line = line.replace(match2.group(1), '')
+                    keep.append(line)
+
+    if len(indices) == 2:
+        with open('../../Proteasomal_Cleavage_Predictions/24SeptEmbeddingPredictions/MHCI/' + file.replace('.txt', '_reformat.txt'), 'w') as out:
+            out.write(''.join(keep[indices[0]:indices[1]]))
+
+        with open('../../Proteasomal_Cleavage_Predictions/24SeptEmbeddingPredictions/MHCI/' + file.replace('.txt', '_reformat_concat.txt'), 'w') as out:
+            out.write(''.join(keep[indices[1]:-1]))
+    else:
+        with open('../../Proteasomal_Cleavage_Predictions/SarsCoVCleavage/' + file.replace('.txt', '_reformat.txt'), 'w') as out:
+            out.write(''.join(keep[0]))
+            for i in range(len(indices)):
+                end = i+1
+                if end >= len(indices):
+                    end = -1
+                out.write(''.join(keep[indices[i]+1:indices[end]]))
diff --git a/ContextEmbedding/sars_netchop_to_pkl.py b/ContextEmbedding/sars_netchop_to_pkl.py
new file mode 100644
index 0000000000000000000000000000000000000000..993ea4338b36a70d01a63d6b9f83dbe594a276ce
--- /dev/null
+++ b/ContextEmbedding/sars_netchop_to_pkl.py
@@ -0,0 +1,29 @@
+import pandas as pd
+import re
+
+keep = []
+indices = []
+with open('/home/sara/Documents/VaccinesProject/ivp/Proteasomal_Cleavage_Predictions/SarsCoVCleavage/NetChop3.1_sars_cov_2_predictions.txt') as file:
+    for line in file:
+        line = line.lstrip()
+        if line.startswith('#') or line.startswith('-'):
+            continue
+        if line.startswith('pos'):
+            indices.append(len(keep))
+            line = re.sub(pattern='\s+', repl='\t', string=line)
+            line = line.strip().split('\t')
+            keep.append(line)
+            continue
+        if re.match('^\d', line):
+            match = re.search(r'\s+(\D+)\n', line)
+            if match:
+                line = line.replace('Base', '').replace('_', '')
+                line = re.sub(pattern='\s+', repl='\t', string=line)
+                line = line.strip().split('\t')
+                keep.append(line)
+
+print(keep)
+df = pd.DataFrame(keep)
+df = df.iloc[: , :-1]
+print(df)
+df.to_pickle('/home/sara/Documents/VaccinesProject/ivp/Proteasomal_Cleavage_Predictions/SarsCoVCleavage/NetChop3.1_sars_cov_2_predictions.pkl')
\ No newline at end of file
diff --git a/Evaluation/binarize_entries.py b/Evaluation/binarize_entries.py
new file mode 100644
index 0000000000000000000000000000000000000000..f232cdd968120314830d06ce81d425c567bb4306
--- /dev/null
+++ b/Evaluation/binarize_entries.py
@@ -0,0 +1,14 @@
+import pandas as pd
+
+
+def binarize_entries(df, th):
+    for key in df.keys():
+        df.loc[df[key] >= th, key] = 1.0
+        df.loc[df[key] < th, key] = 0.0
+    return df
+
+
+def set_small_to_zero(df, th):
+    for key in df.keys():
+        df.loc[df[key] < th, key] = 0.0
+    return df
diff --git a/Evaluation/calculate_cleaved_peptides.py b/Evaluation/calculate_cleaved_peptides.py
new file mode 100644
index 0000000000000000000000000000000000000000..529fd8180822876722cff0e1a2578e9c8d97e562
--- /dev/null
+++ b/Evaluation/calculate_cleaved_peptides.py
@@ -0,0 +1,54 @@
+import sys
+import pandas as pd
+
+
+def get_cleaved_peptides(cleavage):
+    vax_seq = ''.join(cleavage['residue'])
+    start = 0
+    peptides_from_prediction = []
+    # print(cleavage.iloc[0])
+    # print(cleavage[cleavage['cleaved']])
+    for outer in cleavage[cleavage['cleaved']].itertuples():
+        outer = outer._asdict()
+        # cut happens after cleaved = True but file count is 1-based
+        next_start = outer['position']
+        for inner in cleavage[(cleavage['cleaved']) & (cleavage['position'] > start)].itertuples():
+            inner = inner._asdict()
+            # 1-based!
+            pos = inner['position']
+            peptide = vax_seq[start:pos]
+            if 8 <= len(peptide) <= 25:
+                peptides_from_prediction.append(vax_seq[start:pos])
+        start = next_start
+
+    return list(set(peptides_from_prediction))
+
+
+def main(pepsickle_file, netchop_file):
+    with open(pepsickle_file, 'r') as file:
+        df_pep = pd.read_csv(file, sep='\t', lineterminator='\n')
+
+    peptides_from_pepsickle = get_cleaved_peptides(df_pep)
+
+    with open('/home/sara/Documents/VaccinesProject/ivp/Proteasomal_Cleavage_Predictions/SarsCoVCleavage/pepsickle_sars_cov_2_cleavage_peptides.txt', 'w') as out:
+        out.write('\n'.join(peptides_from_pepsickle))
+
+    with open(netchop_file, 'r') as file:
+        df_net = pd.read_csv(file, sep='\s+', lineterminator='\n')
+
+    df_net = df_net.rename(columns={'pos': 'position', 'AA': 'residue', 'C': 'cleaved'})
+    df_net['cleaved'] = df_net['cleaved'].replace(['S'], True).replace(['.'], False)
+
+    peptides_from_netchop = get_cleaved_peptides(df_net)
+
+    with open(
+            '/home/sara/Documents/VaccinesProject/ivp/Proteasomal_Cleavage_Predictions/SarsCoVCleavage/netchop_sars_cov_2_cleavage_peptides.txt',
+            'w') as out:
+        out.write('\n'.join(peptides_from_netchop))
+
+
+if __name__ == '__main__':
+    pepsickle = sys.argv[1]
+    netchop = sys.argv[2]
+    main(pepsickle, netchop)
+
diff --git a/Evaluation/calculate_protein_cleaved_peptides.py b/Evaluation/calculate_protein_cleaved_peptides.py
new file mode 100644
index 0000000000000000000000000000000000000000..1fec70db488fca82d877d6d466273a7ee3e96b9a
--- /dev/null
+++ b/Evaluation/calculate_protein_cleaved_peptides.py
@@ -0,0 +1,110 @@
+"""
+Compare cleavage prediction peptides of different (embedded) vaccines with cleavage predicted peptides from SARS-CoV-2
+proteome.
+"""
+import re
+import glob
+import pandas as pd
+import altair as alt
+from EvaluateVaccines import calculate_cleaved_peptides
+
+
+def read_peptides(file):
+    peptides = []
+    with open(file, 'r') as f:
+        for line in f:
+            pep = line.strip()
+            peptides.append(pep)
+    print(len(peptides))
+    return peptides
+
+
+def transform_data(data):
+    new_dict = {'embedding': [], 'tool': [], 'fraction': []}
+    for key in data:
+        new_dict['embedding'].extend([key, key])
+        new_dict['tool'].extend(['NetChop3.1', 'Pepsickle'])
+        new_dict['fraction'].extend([data[key]['NetChop3.1'], data[key]['Pepsickle']])
+    return new_dict
+
+
+out_name = '23Aug_mhcI_cleaved_virus_proteins'
+sars_peptides_pepsickle = read_peptides('/home/sara/Documents/VaccinesProject/ivp/Proteasomal_Cleavage_Predictions/SarsCoVCleavage/pepsickle_sars_cov_2_cleavage_peptides.txt')
+sars_peptides_netchop = read_peptides('/home/sara/Documents/VaccinesProject/ivp/Proteasomal_Cleavage_Predictions/SarsCoVCleavage/netchop_sars_cov_2_cleavage_peptides.txt')
+
+intersection_count = {}
+files = glob.glob('/home/sara/Documents/VaccinesProject/ivp/Proteasomal_Cleavage_Predictions/23AugEmbeddingPredictions/MHCI/NetChop3.1_17Aug_mhcI_*_reformat.txt') + \
+        ['/home/sara/Documents/VaccinesProject/ivp/Proteasomal_Cleavage_Predictions/23AugEmbeddingPredictions/MHCI/NetChop3.1_17Aug_mhcI_with_mutations_reformat_concat.txt']
+for file in files:
+    match = re.search(r'_17Aug_(mhcI+(_\d+embedded)*(_hogvax)*)(_with_mutations)*(_reformat_concat)*', file)
+    name = match.group(1).replace('_hogvax', '')
+    if match.group(5):
+        name += '_concat'
+
+    df_net = pd.read_csv(file, sep='\s+', lineterminator='\n')
+    df_net = df_net.rename(columns={'pos': 'position', 'AA': 'residue', 'C': 'cleaved'})
+    df_net['cleaved'] = df_net['cleaved'].replace(['S'], True).replace(['.'], False)
+
+    peptides_from_netchop = calculate_cleaved_peptides.get_cleaved_peptides(df_net)
+
+    intersec = [p for p in peptides_from_netchop if p in sars_peptides_netchop]
+    intersection_count[name] = {'NetChop3.1': len(intersec)/len(peptides_from_netchop)}
+
+for file in glob.glob('/home/sara/Documents/VaccinesProject/ivp/Proteasomal_Cleavage_Predictions/23AugEmbeddingPredictions/MHCI/pepsickle_17Aug_mhcI_*.txt'):
+    match = re.search(r'_17Aug_(mhcI+(_\d+embedded)*(_hogvax)*)(_with_mutations)*', file)
+    name = match.group(1).replace('_hogvax', '')
+
+    df_pep = pd.read_csv(file, sep='\t', lineterminator='\n')
+    df_pep = df_pep[df_pep['protein_id'] != 'HOGVAX']
+    peptides_from_pepsickle = calculate_cleaved_peptides.get_cleaved_peptides(df_pep)
+
+    intersec = [p for p in peptides_from_pepsickle if p in sars_peptides_pepsickle]
+    if not name in intersection_count:
+        print('ATTENTION! Is not in dict:', name)
+        exit(-1)
+
+    intersection_count[name]['Pepsickle'] = len(intersec) / len(peptides_from_pepsickle)
+
+# add concat data
+with open('/home/sara/Documents/VaccinesProject/ivp/Proteasomal_Cleavage_Predictions/23AugEmbeddingPredictions/MHCI/pepsickle_17Aug_mhcI_with_mutations.txt') as f:
+    df_pep = pd.read_csv(file, sep='\t', lineterminator='\n')
+    df_pep = df_pep[df_pep['protein_id'] != 'Concat']
+    peptides_from_pepsickle = calculate_cleaved_peptides.get_cleaved_peptides(df_pep)
+
+    intersec = [p for p in peptides_from_pepsickle if p in sars_peptides_pepsickle]
+    if not name in intersection_count:
+        print('ATTENTION! Is not in dict:', name)
+        exit(-1)
+
+    intersection_count['mhcI_concat']['Pepsickle'] = len(intersec) / len(peptides_from_pepsickle)
+
+
+print(intersection_count)
+df = pd.DataFrame(transform_data(intersection_count))
+print(df)
+
+# sorting = ['mhcII', 'mhcII_concat', 'mhcII_1embedded', 'mhcII_2embedded', 'mhcII_3embedded', 'mhcII_4embedded', 'mhcII_5embedded', 'mhcII_6embedded', 'mhcII_7embedded', 'mhcII_8embedded', 'mhcII_9embedded', 'mhcII_10embedded']
+sorting = ['mhcI', 'mhcI_concat', 'mhcI_1embedded', 'mhcI_2embedded', 'mhcI_3embedded', 'mhcI_4embedded', 'mhcI_5embedded', 'mhcI_6embedded', 'mhcI_7embedded', 'mhcI_8embedded', 'mhcI_9embedded', 'mhcI_10embedded']
+colors = ['#FFAD00', '#3A7A9B']
+bar = alt.Chart(df).mark_bar(size=25).encode(
+            x=alt.X('tool:O', axis=alt.Axis(title=None, labels=False, ticks=False)),
+            y=alt.Y('fraction:Q', title='Fraction of cleaved peptides'),
+            color=alt.Color('tool:O', scale=alt.Scale(range=colors), legend=alt.Legend(title='Tool')),
+            column=alt.Column('embedding:O', title='Cleaved peptides that are equally predicted from SARS-CoV-2 proteome for different embedding lengths', sort=sorting)
+        ).configure_axis(
+            # grid=False
+        ).configure_view(
+            strokeOpacity=0
+        ).properties(
+            width=90
+        )
+
+bar.save('cleavage_plots/' + out_name + '.html')
+
+
+# all_epi_ft = pd.read_pickle('/home/sara/Documents/VaccinesProject/ivp/Gifford_Data/AllEpitopeFeatures.pkl')
+# known_peptides_pepsickle = [p for p in sars_peptides_pepsickle if p in all_epi_ft.index]
+# print(len(known_peptides_pepsickle))
+# known_peptides_netchop = [p for p in sars_peptides_netchop if p in all_epi_ft.index]
+# print(len(known_peptides_netchop))
+
diff --git a/Evaluation/compare_pep_count_vs_score.py b/Evaluation/compare_pep_count_vs_score.py
new file mode 100644
index 0000000000000000000000000000000000000000..0dcdec92774718d3bb65020bb85755070ad0fce4
--- /dev/null
+++ b/Evaluation/compare_pep_count_vs_score.py
@@ -0,0 +1,46 @@
+import re
+import glob
+import json
+import pandas as pd
+import altair as alt
+from ProcessData import read_peptides
+
+
+data = {'Peptide Count': [], 'Objective Value': [], 'Embedding': []}
+files = ['/home/sara/Documents/VaccinesProject/ivp/Pipeline/17Aug_mhcI_with_mutations/ilp/lp_out/27030_vaccine_ilp_hog.json'] + \
+        glob.glob('/home/sara/Documents/VaccinesProject/ivp/Pipeline/17Aug_mhcI_*embedded/ilp/lp_out/*_vaccine_ilp_hog.json')
+for file in files:
+    js = json.load(open(file))
+    tmp = pd.DataFrame(js['SolutionInfo'])
+    obj_val = tmp['ObjVal'][0]
+
+    embed_len = re.search(r'17Aug_mhcI_(\d*)(embedded)*', file).group(1)
+    if not embed_len:
+        embed_len = 0
+    else:
+        embed_len = int(embed_len)
+
+    pep_file = file.replace('lp_out', 'pep_out').replace('vaccine_ilp_hog.json', 'chosen_peptides_hog_inc_substrings.txt')
+    peptides = read_peptides.main(pep_file)
+    pep_count = len(peptides)
+
+    data['Peptide Count'].append(pep_count)
+    data['Objective Value'].append(obj_val)
+    data['Embedding'].append(embed_len)
+
+print(data)
+df = pd.DataFrame(data)
+
+sorting = ['mhcI', 'mhcI_1embedded', 'mhcI_2embedded', 'mhcI_3embedded', 'mhcI_4embedded', 'mhcI_5embedded', 'mhcI_6embedded', 'mhcI_7embedded', 'mhcI_8embedded', 'mhcI_9embedded', 'mhcI_10embedded']
+domain = [df.min()['Objective Value'], (df.min()['Objective Value'] + df.max()['Objective Value'])/2, df.max()['Objective Value']]
+print(domain)
+# colors = ['#9AD6E5', '#66A6D9', '#BBB465', '#7A9B3A', '#DA0034', '#3A7A9B', '#FFAD00', '#9B3A7A', '#294B5D', '#DA2300', '#9B5B3A']
+colors = ['#7A9B3A', '#DA0034', '#3A7A9B']
+chart = alt.Chart(df).mark_circle(size=90).encode(
+    x = alt.X('Embedding:Q'),
+    y = alt.Y('Peptide Count'),
+    # y = alt.Y('Objective Value', scale=alt.Scale(domain=[2, 2.8])),
+    color = alt.Color('Objective Value', scale=alt.Scale(domain=domain, range=colors))
+)
+
+chart.save('cleavage_plots/embedding_vs_pepcount_vs_score_mhc1.html')
\ No newline at end of file
diff --git a/Evaluation/compare_single_vs_combined.py b/Evaluation/compare_single_vs_combined.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff105cf1984a8c1a2507c1ad24699376563b8c05
--- /dev/null
+++ b/Evaluation/compare_single_vs_combined.py
@@ -0,0 +1,64 @@
+# used this script for debugging
+import gzip
+import pandas as pd
+from ProcessData import read_peptides, binarize_entries, find_superstrings
+
+combined = '/home/sara/Documents/VaccinesProject/ivp/Pipeline/13July_mhcIandII/ilp/pep_out/41947_chosen_peptides_hog.txt'
+combined_substr = '/home/sara/Documents/VaccinesProject/ivp/Pipeline/13July_mhcIandII/ilp/pep_out/41947_chosen_peptides_hog_inc_substrings.txt'
+combined_peptides = read_peptides.main(combined)
+combined_peptides_substr = read_peptides.main(combined_substr)
+print(len(combined_peptides))
+
+# mhcI = '/home/sara/Documents/VaccinesProject/ivp/Pipeline/11July_filt_substrmhcI/ilp/pep_out/4512_chosen_peptides_hog_inc_substrings.txt'
+# mhcI_peptides = read_peptides.main(mhcI)
+# mhcII = '/home/sara/Documents/VaccinesProject/ivp/Pipeline/12July_filt_substrmhcII/ilp/pep_out/37435_chosen_peptides_hog_inc_substrings.txt'
+# mhcII_peptides = read_peptides.main(mhcII)
+
+# mhcI_equal = set(mhcI_peptides).intersection(combined_peptides)
+# print(len(mhcI_equal), 'of', len(mhcI_peptides))
+#
+# mhcII_equal = set(mhcII_peptides).intersection(combined_peptides)
+# print(len(mhcII_equal), 'of', len(mhcII_peptides))
+
+combined_ba = pd.read_pickle(gzip.open('/home/sara/Documents/VaccinesProject/ivp/Gifford_Data/25June_combined_mhc_netmhc_pred_affinity.pkl.gz'))
+combined_ba_substr = pd.read_pickle(gzip.open('/home/sara/Documents/VaccinesProject/ivp/Gifford_Data/substr_modified_25June_combined_mhc_netmhc_pred_affinity.pkl.gz'))
+# mhcI_ba = pd.read_pickle(gzip.open('../../Gifford_Data/25June_mhc1_netmhc-4.1_pred_affinity_pivot.pkl.gz'))
+# mhcII_ba = pd.read_pickle(gzip.open('../../Gifford_Data/25June_mhc2_netmhcii-4.1_pred_affinity_pivot_v1v2.pkl.gz'))
+
+combined_af = pd.read_pickle('../../Gifford_Data/IEDB_combined_population_frequency.pkl')
+# mhcI_af = pd.read_pickle('../../Gifford_Data/IEDB_population_frequency2392_normalized.pkl')
+# mhcII_af = pd.read_pickle('../../Gifford_Data/IEDB_population_frequency_mhc2_275normalized.pkl')
+
+print(set(combined_peptides_substr).intersection(combined_peptides))
+
+bin_ba = binarize_entries.binarize_entries(combined_ba, 0.638)
+bin_ba = bin_ba.loc[combined_peptides_substr]
+hit_alleles = bin_ba[[x for x in bin_ba if sum(bin_ba[x]) > 0]].columns
+score = sum(combined_af[[al for al in hit_alleles if al in combined_af]].loc['World'])
+print(score)
+
+bin_ba_sub = binarize_entries.binarize_entries(combined_ba_substr, 0.638)
+bin_ba_sub = bin_ba_sub.loc[combined_peptides]
+hit_alleles_sub = bin_ba_sub[[x for x in bin_ba_sub if sum(bin_ba_sub[x]) > 0]].columns
+score_sub = sum(combined_af[[al for al in hit_alleles_sub if al in combined_af]].loc['World'])
+print(score_sub)
+
+all_peptides = read_peptides.main('/home/sara/Documents/VaccinesProject/ivp/Gifford_Data/peptides/combined_filtered_mhc_peptides_gifford_style.pep')
+# superstrings = find_superstrings.main(all_peptides)
+super_covered_alleles = []
+for al in hit_alleles_sub:
+    if al not in hit_alleles:
+        super_covered_alleles.append(al)
+        # for p in bin_ba_sub.index:
+        #     if bin_ba_sub[al][p] > 0.0 and combined_ba[al][p] == 0.0:
+        #         print('Peptide {} covering allele {} in merged data, but not in original data'.format(p, al))
+        #         substrings = superstrings[p]
+        #         print(substrings)
+        #         if not any(combined_ba[al][x] > 0.0 for x in substrings):
+        #             print('None of the substrings has non-zero value in original data')
+        #         else:
+        #             for x in substrings:
+        #                 if x not in combined_peptides_substr:
+        #                     print('why is {} not in combined substr peptied, but {} is in and superstr'.format(x, p))
+
+print(super_covered_alleles)
\ No newline at end of file
diff --git a/Evaluation/compare_size_to_coverage.py b/Evaluation/compare_size_to_coverage.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c761bb7b84d3f35f1eabc2429764b46f153bf04
--- /dev/null
+++ b/Evaluation/compare_size_to_coverage.py
@@ -0,0 +1,84 @@
+import os
+import glob
+import json
+import re
+import gzip
+import pandas as pd
+import altair as alt
+from EvaluateVaccines import evaluate_vaccine as ev
+from EvaluateVaccines import comparison_data as cd
+from ProcessData import read_peptides
+
+
+mhc1_all_ba = pd.read_pickle(gzip.open('../../Gifford_Data/25June_mhc1_netmhc-4.1_pred_affinity_pivot.pkl.gz'))
+mhc1_af = pd.read_pickle('../../Gifford_Data/IEDB_population_frequency2392_normalized.pkl')
+
+mhc2_all_ba = pd.read_pickle(gzip.open('../../Gifford_Data/25June_mhc2_netmhcii-4.1_pred_affinity_pivot_v1v2.pkl.gz'))
+mhc2_af = pd.read_pickle('../../Gifford_Data/IEDB_population_frequency_mhc2_275normalized.pkl')
+
+mhc1_scores = {'Size': [], 'Score': [], 'Method': []}
+for file in glob.glob('/home/sara/Documents/VaccinesProject/ivp/Gifford_Code/optivax_1-19_vaccines_mhc1/13Sept_optivax_unlinked_peptides_*.txt'):
+    size = re.search(r'13Sept_optivax_unlinked_peptides_(\d+)', file).group(1)
+    mhc1_scores['Size'].append(size)
+
+    peptides = read_peptides.main(file)
+    mhc1_scores['Score'].append(cd.get_cmp_obj(peptides, mhc1_all_ba, mhc1_af, smaller_to_zero=True).pred_at_least_one('World'))
+    mhc1_scores['Method'].append('OptiVax')
+
+print(mhc1_scores)
+
+loci_cov = {'Locus': [], 'Score': [], 'Size': []}
+for hog_file in glob.glob('/home/sara/Documents/VaccinesProject/ivp/Pipeline/15Sept_mhcI_*/'):
+    size = re.search(r'15Sept_mhcI_(\d+)', hog_file).group(1)
+    mhc1_scores['Size'].append(size)
+
+    peptides = read_peptides.main(hog_file + 'ilp/pep_out/4512_chosen_peptides_hog_inc_substrings.txt')
+    mhc1_scores['Score'].append(cd.get_cmp_obj(peptides, mhc1_all_ba, mhc1_af, smaller_to_zero=True).pred_at_least_one('World'))
+    mhc1_scores['Method'].append('HOGVAX')
+
+print(mhc1_scores)
+
+df = pd.DataFrame.from_dict(mhc1_scores)
+print(df)
+
+chart = alt.Chart(df).mark_line().encode(
+    x='Size:Q',
+    y=alt.Y('Score:Q', scale=alt.Scale(domain=[0.4, 1.0])),
+    color=alt.Color('Method:N', scale=alt.Scale(range=['#DA0034', '#FFC12C']))
+)
+chart.save('plots/evalvax_cmp_mhc1.html')
+
+
+mhc2_scores = {'Size': [], 'Score': [], 'Method': []}
+for file in glob.glob('/home/sara/Documents/VaccinesProject/ivp/Gifford_Code/optivax_1-19_vaccines_mhc2/13Sept_optivax_unlinked_peptides_*.txt'):
+    size = re.search(r'13Sept_optivax_unlinked_peptides_(\d+)', file).group(1)
+    mhc2_scores['Size'].append(size)
+
+    peptides = read_peptides.main(file)
+    mhc2_scores['Score'].append(cd.get_cmp_obj(peptides, mhc2_all_ba, mhc2_af, smaller_to_zero=True).pred_at_least_one('World'))
+    mhc2_scores['Method'].append('OptiVax')
+
+print(mhc2_scores)
+
+for hog_file in glob.glob('/home/sara/Documents/VaccinesProject/ivp/Pipeline/22Sept_mhc2_*/'):
+    size = re.search(r'22Sept_mhc2_(\d+)', hog_file).group(1)
+    mhc2_scores['Size'].append(size)
+
+    pepfile = hog_file + 'ilp/pep_out/37435_chosen_peptides_hog_incl_substrings.txt'
+    if not os.path.isfile(pepfile):
+        pepfile = hog_file + 'ilp/pep_out/37435_chosen_peptides_hogvax_incl_substrings.txt'
+    peptides = read_peptides.main(pepfile)
+    mhc2_scores['Score'].append(cd.get_cmp_obj(peptides, mhc2_all_ba, mhc2_af, smaller_to_zero=True).pred_at_least_one('World'))
+    mhc2_scores['Method'].append('HOGVAX')
+
+print(mhc2_scores)
+
+df = pd.DataFrame.from_dict(mhc2_scores)
+print(df)
+
+chart = alt.Chart(df).mark_line().encode(
+    x='Size:Q',
+    y=alt.Y('Score:Q', scale=alt.Scale(domain=[0.6, 1.0])),
+    color=alt.Color('Method:N', scale=alt.Scale(range=['#DA0034', '#FFC12C']))
+)
+chart.save('plots/evalvax_cmp_mhc2.html')
\ No newline at end of file
diff --git a/Evaluation/comparison_data.py b/Evaluation/comparison_data.py
new file mode 100644
index 0000000000000000000000000000000000000000..187138466296f881be6ef6406d01b7e564992602
--- /dev/null
+++ b/Evaluation/comparison_data.py
@@ -0,0 +1,302 @@
+import gzip
+import pickle
+import os
+import pandas as pd
+import numpy as np
+import evaluate_vaccine as ev
+import plot_evaluation_results as plot
+import read_peptides, binarize_entries, create_haplotype_ba_predictions
+from collections import Counter
+
+
+def get_cmp_obj(peptides, all_ba, af, smaller_to_zero=False, binary=False, filter=True):
+    # only keep peptide columns corresponding to chosen peptides that also exist in prediction file
+    good_keys = all_ba.index.intersection(peptides)
+    pep_ba = all_ba.loc[good_keys]
+
+    if smaller_to_zero:
+        pep_ba = binarize_entries.set_small_to_zero(pep_ba, 0.638)
+
+    if binary:
+        pep_ba = binarize_entries.binarize_entries(pep_ba, 0.638)
+
+    if filter:
+        if len(pep_ba.columns) != len(af.columns):
+            columns_to_keep = [key for key in pep_ba if key in af]
+            pep_ba = pep_ba[columns_to_keep]
+            af = af[columns_to_keep]
+
+    return ev.EvaluationMetrices(peptides, pep_ba, af)
+    
+
+def collect_data_for_bar_plot(method_A, method_A_mhc1, method_A_mhc2, method_B, method_B_mhc1, method_B_mhc2,
+                              mhc1_all_ba, mhc1_af, mhc2_all_ba, mhc2_af, outdir):
+
+    results = {'Method': [], 'Population Coverage': [], 'Number of Peptides': []}
+
+    # mhc1
+    peptides = read_peptides.main(method_A_mhc1)
+    results['Method'].append('MHCI ' + method_A)
+    results['Population Coverage'].append(get_cmp_obj(peptides, mhc1_all_ba, mhc1_af, smaller_to_zero=True).pred_at_least_one('World'))
+    results['Number of Peptides'].append(len(peptides))
+
+    peptides = read_peptides.main(method_B_mhc1)
+    results['Method'].append('MHCI ' + method_B)
+    results['Population Coverage'].append(get_cmp_obj(peptides, mhc1_all_ba, mhc1_af, smaller_to_zero=True).pred_at_least_one('World'))
+    results['Number of Peptides'].append(len(peptides))
+
+    # mhc2
+    peptides = read_peptides.main(method_A_mhc2)
+    results['Method'].append('MHCII ' + method_A)
+    results['Population Coverage'].append(get_cmp_obj(peptides, mhc2_all_ba, mhc2_af, smaller_to_zero=True).pred_at_least_one('World'))
+    results['Number of Peptides'].append(len(peptides))
+
+    peptides = read_peptides.main(method_B_mhc2)
+    results['Method'].append('MHCII ' + method_B)
+    results['Population Coverage'].append(get_cmp_obj(peptides, mhc2_all_ba, mhc2_af, smaller_to_zero=True).pred_at_least_one('World'))
+    results['Number of Peptides'].append(len(peptides))
+
+    df = pd.DataFrame.from_dict(results)
+    plot.EvaluationPlots(df, outdir).ilp_opitvax_bar_plot()
+    df.to_csv(outdir + 'evalvax-unlinked.csv')
+
+
+def collect_data_for_locus_cov_pie(peptide_file, mhc, mhc_ba, af, loci, population, method, outdir):
+    cov_loci = {}
+    hit_loci = {}
+    hit_loci_freq = {}
+    peptides = read_peptides.main(peptide_file)
+    for locus in loci:
+        covered, unpred_alleles, unpred_frac, total, allele_hit_counts, hit_allele_freq = get_cmp_obj(peptides,
+                                                                                                      mhc_ba,
+                                                                                                      af,
+                                                                                                      binary=True,
+                                                                                                      filter=False
+                                                                                                      ).number_covered_alleles(population, locus)
+        uncovered = abs(total - covered - unpred_frac)
+        values = [round(v, 4) for v in [covered, unpred_frac, uncovered]]
+        cov_loci[locus] = {'allele_category': ['Covered', 'No-BA-prediction', 'Uncovered'], 'values': values}
+        hit_loci[locus] = allele_hit_counts
+
+        categories = []
+        fractions = []
+        for key in sorted(hit_allele_freq.keys(), key=float):
+            categories.append(str(key))
+            fractions.append(hit_allele_freq[key])
+        hit_loci_freq[locus] = {'allele_category': categories + ['No-BA-prediction', 'Uncovered'],
+                                'values': fractions + [unpred_frac, uncovered],
+                                'method': [locus + method] * (len(categories) + 2)}
+        print(hit_loci_freq)
+
+    plot.EvaluationPlots(cov_loci, outdir).covered_locus_pie(loci, mhc, method)
+    plot.EvaluationPlots(hit_loci, outdir).hit_pies(loci, mhc, method)
+    plot.EvaluationPlots(hit_loci_freq, outdir).hit_covered_locus_pie(loci, mhc, method)
+    return hit_loci_freq
+
+
+def collect_data_for_haplotype_eval(peptide_file, single_ba, hap_ba, af, max_int, mhc, method, outdir):
+    peptides = read_peptides.main(peptide_file)
+    single_ba = binarize_entries.binarize_entries(single_ba, 0.638).loc[[p for p in peptides if p in single_ba.index]]
+    hit_dict = {}
+    hit_n_dict = []
+    for population in af.index:
+        # hits is number of hits per genotype, hits_n is probability for n>=N hits per person in population
+        hits, hits_n, exp, no_predictions = get_cmp_obj(peptides, hap_ba.T, af, binary=True).pred_genotype_hits(population, single_ba, max_int, mhc)
+        hit_dict[population] = [hits, {'expected': [exp]}]
+        tmp = pd.DataFrame(hits)
+        tmp.to_csv(outdir + 'hit_hist_' + mhc + '_' + method + '.csv')
+        # preparation for grouped bar plot
+        hit_n_dict.append(hits_n | {'population': [population]*max_int})
+    # plot histogram of peptide-hla hits
+    plot.EvaluationPlots(hit_dict, outdir).hit_histogram(mhc, method)
+
+    cur_probs = [np.array(dic['prob']) for dic in hit_n_dict]
+    mean_probs = list(np.mean(cur_probs, axis=0))
+    hit_n_dict.append({'prob': mean_probs, 'n': [i for i in range(1, max_int+1)], 'population': ['Average']*max_int})
+
+    # reformat dict for grouped bar plots
+    df = pd.concat(pd.DataFrame(entry) for entry in hit_n_dict)
+    plot.EvaluationPlots(df, outdir).population_hit_bar_plot(mhc, method)
+    df.to_csv(outdir + 'min_hit_dict' + mhc + '_' + method + '.csv')
+
+
+def compute_hap_freq_no_ld(population, allele_freq, loci):
+    hap_data_values = []
+    hap_data_labels = []
+    for locus in loci:
+        tmp = allele_freq[locus].loc[population]
+        tmp = tmp[tmp > 0]
+        if 'unknown' in tmp:
+            tmp = tmp.drop('unknown')
+        hap_data_labels.append(tmp.index)
+        hap_data_values.append(tmp.values)
+
+    hap_freq = []
+    for i in range(len(hap_data_labels[0])):
+        print(i)
+        for j in range(len(hap_data_labels[1])):
+            for k in range(len(hap_data_labels[2])):
+                f = hap_data_values[0][i] * hap_data_values[1][j] * hap_data_values[2][k]
+                hap_freq.append(((hap_data_labels[0][i], hap_data_labels[1][j], hap_data_labels[2][k]), f))
+
+    unknown = 1 - sum(i for _, i in hap_freq)
+    hap_freq.append((('unknown', 'unknown', 'unknown'), unknown))
+    df = pd.DataFrame(hap_freq, columns=['Haplotype', population])
+    df = df.set_index('Haplotype').T
+
+    return df
+
+
+def collect_data_for_geographic_eval(threads, peptide_file, populations, allele_freq, single_ba, loci, mhc_type):
+    peptides = read_peptides.main(peptide_file)
+    hit_n_dict = []
+    for population in populations:
+        print(population)
+        haplotype_data = compute_hap_freq_no_ld(population, allele_freq, loci)
+        print('Calculated haplotype frequencies for ' + population)
+        hap_ba = create_haplotype_ba_predictions.main(threads, single_ba, haplotype_data, loci, population, mhc_type)
+        print('Calculated haplotype binding affinities')
+        single_ba = binarize_entries.binarize_entries(single_ba, 0.638).loc[peptides]
+        hits, hits_n, exp, no_predictions = get_cmp_obj(peptides, hap_ba.T, haplotype_data, binary=True).pred_genotype_hits(population, single_ba, 1, mhc_type)
+        print('Calculated evaluation results')
+        hit_n_dict.append(hits_n | {'population': population})
+
+    with open('data/hit_predictions_no_ld.pkl', 'wb') as file:
+        pickle.dump(hit_n_dict, file)
+
+
+def collect_data_for_composition_eval(peptide_file, feature_table, method, mhc_type, outdir):
+    peptides = read_peptides.main(peptide_file)
+    # composition = dict(Counter(feature_table.loc[peptides].protein))
+    composition = dict(Counter(feature_table.loc[peptides].protein))
+    df = pd.DataFrame(composition.items(), columns=['Protein', 'Count'])
+    print(df)
+    plot.EvaluationPlots(df, outdir).composition_plot(mhc_type, method)
+
+
+def call_single_evaluations(peptides, mhc_type, feature_table, allele_ba_df, haplotype_ba_df, allele_af_df, haplotype_af_df, loci,
+                            population, name, outdir, max_hit=10):
+    # print('Create pie charts')
+    # hit_dict = collect_data_for_locus_cov_pie(peptides, mhc_type, allele_ba_df, allele_af_df, loci, population, name, outdir)
+    print('Create histograms for haplotype comparison')
+    collect_data_for_haplotype_eval(peptides, allele_ba_df, haplotype_ba_df, haplotype_af_df, max_hit, mhc_type, name, outdir)
+    # print('Plot vaccine composition')
+    # collect_data_for_composition_eval(peptides, feature_table, mhc_type, name, outdir)
+    # return hit_dict
+
+
+def new_df(dict_list):
+    new_dict = {'allele_category': [], 'values': [], 'method': []}
+    for dict in dict_list:
+        for key in dict:
+            new_dict['allele_category'].extend(dict[key]['allele_category'])
+            new_dict['values'].extend(dict[key]['values'])
+            new_dict['method'].extend(dict[key]['method'])
+    df = pd.DataFrame(new_dict)
+    return df
+
+
+def main():
+    # optivax_mhc1 = '/home/sara/Documents/VaccinesProject/ivp/Gifford_Code/mhc1_optivax_unlinked_World_June28/28June_optivax_unlinked_mhc1_World_beam_peptides.txt'
+    # optivax_mhc2 = '/home/sara/Documents/VaccinesProject/ivp/Gifford_Code/mhc2_optivax_unlinked_World_June28/28June_optivax_unlinked_mhc2_World_beam_peptides.txt'
+    #
+    # hogvax_mhc1 = '/home/sara/Documents/VaccinesProject/ivp/Pipeline/18Sept_mhcI_122peptides/ilp/pep_out/4512_chosen_peptides_hog_inc_substrings.txt'
+    # hogvax_mhc2 = '/home/sara/Documents/VaccinesProject/ivp/Pipeline/18Sept_mhcII_1112/ilp/pep_out/37435_chosen_peptides_hog_inc_substrings.txt'
+
+    # haplotypes
+    optivax_mhc1 = '/home/sara/Documents/VaccinesProject/ivp/Gifford_Code/mhc1_optivax_robust_haplotype_modifications_July27/27July_optivax_robust_mhc1_beam_peptides.txt'
+    optivax_mhc2 = '/home/sara/Documents/VaccinesProject/ivp/Gifford_Code/mhc2_optivax_robust_haplotype_modifications_July27/27July_optivax_robust_mhc2_beam_peptides.txt'
+
+    hogvax_mhc1 = '/home/sara/Documents/VaccinesProject/ivp/Pipeline/28July_mhcI_haplotypes/ilp/pep_out/4519_chosen_peptides_hog_inc_substrings.txt'
+    hogvax_mhc2 = '/home/sara/Documents/VaccinesProject/ivp/Pipeline/28July_mhcII_haplotypes/ilp/pep_out/37435_chosen_peptides_hog_inc_substrings.txt'
+
+    # hogvax_mhc1_gen = '/home/sara/Documents/VaccinesProject/ivp/Pipeline/29Aug_mhcI_100k_genotypes_average/ilp/pep_out/4519_chosen_peptides_hog_inc_substrings.txt'
+    # hogvax_mhc2_gen = '/home/sara/Documents/VaccinesProject/ivp/Pipeline/29Aug_mhcII_100k_genotypes_average/ilp/pep_out/37435_chosen_peptides_hog_inc_substrings.txt'
+    #
+    # optivax_mhc1_gen = '/home/sara/Documents/VaccinesProject/ivp/Gifford_Code/mhc1_optivax_robust_genotypes_July31/31July_optivax_robust_mhc1_genotype_peptides.txt'
+    # optivax_mhc2_gen = '/home/sara/Documents/VaccinesProject/ivp/Gifford_Code/mhc2_optivax_robust_genotypes_July31/31July_optivax_robust_mhc2_genotype_peptides.txt'
+
+    # optivax_mhc1_spike = '/home/sara/Documents/VaccinesProject/ivp/Gifford_Code/mhc1_optivax_unlinked_spike_protein/5Sep_optivax_unlinked_spike_peptides.txt'
+    # optivax_mhc2_spike = '/home/sara/Documents/VaccinesProject/ivp/Gifford_Code/mhc2_optivax_unlinked_spike_protein/5Sep_optivax_unlinked_spike_peptides.txt'
+    #
+    # hogvax_mhc1_spike = '/home/sara/Documents/VaccinesProject/ivp/Code/HOGVAX/spike_new_mhc1/pep_out/639_chosen_peptides_hogvax_inc_substrings.txt'
+    # hogvax_mhc2_spike = '/home/sara/Documents/VaccinesProject/ivp/Code/HOGVAX/spike_new_mhc2/pep_out/5064_chosen_peptides_hogvax_inc_substrings.txt'
+
+    # concat_mhcI = '/home/sara/Documents/VaccinesProject/ivp/Pipeline/28July_mhcI/new_concat(not using mod ba predictions)/naively_chosen_peptides.txt'
+    # concat_mhcII = '/home/sara/Documents/VaccinesProject/ivp/Pipeline/28July_mhcII/new_concat(no mod ba data)/naively_chosen_peptides.txt'
+
+    # evaluation for combined mhcI and II vaccine
+    # comb_peptide_file = '/home/sara/Documents/VaccinesProject/ivp/Pipeline/25Sept_mhcIandII/ilp/pep_out/41947_chosen_peptides_hog_inc_substrings.txt'
+
+    all_epitope_features = pd.read_pickle('../../Gifford_Data/AllEpitopeFeatures.pkl')
+
+    mhc1_all_ba = pd.read_pickle(gzip.open('../../Gifford_Data/25June_mhc1_netmhc-4.1_pred_affinity_pivot.pkl.gz'))
+    mhc1_af = pd.read_pickle('../../Gifford_Data/IEDB_population_frequency2392_normalized.pkl')
+
+    mhc2_all_ba = pd.read_pickle(gzip.open('../../Gifford_Data/25June_mhc2_netmhcii-4.1_pred_affinity_pivot_v1v2.pkl.gz'))
+    mhc2_af = pd.read_pickle('../../Gifford_Data/IEDB_population_frequency_mhc2_275normalized.pkl')
+
+    mhc1_haplotype_ba = pd.read_pickle(gzip.open('../../Gifford_Data/all_mhc1_predictions_for_haplotypes.pkl.gz')).T
+    mhc1_haplotype_freq = pd.read_pickle('../../Gifford_Data/haplotype_frequency_marry.pkl')
+
+    mhc2_haplotype_ba = pd.read_pickle(gzip.open('../../Gifford_Data/all_mhc2_predictions_for_haplotypes.pkl.gz')).T
+    mhc2_haplotype_freq = pd.read_pickle('../../Gifford_Data/haplotype_frequency_marry2.pkl')
+
+    outdir = 'plots_haplotypes/'
+    if not os.path.exists(outdir):
+        os.mkdir(outdir)
+
+    # collect data and create bar plot
+    print('Create simple bar plot')
+    collect_data_for_bar_plot('OptiVax-Unlinked', optivax_mhc1, optivax_mhc2, 'HOGVAX', hogvax_mhc1, hogvax_mhc2,
+                              mhc1_all_ba, mhc1_af, mhc2_all_ba, mhc2_af, outdir)
+    # collect_data_for_bar_plot('Concat', concat_mhcI, concat_mhcII, 'HOGVAX', hogvax_mhc1, hogvax_mhc2,
+    #                           mhc1_all_ba, mhc1_af, mhc2_all_ba, mhc2_af, outdir)
+    # collect_data_for_bar_plot('Combined', comb_peptide_file, comb_peptide_file, 'HOGVAX', hogvax_mhc1, hogvax_mhc2,
+    #                           mhc1_all_ba, mhc1_af, mhc2_all_ba, mhc2_af, outdir)
+
+    print('Call eval for MHC1 HOGVAX')
+    hits_alleles_hogvax_mhc1 = call_single_evaluations(hogvax_mhc1, 'mhc1', all_epitope_features,
+                                                       mhc1_all_ba, mhc1_haplotype_ba, mhc1_af, mhc1_haplotype_freq,
+                                                       ['HLA-A', 'HLA-B', 'HLA-C'], 'World', 'HOGVAX', outdir)
+    print('Call eval for MHC1 OptiVax')
+    hits_alleles_optivax_mhc1 = call_single_evaluations(optivax_mhc1, 'mhc1', all_epitope_features,
+                                                        mhc1_all_ba, mhc1_haplotype_ba, mhc1_af, mhc1_haplotype_freq,
+                                                        ['HLA-A', 'HLA-B', 'HLA-C'], 'World', 'OptiVax', outdir)
+    print('Call eval for MHC2 HOGVAX')
+    hits_alleles_hogvax_mhc2 = call_single_evaluations(hogvax_mhc2, 'mhc2', all_epitope_features,
+                                                       mhc2_all_ba, mhc2_haplotype_ba, mhc2_af, mhc2_haplotype_freq,
+                                                       ['DRB1', 'HLA-DP', 'HLA-DQ'], 'World', 'HOGVAX', outdir)
+    print('Call eval for MHC2 OptiVax')
+    hits_alleles_optivax_mhc2 = call_single_evaluations(optivax_mhc2, 'mhc2', all_epitope_features,
+                                                        mhc2_all_ba, mhc2_haplotype_ba, mhc2_af, mhc2_haplotype_freq,
+                                                        ['DRB1', 'HLA-DP', 'HLA-DQ'], 'World', 'OptiVax', outdir)
+
+    # print('Plot Allele hit frequencies hogvax + optivax mhc2')
+    # df = new_df([hits_alleles_hogvax_mhc2, hits_alleles_optivax_mhc2])
+    # plot.EvaluationPlots(df, outdir).stacked_hit_covered_alleles('mhc2', 'hogvax_optivax_unlinked')
+
+    # call_single_evaluations(concat_mhcI, 'mhc1', all_epitope_features, mhc1_all_ba, mhc1_haplotype_ba, mhc1_af, mhc1_haplotype_freq,
+    #                         ['HLA-A', 'HLA-B', 'HLA-C'], 'World', 'Concat')
+    # call_single_evaluations(concat_mhcII, 'mhc2', all_epitope_features, mhc2_all_ba, mhc2_haplotype_ba, mhc2_af, mhc2_haplotype_freq,
+    #                         ['DRB1', 'HLA-DP', 'HLA-DQ'], 'World', 'Concat')
+    #
+    # call_single_evaluations(comb_peptide_file, 'mhc1', all_epitope_features, mhc1_all_ba, mhc1_haplotype_ba, mhc1_af,
+    #                         mhc1_haplotype_freq, ['HLA-A', 'HLA-B', 'HLA-C'], 'World', 'Combined')
+    # call_single_evaluations(comb_peptide_file, 'mhc2', all_epitope_features, mhc2_all_ba, mhc2_haplotype_ba, mhc2_af,
+    #                         mhc2_haplotype_freq, ['DRB1', 'HLA-DP', 'HLA-DQ'], 'World', 'Combined')
+    # countries = ['Europe',
+    #              'Northeast Asia',
+    #              'Oceania', 'East Africa',
+    #              'East Asia', 'South Africa',
+    #              'Southeast Asia',
+    #              'Central Africa', 'North America',
+    #              'South America', 'West Africa',
+    #              'West Indies', 'South Asia',
+    #              'North Africa', 'Southwest Asia']
+    #
+    # threads = 1
+    # collect_data_for_geographic_eval(threads, ilp_mhc1, countries, mhc1_af, mhc1_all_ba, ['HLA-A', 'HLA-B', 'HLA-C'], 'mhc1')
+
+if __name__ == '__main__':
+    main()
diff --git a/Evaluation/create_haplotype_ba_predictions.py b/Evaluation/create_haplotype_ba_predictions.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2362ba91211ba76aa7fa7e2f2c87f305356d118
--- /dev/null
+++ b/Evaluation/create_haplotype_ba_predictions.py
@@ -0,0 +1,109 @@
+import argparse
+import sys
+import pandas as pd
+from ProcessData import read_peptides
+from multiprocessing import Pool
+from datetime import date
+
+
+def ba_for_haplotype(tup):
+    k, keys, i = tup
+    max_col = allele_ba_predictions[keys].max(axis=1)
+    df = pd.DataFrame(max_col, columns=[k])
+    return df
+
+
+def get_parser():
+    """Get parser object for calculating haplotype / genotype ba predictions."""
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--population', '-pop', dest='population', default='World', type=str,
+                        help='Target population. Default "World"')
+    parser.add_argument('--peptides', '-pep', dest='peptides', required=True, type=str,
+                        help='Preprocessed peptide file with every peptide in a new line.')
+    parser.add_argument('--frequencies', '-f', dest='f_data', required=True, type=str,
+                        help='(Normalized) haplotype / genotype frequency file.')
+    parser.add_argument('--allele_binding_affinities', '-ba', dest='predictions', required=True, type=str,
+                        help='Binding affinity file for peptides and single alleles.')
+    parser.add_argument('--type', '-t', dest='loci', required=True, type=str,
+                        help='Type of data that is processed, e.g. mhc1 for haplotype mhc1; mhc2 for haplotype mhc2;'
+                             'mhc1_genotype for mhc1 genotype; mhc2_genotype for mhc2 genotype')
+    parser.add_argument('--nworkers', '-n', dest='nworkers', default=1, type=int,
+                        help='Number of cores to use in parallel.')
+
+    return parser
+
+
+def main():
+    args = get_parser().parse_args()
+
+    filtered_peptides = read_peptides.main(args.peptides)
+    print('Read number of peptides', len(filtered_peptides))
+
+    global allele_ba_predictions
+    allele_ba_predictions = pd.read_pickle(args.predictions).loc[filtered_peptides]
+    print('Loaded ba predictions', allele_ba_predictions.shape)
+
+    haplotypes = pd.read_pickle(args.f_data).keys()
+
+    if args.loci == 'mhc1':
+        loci = ['HLA-A', 'HLA-B', 'HLA-C']
+    elif args.loci == 'mhc2':
+        loci = ['DRB1', 'HLA-DP', 'HLA-DQ']
+    elif args.loci == 'mhc1_genotype':
+        loci = ['HLA-A', 'HLA-A', 'HLA-B', 'HLA-B', 'HLA-C', 'HLA-C']
+    elif args.loci == 'mhc2_genotype':
+        loci = ['DRB1', 'DRB1', 'HLA-DP', 'HLA-DP', 'HLA-DQ', 'HLA-DQ']
+    else:
+        print('Type unknown! Choose from [mhc1, mhc2, mhc1_genotype, mhc2_genotype].')
+        exit(-1)
+
+    print('Start collecting keys')
+    hap_list = []
+    for i, key in enumerate(haplotypes):
+        if i % 10000 == 0:
+            print(i, 'of', len(haplotypes))
+        key = ['unknown' if x=='nope' else x for x in key]
+        keys = list(zip(loci, key))
+        if all(k in allele_ba_predictions for k in keys):
+            hap_list.append((tuple(key), keys, i))
+    print('Processed all keys in genotypes / haplotypes')
+
+    print('Create threads')
+    pool = Pool(processes=args.nworkers)
+    outputs = pool.map_async(ba_for_haplotype, hap_list)
+
+    pool.close()
+    pool.join()
+
+    print('Finished all tasks')
+    new_df = pd.concat([out for out in outputs.get()], axis=1)
+    print('Finished finding maximum for each genotype / haplotype')
+
+    today = date.today().strftime('%d-%b')
+    new_df.to_pickle('../../Gifford_Data/' + today + '_' + args.population + '_predictions_for_' + args.loci +
+                     '_filtered_peptides.pkl.gz', compression='gzip')
+
+    return new_df
+
+
+if __name__ == '__main__':
+    main()
+
+    countries = ['Europe',
+                 'Northeast Asia',
+                 'Oceania', 'East Africa',
+                 'East Asia', 'South Africa',
+                 'Southeast Asia',
+                 'Central Africa', 'North America',
+                 'South America', 'West Africa',
+                 'West Indies', 'South Asia',
+                 'North Africa', 'Southwest Asia']
+
+    # for country in countries:
+    #     print(country)
+    #     haplotypes = pd.read_pickle('../EvaluateVaccines/data/haplotype_freq_no_ld_' + country + '.pkl')
+    #     main(threads, mhc1_ba_predictions, haplotypes, ['HLA-A', 'HLA-B', 'HLA-C'], country, 'mhc1')
+
+    # haplotypes = pd.read_pickle('../EvaluateVaccines/data/haplotype_freq_no_ld_' + countries[8] + '.pkl')
+    # main(threads, mhc1_ba_predictions, haplotypes, ['HLA-A', 'HLA-B', 'HLA-C'], countries[8], 'mhc1')
diff --git a/Evaluation/evaluate_cleavage_predictions.py b/Evaluation/evaluate_cleavage_predictions.py
new file mode 100644
index 0000000000000000000000000000000000000000..af1cf3414b48a3d12b0041783aefd39ac0da8124
--- /dev/null
+++ b/Evaluation/evaluate_cleavage_predictions.py
@@ -0,0 +1,63 @@
+import sys
+import pandas as pd
+from ProcessData import read_peptides
+from EvaluateVaccines import calculate_cleaved_peptides
+
+
+def count_pred_peptides(chosen_peptides, peptides_from_prediction):
+    counter = 0
+    peptides = []
+    for pep in chosen_peptides:
+        if pep in peptides_from_prediction:
+            counter += 1
+            peptides.append(pep)
+
+    print(len(chosen_peptides), counter, sorted(peptides))
+    return counter, peptides
+
+
+# read peptides chosen by ILP including all substrings
+peptide_file = sys.argv[1]
+chosen_peptides = read_peptides.main(peptide_file)
+
+# Note that netchop requires manually deleting header and footer lines + last column modification
+pepsickle_file = sys.argv[2]
+netchop_file = sys.argv[3]
+
+# csv file to write output
+out_file = sys.argv[4]
+
+concat_eval = False
+if len(sys.argv) > 5:
+    concat_eval = True
+
+with open(pepsickle_file, 'r') as file:
+    df_pep = pd.read_csv(file, sep='\t', lineterminator='\n')
+    if concat_eval:
+        df_pep = df_pep[df_pep['protein_id'] != 'HOGVAX']
+    else:
+        df_pep = df_pep[df_pep['protein_id'] != 'Concat']
+
+peptides_from_pepsickle = calculate_cleaved_peptides.get_cleaved_peptides(df_pep)
+count_pepsickle, pred_peptides_pepsickle = count_pred_peptides(chosen_peptides, peptides_from_pepsickle)
+
+
+with open(netchop_file, 'r') as file:
+    df_net = pd.read_csv(file, sep='\s+', lineterminator='\n')
+
+df_net = df_net.rename(columns={'pos': 'position', 'AA': 'residue', 'C': 'cleaved'})
+df_net['cleaved'] = df_net['cleaved'].replace(['S'], True).replace(['.'], False)
+
+peptides_from_netchop = calculate_cleaved_peptides.get_cleaved_peptides(df_net)
+count_netchop, pred_peptides_netchop = count_pred_peptides(chosen_peptides, peptides_from_netchop)
+
+d = {'Chosen Peptides': chosen_peptides,
+     'Pepsickle Predicted Peptides': pred_peptides_pepsickle,
+     'NetChop Predicted Peptides': pred_peptides_netchop}
+print(d)
+df = pd.DataFrame.from_dict(d, orient='index')
+df = df.T
+df.to_csv(out_file)
+
+
+
diff --git a/Evaluation/evaluate_vaccine.py b/Evaluation/evaluate_vaccine.py
new file mode 100644
index 0000000000000000000000000000000000000000..2f7711b63e6add9a3d9cf6d917b0d89027c95c34
--- /dev/null
+++ b/Evaluation/evaluate_vaccine.py
@@ -0,0 +1,151 @@
+import gzip
+import re
+import numpy as np
+import pandas as pd
+import read_peptides, binarize_entries
+
+
+class EvaluationMetrices(object):
+    def __init__(self, peptides, binding_affinities, allele_freq):
+        self.peptides = peptides
+        self.binding_affinities = binding_affinities
+        self.allele_freq = allele_freq
+
+    def pred_at_least_one(self, population):
+        """
+        Calculate the probability that any person of the given population presents at least one peptide
+        (based on Gifford EvalVax-Unlinked)
+        """
+        self.population = population
+        loci_prob = 1
+        for locus in self.binding_affinities.columns.levels[0]:
+            loci_prob *= (1 - self.locus_prob(locus))
+        loci_prob = 1 - loci_prob
+        # print('Total prob', loci_prob)
+        return loci_prob
+
+    def locus_prob(self, locus):
+        """ based on Gifford """
+        # Equation 5
+        alleles_ba = self.binding_affinities[locus]
+        alleles_prob = 1 - np.prod(1-alleles_ba, axis=0)
+        # Equation 6
+        diploid_ba = 1 - np.outer((1-alleles_prob), (1-alleles_prob))
+        np.fill_diagonal(diploid_ba, alleles_prob)
+        diploid_ba = pd.DataFrame(diploid_ba, index=alleles_prob.index, columns=alleles_prob.index)
+        # Equation 7
+        diploid_af = np.outer(self.allele_freq[locus].loc[self.population], self.allele_freq[locus].loc[self.population])
+        diploid_af = pd.DataFrame(diploid_af, index=self.allele_freq[locus].columns, columns=self.allele_freq[locus].columns)
+        # sum up the multiplied values for each allele pair (done by np.sum automatically)
+        prob = np.sum(np.sum(diploid_af * diploid_ba))
+        # print(prob)
+        return prob
+
+    def get_all_genotypes(self, genotypes):
+        # generate each genotype only once
+        indices = genotypes.index
+        for i in range(len(indices)):
+            hap1 = genotypes.index[i]
+            for j in range(0, len(indices)):#range(i, len(indices)):
+                hap2 = genotypes.index[j]
+                yield hap1, hap2
+
+    def pred_genotype_hits(self, population, single_allele_ba, max_interest, mhc_type):
+        """
+        Calculate the probability that an individual of a specific population has exactly n peptide hla hits
+        (based on Gifford EvalVax-Robust)
+        """
+        # Equation 1
+        genotype_af = np.outer(self.allele_freq.loc[population], self.allele_freq.loc[population])
+        genotype_af = pd.DataFrame(genotype_af, index=self.allele_freq.columns, columns=self.allele_freq.columns)
+        # compute number of peptides binding to each allele e(a)
+        hits_per_allele = np.sum(single_allele_ba)
+        # Equation 2
+        hits_per_genotype = {}
+        for hap1, hap2 in self.get_all_genotypes(genotype_af):
+            unique_alleles = set(hap1).union(set(hap2))
+            count, no_predictions = self.get_genotype_counts(unique_alleles, hits_per_allele, mhc_type)
+            if count in hits_per_genotype:
+                hits_per_genotype[count].append((hap1, hap2))
+            else:
+                hits_per_genotype[count] = [(hap1, hap2)]
+        # Equation3 count exact hits
+        m = int(max(hits_per_genotype))
+        probs = {'count': [], 'prob': []}
+        for i in range(m + 1):
+            if i in hits_per_genotype:
+                frq = np.sum(genotype_af[h1][h2] for h1, h2 in hits_per_genotype[i])
+                probs['count'].append(i)
+                probs['prob'].append(frq)
+            else:
+                probs['count'].append(i)
+                probs['prob'].append(0)
+        # calculate expected value
+        expected_value = np.sum(np.array(probs['count']) * np.array(probs['prob']))
+        # Equation 4 for k in [1, max_interest] filter data frame for all counts >= k
+        cur_df = pd.DataFrame.from_dict(probs)
+        prob_n = {'n': [], 'prob': []}
+        for n in range(1, max_interest + 1):
+            prob_n['n'].append(n)
+            # have to use converse probability
+            prob_n['prob'].append(1-sum(cur_df[cur_df['count'] < n]['prob']))
+        return probs, prob_n, expected_value, no_predictions
+
+    def get_genotype_counts(self, alleles, hits_per_allele, mhc_type):
+        count = 0
+        no_ba_prediction = []
+        for allele in alleles:
+            if mhc_type == 'mhc1':
+                reg = re.search(r'(HLA-[A-Z]+)(\d)', allele)
+                locus = reg.group(1)
+            else:
+                reg = re.search(r'HLA-DP|HLA-DQ|DRB1', allele)
+                locus = reg.group(0)
+            if (locus, allele) in hits_per_allele:
+                count += hits_per_allele[locus, allele]
+            else:
+                no_ba_prediction.append((locus, allele))
+        return int(count), no_ba_prediction
+
+    def number_covered_alleles(self, population, locus):
+        """
+        Calculate the number of covered alleles of the population and with this metric the population coverage;
+        Track information about alleles for which we don't have prediction data from netmhc.
+        """
+        # binding affinities only contain peptides from vaccine
+        alleles_ba = self.binding_affinities[locus]
+        alleles_fa = self.allele_freq[locus]
+
+        # count hits per allele
+        tmp = self.binding_affinities[locus].sum().to_frame().reset_index()
+        tmp = tmp.rename(columns={'genotype':'count', 0:'hits'})
+        allele_hit_counts = tmp.groupby('hits').count().reset_index()
+        fractions = allele_hit_counts['count'] / sum(allele_hit_counts['count'])
+        allele_hit_counts = allele_hit_counts.assign(frac=fractions)
+
+        # check for all alleles with (entry >= 0.638) / non-zero entry -> sum up corresponding frequencies
+        hit_allele_freq = {}
+        filter = (alleles_ba != 0).any()
+        covered_alleles = filter.index[filter]
+        covered_fraction = 0
+        for allele in covered_alleles:
+            if allele in alleles_fa:
+                covered_fraction += alleles_fa[allele][population]
+                hit = tmp[tmp['count'] == allele]['hits'].values[0]
+                if hit in hit_allele_freq:
+                    hit_allele_freq[hit] += alleles_fa[allele][population]
+                else:
+                    hit_allele_freq[hit] = alleles_fa[allele][population]
+
+        no_predictions = []
+        fraction_no_predictions = 0
+        for allele in alleles_fa:
+            if allele == 'unknown':
+                no_predictions.append(allele)
+                fraction_no_predictions += alleles_fa[allele][population]
+            if allele not in alleles_ba:
+                no_predictions.append(allele)
+                fraction_no_predictions += alleles_fa[allele][population]
+
+        total = sum(alleles_fa.loc[population])
+        return covered_fraction, no_predictions, fraction_no_predictions, total, allele_hit_counts, hit_allele_freq
diff --git a/Evaluation/get_gifford_vax_length.py b/Evaluation/get_gifford_vax_length.py
new file mode 100644
index 0000000000000000000000000000000000000000..af12e5a0e621936252e65b272997290c5ca74359
--- /dev/null
+++ b/Evaluation/get_gifford_vax_length.py
@@ -0,0 +1,36 @@
+import glob
+import re
+
+def get_vax_len(in_file, out_file):
+    out = open(out_file, 'w')
+    peptide_list = []
+    with open(in_file) as file:
+        for line in file:
+            peptides_, score = line.split('	')
+            peptide_list = peptides_.split('_')
+            for p in peptide_list:
+                out.write(p + '\n')
+            break
+
+    concat_seq = ''.join(peptide_list)
+    seq_len = len(concat_seq)
+    out.write('# Total length ' + str(seq_len))
+
+    print(concat_seq)
+    print(seq_len)
+    return seq_len
+
+
+dict_k = {}
+for path in glob.glob('/Users/sara/Documents/VaccinesProject/ivp/Gifford_Code/result_mhc2_maxround*'):
+    print(path)
+    size = re.search(r'maxround(\d+)', path).group(1)
+    file = path + '/best_result.txt'
+    out_file = path + '/13Sept_optivax_unlinked_peptides.txt'
+    dict_k[size] = get_vax_len(file, out_file)
+
+print(dict_k)
+
+with open('../../Pipeline/snakemake_mhc2_call_file.sh', 'w') as f:
+    for key in dict_k:
+        f.write('snakemake --use-conda --cores 32 --config mhc=\'14Sept_mhcII_' + key + '\' k=' + str(dict_k[key]) + '\n')
diff --git a/Evaluation/plot_cleavage_predictions.py b/Evaluation/plot_cleavage_predictions.py
new file mode 100644
index 0000000000000000000000000000000000000000..f04c91772660467c6203cd89a574f16735f469b2
--- /dev/null
+++ b/Evaluation/plot_cleavage_predictions.py
@@ -0,0 +1,54 @@
+import glob
+import re
+import pandas as pd
+import altair as alt
+
+out_name = '24Sept_mhcI_cleavage_prediction_for_embedding_methods'
+
+data = {'embedding': [], 'tool': [], 'fraction': []}
+for file in glob.glob('/home/sara/Documents/VaccinesProject/ivp/Proteasomal_Cleavage_Predictions/24SeptEmbeddingPredictions/MHCI/CleavagePredictionsForEvaluation/*.csv'):
+    match = re.search(r'_17Aug_(mhcI+(_\d+embedded)*_+hogvax)(_with_mutations)*(_concat)*', file)
+    name = match.group(1).replace('_hogvax', '')
+    if match.group(4):
+        name += '_concat'
+
+    tmp = pd.read_csv(file)
+    tmp = tmp.drop(columns=tmp.columns[0])
+    chosen_pep = len(tmp['Chosen Peptides'].dropna())
+    pepsickle_pep = len(tmp['Pepsickle Predicted Peptides'].dropna())
+    netchop_pep = len(tmp['NetChop Predicted Peptides'].dropna())
+
+    check_a = [p for p in tmp['Pepsickle Predicted Peptides'] if p in list(tmp['Chosen Peptides'])]
+    if len(check_a) != pepsickle_pep:
+        print('Attention!')
+        exit(-1)
+
+    check_b = [p for p in tmp['NetChop Predicted Peptides'] if p in list(tmp['Chosen Peptides'])]
+    if len(check_b) != netchop_pep:
+        print('Attention!')
+        exit(-1)
+
+    data['embedding'].extend([name, name])
+    data['tool'].extend(['NetChop3.1', 'Pepsickle'])
+    data['fraction'].extend([netchop_pep/chosen_pep, pepsickle_pep/chosen_pep])
+
+data = pd.DataFrame.from_dict(data)
+print(data)
+
+# sorting = ['mhcII', 'mhcII_concat', 'mhcII_1embedded', 'mhcII_2embedded', 'mhcII_3embedded', 'mhcII_4embedded', 'mhcII_5embedded', 'mhcII_6embedded', 'mhcII_7embedded', 'mhcII_8embedded', 'mhcII_9embedded', 'mhcII_10embedded']
+sorting = ['mhcI', 'mhcI_concat', 'mhcI_1embedded', 'mhcI_2embedded', 'mhcI_3embedded', 'mhcI_4embedded', 'mhcI_5embedded', 'mhcI_6embedded', 'mhcI_7embedded', 'mhcI_8embedded', 'mhcI_9embedded', 'mhcI_10embedded']
+colors = ['#DA0034', '#3A7A9B']
+bar = alt.Chart(data).mark_bar(size=25).encode(
+            x=alt.X('tool:O', axis=alt.Axis(title=None, labels=False, ticks=False)),
+            y=alt.Y('fraction:Q', title='Cleaved peptides'),
+            color=alt.Color('tool:O', scale=alt.Scale(range=colors), legend=alt.Legend(title='Tool')),
+            column=alt.Column('embedding:O', title='Cleaved peptides for different context embedding lengths', sort=sorting)
+        ).configure_axis(
+            # grid=False
+        ).configure_view(
+            strokeOpacity=0
+        ).properties(
+            width=90
+        )
+
+bar.save('cleavage_plots/' + out_name + '.html')
\ No newline at end of file
diff --git a/Evaluation/plot_evaluation_results.py b/Evaluation/plot_evaluation_results.py
new file mode 100644
index 0000000000000000000000000000000000000000..1c904583a7b0d12ac902e53e453d46cf61070f59
--- /dev/null
+++ b/Evaluation/plot_evaluation_results.py
@@ -0,0 +1,199 @@
+import altair_saver as saver
+import altair as alt
+import pandas as pd
+
+
+class EvaluationPlots(object):
+    def __init__(self, data, outdir):
+        self.colors = ['#9AD6E5', '#66A6D9', '#BBB465', '#FFC12C', '#DA0034', '#3A7A9B', '#FFAD00']
+        self.data = data
+        self.outdir = outdir
+
+    def ilp_opitvax_bar_plot(self):
+        min_ = round(min(self.data['Population Coverage']), 2) - 0.01
+        bar = alt.Chart(self.data).mark_bar().encode(
+            x='Method',
+            y=alt.Y('Population Coverage:Q', scale=alt.Scale(domain=[min_,1.0]), axis=alt.Axis(titleColor='#3a7a9b')),
+            color=alt.Color('Method',
+                            scale=alt.Scale(
+                                domain=self.data['Method'].tolist(),
+                                range=['#9ad6e5', '#3a7a9b', '#9ad6e5', '#3a7a9b']),
+                            legend=None)
+        )
+
+        crosses = alt.Chart(self.data).mark_circle(size=60, color='#FFAD00').encode(
+            x='Method',
+            y=alt.Y('Number of Peptides:Q', axis=alt.Axis(titleColor='#FFAD00'))
+        )
+
+        layered = alt.layer(bar, crosses).resolve_scale(
+            y='independent'
+        ).configure_axis(
+            labelFontSize=20,
+            titleFontSize=20
+        )
+
+        layered.save(self.outdir + 'unlinked_pop_coverage_bar.html')
+
+    def covered_locus_pie(self, loci, mhc_type, method):
+        charts = []
+        for locus in loci:
+            df = pd.DataFrame.from_dict(self.data[locus])
+            pie = alt.Chart(df).encode(
+                theta=alt.Theta('values:Q', stack=True),
+                color=alt.Color('allele_category:N', scale=alt.Scale(range=self.colors))
+            ).properties(
+                title='Coverage of alleles at ' + locus
+            )
+
+            c1 = pie.mark_arc(stroke='#fff')
+            c2 = pie.mark_text(outerRadius=168).encode(
+                text='values:Q'
+            )
+
+            chart = (c1 + c2)
+            charts.append(chart)
+
+        layered = alt.hconcat(*[chart for chart in charts])
+        layered.configure_view(
+            strokeWidth=0
+        ).configure_axis(
+            labelFontSize=20,
+            titleFontSize=20
+        ).save(self.outdir + 'pies_covered_alleles_' + mhc_type + '_' + method + '.html')
+
+    def hit_covered_locus_pie(self, loci, mhc_type, method):
+        charts = []
+        for locus in loci:
+            df = pd.DataFrame.from_dict(self.data[locus])
+            order = sorted(df.allele_category[:-2], key=float) + ['No-BA-prediction', 'Uncovered']
+            pie = alt.Chart(df).transform_calculate(
+                order=f"-indexof({order}, datum.Origin)"
+            ).encode(
+                theta=alt.Theta('values:Q', stack=True),
+                color=alt.Color('allele_category:N', scale=alt.Scale(scheme='category20b')),
+                order='order:Q'
+            ).properties(
+                title='Coverage of alleles at ' + locus
+            )
+
+            chart = pie.mark_arc(stroke='#fff')
+            charts.append(chart)
+
+        layered = alt.hconcat(*[chart for chart in charts]).resolve_scale(color='independent')
+        layered.configure_view(
+            strokeWidth=0
+        ).configure_axis(
+            labelFontSize=20,
+            titleFontSize=20
+        ).save(self.outdir + 'pies_hit_covered_alleles_' + mhc_type + '_' + method + '.html')
+
+    def stacked_hit_covered_alleles(self, mhc_key, method_key):
+        order = ['No-BA-prediction', 'Uncovered'] + sorted([x for x in self.data['allele_category'] if x!='No-BA-prediction' and x!='Uncovered'], key=float, reverse=True)
+        chart = alt.Chart(self.data).mark_bar().encode(
+            x='method',
+            y='sum(values)',
+            color=alt.Color('allele_category',
+                            scale=alt.Scale(scheme='viridis'),
+                            sort=order,
+                            legend=alt.Legend(columns=3, symbolLimit=0)),
+            order='order:Q'
+        )
+        chart.show()
+        chart.save(self.outdir + 'stacked_bar_hit_alleles_' + mhc_key + '_' + method_key + '.html')
+
+    def hit_pies(self, loci, mhc_type, method):
+        if mhc_type == 'mhc1':
+            bins = int(max([self.data[loc]['hits'].max() for loc in loci])) + 1
+        else:
+            bins = 10
+        for locus in loci:
+            df = self.data[locus]
+            pie = alt.Chart(df).encode(
+                theta=alt.Theta('count', stack=True),
+                color=alt.Color('hits', scale=alt.Scale(range=self.colors), bin=alt.Bin(maxbins=bins))
+            ).properties(
+                title='Coverage of alleles at ' + locus
+            )
+
+            # c1 = pie.mark_arc(stroke='#fff')
+            c2 = pie.mark_text(outerRadius=168).encode(
+                text='hits'
+            )
+
+            # chart = (c1 + c2)
+            chart = pie.mark_arc() + c2
+            chart.configure_view(
+                strokeWidth=0
+            ).configure_axis(
+                labelFontSize=20,
+                titleFontSize=20
+            ).save(self.outdir + 'allele_hit_pie_' + locus + '_' + mhc_type + '_' + method + '.html')
+
+    def hit_histogram(self, mhc, method):
+        charts = []
+        for population, data in self.data.items():
+            df = pd.DataFrame.from_dict(data[0])
+            hist = alt.Chart(df).mark_bar(color='#9AD6E5').encode(
+                x=alt.X('count:Q',
+                        title='Number of peptide-HLA hits',
+                        bin=alt.Bin(maxbins=30)),
+                y=alt.Y('prob:Q', title='Frequency')
+            ).properties(
+                title=population
+            )
+
+            exp = pd.DataFrame(data[1])
+            line = alt.Chart(exp).mark_rule(size=4, color='#FFAD00').encode(
+                x='expected:Q'
+            )
+
+            text = line.mark_text(
+                align='left',
+                baseline='middle',
+                dx=7,
+                color='#FFAD00'
+            ).encode(
+                text=alt.Text('expected', format=',.3f')
+            )
+
+            chart = (hist + line + text)
+            charts.append(chart)
+
+        layered = alt.hconcat(*[chart for chart in charts])
+        layered.configure_view(
+            strokeWidth=0
+        ).configure_axis(
+            labelFontSize=20,
+            titleFontSize=20
+        ).save(self.outdir + 'hit_histogram_' + mhc + '_' + method + '.html')
+
+    def population_hit_bar_plot(self, mhc, method):
+        sorting = sorted(list(set([p for p in self.data['population'] if not p == 'Average']))) + ['Average']
+        bar = alt.Chart(self.data).mark_bar().encode(
+            x=alt.X('population', axis=alt.Axis(title=None, labels=False, ticks=False), sort=sorting),
+            y=alt.Y('prob:Q', title='Population coverage'),
+            color=alt.Color('population', scale=alt.Scale(range=self.colors)),
+            column=alt.Column('n', title='Minimum number of peptide-Hla hits')
+        ).configure_axis(
+            grid=False,
+            labelFontSize=20,
+            titleFontSize=20
+        ).configure_view(
+            strokeOpacity=0
+        )
+
+        bar.save(self.outdir + 'min_hit_number_bar_' + mhc + '_' + method + '.html')
+
+    def composition_plot(self, mhc, method):
+        bar = alt.Chart(self.data).mark_bar().encode(
+            x=alt.X('Protein:N', axis=alt.Axis(labelAngle=0)),
+            y='Count',
+            color=alt.Color('Protein', scale=alt.Scale(range=self.colors))
+        ).properties(
+            width=230
+        ).configure_axis(
+            labelFontSize=20,
+            titleFontSize=20
+        )
+        bar.save(self.outdir + 'composition_bar_' + mhc + '_' + method + '.html')
\ No newline at end of file
diff --git a/Evaluation/read_peptides.py b/Evaluation/read_peptides.py
new file mode 100644
index 0000000000000000000000000000000000000000..58eb33be566a7381d2b55e6c4f9a058965552dc8
--- /dev/null
+++ b/Evaluation/read_peptides.py
@@ -0,0 +1,15 @@
+import sys
+
+
+def main(file):
+    peptides = []
+    for line in open(file, 'r'):
+        if line.startswith('#'):
+            continue
+        pep = line.strip()
+        peptides.append(pep)
+    return peptides
+
+if __name__ == '__main__':
+    f = sys.argv[1]
+    main(f)
\ No newline at end of file
diff --git a/HOGVAX/example_arguments.txt b/HOGVAX/example_arguments.txt
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..b6560f04cd7beeaad863c454c805ceadc2812c33 100644
--- a/HOGVAX/example_arguments.txt
+++ b/HOGVAX/example_arguments.txt
@@ -0,0 +1,3 @@
+python hogvax.py --k 170 --outdir mhcI_unlinked --populations World --peptides ../OptiVax_Data/Peptides/optivax_unlinked_mhc1_4512_filtered_peptides.pep --allele-frequencies ../OptiVax_Data/Frequencies/IEDB_population_frequency2392_normalized.pkl --ba-threshold 0.638 --binding-affinities ../OptiVax_Data/BindingAffinities/25June_mhc1_netmhc-4.1_pred_affinity_pivot.pkl.gz --min-hits 1 --verbose 1
+
+python hogvax.py --k 322 --outdir mhcII_unlinked --population World --peptides ../../OptiVax_Data/Peptides/optivax_unlinked_mhc2_37435_filtered_peptides.pep -af ../../OptiVax_Data/Frequencies/IEDB_population_frequency_mhc2_275normalized.pkl -t 0.638 -ba ../../OptiVax_Data/BindingAffinities/25June_mhc2_netmhcii-4.1_pred_affinity_pivot_v1v2.pkl.gz --min-hits 1 --verbose 1
diff --git a/README.md b/README.md
index 5fd8317f3c15c7a0061bd5187e701031948bf6c2..563f61333daed6d40cff0744e9ab8b598ca1f1ac 100644
--- a/README.md
+++ b/README.md
@@ -4,9 +4,11 @@
 Vaccination is the key component to overcoming the global COVID-19 pandemic. Peptide vaccines present a safe and cost-efficient alternative to traditional vaccines. They are rapidly produced and adapted for new viral variants. The vaccine's efficacy essentially depends on two components: the peptides included in the vaccine and the ability of major histocompatibility complex (MHC) molecules to bind and present these peptides to cells of the immune system. Due to the high diversity of MHC alleles and their diverging specificities in binding peptides, choosing a set of peptides that maximizes population coverage is a challenging task. Further, peptide vaccines are limited in their size allowing only for a small set of peptides to be included. Thus, they might fail to immunize a large part of the human population or protect against upcoming viral variants. Here, we present HOGVAX, a combinatorial optimization approach to select peptides that maximize population coverage. Furthermore, we exploit overlaps between peptide sequences to include a large number of peptides in a limited space and thereby also cover rare MHC alleles. We model this task as a theoretical problem, which we call the *Maximal Scoring k-Superstring Problem*. Additionally, HOGVAX is able to consider haplotype frequencies to take linkage disequilibrium between MHC loci into account. Our vaccine formulations contain significantly more peptides compared to vaccine sequences built from concatenated peptides. We predicted over 98% population coverage for our vaccine candidates of MHC class I and II based on single-allele and haplotype frequencies. Moreover, we predicted high numbers of per-individual presented peptides leading to a robust immunity in the face of new virus variants.
 
 # Execute HOGVAX
+## Snakemake Pipeline
 
 
-## Stand-alone version
+
+## Stand-alone Version
 Open a shell in the [HOGVAX](HOGVAX/) folder and create the conda environment from the `yaml` file for proper execution of HOGVAX.
 ```shell
 conda env create -f hogvax_env.yaml
@@ -14,5 +16,5 @@ conda activate hogvax_env
 ```
 To execute HOGVAX, call the python script with the necessary arguments, see the [example arguments](HOGVAX/example_arguments.txt).
 ```shell
-python hogvax.py [arguments]
+python hogvax.py <arguments>
 ```