diff --git a/01-prepare_data/01-01-prepare_NOG_dataset_dload_seqs.ipynb b/01-prepare_data/01-01-prepare_NOG_dataset_dload_seqs.ipynb
index 47e022d9480fe9e5e52220fb9d1bda7041f79507..caa4cc496c59d41118c507689c15ee4e710000fb 100644
--- a/01-prepare_data/01-01-prepare_NOG_dataset_dload_seqs.ipynb
+++ b/01-prepare_data/01-01-prepare_NOG_dataset_dload_seqs.ipynb
@@ -111,7 +111,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -122,7 +122,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -177,21 +177,21 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "100%|██████████| 1548/1548 [01:04<00:00, 23.82it/s]\n"
+      "100%|██████████| 1548/1548 [00:56<00:00, 27.46it/s]\n"
      ]
     },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Number of taxa with gene data: 854\n"
+      "Number of taxa with gene data: 853\n"
      ]
     }
    ],
@@ -319,7 +319,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -352,7 +352,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "2024-05-06 10:10:01 URL:https://biocore.github.io/wol/data/genomes/metadata.tsv.xz [971240/971240] -> \"../data/wol_metadata.tsv.xz\" [1]\n"
+      "2025-05-08 14:52:45 URL:https://biocore.github.io/wol/data/genomes/metadata.tsv.xz [971240/971240] -> \"../data/wol_metadata.tsv.xz\" [1]\n"
      ]
     },
     {
@@ -385,7 +385,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "2024-05-06 10:10:01 URL:https://raw.githubusercontent.com/biocore/wol/master/data/trees/astral/branch_length/cons/astral.cons.nwk [407397/407397] -> \"../data/wol_tree.nwk\" [1]\n"
+      "2025-05-08 14:52:46 URL:https://raw.githubusercontent.com/biocore/wol/master/data/trees/astral/branch_length/cons/astral.cons.nwk [407397/407397] -> \"../data/wol_tree.nwk\" [1]\n"
      ]
     }
    ],
@@ -628,10 +628,10 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Number of genomes after pruning based on taxid in gene_features_dict: 413\n",
-      "acc_id_to_keep looks like: ['GCF_000773685.1', 'GCF_001767295.1', 'GCF_000817975.1', 'GCF_001423155.1', 'GCA_001801665.1']\n",
-      "Number of genomes in the pruned metadata: 361\n",
-      "Number of genomes in the pruned tree: 361\n"
+      "Number of genomes after pruning based on taxid in gene_features_dict: 412\n",
+      "acc_id_to_keep looks like: ['GCF_000245735.1', 'GCA_001801365.1', 'GCF_000829415.1', 'GCF_000271305.1', 'GCF_001469215.1']\n",
+      "Number of genomes in the pruned metadata: 360\n",
+      "Number of genomes in the pruned tree: 360\n"
      ]
     }
    ],
@@ -696,7 +696,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Node names look like: [Tree node '1896966' (0x7f34f1d0f50), Tree node '381306' (0x7f34f17884a), Tree node '246195' (0x7f34f17e11c), Tree node '797473' (0x7f34f17e128), Tree node 'N1' (0x7f34f17e110)] ...\n",
+      "Node names look like: [Tree node '1896966' (0x7f38b054c0b), Tree node '381306' (0x7f38afdb0b9), Tree node '246195' (0x7f38afe058b), Tree node '797473' (0x7f38afe0597), Tree node 'N1' (0x7f38afe057f)] ...\n",
       "New leaf names look like:  ['1896966', '381306', '246195', '797473', '1766620', '555778', '1033802', '1304275', '1172194', '1579979'] ...\n"
      ]
     }
@@ -749,7 +749,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Number of taxa in the tree:  361 Number of taxa in the dictionary:  854\n"
+      "Number of taxa in the tree:  360 Number of taxa in the dictionary:  853\n"
      ]
     }
    ],
@@ -786,7 +786,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 14,
    "metadata": {},
    "outputs": [
     {
@@ -794,9 +794,9 @@
      "output_type": "stream",
      "text": [
       "Min number of genes to retain a NOG:  10 and min number of taxa to retain a NOG:  30\n",
-      "Mean #genes_to_keep:  83.92499327896765 Std dev of #genes_to_keep:  177.982200175856\n",
+      "Mean #genes_to_keep:  84.05419908773813 Std dev of #genes_to_keep:  178.40979118790855\n",
       "Starting with n =  100  as ~quarter of the std dev of #genes_to_keep in nog_members_df\n",
-      "At n =  1300 , taxon_list is a subset of taxa_in_subset, with total of  359  taxa\n"
+      "At n =  1300 , taxon_list is a subset of taxa_in_subset, with total of  360  taxa\n"
      ]
     },
     {
@@ -831,53 +831,53 @@
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <th>101263</th>\n",
+       "      <th>14341</th>\n",
        "      <td>1236</td>\n",
-       "      <td>ETAWV</td>\n",
-       "      <td>372</td>\n",
-       "      <td>393</td>\n",
-       "      <td>[160488, 634500, 214092, 223283, 1414654, 6650...</td>\n",
-       "      <td>[1006000.GKAS_01002, 1028989.PSCI_4210, 105164...</td>\n",
+       "      <td>ER3QB</td>\n",
+       "      <td>190</td>\n",
+       "      <td>221</td>\n",
+       "      <td>[1294143, 1798802, 572477, 273526, 1441930, 40...</td>\n",
+       "      <td>[105559.Nwat_2444, 1121939.L861_15835, 1178482...</td>\n",
        "      <td>55</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>17358</th>\n",
+       "      <th>14363</th>\n",
        "      <td>1236</td>\n",
-       "      <td>ER6B2</td>\n",
-       "      <td>254</td>\n",
-       "      <td>301</td>\n",
-       "      <td>[743720, 717774, 491952, 1178482, 290398, 1445...</td>\n",
-       "      <td>[1028989.PSCI_0675, 1051646.IX91_03105, 105164...</td>\n",
+       "      <td>ER3QZ</td>\n",
+       "      <td>229</td>\n",
+       "      <td>258</td>\n",
+       "      <td>[1926881, 230089, 910964, 69222, 1484157, 1681...</td>\n",
+       "      <td>[1005058.UMN179_01181, 1006000.GKAS_02034, 105...</td>\n",
        "      <td>55</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>95227</th>\n",
+       "      <th>75558</th>\n",
        "      <td>1236</td>\n",
-       "      <td>ET5PB</td>\n",
-       "      <td>255</td>\n",
-       "      <td>255</td>\n",
-       "      <td>[314276, 1635005, 211586, 326297, 151081, 3807...</td>\n",
-       "      <td>[1051646.IX91_11355, 1085623.GNIT_2424, 112767...</td>\n",
+       "      <td>ESNNU</td>\n",
+       "      <td>337</td>\n",
+       "      <td>372</td>\n",
+       "      <td>[1736612, 216778, 273526, 871585, 1484157, 140...</td>\n",
+       "      <td>[1006000.GKAS_04123, 1045855.DSC_01400, 111551...</td>\n",
        "      <td>55</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>55467</th>\n",
+       "      <th>17358</th>\n",
        "      <td>1236</td>\n",
-       "      <td>ES59X</td>\n",
-       "      <td>264</td>\n",
-       "      <td>277</td>\n",
-       "      <td>[1294143, 1384056, 267850, 322710, 1177154, 16...</td>\n",
-       "      <td>[1026882.MAMP_03077, 1028989.PSCI_3136, 103380...</td>\n",
+       "      <td>ER6B2</td>\n",
+       "      <td>254</td>\n",
+       "      <td>301</td>\n",
+       "      <td>[743720, 717774, 491952, 1178482, 290398, 1445...</td>\n",
+       "      <td>[1028989.PSCI_0675, 1051646.IX91_03105, 105164...</td>\n",
        "      <td>55</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>97026</th>\n",
+       "      <th>44651</th>\n",
        "      <td>1236</td>\n",
-       "      <td>ET788</td>\n",
-       "      <td>270</td>\n",
-       "      <td>272</td>\n",
-       "      <td>[1926881, 230089, 198214, 1681196, 910964, 692...</td>\n",
-       "      <td>[1006000.GKAS_02548, 1115515.EV102420_33_00080...</td>\n",
+       "      <td>ERVXT</td>\n",
+       "      <td>256</td>\n",
+       "      <td>273</td>\n",
+       "      <td>[1736612, 43263, 1294143, 216778, 1535422, 179...</td>\n",
+       "      <td>[1163408.UU9_10362, 1187848.A1QO_01760, 119576...</td>\n",
        "      <td>55</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
@@ -885,26 +885,26 @@
        "</div>"
       ],
       "text/plain": [
-       "        taxonomic_group    NOG  #taxa  #genes  \\\n",
-       "101263             1236  ETAWV    372     393   \n",
-       "17358              1236  ER6B2    254     301   \n",
-       "95227              1236  ET5PB    255     255   \n",
-       "55467              1236  ES59X    264     277   \n",
-       "97026              1236  ET788    270     272   \n",
+       "       taxonomic_group    NOG  #taxa  #genes  \\\n",
+       "14341             1236  ER3QB    190     221   \n",
+       "14363             1236  ER3QZ    229     258   \n",
+       "75558             1236  ESNNU    337     372   \n",
+       "17358             1236  ER6B2    254     301   \n",
+       "44651             1236  ERVXT    256     273   \n",
        "\n",
-       "                                                     taxa  \\\n",
-       "101263  [160488, 634500, 214092, 223283, 1414654, 6650...   \n",
-       "17358   [743720, 717774, 491952, 1178482, 290398, 1445...   \n",
-       "95227   [314276, 1635005, 211586, 326297, 151081, 3807...   \n",
-       "55467   [1294143, 1384056, 267850, 322710, 1177154, 16...   \n",
-       "97026   [1926881, 230089, 198214, 1681196, 910964, 692...   \n",
+       "                                                    taxa  \\\n",
+       "14341  [1294143, 1798802, 572477, 273526, 1441930, 40...   \n",
+       "14363  [1926881, 230089, 910964, 69222, 1484157, 1681...   \n",
+       "75558  [1736612, 216778, 273526, 871585, 1484157, 140...   \n",
+       "17358  [743720, 717774, 491952, 1178482, 290398, 1445...   \n",
+       "44651  [1736612, 43263, 1294143, 216778, 1535422, 179...   \n",
        "\n",
-       "                                                    genes  #genes_to_keep  \n",
-       "101263  [1006000.GKAS_01002, 1028989.PSCI_4210, 105164...              55  \n",
-       "17358   [1028989.PSCI_0675, 1051646.IX91_03105, 105164...              55  \n",
-       "95227   [1051646.IX91_11355, 1085623.GNIT_2424, 112767...              55  \n",
-       "55467   [1026882.MAMP_03077, 1028989.PSCI_3136, 103380...              55  \n",
-       "97026   [1006000.GKAS_02548, 1115515.EV102420_33_00080...              55  "
+       "                                                   genes  #genes_to_keep  \n",
+       "14341  [105559.Nwat_2444, 1121939.L861_15835, 1178482...              55  \n",
+       "14363  [1005058.UMN179_01181, 1006000.GKAS_02034, 105...              55  \n",
+       "75558  [1006000.GKAS_04123, 1045855.DSC_01400, 111551...              55  \n",
+       "17358  [1028989.PSCI_0675, 1051646.IX91_03105, 105164...              55  \n",
+       "44651  [1163408.UU9_10362, 1187848.A1QO_01760, 119576...              55  "
       ]
      },
      "metadata": {},
@@ -1065,14 +1065,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 16,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Number of locus_tags considering the wol_tree_taxa: 1032634. Looks like: ['1005090.BAKON_001', '1005090.BAKON_002', '1005090.BAKON_003', '1005090.BAKON_004', '1005090.BAKON_005']\n"
+      "Number of locus_tags considering the wol_tree_taxa: 1032613. Looks like: ['1005090.BAKON_001', '1005090.BAKON_002', '1005090.BAKON_003', '1005090.BAKON_004', '1005090.BAKON_005']\n"
      ]
     }
    ],
@@ -1099,7 +1099,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": 17,
    "metadata": {},
    "outputs": [
     {
@@ -1165,16 +1165,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 18,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Taxa in wol tree but not in gene trees: 0 \n",
+      "Taxa in wol tree but not in gene trees: 1 \n",
       "and vice versa: 0\n",
-      "They look like this: [] and []\n",
+      "They look like this: ['243277'] and []\n",
       "Number of taxa to keep in the wol tree: 359\n"
      ]
     }
@@ -1238,7 +1238,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 19,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1271,7 +1271,7 @@
     "nog_members_df['genes'] = nog_members_df['genes'].apply(lambda x: ','.join(x))\n",
     "\n",
     "# overwrite the NOG members file\n",
-    "nog_members_df.to_csv(f\"{data_dir}{taxonomicID}_nog_members.tsv\", sep=\"\\t\", index=False, header=False)"
+    "nog_members_df.to_csv(f\"{data_dir}{taxonomicID}_nog_members.tsv\", sep=\"\\t\", index=False, header=False)\n"
    ]
   },
   {
@@ -1283,7 +1283,8 @@
     "The below command is what you can run and then wait to finish. It will be run in the background (because of `nohup` and `disown`) even if you lose connection to the server. This process can take in the order of hours to finish, for a list of trees in the order of 10k trees.\n",
     "\n",
     "```\n",
-    "$ nohup python src/run_MAD_on_EggNOG_parallel.py -i ../data/1236_pruned_gene_trees.tsv -m ~/bin/mad/mad -n 100 > ./nohup_mad_eggnog_rooting_1236.log & disown\n",
+    "# cd 01-prepare_data\n",
+    "$ nohup python src/run_MAD_on_EggNOG_parallel.py -i ../data/1236_pruned_gene_trees.tsv -m ~/bin/mad/mad -n 100 > ../data/nohup_mad_eggnog_rooting_1236.log & disown\n",
     "```\n",
     "\n",
     "**NOTE**: Pruning took roughly 1.5 hours for 15k trees"
@@ -1307,14 +1308,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 20,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Number of genome accessions: 1470, and they look like:\n"
+      "Number of genome accessions: 1473, and they look like:\n"
      ]
     },
     {
@@ -1346,27 +1347,27 @@
        "    <tr>\n",
        "      <th>0</th>\n",
        "      <td>GCF_000953635.1</td>\n",
-       "      <td>/root/work/projects/hgt_inference_comparative_...</td>\n",
+       "      <td>/root/work/projects/paper_comparative_study_of...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
        "      <td>GCF_900461565.1</td>\n",
-       "      <td>/root/work/projects/hgt_inference_comparative_...</td>\n",
+       "      <td>/root/work/projects/paper_comparative_study_of...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
        "      <td>GCF_900450575.1</td>\n",
-       "      <td>/root/work/projects/hgt_inference_comparative_...</td>\n",
+       "      <td>/root/work/projects/paper_comparative_study_of...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
        "      <td>GCF_015476275.1</td>\n",
-       "      <td>/root/work/projects/hgt_inference_comparative_...</td>\n",
+       "      <td>/root/work/projects/paper_comparative_study_of...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
        "      <td>GCF_000953655.1</td>\n",
-       "      <td>/root/work/projects/hgt_inference_comparative_...</td>\n",
+       "      <td>/root/work/projects/paper_comparative_study_of...</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
@@ -1374,11 +1375,11 @@
       ],
       "text/plain": [
        "  genome_accession                                       fna_filepath\n",
-       "0  GCF_000953635.1  /root/work/projects/hgt_inference_comparative_...\n",
-       "1  GCF_900461565.1  /root/work/projects/hgt_inference_comparative_...\n",
-       "2  GCF_900450575.1  /root/work/projects/hgt_inference_comparative_...\n",
-       "3  GCF_015476275.1  /root/work/projects/hgt_inference_comparative_...\n",
-       "4  GCF_000953655.1  /root/work/projects/hgt_inference_comparative_..."
+       "0  GCF_000953635.1  /root/work/projects/paper_comparative_study_of...\n",
+       "1  GCF_900461565.1  /root/work/projects/paper_comparative_study_of...\n",
+       "2  GCF_900450575.1  /root/work/projects/paper_comparative_study_of...\n",
+       "3  GCF_015476275.1  /root/work/projects/paper_comparative_study_of...\n",
+       "4  GCF_000953655.1  /root/work/projects/paper_comparative_study_of..."
       ]
      },
      "metadata": {},
@@ -1401,7 +1402,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 34,
+   "execution_count": 21,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1440,7 +1441,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 22,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1500,35 +1501,6 @@
     "            write_buffer(buffer, out_file)"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "\n",
-    "def return_gene_features(gene):\n",
-    "    taxon, locus_tag = gene.split('.', 1)\n",
-    "    return [\n",
-    "        gene, \n",
-    "        gene_features_dict[taxon]['genes'][locus_tag]['id'], \n",
-    "        gene_features_dict[taxon]['genes'][locus_tag]['seqid'],\n",
-    "        gene_features_dict[taxon]['genes'][locus_tag]['start'], \n",
-    "        gene_features_dict[taxon]['genes'][locus_tag]['end'], \n",
-    "        gene_features_dict[taxon]['genes'][locus_tag]['strand'],\n",
-    "        gene_features_dict[taxon]['acc_id'],\n",
-    "    ]\n",
-    "\n",
-    "# create a pool of worker processes\n",
-    "with mp.Pool(int(mp.cpu_count()/2)) as p:\n",
-    "    subset_gene_features_list = p.imap_unordered(return_gene_features, subset_genes_list)\n",
-    "\n",
-    "subset_gene_features_df = pd.DataFrame(subset_gene_features_list, columns=['locus_tag', 'seqid', 'gene_id', 'start', 'end', 'strand', 'genome_accession'])\n",
-    "\n",
-    "# write this to a file\n",
-    "subset_gene_features_df.to_csv(f'{data_dir}{taxonomicID}_subset_gene_features.tsv', sep='\\t', index=False)"
-   ]
-  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -1538,21 +1510,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 24,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "The taxa list looks like this: ['1005057', '1005058', '1005090', '1006000', '1009858', '1026882', '1028989', '1033802', '1036674', '1045855'] ...\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "The size of the pa_df is: (359, 1286)\n"
+      "The taxa list looks like this: ['1005057', '1005058', '1005090', '1006000', '1009858', '1026882', '1028989', '1033802', '1036674', '1045855'] ...\n",
+      "The size of the pa_df is: (359, 1300)\n"
      ]
     }
    ],
@@ -1622,7 +1588,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 25,
    "metadata": {},
    "outputs": [],
    "source": [
diff --git a/01-prepare_data/01-02-prepare_nucleotide_data.ipynb b/01-prepare_data/01-02-prepare_nucleotide_data.ipynb
index 7018e83b3b54c23c6e6c2470f7b0ddaddee2104c..e77f2fbf7ba5a82dc2b88732d84ae7016c69acda 100644
--- a/01-prepare_data/01-02-prepare_nucleotide_data.ipynb
+++ b/01-prepare_data/01-02-prepare_nucleotide_data.ipynb
@@ -7,7 +7,7 @@
     "Run `download_ncbi_genome_sequences.py` like so:\n",
     "\n",
     "```bash\n",
-    "nohup python src/download_ncbi_genome_sequences.py -i 1236_subset.taxa  > nohup_download_genome_seqs.out & disown\n",
+    "nohup ~/mambaforge/envs/hgt_analyses/bin/python src/download_ncbi_genome_sequences.py -i ../data/1236_subset_taxa.txt  > ../data/nohup_download_genome_seqs.out & disown\n",
     "```\n",
     "\n",
     "By default, `-o` output dir is `../data/genome_sequences/` where fasta and gff files are downloaded for each taxon in the list. \n",
@@ -29,15 +29,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Found 100692 unique gene names. List looks like this: ['1123053_GCA_000425345_02619', '128785_GCA_001556105_01030', '1123514_GCA_000381085_02040', '1441930_Z042_05715', '1449087_GCA_000711305_00098', '767434_Fraau_2269', '377629_TERTU_3187', '437022_CC99x_00673', '658445_H744_2c0221', '1265503_GCA_000378625_02015']\n",
-      "Writing gene names to 1236_subset_NOG_genes.list\n"
+      "Found 156509 unique gene names. List looks like this: ['291331.XOO2297', '225937.HP15_481', '573983.B0681_04825', '318167.Sfri_1100', '291331.XOO1980', '472759.Nhal_3786', '1798287.A3F14_06600', '1354304.XPG1_2379', '28173.VIBNI_B0337', '1461581.BN1049_02975']\n",
+      "Writing gene names to ../data/1236_nog_genes.list\n"
      ]
     }
    ],
@@ -49,7 +49,7 @@
     "\n",
     "# for the genes of interest, we need to first prepare a list of all the gene names.\n",
     "# for this, first read in the members.tsv file\n",
-    "members_tsv_filepath = '1236_subset_NOG_members.tsv'\n",
+    "members_tsv_filepath = '../data/1236_nog_members.tsv'\n",
     "# read only the 6th column (CSV lists of the gene names)\n",
     "with open(members_tsv_filepath) as fo:\n",
     "    flines = fo.readlines()\n",
@@ -87,7 +87,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.2"
+   "version": "3.12.3"
   }
  },
  "nbformat": 4,
diff --git a/01-prepare_data/src/run_MAD_on_EggNOG_parallel.py b/01-prepare_data/src/run_MAD_on_EggNOG_parallel.py
index 6ef8d01725ad1d10a5e719251152647a87ad0650..006772cb7208155f62722ea8af7aea7b160b360c 100755
--- a/01-prepare_data/src/run_MAD_on_EggNOG_parallel.py
+++ b/01-prepare_data/src/run_MAD_on_EggNOG_parallel.py
@@ -8,6 +8,7 @@ import argparse
 import glob
 from multiprocessing import Pool
 
+
 def run_command(params):
     """
     Inputs:
@@ -21,6 +22,7 @@ def run_command(params):
         if process.wait() != 0:
             raise CalledProcessError(process.returncode, cmd)
 
+
 def MAD_roots_NOG_trees_parallel(input_file_path, mad_executable, n_lines, num_subprocesses):
     """
     Inputs:
@@ -36,8 +38,9 @@ def MAD_roots_NOG_trees_parallel(input_file_path, mad_executable, n_lines, num_s
         The output file will contain the same number of lines as the input file, but the second column will contain the rooted trees.
     """
     # Check if the input file exists
-    if not os.path.isfile(input_file_path): 
-        raise FileNotFoundError(f"The input file {input_file_path} does not seem to exist. Check the filepath provided.")
+    if not os.path.isfile(input_file_path):
+        raise FileNotFoundError(
+            f"The input file {input_file_path} does not seem to exist. Check the filepath provided.")
     # Find the full path of the input file and extract the dir name
     input_file_path = os.path.abspath(input_file_path)
     input_dir = os.path.dirname(input_file_path)
@@ -51,50 +54,53 @@ def MAD_roots_NOG_trees_parallel(input_file_path, mad_executable, n_lines, num_s
     # Create the temporary directory
     os.mkdir(tmp_dir)
     # log this with time, along with location of the tmp folder
-    print(f'Created the tmp folder at: {datetime.now()} at location: {tmp_dir}') 
-
+    print(f'Created the tmp folder at: {datetime.now()} '
+          f'at location: {tmp_dir}')
 
     # Open the input file and read it in chunks of `n_lines` lines
-    nwk_split_files = [] # track the filenames of split files containing the second column (tree)
-    info_split_files = [] # track the filenames of split files containing the first column (treeID)
-
+    # track the filenames of split files containing the second column (tree)
+    nwk_split_files = []
+    # track the filenames of split files containing the first column (treeID)
+    info_split_files = []
+
+    def write_split_files(chunk, chunk_index, nwk_split_files, info_split_files):
+        """
+        Helper function to write out the split files for a given chunk.
+        """
+        split_chunk = [line.split() for line in chunk]
+        nwk_split_filepath = f"{tmp_dir}/tmp_tree_split.nwk.{chunk_index}"
+        info_split_filepath = f"{tmp_dir}/tmp_tree_split.info.{chunk_index}"
+        with open(nwk_split_filepath, 'w') as split_out_fo:
+            split_out_fo.write('\n'.join(line[1]
+                               for line in split_chunk) + '\n')
+        with open(info_split_filepath, 'w') as split_out_fo:
+            split_out_fo.write('\n'.join(line[0]
+                               for line in split_chunk) + '\n')
+        nwk_split_files.append(nwk_split_filepath)
+        info_split_files.append(info_split_filepath)
+        return nwk_split_files, info_split_files
+
+    # open the input file
     with open(input_file_path, 'r') as treesFO:
         chunk = []
         for i, line in enumerate(treesFO):
-            split_line = line.strip().split()
-            chunk.append(split_line)
+            chunk.append(line)
             if (i + 1) % n_lines == 0:
-                # Write out the split trees file
-                nwk_split_filepath = tmp_dir+'/tmp_tree_split.nwk.'+str((i // n_lines) + 1)
-                info_split_filepath = tmp_dir+'/tmp_tree_split.info.'+str((i // n_lines) + 1)
-                with open(nwk_split_filepath, 'w') as split_out_fo:
-                    split_out_fo.write('\n'.join(line[1] for line in chunk) + '\n')
-                with open(info_split_filepath, 'w') as split_out_fo:
-                    split_out_fo.write('\n'.join(line[0] for line in chunk) + '\n')
-                # track the filenames
-                nwk_split_files.append(nwk_split_filepath)
-                info_split_files.append(info_split_filepath)
+                nwk_split_files, info_split_files = write_split_files(
+                    chunk, (i // n_lines) + 1, nwk_split_files, info_split_files)
                 chunk = []
 
         # Write the last chunk if it's not empty
         if chunk:
-            split_chunk = [line.split() for line in chunk]
-            nwk_split_filepath = tmp_dir+'/tmp_tree_split.nwk.'+str((i // n_lines) + 2)
-            info_split_filepath = tmp_dir+'/tmp_tree_split.info.'+str((i // n_lines) + 2)
-            with open(nwk_split_filepath, 'w') as split_out_fo:
-                split_out_fo.write('\n'.join(line[1] for line in split_chunk) + '\n')
-            with open(info_split_filepath, 'w') as split_out_fo:
-                split_out_fo.write('\n'.join(line[0] for line in split_chunk) + '\n')
-            # track the filenames
-            nwk_split_files.append(nwk_split_filepath)
-            info_split_files.append(info_split_filepath)
-
-    
-    # using subprocess.Popen inside the run_command function, 
+            nwk_split_files, info_split_files = write_split_files(
+                chunk, len(nwk_split_files) + 1, nwk_split_files, info_split_files)
+
+    # using subprocess.Popen inside the run_command function,
     # call MAD on all of these split newick files. Prepare the commands.
     # `mad -n trees.nwk.1`
-    commands = [([mad_executable, '-t', '-n', split_file_path], split_file_path+'.stdout') for split_index, split_file_path in enumerate(nwk_split_files)]
-    
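+    # each entry: (argv list for one MAD call, path of the .stdout file used to capture its output)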
+    commands = [([mad_executable, '-t', '-n', split_file_path], split_file_path+'.stdout')
+                for split_file_path in nwk_split_files]
+
     mad_start_time = time.time()
     # Log timestamp for the start of the MAD rooting process
     print(f'MAD rooting starting at: {datetime.now()}')
@@ -108,7 +114,6 @@ def MAD_roots_NOG_trees_parallel(input_file_path, mad_executable, n_lines, num_s
     # Log that the split files are being combined, along with the timestamp
     print(f'{datetime.now()}: Combining the split files into one file...')
 
-
     # Note: mad adds a `.rooted` at the end of the input filename, as the output filename
     # combine the columns from the info files with the nwk.rooted files to prepare the final output file
     with open(input_file_path + '.rooted', 'w') as rootedOutputFO:
@@ -124,25 +129,28 @@ def MAD_roots_NOG_trees_parallel(input_file_path, mad_executable, n_lines, num_s
             # combine the info and rooted trees
             with open(info_split_filepath, 'r') as info_file:
                 rooted_info_plus_tree_chunks = [
-                    '\t'.join([info_line.strip(), rooted_nwk_trees[line_index]])
+                    '\t'.join(
+                        [info_line.strip(), rooted_nwk_trees[line_index]])
                     for line_index, info_line in enumerate(info_file)
                 ]
 
             # write out the new tree line chunks to the output file
-            rootedOutputFO.write('\n'.join(rooted_info_plus_tree_chunks) + '\n')
+            rootedOutputFO.write(
+                '\n'.join(rooted_info_plus_tree_chunks) + '\n')
 
             # Clear memory
             del rooted_info_plus_tree_chunks
             del rooted_nwk_trees
-    
-    # delete the split nwk files in the tmp folder
+
+    # delete the split nwk files in the tmp folder
     for split_file_name in nwk_split_files:
         os.remove(split_file_name)
     # combine all the log files into one log file using `tail -n +1`, and write in the input_dir with timestamp
     now = datetime.now()
     timestamp = now.strftime("%Y%m%d_%H%M%S")
-    os.system('tail -n +1 '+tmp_dir+'/*.stdout > '+input_dir+'/tmp_mad_rooting_splits_'+timestamp+'.log')
-    # delete the tmp folder
+    os.system('tail -n +1 '+tmp_dir+'/*.stdout > '+input_dir +
+              '/tmp_mad_rooting_splits_'+timestamp+'.log')
+    # # delete the tmp folder
     # os.system('rm -rf '+tmp_dir)
 
 
@@ -170,7 +178,8 @@ if __name__ == '__main__':
 
     args = parser.parse_args()
 
-    MAD_roots_NOG_trees_parallel(args.Input, args.mad, args.nlines, args.num_subprocesses)
+    MAD_roots_NOG_trees_parallel(
+        args.Input, args.mad, args.nlines, args.num_subprocesses)
 
     print('Process time taken from start to finish: ' +
-                str(timedelta(seconds=time.time() - start_time)) )
\ No newline at end of file
+          str(timedelta(seconds=time.time() - start_time)))
diff --git a/02-run_programs/02-run_programs.ipynb b/02-run_programs/02-run_programs.ipynb
index 92d6a1f452c4cbfe1971436affcea994193bc1e6..b46f34fbed14a30499b4b725c90b3d0453e9021c 100644
--- a/02-run_programs/02-run_programs.ipynb
+++ b/02-run_programs/02-run_programs.ipynb
@@ -11,12 +11,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
     "taxonomicID = '1236'\n",
-    "data = '/root/work/projects/hgt_inference_comparative_study/data/'"
+    "data = '/root/work/projects/paper_comparative_study_of_hgt_inference_methods/data/'"
    ]
   },
   {
@@ -35,12 +35,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
     "%%bash\n",
     "# make directories for programs\n",
+    "mkdir -p program_runs\n",
+    "cd program_runs\n",
     "mkdir -p ALE AnGST RANGER RANGER-Fast Count GLOOME_with_tree GLOOME_without_tree"
    ]
   },
@@ -109,15 +111,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "%%bash\n",
+    "%%bash -s \"$taxonomicID\" \"$data\"\n",
     "# create AnGST directory if it doesn't exist\n",
     "# write penalty file\n",
     "# these params were taken from the AnGST manual\n",
-    "mkdir -p ./AnGST\n",
+    "mkdir -p $2/program_runs/AnGST\n",
     "cat > ./src/angst_penalty_file.txt << EOL\n",
     "hgt: 3.0\n",
     "dup: 2.0\n",
@@ -131,9 +133,14 @@
    "metadata": {},
    "source": [
     "Run AnGST using the script `run_angst.py` in the following manner:\n",
+    "\n",
     "```bash\n",
-    "# cd AnGST\n",
-    "$ nohup python ../src/run_angst.py > nohup_run_angst.out & disown\n",
+    "$ cd AnGST\n",
+    "# make sure you activate the python2 environment\n",
+    "$ mamba activate hgt_analyses_py2\n",
+    "\n",
+    "# run the script\n",
+    "$ nohup ~/mambaforge/envs/hgt_analyses_py2/bin/python ../src/run_angst.py > nohup_run_angst.out & disown\n",
     "```"
    ]
   },
@@ -160,16 +167,23 @@
     "The function below creates the presence-absence (PA) matrix for GLOOME, based on the PA of taxa in the NOGs of interest. We use that matrix (as a fasta file input) along with the species tree (and a separate run, without the tree), to infer gains and losses. Since Count also uses a PA matrix but in TSV format, we prepare it here itself. Note that Count requires each row to be that of a gene family (in our case, a NOG) whereas GLOOME needs each row (aka each sequence in the FASTA file) to be that of a taxon."
    ]
   },
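+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As a minimal illustration of the two layouts described above, assuming a hypothetical `pa_df` (taxa as the index, NOGs as the columns, 0/1 integer entries); output filenames are placeholders:\n",
+    "\n",
+    "```python\n",
+    "# pa_df: hypothetical taxa-by-NOG DataFrame of 0/1 values (not defined here)\n",
+    "# GLOOME: one FASTA record per taxon, with its presence/absence pattern as the sequence\n",
+    "with open('pa_matrix.fasta', 'w') as fo:\n",
+    "    for taxon, row in pa_df.iterrows():\n",
+    "        fo.write('>' + str(taxon) + '\\n' + ''.join(row.astype(str)) + '\\n')\n",
+    "\n",
+    "# Count: one TSV row per gene family (NOG), so write the transpose\n",
+    "pa_df.T.to_csv('pa_matrix.tsv', sep='\\t')\n",
+    "```"
+   ]
+  },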
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### GLOOME ML"
+   ]
+  },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
     "%%bash -s \"$taxonomicID\" \"$data\"\n",
-    "\n",
+    "cd $2/program_runs/\n",
     "# write param file that uses the rooted tree\n",
-    "cat > ./GLOOME_with_tree/gloome_tree.params << EOL\n",
+    "cat > ./GLOOME_with_tree/gloome_ml_tree.params << EOL\n",
     "_seqFile $2$1_pa_matrix.fasta\n",
     "_treeFile $2$1_wol_tree_pruned_no_internal_labels.nwk\n",
     "_gainLossDist 1\n",
@@ -179,11 +193,11 @@
     "_costMatrixGainLossRatio 1\n",
     "## Advanced \n",
     "_logValue 4\n",
-    "_outDir Results_GLOOME_with_tree\n",
+    "_outDir Results_GLOOME_ML_with_tree\n",
     "EOL\n",
     "\n",
     "# write param file that doesn't use the rooted tree\n",
-    "cat > ./GLOOME_without_tree/gloome_without_tree.params << EOL\n",
+    "cat > ./GLOOME_without_tree/gloome_ml_without_tree.params << EOL\n",
     "_seqFile $2$1_pa_matrix.fasta\n",
     "_gainLossDist 1\n",
     "# for COG and EggNOG only patterns with 3 or more ones are observable\n",
@@ -192,7 +206,7 @@
     "_costMatrixGainLossRatio 1\n",
     "## Advanced\n",
     "_logValue 4\n",
-    "_outDir Results_GLOOME_without_tree\n",
+    "_outDir Results_GLOOME_ML_without_tree\n",
     "EOL"
    ]
   },
@@ -213,18 +227,95 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Run Count"
+    "### GLOOME MP"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%bash -s \"$taxonomicID\" \"$data\"\n",
+    "# we need a params file for each iteration of the _costMatrixGainLossRatio\n",
+    "# _costMatrixGainLossRatio is the ratio of the cost of a gain to the cost of a loss\n",
+    "# the default is 1:1 but we want to test 0.33, 0.5, 1, 2, ..., 8\n",
+    "# for each of these ratios we need to create a params file\n",
+    "cd $2/program_runs/\n",
+    "for costRatio in 0.33 0.5 1 2 3 4 5 6 7 8; do\n",
+    "# write param file that uses the rooted tree\n",
+    "cat > ./GLOOME_with_tree/gloome_tree_${costRatio}.params << EOL\n",
+    "_seqFile $2$1_pa_matrix.fasta\n",
+    "_treeFile $2$1_wol_tree_pruned_no_internal_labels.nwk\n",
+    "_costMatrixGainLossRatio $costRatio\n",
+    "_logValue 4\n",
+    "_performOptimizations 0\n",
+    "_outDir Results_GLOOME_MP_with_tree_$costRatio\n",
+    "EOL\n",
+    "\n",
+    "# write param file that doesn't use the rooted tree\n",
+    "cat > ./GLOOME_without_tree/gloome_without_tree_${costRatio}.params << EOL\n",
+    "_seqFile $2$1_pa_matrix.fasta\n",
+    "_costMatrixGainLossRatio $costRatio\n",
+    "_logValue 4\n",
+    "_performOptimizations 0\n",
+    "_outDir Results_GLOOME_MP_without_tree_$costRatio\n",
+    "EOL\n",
+    "done\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Then run all of the GLOOME MP scripts like so:\n",
+    "```bash\n",
+    "cd GLOOME_with_tree\n",
+    "for i in 0.33 0.5 1 2 3 4 5 6 7 8 ; do nohup ~/bin/GLOOME.VR01.266 gloome_tree_${i}.params > nohup_gloome_run_with_tree_${i}.out & disown ; done\n",
+    "```\n",
+    "\n",
+    "```bash\n",
+    "cd ../GLOOME_without_tree\n",
+    "for i in 0.33 0.5 1 2 3 4 5 6 7 8 ; do nohup ~/bin/GLOOME.VR01.266 gloome_without_tree_${i}.params > nohup_gloome_run_without_tree_${i}.out & disown ; done\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Count ML"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "```bash\n",
+    "# cd Count/Count_ML\n",
+    "nohup java -Xmx2048M -cp ~/bin/Count/Count.jar ca.umontreal.iro.evolution.genecontent.ML -v true  ../../../1236_wol_tree_pruned_with_internal_labels.nwk ../1236_pa_matrix.tsv > 1236_rates.r & disown\n",
+    "# and after completing the above:\n",
+    "nohup java -Xmx2048M -cp ~/bin/Count/Count.jar ca.umontreal.iro.evolution.genecontent.Posteriors ../../../1236_wol_tree_pruned_with_internal_labels.nwk ../1236_pa_matrix.tsv ./1236_rates.r  > Count_output.tsv & disown\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Count MP"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      "Current working directory: /root/work/projects/paper_comparative_study_of_hgt_inference_methods/02-run_programs\n",
       "Count with gain ratio 0.33 done\n",
       "Count output for gain ratio 0.33 grepped\n",
       "Count with gain ratio 0.5 done\n",
@@ -252,10 +343,12 @@
     "%%bash -s \"$taxonomicID\" \"$data\"\n",
     "# exit if command exits with non-zero status\n",
     "set -e\n",
+    "# print working directory\n",
+    "echo \"Current working directory: $(pwd)\"\n",
     "# change to the directory where Count will be run\n",
     "cd ./program_runs/Count/Count_MP/\n",
     "\n",
-    "# for `gain` parameter being 0.33, 0.5, 1, 2, 3, 4, 5, 6, 7, run Count\n",
+    "# for `gain` parameter being 0.33, 0.5, 1, 2, ,.. 8, run Count\n",
     "for g in 0.33 0.5 1 2 3 4 5 6 7 8;\n",
     "do\n",
     "    # run Count\n",
diff --git a/02-run_programs/src/run_ALE.py b/02-run_programs/src/run_ALE.py
index 78e2fc35d51f9e96fb793901e983793b251aeade..b02d16260565f65f44817724dc608ed04290ea91 100644
--- a/02-run_programs/src/run_ALE.py
+++ b/02-run_programs/src/run_ALE.py
@@ -98,10 +98,10 @@ if __name__ == '__main__':
     # write everything in terms of argparse instead of hardcoding
     import argparse
     parser = argparse.ArgumentParser(description="Run ALE on all gene trees")
-    parser.add_argument("--species", "-s", type=str, default="../../data/1236_wol_tree_pruned_no_internal_labels.nwk",
-                        help="Path to species tree file (default: ../../data/1236_wol_tree_pruned_no_internal_labels.nwk)")
-    parser.add_argument("--gene", "-g", type=str, default="../../data/1236_pruned_gene_trees.tsv.rooted",
-                        help="Path to gene trees file (default: ../../data/1236_pruned_gene_trees.tsv.rooted)")
+    parser.add_argument("--species", "-s", type=str, default="../../1236_wol_tree_pruned_no_internal_labels.nwk",
+                        help="Path to species tree file (default: ../../1236_wol_tree_pruned_no_internal_labels.nwk)")
+    parser.add_argument("--gene", "-g", type=str, default="../../1236_pruned_gene_trees.tsv.rooted",
+                        help="Path to gene trees file (default: ../../1236_pruned_gene_trees.tsv.rooted)")
     parser.add_argument("--threads", "-t", type=int, default=50,
                         help="Number of threads to use for parallelization (default: 50)")
     parser.add_argument("--bin", "-b", type=str,