%% Cell type:markdown id: tags:
This notebook does the following:
1. **Retrieve Nucleotide Accession IDs and GFF Files**
2. **Download GFF3 Files**
- Use a script to download GFF3 files from NCBI
3. **Find Overlap Between Genes in EggNOG and GFF Files**
- Define functions to process files and extract unique values.
- Process gene groups to find corresponding GFF files and extract gene features.
- Write results to JSON and log skipped taxa.
4. **Download and Prune ASTRAL WoL Tree**
- Prune the tree to include only taxa with gene data.
- Replace leaf names with taxon IDs and write pruned trees to files.
5. **Extract Gene Trees of Interest**
- Extract gene trees based on the number of genes and taxa.
6. **Prune Gene Trees**
7. **Check Taxa Consistency**
- Ensure taxa in WoL tree and gene trees are consistent.
- Prune WoL tree variants to include only taxa in gene trees.
8. **Download Genome Sequence Files**
- Download genome sequence files for each taxon in the dataset.
9. **Replace Leaf Labels in Gene Trees**
- Replace the first dot with an underscore in leaf labels of gene trees.
10. **Create Gene Features TSV**
- Extract gene features for the subset of genes in the gene trees.
11. **Create Presence-Absence Matrices**
- Create presence-absence matrices for GLOOME and COUNT.
- Write matrices to TSV and FASTA files.
12. **Extract Function Profiles**
- Extract function profiles for NOGs in the dataset from the EggNOG database.
%% Cell type:code id: tags:
``` python
# we find a subset of the data in EggNOG for which we can retrieve nucleotide data
# this means that
# 1. first we find the genomes in EggNOG for which we can find NCBI genome assembly accession IDs
# 2. we download the assembly GFF files for these accession IDs
# 3. we check which of the genes listed in EggNOG can be found in the GFF files
taxonomicID = '1236'
data_dir = '../data/'
```
%% Cell type:markdown id: tags:
## Retrieve Nucleotide Accession IDs and GFF files
%% Cell type:code id: tags:
``` python
%%bash -s "$taxonomicID" "$data_dir"
## extract data for this dataset based on taxonomic ID declared before, using ripgrep
rg -w "^$1" $2/eggnog6/e6.og2seqs_and_species.tsv > $2$1_og2seqs_and_species.tsv
```
%% Cell type:code id: tags:
``` python
%%bash -s "$taxonomicID" "$data_dir"
# make a list of all taxa in *_og2seqs_and_species.tsv and write it to *_all.taxa
# the fifth column in this TSV file is a comma-delimited string of taxon IDs
awk -F"\t" '{print $5}' $2$1_og2seqs_and_species.tsv | tr ',' '\n' | sort -n | uniq > $2$1_all.taxa
```
%% Cell type:markdown id: tags:
Use `src/download_ncbi_genome_sequences.py` to download the GFF3 files from NCBI, using multiprocessing. The script downloads the files into `data_dir`.
```bash
nohup ~/mambaforge/envs/hgt_analyses/bin/python src/download_ncbi_genome_sequences.py -i 1236_all.taxa --to-download gff3 > nohup_gff3_dload.log & disown
```
%% Cell type:markdown id: tags:
## Find overlap between genes in EggNOG and genes in the GFF files
%% Cell type:code id: tags:
``` python
# suppress SyntaxWarnings in ete3 import using the warnings module
import warnings
warnings.filterwarnings("ignore", category=SyntaxWarning)
```
%% Cell type:code id: tags:
``` python
import csv
import sys
import pandas as pd
import multiprocessing as mp
import gffutils
import os
import json
import tqdm
import ete3
import numpy as np
```
%% Cell type:code id: tags:
``` python
csv.field_size_limit(sys.maxsize)
all_taxa_filepath = f'{data_dir}{taxonomicID}_all.taxa'
nog_members_tsv_filepath = f'{data_dir}{taxonomicID}_og2seqs_and_species.tsv'
def process_file(input_file_path, column_index):
# This set will store all unique values from the specified column
all_values = set()
# Open the file and read it line by line
with open(input_file_path, 'r') as fo:
reader = csv.reader(fo, delimiter='\t')
for row in reader:
# Split the column by commas and add the items to the set
if row[column_index]: # Ensure the column isn't empty
values = row[column_index].split(',')
all_values.update(value.strip() for value in values)
return all_values
unique_values = process_file(nog_members_tsv_filepath, 5)
# Write the unique values to a file or process them further
with open(f'{data_dir}{taxonomicID}_all.genes', 'w') as out_file:
out_file.write('\n'.join(sorted(unique_values)) + '\n')
```
%% Cell type:code id: tags:
``` python
# read in the values from the *all.genes file into a df with a single column
all_genes_filepath = data_dir + taxonomicID + '_all.genes'
all_genes_df = pd.read_csv(
all_genes_filepath, header=None, names=['locus_tag'])
taxon_acc_id_filepath = f'{data_dir}/genome_data/{taxonomicID}_all_accession_ids.tsv'
genome_data_dir = f'{data_dir}/genome_data/'
taxon_acc_id_df = pd.read_csv(taxon_acc_id_filepath, sep='\t', header=0)
taxon_acc_id_df['taxon_id'] = taxon_acc_id_df['taxon_id'].astype(str)
# add a column called "taxon_id" based on the first element in the period delimited 'locus_tag' column string
all_genes_df['taxon_id'] = all_genes_df['locus_tag'].apply(
lambda x: x.split('.', 1)[0]).astype(str)
# group the genes in all_genes_df by taxon, and for every taxon run this fn:
# it finds the corresponding gff file using the taxon-to-accession mapping in taxon_acc_id_df,
# then looks up the locus_tag values in the gff and returns all locus_tag values found for this taxon
def process_gene_group(gene_group, taxon_acc_id_df, genome_data_dir):
"""
This function takes a group of genes from the all_genes_df dataframe and processes the corresponding gff file.
It returns a list of gene IDs that are found in the gff file.
"""
taxon_id = gene_group['taxon_id'].iloc[0]
locus_tags = set(gene_group['locus_tag'])
# remove taxon_id from locus_tags (i.e. the first element in the period delimited 'locus_tag' column string)
locus_tags = {locus_tag.split('.', 1)[1] for locus_tag in locus_tags}
# find corresponding gff file for taxon ID
if taxon_acc_id_df['taxon_id'].isin([taxon_id]).any():
acc_id = taxon_acc_id_df[taxon_acc_id_df['taxon_id']
== taxon_id]['acc_id'].values[0]
else:
# print(f'taxon ID {taxon_id} not found. Skipping')
return 'skipped: taxon ID not in TSV', taxon_id
gff_filepath = os.path.join(genome_data_dir, f'{taxon_id}_{acc_id}_genomic.gff')
if not os.path.exists(gff_filepath):
# print(f'gff file {gff_filepath} not found. Skipping')
return 'skipped: gff file not found', taxon_id
    # create a gffutils database in memory from the gff file
    gff_db = gffutils.create_db(gff_filepath, ':memory:', merge_strategy='merge')
    # return a dict mapping locus_tag values found in the gff file to their feature info, along with the taxon_id and acc_id
return_dict = {}
# iterate over each feature of type 'gene' in the gff_db
for feature in gff_db.features_of_type('gene'):
# check if 'locus_tag' or 'old_locus_tag' is in feature.attributes and if its value is in locus_tags
if 'locus_tag' in feature.attributes and feature.attributes['locus_tag'][0] in locus_tags:
# if yes, append the locus_tag and the entire feature as a dict, including information of start, end, strand, and feature attributes
return_dict[feature.attributes['locus_tag'][0]] = {
'id': feature.id, # 'gene:NC_000913:thrL'
'locus_tag': feature.attributes['locus_tag'][0],
'start': feature.start,
'end': feature.end,
'strand': feature.strand,
'attributes': dict(feature.attributes),
'seqid': feature.seqid
}
elif 'old_locus_tag' in feature.attributes and feature.attributes['old_locus_tag'][0] in locus_tags:
# if yes, append to dict
return_dict[feature.attributes['old_locus_tag'][0]] = {
'id': feature.id,
'locus_tag': feature.attributes['old_locus_tag'][0],
'start': feature.start,
'end': feature.end,
'strand': feature.strand,
'attributes': dict(feature.attributes),
'seqid': feature.seqid
}
# if fewer than 10 genes are found, skip
if len(return_dict) < 10:
return 'skipped: less than 10 genes found', taxon_id
return 'values', taxon_id, acc_id, return_dict
def process_gene_group_wrapper(args):
return process_gene_group(*args)
# groupby taxon_id the genes in all_genes_df
gene_groups = all_genes_df.groupby('taxon_id')
# prepare arguments for process_gene_group function
args = [(group, taxon_acc_id_df, genome_data_dir) for _, group in gene_groups]
taxa_acc_genes_dict = {}
skipped_taxa_list = []
# create a multiprocessing Pool
with mp.Pool(mp.cpu_count()) as pool:
# apply process_gene_group function to each group in parallel
for pool_return in tqdm.tqdm(pool.imap_unordered(process_gene_group_wrapper, args), total=len(args)):
if pool_return[0].startswith('skipped'):
skipped_taxa_list.append((pool_return[1], pool_return[0]))
elif pool_return[0] == 'values':
taxa_acc_genes_dict[pool_return[1]] = {
'acc_id': pool_return[2], 'genes': pool_return[3]}
else:
print('ERROR: invalid return value')
break
# write the results to a json file
with open(f'{data_dir}{taxonomicID}_gene_features.json', 'w') as out_file:
json.dump(taxa_acc_genes_dict, out_file, indent=4)
# log number of taxa in this json
print(f'Number of taxa with gene data: {len(taxa_acc_genes_dict)}')
# write the skipped taxa to a file
with open(f'{data_dir}{taxonomicID}_skipped_taxa.tsv', 'w') as out_file:
# csv with quotes
writer = csv.writer(out_file, quoting=csv.QUOTE_MINIMAL, delimiter='\t')
writer.writerows(skipped_taxa_list)
```
%% Output
100%|██████████| 1548/1548 [00:56<00:00, 27.46it/s]
Number of taxa with gene data: 853
%% Cell type:code id: tags:
``` python
# find which locus_tags are present in the json file of gene_features
gene_features_filepath = f'{data_dir}{taxonomicID}_gene_features.json'
with open(gene_features_filepath, 'r') as gene_features_file:
gene_features_dict = json.load(gene_features_file)
# find which 'genes' are present in the gene_features_dict
gene_features_dict_genes = [f'{taxon}.{locus_tag}' for taxon in gene_features_dict.keys() for locus_tag in gene_features_dict[taxon]['genes'].keys()]
# write this list to a file
with open(f'{data_dir}{taxonomicID}_locus_tags_with_features.txt', 'w') as gene_features_genes_file:
gene_features_genes_file.write('\n'.join(gene_features_dict_genes) + '\n')
```
%% Cell type:markdown id: tags:
## Download and Prune ASTRAL WoL Tree
%% Cell type:code id: tags:
``` python
%%bash -s "$taxonomicID" "$data_dir"
# remove existing files if they exist
rm -f $2"wol_metadata.tsv.xz" $2"wol_metadata.tsv" $2"wol_tree.nwk"
wget -nv https://biocore.github.io/wol/data/genomes/metadata.tsv.xz -O $2"wol_metadata.tsv.xz"
# this requires 7z
7z x $2"wol_metadata.tsv.xz" -o$2
# download the ASTRAL tree
wget -nv https://github.com/biocore/wol/raw/master/data/trees/astral/branch_length/cons/astral.cons.nwk -O $2"wol_tree.nwk"
```
%% Output
2025-05-08 14:52:45 URL:https://biocore.github.io/wol/data/genomes/metadata.tsv.xz [971240/971240] -> "../data/wol_metadata.tsv.xz" [1]
7-Zip [64] 16.02 : Copyright (c) 1999-2016 Igor Pavlov : 2016-05-21
p7zip Version 16.02 (locale=en_US.UTF8,Utf16=on,HugeFiles=on,64 bits,128 CPUs AMD EPYC 7601 32-Core Processor (800F12),ASM,AES-NI)
Scanning the drive for archives:
1 file, 971240 bytes (949 KiB)
Extracting archive: ../data/wol_metadata.tsv.xz
--
Path = ../data/wol_metadata.tsv.xz
Type = xz
Physical Size = 971240
Method = LZMA2:26 CRC64
Streams = 1
Blocks = 1
Everything is Ok
Size: 6601710
Compressed: 971240
2025-05-08 14:52:46 URL:https://raw.githubusercontent.com/biocore/wol/master/data/trees/astral/branch_length/cons/astral.cons.nwk [407397/407397] -> "../data/wol_tree.nwk" [1]
%% Cell type:code id: tags:
``` python
# prune the ASTRAL WoL tree to only include the taxa for which we have gene data. This is our genome tree
# read in the tree
wol_metadata_filepath = f'{data_dir}wol_metadata.tsv'
wol_tree_filepath = f'{data_dir}wol_tree.nwk'
wol_tree = ete3.Tree(wol_tree_filepath, format=1)
wol_tree_leaf_names = wol_tree.get_leaf_names()
# read in the gene features json
json_filepath = f'{data_dir}{taxonomicID}_gene_features.json'
with open(json_filepath, 'r') as in_file:
gene_features_dict = json.load(in_file)
# Read the metadata file from WOL as a dataframe
wol_df = pd.read_csv(wol_metadata_filepath, sep='\t', header=0)
print(f'Number of genomes in the metadata: {len(wol_df)}')
# change dtype of taxid column to str type
wol_df['taxid'] = wol_df['taxid'].astype(str)
# display first 5 rows
display(wol_df.head())
# check if every #genome in the wol_df maps to a unique assembly_accession value
assert wol_df['assembly_accession'].nunique() == wol_df['#genome'].nunique()
# smaller wol_df based on which taxid are in the gene_features_dict
wol_df = wol_df[wol_df['taxid'].isin(gene_features_dict.keys())]
print(f'Number of genomes after pruning based on taxid in gene_features_dict: {len(wol_df)}')
# gene_features_dict has taxid to acc_id mapping. wol_df has #genome to acc_id (`assembly_accession` column) mapping. Tree has #genome as leaf names
# we want to prune the trees and replace the leaf names with taxid values
acc_id_to_keep = set([i['acc_id'] for i in gene_features_dict.values()])
print(f'acc_id_to_keep looks like: {list(acc_id_to_keep)[:5]}')
# remove all other rows from the wol_df
wol_df = wol_df[wol_df['assembly_accession'].isin(acc_id_to_keep)]
print(f'Number of genomes in the pruned metadata: {len(wol_df)}')
genomes_to_keep = [i for i in wol_tree_leaf_names if i in wol_df['#genome'].values]
wol_tree.prune(genomes_to_keep, preserve_branch_length=True)
print(f'Number of genomes in the pruned tree: {len(wol_tree.get_leaf_names())}')
# create a dictionary mapping genome IDs to assembly accessions IDs
genome_to_accid_dict = wol_df.set_index('#genome')['assembly_accession'].to_dict()
# create a dictionary mapping assembly accessions to taxon IDs
accid_to_taxid_dict = {v['acc_id']: k for k, v in gene_features_dict.items()}
# create a dictionary mapping genome IDs to taxon IDs
genome_to_taxid_dict = {k: accid_to_taxid_dict[v]
for k, v in genome_to_accid_dict.items()}
# replace pruned tree leaf names with taxon IDs
for leaf in wol_tree.get_leaves():
leaf.name = genome_to_taxid_dict[leaf.name]
```
%% Output
Number of genomes in the metadata: 10575
Number of genomes after pruning based on taxid in gene_features_dict: 412
acc_id_to_keep looks like: ['GCF_000245735.1', 'GCA_001801365.1', 'GCF_000829415.1', 'GCF_000271305.1', 'GCF_001469215.1']
Number of genomes in the pruned metadata: 360
Number of genomes in the pruned tree: 360
%% Cell type:code id: tags:
``` python
# write the pruned tree to a newick file, with different variations in labelling for different program requirements
pruned_no_internal_wol_tree_filepath = f'{data_dir}{taxonomicID}_wol_tree_pruned_no_internal_labels.nwk'
pruned_angst_wol_tree_filepath = f'{data_dir}{taxonomicID}_wol_tree_pruned_angst.nwk'
pruned_internal_wol_tree_filepath = f'{data_dir}{taxonomicID}_wol_tree_pruned_with_internal_labels.nwk'
# write the pruned tree to a newick file without internal node names
wol_tree.write(outfile=pruned_no_internal_wol_tree_filepath, format=1, format_root_node=True, dist_formatter='%.10f')
# write a version of the tree with a root branch length for AnGST
wol_tree.write(outfile=pruned_angst_wol_tree_filepath,
format=1, dist_formatter='%f', format_root_node=True)
# find the last ")" in this file and replace everything after it with ":0.0);".
# AnGST doesn't like the branch length at the root to be like 0.00000
with open(pruned_angst_wol_tree_filepath, "r") as fi:
tree_str = fi.read()
tree_str = tree_str[:tree_str.rfind(")")+1] + ":0.0);"
with open(pruned_angst_wol_tree_filepath, "w") as fo:
fo.write(tree_str)
# traverse the tree in postorder and number the internal nodes as N1, N2, N3, etc.
# leaf nodes keep their original names and do not consume an index
internal_node_index = 1
for node in wol_tree.traverse(strategy="postorder"):
if not node.is_leaf():
node.name = f"N{internal_node_index}"
internal_node_index += 1
# print the first 5 nodes of the tree
print(f"Node names look like: {
[i for i in wol_tree.traverse(strategy='postorder')][:5]} ...")
# show the pruned tree leaf labels
print("New leaf names look like: ", wol_tree.get_leaf_names()[:10], "...")
# write this wol tree to a file. Branches are full floating point numbers
wol_tree.write(outfile=pruned_internal_wol_tree_filepath,
format=1, dist_formatter='%f', format_root_node=True)
```
%% Output
Node names look like: [Tree node '1896966' (0x7f38b054c0b), Tree node '381306' (0x7f38afdb0b9), Tree node '246195' (0x7f38afe058b), Tree node '797473' (0x7f38afe0597), Tree node 'N1' (0x7f38afe057f)] ...
New leaf names look like: ['1896966', '381306', '246195', '797473', '1766620', '555778', '1033802', '1304275', '1172194', '1579979'] ...
%% Cell type:code id: tags:
``` python
# how many taxa are in the tree and how many in the dictionary?
print("Number of taxa in the tree: ", len(wol_tree.get_leaves()), "Number of taxa in the dictionary: ", len(gene_features_dict))
```
%% Output
Number of taxa in the tree:  360 Number of taxa in the dictionary:  853
%% Cell type:markdown id: tags:
## Extract gene trees of the subset data of interest
%% Cell type:markdown id: tags:
EggNOG 6.0 has one huge file with all the trees as well as (compressed) MSA. We need just the trees, and only for the NOGs that are of interest. Extract them.
%% Cell type:code id: tags:
``` python
%%bash -s "$taxonomicID" "$data_dir"
# use ripgrep to extract the trees for eggnog subset dataset (taxonomic ID should be in second column of TSV file).
# write columns 1 and 3 to $1_all_trees.tsv
rg "\t$1\t" $2"e6.all_raw_trees_and_algs.tsv" | cut -f1,3 > $2$1"_all_trees.tsv"
```
%% Cell type:code id: tags:
``` python
# first we decide on which NOGs to keep.
# starting from the NOGs whose size is closest to the mean, we grow the selection
# (in steps of ~0.25x the SD of the NOG-size distribution) until the selected NOGs cover all the taxa to be included.
wol_tree = ete3.Tree(f"{data_dir}{taxonomicID}_wol_tree_pruned_no_internal_labels.nwk", format=1)
taxon_list = wol_tree.get_leaf_names()
taxon_set = set(taxon_list)
nog_members_tsv_filepath = f"{data_dir}{taxonomicID}_og2seqs_and_species.tsv"
nog_members_df = pd.read_csv(nog_members_tsv_filepath, sep='\t', header=None, names=['taxonomic_group',
'NOG',
'#taxa',
'#genes',
'taxa', 'genes'])
# pruning preparation
min_genes = 10 # minimum number of genes to retain a NOG
min_taxa = 30 # minimum number of taxa to retain a NOG
print("Min number of genes to retain a NOG: ", min_genes,
"and min number of taxa to retain a NOG: ", min_taxa)
# the taxa column in nog_members_df contains the taxonomic IDs that are members of the NOG, as a comma-separated string
# Split the 'taxa' column by commas to create a list of taxa in each NOG
nog_members_df['taxa'] = nog_members_df['taxa'].str.split(',')
# for the column 'genes' in nog_members_df, replace the periods with underscores
nog_members_df['genes'] = nog_members_df['genes'].str.replace('.', '_')
# there are mistakes in the 'genes' column, where some underscores are replaced with colon characters. Replace these with underscores
nog_members_df['genes'] = nog_members_df['genes'].str.replace(':', '_')
# Split the 'genes' column by commas to create a list of genes in each NOG
nog_members_df['genes'] = nog_members_df['genes'].str.split(',')
# replace the first underscore in 'genes' with a period, to get the taxon ID
nog_members_df['genes'] = nog_members_df['genes'].apply(
lambda genes: [gene.replace('_', '.', 1) for gene in genes])
# Filter the list of genes based on the taxon_set
# Use the first part of a period-split to check if the taxonomic ID is in the taxon_set, and retain only those genes
nog_members_df['genes'] = nog_members_df['genes'].apply(
lambda genes: [gene for gene in genes if gene.split('.')[0] in taxon_set])
# Create a new column that contains the number of genes to be retained in the gene tree
nog_members_df['#genes_to_keep'] = nog_members_df['genes'].str.len()
# remove rows where #genes_to_keep is less than min_genes
nog_members_df = nog_members_df[(nog_members_df['#genes_to_keep'] >= min_genes)]
# similarly, prune the taxa list column to only keep the taxa that are in the taxon_list
nog_members_df['taxa'] = nog_members_df['taxa'].apply(
lambda taxa: [taxon for taxon in taxa if taxon in taxon_set])
# Sort the df by the absolute difference between #genes_to_keep and its mean
# This will give us the NOGs that are closest to the mean
mean_genes_to_keep = nog_members_df['#genes_to_keep'].mean()
nog_members_df['diff_from_mean'] = abs(
nog_members_df['#genes_to_keep'] - mean_genes_to_keep)
nog_members_df = nog_members_df.sort_values(by='diff_from_mean')
# find std dev of #genes_to_keep in members_df
nog_members_df_std_dev = nog_members_df['#genes_to_keep'].std()
# display mean and std dev
print("Mean #genes_to_keep: ", nog_members_df['#genes_to_keep'].mean(
), "Std dev of #genes_to_keep: ", nog_members_df_std_dev)
# Flatten the list of taxa in the dataframe
all_taxa_in_df = set(
[taxon for sublist in nog_members_df['taxa'].tolist() for taxon in sublist])
# Check if all taxa in taxon_list are represented in nog_members_df
if not set(taxon_list).issubset(all_taxa_in_df):
print("Not all taxa in taxon_list are represented in nog_members_df")
print("Taxa not represented: ", set(taxon_list) - all_taxa_in_df)
else:
# retain only NOGs with at least min_taxa taxa
nog_members_df = nog_members_df[nog_members_df['#taxa'] >= min_taxa]
# check if the smallest n NOGs contain all the taxa in taxon_list, if not, increase n by 100 and repeat
# start with a quarter of the std dev of #genes_to_keep in nog_members_df, rounded to closest higher 100
n = int(np.ceil(nog_members_df_std_dev/4/100)*100)
print("Starting with n = ", n,
" as ~quarter of the std dev of #genes_to_keep in nog_members_df")
while True:
# make sure the loop doesn't run forever
if n > len(nog_members_df):
print("n is greater than the number of NOGs in the nog_members_df")
break
# get the smallest n NOGs
nog_members_df_subset = nog_members_df.head(n)
# create a set of all the taxa in the subset
taxa_in_subset = set(
[taxon for sublist in nog_members_df_subset['taxa'].tolist() for taxon in sublist])
# check if the taxon_list is a subset of taxa_in_subset
if set(taxon_list).issubset(taxa_in_subset):
print("At n = ", n, ", taxon_list is a subset of taxa_in_subset, with total of ", len(
taxa_in_subset), " taxa")
break
else:
n += 100
# replace the nog_members_df with the subset, and rewrite the nog_members_df file
nog_members_df = nog_members_df_subset
# before writing out the nog_members_df, remove the columns that are not needed
nog_members_df = nog_members_df.drop(columns=['diff_from_mean'])
# also join the lists into comma-separated strings
nog_members_df['taxa'] = nog_members_df['taxa'].apply(lambda x: ','.join(x))
nog_members_df['genes'] = nog_members_df['genes'].apply(lambda x: ','.join(x))
nog_members_df.to_csv(f"{data_dir}{taxonomicID}_nog_members.tsv",
sep="\t", index=False, header=False)
# split the taxa and genes comma-separated strings back into lists
nog_members_df['taxa'] = nog_members_df['taxa'].str.split(',')
nog_members_df['genes'] = nog_members_df['genes'].str.split(',')
# display the last 5 rows of the nog_members_df
display(nog_members_df.tail())
# write out the list of taxa in this subset
with open(f"{data_dir}{taxonomicID}_subset_taxa.txt", "w") as taxa_fo:
taxa_fo.write('\n'.join(taxon_list) + '\n')
```
%% Output
Min number of genes to retain a NOG: 10 and min number of taxa to retain a NOG: 30
Mean #genes_to_keep:  84.05419908773813 Std dev of #genes_to_keep:  178.40979118790855
Starting with n = 100 as ~quarter of the std dev of #genes_to_keep in nog_members_df
At n =  1300 , taxon_list is a subset of taxa_in_subset, with total of  360  taxa
%% Cell type:code id: tags:
``` python
# Read in the TSV file with NOG to trees mapping
trees_df = pd.read_csv(f"{data_dir}{taxonomicID}_all_trees.tsv",
sep="\t", names=['NOG', 'tree'])
# Display length of the DataFrame
print("Length of trees_df: ", len(trees_df))
# Remove NOGs that are not in the nog_members_df
trees_df = trees_df[trees_df['NOG'].isin(nog_members_df['NOG'])]
# Display length of the new trees_df
print("Length of trees_df after filtering: ", len(trees_df))
# Overwrite the subset_trees.tsv file with the new trees
trees_df.to_csv(f"{data_dir}{taxonomicID}_subset_trees.tsv",
sep="\t", index=False, header=False)
```
%% Output
Length of trees_df: 103547
Length of trees_df after filtering: 1300
%% Cell type:code id: tags:
``` python
# read in wol_tree and make a list of taxa in the tree
wol_tree_filepath = f'{data_dir}{taxonomicID}_wol_tree_pruned_no_internal_labels.nwk'
wol_tree = ete3.Tree(wol_tree_filepath, format=1)
wol_tree_taxa = wol_tree.get_leaf_names()
# read in list of locus tags with features
all_genes_list_filepath = f'{data_dir}{taxonomicID}_locus_tags_with_features.txt'
with open(all_genes_list_filepath, 'r') as in_file:
all_genes_list = in_file.read().splitlines()
# create a list of locus_tags that are in the wol_tree_taxa
all_locus_tags = [locus_tag for locus_tag in all_genes_list if locus_tag.split('.', 1)[0] in wol_tree_taxa]
print(f'Number of locus_tags considering the wol_tree_taxa: {len(all_locus_tags)}. Looks like: {all_locus_tags[:5]}')
# write this to a file
all_locus_tags_filepath = f'{data_dir}{taxonomicID}_subset_genes.txt'
with open(all_locus_tags_filepath, 'w') as out_file:
out_file.write('\n'.join(all_locus_tags) + '\n')
```
%% Output
Number of locus_tags considering the wol_tree_taxa: 1032613. Looks like: ['1005090.BAKON_001', '1005090.BAKON_002', '1005090.BAKON_003', '1005090.BAKON_004', '1005090.BAKON_005']
%% Cell type:code id: tags:
``` python
# prune trees
# read in the subset trees
subset_trees_filepath = f'{data_dir}{taxonomicID}_subset_trees.tsv'
subset_trees_df = pd.read_csv(subset_trees_filepath, sep='\t', header=None, names=['NOG', 'tree'])
# testing
# subset_trees_df = subset_trees_df.head(10)
# read in the subset genes
subset_genes_filepath = f'{data_dir}{taxonomicID}_subset_genes.txt'
with open(subset_genes_filepath, 'r') as in_file:
subset_genes_list = in_file.read().splitlines()
def apply_prune_tree(args):
"""
fn for multiprocessing pool to prune trees
"""
index, row, subset_genes_list = args
tree_newick = row['tree'].strip()
# read in the tree
tree = ete3.Tree(tree_newick, format=1)
# get the leaf names
leaf_names = tree.get_leaf_names()
# find the genes that are in the tree
leaves_to_keep = [leaf for leaf in leaf_names if leaf in subset_genes_list]
# print(f'Number of leaves in tree: {len(leaf_names)}, Number of leaves to keep: {len(leaves_to_keep)}')
# prune the tree
    if len(leaves_to_keep) <= 4:
return None
elif len(leaves_to_keep) == len(leaf_names):
return tree.write(format=1)
else:
tree.prune(leaves_to_keep)
return tree.write(format=1)
# create a list of tuples to pass to the pool (use a set for fast membership tests)
subset_genes_set = set(subset_genes_list)
args = [(index, row, subset_genes_set) for index, row in subset_trees_df.iterrows()]
max_proc = 2*mp.cpu_count()
with mp.Pool(max_proc) as pool:
subset_trees_df['pruned_tree'] = list(pool.map(apply_prune_tree, args))
# remove rows where pruned_tree is None
print(f'Number of trees before pruning: {len(subset_trees_df)}')
subset_trees_df = subset_trees_df[subset_trees_df['pruned_tree'].notnull()]
print(f'Number of trees after pruning: {len(subset_trees_df)}')
# write the NOG: pruned_tree mapping to a file
pruned_trees_filepath = f'{data_dir}{taxonomicID}_pruned_gene_trees.tsv'
subset_trees_df[['NOG', 'pruned_tree']].to_csv(pruned_trees_filepath, sep='\t', index=False, header=False)
```
%% Output
Number of trees before pruning: 1300
Number of trees after pruning: 1300
%% Cell type:code id: tags:
``` python
# check if the taxa in wol tree, and taxa in set of all gene trees, are the exact same sets
# read in the pruned gene trees file
gene_trees_filepath = f'{data_dir}{taxonomicID}_pruned_gene_trees.tsv'
with open(gene_trees_filepath, 'r') as in_file:
    pruned_gene_trees = [i.split()[1].strip() for i in in_file.readlines()]
# create a set of all taxa in the gene trees
all_taxa_in_gene_trees = set()
for tree in pruned_gene_trees:
gene_tree = ete3.Tree(tree, format=1)
all_genes = gene_tree.get_leaf_names()
all_taxa_in_gene_trees.update([gene.split('.', 1)[0] for gene in all_genes])
# read in the wol tree
wol_tree_no_internal_labels_filepath = f'{data_dir}{taxonomicID}_wol_tree_pruned_no_internal_labels.nwk'
wol_tree_with_internal_labels_filepath = f'{data_dir}{taxonomicID}_wol_tree_pruned_with_internal_labels.nwk'
wol_tree_angst_filepath = f'{data_dir}{taxonomicID}_wol_tree_pruned_angst.nwk'
wol_tree = ete3.Tree(wol_tree_no_internal_labels_filepath, format=1)
wol_tree_taxa = set(wol_tree.get_leaf_names())
# print which taxa are in wol tree but not in gene trees and vice versa
print(f'Taxa in wol tree but not in gene trees: {len(wol_tree_taxa - all_taxa_in_gene_trees)} \nand vice versa: {len(all_taxa_in_gene_trees - wol_tree_taxa)}')
print(f'They look like this: {list(wol_tree_taxa - all_taxa_in_gene_trees)[:5]} and {list(all_taxa_in_gene_trees - wol_tree_taxa)[:5]}')
# prune all the variants of the wol_tree to only include the taxa in the gene trees
wol_tree_taxa_to_keep = wol_tree_taxa.intersection(all_taxa_in_gene_trees)
print(f'Number of taxa to keep in the wol tree: {len(wol_tree_taxa_to_keep)}')
# prune the wol tree to only include the taxa in the gene trees
wol_tree.prune(wol_tree_taxa_to_keep)
wol_tree.write(outfile=wol_tree_no_internal_labels_filepath, format=1, format_root_node=True, dist_formatter='%.10f')
# write a version of the tree with a root branch length for AnGST
wol_tree.write(outfile=wol_tree_angst_filepath,
format=1, dist_formatter='%f', format_root_node=True)
# find the last ")" in this file and replace everything after it with ":0.0);".
# AnGST doesn't like the branch length at the root to be like 0.00000
with open(wol_tree_angst_filepath, "r") as fi:
tree_str = fi.read()
tree_str = tree_str[:tree_str.rfind(")")+1] + ":0.0);"
with open(wol_tree_angst_filepath, "w") as fo:
fo.write(tree_str)
# read in the wol tree with internal labels, prune it, and overwrite it
wol_tree = ete3.Tree(wol_tree_with_internal_labels_filepath, format=1)
wol_tree.prune(wol_tree_taxa_to_keep)
# relabel the internal node names
internal_node_index = 1
for node in wol_tree.traverse(strategy="postorder"):
if not node.is_leaf():
node.name = f"N{internal_node_index}"
internal_node_index += 1
wol_tree.write(outfile=wol_tree_with_internal_labels_filepath, format=1, format_root_node=True, dist_formatter='%.10f')
```
%% Output
Taxa in wol tree but not in gene trees: 1
and vice versa: 0
They look like this: ['243277'] and []
Number of taxa to keep in the wol tree: 359
%% Cell type:code id: tags:
``` python
# write a new NOG members file for the subset of the NOGs and the genes in the subset
# read in original NOG members file
nog_members_tsv_filepath = f"{data_dir}{taxonomicID}_nog_members.tsv"
nog_members_df = pd.read_csv(nog_members_tsv_filepath, sep='\t', header=None, names=['taxonomic_group',
'NOG',
'#taxa',
'#genes',
'taxa', 'genes', '#genes_to_keep'])
# remove NOGs not in subset_trees_df
subset_trees_df = pd.read_csv(f"{data_dir}{taxonomicID}_pruned_gene_trees.tsv", sep="\t", header=None, names=['NOG', 'tree'])
nog_members_df = nog_members_df[nog_members_df['NOG'].isin(subset_trees_df['NOG'])]
# in `taxa` column keep only taxa that are in wol_tree_taxa_to_keep
nog_members_df['taxa'] = nog_members_df['taxa'].apply(
lambda taxa: [taxon for taxon in taxa.split(',') if taxon in wol_tree_taxa_to_keep])
# in `genes` column keep only genes that are from taxa in wol_tree_taxa_to_keep
nog_members_df['genes'] = nog_members_df['genes'].apply(
lambda genes: [gene for gene in genes.split(',') if gene.split('.', 1)[0] in wol_tree_taxa_to_keep])
# update the #taxa and #genes columns by counting the length of the lists in `taxa` and `genes` columns
nog_members_df['#taxa'] = nog_members_df['taxa'].apply(len)
nog_members_df['#genes'] = nog_members_df['genes'].apply(len)
# drop the #genes_to_keep column
nog_members_df = nog_members_df.drop(columns=['#genes_to_keep'])
# join the lists into comma-separated strings
nog_members_df['taxa'] = nog_members_df['taxa'].apply(lambda x: ','.join(x))
nog_members_df['genes'] = nog_members_df['genes'].apply(lambda x: ','.join(x))
# overwrite the NOG members file
nog_members_df.to_csv(f"{data_dir}{taxonomicID}_nog_members.tsv", sep="\t", index=False, header=False)
```
%% Cell type:markdown id: tags:
## Root all of these gene trees using MAD
The command below can be run and left to finish: it keeps running in the background (because of `nohup` and `disown`) even if you lose connection to the server. This process can take on the order of hours to finish for a list on the order of 10k trees.
```bash
# cd 01-prepare_data
$ nohup python src/run_MAD_on_EggNOG_parallel.py -i ../data/1236_pruned_gene_trees.tsv -m ~/bin/mad/mad -n 100 > ../data/nohup_mad_eggnog_rooting_1236.log & disown
```
**NOTE**: Pruning took roughly 1.5 hours for 15k trees
%% Cell type:markdown id: tags:
## Download genome sequence files for each of the taxa in dataset
%% Cell type:markdown id: tags:
```bash
nohup python src/download_ncbi_genome_sequences.py -i ../data/1236_subset_taxa.txt --to-download genome -a ../data/genome_data/1236_all_accession_ids.tsv > nohup_genome_dload.log & disown
```
%% Cell type:code id: tags:
``` python
# list all the files downloaded into a TSV file with columns genome_accession and fna_filepath
genome_data_dir = f'{data_dir}/genome_data/'
genome_data_dir_fullpath = os.path.abspath(genome_data_dir)
genome_data_files = [i for i in os.listdir(genome_data_dir) if i.endswith('.fna')]
# filename basenames are of the form {taxonID}_{GCA_xxxxxxx.n}_{genomeAssembly}_genomic.fna
# we want to extract the genome accession from this
genome_accession_filepath_tuples_list = [('_'.join(i.split('_')[1:3]), os.path.join(genome_data_dir_fullpath, i)) for i in genome_data_files]
genome_accessions_filepaths_df = pd.DataFrame(genome_accession_filepath_tuples_list, columns=['genome_accession', 'fna_filepath'])
genome_accessions_filepaths_df.to_csv(f'{data_dir}{taxonomicID}_genome_fna_filepaths.tsv', sep='\t', index=False, header=True)
print(f'Number of genome accessions: {len(genome_accessions_filepaths_df)}, and they look like:')
display(genome_accessions_filepaths_df.head())
```
%% Output
Number of genome accessions: 1473, and they look like:
%% Cell type:code id: tags:
``` python
# read in this rooted pruned trees file, and replace the leaf labels
# such that taxID.geneID_geneID2 is replaced with taxID_geneID_geneID2
# function to replace the first dot with an underscore in a leaf label
def replace_first_dot_with_underscore_in_leaf_label(tree_line):
tree_ID, tree = tree_line.split('\t')
gene_tree = ete3.Tree(tree, format=1)
for leaf in gene_tree:
leaf.name = leaf.name.replace('.', '_', 1)
# add a branch length of 0.0 to the root node
gene_tree.get_tree_root().dist = 0.0
return tree_ID+'\t'+gene_tree.write(format=1, dist_formatter='%f', format_root_node=True)
# create a pool of workers and stream the lines through it
# (imap preserves input order; calling apply_async().get() per line would serialize the work)
max_workers = 500
pruned_rooted_gene_trees_tsv_filepath = f'{data_dir}{taxonomicID}_pruned_gene_trees.tsv.rooted'
# open both the input and output file and apply the function to each line in the input file
with mp.Pool(max_workers) as pool, \
        open(pruned_rooted_gene_trees_tsv_filepath) as infile, \
        open(f"{pruned_rooted_gene_trees_tsv_filepath}.underscored", 'w') as outfile:
    for result in pool.imap(replace_first_dot_with_underscore_in_leaf_label,
                            (line.strip() for line in infile)):
        outfile.write(result + '\n')
```
%% Cell type:code id: tags:
``` python
# create a TSV file with gene ID, start position, end position, strand, and genome accession ID
# for gene IDs in the gene features json for the subset of genes in the gene trees
def return_gene_features(args):
"""
    fn for multiprocessing pool; returns gene features for a taxon as a list of lists
"""
taxon, genes_of_taxon, taxon_gene_features_dict = args
gene_features_lists = []
for gene in genes_of_taxon:
if gene in taxon_gene_features_dict['genes']:
try:
gene_features = taxon_gene_features_dict['genes'][gene]
gene_features_lists.append([gene, gene_features['id'],
gene_features['start'], gene_features['end'],
gene_features['strand'], gene_features['seqid'],
taxon_gene_features_dict['acc_id'], taxon])
except KeyError:
                print(f'KeyError for gene {gene} in taxon {taxon}, with accession {taxon_gene_features_dict["acc_id"]}')
return gene_features_lists
with open(f'{data_dir}{taxonomicID}_gene_features.json', 'r') as in_file:
gene_features_dict = json.load(in_file)
with open(f'{data_dir}{taxonomicID}_subset_taxa.txt', 'r') as in_file:
subset_taxa_list = in_file.read().splitlines()
args = [(taxon, list(gene_features_dict[taxon]['genes'].keys()), gene_features_dict[taxon]) for taxon in subset_taxa_list]
max_workers = len(args)
gene_features_tsv_filepath = f'{data_dir}{taxonomicID}_gene_features.tsv'
header_list = ['locus_tag', 'gene_id', 'start', 'end', 'strand', 'seqid', 'genome_accession', 'taxon_id']
BUFFER_SIZE = 1000 # Keeping a buffer to write to file in chunks
# the buffer is important otherwise IOStream will be the bottleneck
def write_buffer(buffer, out_file):
out_file.write('\n'.join(buffer) + '\n')
buffer.clear()
with open(gene_features_tsv_filepath, 'w') as out_file:
# write the header
out_file.write('\t'.join(header_list) + '\n')
# as you get the results, write them to the file instead of storing them in memory
buffer = []
with mp.Pool(max_workers) as pool:
for result in pool.imap(return_gene_features, args):
for gene_features in result:
buffer.append('\t'.join(map(str, gene_features)))
if len(buffer) >= BUFFER_SIZE:
write_buffer(buffer, out_file)
if buffer:
write_buffer(buffer, out_file)
```
%% Cell type:code id: tags:
``` python
def return_gene_features(gene):
    taxon, locus_tag = gene.split('.', 1)
    features = gene_features_dict[taxon]['genes'][locus_tag]
    return [
        gene,
        features['id'],
        features['seqid'],
        features['start'],
        features['end'],
        features['strand'],
        gene_features_dict[taxon]['acc_id'],
    ]
# create a pool of worker processes
# (materialize the results while the pool is still open, and keep the column
# order in sync with the list returned above)
with mp.Pool(int(mp.cpu_count()/2)) as p:
    subset_gene_features_list = list(p.imap_unordered(return_gene_features, subset_genes_list))
subset_gene_features_df = pd.DataFrame(subset_gene_features_list,
                                       columns=['locus_tag', 'gene_id', 'seqid', 'start', 'end', 'strand', 'genome_accession'])
# write this to a file
subset_gene_features_df.to_csv(f'{data_dir}{taxonomicID}_subset_gene_features.tsv', sep='\t', index=False)
```
%% Cell type:markdown id: tags:
## Create Presence-Absence (PA) matrices for GLOOME and Count
%% Cell type:code id: tags:
``` python
def nogs_pa_matrix(taxonomicID) -> None:
'''
use the members.tsv file of the NOGs from eggNOG, to prepare the pa_matrix
'''
import pandas as pd
import numpy as np
import os
# read in the members.tsv file, but only the NOG, and the taxa columns (column 1 and 4)
members_df_subset = pd.read_csv(f"{data_dir}/{taxonomicID}_nog_members.tsv", sep='\t', header=None,
names=['NOG', 'taxa', 'genes'], usecols=[1, 4, 5])
# split the taxa column into a list of taxa
members_df_subset['taxa'] = members_df_subset['taxa'].apply(
lambda x: x.split(','))
# similar for genes column
members_df_subset['genes'] = members_df_subset['genes'].apply(
lambda x: x.split(','))
# create pa matrix based on the presence-absence of the taxa in the NOGs
# first create a list of all the taxa
taxa = list(
set([item for sublist in members_df_subset['taxa'].tolist() for item in sublist]))
# sort the list
taxa.sort()
# display how the list looks like
print("The taxa list looks like this: {}".format(taxa[:10]), "...")
# create a dataframe with the NOGs as columns and the taxa as index
# fill the dataframe with 0s as strings
pa_df = pd.DataFrame(
index=taxa, columns=members_df_subset['NOG'], dtype=str).fillna('0')
# fill in the presence-absence matrix based on whether the taxa are present in the list of taxa for each NOG
for index, row in members_df_subset.iterrows():
for taxon in row['taxa']:
            # count how many (period-delimited) genes in the corresponding genes column
            # have this taxon as the first element (i.e. the taxon prefix of the gene name)
pa_df.loc[taxon, row['NOG']] = str(
int(len([gene for gene in row['genes'] if gene.split('.', 1)[0] == taxon])))
# display the size of the matrix/df
print("The size of the pa_df is: {}".format(pa_df.shape))
# TSV file for COUNT, with NOGs in index and taxa as columns
pa_df.T.to_csv(f"{data_dir}/{taxonomicID}_pa_matrix.tsv", sep='\t')
# now binarize the matrix (i.e. if a NOG has at least one gene from a taxon, it gets a 1, otherwise 0)
pa_df = pa_df.map(lambda x: '1' if x != '0' else '0')
# Concatenate the columns into a single string per taxon and write out as FASTA file
fasta_df = pd.DataFrame(pa_df.astype(str).apply(
''.join, axis=1), columns=['sequence'])
fasta_df.index = '>' + fasta_df.index.str.strip()
fasta_df.to_csv(
f"{data_dir}{taxonomicID}_pa_matrix.fasta", sep='\n', header=False)
return None
# run function
nogs_pa_matrix(taxonomicID)
```
%% Output
The taxa list looks like this: ['1005057', '1005058', '1005090', '1006000', '1009858', '1026882', '1028989', '1033802', '1036674', '1045855'] ...
The size of the pa_df is: (359, 1300)
%% Cell type:code id: tags:
``` python
%%bash -s "$data_dir"
# extract the function profiles for the NOGs in our dataset from the eggnog6 database
# remove entries with no function profile
rg -v "\{\}" $1"/eggnog6/e6.func_profiles.json" > $1"/eggnog6/e6.func_profiles.available.json"
# get function profiles for nog members in our dataset
rg -f <(cut -f2 $1"/1236_nog_members.tsv" ) $1"/eggnog6/e6.func_profiles.available.json" > $1"/1236.func_profiles.json"
```
%% Cell type:markdown id: tags:
Run `download_ncbi_genome_sequences.py` like so:
```bash
nohup ~/mambaforge/envs/hgt_analyses/bin/python src/download_ncbi_genome_sequences.py -i ../data/1236_subset_taxa.txt > ../data/nohup_download_genome_seqs.out & disown
```
By default, `-o` output dir is `../data/genome_sequences/` where fasta and gff files are downloaded for each taxon in the list.
Each of these files is of the form `{NCBI_taxon_ID}_{NCBI_genome_accession_ID}.{extension}`
The TSV file `genome_sequences/1236_subset_accession_ids.tsv` lists the mapping between taxon_ID and accession_ID, as well as other information such as assembly name and source DB (sometimes it is not REFSEQ).
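As a quick sanity check, this mapping can be loaded into a dict (a minimal sketch; it assumes the subset TSV uses the same `taxon_id`/`acc_id` column names as the `1236_all_accession_ids.tsv` file read earlier in this notebook):
```python
import pandas as pd

# hypothetical path following the description above; adjust to your output dir
acc_df = pd.read_csv('../data/genome_sequences/1236_subset_accession_ids.tsv',
                     sep='\t', header=0)
acc_df['taxon_id'] = acc_df['taxon_id'].astype(str)
taxon_to_acc = acc_df.set_index('taxon_id')['acc_id'].to_dict()
print(list(taxon_to_acc.items())[:3])
```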
%% Cell type:code id: tags:
``` python
# import libraries
import itertools
```
%% Cell type:code id: tags:
``` python
# Then, prepare a multiple sequence fasta file for all of these genomes, and similarly such a file for all of the genes of interest.
# for the genomes, we just need to concatenate all the fasta files
# in the genome_sequences directory, i.e. the output dir of the previous script
# for the genes of interest, we need to first prepare a list of all the gene names.
# for this, first read in the members.tsv file
members_tsv_filepath = '../data/1236_nog_members.tsv'
# read only the 6th column (CSV lists of the gene names)
with open(members_tsv_filepath) as fo:
flines = fo.readlines()
gene_names = [line.split('\t')[5] for line in flines]
# now, split the gene names into a list of lists
gene_names = [gn.split(',') for gn in gene_names]
# flatten this huge list of lists efficiently
gene_names = list(itertools.chain.from_iterable(gene_names))
# remove duplicates
gene_names = list(set(gene_names))
print(f'Found {len(gene_names)} unique gene names. List looks like this:', gene_names[:10])
# write the gene names to a file, replacing in the members.tsv filepath, 'members' with 'genes' and 'tsv' with 'list'
gene_names_filepath = members_tsv_filepath.replace(
'members', 'genes').replace('tsv', 'list')
print(f'Writing gene names to {gene_names_filepath}')
with open(gene_names_filepath, 'w') as fo:
fo.write('\n'.join(gene_names) + '\n')
```
%% Output
Found 156509 unique gene names. List looks like this: ['291331.XOO2297', '225937.HP15_481', '573983.B0681_04825', '318167.Sfri_1100', '291331.XOO1980', '472759.Nhal_3786', '1798287.A3F14_06600', '1354304.XPG1_2379', '28173.VIBNI_B0337', '1461581.BN1049_02975']
Writing gene names to ../data/1236_nog_genes.list
%% Cell type:markdown id: tags:
*(The same commit also updates `src/run_MAD_on_EggNOG_parallel.py`: the code is re-wrapped PEP 8-style, and the per-chunk writing of the split `.nwk`/`.info` files is refactored into a `write_split_files` helper that handles both the full chunks and the final partial chunk. The per-split stdout logs are still combined with `tail -n +1` into a timestamped `tmp_mad_rooting_splits_*.log`, and the deletion of the tmp folder remains commented out.)*
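A sketch of that helper, as reconstructed from the commit (in the script it is nested inside `MAD_roots_NOG_trees_parallel` and closes over `tmp_dir`):
```python
# Reconstructed from the commit diff; assumes `tmp_dir` is defined in the
# enclosing scope, as it is in the script.
def write_split_files(chunk, chunk_index, nwk_split_files, info_split_files):
    """Helper function to write out the split files for a given chunk."""
    split_chunk = [line.split() for line in chunk]
    nwk_split_filepath = f"{tmp_dir}/tmp_tree_split.nwk.{chunk_index}"
    info_split_filepath = f"{tmp_dir}/tmp_tree_split.info.{chunk_index}"
    # second column: the newick tree itself
    with open(nwk_split_filepath, 'w') as split_out_fo:
        split_out_fo.write('\n'.join(line[1] for line in split_chunk) + '\n')
    # first column: the tree ID
    with open(info_split_filepath, 'w') as split_out_fo:
        split_out_fo.write('\n'.join(line[0] for line in split_chunk) + '\n')
    # track the filenames
    nwk_split_files.append(nwk_split_filepath)
    info_split_files.append(info_split_filepath)
    return nwk_split_files, info_split_files
```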
%% Cell type:markdown id: tags:
# Run the HGT-inference programs
It is advised to run each program's script separately, since they are very time-consuming. For some, multiprocessing lets us split up the work to reduce the running time. We run these on an HPC with ~50 threads each.
%% Cell type:code id: tags:
``` python
taxonomicID = '1236'
data = '/root/work/projects/paper_comparative_study_of_hgt_inference_methods/data/'
```
%% Cell type:markdown id: tags:
**Note:**
For each of the scripts run below of the form `run_{inference_program}.py` (e.g. `run_ALE.py`),
you can use the `-h` flag to see the help message for the script.
For example, run `python run_ALE.py -h`.
These scripts are run with their default parameters. You can see exactly which parameters or files are being used by opening a script and looking at its `parser.add_argument` calls. If you want to use different files, prepare files with the same format as those used in the scripts and pass them as arguments.
%% Cell type:code id: tags:
``` python
%%bash
# make directories for programs
mkdir -p program_runs
cd program_runs
mkdir -p ALE AnGST RANGER RANGER-Fast Count GLOOME_with_tree GLOOME_without_tree
```
%% Cell type:markdown id: tags:
## Ranger
From the manual for RANGER-DTL 2.0, the program takes as input a single file (specified using the `-i` command line option) containing first the species tree, followed by a single gene tree. All input trees must be expressed in Newick format terminated by a semicolon, and they must be fully binary (fully resolved) and rooted. Species names in the species tree must be unique. E.g., ((speciesA, speciesB), speciesC);
Each leaf in the gene tree must be labeled with the name of the species from which that gene was sampled. If desired, the gene name can be appended to the species name separated by an underscore `_` character. The gene tree may contain any number (zero, one, or more) of homologous genes from the same species. E.g., (((speciesA_gene1, speciesC_gene1), speciesB_geneX), speciesC_gene2);
The following script (`src/run_rangerdtl.py`) runs Ranger on each of the gene trees. First it creates an input file for each gene tree in our dataset by concatenating the species tree and the gene tree. Then, using `multiprocessing`, it runs `Ranger-DTL-Fast` (from `SupplementaryPrograms/`) and `AggregateRanger` (from `CorePrograms/`) from the dir with the Ranger executables. Note that for `AggregateRanger`, the output reconciliation files of `Ranger-DTL-Fast` need to be named `{prefix}{index}`, where in our case the prefix is `recon` and the indices begin at 1. The path to these output files is then provided to `AggregateRanger` via the prefix only. See the comments in the script for more information.
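For illustration, a minimal sketch of how one such combined input file could be assembled (hypothetical filenames; the actual logic lives in `src/run_rangerdtl.py`):
```python
# Hedged sketch: concatenate the rooted species tree and one gene tree into a
# single RANGER input file. Filenames and the gene tree are hypothetical examples.
species_tree_file = '../data/1236_wol_tree_pruned_no_internal_labels.nwk'
gene_tree_newick = '((1005090_BAKON_001, 246195_geneA), 381306_geneB);'  # toy example

with open(species_tree_file) as fi:
    species_tree = fi.read().strip()
with open('ranger_input_1.nwk', 'w') as fo:
    fo.write(species_tree + '\n' + gene_tree_newick + '\n')
```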
Run Ranger using the script `run_rangerdtl.py` in the following manner:
```bash
# cd RANGER
$ nohup python ../src/run_rangerdtl.py > nohup_ranger.out & disown
```
### Ranger-Fast
Run Ranger-Fast using the same script but with `--fast` flag:
```bash
# cd RANGER-Fast
$ nohup python ../src/run_rangerdtl.py --fast > nohup_ranger_fast.out & disown
```
%% Cell type:markdown id: tags:
## AnGST
%% Cell type:markdown id: tags:
AnGST needs the species tree to be rooted AND to have an additional branch length for the root (prepared in the previous step's notebook; the required species tree file is named `*_with_root_length.nwk`).
If you alter the scripts below, note that `AnGST.py` can only be run with Python 2.
We need an input file for `AnGST.py`, listing the paths to the species tree, gene tree, output directory, and penalty file. Example:
%% Cell type:raw id: tags:
species=/path/X_species_edited_angst.nwk
gene=/path/one_unrooted_genetree.nwk
output=/path/output_result
penalties=/path/penalty.file
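%% Cell type:markdown id: tags:
A minimal sketch of writing one such input file per gene tree (the paths below are placeholders; `src/run_angst.py` generates these for the whole dataset):
```python
# Sketch: write an AnGST input file listing the four required paths.
def write_angst_input(species_tree, gene_tree, output_dir, penalty_file, input_path):
    with open(input_path, "w") as fh:
        fh.write(f"species={species_tree}\n")
        fh.write(f"gene={gene_tree}\n")
        fh.write(f"output={output_dir}\n")
        fh.write(f"penalties={penalty_file}\n")

write_angst_input(
    species_tree="1236_wol_tree_with_root_length.nwk",  # placeholder paths
    gene_tree="one_unrooted_genetree.nwk",
    output_dir="output_result",
    penalty_file="angst_penalty_file.txt",
    input_path="angst_input_0001.txt",
)
```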
%% Cell type:markdown id: tags:
We create the penalty file based on the parameters suggested in the manual.
%% Cell type:code id: tags:
``` python
%%bash -s "$taxonomicID" "$data"
# create AnGST directory if it doesn't exist
mkdir -p $2/program_runs/AnGST
# write penalty file
# these params were taken from the AnGST manual
cat > ./src/angst_penalty_file.txt << EOL
hgt: 3.0
dup: 2.0
los: 1.0
spc: 0.0
EOL
```
%% Cell type:markdown id: tags:
Run AnGST using the script `run_angst.py` in the following manner:
```bash
$ cd AnGST
# make sure you activate the python2 environment
$ mamba activate hgt_analyses_py2
# run the script
$ nohup ~/mambaforge/envs/hgt_analyses_py2/bin/python ../src/run_angst.py > nohup_run_angst.out & disown
```
%% Cell type:markdown id: tags:
## ALE
The script here makes use of the direct compilation of ALE (instead of using `Docker`). Run the script from a terminal, from inside the `ALE` directory within the `program_runs` directory. For each gene tree, the script spawns a new process in which the gene tree is written to a new file, `ALEobserve` creates the `.ale` file, and `ALEml_undated` performs the reconciliation. The ALE programs occasionally fail for no particular reason, so the script retries each ALE step a number of times before giving up. Use `--help` to see the available options and their default values.
```bash
# cd ALE
$ nohup python ../src/run_ALE.py > run_ALE_nohup.out & disown
```
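A minimal sketch of the retry behaviour described above (the retry count and file names are illustrative; see `run_ALE.py` for the actual defaults):
```python
# Sketch: run one ALE step, retrying a bounded number of times
# before giving up, since the ALE programs occasionally fail sporadically.
import subprocess

def run_with_retries(cmd, max_retries=3):
    for attempt in range(1, max_retries + 1):
        result = subprocess.run(cmd, capture_output=True, text=True)
        if result.returncode == 0:
            return True
        print(f"attempt {attempt}/{max_retries} failed for {cmd[0]}")
    return False

# Per gene tree, the script effectively does:
#   run_with_retries(["ALEobserve", "genetree_0001.nwk"])
#   run_with_retries(["ALEml_undated", "species.nwk", "genetree_0001.nwk.ale"])
```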
%% Cell type:markdown id: tags:
## GLOOME and Count
We use the presence-absence (PA) matrix prepared earlier, based on the PA of taxa in the NOGs of interest, as a FASTA input along with the species tree (and, in a separate run, without the tree) to infer gains and losses. Count uses the same PA data, but in TSV format. Note that Count requires each row to be a gene family (in our case, a NOG), whereas GLOOME requires each row (i.e. each sequence in the FASTA file) to be a taxon.
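A minimal sketch of the two orientations, using toy data (the real matrices were written in the previous notebook):
```python
# Sketch: one binary PA matrix in the two layouts required by
# GLOOME (FASTA, one sequence per taxon) and Count (TSV, one row per NOG).
nog_to_taxa = {"NOG0001": {"taxonA", "taxonB"}, "NOG0002": {"taxonB"}}  # toy example
taxa = sorted({t for ts in nog_to_taxa.values() for t in ts})
nogs = sorted(nog_to_taxa)

# GLOOME: each taxon is a FASTA record; position j is 1 if the taxon has NOG j
with open("pa_matrix.fasta", "w") as fa:
    for t in taxa:
        profile = "".join("1" if t in nog_to_taxa[n] else "0" for n in nogs)
        fa.write(f">{t}\n{profile}\n")

# Count: each NOG is a TSV row; column j is 1 if taxon j has the NOG
with open("pa_matrix.tsv", "w") as tsv:
    tsv.write("family\t" + "\t".join(taxa) + "\n")
    for n in nogs:
        row = "\t".join("1" if t in nog_to_taxa[n] else "0" for t in taxa)
        tsv.write(f"{n}\t{row}\n")
```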
%% Cell type:markdown id: tags:
### GLOOME ML
%% Cell type:code id: tags:
``` python
%%bash -s "$taxonomicID" "$data"
cd $2/program_runs/
# write param file that uses the rooted tree
cat > ./GLOOME_with_tree/gloome_ml_tree.params << EOL
_seqFile $2$1_pa_matrix.fasta
_treeFile $2$1_wol_tree_pruned_no_internal_labels.nwk
_gainLossDist 1
# for COG and EggNOG only patterns with 3 or more ones are observable
_minNumOfOnes 4
# include Parsimony results along with ML
_costMatrixGainLossRatio 1
## Advanced
_logValue 4
_outDir Results_GLOOME_ML_with_tree
EOL
# write param file that doesn't use the rooted tree
cat > ./GLOOME_without_tree/gloome_ml_without_tree.params << EOL
_seqFile $2$1_pa_matrix.fasta
_gainLossDist 1
# for COG and EggNOG only patterns with 3 or more ones are observable
_minNumOfOnes 4
# include Parsimony results along with ML
_costMatrixGainLossRatio 1
## Advanced
_logValue 4
_outDir Results_GLOOME_ML_without_tree
EOL
```
%% Cell type:markdown id: tags:
Run GLOOME via the command line for each of the param files, from inside the respective directory. Using `nohup`, `&`, and `disown` here because the process may take a while to finish, and I am running it remotely on a server; the processes shouldn't be affected this way, even if I log out.
```bash
cd GLOOME_with_tree
nohup ~/bin/GLOOME.VR01.266 gloome_ml_tree.params > nohup_gloome_run_with_tree.out & disown
cd ../GLOOME_without_tree
nohup ~/bin/GLOOME.VR01.266 gloome_ml_without_tree.params > nohup_gloome_run_without_tree.out & disown
```
%% Cell type:markdown id: tags:
### GLOOME MP
%% Cell type:code id: tags:
``` python
%%bash -s "$taxonomicID" "$data"
# we need a params file for each iteration of the _costMatrixGainLossRatio
# _costMatrixGainLossRatio is the ratio of the cost of a gain to the cost of a loss
# the default is 1:1 but we want to test 0.33, 0.5, 1, 2, ..., 8
# for each of these ratios we need to create a params file
cd $2/program_runs/
for costRatio in 0.33 0.5 1 2 3 4 5 6 7 8; do
# write param file that uses the rooted tree
cat > ./GLOOME_with_tree/gloome_tree_${costRatio}.params << EOL
_seqFile $2$1_pa_matrix.fasta
_treeFile $2$1_wol_tree_pruned_no_internal_labels.nwk
_costMatrixGainLossRatio $costRatio
_logValue 4
_performOptimizations 0
_outDir Results_GLOOME_MP_with_tree_$costRatio
EOL
# write param file that doesn't use the rooted tree
cat > ./GLOOME_without_tree/gloome_without_tree_${costRatio}.params << EOL
_seqFile $2$1_pa_matrix.fasta
_costMatrixGainLossRatio $costRatio
_logValue 4
_performOptimizations 0
_outDir Results_GLOOME_MP_without_tree_$costRatio
EOL
done
```
%% Cell type:markdown id: tags:
Then launch all of the GLOOME MP runs like so:
```bash
cd GLOOME_with_tree
for i in 0.33 0.5 1 2 3 4 5 6 7 8 ; do nohup ~/bin/GLOOME.VR01.266 gloome_tree_${i}.params > nohup_gloome_run_with_tree_${i}.out & disown ; done
```
```bash
cd ../GLOOME_without_tree
for i in 0.33 0.5 1 2 3 4 5 6 7 8 ; do nohup ~/bin/GLOOME.VR01.266 gloome_without_tree_${i}.params > nohup_gloome_run_without_tree_${i}.out & disown ; done
```
%% Cell type:markdown id: tags:
### Count ML
%% Cell type:markdown id: tags:
```bash
# cd Count/Count_ML
nohup java -Xmx2048M -cp ~/bin/Count/Count.jar ca.umontreal.iro.evolution.genecontent.ML -v true ../../../1236_wol_tree_pruned_with_internal_labels.nwk ../1236_pa_matrix.tsv > 1236_rates.r & disown
# and after completing the above:
nohup java -Xmx2048M -cp ~/bin/Count/Count.jar ca.umontreal.iro.evolution.genecontent.Posteriors ../../../1236_wol_tree_pruned_with_internal_labels.nwk ../1236_pa_matrix.tsv ./1236_rates.r > Count_output.tsv & disown
```
%% Cell type:markdown id: tags:
### Count MP
%% Cell type:code id: tags:
``` python
%%bash -s "$taxonomicID" "$data"
# exit if command exits with non-zero status
set -e
# print working directory
echo "Current working directory: $(pwd)"
# change to the directory where Count will be run
cd ./program_runs/Count/Count_MP/
# for `gain` parameter values 0.33, 0.5, 1, 2, ..., 8, run Count
for g in 0.33 0.5 1 2 3 4 5 6 7 8;
do
# run Count
java -Xmx2048M -cp ~/bin/Count/Count.jar ca.umontreal.iro.evolution.genecontent.AsymmetricWagner -gain $g $2$1_wol_tree_pruned_with_internal_labels.nwk $2$1_pa_matrix.tsv > ${1}_Count_output_gain_${g}.tsv && echo "Count with gain ratio $g done" &&
# grep the output
grep "# PRESENT" ${1}_Count_output_gain_${g}.tsv > ${1}_Count_output_gain_${g}_genome_sizes.tsv &&
grep "# CHANGE" ${1}_Count_output_gain_${g}.tsv > ${1}_Count_output_gain_${g}_changes.tsv &&
grep "# FAMILY" ${1}_Count_output_gain_${g}.tsv > ${1}_Count_output_gain_${g}_families.tsv &&
echo "Count output for gain ratio $g grepped"
done
```
%% Output
Current working directory: /root/work/projects/paper_comparative_study_of_hgt_inference_methods/02-run_programs
Count with gain ratio 0.33 done
Count output for gain ratio 0.33 grepped
Count with gain ratio 0.5 done
Count output for gain ratio 0.5 grepped
Count with gain ratio 1 done
Count output for gain ratio 1 grepped
Count with gain ratio 2 done
Count output for gain ratio 2 grepped
Count with gain ratio 3 done
Count output for gain ratio 3 grepped
Count with gain ratio 4 done
Count output for gain ratio 4 grepped
Count with gain ratio 5 done
Count output for gain ratio 5 grepped
Count with gain ratio 6 done
Count output for gain ratio 6 grepped
Count with gain ratio 7 done
Count output for gain ratio 7 grepped
Count with gain ratio 8 done
Count output for gain ratio 8 grepped
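%% Cell type:markdown id: tags:
As a quick, format-agnostic sanity check that every gain ratio produced output, one can count the grepped `# CHANGE` lines per run (a sketch; run it from the `Count_MP` directory):
```python
# Sketch: count '# CHANGE' lines per gain ratio in the grepped outputs.
from pathlib import Path

for g in ["0.33", "0.5", "1", "2", "3", "4", "5", "6", "7", "8"]:
    f = Path(f"1236_Count_output_gain_{g}_changes.tsv")
    n = sum(1 for _ in f.open()) if f.exists() else 0
    print(f"gain ratio {g}: {n} change lines")
```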
%% Cell type:markdown id: tags:
## Wn
%% Cell type:markdown id: tags:
Wn can be run directly using the script `run_Wn.py` in the `src` directory:
```bash
cd 02-run_programs/program_runs/Wn/
nohup ~/mambaforge/envs/hgt_analyses/bin/python ../../../02-run_programs/src/run_Wn.py > nohup_run_Wn.log & disown
```
The results will be saved by default in the `Results` directory.
%% Cell type:markdown id: tags:
For reference, the relevant argument defaults in the `__main__` block of `src/run_ALE.py` (listing truncated; paths are relative to the `ALE` run directory):
```python
# write everything in terms of argparse instead of hardcoding
import argparse
parser = argparse.ArgumentParser(description="Run ALE on all gene trees")
parser.add_argument("--species", "-s", type=str, default="../../1236_wol_tree_pruned_no_internal_labels.nwk",
                    help="Path to species tree file (default: ../../1236_wol_tree_pruned_no_internal_labels.nwk)")
parser.add_argument("--gene", "-g", type=str, default="../../1236_pruned_gene_trees.tsv.rooted",
                    help="Path to gene trees file (default: ../../1236_pruned_gene_trees.tsv.rooted)")
parser.add_argument("--threads", "-t", type=int, default=50,
                    help="Number of threads to use for parallelization (default: 50)")
parser.add_argument("--bin", "-b", type=str,
                    ...)  # truncated in this view
```