diff --git a/01-prepare_data/01-01-prepare_NOG_dataset_dload_seqs.ipynb b/01-prepare_data/01-01-prepare_NOG_dataset_dload_seqs.ipynb index 47e022d9480fe9e5e52220fb9d1bda7041f79507..caa4cc496c59d41118c507689c15ee4e710000fb 100644 --- a/01-prepare_data/01-01-prepare_NOG_dataset_dload_seqs.ipynb +++ b/01-prepare_data/01-01-prepare_NOG_dataset_dload_seqs.ipynb @@ -111,7 +111,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -122,7 +122,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -177,21 +177,21 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 1548/1548 [01:04<00:00, 23.82it/s]\n" + "100%|██████████| 1548/1548 [00:56<00:00, 27.46it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Number of taxa with gene data: 854\n" + "Number of taxa with gene data: 853\n" ] } ], @@ -319,7 +319,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -352,7 +352,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "2024-05-06 10:10:01 URL:https://biocore.github.io/wol/data/genomes/metadata.tsv.xz [971240/971240] -> \"../data/wol_metadata.tsv.xz\" [1]\n" + "2025-05-08 14:52:45 URL:https://biocore.github.io/wol/data/genomes/metadata.tsv.xz [971240/971240] -> \"../data/wol_metadata.tsv.xz\" [1]\n" ] }, { @@ -385,7 +385,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "2024-05-06 10:10:01 URL:https://raw.githubusercontent.com/biocore/wol/master/data/trees/astral/branch_length/cons/astral.cons.nwk [407397/407397] -> \"../data/wol_tree.nwk\" [1]\n" + "2025-05-08 14:52:46 URL:https://raw.githubusercontent.com/biocore/wol/master/data/trees/astral/branch_length/cons/astral.cons.nwk [407397/407397] -> \"../data/wol_tree.nwk\" [1]\n" ] } ], @@ -628,10 +628,10 @@ "name": "stdout", "output_type": "stream", "text": [ - "Number of genomes after pruning based on taxid in gene_features_dict: 413\n", - "acc_id_to_keep looks like: ['GCF_000773685.1', 'GCF_001767295.1', 'GCF_000817975.1', 'GCF_001423155.1', 'GCA_001801665.1']\n", - "Number of genomes in the pruned metadata: 361\n", - "Number of genomes in the pruned tree: 361\n" + "Number of genomes after pruning based on taxid in gene_features_dict: 412\n", + "acc_id_to_keep looks like: ['GCF_000245735.1', 'GCA_001801365.1', 'GCF_000829415.1', 'GCF_000271305.1', 'GCF_001469215.1']\n", + "Number of genomes in the pruned metadata: 360\n", + "Number of genomes in the pruned tree: 360\n" ] } ], @@ -696,7 +696,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Node names look like: [Tree node '1896966' (0x7f34f1d0f50), Tree node '381306' (0x7f34f17884a), Tree node '246195' (0x7f34f17e11c), Tree node '797473' (0x7f34f17e128), Tree node 'N1' (0x7f34f17e110)] ...\n", + "Node names look like: [Tree node '1896966' (0x7f38b054c0b), Tree node '381306' (0x7f38afdb0b9), Tree node '246195' (0x7f38afe058b), Tree node '797473' (0x7f38afe0597), Tree node 'N1' (0x7f38afe057f)] ...\n", "New leaf names look like: ['1896966', '381306', '246195', '797473', '1766620', '555778', '1033802', '1304275', '1172194', '1579979'] ...\n" ] } @@ -749,7 +749,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Number of taxa in the tree: 361 Number of taxa in the dictionary: 854\n" + "Number of taxa in the tree: 360 Number of taxa in the dictionary: 853\n" ] } ], @@ -786,7 +786,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -794,9 +794,9 @@ "output_type": "stream", "text": [ "Min number of genes to retain a NOG: 10 and min number of taxa to retain a NOG: 30\n", - "Mean #genes_to_keep: 83.92499327896765 Std dev of #genes_to_keep: 177.982200175856\n", + "Mean #genes_to_keep: 84.05419908773813 Std dev of #genes_to_keep: 178.40979118790855\n", "Starting with n = 100 as ~quarter of the std dev of #genes_to_keep in nog_members_df\n", - "At n = 1300 , taxon_list is a subset of taxa_in_subset, with total of 359 taxa\n" + "At n = 1300 , taxon_list is a subset of taxa_in_subset, with total of 360 taxa\n" ] }, { @@ -831,53 +831,53 @@ " </thead>\n", " <tbody>\n", " <tr>\n", - " <th>101263</th>\n", + " <th>14341</th>\n", " <td>1236</td>\n", - " <td>ETAWV</td>\n", - " <td>372</td>\n", - " <td>393</td>\n", - " <td>[160488, 634500, 214092, 223283, 1414654, 6650...</td>\n", - " <td>[1006000.GKAS_01002, 1028989.PSCI_4210, 105164...</td>\n", + " <td>ER3QB</td>\n", + " <td>190</td>\n", + " <td>221</td>\n", + " <td>[1294143, 1798802, 572477, 273526, 1441930, 40...</td>\n", + " <td>[105559.Nwat_2444, 1121939.L861_15835, 1178482...</td>\n", " <td>55</td>\n", " </tr>\n", " <tr>\n", - " <th>17358</th>\n", + " <th>14363</th>\n", " <td>1236</td>\n", - " <td>ER6B2</td>\n", - " <td>254</td>\n", - " <td>301</td>\n", - " <td>[743720, 717774, 491952, 1178482, 290398, 1445...</td>\n", - " <td>[1028989.PSCI_0675, 1051646.IX91_03105, 105164...</td>\n", + " <td>ER3QZ</td>\n", + " <td>229</td>\n", + " <td>258</td>\n", + " <td>[1926881, 230089, 910964, 69222, 1484157, 1681...</td>\n", + " <td>[1005058.UMN179_01181, 1006000.GKAS_02034, 105...</td>\n", " <td>55</td>\n", " </tr>\n", " <tr>\n", - " <th>95227</th>\n", + " <th>75558</th>\n", " <td>1236</td>\n", - " <td>ET5PB</td>\n", - " <td>255</td>\n", - " <td>255</td>\n", - " <td>[314276, 1635005, 211586, 326297, 151081, 3807...</td>\n", - " <td>[1051646.IX91_11355, 1085623.GNIT_2424, 112767...</td>\n", + " <td>ESNNU</td>\n", + " <td>337</td>\n", + " <td>372</td>\n", + " <td>[1736612, 216778, 273526, 871585, 1484157, 140...</td>\n", + " <td>[1006000.GKAS_04123, 1045855.DSC_01400, 111551...</td>\n", " <td>55</td>\n", " </tr>\n", " <tr>\n", - " <th>55467</th>\n", + " <th>17358</th>\n", " <td>1236</td>\n", - " <td>ES59X</td>\n", - " <td>264</td>\n", - " <td>277</td>\n", - " <td>[1294143, 1384056, 267850, 322710, 1177154, 16...</td>\n", - " <td>[1026882.MAMP_03077, 1028989.PSCI_3136, 103380...</td>\n", + " <td>ER6B2</td>\n", + " <td>254</td>\n", + " <td>301</td>\n", + " <td>[743720, 717774, 491952, 1178482, 290398, 1445...</td>\n", + " <td>[1028989.PSCI_0675, 1051646.IX91_03105, 105164...</td>\n", " <td>55</td>\n", " </tr>\n", " <tr>\n", - " <th>97026</th>\n", + " <th>44651</th>\n", " <td>1236</td>\n", - " <td>ET788</td>\n", - " <td>270</td>\n", - " <td>272</td>\n", - " <td>[1926881, 230089, 198214, 1681196, 910964, 692...</td>\n", - " <td>[1006000.GKAS_02548, 1115515.EV102420_33_00080...</td>\n", + " <td>ERVXT</td>\n", + " <td>256</td>\n", + " <td>273</td>\n", + " <td>[1736612, 43263, 1294143, 216778, 1535422, 179...</td>\n", + " <td>[1163408.UU9_10362, 1187848.A1QO_01760, 119576...</td>\n", " <td>55</td>\n", " </tr>\n", " </tbody>\n", @@ -885,26 +885,26 @@ "</div>" ], "text/plain": [ - " taxonomic_group NOG #taxa #genes \\\n", - "101263 1236 ETAWV 372 393 \n", - "17358 1236 ER6B2 254 301 \n", - "95227 1236 ET5PB 255 255 \n", - "55467 1236 ES59X 264 277 \n", - "97026 1236 ET788 270 272 \n", + " taxonomic_group NOG #taxa #genes \\\n", + "14341 1236 ER3QB 190 221 \n", + "14363 1236 ER3QZ 229 258 \n", + "75558 1236 ESNNU 337 372 \n", + "17358 1236 ER6B2 254 301 \n", + "44651 1236 ERVXT 256 273 \n", "\n", - " taxa \\\n", - "101263 [160488, 634500, 214092, 223283, 1414654, 6650... \n", - "17358 [743720, 717774, 491952, 1178482, 290398, 1445... \n", - "95227 [314276, 1635005, 211586, 326297, 151081, 3807... \n", - "55467 [1294143, 1384056, 267850, 322710, 1177154, 16... \n", - "97026 [1926881, 230089, 198214, 1681196, 910964, 692... \n", + " taxa \\\n", + "14341 [1294143, 1798802, 572477, 273526, 1441930, 40... \n", + "14363 [1926881, 230089, 910964, 69222, 1484157, 1681... \n", + "75558 [1736612, 216778, 273526, 871585, 1484157, 140... \n", + "17358 [743720, 717774, 491952, 1178482, 290398, 1445... \n", + "44651 [1736612, 43263, 1294143, 216778, 1535422, 179... \n", "\n", - " genes #genes_to_keep \n", - "101263 [1006000.GKAS_01002, 1028989.PSCI_4210, 105164... 55 \n", - "17358 [1028989.PSCI_0675, 1051646.IX91_03105, 105164... 55 \n", - "95227 [1051646.IX91_11355, 1085623.GNIT_2424, 112767... 55 \n", - "55467 [1026882.MAMP_03077, 1028989.PSCI_3136, 103380... 55 \n", - "97026 [1006000.GKAS_02548, 1115515.EV102420_33_00080... 55 " + " genes #genes_to_keep \n", + "14341 [105559.Nwat_2444, 1121939.L861_15835, 1178482... 55 \n", + "14363 [1005058.UMN179_01181, 1006000.GKAS_02034, 105... 55 \n", + "75558 [1006000.GKAS_04123, 1045855.DSC_01400, 111551... 55 \n", + "17358 [1028989.PSCI_0675, 1051646.IX91_03105, 105164... 55 \n", + "44651 [1163408.UU9_10362, 1187848.A1QO_01760, 119576... 55 " ] }, "metadata": {}, @@ -1065,14 +1065,14 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Number of locus_tags considering the wol_tree_taxa: 1032634. Looks like: ['1005090.BAKON_001', '1005090.BAKON_002', '1005090.BAKON_003', '1005090.BAKON_004', '1005090.BAKON_005']\n" + "Number of locus_tags considering the wol_tree_taxa: 1032613. Looks like: ['1005090.BAKON_001', '1005090.BAKON_002', '1005090.BAKON_003', '1005090.BAKON_004', '1005090.BAKON_005']\n" ] } ], @@ -1099,7 +1099,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -1165,16 +1165,16 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Taxa in wol tree but not in gene trees: 0 \n", + "Taxa in wol tree but not in gene trees: 1 \n", "and vice versa: 0\n", - "They look like this: [] and []\n", + "They look like this: ['243277'] and []\n", "Number of taxa to keep in the wol tree: 359\n" ] } @@ -1238,7 +1238,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ @@ -1271,7 +1271,7 @@ "nog_members_df['genes'] = nog_members_df['genes'].apply(lambda x: ','.join(x))\n", "\n", "# overwrite the NOG members file\n", - "nog_members_df.to_csv(f\"{data_dir}{taxonomicID}_nog_members.tsv\", sep=\"\\t\", index=False, header=False)" + "nog_members_df.to_csv(f\"{data_dir}{taxonomicID}_nog_members.tsv\", sep=\"\\t\", index=False, header=False)\n" ] }, { @@ -1283,7 +1283,8 @@ "The below command is what you can run and then wait to finish. It will be run in the background (because of `nohup` and `disown`) even if you lose connection to the server. This process can take in the order of hours to finish, for a list of trees in the order of 10k trees.\n", "\n", "```\n", - "$ nohup python src/run_MAD_on_EggNOG_parallel.py -i ../data/1236_pruned_gene_trees.tsv -m ~/bin/mad/mad -n 100 > ./nohup_mad_eggnog_rooting_1236.log & disown\n", + "# cd 01-prepare_data\n", + "$ nohup python src/run_MAD_on_EggNOG_parallel.py -i ../data/1236_pruned_gene_trees.tsv -m ~/bin/mad/mad -n 100 > ../data/nohup_mad_eggnog_rooting_1236.log & disown\n", "```\n", "\n", "**NOTE**: Pruning took roughly 1.5 hours for 15k trees" @@ -1307,14 +1308,14 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Number of genome accessions: 1470, and they look like:\n" + "Number of genome accessions: 1473, and they look like:\n" ] }, { @@ -1346,27 +1347,27 @@ " <tr>\n", " <th>0</th>\n", " <td>GCF_000953635.1</td>\n", - " <td>/root/work/projects/hgt_inference_comparative_...</td>\n", + " <td>/root/work/projects/paper_comparative_study_of...</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>GCF_900461565.1</td>\n", - " <td>/root/work/projects/hgt_inference_comparative_...</td>\n", + " <td>/root/work/projects/paper_comparative_study_of...</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>GCF_900450575.1</td>\n", - " <td>/root/work/projects/hgt_inference_comparative_...</td>\n", + " <td>/root/work/projects/paper_comparative_study_of...</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>GCF_015476275.1</td>\n", - " <td>/root/work/projects/hgt_inference_comparative_...</td>\n", + " <td>/root/work/projects/paper_comparative_study_of...</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>GCF_000953655.1</td>\n", - " <td>/root/work/projects/hgt_inference_comparative_...</td>\n", + " <td>/root/work/projects/paper_comparative_study_of...</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", @@ -1374,11 +1375,11 @@ ], "text/plain": [ " genome_accession fna_filepath\n", - "0 GCF_000953635.1 /root/work/projects/hgt_inference_comparative_...\n", - "1 GCF_900461565.1 /root/work/projects/hgt_inference_comparative_...\n", - "2 GCF_900450575.1 /root/work/projects/hgt_inference_comparative_...\n", - "3 GCF_015476275.1 /root/work/projects/hgt_inference_comparative_...\n", - "4 GCF_000953655.1 /root/work/projects/hgt_inference_comparative_..." + "0 GCF_000953635.1 /root/work/projects/paper_comparative_study_of...\n", + "1 GCF_900461565.1 /root/work/projects/paper_comparative_study_of...\n", + "2 GCF_900450575.1 /root/work/projects/paper_comparative_study_of...\n", + "3 GCF_015476275.1 /root/work/projects/paper_comparative_study_of...\n", + "4 GCF_000953655.1 /root/work/projects/paper_comparative_study_of..." ] }, "metadata": {}, @@ -1401,7 +1402,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ @@ -1440,7 +1441,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ @@ -1500,35 +1501,6 @@ " write_buffer(buffer, out_file)" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "def return_gene_features(gene):\n", - " taxon, locus_tag = gene.split('.', 1)\n", - " return [\n", - " gene, \n", - " gene_features_dict[taxon]['genes'][locus_tag]['id'], \n", - " gene_features_dict[taxon]['genes'][locus_tag]['seqid'],\n", - " gene_features_dict[taxon]['genes'][locus_tag]['start'], \n", - " gene_features_dict[taxon]['genes'][locus_tag]['end'], \n", - " gene_features_dict[taxon]['genes'][locus_tag]['strand'],\n", - " gene_features_dict[taxon]['acc_id'],\n", - " ]\n", - "\n", - "# create a pool of worker processes\n", - "with mp.Pool(int(mp.cpu_count()/2)) as p:\n", - " subset_gene_features_list = p.imap_unordered(return_gene_features, subset_genes_list)\n", - "\n", - "subset_gene_features_df = pd.DataFrame(subset_gene_features_list, columns=['locus_tag', 'seqid', 'gene_id', 'start', 'end', 'strand', 'genome_accession'])\n", - "\n", - "# write this to a file\n", - "subset_gene_features_df.to_csv(f'{data_dir}{taxonomicID}_subset_gene_features.tsv', sep='\\t', index=False)" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -1538,21 +1510,15 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "The taxa list looks like this: ['1005057', '1005058', '1005090', '1006000', '1009858', '1026882', '1028989', '1033802', '1036674', '1045855'] ...\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The size of the pa_df is: (359, 1286)\n" + "The taxa list looks like this: ['1005057', '1005058', '1005090', '1006000', '1009858', '1026882', '1028989', '1033802', '1036674', '1045855'] ...\n", + "The size of the pa_df is: (359, 1300)\n" ] } ], @@ -1622,7 +1588,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ diff --git a/01-prepare_data/01-02-prepare_nucleotide_data.ipynb b/01-prepare_data/01-02-prepare_nucleotide_data.ipynb index 7018e83b3b54c23c6e6c2470f7b0ddaddee2104c..e77f2fbf7ba5a82dc2b88732d84ae7016c69acda 100644 --- a/01-prepare_data/01-02-prepare_nucleotide_data.ipynb +++ b/01-prepare_data/01-02-prepare_nucleotide_data.ipynb @@ -7,7 +7,7 @@ "Run `download_ncbi_genome_sequences.py` like so:\n", "\n", "```bash\n", - "nohup python src/download_ncbi_genome_sequences.py -i 1236_subset.taxa > nohup_download_genome_seqs.out & disown\n", + "nohup ~/mambaforge/envs/hgt_analyses/bin/python src/download_ncbi_genome_sequences.py -i ../data/1236_subset_taxa.txt > ../data/nohup_download_genome_seqs.out & disown\n", "```\n", "\n", "By default, `-o` output dir is `../data/genome_sequences/` where fasta and gff files are downloaded for each taxon in the list. \n", @@ -29,15 +29,15 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Found 100692 unique gene names. List looks like this: ['1123053_GCA_000425345_02619', '128785_GCA_001556105_01030', '1123514_GCA_000381085_02040', '1441930_Z042_05715', '1449087_GCA_000711305_00098', '767434_Fraau_2269', '377629_TERTU_3187', '437022_CC99x_00673', '658445_H744_2c0221', '1265503_GCA_000378625_02015']\n", - "Writing gene names to 1236_subset_NOG_genes.list\n" + "Found 156509 unique gene names. List looks like this: ['291331.XOO2297', '225937.HP15_481', '573983.B0681_04825', '318167.Sfri_1100', '291331.XOO1980', '472759.Nhal_3786', '1798287.A3F14_06600', '1354304.XPG1_2379', '28173.VIBNI_B0337', '1461581.BN1049_02975']\n", + "Writing gene names to ../data/1236_nog_genes.list\n" ] } ], @@ -49,7 +49,7 @@ "\n", "# for the genes of interest, we need to first prepare a list of all the gene names.\n", "# for this, first read in the members.tsv file\n", - "members_tsv_filepath = '1236_subset_NOG_members.tsv'\n", + "members_tsv_filepath = '../data/1236_nog_members.tsv'\n", "# read only the 6th column (CSV lists of the gene names)\n", "with open(members_tsv_filepath) as fo:\n", " flines = fo.readlines()\n", @@ -87,7 +87,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.2" + "version": "3.12.3" } }, "nbformat": 4, diff --git a/01-prepare_data/src/run_MAD_on_EggNOG_parallel.py b/01-prepare_data/src/run_MAD_on_EggNOG_parallel.py index 6ef8d01725ad1d10a5e719251152647a87ad0650..006772cb7208155f62722ea8af7aea7b160b360c 100755 --- a/01-prepare_data/src/run_MAD_on_EggNOG_parallel.py +++ b/01-prepare_data/src/run_MAD_on_EggNOG_parallel.py @@ -8,6 +8,7 @@ import argparse import glob from multiprocessing import Pool + def run_command(params): """ Inputs: @@ -21,6 +22,7 @@ def run_command(params): if process.wait() != 0: raise CalledProcessError(process.returncode, cmd) + def MAD_roots_NOG_trees_parallel(input_file_path, mad_executable, n_lines, num_subprocesses): """ Inputs: @@ -36,8 +38,9 @@ def MAD_roots_NOG_trees_parallel(input_file_path, mad_executable, n_lines, num_s The output file will contain the same number of lines as the input file, but the second column will contain the rooted trees. """ # Check if the input file exists - if not os.path.isfile(input_file_path): - raise FileNotFoundError(f"The input file {input_file_path} does not seem to exist. Check the filepath provided.") + if not os.path.isfile(input_file_path): + raise FileNotFoundError(f"The input file { + input_file_path} does not seem to exist. Check the filepath provided.") # Find the full path of the input file and extract the dir name input_file_path = os.path.abspath(input_file_path) input_dir = os.path.dirname(input_file_path) @@ -51,50 +54,53 @@ def MAD_roots_NOG_trees_parallel(input_file_path, mad_executable, n_lines, num_s # Create the temporary directory os.mkdir(tmp_dir) # log this with time, along with location of the tmp folder - print(f'Created the tmp folder at: {datetime.now()} at location: {tmp_dir}') - + print(f'Created the tmp folder at: { + datetime.now()} at location: {tmp_dir}') # Open the input file and read it in chunks of `n_lines` lines - nwk_split_files = [] # track the filenames of split files containing the second column (tree) - info_split_files = [] # track the filenames of split files containing the first column (treeID) - + # track the filenames of split files containing the second column (tree) + nwk_split_files = [] + # track the filenames of split files containing the first column (treeID) + info_split_files = [] + + def write_split_files(chunk, chunk_index, nwk_split_files, info_split_files): + """ + Helper function to write out the split files for a given chunk. + """ + split_chunk = [line.split() for line in chunk] + nwk_split_filepath = f"{tmp_dir}/tmp_tree_split.nwk.{chunk_index}" + info_split_filepath = f"{tmp_dir}/tmp_tree_split.info.{chunk_index}" + with open(nwk_split_filepath, 'w') as split_out_fo: + split_out_fo.write('\n'.join(line[1] + for line in split_chunk) + '\n') + with open(info_split_filepath, 'w') as split_out_fo: + split_out_fo.write('\n'.join(line[0] + for line in split_chunk) + '\n') + nwk_split_files.append(nwk_split_filepath) + info_split_files.append(info_split_filepath) + return nwk_split_files, info_split_files + + # open the input file with open(input_file_path, 'r') as treesFO: chunk = [] for i, line in enumerate(treesFO): - split_line = line.strip().split() - chunk.append(split_line) + chunk.append(line) if (i + 1) % n_lines == 0: - # Write out the split trees file - nwk_split_filepath = tmp_dir+'/tmp_tree_split.nwk.'+str((i // n_lines) + 1) - info_split_filepath = tmp_dir+'/tmp_tree_split.info.'+str((i // n_lines) + 1) - with open(nwk_split_filepath, 'w') as split_out_fo: - split_out_fo.write('\n'.join(line[1] for line in chunk) + '\n') - with open(info_split_filepath, 'w') as split_out_fo: - split_out_fo.write('\n'.join(line[0] for line in chunk) + '\n') - # track the filenames - nwk_split_files.append(nwk_split_filepath) - info_split_files.append(info_split_filepath) + nwk_split_files, info_split_files = write_split_files( + chunk, (i // n_lines) + 1, nwk_split_files, info_split_files) chunk = [] # Write the last chunk if it's not empty if chunk: - split_chunk = [line.split() for line in chunk] - nwk_split_filepath = tmp_dir+'/tmp_tree_split.nwk.'+str((i // n_lines) + 2) - info_split_filepath = tmp_dir+'/tmp_tree_split.info.'+str((i // n_lines) + 2) - with open(nwk_split_filepath, 'w') as split_out_fo: - split_out_fo.write('\n'.join(line[1] for line in split_chunk) + '\n') - with open(info_split_filepath, 'w') as split_out_fo: - split_out_fo.write('\n'.join(line[0] for line in split_chunk) + '\n') - # track the filenames - nwk_split_files.append(nwk_split_filepath) - info_split_files.append(info_split_filepath) - - - # using subprocess.Popen inside the run_command function, + nwk_split_files, info_split_files = write_split_files( + chunk, (len(nwk_split_files) // n_lines) + 2, nwk_split_files, info_split_files) + + # using subprocess.Popen inside the run_command function, # call MAD on all of these split newick files. Prepare the commands. # `mad -n trees.nwk.1` - commands = [([mad_executable, '-t', '-n', split_file_path], split_file_path+'.stdout') for split_index, split_file_path in enumerate(nwk_split_files)] - + commands = [([mad_executable, '-t', '-n', split_file_path], split_file_path+'.stdout') + for split_index, split_file_path in enumerate(nwk_split_files)] + mad_start_time = time.time() # Log timestamp for the start of the MAD rooting process print(f'MAD rooting starting at: {datetime.now()}') @@ -108,7 +114,6 @@ def MAD_roots_NOG_trees_parallel(input_file_path, mad_executable, n_lines, num_s # Log that the split files are being combined, along with the timestamp print(f'{datetime.now()}: Combining the split files into one file...') - # Note: mad adds a `.rooted` at the end of the input filename, as the output filename # combine the columns from the info files with the nwk.rooted files to prepare the final output file with open(input_file_path + '.rooted', 'w') as rootedOutputFO: @@ -124,25 +129,28 @@ def MAD_roots_NOG_trees_parallel(input_file_path, mad_executable, n_lines, num_s # combine the info and rooted trees with open(info_split_filepath, 'r') as info_file: rooted_info_plus_tree_chunks = [ - '\t'.join([info_line.strip(), rooted_nwk_trees[line_index]]) + '\t'.join( + [info_line.strip(), rooted_nwk_trees[line_index]]) for line_index, info_line in enumerate(info_file) ] # write out the new tree line chunks to the output file - rootedOutputFO.write('\n'.join(rooted_info_plus_tree_chunks) + '\n') + rootedOutputFO.write( + '\n'.join(rooted_info_plus_tree_chunks) + '\n') # Clear memory del rooted_info_plus_tree_chunks del rooted_nwk_trees - - # delete the split nwk files in the tmp folder + + # # delete the split nwk files in the tmp folder for split_file_name in nwk_split_files: os.remove(split_file_name) # combine all the log files into one log file using `tail -n +1`, and write in the input_dir with timestamp now = datetime.now() timestamp = now.strftime("%Y%m%d_%H%M%S") - os.system('tail -n +1 '+tmp_dir+'/*.stdout > '+input_dir+'/tmp_mad_rooting_splits_'+timestamp+'.log') - # delete the tmp folder + os.system('tail -n +1 '+tmp_dir+'/*.stdout > '+input_dir + + '/tmp_mad_rooting_splits_'+timestamp+'.log') + # # delete the tmp folder # os.system('rm -rf '+tmp_dir) @@ -170,7 +178,8 @@ if __name__ == '__main__': args = parser.parse_args() - MAD_roots_NOG_trees_parallel(args.Input, args.mad, args.nlines, args.num_subprocesses) + MAD_roots_NOG_trees_parallel( + args.Input, args.mad, args.nlines, args.num_subprocesses) print('Process time taken from start to finish: ' + - str(timedelta(seconds=time.time() - start_time)) ) \ No newline at end of file + str(timedelta(seconds=time.time() - start_time))) diff --git a/02-run_programs/02-run_programs.ipynb b/02-run_programs/02-run_programs.ipynb index 92d6a1f452c4cbfe1971436affcea994193bc1e6..b46f34fbed14a30499b4b725c90b3d0453e9021c 100644 --- a/02-run_programs/02-run_programs.ipynb +++ b/02-run_programs/02-run_programs.ipynb @@ -11,12 +11,12 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "taxonomicID = '1236'\n", - "data = '/root/work/projects/hgt_inference_comparative_study/data/'" + "data = '/root/work/projects/paper_comparative_study_of_hgt_inference_methods/data/'" ] }, { @@ -35,12 +35,14 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "%%bash\n", "# make directories for programs\n", + "mkdir -p program_runs\n", + "cd program_runs\n", "mkdir -p ALE AnGST RANGER RANGER-Fast Count GLOOME_with_tree GLOOME_without_tree" ] }, @@ -109,15 +111,15 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "%%bash\n", + "%%bash -s \"$taxonomicID\" \"$data\"\n", "# create AnGST directory if it doesn't exist\n", "# write penalty file\n", "# these params were taken from the AnGST manual\n", - "mkdir -p ./AnGST\n", + "mkdir -p $2/program_runs/AnGST\n", "cat > ./src/angst_penalty_file.txt << EOL\n", "hgt: 3.0\n", "dup: 2.0\n", @@ -131,9 +133,14 @@ "metadata": {}, "source": [ "Run AnGST using the script `run_angst.py` in the following manner:\n", + "\n", "```bash\n", - "# cd AnGST\n", - "$ nohup python ../src/run_angst.py > nohup_run_angst.out & disown\n", + "$ cd AnGST\n", + "# make sure you activate the python2 environment\n", + "$ mamba activate hgt_analyses_py2\n", + "\n", + "# run the script\n", + "$ nohup ~/mambaforge/envs/hgt_analyses_py2/bin/python ../src/run_angst.py > nohup_run_angst.out & disown\n", "```" ] }, @@ -160,16 +167,23 @@ "The function below creates the presence-absence (PA) matrix for GLOOME, based on the PA of taxa in the NOGs of interest. We use that matrix (as a fasta file input) along with the species tree (and a separate run, without the tree), to infer gains and losses. Since Count also uses a PA matrix but in TSV format, we prepare it here itself. Note that Count requires each row to be that of a gene family (in our case, a NOG) whereas GLOOME needs each row (aka each sequence in the FASTA file) to be that of a taxon." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### GLOOME ML" + ] + }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "%%bash -s \"$taxonomicID\" \"$data\"\n", - "\n", + "cd $2/program_runs/\n", "# write param file that uses the rooted tree\n", - "cat > ./GLOOME_with_tree/gloome_tree.params << EOL\n", + "cat > ./GLOOME_with_tree/gloome_ml_tree.params << EOL\n", "_seqFile $2$1_pa_matrix.fasta\n", "_treeFile $2$1_wol_tree_pruned_no_internal_labels.nwk\n", "_gainLossDist 1\n", @@ -179,11 +193,11 @@ "_costMatrixGainLossRatio 1\n", "## Advanced \n", "_logValue 4\n", - "_outDir Results_GLOOME_with_tree\n", + "_outDir Results_GLOOME_ML_with_tree\n", "EOL\n", "\n", "# write param file that doesn't use the rooted tree\n", - "cat > ./GLOOME_without_tree/gloome_without_tree.params << EOL\n", + "cat > ./GLOOME_without_tree/gloome_ml_without_tree.params << EOL\n", "_seqFile $2$1_pa_matrix.fasta\n", "_gainLossDist 1\n", "# for COG and EggNOG only patterns with 3 or more ones are observable\n", @@ -192,7 +206,7 @@ "_costMatrixGainLossRatio 1\n", "## Advanced\n", "_logValue 4\n", - "_outDir Results_GLOOME_without_tree\n", + "_outDir Results_GLOOME_ML_without_tree\n", "EOL" ] }, @@ -213,18 +227,95 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Run Count" + "### GLOOME MP" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "%%bash -s \"$taxonomicID\" \"$data\"\n", + "# we need a params file for each iteration of the _costMatrixGainLossRatio\n", + "# _costMatrixGainLossRatio is the ratio of the cost of a gain to the cost of a loss\n", + "# the default is 1:1 but we want to test 0.33, 0.5, 1, 2, ..., 8\n", + "# for each of these ratios we need to create a params file\n", + "cd $2/program_runs/\n", + "for costRatio in 0.33 0.5 1 2 3 4 5 6 7 8; do\n", + "# write param file that uses the rooted tree\n", + "cat > ./GLOOME_with_tree/gloome_tree_${costRatio}.params << EOL\n", + "_seqFile $2$1_pa_matrix.fasta\n", + "_treeFile $2$1_wol_tree_pruned_no_internal_labels.nwk\n", + "_costMatrixGainLossRatio $costRatio\n", + "_logValue 4\n", + "_performOptimizations 0\n", + "_outDir Results_GLOOME_MP_with_tree_$costRatio\n", + "EOL\n", + "\n", + "# write param file that doesn't use the rooted tree\n", + "cat > ./GLOOME_without_tree/gloome_without_tree_${costRatio}.params << EOL\n", + "_seqFile $2$1_pa_matrix.fasta\n", + "_costMatrixGainLossRatio $costRatio\n", + "_logValue 4\n", + "_performOptimizations 0\n", + "_outDir Results_GLOOME_MP_without_tree_$costRatio\n", + "EOL\n", + "done\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Then run all of the GLOOME MP scripts like so:\n", + "```bash\n", + "cd GLOOME_with_tree\n", + "for i in 0.33 0.5 1 2 3 4 5 6 7 8 ; do nohup ~/bin/GLOOME.VR01.266 gloome_tree_${i}.params > nohup_gloome_run_with_tree_${i}.out & disown ; done\n", + "```\n", + "\n", + "```bash\n", + "cd ../GLOOME_without_tree\n", + "for i in 0.33 0.5 1 2 3 4 5 6 7 8 ; do nohup ~/bin/GLOOME.VR01.266 gloome_without_tree_${i}.params > nohup_gloome_run_without_tree_${i}.out & disown ; done\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Count ML" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```bash\n", + "# cd Count/Count_ML\n", + "nohup java -Xmx2048M -cp ~/bin/Count/Count.jar ca.umontreal.iro.evolution.genecontent.ML -v true ../../../1236_wol_tree_pruned_with_internal_labels.nwk ../1236_pa_matrix.tsv > 1236_rates.r & disown\n", + "# and after completing the above:\n", + "nohup java -Xmx2048M -cp ~/bin/Count/Count.jar ca.umontreal.iro.evolution.genecontent.Posteriors ../../../1236_wol_tree_pruned_with_internal_labels.nwk ../1236_pa_matrix.tsv ./1236_rates.r > Count_output.tsv & disown\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Count MP" + ] + }, + { + "cell_type": "code", + "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ + "Current working directory: /root/work/projects/paper_comparative_study_of_hgt_inference_methods/02-run_programs\n", "Count with gain ratio 0.33 done\n", "Count output for gain ratio 0.33 grepped\n", "Count with gain ratio 0.5 done\n", @@ -252,10 +343,12 @@ "%%bash -s \"$taxonomicID\" \"$data\"\n", "# exit if command exits with non-zero status\n", "set -e\n", + "# print working directory\n", + "echo \"Current working directory: $(pwd)\"\n", "# change to the directory where Count will be run\n", "cd ./program_runs/Count/Count_MP/\n", "\n", - "# for `gain` parameter being 0.33, 0.5, 1, 2, 3, 4, 5, 6, 7, run Count\n", + "# for `gain` parameter being 0.33, 0.5, 1, 2, ,.. 8, run Count\n", "for g in 0.33 0.5 1 2 3 4 5 6 7 8;\n", "do\n", " # run Count\n", diff --git a/02-run_programs/src/run_ALE.py b/02-run_programs/src/run_ALE.py index 78e2fc35d51f9e96fb793901e983793b251aeade..b02d16260565f65f44817724dc608ed04290ea91 100644 --- a/02-run_programs/src/run_ALE.py +++ b/02-run_programs/src/run_ALE.py @@ -98,10 +98,10 @@ if __name__ == '__main__': # write everything in terms of argparse instead of hardcoding import argparse parser = argparse.ArgumentParser(description="Run ALE on all gene trees") - parser.add_argument("--species", "-s", type=str, default="../../data/1236_wol_tree_pruned_no_internal_labels.nwk", - help="Path to species tree file (default: ../../data/1236_wol_tree_pruned_no_internal_labels.nwk)") - parser.add_argument("--gene", "-g", type=str, default="../../data/1236_pruned_gene_trees.tsv.rooted", - help="Path to gene trees file (default: ../../data/1236_pruned_gene_trees.tsv.rooted)") + parser.add_argument("--species", "-s", type=str, default="../../1236_wol_tree_pruned_no_internal_labels.nwk", + help="Path to species tree file (default: ../../1236_wol_tree_pruned_no_internal_labels.nwk)") + parser.add_argument("--gene", "-g", type=str, default="../../1236_pruned_gene_trees.tsv.rooted", + help="Path to gene trees file (default: ../../1236_pruned_gene_trees.tsv.rooted)") parser.add_argument("--threads", "-t", type=int, default=50, help="Number of threads to use for parallelization (default: 50)") parser.add_argument("--bin", "-b", type=str,