{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Run `download_ncbi_genome_sequences.py` like so:\n",
    "\n",
    "```bash\n",
    "nohup ~/mambaforge/envs/hgt_analyses/bin/python src/download_ncbi_genome_sequences.py -i ../data/1236_subset_taxa.txt  > ../data/nohup_download_genome_seqs.out & disown\n",
    "```\n",
    "\n",
    "By default, `-o` output dir is `../data/genome_sequences/` where fasta and gff files are downloaded for each taxon in the list. \n",
    "\n",
    "Each of these files is of the form `{NCBI_taxon_ID}_{NCBI_genome_accession_ID}.{extension}`\n",
    "\n",
    "The TSV file `genome_sequences/1236_subset_accession_ids.tsv` lists out the mapping between taxon_ID, accession_ID, as well as other information such as assembly name and source DB (sometimes it is not REFSEQ)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import libraries\n",
    "import itertools"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Found 156509 unique gene names. List looks like this: ['291331.XOO2297', '225937.HP15_481', '573983.B0681_04825', '318167.Sfri_1100', '291331.XOO1980', '472759.Nhal_3786', '1798287.A3F14_06600', '1354304.XPG1_2379', '28173.VIBNI_B0337', '1461581.BN1049_02975']\n",
      "Writing gene names to ../data/1236_nog_genes.list\n"
     ]
    }
   ],
   "source": [
    "# Then, prepare a multiple sequence fasta file for all of these genomes, and similarly such a file for all of the genes of interest.\n",
    "\n",
    "# for the genomes, we just need to concatenate all the fasta files\n",
    "# in the genome_sequences directory, i.e. the output dir of the previous script\n",
    "\n",
    "# for the genes of interest, we need to first prepare a list of all the gene names.\n",
    "# for this, first read in the members.tsv file\n",
    "members_tsv_filepath = '../data/1236_nog_members.tsv'\n",
    "# read only the 6th column (CSV lists of the gene names)\n",
    "with open(members_tsv_filepath) as fo:\n",
    "    flines = fo.readlines()\n",
    "    gene_names = [line.split('\\t')[5] for line in flines]\n",
    "# now, split the gene names into a list of lists\n",
    "gene_names = [gn.split(',') for gn in gene_names]\n",
    "# flatten this huge list of lists efficiently\n",
    "gene_names = list(itertools.chain.from_iterable(gene_names))\n",
    "# remove duplicates\n",
    "gene_names = list(set(gene_names))\n",
    "print(f'Found {len(gene_names)\n",
    "               } unique gene names. List looks like this:', gene_names[:10])\n",
    "# write the gene names to a file, replacing in the members.tsv filepath, 'members' with 'genes' and 'tsv' with 'list'\n",
    "gene_names_filepath = members_tsv_filepath.replace(\n",
    "    'members', 'genes').replace('tsv', 'list')\n",
    "print(f'Writing gene names to {gene_names_filepath}')\n",
    "with open(gene_names_filepath, 'w') as fo:\n",
    "    fo.write('\\n'.join(gene_names) + '\\n')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "hgt_analyses",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}