Commit 3e76c60a authored by Swastik Mishra

Initial commit

arc/
scrap/
tmp*
nohup*
.txt
.vscode
pyrightconfig.json
__pycache__
# Ignore everything in this directory
data/*
plots/*
# But not the directory itself
!data/.gitkeep
!plots/.gitkeep
%% Cell type:markdown id: tags:
Run `download_ncbi_genome_sequences.py` like so:
```bash
nohup python src/download_ncbi_genome_sequences.py -i 1236_subset.taxa > nohup_download_genome_seqs.out & disown
```
By default, `-o` output dir is `../data/genome_sequences/` where fasta and gff files are downloaded for each taxon in the list.
Each of these files is of the form `{NCBI_taxon_ID}_{NCBI_genome_accession_ID}.{extension}`
The TSV file `genome_sequences/1236_subset_accession_ids.tsv` lists the mapping between taxon_ID and accession_ID, along with other information such as the assembly name and the source DB (which is not always REFSEQ).
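A quick way to sanity-check this mapping is to load the TSV with pandas (a minimal sketch; the path assumes the default output directory mentioned above, and the column names are those written by the download script):
%% Cell type:code id: tags:
``` python
import pandas as pd

# path assumes the default output directory of the download script
acc_df = pd.read_csv('../data/genome_sequences/1236_subset_accession_ids.tsv', sep='\t')
print(acc_df.columns.tolist())  # expected: taxon_id, acc_id, assembly_name, source_db
print(acc_df.head())
```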
%% Cell type:code id: tags:
``` python
# import libraries
import itertools
```
%% Cell type:code id: tags:
``` python
# Then, prepare a multiple sequence fasta file for all of these genomes, and similarly such a file for all of the genes of interest.
# for the genomes, we just need to concatenate all the fasta files
# in the genome_sequences directory, i.e. the output dir of the previous script
# for the genes of interest, we need to first prepare a list of all the gene names.
# for this, first read in the members.tsv file
members_tsv_filepath = '1236_subset_NOG_members.tsv'
# read only the 6th column (CSV lists of the gene names)
with open(members_tsv_filepath) as fo:
flines = fo.readlines()
gene_names = [line.split('\t')[5] for line in flines]
# now, split the gene names into a list of lists
gene_names = [gn.split(',') for gn in gene_names]
# flatten this huge list of lists efficiently
gene_names = list(itertools.chain.from_iterable(gene_names))
# remove duplicates
gene_names = list(set(gene_names))
print(f'Found {len(gene_names)} unique gene names. List looks like this:', gene_names[:10])
# write the gene names to a file, replacing in the members.tsv filepath, 'members' with 'genes' and 'tsv' with 'list'
gene_names_filepath = members_tsv_filepath.replace(
'members', 'genes').replace('tsv', 'list')
print(f'Writing gene names to {gene_names_filepath}')
with open(gene_names_filepath, 'w') as fo:
fo.write('\n'.join(gene_names) + '\n')
```
%% Output
Found 100692 unique gene names. List looks like this: ['1123053_GCA_000425345_02619', '128785_GCA_001556105_01030', '1123514_GCA_000381085_02040', '1441930_Z042_05715', '1449087_GCA_000711305_00098', '767434_Fraau_2269', '377629_TERTU_3187', '437022_CC99x_00673', '658445_H744_2c0221', '1265503_GCA_000378625_02015']
Writing gene names to 1236_subset_NOG_genes.list
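%% Cell type:markdown id: tags:
A minimal sketch of the genome-concatenation step mentioned in the comments above: concatenate every downloaded genome FASTA into one multi-sequence file. The directory path and output filename are assumptions based on the defaults of the download script.
%% Cell type:code id: tags:
``` python
import glob

genome_dir = '../data/genome_sequences'  # assumed: output dir of the download script
combined_fasta_path = 'all_genomes.fna'  # hypothetical output filename
with open(combined_fasta_path, 'w') as out_fo:
    # each downloaded genome FASTA is named {taxon_ID}_{accession_ID}.fna
    for fasta_path in sorted(glob.glob(f'{genome_dir}/*.fna')):
        with open(fasta_path) as in_fo:
            out_fo.write(in_fo.read())
```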
import time
import os
import subprocess
from loguru import logger
import pandas as pd
from multiprocessing import Pool
import shutil
import traceback
import io
import zipfile
# Uses ncbi `datasets` to download each genome assembly sequence
def download_genome_sequences(args):
"""
Download genome sequences from NCBI using ncbi `datasets` CLI program
Args:
acc_id (str): assembly accession ID
output_dir (str): path to the output directory
n_retries (int): number of retries to perform
Returns:
None
"""
# unpack the arguments
taxon_id, acc_id, output_dir, n_retries, files_to_download, bin_dir = args
# Create the output directory if it does not exist
os.makedirs(output_dir, exist_ok=True)
# run ncbi `datasets`
try:
ncbi_retries = n_retries
while ncbi_retries > 0:
try:
logger.info(f"Running datasets for assembly accession ID: {
acc_id}, taxon ID: {taxon_id}")
download_process = subprocess.run(
f"{bin_dir}/datasets download genome accession {acc_id} --include {files_to_download} --filename {output_dir}/{acc_id}.zip", shell=True)
logger.info(
f"now extracting files for assembly accession ID: {acc_id}, taxon ID: {taxon_id}")
# if download_process didn't have success exit code, log error and try again
if download_process.returncode != 0:
logger.error(f"datasets failed for assembly accession ID: {acc_id}, taxon ID: {
taxon_id} with error code {download_process.returncode}")
if ncbi_retries == 0:
logger.error(f"no more retries for assembly accession ID: {acc_id}, taxon ID: {
taxon_id} after {n_retries} retries, skipping to the next assembly accession ID")
break
else:
logger.info(f"Retrying datasets for assembly accession ID: {
acc_id}, taxon ID: {taxon_id}, retries left: {ncbi_retries}")
ncbi_retries -= 1
continue
# extract the files from the zip archive, except jsonl files
# basically rename each of the files to {taxon_id}_{acc_id}_{original_file_name}
with zipfile.ZipFile(f"{output_dir}/{acc_id}.zip", 'r') as zip_ref:
if len(zip_ref.namelist()) == 0:
logger.error(f"no files found in zip archive for assembly accession ID: {acc_id}, taxon ID: {taxon_id}")
break
for file_i in zip_ref.namelist():
if file_i.endswith('.jsonl') or file_i.endswith('.json') or file_i.endswith('README.md'):
continue
logger.info(f"extracting file: {file_i} for assembly accession ID: {acc_id}, taxon ID: {taxon_id}")
with zip_ref.open(file_i) as f:
file_i_basename = os.path.basename(file_i)
# check if file_i_basename starts with acc_id
if file_i_basename.startswith(acc_id):
with open(f"{output_dir}/{taxon_id}_{file_i_basename}", 'wb') as f_out:
f_out.write(f.read())
else:
with open(f"{output_dir}/{taxon_id}_{acc_id}_{file_i_basename}", 'wb') as f_out:
f_out.write(f.read())
# if the command runs successfully, break out of the while loop
break
except subprocess.CalledProcessError as e:
logger.error(f"datasets failed for assembly accession ID: {
acc_id}, taxon ID: {taxon_id} with error\n {e}")
if ncbi_retries == 0:
logger.error(
f"No more retries left for assembly accession ID: {acc_id}, taxon ID: {taxon_id} after {n_retries} retries, skipping to the next assembly accession ID")
break
else:
logger.info(
f"Retrying datasets for assembly accession ID: {acc_id}, taxon ID: {taxon_id}, retries left: {ncbi_retries}")
ncbi_retries -= 1
continue
except subprocess.CalledProcessError as e:
logger.error(f"datasets failed with error\n {e}")
return None
#
return taxon_id
def retrieve_accession_info(args):
"""
Fn for `multiprocessing.Pool` to retrieve assembly accession ID for a given taxon ID.
"""
taxon_id, n_retries, bin_dir = args
try:
datasets_retries = n_retries
while datasets_retries > 0:
try:
logger.info(f"Running datasets for taxon ID: {taxon_id}")
#
p1 = subprocess.Popen(
f"{bin_dir}/datasets summary genome taxon {taxon_id} --as-json-lines", shell=True, stdout=subprocess.PIPE)
logger.info(
f"now running dataformat for taxon ID: {taxon_id}")
tsv_records_process = subprocess.run(
f"{bin_dir}/dataformat tsv genome --fields accession,assminfo-name,source_database,assminfo-refseq-category", shell=True, stdin=p1.stdout, stdout=subprocess.PIPE)
# if the command runs successfully, break out of the while loop
if tsv_records_process.returncode == 0:
if len(tsv_records_process.stdout) == 0:
logger.info(
f"No records found for taxon ID: {taxon_id}")
return None
else:
tsv_records = tsv_records_process.stdout.decode(
'utf-8').split('\n')
break
else:
logger.error(
f"dataformat failed for taxon ID: {taxon_id} with returncode {tsv_records_process.returncode} and error:\n {tsv_records_process.stderr}")
datasets_retries -= 1
if datasets_retries == 0:
logger.error(
f"datasets failed for taxon ID: {taxon_id} after {n_retries} retries for taxon ID: {taxon_id}.")
return None
except subprocess.CalledProcessError as e:
logger.error(f"datasets failed for taxon ID: {
taxon_id} with error\n {get_exception_traceback_str(e)}")
datasets_retries -= 1
if datasets_retries == 0:
logger.error(
f"datasets failed for taxon ID: {taxon_id} after {n_retries} retries for taxon ID: {taxon_id}.")
return None
else:
logger.info(
f"Retrying datasets for taxon ID: {taxon_id}, retries left: {datasets_retries}")
continue
# remove header
if tsv_records[0].startswith('Assembly Accession'):
tsv_records = tsv_records[1:]
# if there are no records, log an error and continue to the next taxon ID
if len(tsv_records) == 0:
logger.info(f"No records found for taxon ID: {taxon_id}")
# this typically means that the ID itself doesn't have a genome associated with it,
# but the ID is valid and could be a strain ID; skip to the next taxon ID
return None
# if there are records, log the number of records found
logger.info(f"Found {len(tsv_records)
} records for taxon ID: {taxon_id}")
try:
if tsv_records[-1] == '':
tsv_records = tsv_records[:-1]
# for each record, extract the assembly accession ID and source database
# for record in tsv_records:
# print('\n'.join(tsv_records))
# if there's only one record, that is the acc_id
if len(tsv_records) == 1:
assembly_name = tsv_records[0].split('\t')[1]
# elif there are 2 or more records,
# check if one of them is RefSeq or if one of the RefSeq records has category of 'representative genome' or 'reference genome'
elif len(tsv_records) >= 2:
refseq_record = None
for record in tsv_records:
if record.split('\t')[2].endswith('REFSEQ'):
refseq_record = record
if record.split('\t')[3] in ['representative genome', 'reference genome']:
assembly_name = record.split('\t')[1]  # use this representative/reference record's assembly name
break
refseq_record = None
if refseq_record is None:
assembly_name = tsv_records[0].split('\t')[1]
# now use the assembly_name to get the acc_id of the record
# i.e. record in tsv_records with the same assembly_name but from GENBANK
# if there's no record from GENBANK, use the first record
taxon_record = [record for record in tsv_records if record.split(
'\t')[1] == assembly_name and record.split('\t')[2] == 'GENBANK']
if len(taxon_record) == 0:
taxon_record = tsv_records[0]
else:
taxon_record = taxon_record[0]
return {'taxon_id': taxon_id, 'acc_id': taxon_record.split('\t')[0],
'assembly_name': taxon_record.split('\t')[1], 'source_db': taxon_record.split('\t')[2]}
except Exception as e:
logger.info(
f"TSV records list that was not processed: {tsv_records}")
logger.error(f"Failed to extract acc_id for taxon ID: {
taxon_id}. Traceback\n {get_exception_traceback_str(e)}")
return None
except Exception as e:
logger.error(f"datasets failed for taxon ID: {
taxon_id}. Traceback\n {get_exception_traceback_str(e)}")
return None
def get_exception_traceback_str(exc: Exception) -> str:
# Ref: https://stackoverflow.com/a/76584117/
iofile = io.StringIO()
traceback.print_exception(exc, file=iofile)
return iofile.getvalue().rstrip()
def process_taxa_ids(input_file, taxon_ids_list, output_dir, threads, n_retries, files_to_download, bin_dir, acc_id_tsv_filepath):
"""
This function is used to download genome sequences from NCBI using the NCBI `datasets` CLI.
Args:
input_file (str): path to the input file containing taxon IDs
taxon_ids_list (list): list of taxon IDs
output_dir (str): path to the output directory
threads (int): number of threads to use
n_retries (int): number of retries to perform if failures are encountered
files_to_download (str): files to download from NCBI
bin_dir (str): path to the directory where the `datasets` and `dataformat` CLI programs are located
acc_id_tsv_filepath (str): path to the file containing assembly accession IDs
Returns:
None: if the function runs successfully
"""
to_download_info = False
# if the acc_id_tsv_filepath is provided, read in the assembly accession IDs from the file and use this df to download genome/gff/etc.
if acc_id_tsv_filepath is not None:
logger.info(f"Assembly accession IDs file provided, reading in assembly accession IDs from file: {acc_id_tsv_filepath}")
taxon_id_acc_id_df = pd.read_csv(acc_id_tsv_filepath, sep='\t')
taxon_id_acc_id_df['taxon_id'] = taxon_id_acc_id_df['taxon_id'].astype(str)
logger.info(f"Read in assembly accession IDs. Number of records: {len(taxon_id_acc_id_df)}")
acc_id_tsv_taxon_ids_list = taxon_id_acc_id_df['taxon_id'].tolist()
taxa_without_acc_list = set(taxon_ids_list) - set(acc_id_tsv_taxon_ids_list)
logger.info(f"taxon IDs list looks like this: {taxon_ids_list[:5]}, and in acc_id_tsv file: {acc_id_tsv_taxon_ids_list[:5]}. Difference: {len(taxa_without_acc_list)}")
# check if the taxon IDs in the file are the same as the taxon IDs in the input file. If not, then to_download_info = True
if len(taxa_without_acc_list) > 0:
to_download_info = True
logger.info(f"{acc_id_tsv_filepath} doesn't have Acc IDs for {len(taxa_without_acc_list)} taxon IDs that are in the input file")
else:
to_download_info = False
else:
taxon_id_acc_id_df = pd.DataFrame(columns=['taxon_id', 'acc_id', 'assembly_name', 'source_db'])
to_download_info = True
taxa_without_acc_list = taxon_ids_list
if to_download_info: # if the assembly accession IDs file contains taxon IDs that are not in the input file, run datasets to get assembly accession IDs for these taxon IDs
logger.info(f"Running datasets to get assembly accession IDs for each taxon ID")
# run ncbi 'datasets' to query for assembly accession IDs for each taxon ID, and write it into a pandas dataframe
taxon_id_acc_id_df2 = pd.DataFrame(columns=['taxon_id', 'acc_id', 'assembly_name', 'source_db'])
taxon_id_acc_id_list = []
# use multiprocessing to run the datasets function for each taxon ID, and write the results to a dataframe
with Pool(processes=threads) as pool:
for pool_out in pool.imap(retrieve_accession_info, [(taxon_id, n_retries, bin_dir) for taxon_id in taxa_without_acc_list]):
if pool_out is not None:
taxon_id_acc_id_list.append(pool_out)
taxon_id_acc_id_df2 = pd.DataFrame(taxon_id_acc_id_list)
# remove rows where duplicates of the acc_id column exist
taxon_id_acc_id_df2.drop_duplicates(subset='acc_id', inplace=True)
# add these new records to the existing taxon_id_acc_id_df
taxon_id_acc_id_df = pd.concat([taxon_id_acc_id_df, taxon_id_acc_id_df2], ignore_index=True)
# write the new taxon_id_acc_id_df to a file
taxon_id_acc_id_df.to_csv(
f"{output_dir}/{input_file.split('/')[-1].split('.')[0]}_accession_ids.tsv", sep='\t', index=False, quoting=1)
logger.info(f"Assembly accession IDs written to file: {output_dir}/{input_file.split('/')[-1].split('.')[0]}_accession_ids.tsv")
# if the taxon_id_acc_id_df is empty and there was no original acc_id_tsv_filepath provided, log an error and exit
if taxon_id_acc_id_df.empty and acc_id_tsv_filepath is None:
logger.error(f"No assembly accession IDs found for any taxon ID, exiting")
return None
elif taxon_id_acc_id_df.empty and acc_id_tsv_filepath is not None:
logger.error(f"No assembly accession IDs found for any additional taxon ID. Continuing with the original assembly accession IDs file")
# now we have the assembly accession IDs, we can use ncbi-genome-download to download the genome sequences, in parallel
# create args list for parallel processing
tax_acc_tuples_list = list(
zip(taxon_id_acc_id_df['taxon_id'], taxon_id_acc_id_df['acc_id']))
logger.info(f"Number of assembly accession IDs in dataframe: {len(taxon_id_acc_id_df)}")
# which `files_to_download` files are already downloaded in output_dir?
files_to_download_list = files_to_download.split(',')
extensions_dict = {'genome': '.fna', 'gff3': '.gff',
'gbff': '.gbff', 'protein': '.faa',
'cds': '.ffn', 'rna': '.fna',
'gtf': '.gtf'}
for file_type in files_to_download_list:
if not all([os.path.exists(f"{output_dir}/{taxon_id}_{acc_id}{extensions_dict[file_type]}") for taxon_id, acc_id in tax_acc_tuples_list]):
logger.info(f"Downloading {file_type} files")
tax_acc_tuples_list = [(taxon_id, acc_id) for taxon_id, acc_id in tax_acc_tuples_list if not os.path.exists(f"{output_dir}/{taxon_id}_{acc_id}{extensions_dict[file_type]}")]
logger.info(f"Number of assembly accession IDs to download: {len(tax_acc_tuples_list)}")
if len(tax_acc_tuples_list) == 0:
logger.info(f"All {file_type} files already downloaded, skipping")
continue
args_list = [(taxon_id, acc_id, output_dir, n_retries, files_to_download, bin_dir)
for taxon_id, acc_id in tax_acc_tuples_list]
logger.info(f"Downloading {files_to_download} from NCBI using {
threads} threads, input file: {input_file}, output directory: {output_dir}")
with Pool(processes=threads) as pool:
# iterate through what has been created via pool.imap()
for pool_out in pool.imap(download_genome_sequences, args_list):
if pool_out is not None:
logger.info(
f"Downloading {files_to_download} for taxon ID: {pool_out} finished successfully")
logger.info(f"Downloading genome sequences finished successfully")
return None
if __name__ == '__main__':
# log time
logger.info(f"Running {__file__} at {time.asctime()}")
start_time = time.time()
# import argparse and read in the arguments
import argparse
parser = argparse.ArgumentParser(
description='Download genome sequences from NCBI')
parser.add_argument(
'--input_file', '-i', help='Path to the input file containing taxon IDs', type=str, required=True)
parser.add_argument('--output_dir', '-o', help='Path to the output directory',
type=str, default='../data/genome_data/')
parser.add_argument('--threads', '-t',
help='Number of threads to use', type=int, default=100)
parser.add_argument('--to-download', type=str, default='genome,gff3',
help='Files to download from NCBI. Default is genome,gff3. \
See NCBI datasets CLI documentation for more possibilities.')
parser.add_argument(
'--retries', '-r', help='Number of retries to perform if failures are encountered, for each taxon ID input.', type=int, default=5)
parser.add_argument(
'--debug', '-d', help='Run in debug mode (runs first <--threads> entries only)', action='store_true')
parser.add_argument(
'--bin-dir', '-b', type=str, default='/root/mambaforge/envs/hgt_analyses/bin/',
help='Path to dir where `datasets` and `dataformat` CLI programs are.')
parser.add_argument('--acc_ids_tsv', '-a',
type=str, default=None,
help='Path to the file containing assembly accession IDs')
args = parser.parse_args()
# Create the output directory if it does not exist
os.makedirs(args.output_dir, exist_ok=True)
# read in all the taxon IDs from the input file
with open(args.input_file, 'r') as f:
taxon_ids_list = [line.strip()
for line in f.readlines() if line.strip() != '']
# debugging: run only for first 'threads' lines using a tmp_ file
if args.debug:
logger.debug(f"Running in debug mode, using only first {
args.threads} taxon IDs")
taxon_ids_list = taxon_ids_list[:args.threads]
# run the main function
process_taxa_ids(args.input_file, taxon_ids_list, args.output_dir, args.threads, args.retries, args.to_download, args.bin_dir, args.acc_ids_tsv)
# log the time
end_time = time.time()
duration = end_time - start_time
human_readable_duration = time.strftime("%H:%M:%S", time.gmtime(duration))
logger.info(f"Finished running {__file__} in {human_readable_duration}")
#!/usr/bin/env python3
from subprocess import CalledProcessError, Popen
import os
import time
from datetime import timedelta, datetime
import argparse
import glob
from multiprocessing import Pool
def run_command(params):
"""
Inputs:
params: tuple of (shell_command, logFileObject)
Function:
Runs the shell command, and writes the output to the log file object
"""
cmd, log_file_path = params
with open(log_file_path, 'w') as log_FO:
process = Popen(cmd, stdout=log_FO, universal_newlines=True)
if process.wait() != 0:
raise CalledProcessError(process.returncode, cmd)
def MAD_roots_NOG_trees_parallel(input_file_path, mad_executable, n_lines, num_subprocesses):
"""
Inputs:
input_file_path: TSV format input file. Columns are: treeID(str) and tree(str)
mad_executable: path to MAD executable
n_lines: number of lines that the input file will be split into for each split file.
Outputs:
Writes out a file with the same name as the input file, but with a `.rooted` at the end of the filename.
Additional info:
This function splits the input file into multiple files, and runs MAD on each of these split files in parallel.
The number of processes that will be run in parallel is equal to the number of split files.
The number of lines in each split file is equal to the `n_lines` argument.
The output file will contain the same number of lines as the input file, but the second column will contain the rooted trees.
"""
# Check if the input file exists
if not os.path.isfile(input_file_path):
raise FileNotFoundError(f"The input file {input_file_path} does not seem to exist. Check the filepath provided.")
# Find the full path of the input file and extract the dir name
input_file_path = os.path.abspath(input_file_path)
input_dir = os.path.dirname(input_file_path)
# Get the current date and time
now = datetime.now()
# Format the current date and time as a string
timestamp = now.strftime("%Y%m%d_%H%M%S")
# Create a temporary directory path with the timestamp
tmp_dir = f"{input_dir}/tmp_{timestamp}"
# Create the temporary directory
os.mkdir(tmp_dir)
# log this with time, along with location of the tmp folder
print(f'Created the tmp folder at: {datetime.now()} at location: {tmp_dir}')
# Open the input file and read it in chunks of `n_lines` lines
nwk_split_files = [] # track the filenames of split files containing the second column (tree)
info_split_files = [] # track the filenames of split files containing the first column (treeID)
with open(input_file_path, 'r') as treesFO:
chunk = []
for i, line in enumerate(treesFO):
split_line = line.strip().split()
chunk.append(split_line)
if (i + 1) % n_lines == 0:
# Write out the split trees file
nwk_split_filepath = tmp_dir+'/tmp_tree_split.nwk.'+str((i // n_lines) + 1)
info_split_filepath = tmp_dir+'/tmp_tree_split.info.'+str((i // n_lines) + 1)
with open(nwk_split_filepath, 'w') as split_out_fo:
split_out_fo.write('\n'.join(line[1] for line in chunk) + '\n')
with open(info_split_filepath, 'w') as split_out_fo:
split_out_fo.write('\n'.join(line[0] for line in chunk) + '\n')
# track the filenames
nwk_split_files.append(nwk_split_filepath)
info_split_files.append(info_split_filepath)
chunk = []
# Write the last chunk if it's not empty
if chunk:
# lines in `chunk` were already split into [treeID, tree] pairs above
nwk_split_filepath = tmp_dir+'/tmp_tree_split.nwk.'+str((i // n_lines) + 2)
info_split_filepath = tmp_dir+'/tmp_tree_split.info.'+str((i // n_lines) + 2)
with open(nwk_split_filepath, 'w') as split_out_fo:
split_out_fo.write('\n'.join(line[1] for line in chunk) + '\n')
with open(info_split_filepath, 'w') as split_out_fo:
split_out_fo.write('\n'.join(line[0] for line in chunk) + '\n')
# track the filenames
nwk_split_files.append(nwk_split_filepath)
info_split_files.append(info_split_filepath)
# using subprocess.Popen inside the run_command function,
# call MAD on all of these split newick files. Prepare the commands.
# `mad -n trees.nwk.1`
commands = [([mad_executable, '-t', '-n', split_file_path], split_file_path+'.stdout') for split_index, split_file_path in enumerate(nwk_split_files)]
mad_start_time = time.time()
# Log timestamp for the start of the MAD rooting process
print(f'MAD rooting starting at: {datetime.now()}')
# Create a multiprocessing Pool with the defined maximum number of subprocesses
with Pool(num_subprocesses) as pool:
# Use pool.map to run the commands in parallel
pool.map(run_command, commands)
# Indicate the completion of the MAD rooting process
print(f'MAD rooting completed at: {datetime.now()}. \
Total time taken for MAD rooting: {timedelta(seconds=time.time() - mad_start_time)}')
# Log that the split files are being combined, along with the timestamp
print(f'{datetime.now()}: Combining the split files into one file...')
# Note: mad adds a `.rooted` at the end of the input filename, as the output filename
# combine the columns from the info files with the nwk.rooted files to prepare the final output file
with open(input_file_path + '.rooted', 'w') as rootedOutputFO:
for i, nwk_split_filepath in enumerate(nwk_split_files):
# read in the corresponding split nwk.rooted file
rooted_nwk_split_filepath = f'{nwk_split_filepath}.rooted'
info_split_filepath = info_split_files[i]
with open(rooted_nwk_split_filepath, 'r') as rooted_nwk_split_FO:
rooted_nwk_trees = [
i.strip() for i in rooted_nwk_split_FO.readlines()
] # remove trailing newlines. This is a list of rooted trees
# combine the info and rooted trees
with open(info_split_filepath, 'r') as info_file:
rooted_info_plus_tree_chunks = [
'\t'.join([info_line.strip(), rooted_nwk_trees[line_index]])
for line_index, info_line in enumerate(info_file)
]
# write out the new tree line chunks to the output file
rootedOutputFO.write('\n'.join(rooted_info_plus_tree_chunks) + '\n')
# Clear memory
del rooted_info_plus_tree_chunks
del rooted_nwk_trees
# delete the split nwk files in the tmp folder
for split_file_name in nwk_split_files:
os.remove(split_file_name)
# combine all the log files into one log file using `tail -n +1`, and write in the input_dir with timestamp
now = datetime.now()
timestamp = now.strftime("%Y%m%d_%H%M%S")
os.system('tail -n +1 '+tmp_dir+'/*.stdout > '+input_dir+'/tmp_mad_rooting_splits_'+timestamp+'.log')
# delete the tmp folder
# os.system('rm -rf '+tmp_dir)
if __name__ == '__main__':
start_time = time.time()
# Catches and interprets the user input from the command line.
parser = argparse.ArgumentParser()
# General stuff
parser.add_argument('--Input', '-i', required=True, type=str,
help="path to input file which should be tsv format with two columns: first column is the tree ID and the second column is the tree in newick")
parser.add_argument('--mad', '-m', required=False, type=str, default='mad',
help="path to MAD executable. Assumes mad is within $PATH by default, if this argument is not provided.")
parser.add_argument('--nlines', '-n', required=False, type=int, default=50,
help="(Int) default=50. \
Number of lines that the input file will be split into for each split\
file. Smaller number implies less trees being processed by one MAD process.")
parser.add_argument('--num_subprocesses', '-p', required=False, type=int, default=50,
help="(Int) default=50. Number of processes to run in parallel. \
The larger this number, the more number of processes that will \
be run in parallel, since each split file chunk gets its own process")
# parser.add_argument('--Output', '-o', required=True, type=str,
# help="path to output files")
args = parser.parse_args()
MAD_roots_NOG_trees_parallel(args.Input, args.mad, args.nlines, args.num_subprocesses)
print('Process time taken from start to finish: ' +
str(timedelta(seconds=time.time() - start_time)) )
%% Cell type:markdown id: tags:
# Run the HGT-inference programs
It is advised to run each program's script separately, since these runs are very time-consuming. For some programs, multiprocessing lets us split the work across workers to reduce the running time. We are running these on an HPC with ~50 threads each.
%% Cell type:code id: tags:
``` python
taxonomicID = '1236'
data = '/root/work/projects/hgt_inference_comparative_study/data/'
```
%% Cell type:markdown id: tags:
**Note:**
For each of the scripts run below of the form `run_{inference_program}.py` (e.g. `run_ALE.py`),
you can use the `-h` flag to see the help message for the script.
For example, run `python run_ALE.py -h`.
These scripts are run with their default parameters. You can see exactly which parameters and files are being used by opening a script and looking at its `parser.add_argument` calls. If you want to use different files, prepare files with the same format as those used in the scripts and pass them as arguments.
%% Cell type:code id: tags:
``` python
%%bash
# make directories for programs
mkdir -p ALE AnGST RANGER RANGER-Fast Count GLOOME_with_tree GLOOME_without_tree
```
%% Cell type:markdown id: tags:
## Ranger
From the manual for RANGER-DTL 2.0: the program takes as input a single file (specified using the `-i` command line option) containing first the species tree, followed by a single gene tree. All input trees must be expressed in Newick format terminated by a semicolon, and they must be fully binary (fully resolved) and rooted. Species names in the species tree must be unique. E.g., ((speciesA, speciesB), speciesC);
Each leaf in the gene tree must be labeled with the name of the species from which that gene was sampled. If desired, the gene name can be appended to the species name, separated by an underscore (`_`). The gene tree may contain any number (zero, one, or more) of homologous genes from the same species. E.g., (((speciesA_gene1, speciesC_gene1), speciesB_geneX), speciesC_gene2);
The following script (`src/run_rangerdtl.py`) runs Ranger on each of the gene trees. First it creates an input file for each gene tree in our dataset by concatenating the species tree and the gene tree. Then, using `multiprocessing`, it runs `Ranger-DTL-Fast` (from `SupplementaryPrograms/`) and `AggregateRanger` (from `CorePrograms/`) from the directory with the Ranger executables. Note that the output reconciliation files of `Ranger-DTL-Fast` need to be named `{prefix}{index}`, where in our case the prefix is `recon` and the indices begin at 1; the path to these files is then passed to `AggregateRanger` using the prefix only. See the comments in the script for more information. A minimal sketch of the per-tree input-file preparation is shown after the run commands below.
Run Ranger using the script `run_rangerdtl.py` in the following manner:
```bash
# cd RANGER
$ nohup python ../src/run_rangerdtl.py > nohup_ranger.out & disown
```
### Ranger-Fast
Run Ranger-Fast using the same script but with `--fast` flag:
```bash
# cd RANGER-Fast
$ nohup python ../src/run_rangerdtl.py --fast > nohup_ranger_fast.out & disown
```
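A minimal sketch of the per-gene-tree input-file preparation described above (this is what `src/run_rangerdtl.py` automates; the tree file paths and output filenames below are assumptions):
%% Cell type:code id: tags:
``` python
# read the rooted species tree once (assumed path)
with open('../data/1236_wol_tree_pruned_no_internal_labels.nwk') as fo:
    species_tree = fo.read().strip()

# gene trees are assumed to be in a TSV of the form: treeID <tab> rooted Newick tree
with open('../data/1236_pruned_gene_trees.tsv.rooted') as fo:
    for line in fo:
        nog_id, gene_tree = line.rstrip('\n').split('\t')[:2]
        # one Ranger input file per gene tree: species tree first, then the gene tree
        with open(f'{nog_id}.ranger_input', 'w') as out_fo:
            out_fo.write(species_tree + '\n' + gene_tree + '\n')
```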
%% Cell type:markdown id: tags:
## AnGST
%% Cell type:markdown id: tags:
AnGST needs the species tree to be rooted AND to have an additional branch length for the root (prepared in the previous step's notebook; the needed species tree file is named `*_with_root_length.nwk`).
If you alter the scripts below, note that "AnGST.py" can only be run with Python 2.
We need an input file for "AnGST.py". The input file must contain the paths to the species tree, gene tree, output directory, and penalty file. Example (a minimal Python sketch for writing such a file follows the example):
%% Cell type:raw id: tags:
species=/path/X_species_edited_angst.nwk
gene=/path/one_unrooted_genetree.nwk
output=/path/output_result
penalties=/path/penalty.file
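%% Cell type:markdown id: tags:
A minimal sketch of writing one such AnGST input file from Python (all paths are placeholders; `src/run_angst.py`, included later in this commit, generates these files per NOG):
%% Cell type:code id: tags:
``` python
# hypothetical paths for a single NOG
input_lines = [
    'species=../data/1236_wol_tree_pruned_angst.nwk\n',  # rooted species tree with a root branch length
    'gene=./tmp/NOG_example.unrooted.nwk\n',             # one unrooted gene tree
    'output=./Results/NOG_example/\n',                   # per-NOG output directory
    'penalties=../src/angst_penalty_file.txt',           # penalty file created below
]
with open('./tmp/NOG_example.input', 'w') as fo:
    fo.writelines(input_lines)
```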
%% Cell type:markdown id: tags:
We create the penalty file based on the parameters suggested in the manual.
%% Cell type:code id: tags:
``` python
%%bash
# create AnGST directory if it doesn't exist
# write penalty file
# these params were taken from the AnGST manual
mkdir -p ./AnGST
cat > ./src/angst_penalty_file.txt << EOL
hgt: 3.0
dup: 2.0
los: 1.0
spc: 0.0
EOL
```
%% Cell type:markdown id: tags:
Run AnGST using the script `run_angst.py` in the following manner:
```bash
# cd AnGST
$ nohup python ../src/run_angst.py > nohup_run_angst.out & disown
```
%% Cell type:markdown id: tags:
## ALE
The script here makes use of the direct compilation of ALE (instead of using `Docker`). Run the script from a terminal, from inside the `ALE` directory within the `program_runs` directory. For each gene tree, the script starts a new process in which the gene tree is written to a new file, `ALEobserve` is used to create the `.ale` file, and `ALEml_undated` is run to perform the reconciliation. Sometimes the ALE programs fail for no particular reason, so the script retries each ALE program a number of times before giving up. Use `--help` to see the available options and the default values used. A single-tree sketch of the two ALE steps is shown after the command below.
```bash
# cd ALE
$ nohup python ../src/run_ALE.py > run_ALE_nohup.out & disown
```
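For a single gene tree, the two ALE steps amount to the following (a sketch only; the binary location and file names are assumptions matching the defaults of `run_ALE.py`):
%% Cell type:code id: tags:
``` python
import subprocess

ale_bin = '/root/bin/ALE/bin/'  # assumed ALE bin directory
species_tree = '../data/1236_wol_tree_pruned_no_internal_labels.nwk'  # assumed species tree
gene_tree = 'NOG_example.tree'  # hypothetical file holding one gene tree in Newick

# ALEobserve creates the .ale file from the gene tree
subprocess.run([f'{ale_bin}ALEobserve', gene_tree], check=True)
# ALEml_undated reconciles the gene tree against the species tree
subprocess.run([f'{ale_bin}ALEml_undated', species_tree, f'{gene_tree}.ale',
                'separators="."', 'sample=100'], check=True)
```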
%% Cell type:markdown id: tags:
## GLOOME and Count
The function below creates the presence-absence (PA) matrix for GLOOME, based on the PA of taxa in the NOGs of interest. We use that matrix (as a FASTA file input) along with the species tree (and, in a separate run, without the tree) to infer gains and losses. Since Count also uses a PA matrix, but in TSV format, we prepare it here as well. Note that Count requires each row to be a gene family (in our case, a NOG), whereas GLOOME needs each row (i.e. each sequence in the FASTA file) to be a taxon.
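A minimal sketch of building the same presence/absence information in both orientations (one FASTA record per taxon for GLOOME, one TSV row per NOG for Count). The members-file column layout is an assumption here: NOG ID in the second column and comma-separated member gene names of the form `{taxon_ID}_{locus}` in the sixth column; output file names are placeholders.
%% Cell type:code id: tags:
``` python
# map each NOG to the set of taxa that have at least one member gene
nog_to_taxa = {}
with open('1236_subset_NOG_members.tsv') as fo:
    for line in fo:
        cols = line.rstrip('\n').split('\t')
        nog_to_taxa[cols[1]] = {gene.split('_')[0] for gene in cols[5].split(',')}

all_taxa = sorted(set().union(*nog_to_taxa.values()))
all_nogs = sorted(nog_to_taxa)

# GLOOME: each FASTA record is a taxon, each position a NOG (1 = present, 0 = absent)
with open('pa_matrix.fasta', 'w') as fa_out:
    for taxon in all_taxa:
        row = ''.join('1' if taxon in nog_to_taxa[nog] else '0' for nog in all_nogs)
        fa_out.write(f'>{taxon}\n{row}\n')

# Count: each TSV row is a NOG (gene family), each column a taxon
with open('pa_matrix.tsv', 'w') as tsv_out:
    tsv_out.write('family\t' + '\t'.join(all_taxa) + '\n')
    for nog in all_nogs:
        row = '\t'.join('1' if taxon in nog_to_taxa[nog] else '0' for taxon in all_taxa)
        tsv_out.write(nog + '\t' + row + '\n')
```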
%% Cell type:code id: tags:
``` python
%%bash -s "$taxonomicID" "$data"
# write param file that uses the rooted tree
cat > ./GLOOME_with_tree/gloome_tree.params << EOL
_seqFile $2$1_pa_matrix.fasta
_treeFile $2$1_wol_tree_pruned_no_internal_labels.nwk
_gainLossDist 1
# for COG and EggNOG only patterns with 3 or more ones are observable
_minNumOfOnes 4
# include Parsimony results along with ML
_costMatrixGainLossRatio 1
## Advanced
_logValue 4
_outDir Results_GLOOME_with_tree
EOL
# write param file that doesn't use the rooted tree
cat > ./GLOOME_without_tree/gloome_without_tree.params << EOL
_seqFile $2$1_pa_matrix.fasta
_gainLossDist 1
# for COG and EggNOG only patterns with 3 or more ones are observable
_minNumOfOnes 4
# include Parsimony results along with ML
_costMatrixGainLossRatio 1
## Advanced
_logValue 4
_outDir Results_GLOOME_without_tree
EOL
```
%% Cell type:markdown id: tags:
Run GLOOME via the command line with each of the param files as input, from inside the respective GLOOME directories (`cd GLOOME_with_tree` / `cd GLOOME_without_tree`). Using `nohup`, `&`, and `disown` here because the process may take a while to finish, and I am running it remotely on a server. The processes shouldn't be affected this way, even if I log out.
```bash
nohup ~/bin/GLOOME.VR01.266 gloome_tree.params > nohup_gloome_run_with_tree.out & disown
nohup ~/bin/GLOOME.VR01.266 gloome_without_tree.params > nohup_gloome_run_without_tree.out & disown
```
%% Cell type:markdown id: tags:
### Run Count
%% Cell type:code id: tags:
``` python
%%bash -s "$taxonomicID" "$data"
# exit if command exits with non-zero status
set -e
# change to the directory where Count will be run
cd ./program_runs/Count/Count_MP/
# for `gain` parameter values 0.33, 0.5, 1, 2, 3, 4, 5, 6, 7, 8, run Count
for g in 0.33 0.5 1 2 3 4 5 6 7 8;
do
# run Count
java -Xmx2048M -cp ~/bin/Count/Count.jar ca.umontreal.iro.evolution.genecontent.AsymmetricWagner -gain $g $2$1_wol_tree_pruned_with_internal_labels.nwk $2$1_pa_matrix.tsv > ${1}_Count_output_gain_${g}.tsv && echo "Count with gain ratio $g done" &&
# grep the output
grep "# PRESENT" ${1}_Count_output_gain_${g}.tsv > ${1}_Count_output_gain_${g}_genome_sizes.tsv &&
grep "# CHANGE" ${1}_Count_output_gain_${g}.tsv > ${1}_Count_output_gain_${g}_changes.tsv &&
grep "# FAMILY" ${1}_Count_output_gain_${g}.tsv > ${1}_Count_output_gain_${g}_families.tsv &&
echo "Count output for gain ratio $g grepped"
done
```
%% Output
Count with gain ratio 0.33 done
Count output for gain ratio 0.33 grepped
Count with gain ratio 0.5 done
Count output for gain ratio 0.5 grepped
Count with gain ratio 1 done
Count output for gain ratio 1 grepped
Count with gain ratio 2 done
Count output for gain ratio 2 grepped
Count with gain ratio 3 done
Count output for gain ratio 3 grepped
Count with gain ratio 4 done
Count output for gain ratio 4 grepped
Count with gain ratio 5 done
Count output for gain ratio 5 grepped
Count with gain ratio 6 done
Count output for gain ratio 6 grepped
Count with gain ratio 7 done
Count output for gain ratio 7 grepped
Count with gain ratio 8 done
Count output for gain ratio 8 grepped
%% Cell type:markdown id: tags:
## Wn
%% Cell type:markdown id: tags:
Wn can be run directly using the script `run_Wn.py` in the `src` directory:
```bash
cd 02-run_programs/program_runs/Wn/
nohup ~/mambaforge/envs/hgt_analyses/bin/python ../../../02-run_programs/src/run_Wn.py > nohup_run_Wn.log & disown
```
The results will be saved by default in the `Results` directory.
../data/program_runs
''' This script does all the functional category analysis. It includes the calculation of absolute and relative transferability, Fisher's exact test on relative transferability, the Kruskal-Wallis test on absolute transferability, and the Mann-Whitney test on meta categories. '''
#list of python libraries
import os
import ete3
import argparse
import statistics
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as stats
parser = argparse.ArgumentParser()
parser.add_argument('-m',metavar='memberfile',type=str,help='eggNOG memberfile')
parser.add_argument('-a',metavar='annotations',type=str,help='eggNOG annotations')
parser.add_argument('-i',metavar='HGT file',type=str,help='HGT outfile. Each line should contain only one putative HGT candidate.')
parser.add_argument('-g',metavar='Input type',type=int,help='HGT input type. Wn_threshold returns a file with putative HGT candidates as specific gene IDs, while phylogenetic programs return either pairwise recipient-donor pairs or only recipients, both as taxonomic IDs. Specify the input as 0 -> gene ID, 1 -> taxonomic ID')
parser.add_argument('-o',metavar='outfile name',type=str,help='Outfile name with pathing.')
parser.add_argument('-f',metavar='BH correction, false positive rate',type=int,default=50,help='BH correction parameter to set the ratio of false positives.')
args = parser.parse_args()
if not args.i:
parser.error("No HGT input given. Please check your inputs again.")
exit
if not args.o:
parser.error("No outfil name. Please check your inputs again.")
exit
if not args.m:
parser.error("No memberfile to be found. Please check your inputs again.")
exit
if not args.a:
parser.error("No annotation file to be found. Please check your inputs again.")
exit
#############################################################################
NOG2IDs = dict() # NOG to taxonomic IDs
NOG2HGT = dict() # NOG to the amount of HGT
prot2NOGs = dict() # protein accession ID to its NOG
ID2NOGs = dict()
NOG2FC = dict() # NOG to its functional category
FC2HGT= dict() #Functional category to a list of proteins belonging to the functional category
cats = ["C","G","E","F","H","I","P","Q","V","D","T","M","N","U","O","J","K","L"]
cat_to_total = dict() # counts the amount of NOGs of one functional category
cat_to_transfers = dict() # counts the amount of transfers/gains of one functional category
cat_to_transferable = dict() # counts the amount of transferable NOGs of one functional category
cat_to_description = {
"C":"Energy production and conversion",
"G":"Carbohydrate transport and metabolism",
"E":"Amino acid transport and metabolism",
"F":"Nucleotide transport and metabolism",
"H":"Coenzyme transport and metabolism",
"I":"Lipid transport and metabolism",
"P":"Inorganic ion transport and metabolism",
"Q":"Secondary metabolites biosynthesis, transport and catabolism",
"V":"Defense mechanisms",
"D":"Cell cycle control, cell division, chromosome partitioning",
"T":"Signal transduction mechanisms",
"M":"Cell wall/membrane/envelope biogenesis",
"N":"Cell motility",
"U":"Intracellular tracking, secretion, and vesicular transport",
"O":"Post-translational modi cation, protein turnover, chaperones",
"J":"Translation, ribosomal structure and biogenesis",
"K":"Transcription",
"L":"Replication, recombination and repair"
}
meta2tottrans = dict()
meta_to_total = dict() ## counts the amount of gene families of one meta category
meta_to_transfers = dict() ## counts the amount of transfers/gains of one meta category
meta_to_transferable = dict() ## counts the amount of transferable gene families of one meta category
FC2tottrans = dict() #FC to all transfers in a list; each element of the list contains the amount of transfers of one NOG; needed for the Kruskal-Wallis test
meta2tottrans = dict() #same as above, only with meta categories
## Declare a dictionary where each functional category points to their corresponding meta category.
## A -> metabolic function, B -> cellular processes/signaling, C -> informational
cat_to_meta = {
"C":"A",
"G":"A",
"E":"A",
"F":"A",
"H":"A",
"I":"A",
"P":"A",
"Q":"A",
"V":"B",
"D":"B",
"T":"B",
"M":"B",
"N":"B",
"U":"B",
"O":"B",
"J":"C",
"K":"C",
"L":"C"
}
# BH-correction function
def BH(pvalues):
pvalues = dict(sorted(pvalues.items(), key=lambda x:x[1]))
FCs = list(pvalues.keys())
critical = 0
pvalue2significant = dict()
for i in range(len(FCs)):
pvalue = pvalues[FCs[i]]
correction = ((i + 1) / len(FCs)) * (args.f / 100)  # BH critical value: (rank / m) * FDR
if pvalue < correction:
if pvalue > critical:
critical = pvalue
pvalue2significant[FCs[i]] = "no"
for i in range(len(FCs)):
if pvalues[FCs[i]] <= critical:
pvalue2significant[FCs[i]] = "yes"
return pvalue2significant
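# Example (hypothetical p-values): BH({"C": 0.001, "K": 0.04, "L": 0.20}) marks as
# significant ("yes") every category whose p-value does not exceed the largest p(i)
# passing the Benjamini-Hochberg criterion p(i) < (rank_i / m) * (args.f / 100).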
################################################################################
for line in open(args.m):
line = line.rstrip()
line = line.split("\t")
NOG = line[1]
protnames = line[4].split(",") # list containing all protein IDs
taxIDs = line[5].split(",") # list containing all taxonomic IDs
NOG2IDs[NOG] = taxIDs
NOG2HGT[NOG] = list()
for name in protnames:
if name not in prot2NOGs.keys():
prot2NOGs[name] = NOG
else:
prot2NOGs[name] += ","+NOG
for ID in taxIDs:
if ID not in ID2NOGs.keys():
ID2NOGs[ID] = NOG
else:
ID2NOGs[ID] += ","+NOG
for line in open(args.a):
line = line.rstrip()
line = line.split("\t")
NOG = line[1]
FC = line[2]
NOG2FC[NOG] = FC
if FC not in FC2HGT.keys():
FC2HGT[FC] = list()
# depending on the HGT input (either gene IDs or taxonomic IDs), the general procedure differs by a bit.
if args.g == 0: # if the HGT input contains gene IDs
for line in open(args.i):
HGTs = line.rstrip().split(",")
for HGT in HGTs:
if HGT in prot2NOGs.keys():
for NOG in prot2NOGs[HGT].split(","):
NOG2HGT[NOG].append(HGT)
else: # if the HGT input contains taxonomic IDs
for line in open(args.i):
HGTs = line.rstrip().split(",")
for HGT in HGTs:
if HGT in ID2NOGs.keys():
for NOG in ID2NOGs[HGT].split(","):
NOG2HGT[NOG].append(HGT)
#########################################################################################################
# Now sort the NOGs into their functional category. After that, count the amount of HGT per functional category/meta category
for NOG in NOG2HGT:
if NOG in NOG2FC.keys():
HGTs = NOG2HGT[NOG]
FC = NOG2FC[NOG]
for HGT in HGTs:
FC2HGT[FC].append(HGT)
''' calculating the relative and absolute transferability for all functional and meta categories '''
#### All FC
rates_FC = open(args.o+"_rates_FC.txt","w") #this is the outfile
for cat in cats: ## We are declaring every key-value pair in the three dictionaries above. Key will be the functional category and the value will be 0.
cat_to_total[cat] = 0
cat_to_transfers[cat] = 0
cat_to_transferable[cat] = 0
for NOG in NOG2FC:
cat = NOG2FC[NOG]
trans = len(NOG2HGT[NOG])
for i in range(len(cat)): ## There might be gene families with more than one functional category, so we iterate through the whole length of "cat".
onecat = cat[i]
if onecat in cats: ## Check if this functional category is even in the list of functional categories we are looking for
cat_to_total[onecat] += 1
cat_to_transfers[onecat] += trans
if cat not in FC2tottrans.keys():
FC2tottrans[cat] = list()
FC2tottrans[cat].append(trans)
else:
FC2tottrans[cat].append(trans)
if trans > 1:
cat_to_transferable[onecat] += 1
## Now calculate all rates.
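## For each functional category (FC):
##   abtrans  = total transfers observed in NOGs of the FC / number of NOGs of the FC
##   reltrans = (fraction of the FC's NOGs that are "transferable", i.e. have more than one transfer)
##              divided by the overall fraction of transferable NOGs across all FCs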
sum_total = sum(list(cat_to_total[FC] for FC in cat_to_total.keys()))
sum_transferables = sum(list(cat_to_transferable[FC] for FC in cat_to_transferable.keys()))
rates_FC.write("#FC\tabtrans\treltrans\n")
for cat in cats:
transfers = cat_to_transfers[cat]
total = cat_to_total[cat]
transferable = cat_to_transferable[cat]
nontransferable = total - transferable
abtrans = transfers/total
rate = transferable/total
reltrans = rate/(sum_transferables/sum_total)
rates_FC.write(cat+"\t"+str(abtrans)+"\t"+str(reltrans)+"\n")
rates_FC.close()
#### Meta categories
rates_meta = open(args.o+"_rates_meta.txt","w")
#later needed for Fisher exact test
total_transfers = 0
total_prots = 0
total_transferable = 0
for meta in ["A","B","C"]: ## We are declaring every key-value pair in the three dictionaries above. Key will be the meta category and the value will be 0.
meta2tottrans[meta] = list()
meta_to_total[meta] = 0
meta_to_transfers[meta] = 0
meta_to_transferable[meta] = 0
for NOG in NOG2FC:
cat = NOG2FC[NOG]
trans = len(NOG2HGT[NOG])
for i in range(len(cat)): ## There might be gene families with more than one functional category, so we iterate through the whole length of "cat".
onecat = cat[i]
if onecat in cats: ## Check if this functional category is even in the list of functional categories we are looking for
## get the meta category of the current functional category
meta = cat_to_meta[onecat]
meta_to_total[meta] += 1
total_prots += 1 #Fisher
meta_to_transfers[meta] += trans
total_transfers += trans #Fisher
meta2tottrans[meta].append(trans)
if float(trans) > 2:
meta_to_transferable[meta] += 1
total_transferable += 1 #Fisher
## Now calculate all transferability values.
sum_total = sum(list(meta_to_total[meta] for meta in meta_to_total.keys()))
sum_transferables = sum(list(meta_to_transferable[meta] for meta in meta_to_transferable.keys()))
rates_meta.write("#meta\tabtrans\treltrans\n")
category_name = {
"A":"metabolic function",
"B":"cellular processes/signalling",
"C":"informational"
}
for meta in ["A","B","C"]:
transfers = meta_to_transfers[meta]
total = meta_to_total[meta]
transferable = meta_to_transferable[meta]
nontransferable = total - transferable
abtrans_meta = transfers/total
rate_meta = transferable/total
reltrans_meta = rate_meta/(sum_transferables/sum_total)
rates_meta.write(meta+"\t"+str(abtrans_meta)+"\t"+str(reltrans_meta)+"\n")
rates_meta.close()
''' Fisher exact test on relative transferability '''
#For all the FC we have the three dictionaries to use:
#cat_to_total
#cat_to_transfers
#cat_to_transferable
fisher_outfile = open(args.o+"_fisher_allFC.txt","w")
total_transferable_FC = sum(list(cat_to_transferable[FC] for FC in cat_to_transferable.keys()))
pvalues = dict()
oddsratios = dict()
for FC in cat_to_transferable.keys():
transferable = cat_to_transferable[FC]
nontransferable = cat_to_total[FC] - transferable
cont_transferable = total_transferable_FC - transferable
cont_nontransferable = sum(list(cat_to_total[FC] for FC in cat_to_total.keys())) - total_transferable_FC - nontransferable
data = [[transferable,nontransferable],[cont_transferable,cont_nontransferable]]
df = pd.DataFrame(data)
oddsratio,pvalue = stats.fisher_exact(df)
pvalues[FC] = pvalue
oddsratios[FC] = oddsratio
BH_pvalues = BH(pvalues)
for FC in cats:
pvalue = pvalues[FC]
significant = BH_pvalues[FC]
oddsratio = oddsratios[FC]
if significant == "yes":
fisher_outfile.write(cat_to_description[FC]+" ("+FC+")\t"+str(pvalue)+"\t"+str(oddsratio)+"\t*\n")
else:
fisher_outfile.write(FC+"\t"+str(pvalue)+"\t"+str(oddsratio)+"\t \n")
fisher_outfile.close()
#For the meta categories we have the three dictionaries to use:
#meta_to_total
#meta_to_transfers
#meta_to_transferable
pvalues = dict()
oddsratios = dict()
fisher_outfile = open(args.o+"_fisher_allmeta.txt","w")
total_transferable_meta = sum(list(meta_to_transferable[meta] for meta in ["A","B","C"]))
for meta in ["A","B","C"]:
transferable = meta_to_transferable[meta]
nontransferable = meta_to_total[meta] - transferable
cont_transferable = total_transferable_meta - transferable #this is the total amount of transfers across all meta categories excluding the current meta cat
cont_nontransferable = sum(list(meta_to_total[meta] for meta in meta_to_total.keys())) - total_transferable_meta - nontransferable #this is the total amount of non-transfers across all meta categories excluding the current meta cat
#print(transferable,nontransferable,cont_transferable,cont_nontransferable)
data = [[transferable,nontransferable],[cont_transferable,cont_nontransferable]]
df = pd.DataFrame(data)
oddsratio, pvalue = stats.fisher_exact(df)
pvalues[meta] = pvalue
oddsratios[meta] = oddsratio
BH_pvalues = BH(pvalues)
for meta in ["A","B","C"]:
pvalue = pvalues[meta]
significant = BH_pvalues[meta]
oddsratio = oddsratios[meta]
if significant == "yes":
fisher_outfile.write(category_name[meta]+" ("+meta+")\t"+str(pvalue)+"\t"+str(oddsratio)+"\t*\n")
else:
fisher_outfile.write(meta+"\t"+str(pvalue)+"\t"+str(oddsratio)+"\t \n")
fisher_outfile.close()
''' Kruskal-Wallis-test on absolute transferability '''
kruskal_outfile = open(args.o+"_kruskal_results.txt","w")
#### for all FC
K = list()
for FC in FC2tottrans.keys():
K.append(FC2tottrans[FC])
kruskal_outfile.write(str(stats.kruskal(*K))+"\n")
kruskal_outfile.close()
''' Mann-Whitney test on meta categories '''
MW_outfile = open(args.o+"_mannwhitney_results.txt","w")
M = list()
for meta in meta2tottrans:
transf=meta2tottrans[meta]
transf=[float(transf[j]) for j in range(1,len(transf))]
M.append(transf)
AB = stats.mannwhitneyu(M[0],M[1])
AC = stats.mannwhitneyu(M[0],M[2])
BC = stats.mannwhitneyu(M[1],M[2])
MW_outfile.write(str(AB)+"\n")
MW_outfile.write(str(AC)+"\n")
MW_outfile.write(str(BC)+"\n")
MW_outfile.close()
hgt: 3.0
dup: 2.0
los: 1.0
spc: 0.0
import os
import subprocess
from multiprocessing import Pool
from loguru import logger
import glob
import shutil
import time
def run_ALE_on_NOG(NOG_ID, species_tree_filepath, gene_trees_filepath, bin_dir, n_samples, separator, max_retries):
"""Run ALE on a single NOG_ID
Args:
NOG_ID (str): NOG ID to run ALE on
species_tree_filepath (str): Path to the species tree file
gene_trees_filepath (str): Path to the gene trees file
bin_dir (str): Path to the ALE bin directory
n_samples (int): Number of reconciliation samples for ALEml_undated
separator (str): Separator for ALEml_undated
Returns:
NOG_ID (str): NOG ID that was run, if successful
If unsuccessful, returns None
"""
base_dir = os.getcwd() # this is the directory where the script is run from
genetree_filepath = os.path.join(base_dir, f"{NOG_ID}.tree")
# write this specific genetree to a file by grepping for the NOG_ID
# and then writing the second column to a file
try:
with open(genetree_filepath, 'w') as tree_file_object:
p1 = subprocess.Popen(
["grep", NOG_ID, gene_trees_filepath], stdout=subprocess.PIPE)
p2 = subprocess.Popen(
["awk", "{print $2}"], stdin=p1.stdout, stdout=tree_file_object)
p2.communicate()
if p1.poll() is None:
p1.communicate()
logger.info(f"Successfully wrote gene tree {
NOG_ID} to file {genetree_filepath}")
except subprocess.CalledProcessError as e:
logger.error(f"Failed to write gene tree to file with error {e}")
return None
# run ALE on the genetree
# first run ALEobserve to create the .ale file
n_sleep = 2 # sleep for n_sleep seconds before retrying
for i in range(max_retries + 1):
try:
subprocess.run(
[f"{bin_dir}ALEobserve", genetree_filepath], check=True)
break
except subprocess.CalledProcessError as e:
if i < max_retries: # i is zero indexed
logger.warning(f"ALEobserve failed with error {
e}, retrying ({i+1}/{max_retries})")
# sleep for n_sleep seconds before retrying
time.sleep(n_sleep)
continue
else:
logger.error(f"ALEobserve failed with error {
e}, no more retries left")
return None
# then run ALEml_undated with retry mechanism
for i in range(max_retries + 1):
try:
aleml_undated_command = [f"{bin_dir}ALEml_undated", species_tree_filepath, f"{
genetree_filepath}.ale", f'separators="{separator}"', f"sample={n_samples}"]
logger.info(f"Running ALEml_undated with command: {
aleml_undated_command}")
subprocess.run(aleml_undated_command, check=True)
return NOG_ID # If ALEml_undated succeeds, return the NOG_ID
except subprocess.CalledProcessError as e:
if i < max_retries: # i is zero indexed
logger.warning(f"ALEml_undated failed with error {
e}, retrying ({i+1}/{max_retries})")
time.sleep(n_sleep)
continue
else:
logger.error(f"ALEml_undated failed with error {
e}, no more retries left")
return None
return NOG_ID
def run_ALE_on_NOG_wrapper(args):
return run_ALE_on_NOG(*args)
if __name__ == '__main__':
# calculate total time taken
from datetime import timedelta
import time
start_time = time.time()
# log using start time
logger.info("Starting ALE run")
# write everything in terms of argparse instead of hardcoding
import argparse
parser = argparse.ArgumentParser(description="Run ALE on all gene trees")
parser.add_argument("--species", "-s", type=str, default="../../data/1236_wol_tree_pruned_no_internal_labels.nwk",
help="Path to species tree file (default: ../../data/1236_wol_tree_pruned_no_internal_labels.nwk)")
parser.add_argument("--gene", "-g", type=str, default="../../data/1236_pruned_gene_trees.tsv.rooted",
help="Path to gene trees file (default: ../../data/1236_pruned_gene_trees.tsv.rooted)")
parser.add_argument("--threads", "-t", type=int, default=50,
help="Number of threads to use for parallelization (default: 50)")
parser.add_argument("--bin", "-b", type=str,
default="/root/bin/ALE/bin/", help="Path to ALE bin directory (default: /root/bin/ALE/bin/)")
parser.add_argument("--samples", "-n", type=int, default=100,
help="Number of reconciliation samples for ALEml_undated (default: 100)")
parser.add_argument("--separator", "-sep", type=str, default=".",
help="Separator for ALEml_undated (default: '.')")
parser.add_argument("--debug", "-d", action="store_true",
help="Enable debug mode (run ALE on first 5 input trees only)")
parser.add_argument("--output-dir", "-o", type=str, default="./Results/",
help="Path to output directory (default: ./Results/)")
parser.add_argument("--retries", "-r", type=int, default=10,
help="Max number of retries for ALEml_undated and ALEobserve (default: 10)")
args = parser.parse_args()
# parse the arguments
species_tree_filepath = args.species
gene_trees_filepath = args.gene
max_threads = args.threads
bin_dir = args.bin
n_samples = args.samples
separator = args.separator
output_dir = args.output_dir
max_retries = args.retries
# log the arguments
logger.info(f"\nArguments passed: \
\nSpecies tree file: {species_tree_filepath} \
\nGene trees file: {gene_trees_filepath} \
\nNumber of threads: {max_threads} \
\nALE bin directory: {bin_dir} \
\nNumber of samples: {n_samples} \
\nSeparator: {separator} \
\nOutput directory: {output_dir} \
\nNumber of retries: {max_retries}")
# read and extract first column as list of NOG_IDs
with open(gene_trees_filepath, "r") as treeIDs:
NOG_ID_list = [line.rstrip().split("\t")[0]
for line in treeIDs.readlines()]
# if debug
if args.debug:
NOG_ID_list = NOG_ID_list[:5]
logger.warning(
"Debug mode enabled. Running ALE on first 5 gene trees only")
max_threads = 5
# log the number of NOGs to run ALE on, with time
logger.info(f"ALE will be run on {(len(NOG_ID_list))} NOGs, using {
str(max_threads)} threads")
with Pool(processes=max_threads) as pool: # run ALE on each NOG in parallel
for pool_out in pool.imap(run_ALE_on_NOG_wrapper, [(NOG_ID, species_tree_filepath, gene_trees_filepath, bin_dir, n_samples, separator, max_retries) for NOG_ID in NOG_ID_list]):
if pool_out is not None: # if ALE run was successful
# log time when you finished this NOG_ID
print(f"{time.strftime('%X %x %Z')}: Finished {pool_out}\n")
# move all files that end with `ale.uTs` to the outputs directory
# check if the outputs directory exists, if not create it. If creating new, raise warning
if os.path.exists(output_dir):
logger.warning(
"The outputs directory already exists. Overwriting files in the outputs directory")
# remove all files in the outputs directory
for file in glob.glob(f"./{output_dir}/*"):
os.remove(file)
else:
os.makedirs(output_dir)
# remove all `*.ale.uml_rec` files
# for file in glob.glob(f"./*.ale.uml_rec"):
# os.remove(file)
# or move them to the outputs directory
for file in glob.glob(f"./*.ale.uml_rec"):
shutil.move(file, output_dir)
# get the filename of the species tree
species_tree_filename = os.path.basename(species_tree_filepath)
for file in glob.glob(f"./*.ale.uTs"):
shutil.move(file, output_dir)
# remove all files that end with .ale
for file in glob.glob(f"./*.ale"):
os.remove(file)
# move all files that end with .tree to the trees directory
# create the directory if it doesn't exist
os.makedirs("./tmp_trees/", exist_ok=True)
for file in glob.glob(f"./*.tree"):
shutil.move(file, "./tmp_trees/")
# calculate total time taken
end_time = time.time()
total_time = end_time - start_time
# log the time taken in a readable format
logger.info(f"Total time taken: {timedelta(seconds=total_time)}")
# Run AnGST with multiprocessing. This script requires python2 to run AnGST.
import os
import subprocess
from multiprocessing import Pool
import time
import logging
import shutil
from datetime import timedelta
from ete3 import Tree
# Create or get the logger
logger = logging.getLogger(__name__)
# Configure logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s | %(levelname)s | %(message)s',
datefmt='%Y-%m-%d %H:%M:%S'
)
def run_ANGST_on_NOG(pool_args):
"""
This function is used to run AnGST via a multiprocessing pool.
For each worker process, the function is called with a different NOG_ID.
Args:
pool_args (tuple): Tuple containing the following elements:
nog_id (str): NOG ID to run AnGST on
species_tree (str): Path to species tree file
gene_tree_filepath (str): Path to gene tree file
output_dir (str): Path to output directory
penalty_file (str): Path to penalty file
bin_dir (str): Path to the AnGST installation directory (containing angst_lib/)
tmp_dir (str): Path to the temporary directory holding the per-NOG input files
Returns:
nog_id (str): NOG ID that was run, if successful
If unsuccessful, returns None
"""
nog_id, species_tree, gene_tree_filepath, output_dir, penalty_file, bin_dir, tmp_dir = pool_args
# create input file for AnGST in tmp folder
input_path = os.path.join(tmp_dir, "{}.input".format(nog_id))
angst_script = "{}/angst_lib/AnGST.py".format(bin_dir)
# output directory for AnGST results for this NOG
output_dir = os.path.join(output_dir, nog_id)
input_content = [
"species={}\n".format(species_tree),
"gene={}\n".format(gene_tree_filepath),
"output={}/\n".format(output_dir),
"penalties={}".format(penalty_file)
]
# write input file
with open(input_path, "w") as infile:
infile.writelines(input_content)
# Run AnGST
try:
logger.info("Running AnGST for NOG_ID {}".format(nog_id))
subprocess.check_call(["python2", angst_script, input_path])
except subprocess.CalledProcessError as e:
logger.error(
"AnGST failed for NOG_ID {} with error\n {}".format(nog_id, e))
return None
return nog_id
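# For reference, each generated {nog_id}.input file is a small key=value text file.
# With illustrative paths (the NOG ID and directories are placeholders), it looks like:
#   species=/abs/path/to/1236_wol_tree_pruned_angst.nwk
#   gene=/abs/path/to/tmp_<timestamp>/<NOG_ID>.nwk
#   output=/abs/path/to/Results/<NOG_ID>/
#   penalties=/abs/path/to/angst_penalty_file.txt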
def unroot(newick_string, outfile_path):
"""
This function takes a newick string and writes it to a file after unrooting it.
Args:
newick_string (str): Newick string to be written to file
outfile_path (str): Path to output file
Returns:
None
"""
# read in the newick string
t = Tree(newick_string, format=1)
# unroot the tree
t.unroot()
# write the tree to file
t.write(outfile=outfile_path, format=1)
logger.info("Unrooted gene tree and wrote it to {}".format(outfile_path))
return None
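# Minimal usage sketch for unroot() (the newick string and output path are illustrative):
#   unroot("((A:1,B:1):0.5,C:2);", "/tmp/example_unrooted.nwk")
# The basal bifurcation is collapsed, so the written tree is a trifurcation of the form
# (A:...,B:...,C:...); which is what this script feeds to AnGST as an unrooted gene tree.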
if __name__ == '__main__':
import argparse
# track time
start_time = time.time()
logger.info("Started running {} at {}".format(__file__, time.asctime()))
# use argparse for reading in the number of threads and the input trees file
parser = argparse.ArgumentParser(
description="Run AnGST on a set of gene trees")
parser.add_argument("--species", "-s", type=str, default="../../data/1236_wol_tree_pruned_angst.nwk",
help="Path to species tree file (default: ../../data/1236_wol_tree_pruned_angst.nwk)")
parser.add_argument("--gene", "-g", type=str, default="../../data/1236_pruned_gene_trees.tsv",
help="Path to gene trees file (default: ../../data/1236_pruned_gene_trees.tsv)")
parser.add_argument("--threads", "-t", type=int, default=50,
help="Number of threads to use for parallelization (default: 50)")
parser.add_argument("--output", "-o", type=str, default="./Results",
help="Output directory for AnGST results (default: ./Results)")
parser.add_argument("--penalties", "-p", type=str, default="../src/angst_penalty_file.txt",
help="Path to penalty file for AnGST (default: ../src/angst_penalty_file.txt)")
parser.add_argument("--bin", "-b", type=str, default="/root/bin/angst/",
help="Path to AnGST bin directory (default: /root/bin/angst/)")
# parse args
args = parser.parse_args()
species_tree = args.species
gene_trees = args.gene
max_threads = args.threads
output_dir = args.output
penalty_file = args.penalties
bin_dir = args.bin
# log arguments
logger.info("Running {} with arguments: {}".format(__file__, args))
# convert all the paths to absolute paths
species_tree = os.path.abspath(species_tree)
gene_trees = os.path.abspath(gene_trees)
output_dir = os.path.abspath(output_dir)
penalty_file = os.path.abspath(penalty_file)
bin_dir = os.path.abspath(bin_dir)
# create the output directory if it does not exist
if not os.path.exists(output_dir):
os.makedirs(output_dir)
logger.info("Created output directory {}".format(output_dir))
else:
logger.info("Output directory {} already exists".format(output_dir))
# delete the output dir and recreate it
shutil.rmtree(output_dir)
os.makedirs(output_dir)
# create tmp directory for gene trees and input files, with a timestamp, with abs path
timestamp = time.strftime("%Y%m%d%H%M%S")
tmp_dir = os.path.join(os.getcwd(), "tmp_{}".format(timestamp))
if not os.path.exists(tmp_dir):
os.makedirs(tmp_dir)
logger.info("Created temporary directory {}".format(tmp_dir))
# Read in the gene trees and create a list of eggNOG IDs
# additionally, write out the species tree and each gene tree to a file as first and second lines,
# with the ID of gene tree as the filename, in the tmp folder
if not os.path.exists(gene_trees):
logger.error("Gene trees file {} does not exist".format(gene_trees))
raise IOError("Gene trees file {} does not exist".format(gene_trees))
else:
logger.info("Reading gene trees from {}".format(gene_trees))
with open(gene_trees, "r") as gene_trees_fo:
gene_trees = gene_trees_fo.readlines()
nog_id_list = [tree.split("\t")[0] for tree in gene_trees]
# Debugging: run AnGST on last n_debug NOGs only
# n_debug=2; nog_id_list = nog_id_list[-n_debug:]; max_threads=n_debug; gene_trees=gene_trees[-n_debug:]
# unroot the gene trees and write them to file
for gene_tree in gene_trees:
nog_id, nog_newick = gene_tree.split("\t")
# first read in the tree using ete3 and then write it out after unrooting it
try:
unroot(nog_newick, "{}/{}.nwk".format(tmp_dir, nog_id))
except Exception as e:
logger.error(
"Error unrooting gene tree for NOG_ID {}\n{}".format(nog_id, e))
continue
# prepare list of tuples to be fed into each process for parallelization
pool_args = [(nog_id, species_tree, "{}/{}.nwk".format(tmp_dir, nog_id),
output_dir, penalty_file, bin_dir, tmp_dir) for nog_id in nog_id_list]
# create the pool. Note that this is python2, so we can't use `with Pool() as pool:`
pool = Pool(processes=max_threads)
# use the pool
for pool_out in pool.imap(run_ANGST_on_NOG, pool_args):
if pool_out is not None:
logger.info(
"AnGST run for {} completed successfully".format(pool_out))
# close and join the pool
pool.close()
pool.join()
# Optional: remove the temporary directory
# shutil.rmtree(tmp_dir); logger.info("Removed temporary directory {}".format(tmp_dir))
# track time
end_time = time.time()
elapsed_time = end_time - start_time
logger.info("Finished running {} at {}, took {}".format(
__file__, time.asctime(), str(timedelta(seconds=elapsed_time))))
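# Example invocation (the script filename is a placeholder; the flags are those defined above):
#   python2 run_angst_multiprocessing.py \
#       -s ../../data/1236_wol_tree_pruned_angst.nwk \
#       -g ../../data/1236_pruned_gene_trees.tsv \
#       -t 50 -o ./Results -p ../src/angst_penalty_file.txt -b /root/bin/angst/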
import subprocess
import os
from multiprocessing import Pool
from loguru import logger
import time
import shutil
from datetime import timedelta
def run_RANGERDTL_on_NOG(NOG_ID, tmp_dir, bin_dir, n_runs, ranger_bin):
"""
This function is used to run RangerDTL in a multithreaded procedure.
For each thread/worker process, the function is called with a different NOG_ID.
Args:
NOG_ID (str): NOG ID to run RangerDTL on
tmp_dir (str): path to temporary directory to store input files
bin_dir (str): path to Ranger bin directory
n_runs (int): number of runs to perform
ranger_bin (str): path of the Ranger-DTL executable to run, relative to bin_dir
Returns:
NOG_ID (str): NOG ID that was run, if successful
If unsuccessful, returns None
AggregateRanger summarises the n_runs reconciliations into "./Results/{NOG_ID}_aggregateoutput.txt".
The individual reconciliation outputs are written to "./Results/tmp_{NOG_ID}_results/",
while the per-NOG input files live in the temporary directory created by the main block.
The input file is deleted once the runs finish; removing the per-NOG results folder is optional.
"""
# Make a folder for the results of each tree
os.makedirs(f"./Results/tmp_{NOG_ID}_results", exist_ok=True)
# use the NOG_ID to define the filename of the input file
input_file = f"{tmp_dir}/{NOG_ID}.input"
# Run Ranger to perform n_runs reconciliations
for i in range(1, n_runs+1):
# Run RangerDTL. Output files need to have prefix "recon{index}", with index starting at 1.
try:
subprocess.run(f"{bin_dir}/{ranger_bin} -i {input_file} -o ./Results/tmp_{
NOG_ID}_results/recon{i} >> ./Results/tmp_{NOG_ID}_results/RANGERDTL.log", shell=True)
# subprocess.run(f"{bin_dir}/SupplementaryPrograms/Ranger-DTL-Fast.linux -i {input_file} -o ./Results/tmp_{
# NOG_ID}_results/recon{i} >> ./Results/tmp_{NOG_ID}_results/RANGERDTL.log", shell=True)
# subprocess.run(f"{bin_dir}/CorePrograms/Ranger-DTL.linux -i {input_file} -o ./Results/tmp_{
# NOG_ID}_results/recon{i} >> ./Results/tmp_{NOG_ID}_results/RANGERDTL.log", shell=True)
except subprocess.CalledProcessError as e:
logger.error(f"RangerDTL failed for NOG_ID {
NOG_ID} with error\n {e}")
return None
logger.info(f"Running RangerDTL finished for NOG_ID {NOG_ID}")
# Aggregate the results of the runs into one file.
try:
subprocess.run(f"{bin_dir}/CorePrograms/AggregateRanger.linux ./Results/tmp_{
NOG_ID}_results/recon > ./Results/{NOG_ID}_aggregateoutput.txt", shell=True)
logger.info(f"Aggregating results finished for NOG_ID {NOG_ID}")
except subprocess.CalledProcessError as e:
logger.error(f"Aggregating RangerDTL results failed for NOG_ID {
NOG_ID} with error\n {e}")
return None
# Optional: Delete the input file because it is not needed anymore
if os.path.exists(input_file):
os.remove(input_file)
logger.info(f"Deleted input file {input_file}")
# Optional: remove tmp_{NOG_ID}_results folder
# shutil.rmtree(f"./Results/tmp_{NOG_ID}_results")
return NOG_ID
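# For reference, the {NOG_ID}.input file handed to Ranger-DTL (written in the __main__ block
# below) contains exactly two newick lines: the rooted species tree followed by the rooted
# gene tree, e.g. (leaf names here are placeholders):
#   ((S1,S2)N1,S3)N2;
#   ((S1_gene1,S2_gene1),S3_gene1);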
def run_RANGERDTL_on_NOG_wrapper(args):
"""
Wrapper function for running RangerDTL on a single NOG_ID
Args:
args (tuple): Tuple containing (NOG_ID, tmp_dir, bin_dir, n_runs, ranger_bin),
matching the signature of run_RANGERDTL_on_NOG (Pool.imap passes a single argument, hence this wrapper)
Returns:
NOG_ID (str): NOG ID that was run, if successful
If unsuccessful, returns None
"""
return run_RANGERDTL_on_NOG(*args)
if __name__ == '__main__':
# log time
logger.info(f"Started running {__file__} at {time.asctime()}")
start_time = time.time()
# import argparse and read in the arguments
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("--speciestree", "-s", help="Path to the species tree (default: ../../data/1236_wol_tree_pruned_with_internal_labels.nwk)",
type=str, default="../../data/1236_wol_tree_pruned_with_internal_labels.nwk")
parser.add_argument("--genetrees", "-g", help="Path to the gene trees (default: ../../data/1236_pruned_gene_trees.tsv.rooted.underscored)", type=str,
default="../../data/1236_pruned_gene_trees.tsv.rooted.underscored")
parser.add_argument("--threads", "-t",
help="Number of threads to use (default: 50)", type=int, default=50)
parser.add_argument("--bin", "-b", help="Path to the Ranger bin directory (default: /root/bin/RANGERDTL_LINUX/)",
type=str, default="/root/bin/RANGERDTL_LINUX/")
parser.add_argument(
"--runs", "-r", help="Number of runs to perform (default: 100)", type=int, default=100)
parser.add_argument(
"--fast", "-f", help="Use RANGER-DTL-Fast instead of RANGER-DTL (default: False)", default=False, action="store_true")
args = parser.parse_args()
# log arguments
logger.info(f"Running {__file__} with arguments: {args}")
# Read in the arguments
species_tree_filepath = args.speciestree
gene_trees_filepath = args.genetrees
max_threads = args.threads
bin_dir = args.bin
n_runs = args.runs
if args.fast:
ranger_bin = "SupplementaryPrograms/Ranger-DTL-Fast.linux"
else:
ranger_bin = "CorePrograms/Ranger-DTL.linux"
# create temporary dir for the input files to RANGER, using tempfile with date-time stamp
timestamp = time.strftime("%Y%m%d%H%M%S")
tmp_dir = f"./tmp_ranger_{timestamp}"
os.makedirs(tmp_dir, exist_ok=True)
# Read in the newick string of the species tree
if not os.path.exists(species_tree_filepath):
logger.error(f"Species tree file {
species_tree_filepath} does not exist")
raise FileNotFoundError(f"Species tree file {
species_tree_filepath} does not exist")
else:
logger.info(f"Reading species tree from {species_tree_filepath}")
with open(species_tree_filepath, "r") as species_tree:
species_tree_newick = species_tree.read()
# make sure the newick string ends with a newline character. If not, add it.
if not species_tree_newick.endswith("\n"):
species_tree_newick += "\n"
# Read in the gene trees and create a list of eggNOG IDs
# additionally, write out the species tree and each gene tree to a file as first and second lines,
# with the ID of gene tree as the filename, in the tmp folder
if not os.path.exists(gene_trees_filepath):
logger.error(f"Gene trees file {gene_trees_filepath} does not exist")
raise FileNotFoundError(
f"Gene trees file {gene_trees_filepath} does not exist")
else:
logger.info(f"Reading gene trees from {gene_trees_filepath}")
with open(gene_trees_filepath, "r") as gene_trees_fo:
gene_trees = gene_trees_fo.readlines()
NOG_ID_list = [tree.split("\t")[0] for tree in gene_trees]
for gene_tree in gene_trees:
nog_id, nog_newick = gene_tree.split("\t")
# add newline character to newick string if it doesn't end with it
if not nog_newick.endswith("\n"):
nog_newick += "\n"
with open(f"{tmp_dir}/{nog_id}.input", "w") as gene_tree_fo:
gene_tree_fo.write(species_tree_newick)
gene_tree_fo.write(nog_newick)
# Debugging: Run RangerDTL on last 10 NOGs only
# NOG_ID_list = NOG_ID_list[-2:]; max_threads=2;
logger.info(f"Running RangerDTL with {max_threads} threads on {
len(NOG_ID_list)} NOGs, performing {n_runs} runs for each NOG.")
with Pool(processes=max_threads) as pool:
# iterate through what has been created via pool.imap()
for pool_out in pool.imap(run_RANGERDTL_on_NOG_wrapper, [(NOG_ID, tmp_dir, bin_dir, n_runs, ranger_bin) for NOG_ID in NOG_ID_list]):
if pool_out is not None:
logger.info(f"RangerDTL finished for {pool_out}")
# log time taken
end_time = time.time()
elapsed_time = end_time - start_time
logger.info(f"Finished running {__file__} at {time.asctime()}, took {
str(timedelta(seconds=elapsed_time))}")
../data/compiled_results