Commit 1633b33a authored by Swastik Mishra's avatar Swastik Mishra
Initial commit

arc/
data/
scrap/
.vscode
.txt
# Environmental transitions and HGT rates
Code to reproduce the results of the paper, "Horizontal gene transfer rates decrease during transitions to new ecosystems".
Jupyter notebooks in the `notebooks` directory were run in the order of their numbering. Scripts and other helper functions that they depend on, or that are mentioned in the notebooks, can be found in the `code` directory.
Please note that using 'Run all' or its equivalent in the Jupyter notebooks will generally not work. Some of the intervening steps in the notebooks are markdown cells with instructions on how to run programs separately via the shell. These programs are time-consuming and often use multiprocessing on an HPC. Please run the notebooks cell by cell, keeping this in mind. Generally speaking, if you have run the previous cell, you can run the current cell if it is a Python cell; if it is a markdown cell instructing you to do something, do that before proceeding.
This study downloads and processes a number of large files, which are stored in the `data` directory (empty in this repo). If you follow and run the notebooks, you will progressively fill the `data` directory and reproduce all the results as well as the figures in the paper. Alternatively, you can download a version of this repo with the results and figures already present in the `data` directory, from the Zenodo link in the paper.
## Required python packages
A Mamba/Conda environment called `hgt_analyses` was used for all the analyses. This environment, with all required packages, can be recreated from the `mamba_packages.yml` file using the following command:
```
mamba env create -f mamba_packages.yml
```
I use Mamba since it is faster than regular Conda, but if you use Conda just replace `mamba` with `conda` in the above command.
The only other package you need to install is `xlsx2csv`, using `pip`:
```
pip install xlsx2csv
```
#!/usr/bin/env python3
import os
import time
import argparse
import getpass
import requests
from loguru import logger
import sys
from urllib.parse import urlencode
from requests.exceptions import RequestException
import json
def get_command_line_arguments():
cl_parser = argparse.ArgumentParser(
description="Download genome assemblies from Enterobase given a TSV export listing the assemblies.\n \
Obtain this TSV via the web GUI, by filtering and using 'Data > Save to Local File'.\n \
The file must contain the 'Assembly barcode' column and the \n \
'Data Source(Accession No.;Sequencing Platform;Sequencing Library;Insert Size;Experiment;Status)' column.")
cl_parser.add_argument("-i", "--input", help="Path to the TSV file exported from Enterobase, listing the assemblies to download.", required=True,
dest="barcodes", type=str)
cl_parser.add_argument("-o", "--output", help="Specify output path to save assemblies", default=os.getcwd() +
os.sep+"Enterobase_Assemblies"+time.strftime('%Y%m%d_%H%M'), dest="output", type=str)
cl_parser.add_argument("--debug", help="Run in debug mode (first 5 assembly codes only).",
action='store_true', required=False, dest="debug")
cl_parser.add_argument(
"-d", "--database", help="Enterobase database: (senterica, ecoli, yersinia, mcatarrhalis)", dest="db", type=str, required=True)
cl_parser.add_argument("--retry", type=int, default=5,
help="Number of retries for failed requests.")
cl_parser.add_argument("--sleep", type=int, default=5,
help="Sleep time between requests in seconds.")
return cl_parser.parse_args()
def check_user_input(cl_args):
if not os.path.isfile(cl_args.barcodes):
logger.error('Could not find the input TSV file.')
sys.exit()
cl_args.output = cl_args.output + os.sep
os.makedirs(os.path.dirname(cl_args.output), exist_ok=True)
if cl_args.db not in ['senterica', 'ecoli', 'yersinia', 'mcatarrhalis']:
logger.error(
'Invalid database. Please choose from senterica, ecoli, yersinia, mcatarrhalis')
sys.exit()
if cl_args.debug:
logger.warning(
'Running in debug mode. Only the first 5 assembly codes will be downloaded.')
def get_login_details():
ENTEROBASE_USERNAME = input("Please enter Enterobase username: ")
ENTEROBASE_PASSWORD = getpass.getpass("Please enter Enterobase password: ")
return ENTEROBASE_USERNAME, ENTEROBASE_PASSWORD
def get_api_token(ENTEROBASE_USERNAME, ENTEROBASE_PASSWORD):
ENTEROBASE_SERVER = 'https://enterobase.warwick.ac.uk'
params = urlencode({"username": ENTEROBASE_USERNAME,
"password": ENTEROBASE_PASSWORD})
apiaddress = f'{ENTEROBASE_SERVER}/api/v2.0/login?{params}'
logger.info('Retrieving API token.')
response = requests.get(apiaddress)
if response.text: # Check if the response is not empty
try:
data = response.json()
API_TOKEN = data['api_token']
logger.info("Retrieved API token.")
return API_TOKEN
except json.JSONDecodeError:
logger.error("Failed to decode JSON from the response.")
return None
else:
logger.error(
"Empty response received while trying to retrieve API token.")
return None
def create_request(request_str, API_TOKEN, retries, sleep):
for _ in range(retries):
try:
response = requests.get(request_str, headers={
"Authorization": f"Basic {API_TOKEN}"})
if response.status_code == 200:
return response
else:
logger.error(f"Request to {request_str} failed with status code {
response.status_code}. Retrying...")
except RequestException as e:
logger.error(f"Request to {request_str} failed with exception {
e}. Retrying...")
time.sleep(sleep)
return None
def search(header_line, search_term):
try:
return header_line.split('\t').index(search_term)
except ValueError:
logger.error(f'{search_term} could not be found in header')
sys.exit(1)
def get_assembly_codes_and_dict(cl_args):
assembly_codes = []
assembly_dict = {}
with open(cl_args.barcodes, 'r') as fin:
header = fin.readline()
indexer_barcode = search(header, 'Assembly barcode')
indexer_accession = search(
header, 'Data Source(Accession No.;Sequencing Platform;Sequencing Library;Insert Size;Experiment;Status)')
for line in fin:
assembly_codes.append(line.split('\t')[indexer_barcode].strip())
assembly_dict[line.split('\t')[indexer_barcode].strip()] = line.split('\t')[
indexer_accession].split(';')[0]
return assembly_codes, assembly_dict
def main():
start_time = time.time()
# Get command line arguments
cl_args = get_command_line_arguments()
# Check user input for errors, and set up output directory
check_user_input(cl_args)
# Set up logging
logger.add(f"{cl_args.output}download_enterobase_assemblies.log",
format="{time} {level} {message}", level="INFO")
# Get login details from user. Use getpass to hide password input. Get API token
ENTEROBASE_USERNAME, ENTEROBASE_PASSWORD = get_login_details()
# Get API token from Enterobase using login details
API_TOKEN = get_api_token(ENTEROBASE_USERNAME, ENTEROBASE_PASSWORD)
# Get assembly codes and dictionary from input file
assembly_codes, assembly_dict = get_assembly_codes_and_dict(
cl_args) # Get assembly codes and dictionary
count = 0 # Counter for progress
fasta_error_count = 0
assembly_code_error_count = 0
for querycode in assembly_codes:
time.sleep(cl_args.sleep)
params = urlencode({"barcode": querycode, "limit": 50})
address = f'http://enterobase.warwick.ac.uk/api/v2.0/{cl_args.db}/assemblies?{params}'
response = create_request(
address, API_TOKEN, cl_args.retry, cl_args.sleep)
# If response is not None, get the fasta download link from the response
if response:
data = response.json()
fasta_url = data['Assemblies'][0]['download_fasta_link']
fasta_response = create_request(
fasta_url, API_TOKEN, cl_args.retry, cl_args.sleep)
if fasta_response:
# Fasta filename is the assembly code or the accession number if it exists
with open(f"{cl_args.output}{assembly_dict.get(querycode, querycode)}.fasta", 'w') as fasta_out:
fasta_out.write(fasta_response.text)
logger.info(f"Successfully downloaded {querycode}")
else:
logger.error(f"Failed to download {querycode} from {
fasta_url} after retries.")
fasta_error_count += 1
# If response is None, log the failed request
else:
logger.error(f"Failed request for {querycode} from {
address} after retries.")
assembly_code_error_count += 1
logger.info(f'Progress: {count+1} out of {len(assembly_codes)}')
count += 1
if cl_args.debug and count == 5:
break
# Final log of completion
logger.info(f"Downloaded {count} assemblies. {fasta_error_count} assemblies failed to download. {
assembly_code_error_count} assemblies failed to retrieve.")
logger.info(f"Output directory: {cl_args.output}")
# log total time taken in human readable format
logger.info(f"Total time taken: {time.strftime(
'%H:%M:%S', time.gmtime(time.time()-start_time))}")
if __name__ == "__main__":
main()
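The script above expects the TSV exported from the Enterobase web GUI as its `-i` input and looks up two specific columns in the header. Below is a minimal sketch for checking an export before starting a long download run; the filename `enterobase_export.tsv` is only a placeholder.

```python
# Minimal sketch (placeholder filename) to confirm an Enterobase TSV export
# contains the two columns the download script looks up in the header.
import csv

REQUIRED_COLUMNS = {
    "Assembly barcode",
    "Data Source(Accession No.;Sequencing Platform;Sequencing Library;Insert Size;Experiment;Status)",
}

with open("enterobase_export.tsv", newline="") as fh:
    header = next(csv.reader(fh, delimiter="\t"))

missing = REQUIRED_COLUMNS - set(header)
print("All required columns present." if not missing else f"Missing columns: {missing}")
```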
# run IQtree on the MSA files output by MAFFT (via OrthoFinder, in previous step)
# we run IQtree in parallel on all the MSA files
import os
import multiprocessing
import subprocess
from loguru import logger
import random
import sys
def run_iqtree(
msa_filepath,
iqtree_bin,
output_trees_dir,
substitution_model,
threads_max,
threads_per_process=1,
):
output_tree_prefix = f'{output_trees_dir}/{os.path.basename(msa_filepath).replace(".fa", "")}'
# run iqtree; the thread count of each IQtree process is set here, while the
# total parallelism is controlled by the multiprocessing pool size below
cmd = f"{iqtree_bin} -s {msa_filepath} -m {substitution_model} -pre {output_tree_prefix} -quiet -T {threads_per_process}"
logger.info(
f"Running IQtree on {msa_filepath}, writing output with prefix {output_tree_prefix}."
)
try:
subprocess.run(cmd, shell=True, check=True)
except subprocess.CalledProcessError as e:
return f"Error running IQtree on {msa_filepath}: {e}"
logger.info(f"Finished running IQtree on {msa_filepath}")
return None
def run_iqtree_wrapper(args):
"""
Wrapper function to run IQtree with multiple arguments, for the multiprocessing.Pool.imap_unordered function.
"""
return run_iqtree(*args)
def get_seq_length(msa_filepath):
"""
Function to get the number of characters (including gaps) in the first sequence of the MSA file.
"""
with open(msa_filepath, "r") as f:
# split the file by ">" and take the second element, which is the first record
first_record = f.read().split(">")[1]
# drop the header line and join the sequence lines
return len("".join(first_record.splitlines()[1:]))
if __name__ == "__main__":
import time
import argparse
start_time = time.time()
parser = argparse.ArgumentParser(
description="Run IQtree on all MSA files in a directory. For large MSA files, run with multiple threads per process"
)
parser.add_argument(
"-m",
"--msa_dir",
type=str,
required=True,
help="Directory containing MSA files",
)
parser.add_argument(
"-o",
"--output_dir",
type=str,
default="./gene_trees/",
help="Output directory for gene trees",
)
parser.add_argument(
"-T",
"--threads-max",
type=int,
default=100,
help="Maximum number of threads to use",
)
parser.add_argument(
"-t",
"--threads-per-process",
type=int,
default=5,
help="Number of threads per process to use, for large MSA files",
)
parser.add_argument(
"-b",
"--iqtree_bin",
type=str,
default="/root/bin/iqtree-2.2.2.6-Linux/bin/iqtree2",
help="Path to IQtree binary",
)
parser.add_argument(
"-d",
"--debug",
action="store_true",
help="Enable debug mode (run on first 5 MSA files only)",
)
parser.add_argument(
"-s",
"--substitution_model",
type=str,
default="JTT",
help="Substitution model for IQtree",
)
parser.add_argument(
"--filesize_cutoff",
type=int,
default=150,
help="Cutoff for number of sequences in MSA file to determine if it is a large or small file.\n \
Files with more than this cutoff will be run with multiple threads per process.",
)
parser.add_argument(
"--min_seqs",
type=int,
default=10,
help="Minimum number of sequences in MSA file to run IQtree. Files with fewer sequences will be skipped.",
)
parser.add_argument(
"--random",
type=int,
nargs="?",
const=1000,
default=None,
help="If given, pick this many MSA at random. \
If no number is provided, but argument is provided, default to 1000. \
If this argument is not given, take all MSA above min_seqs in size.",
)
parser.add_argument(
"--min_len",
type=int,
default=150,
help="Minimum length of longest sequence of MSA file (including gaps). Files with smaller genes will be skipped.",
)
args = parser.parse_args()
# remove the output directory if it already exists
if os.path.exists(args.output_dir):
os.system(f"rm -r {args.output_dir}")
logger.warning(f"Removed existing output directory: {args.output_dir}")
# create the output directory, then set up logging inside it
os.makedirs(args.output_dir, exist_ok=True)
logger.add(f'{args.output_dir}/iqtree_run_{time.strftime("%Y%m%d_%H%M%S")}.log')
# check if IQtree binary exists and MSA directory exists
if not os.path.exists(args.iqtree_bin):
raise FileNotFoundError(f"IQtree binary not found: {args.iqtree_bin}")
if not os.path.exists(args.msa_dir):
raise FileNotFoundError(f"MSA directory not found: {args.msa_dir}")
# get all MSA files (in FASTA format, ending with ".fa") in the directory specified by `args.msa_dir`
msa_filepaths = [f for f in os.listdir(args.msa_dir) if f.endswith(".fa")]
msa_filepaths = [f"{args.msa_dir}{f}" for f in msa_filepaths]
# first get the sizes (number of characters in the second line), of all the MSA files
msa_lengths = [(f, get_seq_length(f)) for f in msa_filepaths]
# remove all MSA files that have fewer than min_len characters in the first sequence
short_seqs_to_remove = []
for msa_filepath, length in msa_lengths:
if length < args.min_len:
short_seqs_to_remove.append(msa_filepath)
msa_filepaths = [f for f in msa_filepaths if f not in short_seqs_to_remove]
logger.info(
f"Number of MSA files with at least {args.min_len} characters in the first sequence: {len(msa_filepaths)}"
)
# write this list of MSA files and their first-sequence lengths to a file in the output directory
with open(f"{args.output_dir}/msa_files_sequence_counts.txt", "w") as f:
for msa_filepath, seq_length in msa_lengths:
f.write(f"{seq_length}\t{msa_filepath}\n")
# for all the MSA files, find the number of sequences they have by opening and reading the number of lines starting with ">".
# Retain only MSA files that have more than `args.min_seqs` sequences
small_files_to_remove = []
msa_filepath_sizes = []
for msa_filepath in msa_filepaths:
with open(msa_filepath, "r") as f:
num_seqs = sum(1 for l in f if l.startswith(">"))
if num_seqs < args.min_seqs:
small_files_to_remove.append(msa_filepath)
else:
msa_filepath_sizes.append((msa_filepath, num_seqs))
msa_filepaths = [f for f in msa_filepaths if f not in small_files_to_remove]
logger.info(
f"Number of MSA files with at least {args.min_seqs} sequences: {len(msa_filepaths)}"
)
if args.random is not None:
msa_filepath_sizes = random.sample(
msa_filepath_sizes, min(args.random, len(msa_filepath_sizes))
)
logger.info(f"Picked {len(msa_filepath_sizes)} MSA files at random")
# run IQtree on all MSA files
if args.debug:
msa_filepath_sizes = msa_filepath_sizes[:5]
# we use a hybrid approach, wherein large MSA files get multiple threads per process
# and small MSA files get a single thread per process;
# large MSA files are those with more than `args.filesize_cutoff` sequences
small_msa_filepaths = [
f for f in msa_filepath_sizes if f[1] <= args.filesize_cutoff
]
large_msa_filepaths = [f for f in msa_filepath_sizes if f[1] > args.filesize_cutoff]
logger.info(
f"Number of MSA files with <= {args.filesize_cutoff} sequences: {len(small_msa_filepaths)}, and > {args.filesize_cutoff} sequences: {len(large_msa_filepaths)}"
)
# sys.exit(0)
if small_msa_filepaths:
logger.info(
f"Running IQtree on {len(small_msa_filepaths)} MSA files, each with <= {args.filesize_cutoff} sequences"
)
# run small MSA files first
with multiprocessing.Pool(processes=args.threads_max) as pool:
for pool_return in pool.imap_unordered(
run_iqtree_wrapper,
[
(
msa_filepath,
args.iqtree_bin,
args.output_dir,
args.substitution_model,
args.threads_max,
1,
)
for msa_filepath, _ in small_msa_filepaths
],
):
if pool_return is not None:
logger.error(f"Error running IQtree: {pool_return}")
if large_msa_filepaths:
logger.info(
f"Running IQtree on {len(large_msa_filepaths)} MSA files, each with > {args.filesize_cutoff} sequences"
)
# run large MSA files next
with multiprocessing.Pool(
processes=args.threads_max // args.threads_per_process
) as pool:
for pool_return in pool.imap_unordered(
run_iqtree_wrapper,
[
(
msa_filepath,
args.iqtree_bin,
args.output_dir,
args.substitution_model,
args.threads_max,
args.threads_per_process,
)
for msa_filepath, _ in large_msa_filepaths
],
):
if pool_return is not None:
logger.error(f"Error running IQtree: {pool_return}")
# log time taken
time_taken = time.time() - start_time
logger.info(
f'Time taken to finish: {time.strftime("%H:%M:%S", time.gmtime(time_taken))}'
)
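The downstream reconciliation and rooting scripts in this repository take the gene trees as a two-column TSV (tree ID, Newick string), whereas IQ-TREE writes one `.treefile` per MSA under the output prefix used above. Below is a minimal sketch of collecting those outputs into such a TSV; the paths are assumptions, not taken from the notebooks.

```python
# Minimal sketch (assumed paths) for collecting per-gene IQ-TREE outputs
# (<prefix>.treefile) into a single two-column TSV: treeID<TAB>newick.
import glob
import os

trees_dir = "./gene_trees"           # the --output_dir used for the IQ-TREE runs (assumed)
out_tsv = "./gene_trees_newick.tsv"  # assumed output filename

with open(out_tsv, "w") as out:
    for treefile in sorted(glob.glob(os.path.join(trees_dir, "*.treefile"))):
        tree_id = os.path.basename(treefile).replace(".treefile", "")
        with open(treefile) as fh:
            newick = fh.read().strip()
        out.write(f"{tree_id}\t{newick}\n")
```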
import os
import subprocess
from multiprocessing import Pool
from loguru import logger
import glob
import shutil
import time
def run_ALE_on_NOG(NOG_ID, species_tree_filepath, gene_trees_filepath, bin_dir, n_samples, separator, max_retries):
"""Run ALE on a single NOG_ID
Args:
NOG_ID (str): NOG ID to run ALE on
species_tree_filepath (str): Path to the species tree file
gene_trees_filepath (str): Path to the gene trees file
bin_dir (str): Path to the ALE bin directory
n_samples (int): Number of reconciliation samples for ALEml_undated
separator (str): Separator for ALEml_undated
max_retries (int): Maximum number of retries for ALEobserve and ALEml_undated
Returns:
NOG_ID (str): NOG ID that was run, if successful
If unsuccessful, returns None
"""
base_dir = os.getcwd() # this is the directory where the script is run from
genetree_filepath = os.path.join(base_dir, f"{NOG_ID}.tree")
# write this specific genetree to a file by grepping for the NOG_ID
# and then writing the second column to a file
try:
with open(genetree_filepath, 'w') as tree_file_object:
p1 = subprocess.Popen(
["grep", NOG_ID, gene_trees_filepath], stdout=subprocess.PIPE)
p2 = subprocess.Popen(
["awk", "{print $2}"], stdin=p1.stdout, stdout=tree_file_object)
p2.communicate()
if p1.poll() is None:
p1.communicate()
logger.info(f"Successfully wrote gene tree {NOG_ID} to file {genetree_filepath}")
except subprocess.CalledProcessError as e:
logger.error(f"Failed to write gene tree to file with error {e}")
return None
# run ALE on the genetree
# first run ALEobserve to create the .ale file
n_sleep = 2 # sleep for n_sleep seconds before retrying
for i in range(max_retries + 1):
try:
subprocess.run(
[f"{bin_dir}ALEobserve", genetree_filepath], check=True)
break
except subprocess.CalledProcessError as e:
if i < max_retries: # i is zero indexed
logger.warning(f"ALEobserve failed with error {
e}, retrying ({i+1}/{max_retries})")
# sleep for n_sleep seconds before retrying
time.sleep(n_sleep)
continue
else:
logger.error(f"ALEobserve failed with error {
e}, no more retries left")
return None
# then run ALEml_undated with retry mechanism
for i in range(max_retries + 1):
try:
aleml_undated_command = [f"{bin_dir}ALEml_undated", species_tree_filepath, f"{genetree_filepath}.ale", f'separators="{separator}"', f"sample={n_samples}"]
logger.info(f"Running ALEml_undated with command: {aleml_undated_command}")
subprocess.run(aleml_undated_command, check=True)
return NOG_ID # If ALEml_undated succeeds, return the NOG_ID
except subprocess.CalledProcessError as e:
if i < max_retries: # i is zero indexed
logger.warning(f"ALEml_undated failed with error {
e}, retrying ({i+1}/{max_retries})")
time.sleep(n_sleep)
continue
else:
logger.error(f"ALEml_undated failed with error {
e}, no more retries left")
return None
return NOG_ID
def run_ALE_on_NOG_wrapper(args):
return run_ALE_on_NOG(*args)
if __name__ == '__main__':
# calculate total time taken
from datetime import timedelta
import time
start_time = time.time()
# log using start time
logger.info("Starting ALE run")
# write everything in terms of argparse instead of hardcoding
import argparse
parser = argparse.ArgumentParser(description="Run ALE on all gene trees")
parser.add_argument("--species", "-s", type=str, default="../../01-prepare_data/wol_tree_pruned_1236_no_internal_labels.nwk",
help="Path to species tree file")
parser.add_argument("--gene", "-g", type=str, default="../../01-prepare_data/1236_subset_pruned_trees.tsv.rooted",
help="Path to gene trees file")
parser.add_argument("--threads", "-t", type=int, default=50,
help="Number of threads to use for parallelization")
parser.add_argument("--bin", "-b", type=str,
default="/root/bin/ALE/bin/", help="Path to ALE bin directory")
parser.add_argument("--samples", "-n", type=int, default=100,
help="Number of reconciliation samples for ALEml_undated")
parser.add_argument("--separator", "-sep", type=str, default=".",
help="Separator for ALEml_undated")
parser.add_argument("--debug", "-d", action="store_true",
help="Enable debug mode (run ALE on first 5 input trees only)")
parser.add_argument("--output-dir", "-o", type=str, default="./RESULTS/",
help="Path to output directory")
parser.add_argument("--retries", "-r", type=int, default=10,
help="Max number of retries for ALEml_undated and ALEobserve")
args = parser.parse_args()
# parse the arguments
species_tree_filepath = args.species
gene_trees_filepath = args.gene
max_threads = args.threads
bin_dir = args.bin
n_samples = args.samples
separator = args.separator
output_dir = args.output_dir
max_retries = args.retries
# log the arguments
logger.info(f"\nArguments passed: \
\nSpecies tree file: {species_tree_filepath} \
\nGene trees file: {gene_trees_filepath} \
\nNumber of threads: {max_threads} \
\nALE bin directory: {bin_dir} \
\nNumber of samples: {n_samples} \
\nSeparator: {separator} \
\nOutput directory: {output_dir} \
\nNumber of retries: {max_retries}")
# read and extract first column as list of NOG_IDs
with open(gene_trees_filepath, "r") as treeIDs:
NOG_ID_list = [line.rstrip().split("\t")[0]
for line in treeIDs.readlines()]
# if debug
if args.debug:
NOG_ID_list = NOG_ID_list[:5]
logger.warning(
"Debug mode enabled. Running ALE on first 5 gene trees only")
max_threads = 5
# log the number of NOGs to run ALE on, with time
logger.info(f"ALE will be run on {(len(NOG_ID_list))} NOGs, using {
str(max_threads)} threads")
with Pool(processes=max_threads) as pool: # run ALE on each NOG in parallel
for pool_out in pool.imap(run_ALE_on_NOG_wrapper, [(NOG_ID, species_tree_filepath, gene_trees_filepath, bin_dir, n_samples, separator, max_retries) for NOG_ID in NOG_ID_list]):
if pool_out is not None: # if ALE run was successful
# log time when you finished this NOG_ID
print(f"{time.strftime('%X %x %Z')}: Finished {pool_out}\n")
# move the ALE output files to the outputs directory
# check if the outputs directory exists; if it does, warn that its contents will be overwritten, otherwise create it
if os.path.exists(output_dir):
logger.warning(
"The outputs directory already exists. Overwriting files in the outputs directory")
# remove all files in the outputs directory
for file in glob.glob(f"./{output_dir}/*"):
os.remove(file)
else:
os.makedirs(output_dir)
# remove all `*.ale.uml_rec` files
# for file in glob.glob(f"./*.ale.uml_rec"):
# os.remove(file)
# or move them to the outputs directory
for file in glob.glob(f"./*.ale.uml_rec"):
shutil.move(file, output_dir)
# get the filename of the species tree
species_tree_filename = os.path.basename(species_tree_filepath)
for file in glob.glob(f"./*.ale.uTs"):
shutil.move(file, output_dir)
# remove all files that end with .ale
for file in glob.glob(f"./*.ale"):
os.remove(file)
# move all files that end with .tree to the trees directory
# create the directory if it doesn't exist
os.makedirs("./tmp_trees/", exist_ok=True)
for file in glob.glob(f"./*.tree"):
shutil.move(file, "./tmp_trees/")
# calculate total time taken
end_time = time.time()
total_time = end_time - start_time
# log the time taken in a readable format
logger.info(f"Total time taken: {timedelta(seconds=total_time)}")
#!/usr/bin/env python3
from subprocess import CalledProcessError, Popen
import os
import time
from datetime import timedelta, datetime
import argparse
from multiprocessing import Pool
from loguru import logger
def run_command(params):
"""
Inputs:
params: tuple of (command_as_list, log_file_path)
Function:
Runs the command, and writes its stdout to the log file at log_file_path
"""
cmd, log_file_path = params
with open(log_file_path, "w") as log_FO:
process = Popen(cmd, stdout=log_FO, universal_newlines=True)
if process.wait() != 0:
raise CalledProcessError(process.returncode, cmd)
def MAD_roots_NOG_trees_parallel(
input_file_path, mad_executable, n_lines, num_subprocesses
):
"""
Inputs:
input_file_path: TSV format input file. Columns are: treeID(str) and tree(str)
mad_executable: path to MAD executable
n_lines: number of lines of the input file to put in each split file.
num_subprocesses: maximum number of MAD processes to run in parallel.
Outputs:
Writes out a file with the same name as the input file, but with a `.rooted` at the end of the filename.
Additional info:
This function splits the input file into multiple files, and runs MAD on each of these split files in parallel.
The number of processes run in parallel is at most `num_subprocesses`; each split file gets its own MAD process.
The number of lines in each split file is equal to the `n_lines` argument.
The output file will contain the same number of lines as the input file, but the second column will contain the rooted trees.
"""
# Check if the input file exists
if not os.path.isfile(input_file_path):
raise FileNotFoundError(
f"The input file {
input_file_path} does not seem to exist. Check the filepath provided."
)
# Find the full path of the input file and extract the dir name
input_file_path = os.path.abspath(input_file_path)
input_dir = os.path.dirname(input_file_path)
# Get the current date and time
now = datetime.now()
# Format the current date and time as a string
timestamp = now.strftime("%Y%m%d_%H%M%S")
# Create a temporary directory path with the timestamp
tmp_dir = f"{input_dir}/tmp_{timestamp}"
# Create the temporary directory
os.mkdir(tmp_dir)
# log this with time, along with location of the tmp folder
logger.info(f"Created the tmp folder at location: {tmp_dir}")
# Open the input file and read it in chunks of `n_lines` lines
# track the filenames of split files containing the second column (tree)
nwk_split_files = []
# track the filenames of split files containing the first column (treeID)
info_split_files = []
def write_split_files(chunk, chunk_index, nwk_split_files, info_split_files):
"""
Helper function to write out the split files for a given chunk.
"""
split_chunk = [line.split() for line in chunk]
nwk_split_filepath = f"{tmp_dir}/tmp_tree_split.nwk.{chunk_index}"
info_split_filepath = f"{tmp_dir}/tmp_tree_split.info.{chunk_index}"
with open(nwk_split_filepath, "w") as split_out_fo:
split_out_fo.write("\n".join(line[1] for line in split_chunk) + "\n")
with open(info_split_filepath, "w") as split_out_fo:
split_out_fo.write("\n".join(line[0] for line in split_chunk) + "\n")
nwk_split_files.append(nwk_split_filepath)
info_split_files.append(info_split_filepath)
return nwk_split_files, info_split_files
# open the input file
with open(input_file_path, "r") as treesFO:
chunk = []
for i, line in enumerate(treesFO):
chunk.append(line)
if (i + 1) % n_lines == 0:
nwk_split_files, info_split_files = write_split_files(
chunk, (i // n_lines) + 1, nwk_split_files, info_split_files
)
chunk = []
# Write the last chunk if it's not empty
if chunk:
nwk_split_files, info_split_files = write_split_files(
chunk, (i // n_lines) + 2, nwk_split_files, info_split_files
)
# using subprocess.Popen inside the run_command function,
# call MAD on all of these split newick files. Prepare the commands,
# e.g. `mad -t -n tmp_tree_split.nwk.1`
commands = [
([mad_executable, "-t", "-n", split_file_path], split_file_path + ".stdout")
for split_index, split_file_path in enumerate(nwk_split_files)
]
mad_start_time = time.time()
# Log timestamp for the start of the MAD rooting process
logger.info(f"Starting MAD rooting")
# Create a multiprocessing Pool with the defined maximum number of subprocesses
with Pool(num_subprocesses) as pool:
# Use pool.map to run the commands in parallel
pool.map(run_command, commands)
# Indicate the completion of the MAD rooting process
logger.info(
f"MAD rooting completed. Total time taken for MAD rooting: {timedelta(seconds=time.time() - mad_start_time)}"
)
# Log that the split files are being combined, along with the timestamp
logger.info("Combining the split files into one file...")
# Note: mad adds a `.rooted` at the end of the input filename, as the output filename
# combine the columns from the info files with the nwk.rooted files to prepare the final output file
with open(input_file_path + ".rooted", "w") as rootedOutputFO:
for i, nwk_split_filepath in enumerate(nwk_split_files):
# read in the corresponding split nwk.rooted file
rooted_nwk_split_filepath = f"{nwk_split_filepath}.rooted"
info_split_filepath = info_split_files[i]
with open(rooted_nwk_split_filepath, "r") as rooted_nwk_split_FO:
rooted_nwk_trees = [
i.strip() for i in rooted_nwk_split_FO.readlines()
] # remove trailing newlines. This is a list of rooted trees
# combine the info and rooted trees
with open(info_split_filepath, "r") as info_file:
rooted_info_plus_tree_chunks = [
"\t".join([info_line.strip(), rooted_nwk_trees[line_index]])
for line_index, info_line in enumerate(info_file)
]
# write out the new tree line chunks to the output file
rootedOutputFO.write("\n".join(rooted_info_plus_tree_chunks) + "\n")
# Clear memory
del rooted_info_plus_tree_chunks
del rooted_nwk_trees
# delete the split nwk files in the tmp folder
for split_file_name in nwk_split_files:
os.remove(split_file_name)
# combine all the log files into one log file using `tail -n +1`, and write in the input_dir with timestamp
now = datetime.now()
timestamp = now.strftime("%Y%m%d_%H%M%S")
os.system(
"tail -n +1 "
+ tmp_dir
+ "/*.stdout > "
+ input_dir
+ "/tmp_mad_rooting_splits_"
+ timestamp
+ ".log"
)
# delete the tmp folder
# os.system('rm -rf '+tmp_dir)
if __name__ == "__main__":
start_time = time.time()
# Catches and interprets the user input from the command line.
parser = argparse.ArgumentParser()
# General stuff
parser.add_argument(
"--Input",
"-i",
required=True,
type=str,
help="path to input file which should be tsv format with two columns: first column is the tree ID and the second column is the tree in newick",
)
parser.add_argument(
"--mad",
"-m",
required=False,
type=str,
default="mad",
help="path to MAD executable. Assumes mad is within $PATH by default, if this argument is not provided.",
)
parser.add_argument(
"--nlines",
"-n",
required=False,
type=int,
default=50,
help="(Int) default=50. \
Number of lines that the input file will be split into for each split\
file. Smaller number implies less trees being processed by one MAD process.",
)
parser.add_argument(
"--num_subprocesses",
"-p",
required=False,
type=int,
default=50,
help="(Int) default=50. Number of processes to run in parallel. \
The larger this number, the more number of processes that will \
be run in parallel, since each split file chunk gets its own process",
)
# parser.add_argument('--Output', '-o', required=True, type=str,
# help="path to output files")
args = parser.parse_args()
MAD_roots_NOG_trees_parallel(
args.Input, args.mad, args.nlines, args.num_subprocesses
)
logger.info(
"Process time taken from start to finish: "
+ str(timedelta(seconds=time.time() - start_time))
)
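The rooting script writes its output next to the input file, adding a `.rooted` suffix and keeping one `treeID<TAB>rooted-newick` line per input tree. Below is a minimal sketch (the input path is a placeholder) for checking, after a run, that the rooted file still lines up with the input.

```python
# Minimal sketch (placeholder path) to check that the MAD-rooted output file
# has the same tree IDs, in the same order, as the input TSV.
input_tsv = "gene_trees_newick.tsv"  # placeholder: the file passed via --Input

def tree_ids(path):
    with open(path) as fh:
        return [line.split("\t", 1)[0] for line in fh if line.strip()]

in_ids = tree_ids(input_tsv)
out_ids = tree_ids(input_tsv + ".rooted")
assert in_ids == out_ids, "Tree IDs differ between input and rooted output"
print(f"{len(out_ids)} trees rooted; IDs match the input.")
```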
%% Cell type:markdown id: tags:
ALE seems to fail if the input IDs are the NCBI-style IDs we currently have, which are of the form:
- for genome IDs in the genome tree: `GCF_xxxxxxxx_n`
- for gene IDs in the gene trees: `GCF_xxxxxxxx_n_ABC_xxxxxxxx_m`
Here we convert the genome tree taxon IDs to our own naming of the form `taxonXXXXX`, and the gene tree gene IDs to the form `taxonXXXXX.ABC_xxxxxxxx_m`, saving the mapping between old and new names to a file.
%% Cell type:code id: tags:
``` python
# suppress syntax warnings of ete3
import warnings
warnings.filterwarnings("ignore", category=SyntaxWarning)
```
%% Cell type:code id: tags:
``` python
# take genome tree as input, read the taxa IDs, rename them, and rewrite the tree with the renamed taxa. Save the mapping of old and new names in a file.
import os
import dendropy
import ete3
import re
genome_tree_filepath = "../data/genome_tree/genome_tree.iqtree.treefile.rooted.labeled"
output_tree_filepath = "../data/genome_tree/genome_tree.iqtree.treefile.rooted.labeled.renamed"
mapping_filepath = "../data/genome_tree/genome_tree.iqtree.treefile.rooted.labeled.renamed.mapping"
# read the rooted tree
genome_tree = dendropy.Tree.get(
path=genome_tree_filepath, schema="newick", preserve_underscores=True
)
# read the taxa
taxa = genome_tree.taxon_namespace
# rename the taxa to taxonNNNNN, where NNNNN is the 1-based index of the taxon in the list, zero-padded to 5 digits
mapping = {taxa[i].label: f"taxon{str(i+1).zfill(5)}" for i in range(len(taxa))}
# rename the taxa
for taxon in taxa:
taxon.label = mapping[taxon.label]
# write the tree
genome_tree.write(path=output_tree_filepath, schema="newick")
# write the mapping
with open(mapping_filepath, "w") as mapping_file:
for old_name, new_name in mapping.items():
mapping_file.write(f"{old_name}\t{new_name}\n")
print(f"Tree written to {output_tree_filepath}")
# make a version with no internal labels
output_tree_filepath_no_internal_labels = (
"../data/genome_tree/genome_tree.iqtree.treefile.rooted.renamed.no_internal_labels"
)
# remove the internal labels from genome_tree
for node in genome_tree:
if not node.is_leaf():
node.label = None
genome_tree.write(path=output_tree_filepath_no_internal_labels,
# newick format
schema="newick")
# read in using ete3 and rewrite without scientific notation branch lengths
ete3_genome_tree = ete3.Tree(output_tree_filepath_no_internal_labels, format=1)
ete3_genome_tree.write(outfile=output_tree_filepath_no_internal_labels, format=1,
dist_formatter="%.18f")
############################################################################################################
# now we replace the old taxa labels with new ones in the newick strings of the gene trees
gene_trees_tsv = (
"../data/gene_trees_newick.tsv.rooted" # two columns: gene_tree_id, newick
)
# two columns: gene_tree_id, newick
new_gene_trees_tsv = (
"../data/gene_trees_newick.tsv.rooted.renamed"
)
# first read in the trees into a dict
with open(gene_trees_tsv, "r") as gene_trees_file:
gene_trees_dict = {
line.split("\t")[0]: line.split("\t")[1].strip()
for line in gene_trees_file.readlines()
}
# leaf labels are of the form "GCF_000123456_1_WP_123456789_1" which should be renamed to "taxon00001.WP_123456789_1"
# first replace the dots in the gene tree leaf names with underscores. Make sure the branch lengths are not affected.
for gene_tree_id, gene_tree in list(gene_trees_dict.items()):
# match the leaf labels (they end with :) and replace the dots with underscores
gene_trees_dict[gene_tree_id] = re.sub(
r"([A-Za-z0-9]+)\.([A-Za-z0-9]+):", r"\1_\2:", gene_tree
)
# add an underscore at the end of the old taxon names and a dot at the end of the new taxon names in the mapping
mapping2 = {old_name + "_": new_name + "." for old_name, new_name in mapping.items()}
# display a few mappings
print(f"Old name\tNew name")
for i, (old_name, new_name) in enumerate(mapping2.items()):
if i < 5:
print(f"{old_name}\t{new_name}")
else:
break
# now replace the old taxa names with the new ones, iterating over the gene trees and reading the newick strings
for gene_tree_id, gene_tree in list(gene_trees_dict.items()):
for old_name, new_name in mapping2.items():
gene_tree = re.sub(f"{old_name}", new_name, gene_tree)
gene_trees_dict[gene_tree_id] = gene_tree
# write the renamed gene trees to a new file
with open(new_gene_trees_tsv, "w") as new_gene_trees_file:
for gene_tree_id, gene_tree in gene_trees_dict.items():
new_gene_trees_file.write(f"{gene_tree_id}\t{gene_tree}\n")
print(f"Gene trees written to {new_gene_trees_tsv}")
```
%% Output
Tree written to ../data/genome_tree/genome_tree.iqtree.treefile.rooted.labeled.renamed
Old name New name
GCF_016128235_1_ taxon00001.
GCF_020097475_1_ taxon00002.
GCF_016904755_1_ taxon00003.
GCF_009931435_1_ taxon00004.
GCA_017672245_1_ taxon00005.
Gene trees written to ../data/gene_trees_newick.tsv.rooted.renamed
%% Cell type:code id: tags:
``` python
```
%% Cell type:markdown id: tags:
# Run ALE for inferring gene dynamics
%% Cell type:markdown id: tags:
ALE only requires the species tree to be rooted, so we can use the unrooted gene trees directly.
However, rooted trees seem to make ALE run faster, so we use rooted gene trees, rooted using MAD.
```bash
# make the directory for ALE inferences
mkdir -p data/inferences/gene_dynamics/ALE
cd data/inferences/gene_dynamics/ALE
# run ALE
nohup ~/mambaforge/envs/hgt_analyses/bin/python ../../../../code/run_ALE.py -s ../../../genome_tree/genome_tree.iqtree.treefile.rooted.renamed.no_internal_labels -g ../../../gene_trees_newick.tsv.rooted.renamed -o ./ -t 100 > ../../../nohup_run_ALE_GD.out & disown
# this results in the ALE output dir being in ./RESULTS (i.e. data/inferences/gene_dynamics/ALE/RESULTS)
```
%% Cell type:markdown id: tags:
# Run COUNT and GLOOME for both gene and pathogenicity dynamics
%% Cell type:code id: tags:
``` python
!mkdir -p ../data/inferences/gene_dynamics/Count ../data/inferences/gene_dynamics/GLOOME ../data/inferences/pathogenicity_dynamics/Count ../data/inferences/pathogenicity_dynamics/GLOOME
```
%% Cell type:markdown id: tags:
## Prepare PA Matrices and param files for running COUNT and GLOOME
%% Cell type:code id: tags:
``` python
# suppress syntax warnings of ete3
import warnings
warnings.filterwarnings("ignore", category=SyntaxWarning)
```
%% Cell type:code id: tags:
``` python
import pandas as pd
import os
import ete3
orthogroups_tsv_filepath = "../data/escherichia_ncbi_assemblies/OrthoFinder/Results_Nov28/Orthogroups/Orthogroups.tsv"
acc_to_taxa_mapping_filepath = (
"../data/genome_tree/genome_tree.iqtree.treefile.rooted.labeled.renamed.mapping"
)
ncbi_summary_tsv_filepath = (
"../data/escherichia_pathogen_commensal_ncbi_summary.tsv"
)
og_pa_matrix_filepath = "../data/og_pa_matrix.tsv"
og_pa_matrix_filepath = os.path.abspath(og_pa_matrix_filepath)
pathogenicity_pa_matrix_filepath = "../data/pathogenicity_pa_matrix.tsv"
pathogenicity_pa_matrix_filepath = os.path.abspath(pathogenicity_pa_matrix_filepath)
tree_filepath = (
"../data/genome_tree/genome_tree.iqtree.treefile.rooted.renamed.no_internal_labels"
)
tree_filepath = os.path.abspath(tree_filepath)
```
%% Cell type:code id: tags:
``` python
# functions to prepare OGs and Pathogenicity PA matrices
def prepare_og_pa_matrix(
orthogroups_tsv_filepath,
acc_to_taxa_mapping_filepath,
og_pa_matrix_filepath: str,
tree_filepath: str,
min_taxa=10, # in line with ALE's gene trees having at least 10 taxa
single_copy=False,
) -> None:
"""
Prepare OG presence-absence matrix for GLOOME/COUNT and prune the tree
"""
# OG mapping from Orthofinder
orthogroups_df = pd.read_csv(
orthogroups_tsv_filepath, sep="\t", header=0, index_col=0, low_memory=False
)
# keep only OGs with at least min_taxa genomes (columns) with any genes in them
orthogroups_df = orthogroups_df.loc[
orthogroups_df.apply(lambda x: x.count() >= min_taxa, axis=1)
]
if not single_copy:
gloome_output_fasta_filepath = os.path.join(
os.path.dirname(og_pa_matrix_filepath),
os.path.basename(og_pa_matrix_filepath).replace(".tsv", ".fasta"),
)
count_output_tsv_filepath = og_pa_matrix_filepath
pruned_tree_out_file = f"{tree_filepath}.pruned"
else:
gloome_output_fasta_filepath = os.path.join(
os.path.dirname(og_pa_matrix_filepath),
os.path.basename(og_pa_matrix_filepath).replace(".tsv", ".single_copy.fasta"),
)
count_output_tsv_filepath = os.path.join(
os.path.dirname(og_pa_matrix_filepath),
os.path.basename(og_pa_matrix_filepath).replace(".tsv", ".single_copy.tsv"),
)
pruned_tree_out_file = f"{tree_filepath}.single_copy_ogs.pruned"
# index is the name of the OG, and header columns are the IDs of the genomes. Every entry is (comma-separated) list of genes in the OG for the corresponding genome
# we want to convert this to a presence-absence matrix
# convert the comma-separated lists to number of genes in the OG, and wherever it's NaN, replace with 0. Df entries should be integer values
orthogroups_df = orthogroups_df.map(lambda x: 0 if pd.isna(x) else len(x.split(',')))
# convert dots in genome IDs to underscores
orthogroups_df.columns = orthogroups_df.columns.str.replace(".", "_")
# remove the name of the index
orthogroups_df.index.name = None
# replace genome acc IDs with our own taxon IDs using the acc_to_taxon_mapping_file
acc_to_taxa_mapping_dict = (
pd.read_csv(
acc_to_taxa_mapping_filepath,
sep="\t",
header=None,
names=["acc_id", "taxon_id"],
)
.set_index("acc_id")["taxon_id"]
.to_dict()
)
orthogroups_df.rename(columns=acc_to_taxa_mapping_dict, inplace=True)
# sort the column names
orthogroups_df = orthogroups_df[sorted(orthogroups_df.columns)]
# identify OGs with exactly one gene in every genome (single-copy OGs)
single_copy_mask = (orthogroups_df == 1).all(axis=1)
# write the list of single-copy OGs to a file
orthogroups_df.loc[single_copy_mask].index.to_series().to_csv(
os.path.join(
os.path.dirname(og_pa_matrix_filepath),
"list.single_copy_ogs.txt",
),
header=False,
index=False,
)
if single_copy:
# keep only the single-copy OGs
orthogroups_df = orthogroups_df.loc[single_copy_mask]
print("Retained only single-copy OGs")
# GLOOME requires a fasta file with every row as genome ID, and every column as OG.
# Every sequence basically has to be a 0/1 string. We can convert the OG PA matrix to this format
# transpose
orthogroups_df_binary = orthogroups_df.T
# convert to 0/1
orthogroups_df_binary = orthogroups_df_binary.map(lambda x: 1 if x > 0 else 0)
# write to a fasta file
with open(
gloome_output_fasta_filepath,
"w",
) as f:
for genome, row in orthogroups_df_binary.iterrows():
f.write(">" + genome + "\n")
f.write("".join(row.to_numpy().astype(str)) + "\n")
# COUNT requires a TSV file with every row as OG and every column as genome ID
orthogroups_df.to_csv(count_output_tsv_filepath, sep="\t")
# print size of the matrix
print("OG PA matrix shape:", orthogroups_df.shape)
# Prune the tree to only include the genomes in the OG PA matrix
genomes_in_og_pa_matrix = orthogroups_df.columns.tolist()
genome_tree = ete3.Tree(tree_filepath, format=1)
genome_tree.prune(genomes_in_og_pa_matrix)
genome_tree_write_str = genome_tree.write(format=1, format_root_node=True)
with open(pruned_tree_out_file, "w") as f:
f.write(genome_tree_write_str + "\n")
print("Pruned tree written to", pruned_tree_out_file)
# this tree is without internal labels. We make another tree where we label the internal nodes
# labels: N1, N2, N3, ...
for i, node in enumerate(genome_tree.traverse("postorder")):
if not node.is_leaf():
node.name = f"N{i+1}"
pruned_labeled_out_file = f"{pruned_tree_out_file.replace('no_internal_labels', 'labeled')}"
genome_tree_write_str = genome_tree.write(format=1, format_root_node=True)
with open(pruned_labeled_out_file, "w") as f:
f.write(genome_tree_write_str + "\n")
return None
def prepare_pathogenicity_pa_matrix(
ncbi_summary_tsv_filepath,
acc_to_taxa_mapping_filepath,
tree_filepath,
pathogenicity_pa_matrix_filepath: str,
single_copy=False,
) -> None:
"""
Prepare pathogenicity presence-absence matrix for GLOOME/COUNT
"""
# pathogenicity data is from the original ncbi summary tsv file
ncbi_summary_df = pd.read_csv(
ncbi_summary_tsv_filepath, sep="\t", header=0, index_col=0
)
ncbi_summary_df = ncbi_summary_df[
["Pathogenicity"]
] # and index contains the genome IDs
# convert to 0/1
ncbi_summary_df = ncbi_summary_df.map(lambda x: 1 if x == "Pathogen" else 0)
# replace dots in genome IDs with underscores
ncbi_summary_df.index = ncbi_summary_df.index.str.replace(".", "_")
# replace genome acc IDs with our own taxon IDs using the acc_to_taxon_mapping_file
acc_to_taxa_mapping_dict = (
pd.read_csv(
acc_to_taxa_mapping_filepath,
sep="\t",
header=None,
names=["acc_id", "taxon_id"],
)
.set_index("acc_id")["taxon_id"]
.to_dict()
)
ncbi_summary_df.rename(index=acc_to_taxa_mapping_dict, inplace=True)
taxa_tree = ete3.Tree(f"{tree_filepath}.pruned", format=1)
# keep only the genomes that are in the pruned tree for the whole OG PA matrix
taxa = [leaf.name for leaf in taxa_tree.iter_leaves()]
ncbi_summary_df = ncbi_summary_df.loc[taxa]
fasta_out_file = pathogenicity_pa_matrix_filepath.replace(".tsv", ".fasta")
tsv_out_file = pathogenicity_pa_matrix_filepath
# GLOOME requires a fasta file with every row as genome ID, and a single column with the pathogenicity values
# with open("pathogenicity_pa_matrix.fasta", "w") as f:
with open(fasta_out_file, "w") as f:
for genome, row in ncbi_summary_df.iterrows():
f.write(">" + genome + "\n")
f.write(str(row.iloc[0]) + "\n")
# COUNT requires a TSV file with every column as genome ID and a single row with the pathogenicity values
ncbi_summary_df.T.to_csv(tsv_out_file,
sep="\t", header=True)
# print size of the matrix
print("Pathogenicity PA matrix shape:", ncbi_summary_df.shape)
return None
prepare_og_pa_matrix(
orthogroups_tsv_filepath,
acc_to_taxa_mapping_filepath,
og_pa_matrix_filepath,
tree_filepath,
)
print("Wrote OG presence-absence matrix to", og_pa_matrix_filepath)
# for the pathogenicity data for all taxa
prepare_pathogenicity_pa_matrix(
ncbi_summary_tsv_filepath,
acc_to_taxa_mapping_filepath,
tree_filepath,
pathogenicity_pa_matrix_filepath,
)
```
%% Output
OG PA matrix shape: (5503, 151)
Pruned tree written to /root/work/projects/hgt_pathogenicity/data/genome_tree/genome_tree.iqtree.treefile.rooted.renamed.no_internal_labels.pruned
Wrote OG presence-absence matrix to /root/work/projects/hgt_pathogenicity/data/og_pa_matrix.tsv
Pathogenicity PA matrix shape: (151, 1)
%% Cell type:code id: tags:
``` python
%%bash -s "$og_pa_matrix_filepath" "$pathogenicity_pa_matrix_filepath" "$tree_filepath"
# create params files for GLOOME
# first for OGs
cat > ../data/infer_gene_dynamics_ogs.gloome.params << EOL
_seqFile $(dirname $1)/$(basename $1 .tsv).fasta
_treeFile $3.pruned
# use mixture-model
_gainLossDist 1
# for COG and EggNOG only patterns with 3 or more ones are observable
_minNumOfOnes 3
# include Parsimony results along with ML
_costMatrixGainLossRatio 1
# in this case, character frequencies are not equal across the tree
_isRootFreqEQstationary 0
## Advanced
_logValue 4
# print all probabilities for all events per pos per branch (default is 0.05 and everything below is not printed)
_probCutOffPrintEvent 0.0
_outDir RESULTS_OGs
EOL
############################################################################################################
# similarly but for single-copy OGs
cat > ../data/infer_gene_dynamics_single_copy_ogs.gloome.params << EOL
_seqFile $(dirname $1)/$(basename $1 .tsv).single_copy.fasta
_treeFile $3.pruned
# use mixture-model
_gainLossDist 1
# for COG and EggNOG only patterns with 3 or more ones are observable
_minNumOfOnes 3
# include Parsimony results along with ML
_costMatrixGainLossRatio 1
# in this case, character frequencies are not equal across the tree
_isRootFreqEQstationary 0
## Advanced
_logValue 4
# print all probabilities for all events per pos per branch (default is 0.05 and everything below is not printed)
_probCutOffPrintEvent 0.0
_outDir RESULTS_Single_Copy_OGs
EOL
############################################################################################################
# then for pathogenicity
cat > ../data/infer_pathogenicity_dynamics.gloome.params << EOL
_seqFile $(dirname $2)/$(basename $2 .tsv).fasta
_treeFile $3.pruned
# use mixture-model
_gainLossDist 1
# include Parsimony results along with ML
_costMatrixGainLossRatio 1
# in this case, character frequencies are not equal across the tree
_isRootFreqEQstationary 0
## Advanced
_logValue 4
_outDir RESULTS_Pathogenicity
EOL
############################################################################################################
# similarly but for single-copy OGs
cat > ../data/infer_pathogenicity_dynamics_single_copy.gloome.params << EOL
_seqFile $(dirname $2)/$(basename $2 .tsv).single_copy.fasta
_treeFile $3.pruned
# use mixture-model
_gainLossDist 1
# include Parsimony results along with ML
_costMatrixGainLossRatio 1
# in this case, character frequencies are not equal across the tree
_isRootFreqEQstationary 0
## Advanced
_logValue 4
_outDir RESULTS_Pathogenicity_Single_Copy
EOL
```
%% Cell type:markdown id: tags:
We can use these params files as arguments for GLOOME.
Run the following process (and the equivalent for the other params files created above). Using `nohup`, `&`, and `disown` here because the process may take a while to finish, and I am running it remotely on a server; the processes shouldn't be affected this way, even if I log out.
```bash
nohup ~/bin/GLOOME.VR01.266 infer_gene_dynamics_ogs.gloome.params > nohup_infer_gene_dynamics_ogs.out & disown
```
%% Cell type:markdown id: tags:
--------
Similarly for Count:
```bash
# change to the output directory for COUNT Pathogenicity dynamics
cd data/inferences/pathogenicity_dynamics/Count
# run COUNT for pathogenicity dynamics
java -Xmx2048M -cp ~/bin/Count/Count.jar ca.umontreal.iro.evolution.genecontent.AsymmetricWagner -gain 1 ../../../genome_tree/genome_tree.iqtree.treefile.rooted.renamed.no_internal_labels.pruned ../../../pathogenicity_pa_matrix.tsv > Count_output.tsv
# separate the output into genome sizes, changes, and families
grep "# PRESENT" Count_output.tsv > Count_genome_sizes.tsv && grep "# CHANGE" Count_output.tsv > Count_changes.tsv && grep "# FAMILY" Count_output.tsv > Count_families.tsv
# change to the output directory for COUNT Gene dynamics
cd ../../gene_dynamics/Count
# run COUNT for gene dynamics
java -Xmx2048M -cp ~/bin/Count/Count.jar ca.umontreal.iro.evolution.genecontent.AsymmetricWagner -gain 1 ../../../genome_tree/genome_tree.iqtree.treefile.rooted.renamed.no_internal_labels.pruned ../../../og_pa_matrix.tsv > Count_output.tsv
# separate the output into genome sizes, changes, and families
grep "# PRESENT" Count_output.tsv > Count_genome_sizes.tsv && grep "# CHANGE" Count_output.tsv > Count_changes.tsv && grep "# FAMILY" Count_output.tsv > Count_families.tsv
```
### CCB plot style sheet v0.5
# based on template from:
# https://matplotlib.org/stable/tutorials/introductory/customizing.html#matplotlibrc-sample
#
# Compared to previous versions, this version only contains changes from default settings
# instead of containing all settings, and is therefore shorter.
#
# Color scheme: Bright qualitative (7 colors) reordered, colour-blind safe
# Borrowed from Paul Tol: https://personal.sron.nl/~pault/
# 4477AA EE6677 228833 CCBB44 66CCEE AA3377 BBBBBB
# blue cyan green yellow red purple grey
#
# Stylesheet changes originally created by Peter Schubert, 2020
# Updated to current matplotlib version 3.7 by Swastik Mishra, 2023
#
# ***************************************************************************
# * LINES *
# ***************************************************************************
# See https://matplotlib.org/stable/api/artist_api.html#module-matplotlib.lines
# for more information on line properties.
lines.markersize: 6 # marker size, in points
# markers.fillstyle: none # {full, left, right, bottom, top, none}
## ***************************************************************************
## * BOXPLOT *
## ***************************************************************************
boxplot.meanprops.marker: x
# ***************************************************************************
# * AXES *
# ***************************************************************************
# Following are default face and edge colors, default tick sizes,
# default font sizes for tick labels, and so on. See
# https://matplotlib.org/stable/api/axes_api.html#module-matplotlib.axes
axes.edgecolor: 666666 # axes edge color
axes.titlepad: 12.0 # pad between axes and title in points
axes.labelpad: 3.0 # space between label and axis
axes.axisbelow: True # draw axis gridlines and ticks:
# - below patches (True)
# - above patches but below lines ('line')
# - above all (False)
axes.formatter.limits: -3, 3 # use scientific notation if log10
# of the axis range is smaller than the
# first or larger than the second
axes.spines.top: False
axes.spines.right: False
axes.unicode_minus: False # use Unicode for the minus symbol rather than hyphen. See
# https://en.wikipedia.org/wiki/Plus_and_minus_signs#Character_codes
axes.prop_cycle: cycler(color = ['4477AA', 'EE6677', '228833', 'CCBB44', '66CCEE', 'AA3377', 'BBBBBB'], marker = ['o','^','s','*','d','P','X'] )
## bright qualitative color scheme - reordered, as per Paul Tol's Colour Schemes and templates
## https://personal.sron.nl/~pault/
# color cycle for plot lines as list of string color specs:
# single letter, long name, or web-style hex
# As opposed to all other parameters in this file, the color
# values must be enclosed in quotes for this parameter,
# e.g. '1f77b4', instead of 1f77b4.
# See also https://matplotlib.org/stable/tutorials/intermediate/color_cycle.html
# for more details on prop_cycle usage.
axes.xmargin: .02 # x margin. See `axes.Axes.margins`
axes.ymargin: .02 # y margin. See `axes.Axes.margins`
axes.zmargin: .02 # z margin. See `axes.Axes.margins`
# ***************************************************************************
# * TICKS *
# ***************************************************************************
# See https://matplotlib.org/stable/api/axis_api.html#matplotlib.axis.Tick
xtick.major.size: 5 # major tick size in points
xtick.minor.size: 3 # minor tick size in points
xtick.color: 666666 # color of the ticks
xtick.labelcolor: inherit # color of the tick labels or inherit from xtick.color
xtick.labelsize: medium # font size of the tick labels
xtick.direction: inout # direction: {in, out, inout}
ytick.major.size: 5 # major tick size in points
ytick.minor.size: 3 # minor tick size in points
ytick.color: 666666 # color of the ticks
ytick.labelcolor: inherit # color of the tick labels or inherit from ytick.color
ytick.labelsize: medium # font size of the tick labels
ytick.direction: inout # direction: {in, out, inout}
# ***************************************************************************
# * GRIDS *
# ***************************************************************************
grid.linestyle: --            # dashed
grid.alpha: 1.0 # transparency, between 0.0 and 1.0
# ***************************************************************************
# * LEGEND *
# ***************************************************************************
legend.frameon: False # if True, draw the legend on a background patch
legend.framealpha: 0.7 # legend patch transparency
legend.markerscale: 0.9 # the relative size of legend markers vs. original
legend.title_fontsize: medium # None sets to the same as the default axes.
# Dimensions as fraction of font size:
legend.borderpad: 0.2 # border whitespace
legend.labelspacing: 0.3 # the vertical space between the legend entries
legend.handlelength: 1.0 # the length of the legend lines
legend.handleheight: 0.7 # the height of the legend handle
legend.handletextpad: 0.5 # the space between the legend line and legend text
legend.borderaxespad: 0.2 # the border between the axes and legend edge
legend.columnspacing: 2.0 # column separation
# ***************************************************************************
# * FIGURE *
# ***************************************************************************
# See https://matplotlib.org/stable/api/figure_api.html#matplotlib.figure.Figure
figure.titlesize: xx-large # size of the figure title (``Figure.suptitle()``)
figure.titleweight: bold # weight of the figure title
figure.labelsize: large # size of the figure label (``Figure.sup[x|y]label()``)
figure.figsize: 5, 3.09 # figure size in inches
### golden ratio: phi = y/x = (1+sqrt(5))/2 ≈ 1.618
figure.frameon: False # enable figure frame
figure.subplot.wspace: 0.3 # the amount of width reserved for space between subplots,
# expressed as a fraction of the average axis width
figure.subplot.hspace: 0.45 # the amount of height reserved for space between subplots,
# expressed as a fraction of the average axis height
# ***************************************************************************
# * ERRORBAR PLOTS *
# ***************************************************************************
errorbar.capsize: 2 # length of end cap on error bars in pixels
# ***************************************************************************
# * HISTOGRAM PLOTS *
# ***************************************************************************
hist.bins: 50 # The default number of histogram bins or 'auto'.
# ***************************************************************************
# * SAVING FIGURES *
# ***************************************************************************
# The default savefig parameters can be different from the display parameters
# e.g., you may want a higher resolution, or to make the figure
# background white
savefig.facecolor: white # figure face color when saving
savefig.edgecolor: auto # figure edge color when saving
savefig.format: pdf # {png, ps, pdf, svg}
savefig.bbox: tight # {tight, standard}
# 'tight' is incompatible with pipe-based animation
# backends (e.g. 'ffmpeg') but will work with those
# based on temporary files (e.g. 'ffmpeg_file')
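To apply this style sheet in the notebooks, point matplotlib at the file. Below is a minimal sketch, where the filename `ccb_plot_style.mplstyle` and its location are assumptions rather than something specified in this repo.

```python
# Minimal sketch (assumed filename/path) of applying the CCB style sheet.
import matplotlib.pyplot as plt

plt.style.use("code/ccb_plot_style.mplstyle")  # assumed path to this style file

# the colour/marker prop_cycle defined above is then applied automatically
fig, ax = plt.subplots()
for shift in range(3):
    ax.plot(range(5), [v + shift for v in range(5)], label=f"series {shift}")
ax.set_xlabel("x")
ax.set_ylabel("y")
ax.legend()
fig.savefig("style_demo.pdf")  # savefig.format defaults to pdf in this style sheet
```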