Commit 1633b33a authored by Swastik Mishra's avatar Swastik Mishra
Initial commit

arc/
data/
scrap/
.vscode
.txt
# Environmental transitions and HGT rates
Code to reproduce the results of the paper, "Horizontal gene transfer rates decrease during transitions to new ecosystems".
Jupyter notebooks in the `notebooks` directory were run in the order of their numbering. Scripts and other helper functions that they depend on, or that are mentioned in the notebooks, can be found in the `code` directory.
Please note that using 'Run all' or its equivalent in the Jupyter notebooks will generally not work. Some of the intervening steps in the notebooks are markdown cells with instructions on how to run programs separately via the shell. These programs are time-consuming and often use multiprocessing on an HPC. Please run the notebooks cell by cell, keeping this in mind. Generally speaking, if you have run the previous cell, you can run the current cell if it is a Python cell; if it is a markdown cell instructing you to do something, do that before proceeding.
This study downloads and processes a number of large files, which are stored in the `data` directory (empty in this repo). If you follow and run the notebooks, you will progressively fill the `data` directory and reproduce all the results as well as the figures in the paper. Alternatively, you can download a version of this repo with the results and figures already present in the `data` directory, from the Zenodo link in the paper.
## Required python packages
A Mamba/Conda environment called `hgt_analyses` was used for all the analyses. This environment, with all required packages, can be recreated from the `mamba_packages.yml` file using the following command:
```
mamba env create -f mamba_packages.yml
```
I use Mamba since it is faster than regular Conda, but if you use Conda just replace `mamba` with `conda` in the above command.
The only other package you need to install is `xlsx2csv`, using `pip`:
```
pip install xlsx2csv
```
#!/usr/bin/env python3
import os
import time
import argparse
import getpass
import requests
from loguru import logger
import sys
from urllib.parse import urlencode
from requests.exceptions import RequestException
import json
def get_command_line_arguments():
cl_parser = argparse.ArgumentParser(
description="Download genome assemblies from Enterobase given a TSV export listing the assemblies.\n \
Obtain this TSV via the web GUI, by filtering and using 'Data > Save to Local File'.\n \
The file must contain the 'Assembly barcode' column and the \n \
'Data Source(Accession No.;Sequencing Platform;Sequencing Library;Insert Size;Experiment;Status)' column.")
cl_parser.add_argument("-i", "--input", help="Path to the TSV file exported from Enterobase, listing the assemblies to download.", required=True,
dest="barcodes", type=str)
cl_parser.add_argument("-o", "--output", help="Specify output path to save assemblies", default=os.getcwd() +
os.sep+"Enterobase_Assemblies"+time.strftime('%Y%m%d_%H%M'), dest="output", type=str)
cl_parser.add_argument("--debug", help="Run in debug mode (first 5 assembly codes only).",
action='store_true', required=False, dest="debug")
cl_parser.add_argument(
"-d", "--database", help="Enterobase database: (senterica, ecoli, yersinia, mcatarrhalis)", dest="db", type=str, required=True)
cl_parser.add_argument("--retry", type=int, default=5,
help="Number of retries for failed requests.")
cl_parser.add_argument("--sleep", type=int, default=5,
help="Sleep time between requests in seconds.")
return cl_parser.parse_args()
def check_user_input(cl_args):
if not os.path.isfile(cl_args.barcodes):
logger.error('Could not find the input TSV file.')
sys.exit()
cl_args.output = cl_args.output + os.sep
os.makedirs(os.path.dirname(cl_args.output), exist_ok=True)
if cl_args.db not in ['senterica', 'ecoli', 'yersinia', 'mcatarrhalis']:
logger.error(
'Invalid database. Please choose from senterica, ecoli, yersinia, mcatarrhalis')
sys.exit()
if cl_args.debug:
logger.warning(
'Running in debug mode. Only the first 5 assembly codes will be downloaded.')
def get_login_details():
ENTEROBASE_USERNAME = input("Please enter Enterobase username: ")
ENTEROBASE_PASSWORD = getpass.getpass("Please enter Enterobase password: ")
return ENTEROBASE_USERNAME, ENTEROBASE_PASSWORD
def get_api_token(ENTEROBASE_USERNAME, ENTEROBASE_PASSWORD):
ENTEROBASE_SERVER = 'https://enterobase.warwick.ac.uk'
params = urlencode({"username": ENTEROBASE_USERNAME,
"password": ENTEROBASE_PASSWORD})
apiaddress = f'{ENTEROBASE_SERVER}/api/v2.0/login?{params}'
logger.info('Retrieving API token.')
response = requests.get(apiaddress)
if response.text: # Check if the response is not empty
try:
data = response.json()
API_TOKEN = data['api_token']
logger.info("Retrieved API token.")
return API_TOKEN
except json.JSONDecodeError:
logger.error("Failed to decode JSON from the response.")
return None
else:
logger.error(
"Empty response received while trying to retrieve API token.")
return None
def create_request(request_str, API_TOKEN, retries, sleep):
for _ in range(retries):
try:
response = requests.get(request_str, headers={
"Authorization": f"Basic {API_TOKEN}"})
if response.status_code == 200:
return response
else:
logger.error(f"Request to {request_str} failed with status code {
response.status_code}. Retrying...")
except RequestException as e:
logger.error(f"Request to {request_str} failed with exception {
e}. Retrying...")
time.sleep(sleep)
return None
def search(header_line, search_term):
try:
return header_line.split('\t').index(search_term)
except ValueError:
logger.error(f'{search_term} could not be found in header')
sys.exit(1)
def get_assembly_codes_and_dict(cl_args):
assembly_codes = []
assembly_dict = {}
with open(cl_args.barcodes, 'r') as fin:
header = fin.readline()
indexer_barcode = search(header, 'Assembly barcode')
indexer_accession = search(
header, 'Data Source(Accession No.;Sequencing Platform;Sequencing Library;Insert Size;Experiment;Status)')
for line in fin:
assembly_codes.append(line.split('\t')[indexer_barcode].strip())
assembly_dict[line.split('\t')[indexer_barcode].strip()] = line.split('\t')[
indexer_accession].split(';')[0]
return assembly_codes, assembly_dict
def main():
start_time = time.time()
# Get command line arguments
cl_args = get_command_line_arguments()
# Check user input for errors, and set up output directory
check_user_input(cl_args)
# Set up logging
logger.add(f"{cl_args.output}download_enterobase_assemblies.log",
format="{time} {level} {message}", level="INFO")
# Get login details from user. Use getpass to hide password input. Get API token
ENTEROBASE_USERNAME, ENTEROBASE_PASSWORD = get_login_details()
# Get API token from Enterobase using login details
API_TOKEN = get_api_token(ENTEROBASE_USERNAME, ENTEROBASE_PASSWORD)
# Get assembly codes and dictionary from input file
assembly_codes, assembly_dict = get_assembly_codes_and_dict(
cl_args) # Get assembly codes and dictionary
count = 0 # Counter for progress
fasta_error_count = 0
assembly_code_error_count = 0
for querycode in assembly_codes:
time.sleep(cl_args.sleep)
params = urlencode({"barcode": querycode, "limit": 50})
address = f'http://enterobase.warwick.ac.uk/api/v2.0/{cl_args.db}/assemblies?{params}'
response = create_request(
address, API_TOKEN, cl_args.retry, cl_args.sleep)
# If response is not None, get the fasta download link from the response
if response:
data = response.json()
fasta_url = data['Assemblies'][0]['download_fasta_link']
fasta_response = create_request(
fasta_url, API_TOKEN, cl_args.retry, cl_args.sleep)
if fasta_response:
# Fasta filename is the assembly code or the accession number if it exists
with open(f"{cl_args.output}{assembly_dict.get(querycode, querycode)}.fasta", 'w') as fasta_out:
fasta_out.write(fasta_response.text)
logger.info(f"Successfully downloaded {querycode}")
else:
logger.error(f"Failed to download {querycode} from {
fasta_url} after retries.")
fasta_error_count += 1
# If response is None, log the failed request
else:
logger.error(f"Failed request for {querycode} from {
address} after retries.")
assembly_code_error_count += 1
logger.info(f'Progress: {count+1} out of {len(assembly_codes)}')
count += 1
if cl_args.debug and count == 5:
break
# Final log of completion
logger.info(f"Downloaded {count} assemblies. {fasta_error_count} assemblies failed to download. {
assembly_code_error_count} assemblies failed to retrieve.")
logger.info(f"Output directory: {cl_args.output}")
# log total time taken in human readable format
logger.info(f"Total time taken: {time.strftime(
'%H:%M:%S', time.gmtime(time.time()-start_time))}")
if __name__ == "__main__":
main()
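The script above expects the TSV exported from the Enterobase web GUI as its `-i` input and looks up two specific columns in the header. Below is a minimal sketch for checking an export before starting a long download run; the filename `enterobase_export.tsv` is only a placeholder.

```python
# Minimal sketch (placeholder filename) to confirm an Enterobase TSV export
# contains the two columns the download script looks up in the header.
import csv

REQUIRED_COLUMNS = {
    "Assembly barcode",
    "Data Source(Accession No.;Sequencing Platform;Sequencing Library;Insert Size;Experiment;Status)",
}

with open("enterobase_export.tsv", newline="") as fh:
    header = next(csv.reader(fh, delimiter="\t"))

missing = REQUIRED_COLUMNS - set(header)
print("All required columns present." if not missing else f"Missing columns: {missing}")
```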
# run IQtree on the MSA files output by MAFFT (via OrthoFinder, in previous step)
# we run IQtree in parallel on all the MSA files
import os
import multiprocessing
import subprocess
from loguru import logger
import random
import sys
def run_iqtree(
msa_filepath,
iqtree_bin,
output_trees_dir,
substitution_model,
threads_max,
threads_per_process=1,
):
output_tree_prefix = f'{output_trees_dir}/{os.path.basename(msa_filepath).replace(".fa", "")}'
# run iqtree; the thread count of each IQtree process is set here, while the
# total parallelism is controlled by the multiprocessing pool size below
cmd = f"{iqtree_bin} -s {msa_filepath} -m {substitution_model} -pre {output_tree_prefix} -quiet -T {threads_per_process}"
logger.info(
f"Running IQtree on {msa_filepath}, writing output with prefix {output_tree_prefix}."
)
try:
subprocess.run(cmd, shell=True, check=True)
except subprocess.CalledProcessError as e:
return f"Error running IQtree on {msa_filepath}: {e}"
logger.info(f"Finished running IQtree on {msa_filepath}")
return None
def run_iqtree_wrapper(args):
"""
Wrapper function to run IQtree with multiple arguments, for the multiprocessing.Pool.imap_unordered function.
"""
return run_iqtree(*args)
def get_seq_length(msa_filepath):
"""
Function to get the number of characters (including gaps) in the first sequence of the MSA file.
"""
with open(msa_filepath, "r") as f:
# split the file by ">" and take the second element, which is the first record
first_record = f.read().split(">")[1]
# drop the header line and join the sequence lines
return len("".join(first_record.splitlines()[1:]))
if __name__ == "__main__":
import time
import argparse
start_time = time.time()
parser = argparse.ArgumentParser(
description="Run IQtree on all MSA files in a directory. For large MSA files, run with multiple threads per process"
)
parser.add_argument(
"-m",
"--msa_dir",
type=str,
required=True,
help="Directory containing MSA files",
)
parser.add_argument(
"-o",
"--output_dir",
type=str,
default="./gene_trees/",
help="Output directory for gene trees",
)
parser.add_argument(
"-T",
"--threads-max",
type=int,
default=100,
help="Maximum number of threads to use",
)
parser.add_argument(
"-t",
"--threads-per-process",
type=int,
default=5,
help="Number of threads per process to use, for large MSA files",
)
parser.add_argument(
"-b",
"--iqtree_bin",
type=str,
default="/root/bin/iqtree-2.2.2.6-Linux/bin/iqtree2",
help="Path to IQtree binary",
)
parser.add_argument(
"-d",
"--debug",
action="store_true",
help="Enable debug mode (run on first 5 MSA files only)",
)
parser.add_argument(
"-s",
"--substitution_model",
type=str,
default="JTT",
help="Substitution model for IQtree",
)
parser.add_argument(
"--filesize_cutoff",
type=int,
default=150,
help="Cutoff for number of sequences in MSA file to determine if it is a large or small file.\n \
Files with more than this cutoff will be run with multiple threads per process.",
)
parser.add_argument(
"--min_seqs",
type=int,
default=10,
help="Minimum number of sequences in MSA file to run IQtree. Files with fewer sequences will be skipped.",
)
parser.add_argument(
"--random",
type=int,
nargs="?",
const=1000,
default=None,
help="If given, pick this many MSA at random. \
If no number is provided, but argument is provided, default to 1000. \
If this argument is not given, take all MSA above min_seqs in size.",
)
parser.add_argument(
"--min_len",
type=int,
default=150,
help="Minimum length of longest sequence of MSA file (including gaps). Files with smaller genes will be skipped.",
)
args = parser.parse_args()
# remove the output directory if it already exists
if os.path.exists(args.output_dir):
os.system(f"rm -r {args.output_dir}")
logger.warning(f"Removed existing output directory: {args.output_dir}")
# create the output directory, then set up logging inside it
os.makedirs(args.output_dir, exist_ok=True)
logger.add(f'{args.output_dir}/iqtree_run_{time.strftime("%Y%m%d_%H%M%S")}.log')
# check if IQtree binary exists and MSA directory exists
if not os.path.exists(args.iqtree_bin):
raise FileNotFoundError(f"IQtree binary not found: {args.iqtree_bin}")
if not os.path.exists(args.msa_dir):
raise FileNotFoundError(f"MSA directory not found: {args.msa_dir}")
# get all MSA files (in FASTA format, ending with ".fa") in the directory specified by `args.msa_dir`
msa_filepaths = [f for f in os.listdir(args.msa_dir) if f.endswith(".fa")]
msa_filepaths = [f"{args.msa_dir}{f}" for f in msa_filepaths]
# first get the sizes (number of characters in the second line), of all the MSA files
msa_lengths = [(f, get_seq_length(f)) for f in msa_filepaths]
# remove all MSA files that have fewer than min_len characters in the first sequence
short_seqs_to_remove = []
for msa_filepath, length in msa_lengths:
if length < args.min_len:
short_seqs_to_remove.append(msa_filepath)
msa_filepaths = [f for f in msa_filepaths if f not in short_seqs_to_remove]
logger.info(
f"Number of MSA files with at least {args.min_len} characters in the first sequence: {len(msa_filepaths)}"
)
# write this list of MSA files and their first-sequence lengths to a file in the output directory
with open(f"{args.output_dir}/msa_files_sequence_counts.txt", "w") as f:
for msa_filepath, seq_length in msa_lengths:
f.write(f"{seq_length}\t{msa_filepath}\n")
# for all the MSA files, find the number of sequences they have by opening and reading the number of lines starting with ">".
# Retain only MSA files that have more than `args.min_seqs` sequences
small_files_to_remove = []
msa_filepath_sizes = []
for msa_filepath in msa_filepaths:
with open(msa_filepath, "r") as f:
num_seqs = sum(1 for l in f if l.startswith(">"))
if num_seqs < args.min_seqs:
small_files_to_remove.append(msa_filepath)
else:
msa_filepath_sizes.append((msa_filepath, num_seqs))
msa_filepaths = [f for f in msa_filepaths if f not in small_files_to_remove]
logger.info(
f"Number of MSA files with at least {args.min_seqs} sequences: {len(msa_filepaths)}"
)
if args.random is not None:
msa_filepath_sizes = random.sample(
msa_filepath_sizes, min(args.random, len(msa_filepath_sizes))
)
logger.info(f"Picked {len(msa_filepath_sizes)} MSA files at random")
# run IQtree on all MSA files
if args.debug:
msa_filepath_sizes = msa_filepath_sizes[:5]
# we use a hybrid approach, wherein large MSA files get multiple threads per process
# and small MSA files get a single thread per process;
# large MSA files are those with more than `args.filesize_cutoff` sequences
small_msa_filepaths = [
f for f in msa_filepath_sizes if f[1] <= args.filesize_cutoff
]
large_msa_filepaths = [f for f in msa_filepath_sizes if f[1] > args.filesize_cutoff]
logger.info(
f"Number of MSA files with <= {args.filesize_cutoff} sequences: {len(small_msa_filepaths)}, and > {args.filesize_cutoff} sequences: {len(large_msa_filepaths)}"
)
# sys.exit(0)
if small_msa_filepaths:
logger.info(
f"Running IQtree on {len(small_msa_filepaths)} MSA files, each with <= {args.filesize_cutoff} sequences"
)
# run small MSA files first
with multiprocessing.Pool(processes=args.threads_max) as pool:
for pool_return in pool.imap_unordered(
run_iqtree_wrapper,
[
(
msa_filepath,
args.iqtree_bin,
args.output_dir,
args.substitution_model,
args.threads_max,
1,
)
for msa_filepath, _ in small_msa_filepaths
],
):
if pool_return is not None:
logger.error(f"Error running IQtree: {pool_return}")
if large_msa_filepaths:
logger.info(
f"Running IQtree on {len(large_msa_filepaths)} MSA files, each with > {args.filesize_cutoff} sequences"
)
# run large MSA files next
with multiprocessing.Pool(
processes=args.threads_max // args.threads_per_process
) as pool:
for pool_return in pool.imap_unordered(
run_iqtree_wrapper,
[
(
msa_filepath,
args.iqtree_bin,
args.output_dir,
args.substitution_model,
args.threads_max,
args.threads_per_process,
)
for msa_filepath, _ in large_msa_filepaths
],
):
if pool_return is not None:
logger.error(f"Error running IQtree: {pool_return}")
# log time taken
time_taken = time.time() - start_time
logger.info(
f'Time taken to finish: {time.strftime("%H:%M:%S", time.gmtime(time_taken))}'
)
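The downstream reconciliation and rooting scripts in this repository take the gene trees as a two-column TSV (tree ID, Newick string), whereas IQ-TREE writes one `.treefile` per MSA under the output prefix used above. Below is a minimal sketch of collecting those outputs into such a TSV; the paths are assumptions, not taken from the notebooks.

```python
# Minimal sketch (assumed paths) for collecting per-gene IQ-TREE outputs
# (<prefix>.treefile) into a single two-column TSV: treeID<TAB>newick.
import glob
import os

trees_dir = "./gene_trees"           # the --output_dir used for the IQ-TREE runs (assumed)
out_tsv = "./gene_trees_newick.tsv"  # assumed output filename

with open(out_tsv, "w") as out:
    for treefile in sorted(glob.glob(os.path.join(trees_dir, "*.treefile"))):
        tree_id = os.path.basename(treefile).replace(".treefile", "")
        with open(treefile) as fh:
            newick = fh.read().strip()
        out.write(f"{tree_id}\t{newick}\n")
```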
import os
import subprocess
from multiprocessing import Pool
from loguru import logger
import glob
import shutil
import time
def run_ALE_on_NOG(NOG_ID, species_tree_filepath, gene_trees_filepath, bin_dir, n_samples, separator, max_retries):
"""Run ALE on a single NOG_ID
Args:
NOG_ID (str): NOG ID to run ALE on
species_tree_filepath (str): Path to the species tree file
gene_trees_filepath (str): Path to the gene trees file
bin_dir (str): Path to the ALE bin directory
n_samples (int): Number of reconciliation samples for ALEml_undated
separator (str): Separator for ALEml_undated
max_retries (int): Maximum number of retries for ALEobserve and ALEml_undated
Returns:
NOG_ID (str): NOG ID that was run, if successful
If unsuccessful, returns None
"""
base_dir = os.getcwd() # this is the directory where the script is run from
genetree_filepath = os.path.join(base_dir, f"{NOG_ID}.tree")
# write this specific genetree to a file by grepping for the NOG_ID
# and then writing the second column to a file
try:
with open(genetree_filepath, 'w') as tree_file_object:
p1 = subprocess.Popen(
["grep", NOG_ID, gene_trees_filepath], stdout=subprocess.PIPE)
p2 = subprocess.Popen(
["awk", "{print $2}"], stdin=p1.stdout, stdout=tree_file_object)
p2.communicate()
if p1.poll() is None:
p1.communicate()
logger.info(f"Successfully wrote gene tree {NOG_ID} to file {genetree_filepath}")
except subprocess.CalledProcessError as e:
logger.error(f"Failed to write gene tree to file with error {e}")
return None
# run ALE on the genetree
# first run ALEobserve to create the .ale file
n_sleep = 2 # sleep for n_sleep seconds before retrying
for i in range(max_retries + 1):
try:
subprocess.run(
[f"{bin_dir}ALEobserve", genetree_filepath], check=True)
break
except subprocess.CalledProcessError as e:
if i < max_retries: # i is zero indexed
logger.warning(f"ALEobserve failed with error {
e}, retrying ({i+1}/{max_retries})")
# sleep for n_sleep seconds before retrying
time.sleep(n_sleep)
continue
else:
logger.error(f"ALEobserve failed with error {
e}, no more retries left")
return None
# then run ALEml_undated with retry mechanism
for i in range(max_retries + 1):
try:
aleml_undated_command = [f"{bin_dir}ALEml_undated", species_tree_filepath, f"{genetree_filepath}.ale", f'separators="{separator}"', f"sample={n_samples}"]
logger.info(f"Running ALEml_undated with command: {aleml_undated_command}")
subprocess.run(aleml_undated_command, check=True)
return NOG_ID # If ALEml_undated succeeds, return the NOG_ID
except subprocess.CalledProcessError as e:
if i < max_retries: # i is zero indexed
logger.warning(f"ALEml_undated failed with error {
e}, retrying ({i+1}/{max_retries})")
time.sleep(n_sleep)
continue
else:
logger.error(f"ALEml_undated failed with error {
e}, no more retries left")
return None
return NOG_ID
def run_ALE_on_NOG_wrapper(args):
return run_ALE_on_NOG(*args)
if __name__ == '__main__':
# calculate total time taken
from datetime import timedelta
import time
start_time = time.time()
# log using start time
logger.info("Starting ALE run")
# write everything in terms of argparse instead of hardcoding
import argparse
parser = argparse.ArgumentParser(description="Run ALE on all gene trees")
parser.add_argument("--species", "-s", type=str, default="../../01-prepare_data/wol_tree_pruned_1236_no_internal_labels.nwk",
help="Path to species tree file")
parser.add_argument("--gene", "-g", type=str, default="../../01-prepare_data/1236_subset_pruned_trees.tsv.rooted",
help="Path to gene trees file")
parser.add_argument("--threads", "-t", type=int, default=50,
help="Number of threads to use for parallelization")
parser.add_argument("--bin", "-b", type=str,
default="/root/bin/ALE/bin/", help="Path to ALE bin directory")
parser.add_argument("--samples", "-n", type=int, default=100,
help="Number of reconciliation samples for ALEml_undated")
parser.add_argument("--separator", "-sep", type=str, default=".",
help="Separator for ALEml_undated")
parser.add_argument("--debug", "-d", action="store_true",
help="Enable debug mode (run ALE on first 5 input trees only)")
parser.add_argument("--output-dir", "-o", type=str, default="./RESULTS/",
help="Path to output directory")
parser.add_argument("--retries", "-r", type=int, default=10,
help="Max number of retries for ALEml_undated and ALEobserve")
args = parser.parse_args()
# parse the arguments
species_tree_filepath = args.species
gene_trees_filepath = args.gene
max_threads = args.threads
bin_dir = args.bin
n_samples = args.samples
separator = args.separator
output_dir = args.output_dir
max_retries = args.retries
# log the arguments
logger.info(f"\nArguments passed: \
\nSpecies tree file: {species_tree_filepath} \
\nGene trees file: {gene_trees_filepath} \
\nNumber of threads: {max_threads} \
\nALE bin directory: {bin_dir} \
\nNumber of samples: {n_samples} \
\nSeparator: {separator} \
\nOutput directory: {output_dir} \
\nNumber of retries: {max_retries}")
# read and extract first column as list of NOG_IDs
with open(gene_trees_filepath, "r") as treeIDs:
NOG_ID_list = [line.rstrip().split("\t")[0]
for line in treeIDs.readlines()]
# if debug
if args.debug:
NOG_ID_list = NOG_ID_list[:5]
logger.warning(
"Debug mode enabled. Running ALE on first 5 gene trees only")
max_threads = 5
# log the number of NOGs to run ALE on, with time
logger.info(f"ALE will be run on {(len(NOG_ID_list))} NOGs, using {
str(max_threads)} threads")
with Pool(processes=max_threads) as pool: # run ALE on each NOG in parallel
for pool_out in pool.imap(run_ALE_on_NOG_wrapper, [(NOG_ID, species_tree_filepath, gene_trees_filepath, bin_dir, n_samples, separator, max_retries) for NOG_ID in NOG_ID_list]):
if pool_out is not None: # if ALE run was successful
# log time when you finished this NOG_ID
print(f"{time.strftime('%X %x %Z')}: Finished {pool_out}\n")
# move the ALE output files to the outputs directory
# check if the outputs directory exists; if it does, warn that its contents will be overwritten, otherwise create it
if os.path.exists(output_dir):
logger.warning(
"The outputs directory already exists. Overwriting files in the outputs directory")
# remove all files in the outputs directory
for file in glob.glob(f"./{output_dir}/*"):
os.remove(file)
else:
os.makedirs(output_dir)
# remove all `*.ale.uml_rec` files
# for file in glob.glob(f"./*.ale.uml_rec"):
# os.remove(file)
# or move them to the outputs directory
for file in glob.glob(f"./*.ale.uml_rec"):
shutil.move(file, output_dir)
# get the filename of the species tree
species_tree_filename = os.path.basename(species_tree_filepath)
for file in glob.glob(f"./*.ale.uTs"):
shutil.move(file, output_dir)
# remove all files that end with .ale
for file in glob.glob(f"./*.ale"):
os.remove(file)
# move all files that end with .tree to the trees directory
# create the directory if it doesn't exist
os.makedirs("./tmp_trees/", exist_ok=True)
for file in glob.glob(f"./*.tree"):
shutil.move(file, "./tmp_trees/")
# calculate total time taken
end_time = time.time()
total_time = end_time - start_time
# log the time taken in a readable format
logger.info(f"Total time taken: {timedelta(seconds=total_time)}")
#!/usr/bin/env python3
from subprocess import CalledProcessError, Popen
import os
import time
from datetime import timedelta, datetime
import argparse
from multiprocessing import Pool
from loguru import logger
def run_command(params):
"""
Inputs:
params: tuple of (command_as_list, log_file_path)
Function:
Runs the command, and writes its stdout to the log file at log_file_path
"""
cmd, log_file_path = params
with open(log_file_path, "w") as log_FO:
process = Popen(cmd, stdout=log_FO, universal_newlines=True)
if process.wait() != 0:
raise CalledProcessError(process.returncode, cmd)
def MAD_roots_NOG_trees_parallel(
input_file_path, mad_executable, n_lines, num_subprocesses
):
"""
Inputs:
input_file_path: TSV format input file. Columns are: treeID(str) and tree(str)
mad_executable: path to MAD executable
n_lines: number of lines of the input file to put in each split file.
num_subprocesses: maximum number of MAD processes to run in parallel.
Outputs:
Writes out a file with the same name as the input file, but with a `.rooted` at the end of the filename.
Additional info:
This function splits the input file into multiple files, and runs MAD on each of these split files in parallel.
The number of processes run in parallel is at most `num_subprocesses`; each split file gets its own MAD process.
The number of lines in each split file is equal to the `n_lines` argument.
The output file will contain the same number of lines as the input file, but the second column will contain the rooted trees.
"""
# Check if the input file exists
if not os.path.isfile(input_file_path):
raise FileNotFoundError(
f"The input file {
input_file_path} does not seem to exist. Check the filepath provided."
)
# Find the full path of the input file and extract the dir name
input_file_path = os.path.abspath(input_file_path)
input_dir = os.path.dirname(input_file_path)
# Get the current date and time
now = datetime.now()
# Format the current date and time as a string
timestamp = now.strftime("%Y%m%d_%H%M%S")
# Create a temporary directory path with the timestamp
tmp_dir = f"{input_dir}/tmp_{timestamp}"
# Create the temporary directory
os.mkdir(tmp_dir)
# log this with time, along with location of the tmp folder
logger.info(f"Created the tmp folder at location: {tmp_dir}")
# Open the input file and read it in chunks of `n_lines` lines
# track the filenames of split files containing the second column (tree)
nwk_split_files = []
# track the filenames of split files containing the first column (treeID)
info_split_files = []
def write_split_files(chunk, chunk_index, nwk_split_files, info_split_files):
"""
Helper function to write out the split files for a given chunk.
"""
split_chunk = [line.split() for line in chunk]
nwk_split_filepath = f"{tmp_dir}/tmp_tree_split.nwk.{chunk_index}"
info_split_filepath = f"{tmp_dir}/tmp_tree_split.info.{chunk_index}"
with open(nwk_split_filepath, "w") as split_out_fo:
split_out_fo.write("\n".join(line[1] for line in split_chunk) + "\n")
with open(info_split_filepath, "w") as split_out_fo:
split_out_fo.write("\n".join(line[0] for line in split_chunk) + "\n")
nwk_split_files.append(nwk_split_filepath)
info_split_files.append(info_split_filepath)
return nwk_split_files, info_split_files
# open the input file
with open(input_file_path, "r") as treesFO:
chunk = []
for i, line in enumerate(treesFO):
chunk.append(line)
if (i + 1) % n_lines == 0:
nwk_split_files, info_split_files = write_split_files(
chunk, (i // n_lines) + 1, nwk_split_files, info_split_files
)
chunk = []
# Write the last chunk if it's not empty
if chunk:
nwk_split_files, info_split_files = write_split_files(
chunk, (i // n_lines) + 2, nwk_split_files, info_split_files
)
# using subprocess.Popen inside the run_command function,
# call MAD on all of these split newick files. Prepare the commands,
# e.g. `mad -t -n tmp_tree_split.nwk.1`
commands = [
([mad_executable, "-t", "-n", split_file_path], split_file_path + ".stdout")
for split_index, split_file_path in enumerate(nwk_split_files)
]
mad_start_time = time.time()
# Log timestamp for the start of the MAD rooting process
logger.info(f"Starting MAD rooting")
# Create a multiprocessing Pool with the defined maximum number of subprocesses
with Pool(num_subprocesses) as pool:
# Use pool.map to run the commands in parallel
pool.map(run_command, commands)
# Indicate the completion of the MAD rooting process
logger.info(
f"MAD rooting completed. Total time taken for MAD rooting: {timedelta(seconds=time.time() - mad_start_time)}"
)
# Log that the split files are being combined, along with the timestamp
logger.info("Combining the split files into one file...")
# Note: mad adds a `.rooted` at the end of the input filename, as the output filename
# combine the columns from the info files with the nwk.rooted files to prepare the final output file
with open(input_file_path + ".rooted", "w") as rootedOutputFO:
for i, nwk_split_filepath in enumerate(nwk_split_files):
# read in the corresponding split nwk.rooted file
rooted_nwk_split_filepath = f"{nwk_split_filepath}.rooted"
info_split_filepath = info_split_files[i]
with open(rooted_nwk_split_filepath, "r") as rooted_nwk_split_FO:
rooted_nwk_trees = [
i.strip() for i in rooted_nwk_split_FO.readlines()
] # remove trailing newlines. This is a list of rooted trees
# combine the info and rooted trees
with open(info_split_filepath, "r") as info_file:
rooted_info_plus_tree_chunks = [
"\t".join([info_line.strip(), rooted_nwk_trees[line_index]])
for line_index, info_line in enumerate(info_file)
]
# write out the new tree line chunks to the output file
rootedOutputFO.write("\n".join(rooted_info_plus_tree_chunks) + "\n")
# Clear memory
del rooted_info_plus_tree_chunks
del rooted_nwk_trees
# delete the split nwk files in the tmp folder
for split_file_name in nwk_split_files:
os.remove(split_file_name)
# combine all the log files into one log file using `tail -n +1`, and write in the input_dir with timestamp
now = datetime.now()
timestamp = now.strftime("%Y%m%d_%H%M%S")
os.system(
"tail -n +1 "
+ tmp_dir
+ "/*.stdout > "
+ input_dir
+ "/tmp_mad_rooting_splits_"
+ timestamp
+ ".log"
)
# delete the tmp folder
# os.system('rm -rf '+tmp_dir)
if __name__ == "__main__":
start_time = time.time()
# Catches and interprets the user input from the command line.
parser = argparse.ArgumentParser()
# General stuff
parser.add_argument(
"--Input",
"-i",
required=True,
type=str,
help="path to input file which should be tsv format with two columns: first column is the tree ID and the second column is the tree in newick",
)
parser.add_argument(
"--mad",
"-m",
required=False,
type=str,
default="mad",
help="path to MAD executable. Assumes mad is within $PATH by default, if this argument is not provided.",
)
parser.add_argument(
"--nlines",
"-n",
required=False,
type=int,
default=50,
help="(Int) default=50. \
Number of lines that the input file will be split into for each split\
file. Smaller number implies less trees being processed by one MAD process.",
)
parser.add_argument(
"--num_subprocesses",
"-p",
required=False,
type=int,
default=50,
help="(Int) default=50. Number of processes to run in parallel. \
The larger this number, the more number of processes that will \
be run in parallel, since each split file chunk gets its own process",
)
# parser.add_argument('--Output', '-o', required=True, type=str,
# help="path to output files")
args = parser.parse_args()
MAD_roots_NOG_trees_parallel(
args.Input, args.mad, args.nlines, args.num_subprocesses
)
logger.info(
"Process time taken from start to finish: "
+ str(timedelta(seconds=time.time() - start_time))
)
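The rooting script writes its output next to the input file, adding a `.rooted` suffix and keeping one `treeID<TAB>rooted-newick` line per input tree. Below is a minimal sketch (the input path is a placeholder) for checking, after a run, that the rooted file still lines up with the input.

```python
# Minimal sketch (placeholder path) to check that the MAD-rooted output file
# has the same tree IDs, in the same order, as the input TSV.
input_tsv = "gene_trees_newick.tsv"  # placeholder: the file passed via --Input

def tree_ids(path):
    with open(path) as fh:
        return [line.split("\t", 1)[0] for line in fh if line.strip()]

in_ids = tree_ids(input_tsv)
out_ids = tree_ids(input_tsv + ".rooted")
assert in_ids == out_ids, "Tree IDs differ between input and rooted output"
print(f"{len(out_ids)} trees rooted; IDs match the input.")
```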
%% Cell type:markdown id: tags:
ALE seems to fail if the input IDs are the NCBI-style IDs we currently have, which are of the form:
- for genome IDs in the genome tree: `GCF_xxxxxxxx_n`
- for gene IDs in the gene trees: `GCF_xxxxxxxx_n_ABC_xxxxxxxx_m`
Here we convert the genome tree taxon IDs to our own naming of the form `taxonXXXXX`, and the gene tree gene IDs to the form `taxonXXXXX.ABC_xxxxxxxx_m`, saving the mapping between old and new names to a file.
%% Cell type:code id: tags:
``` python
# suppress syntax warnings of ete3
import warnings
warnings.filterwarnings("ignore", category=SyntaxWarning)
```
%% Cell type:code id: tags:
``` python
# take genome tree as input, read the taxa IDs, rename them, and rewrite the tree with the renamed taxa. Save the mapping of old and new names in a file.
import os
import dendropy
import ete3
import re
genome_tree_filepath = "../data/genome_tree/genome_tree.iqtree.treefile.rooted.labeled"
output_tree_filepath = "../data/genome_tree/genome_tree.iqtree.treefile.rooted.labeled.renamed"
mapping_filepath = "../data/genome_tree/genome_tree.iqtree.treefile.rooted.labeled.renamed.mapping"
# read the rooted tree
genome_tree = dendropy.Tree.get(
path=genome_tree_filepath, schema="newick", preserve_underscores=True
)
# read the taxa
taxa = genome_tree.taxon_namespace
# rename the taxa to taxonNNNNN, where NNNNN is the 1-based index of the taxon in the list, zero-padded to 5 digits
mapping = {taxa[i].label: f"taxon{str(i+1).zfill(5)}" for i in range(len(taxa))}
# rename the taxa
for taxon in taxa:
taxon.label = mapping[taxon.label]
# write the tree
genome_tree.write(path=output_tree_filepath, schema="newick")
# write the mapping
with open(mapping_filepath, "w") as mapping_file:
for old_name, new_name in mapping.items():
mapping_file.write(f"{old_name}\t{new_name}\n")
print(f"Tree written to {output_tree_filepath}")
# make a version with no internal labels
output_tree_filepath_no_internal_labels = (
"../data/genome_tree/genome_tree.iqtree.treefile.rooted.renamed.no_internal_labels"
)
# remove the internal labels from genome_tree
for node in genome_tree:
if not node.is_leaf():
node.label = None
genome_tree.write(path=output_tree_filepath_no_internal_labels,
# newick format
schema="newick")
# read in using ete3 and rewrite without scientific notation branch lengths
ete3_genome_tree = ete3.Tree(output_tree_filepath_no_internal_labels, format=1)
ete3_genome_tree.write(outfile=output_tree_filepath_no_internal_labels, format=1,
dist_formatter="%.18f")
############################################################################################################
# now we replace the old taxa labels with new ones in the newick strings of the gene trees
gene_trees_tsv = (
"../data/gene_trees_newick.tsv.rooted" # two columns: gene_tree_id, newick
)
# two columns: gene_tree_id, newick
new_gene_trees_tsv = (
"../data/gene_trees_newick.tsv.rooted.renamed"
)
# first read in the trees into a dict
with open(gene_trees_tsv, "r") as gene_trees_file:
gene_trees_dict = {
line.split("\t")[0]: line.split("\t")[1].strip()
for line in gene_trees_file.readlines()
}
# leaf labels are of the form "GCF_000123456_1_WP_123456789_1" which should be renamed to "taxon00001.WP_123456789_1"
# first replace the dots in the gene tree leaf names with underscores. Make sure the branch lengths are not affected.
for gene_tree_id, gene_tree in list(gene_trees_dict.items()):
# match the leaf labels (they end with :) and replace the dots with underscores
gene_trees_dict[gene_tree_id] = re.sub(
r"([A-Za-z0-9]+)\.([A-Za-z0-9]+):", r"\1_\2:", gene_tree
)
# add an underscore at the end of the old taxon names and a dot at the end of the new taxon names in the mapping
mapping2 = {old_name + "_": new_name + "." for old_name, new_name in mapping.items()}
# display a few mappings
print(f"Old name\tNew name")
for i, (old_name, new_name) in enumerate(mapping2.items()):
if i < 5:
print(f"{old_name}\t{new_name}")
else:
break
# now replace the old taxa names with the new ones, iterating over the gene trees and reading the newick strings
for gene_tree_id, gene_tree in list(gene_trees_dict.items()):
for old_name, new_name in mapping2.items():
gene_tree = re.sub(f"{old_name}", new_name, gene_tree)
gene_trees_dict[gene_tree_id] = gene_tree
# write the renamed gene trees to a new file
with open(new_gene_trees_tsv, "w") as new_gene_trees_file:
for gene_tree_id, gene_tree in gene_trees_dict.items():
new_gene_trees_file.write(f"{gene_tree_id}\t{gene_tree}\n")
print(f"Gene trees written to {new_gene_trees_tsv}")
```
%% Output
Tree written to ../data/genome_tree/genome_tree.iqtree.treefile.rooted.labeled.renamed
Old name New name
GCF_016128235_1_ taxon00001.
GCF_020097475_1_ taxon00002.
GCF_016904755_1_ taxon00003.
GCF_009931435_1_ taxon00004.
GCA_017672245_1_ taxon00005.
Gene trees written to ../data/gene_trees_newick.tsv.rooted.renamed
%% Cell type:code id: tags:
``` python
```
%% Cell type:markdown id: tags:
# Run ALE for inferring gene dynamics
%% Cell type:markdown id: tags:
ALE only requires the species tree to be rooted, so we can use the unrooted gene trees directly.
However, rooted trees seem to make ALE run faster, so we use rooted gene trees, rooted using MAD.
```bash
# make the directory for ALE inferences
mkdir -p data/inferences/gene_dynamics/ALE
cd data/inferences/gene_dynamics/ALE
# run ALE
nohup ~/mambaforge/envs/hgt_analyses/bin/python ../../../../code/run_ALE.py -s ../../../genome_tree/genome_tree.iqtree.treefile.rooted.renamed.no_internal_labels -g ../../../gene_trees_newick.tsv.rooted.renamed -o ./ -t 100 > ../../../nohup_run_ALE_GD.out & disown
# this results in the ALE output dir being in ./RESULTS (i.e. data/inferences/gene_dynamics/ALE/RESULTS)
```
%% Cell type:markdown id: tags:
# Run COUNT and GLOOME for both gene and pathogenicity dynamics
%% Cell type:code id: tags:
``` python
!mkdir -p ../data/inferences/gene_dynamics/Count ../data/inferences/gene_dynamics/GLOOME ../data/inferences/pathogenicity_dynamics/Count ../data/inferences/pathogenicity_dynamics/GLOOME
```
%% Cell type:markdown id: tags:
## Prepare PA Matrices and param files for running COUNT and GLOOME
%% Cell type:code id: tags:
``` python
# suppress syntax warnings of ete3
import warnings
warnings.filterwarnings("ignore", category=SyntaxWarning)
```
%% Cell type:code id: tags:
``` python
import pandas as pd
import os
import ete3
orthogroups_tsv_filepath = "../data/escherichia_ncbi_assemblies/OrthoFinder/Results_Nov28/Orthogroups/Orthogroups.tsv"
acc_to_taxa_mapping_filepath = (
"../data/genome_tree/genome_tree.iqtree.treefile.rooted.labeled.renamed.mapping"
)
ncbi_summary_tsv_filepath = (
"../data/escherichia_pathogen_commensal_ncbi_summary.tsv"
)
og_pa_matrix_filepath = "../data/og_pa_matrix.tsv"
og_pa_matrix_filepath = os.path.abspath(og_pa_matrix_filepath)
pathogenicity_pa_matrix_filepath = "../data/pathogenicity_pa_matrix.tsv"
pathogenicity_pa_matrix_filepath = os.path.abspath(pathogenicity_pa_matrix_filepath)
tree_filepath = (
"../data/genome_tree/genome_tree.iqtree.treefile.rooted.renamed.no_internal_labels"
)
tree_filepath = os.path.abspath(tree_filepath)
```
%% Cell type:code id: tags:
``` python
# functions to prepare OGs and Pathogenicity PA matrices
def prepare_og_pa_matrix(
orthogroups_tsv_filepath,
acc_to_taxa_mapping_filepath,
og_pa_matrix_filepath: str,
tree_filepath: str,
min_taxa=10, # in line with ALE's gene trees having at least 10 taxa
single_copy=False,
) -> None:
"""
Prepare OG presence-absence matrix for GLOOME/COUNT and prune the tree
"""
# OG mapping from Orthofinder
orthogroups_df = pd.read_csv(
orthogroups_tsv_filepath, sep="\t", header=0, index_col=0, low_memory=False
)
# keep only OGs with at least min_taxa genomes (columns) with any genes in them
orthogroups_df = orthogroups_df.loc[
orthogroups_df.apply(lambda x: x.count() >= min_taxa, axis=1)
]
if not single_copy:
gloome_output_fasta_filepath = os.path.join(
os.path.dirname(og_pa_matrix_filepath),
os.path.basename(og_pa_matrix_filepath).replace(".tsv", ".fasta"),
)
count_output_tsv_filepath = og_pa_matrix_filepath
pruned_tree_out_file = f"{tree_filepath}.pruned"
else:
gloome_output_fasta_filepath = os.path.join(
os.path.dirname(og_pa_matrix_filepath),
os.path.basename(og_pa_matrix_filepath).replace(".tsv", ".single_copy.fasta"),
)
count_output_tsv_filepath = os.path.join(
os.path.dirname(og_pa_matrix_filepath),
os.path.basename(og_pa_matrix_filepath).replace(".tsv", ".single_copy.tsv"),
)
pruned_tree_out_file = f"{tree_filepath}.single_copy_ogs.pruned"
# index is the name of the OG, and header columns are the IDs of the genomes. Every entry is (comma-separated) list of genes in the OG for the corresponding genome
# we want to convert this to a presence-absence matrix
# convert the comma-separated lists to number of genes in the OG, and wherever it's NaN, replace with 0. Df entries should be integer values
orthogroups_df = orthogroups_df.map(lambda x: 0 if pd.isna(x) else len(x.split(',')))
# convert dots in genome IDs to underscores
orthogroups_df.columns = orthogroups_df.columns.str.replace(".", "_")
# remove the name of the index
orthogroups_df.index.name = None
# replace genome acc IDs with our own taxon IDs using the acc_to_taxon_mapping_file
acc_to_taxa_mapping_dict = (
pd.read_csv(
acc_to_taxa_mapping_filepath,
sep="\t",
header=None,
names=["acc_id", "taxon_id"],
)
.set_index("acc_id")["taxon_id"]
.to_dict()
)
orthogroups_df.rename(columns=acc_to_taxa_mapping_dict, inplace=True)
# sort the column names
orthogroups_df = orthogroups_df[sorted(orthogroups_df.columns)]
# identify OGs with exactly one gene in every genome (single-copy OGs)
single_copy_mask = (orthogroups_df == 1).all(axis=1)
# write the list of single-copy OGs to a file
orthogroups_df.loc[single_copy_mask].index.to_series().to_csv(
os.path.join(
os.path.dirname(og_pa_matrix_filepath),
"list.single_copy_ogs.txt",
),
header=False,
index=False,
)
if single_copy:
# keep only the single-copy OGs
orthogroups_df = orthogroups_df.loc[single_copy_mask]
print("Retained only single-copy OGs")
# GLOOME requires a fasta file with every row as genome ID, and every column as OG.
# Every sequence basically has to be a 0/1 string. We can convert the OG PA matrix to this format
# transpose
orthogroups_df_binary = orthogroups_df.T
# convert to 0/1
orthogroups_df_binary = orthogroups_df_binary.map(lambda x: 1 if x > 0 else 0)
# write to a fasta file
with open(
gloome_output_fasta_filepath,
"w",
) as f:
for genome, row in orthogroups_df_binary.iterrows():
f.write(">" + genome + "\n")
f.write("".join(row.to_numpy().astype(str)) + "\n")
# COUNT requires a TSV file with every row as OG and every column as genome ID
orthogroups_df.to_csv(count_output_tsv_filepath, sep="\t")
# print size of the matrix
print("OG PA matrix shape:", orthogroups_df.shape)
# Prune the tree to only include the genomes in the OG PA matrix
genomes_in_og_pa_matrix = orthogroups_df.columns.tolist()
genome_tree = ete3.Tree(tree_filepath, format=1)
genome_tree.prune(genomes_in_og_pa_matrix)
genome_tree_write_str = genome_tree.write(format=1, format_root_node=True)
with open(pruned_tree_out_file, "w") as f:
f.write(genome_tree_write_str + "\n")
print("Pruned tree written to", pruned_tree_out_file)
# this tree is without internal labels. We make another tree where we label the internal nodes
# labels: N1, N2, N3, ...
for i, node in enumerate(genome_tree.traverse("postorder")):
if not node.is_leaf():
node.name = f"N{i+1}"
pruned_labeled_out_file = f"{pruned_tree_out_file.replace('no_internal_labels', 'labeled')}"
genome_tree_write_str = genome_tree.write(format=1, format_root_node=True)
with open(pruned_labeled_out_file, "w") as f:
f.write(genome_tree_write_str + "\n")
return None
def prepare_pathogenicity_pa_matrix(
ncbi_summary_tsv_filepath,
acc_to_taxa_mapping_filepath,
tree_filepath,
pathogenicity_pa_matrix_filepath: str,
single_copy=False,
) -> None:
"""
Prepare pathogenicity presence-absence matrix for GLOOME/COUNT
"""
# pathogenicity data is from the original ncbi summary tsv file
ncbi_summary_df = pd.read_csv(
ncbi_summary_tsv_filepath, sep="\t", header=0, index_col=0
)
ncbi_summary_df = ncbi_summary_df[
["Pathogenicity"]
] # and index contains the genome IDs
# convert to 0/1
ncbi_summary_df = ncbi_summary_df.map(lambda x: 1 if x == "Pathogen" else 0)
# replace dots in genome IDs with underscores
ncbi_summary_df.index = ncbi_summary_df.index.str.replace(".", "_")
# replace genome acc IDs with our own taxon IDs using the acc_to_taxon_mapping_file
acc_to_taxa_mapping_dict = (
pd.read_csv(
acc_to_taxa_mapping_filepath,
sep="\t",
header=None,
names=["acc_id", "taxon_id"],
)
.set_index("acc_id")["taxon_id"]
.to_dict()
)
ncbi_summary_df.rename(index=acc_to_taxa_mapping_dict, inplace=True)
taxa_tree = ete3.Tree(f"{tree_filepath}.pruned", format=1)
# keep only the genomes that are in the pruned tree for the whole OG PA matrix
taxa = [leaf.name for leaf in taxa_tree.iter_leaves()]
ncbi_summary_df = ncbi_summary_df.loc[taxa]
fasta_out_file = pathogenicity_pa_matrix_filepath.replace(".tsv", ".fasta")
tsv_out_file = pathogenicity_pa_matrix_filepath
# GLOOME requires a fasta file with every row as genome ID, and a single column with the pathogenicity values
# with open("pathogenicity_pa_matrix.fasta", "w") as f:
with open(fasta_out_file, "w") as f:
for genome, row in ncbi_summary_df.iterrows():
f.write(">" + genome + "\n")
f.write(str(row.iloc[0]) + "\n")
# COUNT requires a TSV file with every column as genome ID and a single row with the pathogenicity values
ncbi_summary_df.T.to_csv(tsv_out_file,
sep="\t", header=True)
# print size of the matrix
print("Pathogenicity PA matrix shape:", ncbi_summary_df.shape)
return None
prepare_og_pa_matrix(
orthogroups_tsv_filepath,
acc_to_taxa_mapping_filepath,
og_pa_matrix_filepath,
tree_filepath,
)
print("Wrote OG presence-absence matrix to", og_pa_matrix_filepath)
# for the pathogenicity data for all taxa
prepare_pathogenicity_pa_matrix(
ncbi_summary_tsv_filepath,
acc_to_taxa_mapping_filepath,
tree_filepath,
pathogenicity_pa_matrix_filepath,
)
```
%% Output
OG PA matrix shape: (5503, 151)
Pruned tree written to /root/work/projects/hgt_pathogenicity/data/genome_tree/genome_tree.iqtree.treefile.rooted.renamed.no_internal_labels.pruned
Wrote OG presence-absence matrix to /root/work/projects/hgt_pathogenicity/data/og_pa_matrix.tsv
Pathogenicity PA matrix shape: (151, 1)
%% Cell type:code id: tags:
``` python
%%bash -s "$og_pa_matrix_filepath" "$pathogenicity_pa_matrix_filepath" "$tree_filepath"
# create params files for GLOOME
# first for OGs
cat > ../data/infer_gene_dynamics_ogs.gloome.params << EOL
_seqFile $(dirname $1)/$(basename $1 .tsv).fasta
_treeFile $3.pruned
# use mixture-model
_gainLossDist 1
# for COG and EggNOG only patterns with 3 or more ones are observable
_minNumOfOnes 3
# include Parsimony results along with ML
_costMatrixGainLossRatio 1
# in this case, character frequencies are not equal across the tree
_isRootFreqEQstationary 0
## Advanced
_logValue 4
# print all probabilities for all events per pos per branch (default is 0.05 and everything below is not printed)
_probCutOffPrintEvent 0.0
_outDir RESULTS_OGs
EOL
############################################################################################################
# similarly but for single-copy OGs
cat > ../data/infer_gene_dynamics_single_copy_ogs.gloome.params << EOL
_seqFile $(dirname $1)/$(basename $1 .tsv).single_copy.fasta
_treeFile $3.pruned
# use mixture-model
_gainLossDist 1
# for COG and EggNOG only patterns with 3 or more ones are observable
_minNumOfOnes 3
# include Parsimony results along with ML
_costMatrixGainLossRatio 1
# in this case, character frequencies are not equal across the tree
_isRootFreqEQstationary 0
## Advanced
_logValue 4
# print all probabilities for all events per pos per branch (default is 0.05 and everything below is not printed)
_probCutOffPrintEvent 0.0
_outDir RESULTS_Single_Copy_OGs
EOL
############################################################################################################
# then for pathogenicity
cat > ../data/infer_pathogenicity_dynamics.gloome.params << EOL
_seqFile $(dirname $2)/$(basename $2 .tsv).fasta
_treeFile $3.pruned
# use mixture-model
_gainLossDist 1
# include Parsimony results along with ML
_costMatrixGainLossRatio 1
# in this case, character frequencies are not equal across the tree
_isRootFreqEQstationary 0
## Advanced
_logValue 4
_outDir RESULTS_Pathogenicity
EOL
############################################################################################################
# similarly but for single-copy OGs
cat > ../data/infer_pathogenicity_dynamics_single_copy.gloome.params << EOL
_seqFile $(dirname $2)/$(basename $2 .tsv).single_copy.fasta
_treeFile $3.pruned
# use mixture-model
_gainLossDist 1
# include Parsimony results along with ML
_costMatrixGainLossRatio 1
# in this case, character frequencies are not equal across the tree
_isRootFreqEQstationary 0
## Advanced
_logValue 4
_outDir RESULTS_Pathogenicity_Single_Copy
EOL
```
%% Cell type:markdown id: tags:
We can use these params files as arguments for GLOOME.
Run the following process (and the equivalent for the other params files created above). Using `nohup`, `&`, and `disown` here because the process may take a while to finish, and I am running it remotely on a server; the processes shouldn't be affected this way, even if I log out.
```bash
nohup ~/bin/GLOOME.VR01.266 infer_gene_dynamics_ogs.gloome.params > nohup_infer_gene_dynamics_ogs.out & disown
```
%% Cell type:markdown id: tags:
--------
Similarly for Count:
```bash
# change to the output directory for COUNT Pathogenicity dynamics
cd data/inferences/pathogenicity_dynamics/Count
# run COUNT for pathogenicity dynamics
java -Xmx2048M -cp ~/bin/Count/Count.jar ca.umontreal.iro.evolution.genecontent.AsymmetricWagner -gain 1 ../../../genome_tree/genome_tree.iqtree.treefile.rooted.renamed.no_internal_labels.pruned ../../../pathogenicity_pa_matrix.tsv > Count_output.tsv
# separate the output into genome sizes, changes, and families
grep "# PRESENT" Count_output.tsv > Count_genome_sizes.tsv && grep "# CHANGE" Count_output.tsv > Count_changes.tsv && grep "# FAMILY" Count_output.tsv > Count_families.tsv
# change to the output directory for COUNT Gene dynamics
cd ../../gene_dynamics/Count
# run COUNT for gene dynamics
java -Xmx2048M -cp ~/bin/Count/Count.jar ca.umontreal.iro.evolution.genecontent.AsymmetricWagner -gain 1 ../../../genome_tree/genome_tree.iqtree.treefile.rooted.renamed.no_internal_labels.pruned ../../../og_pa_matrix.tsv > Count_output.tsv
# separate the output into genome sizes, changes, and families
grep "# PRESENT" Count_output.tsv > Count_genome_sizes.tsv && grep "# CHANGE" Count_output.tsv > Count_changes.tsv && grep "# FAMILY" Count_output.tsv > Count_families.tsv
```
### CCB plot style sheet v0.5
# based on template from:
# https://matplotlib.org/stable/tutorials/introductory/customizing.html#matplotlibrc-sample
#
# Compared to previous versions, this version only contains changes from default settings
# instead of containing all settings, and is therefore shorter.
#
# Color scheme: Bright qualitative (7 colors) reordered, colour-blind safe
# Borrowed from Paul Tol: https://personal.sron.nl/~pault/
# 4477AA EE6677 228833 CCBB44 66CCEE AA3377 BBBBBB
# blue cyan green yellow red purple grey
#
# Stylesheet changes originally created by Peter Schubert, 2020
# Updated to current matplotlib version 3.7 by Swastik Mishra, 2023
#
# ***************************************************************************
# * LINES *
# ***************************************************************************
# See https://matplotlib.org/stable/api/artist_api.html#module-matplotlib.lines
# for more information on line properties.
lines.markersize: 6 # marker size, in points
# markers.fillstyle: none # {full, left, right, bottom, top, none}
## ***************************************************************************
## * BOXPLOT *
## ***************************************************************************
boxplot.meanprops.marker: x
# ***************************************************************************
# * AXES *
# ***************************************************************************
# Following are default face and edge colors, default tick sizes,
# default font sizes for tick labels, and so on. See
# https://matplotlib.org/stable/api/axes_api.html#module-matplotlib.axes
axes.edgecolor: 666666 # axes edge color
axes.titlepad: 12.0 # pad between axes and title in points
axes.labelpad: 3.0 # space between label and axis
axes.axisbelow: True # draw axis gridlines and ticks:
# - below patches (True)
# - above patches but below lines ('line')
# - above all (False)
axes.formatter.limits: -3, 3 # use scientific notation if log10
# of the axis range is smaller than the
# first or larger than the second
axes.spines.top: False
axes.spines.right: False
axes.unicode_minus: False # use Unicode for the minus symbol rather than hyphen. See
# https://en.wikipedia.org/wiki/Plus_and_minus_signs#Character_codes
axes.prop_cycle: cycler(color = ['4477AA', 'EE6677', '228833', 'CCBB44', '66CCEE', 'AA3377', 'BBBBBB'], marker = ['o','^','s','*','d','P','X'] )
## bright qualitative color scheme - reordered, as per Paul Tol's Colour Schemes and templates
## https://personal.sron.nl/~pault/
# color cycle for plot lines as list of string color specs:
# single letter, long name, or web-style hex
# As opposed to all other parameters in this file, the color
# values must be enclosed in quotes for this parameter,
# e.g. '1f77b4', instead of 1f77b4.
# See also https://matplotlib.org/stable/tutorials/intermediate/color_cycle.html
# for more details on prop_cycle usage.
axes.xmargin: .02 # x margin. See `axes.Axes.margins`
axes.ymargin: .02 # y margin. See `axes.Axes.margins`
axes.zmargin: .02 # z margin. See `axes.Axes.margins`
# ***************************************************************************
# * TICKS *
# ***************************************************************************
# See https://matplotlib.org/stable/api/axis_api.html#matplotlib.axis.Tick
xtick.major.size: 5 # major tick size in points
xtick.minor.size: 3 # minor tick size in points
xtick.color: 666666 # color of the ticks
xtick.labelcolor: inherit # color of the tick labels or inherit from xtick.color
xtick.labelsize: medium # font size of the tick labels
xtick.direction: inout # direction: {in, out, inout}
ytick.major.size: 5 # major tick size in points
ytick.minor.size: 3 # minor tick size in points
ytick.color: 666666 # color of the ticks
ytick.labelcolor: inherit # color of the tick labels or inherit from ytick.color
ytick.labelsize: medium # font size of the tick labels
ytick.direction: inout # direction: {in, out, inout}
# ***************************************************************************
# * GRIDS *
# ***************************************************************************
grid.linestyle: --            # dashed
grid.alpha: 1.0 # transparency, between 0.0 and 1.0
# ***************************************************************************
# * LEGEND *
# ***************************************************************************
legend.frameon: False # if True, draw the legend on a background patch
legend.framealpha: 0.7 # legend patch transparency
legend.markerscale: 0.9 # the relative size of legend markers vs. original
legend.title_fontsize: medium # None sets to the same as the default axes.
# Dimensions as fraction of font size:
legend.borderpad: 0.2 # border whitespace
legend.labelspacing: 0.3 # the vertical space between the legend entries
legend.handlelength: 1.0 # the length of the legend lines
legend.handleheight: 0.7 # the height of the legend handle
legend.handletextpad: 0.5 # the space between the legend line and legend text
legend.borderaxespad: 0.2 # the border between the axes and legend edge
legend.columnspacing: 2.0 # column separation
# ***************************************************************************
# * FIGURE *
# ***************************************************************************
# See https://matplotlib.org/stable/api/figure_api.html#matplotlib.figure.Figure
figure.titlesize: xx-large # size of the figure title (``Figure.suptitle()``)
figure.titleweight: bold # weight of the figure title
figure.labelsize: large # size of the figure label (``Figure.sup[x|y]label()``)
figure.figsize: 5, 3.09 # figure size in inches
### golden ratio: phi = y/x = (1+sqrt(5))/2 ≈ 1.618
figure.frameon: False # enable figure frame
figure.subplot.wspace: 0.3 # the amount of width reserved for space between subplots,
# expressed as a fraction of the average axis width
figure.subplot.hspace: 0.45 # the amount of height reserved for space between subplots,
# expressed as a fraction of the average axis height
# ***************************************************************************
# * ERRORBAR PLOTS *
# ***************************************************************************
errorbar.capsize: 2 # length of end cap on error bars in pixels
# ***************************************************************************
# * HISTOGRAM PLOTS *
# ***************************************************************************
hist.bins: 50 # The default number of histogram bins or 'auto'.
# ***************************************************************************
# * SAVING FIGURES *
# ***************************************************************************
# The default savefig parameters can be different from the display parameters
# e.g., you may want a higher resolution, or to make the figure
# background white
savefig.facecolor: white # figure face color when saving
savefig.edgecolor: auto # figure edge color when saving
savefig.format: pdf # {png, ps, pdf, svg}
savefig.bbox: tight # {tight, standard}
# 'tight' is incompatible with pipe-based animation
# backends (e.g. 'ffmpeg') but will work with those
# based on temporary files (e.g. 'ffmpeg_file')
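To apply this style sheet in the notebooks, point matplotlib at the file. Below is a minimal sketch, where the filename `ccb_plot_style.mplstyle` and its location are assumptions rather than something specified in this repo.

```python
# Minimal sketch (assumed filename/path) of applying the CCB style sheet.
import matplotlib.pyplot as plt

plt.style.use("code/ccb_plot_style.mplstyle")  # assumed path to this style file

# the colour/marker prop_cycle defined above is then applied automatically
fig, ax = plt.subplots()
for shift in range(3):
    ax.plot(range(5), [v + shift for v in range(5)], label=f"series {shift}")
ax.set_xlabel("x")
ax.set_ylabel("y")
ax.legend()
fig.savefig("style_demo.pdf")  # savefig.format defaults to pdf in this style sheet
```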